Rev 3: new encoder, allows non monotonically increasing sequence matches for moar compression. in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
Robert Collins
robertc at robertcollins.net
Mon Jul 7 03:27:04 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
------------------------------------------------------------
revno: 3
revision-id: robertc at robertcollins.net-20080707022703-rfk7o3i2xysdl9o6
parent: robertc at robertcollins.net-20080706031554-j10ybun5rn5fjiei
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-07-07 12:27:03 +1000
message:
new encoder, allows non monotonically increasing sequence matches for moar compression.
modified:
groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2008-07-06 03:15:54 +0000
+++ b/groupcompress.py 2008-07-07 02:27:03 +0000
@@ -39,9 +39,14 @@
     next = lines.next
     print next(), next()
     for header in lines:
-        start, end, count = [int(n) for n in header.split(',')]
-        contents = [next() for i in xrange(count)]
-        result.append((start, end, count, contents))
+        op = header[0]
+        numbers = header[2:]
+        numbers = [int(n) for n in header[2:].split(',')]
+        if op == 'c':
+            result.append((op, numbers[0], numbers[1], None))
+        else:
+            contents = [next() for i in xrange(numbers[0])]
+            result.append((op, None, numbers[0], contents))
     return result

 def apply_delta(basis, delta):
@@ -50,13 +55,11 @@
     last_offset = 0
     # eq ranges occur where gaps occur
     # start, end refer to offsets in basis
-    for start, end, count, delta_lines in delta:
-        if last_offset != start: # copy an eq range
-            lines.extend(basis[last_offset:start])
-        lines[start:end] = delta_lines
-        last_offset = end
-    if last_offset != len(basis):
-        lines.extend(basis[last_offset:])
+    for op, start, count, delta_lines in delta:
+        if op == 'c':
+            lines.extend(basis[start:start+count])
+        else:
+            lines.extend(delta_lines)
     trim_encoding_newline(lines)
     return lines
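
For readers unfamiliar with the encoding: the new delta format interleaves 'c,start,count' instructions, which copy count lines beginning at offset start of the output emitted so far, with 'i,count' instructions, which insert the next count literal lines. Below is a minimal standalone sketch of applying such a delta; the helper name apply_example_delta and the sample values are illustrative only, not part of the plugin.

def apply_example_delta(basis_lines, delta_lines):
    # basis_lines: the output lines the compressor has already emitted
    # delta_lines: interleaved 'c,start,count\n' / 'i,count\n' headers and payload
    out = []
    stream = iter(delta_lines)
    for header in stream:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            start, count = numbers
            out.extend(basis_lines[start:start + count])          # reuse a range
        else:
            out.extend(next(stream) for _ in range(numbers[0]))   # literal lines
    return out

# 'common\n' is copied from offset 1 of the basis; 'different\n' is inserted.
print(apply_example_delta(['strange\n', 'common\n'],
                          ['c,1,1\n', 'i,1\n', 'different\n']))
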
@@ -69,7 +72,20 @@
 class GroupCompressor(object):
-    """Produce a serialised group of compressed texts."""
+    """Produce a serialised group of compressed texts.
+
+    It contains code very similar to SequenceMatcher because it performs a
+    similar task. However, some key differences apply:
+    - there is no junk, we want a minimal edit not a human readable diff.
+    - we don't filter very common lines (because we don't know where a good
+      range will start, and after the first text we want to be emitting
+      minimal edits only).
+    - we chain the left side, not the right side.
+    - we incrementally update the adjacency matrix as new lines are provided.
+    - we look for matches in all of the left side, so the routine which does
+      the analogous task of find_longest_match does not need to filter on the
+      left side.
+    """

     def __init__(self, delta=True):
         """Create a GroupCompressor.
@@ -80,6 +96,8 @@
         self.lines = []
         self.endpoint = 0
         self.input_bytes = 0
+        # line: set(locations it appears at), set(N+1 for N in locations)
+        self.line_locations = {}

     def compress(self, key, lines, expected_sha):
         """Compress lines with label key.
@@ -104,26 +122,72 @@
         new_lines = []
         new_lines.append('label: %s\n' % label)
         new_lines.append('sha1: %s\n' % sha1)
-        if 0:
-            delta_seq = diff.difflib.SequenceMatcher(
-                None, self.lines, lines)
-        else:
-            delta_seq = patiencediff.PatienceSequenceMatcher(
-                None, self.lines, lines)
-        diff_hunks = []
-        for op in delta_seq.get_opcodes():
-            if op[0] == 'equal':
-                continue
-            diff_hunks.append((op[1], op[2], op[4]-op[3], lines[op[3]:op[4]]))
-        for start, end, count, new in diff_hunks:
-            new_lines.append('%d,%d,%d\n' % (start, end, count))
-            new_lines.extend(new)
-        self.endpoint += sum(map(len, new_lines))
-        self.lines.extend(new_lines)
+        pos = 0
+        line_locations = self.line_locations
+        accumulator = []
+        copying = False
+        new_len = 0
+        new_start = 0
+        # We either copy a range (while there are reusable lines) or we
+        # insert new lines. To find reusable lines we traverse
+        while pos < len(lines):
+            line = lines[pos]
+            if line not in line_locations:
+                if copying:
+                    # flush the copy
+                    copy_start = min(copy_ends) - copy_len
+                    new_lines.append("c,%d,%d\n" % (copy_start,copy_len))
+                    copying = False
+                    new_start = pos
+                    new_len = 1
+                else:
+                    new_len += 1
+            else:
+                if copying:
+                    locations, next = line_locations[line]
+                    next_locations = locations.intersection(copy_ends)
+                    if len(next_locations):
+                        # range continues
+                        copy_len += 1
+                        copy_ends = set(loc + 1 for loc in next_locations)
+                    else:
+                        # range stops, flush and start a new copy range
+                        copy_start = min(copy_ends) - copy_len
+                        new_lines.append("c,%d,%d\n" % (copy_start,copy_len))
+                        copy_len = 1
+                        copy_ends = next
+                else:
+                    # Flush
+                    if new_len:
+                        new_lines.append("i,%d\n" % new_len)
+                        new_lines.extend(lines[new_start:new_start+new_len])
+                    # setup a copy
+                    copy_len = 1
+                    copy_ends = line_locations[line][1]
+                    copying = True
+            pos += 1
+        if copying:
+            copy_start = min(copy_ends) - copy_len
+            new_lines.append("c,%d,%d\n" % (copy_start,copy_len))
+        elif new_len:
+            new_lines.append("i,%d\n" % new_len)
+            new_lines.extend(lines[new_start:new_start+new_len])
+
+        self.output_lines(new_lines)
         trim_encoding_newline(lines)
         self.input_bytes += sum(map(len, lines))
         return sha1, self.endpoint

+    def output_lines(self, new_lines):
+        self.endpoint += sum(map(len, new_lines))
+        offset = len(self.lines)
+        self.lines.extend(new_lines)
+        for pos, line in enumerate(new_lines):
+            indices, next_lines = self.line_locations.setdefault(line,
+                (set(), set()))
+            indices.add(pos + offset)
+            next_lines.add(pos + offset + 1)
+
     def ratio(self):
         """Return the overall compression ratio."""
         return float(self.input_bytes) / float(self.endpoint)
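
The compress loop above is the heart of the new encoder: it walks the incoming lines, and as long as the next line occurs at an offset adjacent to the current match set it extends a copy range; otherwise it flushes a 'c' or 'i' instruction and starts over. Because candidate offsets come from set intersections rather than a monotonically advancing cursor, successive copies may point backwards as well as forwards, which is where the additional compression in this revision comes from. The sketch below reimplements the same greedy idea as a standalone function; the name greedy_copy_ranges is invented for illustration, and the index is rebuilt from scratch here rather than maintained incrementally as output_lines does.

def greedy_copy_ranges(basis_lines, new_lines_in):
    # Rebuild the line -> (offsets, offsets + 1) index that the compressor
    # keeps incrementally in self.line_locations.
    locations = {}
    for pos, line in enumerate(basis_lines):
        indices, nxt = locations.setdefault(line, (set(), set()))
        indices.add(pos)
        nxt.add(pos + 1)
    ops = []            # ('c', start, length) or ('i', [lines]) instructions
    copy_ends = None    # candidate offsets just past the current copy range
    copy_len = 0
    pending = []        # literal lines waiting to be flushed as an insert
    for line in new_lines_in:
        if line in locations and copy_ends is None:
            # switch from inserting to copying
            if pending:
                ops.append(('i', pending))
                pending = []
            copy_ends, copy_len = set(locations[line][1]), 1
        elif line in locations and locations[line][0] & copy_ends:
            # the copy range continues at every offset adjacent to a match
            copy_ends = set(loc + 1 for loc in locations[line][0] & copy_ends)
            copy_len += 1
        else:
            # the current copy range (if any) cannot be extended: flush it
            if copy_ends is not None:
                ops.append(('c', min(copy_ends) - copy_len, copy_len))
                copy_ends, copy_len = None, 0
            if line in locations:
                copy_ends, copy_len = set(locations[line][1]), 1
            else:
                pending.append(line)
    if copy_ends is not None:
        ops.append(('c', min(copy_ends) - copy_len, copy_len))
    elif pending:
        ops.append(('i', pending))
    return ops

# Copies need not be monotonic: the second copy points before the first.
print(greedy_copy_ranges(['a\n', 'b\n'], ['b\n', 'a\n']))
# -> [('c', 1, 1), ('c', 0, 1)]
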
@@ -224,7 +288,7 @@
         # double handling for now. Make it work until then.
         bytes = ''.join(lines)
         record = FulltextContentFactory(key, parents, None, bytes)
-        sha1 = self._insert_record_stream([record])
+        sha1 = self._insert_record_stream([record]).next()
         return sha1, len(bytes), None

     def _check_add(self, key, lines, random_id, check_content):
@@ -267,6 +331,7 @@
         for record in stream:
             found_sha1, end_point = compressor.compress(record.key,
                 split_lines(record.get_bytes_as('fulltext')), record.sha1)
+            yield found_sha1


 class _GCGraphIndex(object):
     """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py 2008-07-06 03:15:54 +0000
+++ b/tests/test_groupcompress.py 2008-07-07 02:27:03 +0000
@@ -64,7 +64,7 @@
         expected_lines = [
             'label: label\n',
             'sha1: %s\n' % sha1,
-            '0,0,3\n',
+            'i,3\n',
             'strange\n',
             'common\n',
             '\n', # the last \n in a text is removed, which allows safe
@@ -77,26 +77,22 @@
         compressor = groupcompress.GroupCompressor(True)
         sha1_1, _ = compressor.compress(('label',),
             ['strange\n', 'common\n'], None)
+        expected_lines = list(compressor.lines)
         sha1_2, end_point = compressor.compress(('newlabel',),
             ['common\n', 'different\n'], None)
         self.assertEqual(sha_strings(['common\n', 'different\n']), sha1_2)
-        expected_lines = [
-            'label: label\n',
-            'sha1: %s\n' % sha1_1,
-            '0,0,3\n',
-            'strange\n',
-            'common\n',
-            '\n',
+        expected_lines.extend([
             'label: newlabel\n',
             'sha1: %s\n' % sha1_2,
-            # Delete what we don't want. Perhaps we want an implicit
-            # delete all to keep from bloating with useless delete
-            # instructions.
-            '0,4,0\n',
-            # add the new lines
-            '5,5,1\n',
+            # copy the line common
+            'c,4,1\n',
+            # add the line different
+            'i,1\n',
             'different\n',
-            ]
+            # copy the line \n. Note that when we filter on encoding-overhead
+            # this will become a fresh insert instead
+            'c,5,1\n',
+            ])
         self.assertEqual(expected_lines, compressor.lines)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
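
The offsets in the expected copy instructions count lines in the compressor's whole output so far, headers included: after the first text the stream holds six lines, putting 'common\n' at offset 4 and the trailing '\n' at offset 5, which is why the delta for the second text reads 'c,4,1' and 'c,5,1'. A quick check of that arithmetic, outside the test suite and with a placeholder sha1 value:

# Not part of the test suite: a quick check of where the copy offsets point.
first_output = [
    'label: label\n',
    'sha1: 0123456789012345678901234567890123456789\n',  # placeholder digest
    'i,3\n',
    'strange\n',
    'common\n',   # offset 4, hence 'c,4,1\n' for the reused 'common\n'
    '\n',         # offset 5, hence 'c,5,1\n' for the reused trailing newline
]
for offset, line in enumerate(first_output):
    print("%d %r" % (offset, line))
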
@@ -108,42 +104,26 @@
             ['strange\n', 'common\n'], None)
         sha1_2, _ = compressor.compress(('newlabel',),
             ['common\n', 'different\n', 'moredifferent\n'], None)
+        expected_lines = list(compressor.lines)
         sha1_3, end_point = compressor.compress(('label3',),
             ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
         self.assertEqual(
             sha_strings(['new\n', 'common\n', 'different\n', 'moredifferent\n']),
             sha1_3)
-        expected_lines = [
-            'label: label\n',
-            'sha1: %s\n' % sha1_1,
-            '0,0,3\n',
-            'strange\n',
-            'common\n',
-            '\n',
-            'label: newlabel\n',
-            'sha1: %s\n' % sha1_2,
-            # Delete what we don't want. Perhaps we want an implicit
-            # delete all to keep from bloating with useless delete
-            # instructions.
-            '0,4,0\n',
-            # add the new lines
-            '5,5,2\n',
-            'different\n',
-            'moredifferent\n',
+        expected_lines.extend([
             'label: label3\n',
             'sha1: %s\n' % sha1_3,
-            # Delete what we don't want. Perhaps we want an implicit
-            # delete all to keep from bloating with useless delete
-            # instructions.
-            # replace 'strange' with 'new'
-            '0,4,1\n',
+            # insert new
+            'i,1\n',
             'new\n',
-            # delete from after common up to differnet
-            '5,10,0\n',
-            # add new \n
-            '12,12,1\n',
-            '\n',
-            ]
+            # copy the line common
+            'c,4,1\n',
+            # copy the lines different, moredifferent
+            'c,10,2\n',
+            # copy the line \n. Note that when we filter on encoding-overhead
+            # this will become a fresh insert instead
+            'c,5,1\n',
+            ])
         self.assertEqualDiff(''.join(expected_lines), ''.join(compressor.lines))
         self.assertEqual(sum(map(len, expected_lines)), end_point)