Rev 3914: Move even more functionality into EquivalenceTable. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam
John Arbash Meinel
john at arbash-meinel.com
Wed Mar 25 21:20:39 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam
------------------------------------------------------------
revno: 3914
revision-id: john at arbash-meinel.com-20090325212018-nju3kld8lvjhzn9y
parent: john at arbash-meinel.com-20090325210346-f4c2x0dh17dlj4mi
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: vilajam
timestamp: Wed 2009-03-25 16:20:18 -0500
message:
Move even more functionality into EquivalenceTable.
At this point, ET takes care of the delta generation, which is nice.
Also, I uncovered a bug in how we handle inserts of more than one
line: the matcher is strictly line-based, but the insert was chunked
by byte count, so it could create a 'line' that could never be
matched.
Once we tie in 'make_delta' we can run most of the tests, and then
start working on apply_delta().
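
To illustrate the bug (a sketch for this mail, not code from the
commit): the old _flush_insert appended each 127-byte insert chunk to
the output as a single entry and indexed it as one 'line', so any
chunk spanning multiple lines could never match anything. The fix is
to split the chunk back into real lines before indexing, which is
what osutils.split_lines does:

    from bzrlib import osutils

    chunk = 'common line\nmore text\ntail'  # one <=127-byte insert chunk
    # Before: indexed as the single pseudo-line
    #   'common line\nmore text\ntail'
    # which can never equal any real line in the source text.
    # After: split back into lines first, so the complete lines match:
    print osutils.split_lines(chunk)
    # => ['common line\n', 'more text\n', 'tail']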
=== modified file 'bzrlib/_groupcompress_py.py'
--- a/bzrlib/_groupcompress_py.py 2009-03-25 21:03:46 +0000
+++ b/bzrlib/_groupcompress_py.py 2009-03-25 21:20:18 +0000
@@ -20,6 +20,9 @@
useless stuff.
"""
+from bzrlib import osutils
+
+
### v imported from gc plugin at revno30
class EquivalenceTable(object):
"""This class tracks equivalencies between lists of hashable objects.
@@ -171,6 +174,94 @@
"""Set the lines we will be matching against."""
self._right_lines = lines
+ def _flush_insert(self, start_linenum, end_linenum,
+ new_lines, out_lines, index_lines):
+ """Add an 'insert' request to the data stream."""
+ bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
+ insert_length = len(bytes_to_insert)
+ # Each insert instruction is at most 127 bytes long
+ for start_byte in xrange(0, insert_length, 127):
+ insert_count = min(insert_length - start_byte, 127)
+ assert insert_count <= 127
+ out_lines.append(chr(insert_count))
+ # Don't index the 'insert' instruction
+ index_lines.append(False)
+ insert = bytes_to_insert[start_byte:start_byte+insert_count]
+ as_lines = osutils.split_lines(insert)
+ out_lines.extend(as_lines)
+ index_lines.extend([True]*len(as_lines))
+
+ def _flush_copy(self, old_start_linenum, num_lines,
+ out_lines, index_lines):
+ if old_start_linenum == 0:
+ first_byte = 0
+ else:
+ first_byte = self.line_offsets[old_start_linenum - 1]
+ stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
+ num_bytes = stop_byte - first_byte
+ # The data stream allows copies longer than 64KiB, but to match the
+ # compiled code we also limit each copy instruction to 64KiB
+ for start_byte in xrange(first_byte, stop_byte, 64*1024):
+ num_bytes = min(64*1024, stop_byte - start_byte)
+ copy_bytes = encode_copy_instruction(start_byte, num_bytes)
+ out_lines.append(copy_bytes)
+ index_lines.append(False)
+
+ def make_delta(self, new_lines, soft=False):
+ """Compute the delta for this content versus the original content."""
+ from bzrlib.groupcompress import encode_base128_int
+ # reserved for content type, content length, source_len, target_len
+ out_lines = ['', '', '', '']
+ index_lines = [False, False, False, False]
+ blocks = self.get_matching_blocks(new_lines, soft=soft)
+ current_line_num = 0
+ # We either copy a range (while there are reusable lines) or we insert
+ # new lines; the reusable ranges come from get_matching_blocks
+ for old_start, new_start, range_len in blocks:
+ if new_start != current_line_num:
+ # non-matching region, insert the content
+ self._flush_insert(current_line_num, new_start,
+ new_lines, out_lines, index_lines)
+ current_line_num = new_start + range_len
+ if range_len:
+ self._flush_copy(old_start, range_len, out_lines, index_lines)
+ out_lines[2] = encode_base128_int(self.endpoint)
+ out_lines[3] = encode_base128_int(sum(map(len, new_lines)))
+ return out_lines, index_lines
+
+
+
+def encode_copy_instruction(offset, length):
+ """Convert this offset into a control code and bytes."""
+ copy_command = 0x80
+ copy_bytes = [None]
+
+ for copy_bit in (0x01, 0x02, 0x04, 0x08):
+ base_byte = offset & 0xff
+ if base_byte:
+ copy_command |= copy_bit
+ copy_bytes.append(chr(base_byte))
+ offset >>= 8
+ if length is None:
+ # None is used by the test suite
+ copy_bytes[0] = chr(copy_command)
+ return ''.join(copy_bytes)
+ if length > 0x10000:
+ raise ValueError("we don't emit copy records for lengths > 64KiB")
+ if length == 0:
+ raise ValueError("We cannot emit a copy of length 0")
+ if length != 0x10000:
+ # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
+ # since that saves bytes for large chained copies
+ for copy_bit in (0x10, 0x20):
+ base_byte = length & 0xff
+ if base_byte:
+ copy_command |= copy_bit
+ copy_bytes.append(chr(base_byte))
+ length >>= 8
+ copy_bytes[0] = chr(copy_command)
+ return ''.join(copy_bytes)
+
def make_delta(source_bytes, target_bytes):
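
For reference, a worked example of the copy-instruction layout encoded
by encode_copy_instruction above (byte values computed by hand from the
function, not taken from the commit): the command byte starts at 0x80,
bits 0x01..0x08 flag which little-endian offset bytes follow, bits
0x10/0x20 do the same for the length bytes, and a length of exactly
64KiB is sent with no length bytes at all.

    # offset 0x1234 needs two offset bytes (bits 0x01|0x02); length 10
    # needs one length byte (bit 0x10), so the command byte is
    # 0x80 | 0x01 | 0x02 | 0x10 == 0x93
    assert encode_copy_instruction(0x1234, 10) == '\x93\x34\x12\x0a'
    # the 64KiB special case: offset 0 and length 0x10000 emit just the
    # bare command byte -- no offset bytes, no length bytes
    assert encode_copy_instruction(0, 64 * 1024) == '\x80'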
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-25 21:03:46 +0000
+++ b/bzrlib/groupcompress.py 2009-03-25 21:20:18 +0000
@@ -84,38 +84,6 @@
return val, offset
-def encode_copy_instruction(offset, length):
- """Convert this offset into a control code and bytes."""
- copy_command = 0x80
- copy_bytes = [None]
-
- for copy_bit in (0x01, 0x02, 0x04, 0x08):
- base_byte = offset & 0xff
- if base_byte:
- copy_command |= copy_bit
- copy_bytes.append(chr(base_byte))
- offset >>= 8
- if length is None:
- # None is used by the test suite
- copy_bytes[0] = chr(copy_command)
- return ''.join(copy_bytes)
- if length > 0x10000:
- raise ValueError("we don't emit copy records for lengths > 64KiB")
- if length == 0:
- raise ValueError("We cannot emit a copy of length 0")
- if length != 0x10000:
- # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
- # since that saves bytes for large chained copies
- for copy_bit in (0x10, 0x20):
- base_byte = length & 0xff
- if base_byte:
- copy_command |= copy_bit
- copy_bytes.append(chr(base_byte))
- length >>= 8
- copy_bytes[0] = chr(copy_command)
- return ''.join(copy_bytes)
-
-
def sort_gc_optimal(parent_map):
"""Sort and group the keys in parent_map into groupcompress order.
@@ -878,21 +846,8 @@
raise errors.ExistingContent()
if key[-1] is None:
key = key[:-1] + ('sha1:' + sha1,)
- # reserved for content type, content length, source_len, target_len
- out_lines = ['', '', '', '']
- index_lines = [False, False, False, False]
- blocks = self.line_locations.get_matching_blocks(new_lines, soft=soft)
- current_line_num = 0
- # We either copy a range (while there are reusable lines) or we
- # insert new lines. To find reusable lines we traverse
- for old_start, new_start, range_len in blocks:
- if new_start != current_line_num:
- # non-matching region, insert the content
- self._flush_insert(current_line_num, new_start,
- new_lines, out_lines, index_lines)
- current_line_num = new_start + range_len
- if range_len:
- self._flush_copy(old_start, range_len, out_lines, index_lines)
+ out_lines, index_lines = self.line_locations.make_delta(new_lines,
+ soft=soft)
delta_length = sum(map(len, out_lines))
if delta_length * 2 > bytes_length:
# The delta is longer than the fulltext, insert a fulltext
@@ -906,11 +861,7 @@
# this is a worthy delta, output it
type = 'delta'
out_lines[0] = 'd'
- # A delta record includes the source and target lengths
- out_lines[2] = encode_base128_int(self.endpoint)
- out_lines[3] = encode_base128_int(bytes_length)
# Update the delta_length to include those two encoded integers
- delta_length += len(out_lines[2]) + len(out_lines[3])
out_lines[1] = encode_base128_int(delta_length)
out_length = len(out_lines[3]) + 1 + delta_length
self._block.add_entry(key, type=type, sha1=sha1,
@@ -924,38 +875,6 @@
# FIXME: lot of guessing below
return sha1, start, self.endpoint, 'delta', out_length
- def _flush_insert(self, start_linenum, end_linenum,
- new_lines, out_lines, index_lines):
- """Add an 'insert' request to the data stream."""
- bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
- insert_length = len(bytes_to_insert)
- # Each insert instruction is at most 127 bytes long
- for start_byte in xrange(0, insert_length, 127):
- insert_count = min(insert_length - start_byte, 127)
- assert insert_count <= 127
- out_lines.append(chr(insert_count))
- # Don't index the 'insert' instruction
- index_lines.append(False)
- insert = bytes_to_insert[start_byte:start_byte+insert_count]
- out_lines.append(insert)
- index_lines.append(True)
-
- def _flush_copy(self, old_start_linenum, num_lines,
- out_lines, index_lines):
- if old_start_linenum == 0:
- first_byte = 0
- else:
- first_byte = self.line_locations.line_offsets[old_start_linenum - 1]
- stop_byte = self.line_locations.line_offsets[old_start_linenum + num_lines - 1]
- num_bytes = stop_byte - first_byte
- # The data stream allows >64kB in a copy, but to match the compiled
- # code, we will also limit it to a 64kB copy
- for start_byte in xrange(first_byte, stop_byte, 64*1024):
- num_bytes = min(64*1024, stop_byte - first_byte)
- copy_bytes = encode_copy_instruction(start_byte, num_bytes)
- out_lines.append(copy_bytes)
- index_lines.append(False)
-
def flush(self):
# FIXME: ugly hack to masquerade ourself as the pyrex version
self._block.set_content(''.join(self.lines))
@@ -1989,6 +1908,7 @@
from bzrlib._groupcompress_py import (
apply_delta,
+ encode_copy_instruction,
EquivalenceTable,
)
try:
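
With this change the compressor side reduces to a single call (a
sketch: make_delta and its return shape are from the diff above, while
the EquivalenceTable construction details are assumed):

    from bzrlib._groupcompress_py import EquivalenceTable

    source_lines = ['common line\n', 'more text\n']
    new_lines = ['common line\n', 'different\n']
    table = EquivalenceTable(source_lines)  # constructor signature assumed
    out_lines, index_lines = table.make_delta(new_lines)
    # out_lines[0:4] are the reserved header slots (content type, content
    # length, source_len, target_len); the rest alternate copy/insert
    # instructions and their payload lines.  index_lines marks which
    # entries should be fed back into the match index.
    delta = ''.join(out_lines)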
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-25 20:58:16 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-25 21:20:18 +0000
@@ -127,15 +127,22 @@
def test_stats(self):
compressor = self.compressor()
- compressor.compress(('label',), 'strange\ncommon long line\n'
- 'plus more text\n', None)
+ compressor.compress(('label',),
+ 'strange\n'
+ 'common very very long line\n'
+ 'plus more text\n', None)
compressor.compress(('newlabel',),
- 'common long line\nplus more text\n'
- 'different\nmoredifferent\n', None)
+ 'common very very long line\n'
+ 'plus more text\n'
+ 'different\n'
+ 'moredifferent\n', None)
compressor.compress(('label3',),
- 'new\ncommon long line\nplus more text\n'
- '\ndifferent\nmoredifferent\n', None)
- self.assertAlmostEqual(1.4, compressor.ratio(), 1)
+ 'new\n'
+ 'common very very long line\n'
+ 'plus more text\n'
+ 'different\n'
+ 'moredifferent\n', None)
+ self.assertAlmostEqual(1.9, compressor.ratio(), 1)
def test_two_nosha_delta(self):
compressor = self.compressor()
@@ -199,15 +206,22 @@
def test_stats(self):
compressor = self.compressor()
- compressor.compress(('label',), 'strange\ncommon long line\n'
- 'plus more text\n', None)
+ compressor.compress(('label',),
+ 'strange\n'
+ 'common very very long line\n'
+ 'plus more text\n', None)
compressor.compress(('newlabel',),
- 'common long line\nplus more text\n'
- 'different\nmoredifferent\n', None)
+ 'common very very long line\n'
+ 'plus more text\n'
+ 'different\n'
+ 'moredifferent\n', None)
compressor.compress(('label3',),
- 'new\ncommon long line\nplus more text\n'
- '\ndifferent\nmoredifferent\n', None)
- self.assertAlmostEqual(1.1, compressor.ratio(), 1)
+ 'new\n'
+ 'common very very long line\n'
+ 'plus more text\n'
+ 'different\n'
+ 'moredifferent\n', None)
+ self.assertAlmostEqual(1.9, compressor.ratio(), 1)
def test_two_nosha_delta(self):
compressor = self.compressor()