Rev 3918: Implement make_delta and apply_delta. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

Fri Mar 27 20:12:29 GMT 2009

At http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

------------------------------------------------------------
revno: 3918
revision-id: john at arbash-meinel.com-20090327201212-1ykalx15yr1cquxt
parent: john at arbash-meinel.com-20090327193046-9i5yqw3estnvgv5a
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: vilajam
timestamp: Fri 2009-03-27 15:12:12 -0500
message:
  Implement make_delta and apply_delta.
  
  Update the permuted tests so that both implementations are tested.
-------------- next part --------------
=== modified file 'bzrlib/_groupcompress_py.py'

--- a/bzrlib/_groupcompress_py.py	2009-03-27 19:30:46 +0000
+++ b/bzrlib/_groupcompress_py.py	2009-03-27 20:12:12 +0000
@@ -206,10 +206,12 @@
             out_lines.append(copy_bytes)
             index_lines.append(False)
 
-    def make_delta(self, new_lines, soft=False):
+    def make_delta(self, new_lines, bytes_length=None, soft=False):
         """Compute the delta for this content versus the original content."""
-        # reserved for content type, content length, target_len
-        out_lines = ['', '', '']
+        if bytes_length is None:
+            bytes_length = sum(map(len, new_lines))
+        # reserved for content type, content length
+        out_lines = ['', '', encode_base128_int(bytes_length)]
         index_lines = [False, False, False]
         blocks = self.get_matching_blocks(new_lines, soft=soft)
         current_line_num = 0
@@ -226,6 +228,33 @@
         return out_lines, index_lines
 
 
+def encode_base128_int(val):
+    """Convert an integer into a 7-bit lsb encoding."""
+    bytes = []
+    count = 0
+    while val >= 0x80:
+        bytes.append(chr((val | 0x80) & 0xFF))
+        val >>= 7
+    bytes.append(chr(val))
+    return ''.join(bytes)
+
+
+def decode_base128_int(bytes):
+    """Decode an integer from a 7-bit lsb encoding."""
+    offset = 0
+    val = 0
+    shift = 0
+    bval = ord(bytes[offset])
+    while bval >= 0x80:
+        val |= (bval & 0x7F) << shift
+        shift += 7
+        offset += 1
+        bval = ord(bytes[offset])
+    val |= bval << shift
+    offset += 1
+    return val, offset
+
+
 def encode_copy_instruction(offset, length):
     """Convert this offset into a control code and bytes."""
     copy_command = 0x80
@@ -258,27 +287,87 @@
     return ''.join(copy_bytes)
 
 
+def decode_copy_instruction(bytes, cmd, pos):
+    """Decode a copy instruction from the next few bytes.
+
+    A copy instruction is a variable number of bytes, so we will parse the
+    bytes we care about, and return the new position, as well as the offset and
+    length referred to in the bytes.
+
+    :param bytes: A string of bytes
+    :param cmd: The command code
+    :param pos: The position in bytes right after the copy command
+    :return: (offset, length, newpos)
+        The offset of the copy start, the number of bytes to copy, and the
+        position after the last byte of the copy
+    """
+    if cmd & 0x80 != 0x80:
+        raise ValueError('copy instructions must have bit 0x80 set')
+    offset = 0
+    length = 0
+    if (cmd & 0x01):
+        offset = ord(bytes[pos])
+        pos += 1
+    if (cmd & 0x02):
+        offset = offset | (ord(bytes[pos]) << 8)
+        pos += 1
+    if (cmd & 0x04):
+        offset = offset | (ord(bytes[pos]) << 16)
+        pos += 1
+    if (cmd & 0x08):
+        offset = offset | (ord(bytes[pos]) << 24)
+        pos += 1
+    if (cmd & 0x10):
+        length = ord(bytes[pos])
+        pos += 1
+    if (cmd & 0x20):
+        length = length | (ord(bytes[pos]) << 8)
+        pos += 1
+    if (cmd & 0x40):
+        length = length | (ord(bytes[pos]) << 16)
+        pos += 1
+    if length == 0:
+        length = 65536
+    return (offset, length, pos)
+
 
 def make_delta(source_bytes, target_bytes):
     """Create a delta from source to target."""
     # TODO: The checks below may not be a the right place yet.
-    if not isinstance(source_bytes, str):
+    if type(source_bytes) is not str:
         raise TypeError('source is not a str')
-    if not isinstance(target_bytes, str):
+    if type(target_bytes) is not str:
         raise TypeError('target is not a str')
     line_locations = EquivalenceTable([])
-    return None
+    source_lines = osutils.split_lines(source_bytes)
+    line_locations.extend_lines(source_lines, [True]*len(source_lines))
+    delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
+                                         bytes_length=len(target_bytes))
+    return ''.join(delta)
 
 
 def apply_delta(basis, delta):
     """Apply delta to this object to become new_version_id."""
+    if type(basis) is not str:
+        raise TypeError('basis is not a str')
+    if type(delta) is not str:
+        raise TypeError('delta is not a str')
+    target_length, pos = decode_base128_int(delta)
     lines = []
-    last_offset = 0
-    # eq ranges occur where gaps occur
-    # start, end refer to offsets in basis
-    for op, start, count, delta_lines in delta:
-        if op == 'c':
-            lines.append(basis[start:start+count])
-        else:
-            lines.extend(delta_lines)
-    return lines
+    len_delta = len(delta)
+    while pos < len_delta:
+        cmd = ord(delta[pos])
+        pos += 1
+        if cmd & 0x80:
+            offset, length, pos = decode_copy_instruction(delta, cmd, pos)
+            lines.append(basis[offset:offset+length])
+        else: # Insert of 'cmd' bytes
+            if cmd == 0:
+                raise ValueError('Command == 0 not supported yet')
+            lines.append(delta[pos:pos+cmd])
+            pos += cmd
+    bytes = ''.join(lines)
+    if len(bytes) != target_length:
+        raise ValueError('Delta claimed to be %d long, but ended up'
+                         ' %d long' % (target_length, len(bytes)))
+    return bytes

=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-27 19:30:46 +0000
+++ b/bzrlib/groupcompress.py	2009-03-27 20:12:12 +0000
@@ -55,33 +55,6 @@
 _null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
 
 
-def encode_base128_int(val):
-    """Convert an integer into a 7-bit lsb encoding."""
-    bytes = []
-    count = 0
-    while val >= 0x80:
-        bytes.append(chr((val | 0x80) & 0xFF))
-        val >>= 7
-    bytes.append(chr(val))
-    return ''.join(bytes)
-
-
-def decode_base128_int(bytes):
-    """Decode an integer from a 7-bit lsb encoding."""
-    offset = 0
-    val = 0
-    shift = 0
-    bval = ord(bytes[offset])
-    while bval >= 0x80:
-        val |= (bval & 0x7F) << shift
-        shift += 7
-        offset += 1
-        bval = ord(bytes[offset])
-    val |= bval << shift
-    offset += 1
-    return val, offset
-
-
 def sort_gc_optimal(parent_map):
     """Sort and group the keys in parent_map into groupcompress order.
 
@@ -784,8 +757,7 @@
         bytes_length = len(bytes)
         new_lines = osutils.split_lines(bytes)
         out_lines, index_lines = self.line_locations.make_delta(new_lines,
-                                                                soft=soft)
-        out_lines[2] = encode_base128_int(bytes_length)
+            bytes_length=bytes_length, soft=soft)
         delta_length = sum(map(len, out_lines))
         if delta_length > max_delta_size:
             # The delta is longer than the fulltext, insert a fulltext
@@ -1809,6 +1781,8 @@
 
 from bzrlib._groupcompress_py import (
     apply_delta,
+    encode_base128_int,
+    decode_base128_int,
     encode_copy_instruction,
     EquivalenceTable,
     )

=== modified file 'bzrlib/tests/test__groupcompress.py'
--- a/bzrlib/tests/test__groupcompress.py	2009-03-27 19:30:46 +0000
+++ b/bzrlib/tests/test__groupcompress.py	2009-03-27 20:12:12 +0000
@@ -14,7 +14,7 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
-"""Tests for the pyrex extension of groupcompress"""
+"""Tests for the python and pyrex extensions of groupcompress"""
 
 from bzrlib import (
     groupcompress,
@@ -138,16 +138,35 @@
         ident_delta = self.make_delta(_text3, _text3)
         self.assertEqual('\x87\x01\x90\x87', ident_delta)
 
+    def assertDeltaIn(self, delta1, delta2, delta):
+        """Make sure that the delta bytes match one of the expectations."""
+        # In general, the python delta matcher gives different results than the
+        # pyrex delta matcher. Both should be valid deltas, though.
+        if delta not in (delta1, delta2):
+            self.fail("Delta bytes:\n"
+                      "       %r\n"
+                      "not in %r\n"
+                      "    or %r"
+                      % (delta, delta1, delta2))
+
     def test_make_delta(self):
         delta = self.make_delta(_text1, _text2)
-        self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
+        self.assertDeltaIn(
+            'N\x90/\x1fdiffer from\nagainst other text\n',
+            'N\x90\x1d\x1ewhich is meant to differ from\n\x91:\x13',
+            delta)
         delta = self.make_delta(_text2, _text1)
-        self.assertEqual('M\x90/\x1ebe matched\nagainst other text\n', delta)
+        self.assertDeltaIn(
+            'M\x90/\x1ebe matched\nagainst other text\n',
+            'M\x90\x1d\x1dwhich is meant to be matched\n\x91;\x13',
+            delta)
         delta = self.make_delta(_text3, _text1)
         self.assertEqual('M\x90M', delta)
         delta = self.make_delta(_text3, _text2)
-        self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n',
-                         delta)
+        self.assertDeltaIn(
+            'N\x90/\x1fdiffer from\nagainst other text\n',
+            'N\x90\x1d\x1ewhich is meant to differ from\n\x91:\x13',
+            delta)
 
     def test_apply_delta_is_typesafe(self):
         self.apply_delta(_text1, 'M\x90M')
@@ -263,3 +282,133 @@
         self.assertEqual(_fourth_text,
                          self._gc_module.apply_delta(source, fifth_delta))
         self.assertEqual('\x80\x01\x91\xa7\x7f\x01\n', fifth_delta)
+
+
+class TestCopyInstruction(tests.TestCase):
+
+    def assertEncode(self, expected, offset, length):
+        bytes = _groupcompress_py.encode_copy_instruction(offset, length)
+        if expected != bytes:
+            self.assertEqual([hex(ord(e)) for e in expected],
+                             [hex(ord(b)) for b in bytes])
+
+    def assertDecode(self, exp_offset, exp_length, exp_newpos, bytes, pos):
+        cmd = ord(bytes[pos])
+        pos += 1
+        out = _groupcompress_py.decode_copy_instruction(bytes, cmd, pos)
+        self.assertEqual((exp_offset, exp_length, exp_newpos), out)
+
+    def test_encode_no_length(self):
+        self.assertEncode('\x80', 0, None)
+        self.assertEncode('\x81\x01', 1, None)
+        self.assertEncode('\x81\x0a', 10, None)
+        self.assertEncode('\x81\xff', 255, None)
+        self.assertEncode('\x82\x01', 256, None)
+        self.assertEncode('\x83\x01\x01', 257, None)
+        self.assertEncode('\x8F\xff\xff\xff\xff', 0xFFFFFFFF, None)
+        self.assertEncode('\x8E\xff\xff\xff', 0xFFFFFF00, None)
+        self.assertEncode('\x8D\xff\xff\xff', 0xFFFF00FF, None)
+        self.assertEncode('\x8B\xff\xff\xff', 0xFF00FFFF, None)
+        self.assertEncode('\x87\xff\xff\xff', 0x00FFFFFF, None)
+        self.assertEncode('\x8F\x04\x03\x02\x01', 0x01020304, None)
+
+    def test_encode_no_offset(self):
+        self.assertEncode('\x90\x01', 0, 1)
+        self.assertEncode('\x90\x0a', 0, 10)
+        self.assertEncode('\x90\xff', 0, 255)
+        self.assertEncode('\xA0\x01', 0, 256)
+        self.assertEncode('\xB0\x01\x01', 0, 257)
+        self.assertEncode('\xB0\xff\xff', 0, 0xFFFF)
+        # Special case, if copy == 64KiB, then we store exactly 0
+        # Note that this puns with a copy of exactly 0 bytes, but we don't care
+        # about that, as we would never actually copy 0 bytes
+        self.assertEncode('\x80', 0, 64*1024)
+
+    def test_encode(self):
+        self.assertEncode('\x91\x01\x01', 1, 1)
+        self.assertEncode('\x91\x09\x0a', 9, 10)
+        self.assertEncode('\x91\xfe\xff', 254, 255)
+        self.assertEncode('\xA2\x02\x01', 512, 256)
+        self.assertEncode('\xB3\x02\x01\x01\x01', 258, 257)
+        self.assertEncode('\xB0\x01\x01', 0, 257)
+        # Special case, if copy == 64KiB, then we store exactly 0
+        # Note that this puns with a copy of exactly 0 bytes, but we don't care
+        # about that, as we would never actually copy 0 bytes
+        self.assertEncode('\x81\x0a', 10, 64*1024)
+
+    def test_decode_no_length(self):
+        # If length is 0, it is interpreted as 64KiB
+        # The shortest possible instruction is a copy of 64KiB from offset 0
+        self.assertDecode(0, 65536, 1, '\x80', 0)
+        self.assertDecode(1, 65536, 2, '\x81\x01', 0)
+        self.assertDecode(10, 65536, 2, '\x81\x0a', 0)
+        self.assertDecode(255, 65536, 2, '\x81\xff', 0)
+        self.assertDecode(256, 65536, 2, '\x82\x01', 0)
+        self.assertDecode(257, 65536, 3, '\x83\x01\x01', 0)
+        self.assertDecode(0xFFFFFFFF, 65536, 5, '\x8F\xff\xff\xff\xff', 0)
+        self.assertDecode(0xFFFFFF00, 65536, 4, '\x8E\xff\xff\xff', 0)
+        self.assertDecode(0xFFFF00FF, 65536, 4, '\x8D\xff\xff\xff', 0)
+        self.assertDecode(0xFF00FFFF, 65536, 4, '\x8B\xff\xff\xff', 0)
+        self.assertDecode(0x00FFFFFF, 65536, 4, '\x87\xff\xff\xff', 0)
+        self.assertDecode(0x01020304, 65536, 5, '\x8F\x04\x03\x02\x01', 0)
+
+    def test_decode_no_offset(self):
+        self.assertDecode(0, 1, 2, '\x90\x01', 0)
+        self.assertDecode(0, 10, 2, '\x90\x0a', 0)
+        self.assertDecode(0, 255, 2, '\x90\xff', 0)
+        self.assertDecode(0, 256, 2, '\xA0\x01', 0)
+        self.assertDecode(0, 257, 3, '\xB0\x01\x01', 0)
+        self.assertDecode(0, 65535, 3, '\xB0\xff\xff', 0)
+        # Special case, if copy == 64KiB, then we store exactly 0
+        # Note that this puns with a copy of exactly 0 bytes, but we don't care
+        # about that, as we would never actually copy 0 bytes
+        self.assertDecode(0, 65536, 1, '\x80', 0)
+
+    def test_decode(self):
+        self.assertDecode(1, 1, 3, '\x91\x01\x01', 0)
+        self.assertDecode(9, 10, 3, '\x91\x09\x0a', 0)
+        self.assertDecode(254, 255, 3, '\x91\xfe\xff', 0)
+        self.assertDecode(512, 256, 3, '\xA2\x02\x01', 0)
+        self.assertDecode(258, 257, 5, '\xB3\x02\x01\x01\x01', 0)
+        self.assertDecode(0, 257, 3, '\xB0\x01\x01', 0)
+
+    def test_decode_not_start(self):
+        self.assertDecode(1, 1, 6, 'abc\x91\x01\x01def', 3)
+        self.assertDecode(9, 10, 5, 'ab\x91\x09\x0ade', 2)
+        self.assertDecode(254, 255, 6, 'not\x91\xfe\xffcopy', 3)
+
+
+class TestBase128Int(tests.TestCase):
+
+    def assertEqualEncode(self, bytes, val):
+        self.assertEqual(bytes, _groupcompress_py.encode_base128_int(val))
+
+    def assertEqualDecode(self, val, num_decode, bytes):
+        self.assertEqual((val, num_decode),
+                         _groupcompress_py.decode_base128_int(bytes))
+
+    def test_encode(self):
+        self.assertEqualEncode('\x01', 1)
+        self.assertEqualEncode('\x02', 2)
+        self.assertEqualEncode('\x7f', 127)
+        self.assertEqualEncode('\x80\x01', 128)
+        self.assertEqualEncode('\xff\x01', 255)
+        self.assertEqualEncode('\x80\x02', 256)
+        self.assertEqualEncode('\xff\xff\xff\xff\x0f', 0xFFFFFFFF)
+
+    def test_decode(self):
+        self.assertEqualDecode(1, 1, '\x01')
+        self.assertEqualDecode(2, 1, '\x02')
+        self.assertEqualDecode(127, 1, '\x7f')
+        self.assertEqualDecode(128, 2, '\x80\x01')
+        self.assertEqualDecode(255, 2, '\xff\x01')
+        self.assertEqualDecode(256, 2, '\x80\x02')
+        self.assertEqualDecode(0xFFFFFFFF, 5, '\xff\xff\xff\xff\x0f')
+
+    def test_decode_with_trailing_bytes(self):
+        self.assertEqualDecode(1, 1, '\x01abcdef')
+        self.assertEqualDecode(127, 1, '\x7f\x01')
+        self.assertEqualDecode(128, 2, '\x80\x01abcdef')
+        self.assertEqualDecode(255, 2, '\xff\x01\xff')
+
+

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-27 19:30:46 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-27 20:12:12 +0000
@@ -279,88 +279,6 @@
         self.assertEqual(sum(map(len, expected_lines)), end_point)
 
 
-class TestEncodeCopyInstruction(tests.TestCase):
-
-    def assertCopyInstruction(self, expected, offset, length):
-        bytes = groupcompress.encode_copy_instruction(offset, length)
-        if expected != bytes:
-            self.assertEqual([hex(ord(e)) for e in expected],
-                             [hex(ord(b)) for b in bytes])
-
-    def test_encode_no_length(self):
-        self.assertCopyInstruction('\x80', 0, None)
-        self.assertCopyInstruction('\x81\x01', 1, None)
-        self.assertCopyInstruction('\x81\x0a', 10, None)
-        self.assertCopyInstruction('\x81\xff', 255, None)
-        self.assertCopyInstruction('\x82\x01', 256, None)
-        self.assertCopyInstruction('\x83\x01\x01', 257, None)
-        self.assertCopyInstruction('\x8F\xff\xff\xff\xff', 0xFFFFFFFF, None)
-        self.assertCopyInstruction('\x8E\xff\xff\xff', 0xFFFFFF00, None)
-        self.assertCopyInstruction('\x8D\xff\xff\xff', 0xFFFF00FF, None)
-        self.assertCopyInstruction('\x8B\xff\xff\xff', 0xFF00FFFF, None)
-        self.assertCopyInstruction('\x87\xff\xff\xff', 0x00FFFFFF, None)
-        self.assertCopyInstruction('\x8F\x04\x03\x02\x01', 0x01020304, None)
-
-    def test_encode_no_offset(self):
-        self.assertCopyInstruction('\x90\x01', 0, 1)
-        self.assertCopyInstruction('\x90\x0a', 0, 10)
-        self.assertCopyInstruction('\x90\xff', 0, 255)
-        self.assertCopyInstruction('\xA0\x01', 0, 256)
-        self.assertCopyInstruction('\xB0\x01\x01', 0, 257)
-        self.assertCopyInstruction('\xB0\x01\x01', 0, 257)
-        self.assertCopyInstruction('\xB0\xff\xff', 0, 0xFFFF)
-        # Special case, if copy == 64KiB, then we store exactly 0
-        # Note that this puns with a copy of exactly 0 bytes, but we don't care
-        # about that, as we would never actually copy 0 bytes
-        self.assertCopyInstruction('\x80', 0, 64*1024)
-
-    def test_encode(self):
-        self.assertCopyInstruction('\x91\x01\x01', 1, 1)
-        self.assertCopyInstruction('\x91\x09\x0a', 9, 10)
-        self.assertCopyInstruction('\x91\xfe\xff', 254, 255)
-        self.assertCopyInstruction('\xA2\x02\x01', 512, 256)
-        self.assertCopyInstruction('\xB3\x02\x01\x01\x01', 258, 257)
-        self.assertCopyInstruction('\xB0\x01\x01', 0, 257)
-        # Special case, if copy == 64KiB, then we store exactly 0
-        # Note that this puns with a copy of exactly 0 bytes, but we don't care
-        # about that, as we would never actually copy 0 bytes
-        self.assertCopyInstruction('\x81\x0a', 10, 64*1024)
-
-
-class TestBase128Int(tests.TestCase):
-
-    def assertEqualEncode(self, bytes, val):
-        self.assertEqual(bytes, groupcompress.encode_base128_int(val))
-
-    def assertEqualDecode(self, val, num_decode, bytes):
-        self.assertEqual((val, num_decode),
-                         groupcompress.decode_base128_int(bytes))
-
-    def test_encode(self):
-        self.assertEqualEncode('\x01', 1)
-        self.assertEqualEncode('\x02', 2)
-        self.assertEqualEncode('\x7f', 127)
-        self.assertEqualEncode('\x80\x01', 128)
-        self.assertEqualEncode('\xff\x01', 255)
-        self.assertEqualEncode('\x80\x02', 256)
-        self.assertEqualEncode('\xff\xff\xff\xff\x0f', 0xFFFFFFFF)
-
-    def test_decode(self):
-        self.assertEqualDecode(1, 1, '\x01')
-        self.assertEqualDecode(2, 1, '\x02')
-        self.assertEqualDecode(127, 1, '\x7f')
-        self.assertEqualDecode(128, 2, '\x80\x01')
-        self.assertEqualDecode(255, 2, '\xff\x01')
-        self.assertEqualDecode(256, 2, '\x80\x02')
-        self.assertEqualDecode(0xFFFFFFFF, 5, '\xff\xff\xff\xff\x0f')
-
-    def test_decode_with_trailing_bytes(self):
-        self.assertEqualDecode(1, 1, '\x01abcdef')
-        self.assertEqualDecode(127, 1, '\x7f\x01')
-        self.assertEqualDecode(128, 2, '\x80\x01abcdef')
-        self.assertEqualDecode(255, 2, '\xff\x01\xff')
-
-
 class TestGroupCompressBlock(tests.TestCase):
 
     def make_block(self, key_to_text):