Rev 3909: Add a groupcompress.encode_copy_instruction function. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

Wed Mar 25 19:23:28 GMT 2009

At http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

------------------------------------------------------------
revno: 3909
revision-id: john at arbash-meinel.com-20090325192307-yhus35dm17i9531s
parent: v.ladeuil+lp at free.fr-20090325181317-4l9ubfugzfehd85q
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: vilajam
timestamp: Wed 2009-03-25 14:23:07 -0500
message:
  Add a groupcompress.encode_copy_instruction function.
  This handles the complexity of the variable-width copy instructions.
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'

--- a/bzrlib/groupcompress.py	2009-03-25 17:29:19 +0000
+++ b/bzrlib/groupcompress.py	2009-03-25 19:23:07 +0000
@@ -84,6 +84,36 @@
     return val, offset
 
 
+def encode_copy_instruction(offset, length):
+    """Convert this offset into a control code and bytes."""
+    copy_command = 0x80
+    copy_bytes = []
+
+    for copy_bit in (0x01, 0x02, 0x04, 0x08):
+        base_byte = offset & 0xff
+        if base_byte:
+            copy_command |= copy_bit
+            copy_bytes.append(chr(base_byte))
+        offset >>= 8
+    if length is None:
+        # None is used by the test suite
+        return copy_command, copy_bytes
+    if length > 0x10000:
+        raise ValueError("we don't emit copy records for lengths > 64KiB")
+    if length == 0:
+        raise ValueError("We cannot emit a copy of length 0")
+    if length != 0x10000:
+        # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
+        # since that saves bytes for large chained copies
+        for copy_bit in (0x10, 0x20):
+            base_byte = length & 0xff
+            if base_byte:
+                copy_command |= copy_bit
+                copy_bytes.append(chr(base_byte))
+            length >>= 8
+    return copy_command, copy_bytes
+
+
 def sort_gc_optimal(parent_map):
     """Sort and group the keys in parent_map into groupcompress order.
 
@@ -734,9 +764,6 @@
 
     def __init__(self):
         """Create a GroupCompressor."""
-        # Consider seeding the lines with some sort of GC Start flag, or
-        # putting it as part of the output stream, rather than in the
-        # compressed bytes.
         self.lines = []
         self.endpoint = 0
         self.input_bytes = 0
@@ -749,13 +776,12 @@
 
 class PythonGroupCompressor(_CommonGroupCompressor):
 
-    def __init__(self, delta=True):
+    def __init__(self):
         """Create a GroupCompressor.
 
         :param delta: If False, do not compress records.
         """
         super(PythonGroupCompressor, self).__init__()
-        self._delta = delta
         self.line_offsets = []
         self.line_locations = EquivalenceTable([])
         self.lines = self.line_locations.lines
@@ -774,8 +800,6 @@
         pos = 0
         line_locations = self.line_locations
         line_locations.set_right_lines(lines)
-        # We either copy a range (while there are reusable lines) or we 
-        # insert new lines. To find reusable lines we traverse 
         locations = None
         max_pos = len(lines)
         result_append = result.append
@@ -803,73 +827,49 @@
         return result
 
     # FIXME: implement nostore_sha
-    def compress(self, key, lines, expected_sha, nostore_sha=False, soft=False):
+    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
         """Compress lines with label key.
 
         :param key: A key tuple. It is stored in the output
             for identification of the text during decompression. If the last
             element is 'None' it is replaced with the sha1 of the text -
             e.g. sha1:xxxxxxx.
-        :param lines: The lines to be compressed. Must be split
-            on \n, with the \n preserved.'
+        :param bytes: The bytes to be compressed
         :param expected_sha: If non-None, the sha the lines are believed to
             have. During compression the sha is calculated; a mismatch will
             cause an error.
+        :param nostore_sha: If the computed sha1 sum matches, we will raise
+            ExistingContent rather than adding the text.
         :param soft: Do a 'soft' compression. This means that we require larger
             ranges to match to be considered for a copy command.
         :return: The sha1 of lines, and the number of bytes accumulated in
             the group output so far.
+        :seealso VersionedFiles.add_lines:
         """
-        lines = osutils.split_lines(lines)
-        sha1 = osutils.sha_strings(lines)
+        new_lines = osutils.split_lines(bytes)
+        sha1 = osutils.sha_string(bytes)
         if key[-1] is None:
             key = key[:-1] + ('sha1:' + sha1,)
-        label = '\x00'.join(key)
-        ## new_lines = []
-        new_lines = ['label: %s\n' % label,
-                     'sha1: %s\n' % sha1,
-                    ]
-        ## index_lines = []
-        index_lines = [False, False]
-        # setup good encoding for trailing \n support.
-        if not lines or lines[-1].endswith('\n'):
-            lines.append('\n')
-        else:
-            lines[-1] = lines[-1] + '\n'
-        pos = 0
-        range_len = 0
-        range_start = 0
-        flush_range = self.flush_range
-        copy_ends = None
-        blocks = self.get_matching_blocks(lines, soft=soft)
-        current_pos = 0
-        #copies_without_insertion = []
+        out_lines = []
+        index_lines = []
+        blocks = self.get_matching_blocks(new_lines, soft=soft)
+        current_line_num = 0
         # We either copy a range (while there are reusable lines) or we
         # insert new lines. To find reusable lines we traverse
         for old_start, new_start, range_len in blocks:
-            if new_start != current_pos:
-                # if copies_without_insertion:
-                #     self.flush_multi(copies_without_insertion,
-                #                      lines, new_lines, index_lines)
-                #     copies_without_insertion = []
+            if new_start != current_line_num:
                 # non-matching region
-                flush_range(current_pos, None, new_start - current_pos,
-                    lines, new_lines, index_lines)
-            current_pos = new_start + range_len
+                self.flush_insert(current_line_num, new_start,
+                                  new_lines, out_lines, index_lines)
+            current_line_num = new_start + range_len
             if not range_len:
                 continue
-            # copies_without_insertion.append((new_start, old_start, range_len))
-            flush_range(new_start, old_start, range_len,
-                        lines, new_lines, index_lines)
-        # if copies_without_insertion:
-        #     self.flush_multi(copies_without_insertion,
-        #                      lines, new_lines, index_lines)
-        #     copies_without_insertion = []
+            self._flush_copy(old_start, range_len,
+                             new_lines, out_lines, index_lines)
         start = self.endpoint # Keep it
         delta_start = (self.endpoint, len(self.lines))
-        self.output_lines(new_lines, index_lines)
-        trim_encoding_newline(lines)
-        length = sum(map(len, lines))
+        self.output_lines(out_lines, index_lines)
+        length = len(bytes)
         self.input_bytes += length
         delta_end = (self.endpoint, len(self.lines))
         self.labels_deltas[key] = (delta_start, delta_end)
@@ -878,7 +878,7 @@
 
     def extract(self, key):
         """Extract a key previously added to the compressor.
-        
+
         :param key: The key to extract.
         :return: An iterable over bytes and the sha1.
         """
@@ -893,27 +893,60 @@
         sha1 = osutils.sha_strings(chunks)
         return chunks, sha1
 
-    def flush_multi(self, instructions, lines, new_lines, index_lines):
-        """Flush a bunch of different ranges out.
-
-        This should only be called with data that are "pure" copies.
+    def _flush_insert(self, start_linenum, end_linenum,
+                      new_lines, out_lines, index_lines):
+        """Add an 'insert' request to the data stream."""
+        bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
+        insert_length = len(bytes_to_insert)
+        # Each insert instruction is at most 127 bytes long
+        for start_byte in xrange(0, insert_length, 127):
+            insert_count = min(insert_length - start_byte, 127)
+            assert insert_count <= 127
+            out_lines.append(chr(insert_count))
+            # Don't index the 'insert' instruction
+            index_lines.append(False)
+            insert = bytes_to_insert[start_byte:start_byte+insert_count]
+            out_lines.append(insert)
+            index_lines.append(True)
+
+    def _flush_copy(self, old_start_linenum, num_lines,
+                    out_lines, index_lines):
+        if old_start_linenum == 0:
+            first_byte = 0
+        else:
+            first_byte = self.line_offsets[old_start_linenum - 1]
+        stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
+        num_bytes = stop_byte - first_byte
+        # The data stream allows >64kB in a copy, but to match the compiled
+        # code, we will also limit it to a 64kB copy
+        for start_byte in xrange(first_byte, stop_byte, 64*1024):
+            num_bytes = min(64*1024, stop_byte - first_byte)
+            copy_command = 0x80
+            copy_bytes = []
+            base_byte = start_byte & 0x000000ff
+            if base_byte:
+                copy_command |= 0x01
+                assert 0 <= base_byte <= 255
+                copy_bytes.append(chr(base_byte))
+            base_byte = (start_byte & 0x0000ff00) >> 8
+            if base_byte:
+                copy_bytes |= 0x02
+                copy_bytes.append(chr(base_byte))
+            base_byte = (start_byte & 0x00ff0000) >> 16
+            if base_byte:
+                copy_bytes |= 0x03
+                copy_bytes.append(chr(base_byte))
+            base_byte = (start_byte & 0xff000000) >> 24
+            if base_byte:
+                copy_bytes |= 0x04
+                copy_bytes.append(chr(base_byte))
+
+    def flush_range(self, new_line_start, source_line_start, match_num_lines,
+                    new_lines, out_lines, index_lines):
+        """Insert the control codes for this copy & insert instruction.
+
+        :param range_start: 
         """
-        flush_range = self.flush_range
-        if len(instructions) > 2:
-            # This is the number of lines to be copied
-            total_copy_range = sum(i[2] for i in instructions)
-            if len(instructions) > 0.5 * total_copy_range:
-                # We are copying N lines, but taking more than N/2
-                # copy instructions to do so. We will go ahead and expand this
-                # text so that other code is able to match against it
-                flush_range(instructions[0][0], None, total_copy_range,
-                            lines, new_lines, index_lines)
-                return
-        for ns, os, rl in instructions:
-            flush_range(ns, os, rl, lines, new_lines, index_lines)
-
-    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
-        insert_instruction = "i,%d\n" % range_len
         if copy_start is not None:
             # range stops, flush and start a new copy range
             stop_byte = self.line_offsets[copy_start + range_len - 1]
@@ -922,12 +955,11 @@
             else:
                 start_byte = self.line_offsets[copy_start - 1]
             bytes = stop_byte - start_byte
-            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
-            if (bytes + len(insert_instruction) >
-                len(copy_control_instruction)):
-                new_lines.append(copy_control_instruction)
-                index_lines.append(False)
-                return
+            copy_byte = 0
+            copy_control_instruction =0
+            new_lines.append(copy_control_instruction)
+            index_lines.append(False)
+            return
         # not copying, or inserting is shorter than copying, so insert.
         new_lines.append(insert_instruction)
         new_lines.extend(lines[range_start:range_start+range_len])
@@ -1587,8 +1619,8 @@
                 result[record.key] = record.sha1
             else:
                 if record.storage_kind != 'absent':
-                    result[record.key] = olsutils.sha_string(record.get_bytes_as(
-                        'fulltext'))
+                    result[record.key] = osutils.sha_string(
+                        record.get_bytes_as('fulltext'))
         return result
 
     def insert_record_stream(self, stream):
@@ -1990,7 +2022,7 @@
         apply_delta,
         DeltaIndex,
         )
-    GroupCompressor = PyrexCompressor
+    GroupCompressor = PyrexGroupCompressor
 except ImportError:
     from bzrlib._groupcompress_py import (
         apply_delta,

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-24 19:36:34 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-25 19:23:07 +0000
@@ -174,6 +174,59 @@
                          compressor.extract(('newlabel',)))
 
 
+class TestEncodeCopyInstruction(tests.TestCase):
+
+    def assertCopyInstruction(self, control, bytes, offset, length):
+        self.assertEqual((control, bytes),
+                         groupcompress.encode_copy_instruction(offset, length))
+
+    def test_encode_no_length(self):
+        self.assertCopyInstruction(0x80, [], 0, None)
+        self.assertCopyInstruction(0x81, ['\x01'], 1, None)
+        self.assertCopyInstruction(0x81, ['\x0a'], 10, None)
+        self.assertCopyInstruction(0x81, ['\xff'], 255, None)
+        self.assertCopyInstruction(0x82, ['\x01'], 256, None)
+        self.assertCopyInstruction(0x83, ['\x01', '\x01'], 257, None)
+        self.assertCopyInstruction(0x8F, ['\xff', '\xff', '\xff', '\xff'],
+                                   0xFFFFFFFF, None)
+        self.assertCopyInstruction(0x8E, ['\xff', '\xff', '\xff'],
+                                   0xFFFFFF00, None)
+        self.assertCopyInstruction(0x8D, ['\xff', '\xff', '\xff'],
+                                   0xFFFF00FF, None)
+        self.assertCopyInstruction(0x8B, ['\xff', '\xff', '\xff'],
+                                   0xFF00FFFF, None)
+        self.assertCopyInstruction(0x87, ['\xff', '\xff', '\xff'],
+                                   0x00FFFFFF, None)
+        self.assertCopyInstruction(0x8F, ['\x04', '\x03', '\x02', '\x01'],
+                                   0x01020304, None)
+
+    def test_encode_no_offset(self):
+        self.assertCopyInstruction(0x90, ['\x01'], 0, 1)
+        self.assertCopyInstruction(0x90, ['\x0a'], 0, 10)
+        self.assertCopyInstruction(0x90, ['\xff'], 0, 255)
+        self.assertCopyInstruction(0xA0, ['\x01'], 0, 256)
+        self.assertCopyInstruction(0xB0, ['\x01', '\x01'], 0, 257)
+        self.assertCopyInstruction(0xB0, ['\x01', '\x01'], 0, 257)
+        self.assertCopyInstruction(0xB0, ['\xff', '\xff'], 0, 0xFFFF)
+        # Special case, if copy == 64KiB, then we store exactly 0
+        # Note that this puns with a copy of exactly 0 bytes, but we don't care
+        # about that, as we would never actually copy 0 bytes
+        self.assertCopyInstruction(0x80, [], 0, 64*1024)
+
+    def test_encode(self):
+        self.assertCopyInstruction(0x91, ['\x01', '\x01'], 1, 1)
+        self.assertCopyInstruction(0x91, ['\x09', '\x0a'], 9, 10)
+        self.assertCopyInstruction(0x91, ['\xfe', '\xff'], 254, 255)
+        self.assertCopyInstruction(0xA2, ['\x02', '\x01'], 512, 256)
+        self.assertCopyInstruction(0xB3, ['\x02', '\x01', '\x01', '\x01'],
+                                   258, 257)
+        self.assertCopyInstruction(0xB0, ['\x01', '\x01'], 0, 257)
+        # Special case, if copy == 64KiB, then we store exactly 0
+        # Note that this puns with a copy of exactly 0 bytes, but we don't care
+        # about that, as we would never actually copy 0 bytes
+        self.assertCopyInstruction(0x81, ['\x0a'], 10, 64*1024)
+
+
 class TestBase128Int(tests.TestCase):
 
     def assertEqualEncode(self, bytes, val):