Rev 36: Adding a mini-len to the delta/fulltext bytes in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk

John Arbash Meinel john at arbash-meinel.com
Thu Mar 5 18:10:52 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk

------------------------------------------------------------
revno: 36
revision-id: john at arbash-meinel.com-20090305181021-dsjdgu54gva425r7
parent: john at arbash-meinel.com-20090305172911-4ajyqk6tt0gd43oe
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Thu 2009-03-05 12:10:21 -0600
message:
  Adding a mini-len to the delta/fulltext bytes
  This adds 1.6 bytes/entry for inventory pages, and 2.5 bytes/entry for
  text pages. But that is down in the <1% increased content.
  The main advantage is that if we get rid of labels,
  it allows the content to be fully self describing.
  Especially beneficial for chk pages, as you could regenerate the
  chk index from the .pack file, without including any labels.
  (the label is the sha1 sum).
  It *does* increase the time to extract, as you are now decoding
  those little bytes in the beginning.
  However, that time could be significantly decreased with a
  Pyrex decoder. (at the moment the overhead is 0.2s out of 14s under
  lsprof.)
  If we decide to go with labels, then the 1% is a bit superfluous,
  but if we get rid of labels, we trade 1% here for 40% in the
  labels.
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-03-05 17:29:11 +0000
+++ b/groupcompress.py	2009-03-05 18:10:21 +0000
@@ -213,27 +213,37 @@
         """
         if _NO_LABELS or not self._entries:
             start, end = index_memo[3:5]
+            # The bytes are 'f' or 'd' for the type, then a variable-length
+            # base128 integer for the content size, then the actual content
+            # We know that the variable-length integer won't be longer than 10
+            # bytes (it only takes 5 bytes to encode 2^32)
             c = self._content[start]
             if c == 'f':
-                bytes = self._content[start+1:end]
                 type = 'fulltext'
             else:
                 assert c == 'd'
                 type = 'delta'
-                delta = self._content[start+1:end]
-                bytes = _groupcompress_pyx.apply_delta(self._content, delta)
-            entry = GroupCompressBlockEntry(key, type, sha_string(bytes),
-                                            start, end-start)
-            return entry, bytes
-        entry = self._entries[key]
-        if entry.type == 'fulltext':
-            assert self._content[entry.start] == 'f'
-            bytes = self._content[entry.start+1:entry.start + entry.length]
-        elif entry.type == 'delta':
-            assert self._content[entry.start] == 'd'
-            delta = self._content[entry.start+1:entry.start + entry.length]
-            bytes = _groupcompress_pyx.apply_delta(self._content, delta)
-        # XXX: sha1?
+            entry = GroupCompressBlockEntry(key, type, sha1=None,
+                                            start=start, length=end-start)
+        else:
+            entry = self._entries[key]
+            c = self._content[entry.start]
+            if entry.type == 'fulltext':
+                assert c == 'f'
+            elif entry.type == 'delta':
+                assert c == 'd'
+        content_len, len_len = decode_base128_int(
+                                self._content[start + 1:start + 11])
+        assert entry.length == content_len + 1 + len_len
+        content_start = start + 1 + len_len
+        end = entry.start + entry.length
+        content = self._content[content_start:end]
+        if c == 'f':
+            bytes = content
+        elif c == 'd':
+            bytes = _groupcompress_pyx.apply_delta(self._content, content)
+        if entry.sha1 is None:
+            entry.sha1 = sha_string(bytes)
         return entry, bytes
 
     def add_entry(self, key, type, sha1, start, length):
@@ -374,17 +384,21 @@
         delta = self._delta_index.make_delta(bytes, max_delta_size)
         if (delta is None):
             type = 'fulltext'
-            length = len(bytes) + 1
-            self._delta_index.add_source(bytes, 1)
-            new_chunks = ['f', bytes]
+            enc_length = encode_base128_int(len(bytes))
+            len_mini_header = 1 + len(enc_length)
+            length = len(bytes) + len_mini_header
+            self._delta_index.add_source(bytes, len_mini_header)
+            new_chunks = ['f', enc_length, bytes]
         else:
             type = 'delta'
-            length = len(delta) + 1
-            new_chunks = ['d', delta]
+            enc_length = encode_base128_int(len(delta))
+            len_mini_header = 1 + len(enc_length)
+            length = len(delta) + len_mini_header
+            new_chunks = ['d', enc_length, delta]
             if _FAST:
                 self._delta_index._source_offset += length
             else:
-                self._delta_index.add_delta_source(delta, 1)
+                self._delta_index.add_delta_source(delta, len_mini_header)
         self._block.add_entry(key, type=type, sha1=sha1,
                               start=self.endpoint, length=length)
         delta_start = (self.endpoint, len(self.lines))
@@ -412,13 +426,18 @@
         entry = self._block._entries[key]
         if entry.type == 'fulltext':
             assert stored_bytes[0] == 'f'
-            bytes = stored_bytes[1:]
+            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
+            assert fulltext_len + 1 + offset == len(stored_bytes)
+            bytes = stored_bytes[offset + 1:]
         else:
             assert entry.type == 'delta'
             # XXX: This is inefficient at best
             source = ''.join(self.lines)
             assert stored_bytes[0] == 'd'
-            bytes = _groupcompress_pyx.apply_delta(source, stored_bytes[1:])
+            delta_len, offset = decode_base128_int(stored_bytes[1:10])
+            assert delta_len + 1 + offset == len(stored_bytes)
+            bytes = _groupcompress_pyx.apply_delta(source,
+                                                   stored_bytes[offset + 1:])
             assert entry.sha1 == sha_string(bytes)
         return bytes, entry.sha1
 
@@ -726,6 +745,8 @@
                 block = self._get_block(index_memo)
                 entry, bytes = block.extract(key, index_memo)
                 sha1 = entry.sha1
+                # TODO: If we don't have labels, then the sha1 here is computed
+                #       from the data, so we don't want to re-sha the string.
                 if not _FAST and sha_string(bytes) != sha1:
                     raise AssertionError('sha1 sum did not match')
             yield FulltextContentFactory(key, parents, sha1, bytes)

=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py	2009-03-05 17:29:11 +0000
+++ b/tests/test_groupcompress.py	2009-03-05 18:10:21 +0000
@@ -63,7 +63,7 @@
             'strange\ncommon\n', None)
         self.assertEqual(sha_string('strange\ncommon\n'), sha1)
         expected_lines = [
-            'f', 'strange\ncommon\n',
+            'f', '\x0f', 'strange\ncommon\n',
             ]
         self.assertEqual(expected_lines, compressor.lines)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
@@ -94,10 +94,12 @@
                                     'that needs a 16 byte match\n'
                                     'different\n'), sha1_2)
         expected_lines.extend([
+            # 'delta', delta length
+            'd\x10',
             # source and target length
-            'd\x35\x36',
+            '\x36\x36',
             # copy the line common
-            '\x91\x09\x2c', #copy, offset 0x09, len 0x2c
+            '\x91\x0a\x2c', #copy, offset 0x0a, len 0x2c
             # add the line different, and the trailing newline
             '\x0adifferent\n', # insert 10 bytes
             ])
@@ -122,13 +124,16 @@
                        'different\nmoredifferent\nand then some more\n'),
             sha1_3)
         expected_lines.extend([
-            'd\x65\x5f' # source and target length
+            # 'delta', delta length
+            'd\x0c',
+            # source and target length
+            '\x67\x5f'
             # insert new
             '\x03new',
             # Copy of first parent 'common' range
-            '\x91\x08\x31' # copy, offset 0x08, 0x31 bytes
+            '\x91\x09\x31' # copy, offset 0x09, 0x31 bytes
             # Copy of second parent 'different' range
-            '\x91\x3a\x2b' # copy, offset 0x3a, 0x2b bytes
+            '\x91\x3c\x2b' # copy, offset 0x3c, 0x2b bytes
             ])
         self.assertEqualDiffEncoded(expected_lines, compressor.lines)
         self.assertEqual(sum(map(len, expected_lines)), end_point)



More information about the bazaar-commits mailing list