Rev 3915: Change the delta byte stream to remove the 'source length' entry. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/disk_format

John Arbash Meinel john at arbash-meinel.com
Fri Mar 27 16:07:59 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/disk_format

------------------------------------------------------------
revno: 3915
revision-id: john at arbash-meinel.com-20090327160744-anntc1oyh859s9rm
parent: john at arbash-meinel.com-20090326201840-ddb2uqof335ysvnu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: disk_format
timestamp: Fri 2009-03-27 11:07:44 -0500
message:
  Change the delta byte stream to remove the 'source length' entry.
  This should generally remove ~3.5 bytes from delta records.
  The groupcompress tests at least pass again.
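
Concretely, taking the no-op delta for _text1 from the test changes further down as a worked example, a delta that used to begin with two header varints (source size, then target size) now begins with one:

    old = 'MM\x90M'   # [source size 77][target size 77][copy 77 bytes]
    new = 'M\x90M'    # [target size 77][copy 77 bytes]
    assert len(old) - len(new) == 1   # only 1 byte here; larger sources need
                                      # multi-byte varints, which is presumably
                                      # where the ~3.5 byte estimate comes from
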
=== modified file 'bzrlib/_groupcompress_pyx.pyx'
--- a/bzrlib/_groupcompress_pyx.pyx	2009-03-24 20:02:26 +0000
+++ b/bzrlib/_groupcompress_pyx.pyx	2009-03-27 16:07:44 +0000
@@ -251,15 +251,6 @@
     data = <unsigned char *>delta
     top = data + delta_size
 
-    # make sure the orig file size matches what we expect
-    # XXX: gcc warns because data isn't defined as 'const'
-    size = get_delta_hdr_size(&data, top)
-    if (size > source_size):
-        # XXX: mismatched source size
-        raise RuntimeError('source size %d < expected source size %d'
-                           % (source_size, size))
-    source_size = size
-
     # now the result size
     size = get_delta_hdr_size(&data, top)
     result = PyString_FromStringAndSize(NULL, size)
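
With the source-size entry gone, the Pyrex code above reads a single header varint (the target size) before the command stream starts. For reference, a rough pure-Python sketch of the base-128 decoding that get_delta_hdr_size performs, assuming the usual 7-payload-bits-per-byte, high-bit-as-continuation layout:

    def decode_size(data, pos=0):
        # Decode one base-128 varint: 7 payload bits per byte, high bit set
        # while more bytes follow.  ord() on a one-byte slice works for both
        # Python 2 str and Python 3 bytes.
        size = 0
        shift = 0
        while True:
            byte = ord(data[pos:pos + 1])
            pos += 1
            size |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                return size, pos

    decode_size('M\x90M')   # -> (77, 1): target size 77, next byte is the copy

Before this change a delta carried two such varints (source size, then target size); now only the target size precedes the copy/insert commands.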

=== modified file 'bzrlib/delta.h'
--- a/bzrlib/delta.h	2009-03-26 16:22:58 +0000
+++ b/bzrlib/delta.h	2009-03-27 16:07:44 +0000
@@ -77,8 +77,10 @@
          const void *buf, unsigned long bufsize,
          unsigned long *delta_size, unsigned long max_delta_size);
 
-/* the smallest possible delta size is 4 bytes */
-#define DELTA_SIZE_MIN  4
+/* the smallest possible delta size is 3 bytes
+ * Target size, Copy command, Copy length
+ */
+#define DELTA_SIZE_MIN  3
 
 /*
  * This must be called twice on the delta data buffer, first to get the
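
The 3-byte minimum corresponds to the smallest possible no-op delta, matching the 'M\x90M' expectation in test_make_noop_delta further down: one target-size byte, one copy command, one copy-length byte. Reading it by hand (a sketch, assuming the git-style copy-command bits the test comments imply, where 0x01 selects an offset byte and 0x10 a length byte):

    delta = 'M\x90M'              # ident delta for _text1 in the tests below
    target_size = ord(delta[0])   # 0x4D == 77, the target text length
    cmd = ord(delta[1])           # 0x90: high bit set, so a copy command
    # bit 0x01 clear -> no offset byte, the copy starts at offset 0
    copy_len = ord(delta[2])      # 0x4D == 77; bit 0x10 said a length byte follows
    assert target_size == copy_len == 77
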

=== modified file 'bzrlib/diff-delta.c'
--- a/bzrlib/diff-delta.c	2009-03-19 23:30:50 +0000
+++ b/bzrlib/diff-delta.c	2009-03-27 16:07:44 +0000
@@ -707,8 +707,6 @@
     /* then populate the index for the new data */
     prev_val = ~0;
     data = buffer;
-    /* source size */
-    get_delta_hdr_size(&data, top);
     /* target size */
     get_delta_hdr_size(&data, top);
     entry = entries; /* start at the first slot */
@@ -881,14 +879,7 @@
     if (!out)
         return NULL;
 
-    /* store reference buffer size */
     source_size = index->last_src->size + index->last_src->agg_offset;
-    i = source_size;
-    while (i >= 0x80) {
-        out[outpos++] = i | 0x80;
-        i >>= 7;
-    }
-    out[outpos++] = i;
 
     /* store target buffer size */
     i = trg_size;
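
The lines removed above were the only writer of the source-size varint. A Python sketch of the same encoding, in case the C loop is opaque: each output byte carries 7 bits of the value plus a continuation flag, so the cost grows with the source size:

    def encode_size(value):
        # Base-128 varint, mirroring the removed C loop (the C code relies on
        # unsigned char truncation where this uses an explicit & 0x7F).
        out = []
        while value >= 0x80:
            out.append(chr((value & 0x7F) | 0x80))
            value >>= 7
        out.append(chr(value))
        return ''.join(out)

    len(encode_size(0x36))             # 1 byte, like the tiny test texts
    len(encode_size(2 * 1024 * 1024))  # 4 bytes for a source of roughly 2MB
                                       # (illustrative figure)
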

=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-26 20:18:40 +0000
+++ b/bzrlib/groupcompress.py	2009-03-27 16:07:44 +0000
@@ -1462,6 +1462,9 @@
                     value = "%d %d %d %d" % (block_start, block_length,
                                              record._start, record._end)
                     nodes = [(record.key, value, (record.parents,))]
+                    # TODO: Consider buffering up many nodes to be added, not
+                    #       sure how much overhead this has, but we're seeing
+                    #       ~23s / 120s in add_records calls
                     self._index.add_records(nodes, random_id=random_id)
                     continue
             try:
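
The TODO above is only a note, but one possible shape of the buffering it suggests, sketched with hypothetical helpers (only the add_records call is existing bzrlib API; nothing here is part of this commit), would be to queue the node tuples and flush them in batches:

    def _flush_pending(index, pending, random_id):
        # Hypothetical helper: push everything queued so far in one call.
        if pending:
            index.add_records(pending, random_id=random_id)
            del pending[:]

    def _queue_record(index, pending, record, value, random_id, batch_size=100):
        # Hypothetical helper: queue the same (key, value, (parents,)) tuple
        # as above and only call add_records once per batch_size records.
        # batch_size is an arbitrary figure for illustration.
        pending.append((record.key, value, (record.parents,)))
        if len(pending) >= batch_size:
            _flush_pending(index, pending, random_id)

Whether batching actually recovers much of that ~23s would need measuring.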

=== modified file 'bzrlib/tests/test__groupcompress_pyx.py'
--- a/bzrlib/tests/test__groupcompress_pyx.py	2009-03-24 19:36:34 +0000
+++ b/bzrlib/tests/test__groupcompress_pyx.py	2009-03-27 16:07:44 +0000
@@ -123,40 +123,40 @@
 
     def test_make_noop_delta(self):
         ident_delta = self.make_delta(_text1, _text1)
-        self.assertEqual('MM\x90M', ident_delta)
+        self.assertEqual('M\x90M', ident_delta)
         ident_delta = self.make_delta(_text2, _text2)
-        self.assertEqual('NN\x90N', ident_delta)
+        self.assertEqual('N\x90N', ident_delta)
         ident_delta = self.make_delta(_text3, _text3)
-        self.assertEqual('\x87\x01\x87\x01\x90\x87', ident_delta)
+        self.assertEqual('\x87\x01\x90\x87', ident_delta)
 
     def test_make_delta(self):
         delta = self.make_delta(_text1, _text2)
-        self.assertEqual('MN\x90/\x1fdiffer from\nagainst other text\n', delta)
+        self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
         delta = self.make_delta(_text2, _text1)
-        self.assertEqual('NM\x90/\x1ebe matched\nagainst other text\n', delta)
+        self.assertEqual('M\x90/\x1ebe matched\nagainst other text\n', delta)
         delta = self.make_delta(_text3, _text1)
-        self.assertEqual('\x87\x01M\x90M', delta)
+        self.assertEqual('M\x90M', delta)
         delta = self.make_delta(_text3, _text2)
-        self.assertEqual('\x87\x01N\x90/\x1fdiffer from\nagainst other text\n',
+        self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n',
                          delta)
 
     def test_apply_delta_is_typesafe(self):
-        self.apply_delta(_text1, 'MM\x90M')
-        self.assertRaises(TypeError,
-            self.apply_delta, object(), 'MM\x90M')
-        self.assertRaises(TypeError,
-            self.apply_delta, unicode(_text1), 'MM\x90M')
-        self.assertRaises(TypeError,
-            self.apply_delta, _text1, u'MM\x90M')
+        self.apply_delta(_text1, 'M\x90M')
+        self.assertRaises(TypeError,
+            self.apply_delta, object(), 'M\x90M')
+        self.assertRaises(TypeError,
+            self.apply_delta, unicode(_text1), 'M\x90M')
+        self.assertRaises(TypeError,
+            self.apply_delta, _text1, u'M\x90M')
         self.assertRaises(TypeError,
             self.apply_delta, _text1, object())
 
     def test_apply_delta(self):
         target = self.apply_delta(_text1,
-                    'MN\x90/\x1fdiffer from\nagainst other text\n')
+                    'N\x90/\x1fdiffer from\nagainst other text\n')
         self.assertEqual(_text2, target)
         target = self.apply_delta(_text2,
-                    'NM\x90/\x1ebe matched\nagainst other text\n')
+                    'M\x90/\x1ebe matched\nagainst other text\n')
         self.assertEqual(_text1, target)
 
 
@@ -169,7 +169,7 @@
     def test_make_delta(self):
         di = self._gc_module.DeltaIndex(_text1)
         delta = di.make_delta(_text2)
-        self.assertEqual('MN\x90/\x1fdiffer from\nagainst other text\n', delta)
+        self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
 
     def test_delta_against_multiple_sources(self):
         di = self._gc_module.DeltaIndex()
@@ -180,7 +180,7 @@
         delta = di.make_delta(_third_text)
         result = self._gc_module.apply_delta(_first_text + _second_text, delta)
         self.assertEqualDiff(_third_text, result)
-        self.assertEqual('\xac\x01\x85\x01\x90\x14\x0chas some in '
+        self.assertEqual('\x85\x01\x90\x14\x0chas some in '
                          '\x91v6\x03and\x91d"\x91:\n', delta)
 
     def test_delta_with_offsets(self):
@@ -196,7 +196,7 @@
             '12345' + _first_text + '1234567890' + _second_text, delta)
         self.assertIsNot(None, result)
         self.assertEqualDiff(_third_text, result)
-        self.assertEqual('\xbb\x01\x85\x01\x91\x05\x14\x0chas some in '
+        self.assertEqual('\x85\x01\x91\x05\x14\x0chas some in '
                          '\x91\x856\x03and\x91s"\x91?\n', delta)
 
     def test_delta_with_delta_bytes(self):
@@ -205,7 +205,7 @@
         di.add_source(_first_text, 0)
         self.assertEqual(len(_first_text), di._source_offset)
         delta = di.make_delta(_second_text)
-        self.assertEqual('Dh\tsome more\x91\x019'
+        self.assertEqual('h\tsome more\x91\x019'
                          '&previous text\nand has some extra text\n', delta)
         di.add_delta_source(delta, 0)
         source += delta
@@ -218,8 +218,8 @@
         # Note that we don't match the 'common with the', because it isn't long
         # enough to match in the original text, and those bytes are not present
         # in the delta for the second text.
-        self.assertEqual('z\x85\x01\x90\x14\x1chas some in common with the '
-                         '\x91T&\x03and\x91\x18,', second_delta)
+        self.assertEqual('\x85\x01\x90\x14\x1chas some in common with the '
+                         '\x91S&\x03and\x91\x18,', second_delta)
         # Add this delta, and create a new delta for the same text. We should
         # find the remaining text, and only insert the short 'and' text.
         di.add_delta_source(second_delta, 0)
@@ -227,14 +227,14 @@
         third_delta = di.make_delta(_third_text)
         result = self._gc_module.apply_delta(source, third_delta)
         self.assertEqualDiff(_third_text, result)
-        self.assertEqual('\xa6\x01\x85\x01\x90\x14\x91\x80\x1c'
-                         '\x91T&\x03and\x91\x18,', third_delta)
+        self.assertEqual('\x85\x01\x90\x14\x91\x7e\x1c'
+                         '\x91S&\x03and\x91\x18,', third_delta)
         # Now create a delta, which we know won't be able to be 'fit' into the
         # existing index
         fourth_delta = di.make_delta(_fourth_text)
         self.assertEqual(_fourth_text,
                          self._gc_module.apply_delta(source, fourth_delta))
-        self.assertEqual('\xa6\x01\x80\x01'
+        self.assertEqual('\x80\x01'
                          '\x7f123456789012345\nsame rabin hash\n'
                          '123456789012345\nsame rabin hash\n'
                          '123456789012345\nsame rabin hash\n'
@@ -246,4 +246,4 @@
         fifth_delta = di.make_delta(_fourth_text)
         self.assertEqual(_fourth_text,
                          self._gc_module.apply_delta(source, fifth_delta))
-        self.assertEqual('\xac\x02\x80\x01\x91\xab\x7f\x01\n', fifth_delta)
+        self.assertEqual('\x80\x01\x91\xa7\x7f\x01\n', fifth_delta)
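
The updated expectations above can be cross-checked by hand; here is a simplified pure-Python reader for the byte stream (a sketch that handles only single-byte header sizes and the 0x90/0x91 copy forms, which covers the shorter strings in test_make_delta and test_apply_delta; it is not a substitute for the compiled apply_delta):

    def toy_apply_delta(source, delta):
        # Handles a single-byte target size, insert commands, and copy
        # commands 0x90 (length byte only) / 0x91 (offset byte + length
        # byte).  The C code supports much more.
        pos = 0
        target_size = ord(delta[pos])
        pos += 1
        out = []
        while pos < len(delta):
            cmd = ord(delta[pos])
            pos += 1
            if cmd & 0x80:                    # copy from source
                offset = 0
                if cmd & 0x01:                # one offset byte present
                    offset = ord(delta[pos])
                    pos += 1
                length = ord(delta[pos])      # bit 0x10: one length byte
                pos += 1
                out.append(source[offset:offset + length])
            else:                             # insert the next cmd literal bytes
                out.append(delta[pos:pos + cmd])
                pos += cmd
        result = ''.join(out)
        assert len(result) == target_size
        return result

Feeding it _text1 and the 'N\x90/\x1fdiffer from\nagainst other text\n' delta from test_apply_delta should reproduce _text2, the same result the C module gives.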

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-24 19:36:34 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-27 16:07:44 +0000
@@ -100,9 +100,9 @@
                                     'different\n'), sha1_2)
         expected_lines.extend([
             # 'delta', delta length
-            'd\x10',
-            # source and target length
-            '\x36\x36',
+            'd\x0f',
+            # target length
+            '\x36',
             # copy the line common
             '\x91\x0a\x2c', #copy, offset 0x0a, len 0x2c
             # add the line different, and the trailing newline
@@ -130,9 +130,9 @@
             sha1_3)
         expected_lines.extend([
             # 'delta', delta length
-            'd\x0c',
-            # source and target length
-            '\x67\x5f'
+            'd\x0b',
+            # target length
+            '\x5f'
             # insert new
             '\x03new',
             # Copy of first parent 'common' range
@@ -661,6 +661,7 @@
         entries, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
         self.add_key_to_manager(('key1',), entries, block, manager)
+        self.add_key_to_manager(('key3',), entries, block, manager)
         self.add_key_to_manager(('key4',), entries, block, manager)
         block_bytes = block.to_bytes()
         wire_bytes = manager._wire_bytes()
@@ -670,23 +671,29 @@
         header_len = int(header_len)
         block_len = int(block_len)
         self.assertEqual('groupcompress-block', storage_kind)
-        self.assertEqual(33, z_header_len)
-        self.assertEqual(25, header_len)
+        self.assertEqual(41, z_header_len)
+        self.assertEqual(39, header_len)
         self.assertEqual(len(block_bytes), block_len)
         z_header = rest[:z_header_len]
         header = zlib.decompress(z_header)
         self.assertEqual(header_len, len(header))
         entry1 = entries[('key1',)]
+        entry3 = entries[('key3',)]
         entry4 = entries[('key4',)]
         self.assertEqualDiff('key1\n'
                              '\n'  # no parents
                              '%d\n' # start offset
                              '%d\n' # end byte
+                             'key3\n'
+                             '\n'
+                             '%d\n'
+                             '%d\n'
                              'key4\n'
                              '\n'
                              '%d\n'
                              '%d\n'
                              % (entry1.start, entry1.end,
+                                entry3.start, entry3.end,
                                 entry4.start, entry4.end),
                             header)
         z_block = rest[z_header_len:]
@@ -696,19 +703,20 @@
         entries, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
         self.add_key_to_manager(('key1',), entries, block, manager)
+        self.add_key_to_manager(('key3',), entries, block, manager)
         self.add_key_to_manager(('key4',), entries, block, manager)
         wire_bytes = manager._wire_bytes()
         self.assertStartsWith(wire_bytes, 'groupcompress-block\n')
         manager = groupcompress._LazyGroupContentManager.from_bytes(wire_bytes)
         self.assertIsInstance(manager, groupcompress._LazyGroupContentManager)
-        self.assertEqual(2, len(manager._factories))
+        self.assertEqual(3, len(manager._factories))
         self.assertEqual(block._z_content, manager._block._z_content)
         result_order = []
         for record in manager.get_record_stream():
             result_order.append(record.key)
             text = self._texts[record.key]
             self.assertEqual(text, record.get_bytes_as('fulltext'))
-        self.assertEqual([('key1',), ('key4',)], result_order)
+        self.assertEqual([('key1',), ('key3',), ('key4',)], result_order)
 
     def test__check_rebuild_no_changes(self):
         entries, block = self.make_block(self._texts)
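
Finally, a sanity check on the first two hunks of the test_groupcompress.py changes above: the delta-length byte after the 'd' marker drops by exactly one ('d\x10' to 'd\x0f', 'd\x0c' to 'd\x0b') because the only change to the delta body is losing its one-byte source length. A rough accounting for the first record, assuming its insert section is 11 bytes:

    # First expected record: 'd' marker, one delta-length byte, then the body.
    old_body = 1 + 1 + 3 + 11   # source len, target len, copy, insert section
    new_body =     1 + 3 + 11   # target len, copy, insert section
    assert old_body == 0x10
    assert new_body == 0x0f
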


