Rev 74: We now start to make use of the ability to extend the delta index in http://bzr.arbash-meinel.com/plugins/groupcompress_rabin
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 2 19:36:33 GMT 2009
At http://bzr.arbash-meinel.com/plugins/groupcompress_rabin
------------------------------------------------------------
revno: 74
revision-id: john at arbash-meinel.com-20090302193629-51hqsvh1rhh71gku
parent: john at arbash-meinel.com-20090302191537-7mvjwk2042fvj9gg
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress_rabin
timestamp: Mon 2009-03-02 13:36:29 -0600
message:
We now start to make use of the ability to extend the delta index
with new sources. The next step is to understand the delta encoding, so as
to avoid linking up with lines in the deltas.
-------------- next part --------------
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx 2009-03-02 19:15:37 +0000
+++ b/_groupcompress_pyx.pyx 2009-03-02 19:36:29 +0000
@@ -101,16 +101,23 @@
self._source_offset = 0
if source is not None:
- self.add_source(source)
+ self.add_source(source, 0)
def __dealloc__(self):
self._ensure_no_indexes()
- def add_source(self, source):
+ def add_source(self, source, unadded_bytes):
+ """Add a new bit of source text to the delta indexes.
+
+ :param source: The text in question, this must be a byte string
+ :param unadded_bytes: Assume there are this many bytes that didn't get
+ added between this source and the end of the previous source.
+ """
cdef char *c_source
cdef Py_ssize_t c_source_size
cdef delta_index *index
cdef unsigned int num_indexes
+ cdef unsigned long agg_src_offset
if not PyString_CheckExact(source):
raise TypeError('source is not a str')
@@ -125,8 +132,9 @@
# fit just fine into the structure. But for now, we just wrap
# create_delta_index (For example, we could always reserve enough
# space to hash a 4MB string, etc.)
- index = create_delta_index(c_source, c_source_size, self._source_offset)
- self._source_offset += c_source_size
+ agg_src_offset = self._source_offset + unadded_bytes
+ index = create_delta_index(c_source, c_source_size, agg_src_offset)
+ self._source_offset = agg_src_offset + c_source_size
if index != NULL:
num_indexes = self._num_indexes + 1
if num_indexes >= self._max_num_indexes:
=== modified file 'diff-delta.c'
--- a/diff-delta.c 2009-03-02 18:52:36 +0000
+++ b/diff-delta.c 2009-03-02 19:36:29 +0000
@@ -341,6 +341,8 @@
index = indexes[j];
i += index->src_size;
}
+ assert(i <= index->src_size + index->agg_src_offset);
+ i = index->src_size + index->agg_src_offset;
while (i >= 0x80) {
out[outpos++] = i | 0x80;
i >>= 7;
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2009-03-02 17:05:33 +0000
+++ b/groupcompress.py 2009-03-02 19:36:29 +0000
@@ -53,7 +53,6 @@
)
_NO_LABELS = False
-_FAST = True
def parse(bytes):
if _NO_LABELS:
@@ -132,7 +131,7 @@
self.endpoint = 0
self.input_bytes = 0
self.labels_deltas = {}
- self._last_delta_index = None
+ self._delta_index = _groupcompress_pyx.DeltaIndex()
def compress(self, key, chunks, expected_sha, soft=False):
"""Compress lines with label key.
@@ -168,42 +167,27 @@
new_chunks = []
else:
new_chunks = ['label: %s\nsha1: %s\n' % (label, sha1)]
- # PROF: 5s to this constant extra joining
- if self._last_delta_index is not None:
- delta_index = self._last_delta_index
- else:
- source_text = ''.join(self.lines)
- # XXX: We have a few possibilities here. We could consider a few
- # different 'previous' windows, such as only the initial text,
- # we could do something with the 'just inserted' text we could
- # try a delta against whatever the last delta we computed,
- # (the idea being we just computed the delta_index, so we
- # re-use it here, and see if that is good enough, etc)
- # PROF: 15s to building the delta index
- delta_index = _groupcompress_pyx.make_delta_index(source_text)
- # PROF: only 0.67s to actually create a delta
- delta = delta_index.make_delta(target_text)
+ delta = self._delta_index.make_delta(target_text)
if (delta is None
or len(delta) > len(target_text) / 2):
# We can't delta (perhaps source_text is empty)
# so mark this as an insert
if _NO_LABELS:
new_chunks = ['f']
- new_chunks.extend(chunks)
else:
new_chunks.insert(0, 'fulltext\n')
new_chunks.append('len: %s\n' % (input_len,))
- new_chunks.extend(chunks)
- self._last_delta_index = None
+ unadded_bytes = sum(map(len, new_chunks))
+ self._delta_index.add_source(target_text, unadded_bytes)
+ new_chunks.append(target_text)
else:
if _NO_LABELS:
- new_chunks = ['d', delta]
+ new_chunks = ['d']
else:
new_chunks.insert(0, 'delta\n')
new_chunks.append('len: %s\n' % (len(delta),))
- new_chunks.append(delta)
- if _FAST:
- self._last_delta_index = delta_index
+ unadded_bytes = sum(map(len, new_chunks))
+ new_chunks.append(delta)
delta_start = (self.endpoint, len(self.lines))
self.output_chunks(new_chunks)
self.input_bytes += input_len
=== modified file 'tests/test__groupcompress_pyx.py'
--- a/tests/test__groupcompress_pyx.py 2009-03-02 19:15:37 +0000
+++ b/tests/test__groupcompress_pyx.py 2009-03-02 19:36:29 +0000
@@ -60,6 +60,26 @@
at the end of the file
"""
+_first_text = """\
+a bit of text, that
+does not have much in
+common with the next text
+"""
+
+_second_text = """\
+some more bits of text
+which does have a little bit in
+common with the previous text
+"""
+
+
+_third_text = """\
+a bit of text, that
+has some in common with the previous text
+and not much in
+common with the next text
+"""
+
class Test_GroupCompress(tests.TestCase):
"""Direct tests for the compiled extension."""
@@ -141,30 +161,36 @@
def test_delta_against_multiple_sources(self):
di = self._gc_module.DeltaIndex()
- first_text = ('a bit of text, that\n'
- 'does not have much in\n'
- 'common with the next text\n'
- )
- di.add_source(first_text)
+ di.add_source(_first_text, 0)
self.assertEqual(1, di._num_indexes)
self.assertEqual(1024, di._max_num_indexes)
- self.assertEqual(len(first_text), di._source_offset)
- second_text = ('some more bits of text\n'
- 'which does have a little bit in\n'
- 'common with the previous text\n'
- )
- di.add_source(second_text)
+ self.assertEqual(len(_first_text), di._source_offset)
+ di.add_source(_second_text, 0)
self.assertEqual(2, di._num_indexes)
self.assertEqual(1024, di._max_num_indexes)
- self.assertEqual(len(first_text) + len(second_text), di._source_offset)
- third_text = ('a bit of text, that\n'
- 'has some in common with the previous text\n'
- 'and not much in\n'
- 'common with the next text\n'
- )
- delta = di.make_delta(third_text)
- result = self._gc_module.apply_delta(first_text + second_text, delta)
- self.assertEqualDiff(third_text, result)
+ self.assertEqual(len(_first_text) + len(_second_text), di._source_offset)
+ delta = di.make_delta(_third_text)
+ result = self._gc_module.apply_delta(_first_text + _second_text, delta)
+ self.assertEqualDiff(_third_text, result)
self.assertEqual('\x99\x01h\x90\x14\x0chas some in '
'\x91{\x1e\x07and not\x91!#', delta)
+ def test_delta_with_offsets(self):
+ di = self._gc_module.DeltaIndex()
+ di.add_source(_first_text, 5)
+ self.assertEqual(1, di._num_indexes)
+ self.assertEqual(1024, di._max_num_indexes)
+ self.assertEqual(len(_first_text) + 5, di._source_offset)
+ di.add_source(_second_text, 10)
+ self.assertEqual(2, di._num_indexes)
+ self.assertEqual(1024, di._max_num_indexes)
+ self.assertEqual(len(_first_text) + len(_second_text) + 15,
+ di._source_offset)
+ delta = di.make_delta(_third_text)
+ self.assertIsNot(None, delta)
+ result = self._gc_module.apply_delta(
+ '12345' + _first_text + '1234567890' + _second_text, delta)
+ self.assertIsNot(None, result)
+ self.assertEqualDiff(_third_text, result)
+ self.assertEqual('\xa8\x01h\x91\x05\x14\x0chas some in '
+ '\x91\x8a\x1e\x07and not\x91&#', delta)
More information about the bazaar-commits
mailing list