Rev 61: Factor out the ability to have/not have labels. in http://bzr.arbash-meinel.com/plugins/groupcompress_rabin

John Arbash Meinel john at arbash-meinel.com
Sat Feb 28 03:59:52 GMT 2009


At http://bzr.arbash-meinel.com/plugins/groupcompress_rabin

------------------------------------------------------------
revno: 61
revision-id: john at arbash-meinel.com-20090228035950-mg89wms3nyse6n2d
parent: john at arbash-meinel.com-20090228032304-13o0os3ho1nqq4ze
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress_rabin
timestamp: Fri 2009-02-27 21:59:50 -0600
message:
  Factor out the ability to have/not have labels.
  
  It turns out that labels now cost overall 10% increase in repo size. A rather
  large 40% increase for inventory pages.
  Perhaps since label == sha1 we could get away doing something differently.
  Note also that repository-details doesn't take into account the indexes.
  The .cix index for a conversion is approx 380kB, which starts to be an
  important factor when you consider the total content for all chk pages
  is less than 1.5MB.
-------------- next part --------------
=== modified file '_groupcompress_c.pyx'
--- a/_groupcompress_c.pyx	2009-02-27 20:18:47 +0000
+++ b/_groupcompress_c.pyx	2009-02-28 03:59:50 +0000
@@ -75,6 +75,26 @@
 #         free(val[0])
 #         val[0] = NULL
 
+class DeltaIndex:
+
+  cdef delta_index *_index
+
+
+def make_delta_index(source_bytes):
+    """Build the RABIN index for this source text."""
+    # TODO: Our usage is ultimately going to be different than the one that was
+    #       originally designed. Specifically, we are going to want to be able
+    #       to update the index by hashing future data. It should fit just fine
+    #       into the structure. But for now, we just wrap create_delta_index
+    #       (For example, we could always reserve enough space to hash a 4MB
+    #       string, etc.)
+    cdef char *source
+    cdef Py_ssize_t source_size
+    cdef delta_index *index
+
+    index = create_delta_index(source, source_size)
+    if index == NULL:
+        return None
 
 def make_delta(source_bytes, target_bytes):
     """Create a delta from source_bytes => target_bytes."""

=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-02-28 03:23:04 +0000
+++ b/groupcompress.py	2009-02-28 03:59:50 +0000
@@ -52,14 +52,20 @@
     VersionedFiles,
     )
 
+_NO_LABELS = True
 
 def parse(bytes):
+    if _NO_LABELS:
+        action_byte = bytes[0]
+        action = {'f':'fulltext', 'd':'delta'}[action_byte]
+        return action, None, None, bytes[1:]
     (action, label_line, sha1_line, len_line,
-     delta_bytes) = bytes.split('\n', 4)
+     delta_bytes) = bytes.split('\n', 3)
     if (action not in ('fulltext', 'delta')
         or not label_line.startswith('label: ')
         or not sha1_line.startswith('sha1: ')
-        or not len_line.startswith('len: ')):
+        or not len_line.startswith('len: ')
+        ):
         raise AssertionError("bad text record %r" % (bytes,))
     label = tuple(label_line[7:].split('\x00'))
     sha1 = sha1_line[6:]
@@ -118,6 +124,9 @@
 
         :param delta: If False, do not compress records.
         """
+        # Consider seeding the lines with some sort of GC Start flag, or
+        # putting it as part of the output stream, rather than in the
+        # compressed bytes.
         self.lines = []
         self.endpoint = 0
         self.input_bytes = 0
@@ -150,7 +159,13 @@
         # bytes are valid from sha1, and we know where to find the end of this
         # record because of 'len'. (the delta record itself will store the
         # total length for the expanded record)
-        new_chunks = ['label: %s\nsha1: %s\n' % (label, sha1)]
+        # 'len: %d\n' costs approximately 1% increase in total data
+        # Having the labels at all costs us 9-10% increase, 38% increase for
+        # inventory pages, and 5.8% increase for text pages
+        if _NO_LABELS:
+            new_chunks = []
+        else:
+            new_chunks = ['label: %s\nsha1: %s\n' % (label, sha1)]
         source_text = ''.join(self.lines)
         # XXX: We have a few possibilities here. We could consider a few
         #      different 'previous' windows, such as only the initial text, we
@@ -163,13 +178,20 @@
             or len(delta) > len(target_text) / 2):
             # We can't delta (perhaps source_text is empty)
             # so mark this as an insert
-            new_chunks.insert(0, 'fulltext\n')
-            new_chunks.append('len: %s\n' % (input_len,))
-            new_chunks.extend(chunks)
+            if _NO_LABELS:
+                new_chunks = ['f']
+                new_chunks.extend(chunks)
+            else:
+                new_chunks.insert(0, 'fulltext\n')
+                new_chunks.append('len: %s\n' % (input_len,))
+                new_chunks.extend(chunks)
         else:
-            new_chunks.insert(0, 'delta\n')
-            new_chunks.append('len: %s\n' % (len(delta),))
-            new_chunks.append(delta)
+            if _NO_LABELS:
+                new_chunks = ['d', delta]
+            else:
+                new_chunks.insert(0, 'delta\n')
+                new_chunks.append('len: %s\n' % (len(delta),))
+                new_chunks.append(delta)
         delta_start = (self.endpoint, len(self.lines))
         self.output_chunks(new_chunks)
         self.input_bytes += input_len
@@ -186,14 +208,17 @@
         delta_details = self.labels_deltas[key]
         delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
         action, label, sha1, delta = parse(''.join(delta_chunks))
-        if label != key:
+        if not _NO_LABELS and label != key:
             raise AssertionError("wrong key: %r, wanted %r" % (label, key))
         if action == 'fulltext':
             bytes = delta
         else:
             source = ''.join(self.lines[delta_details[0][0]])
             bytes = _groupcompress_c.apply_delta(source, delta)
-        sha1 = sha_string(bytes)
+        if _NO_LABELS:
+            sha1 = sha_string(bytes)
+        else:
+            assert sha1 == sha_string(bytes)
         return [bytes], sha1
 
     def output_chunks(self, new_chunks):
@@ -487,7 +512,7 @@
                 index_memo, _, parents, (method, _) = locations[key]
                 plain, delta_bytes = self._get_group_and_delta_bytes(index_memo)
                 action, label, sha1, delta = parse(delta_bytes)
-                if label != key:
+                if not _NO_LABELS and label != key:
                     raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                 if action == 'fulltext':
                     chunks = [delta]
@@ -499,8 +524,11 @@
                         import pdb; pdb.set_trace()
                     chunks = [bytes]
                     del bytes
-                if sha_strings(chunks) != sha1:
-                    raise AssertionError('sha1 sum did not match')
+                if _NO_LABELS:
+                    sha1 = sha_strings(chunks)
+                else:
+                    if sha_strings(chunks) != sha1:
+                        raise AssertionError('sha1 sum did not match')
             yield ChunkedContentFactory(key, parents, sha1, chunks)
 
     def get_sha1s(self, keys):



More information about the bazaar-commits mailing list