Rev 38: (tests broken) implement the basic ability to have a separate header in http://bzr.arbash-meinel.com/plugins/groupcompress

John Arbash Meinel john at arbash-meinel.com
Wed Mar 4 21:06:29 GMT 2009


At http://bzr.arbash-meinel.com/plugins/groupcompress

------------------------------------------------------------
revno: 38
revision-id: john at arbash-meinel.com-20090304210622-ur7wz2dz0w4lhzn3
parent: john at arbash-meinel.com-20090304183131-p433dz5coqrmv8pw
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress
timestamp: Wed 2009-03-04 15:06:22 -0600
message:
  (tests broken) implement the basic ability to have a separate header
  This puts the labels/sha1/etc. together in a header, with the actual content
  deltas combined separately afterwards.
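
For reference, the block layout this revision moves toward looks roughly like the
sketch below. This is only an illustration pieced together from the from_bytes()/
extract() changes and the test data in the diff; build_block() is a made-up helper,
not part of the plugin.

    import zlib

    def build_block(entries, content):
        """Illustrative only: serialise a 'gcb1z' block.

        entries is an iterable of (key_tuple, kind, sha1, start, length);
        content is the raw concatenation of the fulltext/delta bytes.
        """
        header_chunks = []
        for key, kind, sha1, start, length in entries:
            header_chunks.append('key:%s\n' % ('\x00'.join(key),))
            header_chunks.append('sha1:%s\n' % (sha1,))
            header_chunks.append('type:%s\n' % (kind,))
            header_chunks.append('start:%d\n' % (start,))
            header_chunks.append('length:%d\n' % (length,))
            header_chunks.append('\n')
        header = ''.join(header_chunks)
        z_header = zlib.compress(header)
        # 'gcb1z\n', the compressed header length, the uncompressed header
        # length, then the compressed header, then the compressed content.
        return ('gcb1z\n%d\n%d\n' % (len(z_header), len(header))
                + z_header + zlib.compress(content))

On the read side, GroupCompressBlock.from_bytes() parses the header back into
entry objects and keeps the decompressed content, and extract(key) either slices
out a fulltext or applies the stored delta against that content.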
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-03-04 18:31:31 +0000
+++ b/groupcompress.py	2009-03-04 21:06:22 +0000
@@ -53,30 +53,8 @@
     )
 from bzrlib.plugins.groupcompress import errors as gc_errors
 
-_NO_LABELS = False
 _FAST = False
 
-def parse(bytes):
-    if _NO_LABELS:
-        action_byte = bytes[0]
-        action = {'f':'fulltext', 'd':'delta'}[action_byte]
-        return action, None, None, bytes[1:]
-    (action, label_line, sha1_line, len_line,
-     delta_bytes) = bytes.split('\n', 4)
-    if (action not in ('fulltext', 'delta')
-        or not label_line.startswith('label:')
-        or not sha1_line.startswith('sha1:')
-        or not len_line.startswith('len:')
-        ):
-        raise AssertionError("bad text record %r" % (bytes,))
-    label = tuple(label_line[6:].split('\x00'))
-    sha1 = sha1_line[5:]
-    length = int(len_line[4:])
-    if not len(delta_bytes) == length:
-        raise AssertionError("bad length record %r" % (bytes,))
-    return action, label, sha1, delta_bytes
-
-
 def encode_base128_int(val):
     """Convert an integer into a 7-bit lsb encoding."""
     bytes = []
@@ -145,6 +123,12 @@
         self.start = start # Byte offset to start of data
         self.length = length # Length of content
 
+    def __repr__(self):
+        return '%s(%s, %s, %s, %s, %s)' % (
+            self.__class__.__name__,
+            self.key, self.type, self.sha1, self.start, self.length
+            )
+
 
 class GroupCompressBlock(object):
     """An object which maintains the internal structure of the compressed data.
@@ -158,10 +142,15 @@
     def __init__(self):
         # map by key? or just order in file?
         self._entries = {}
+        self._content = None
+        self._size = 0
 
     def _parse_header(self):
         """Parse the meta-info from the stream."""
 
+    def __len__(self):
+        return self._size
+
     @classmethod
     def from_bytes(cls, bytes):
         out = cls()
@@ -184,6 +173,7 @@
         assert len(header_bytes) == header_length
         del z_header_bytes
         lines = header_bytes.split('\n')
+        header_len = len(header_bytes)
         del header_bytes
         info_dict = {}
         for line in lines:
@@ -198,7 +188,13 @@
                 value = tuple(map(intern, value.split('\x00')))
             elif key in ('start', 'length'):
                 value = int(value)
+            elif key == 'type':
+                value = intern(value)
             info_dict[key] = value
+        zcontent = bytes[pos2:]
+        if zcontent:
+            out._content = zlib.decompress(zcontent)
+            out._size = header_len + len(out._content)
         return out
 
     def extract(self, key, sha1=None):
@@ -208,6 +204,14 @@
         :param sha1: TODO (should we validate only when sha1 is supplied?)
         :return: The bytes for the content
         """
+        entry = self._entries[key]
+        if entry.type == 'fulltext':
+            bytes = self._content[entry.start:entry.start + entry.length]
+        elif entry.type == 'delta':
+            delta = self._content[entry.start:entry.start + entry.length]
+            bytes = _groupcompress_pyx.apply_delta(self._content, delta)
+        # XXX: sha1?
+        return entry, bytes
 
     def add_entry(self, key, type, sha1, start, length):
         """Add new meta info about an entry.
@@ -281,6 +285,7 @@
         self.input_bytes = 0
         self.labels_deltas = {}
         self._delta_index = _groupcompress_pyx.DeltaIndex()
+        self._block = GroupCompressBlock()
 
     def compress(self, key, bytes, expected_sha, soft=False):
         """Compress lines with label key.
@@ -304,7 +309,6 @@
             sha1 = expected_sha
         if key[-1] is None:
             key = key[:-1] + ('sha1:' + sha1,)
-        label = '\x00'.join(key)
         input_len = len(bytes)
         # By having action/label/sha1/len, we can parse the group if the index
         # was ever destroyed, we have the key in 'label', we know the final
@@ -314,10 +318,7 @@
         # 'len: %d\n' costs approximately 1% increase in total data
         # Having the labels at all costs us 9-10% increase, 38% increase for
         # inventory pages, and 5.8% increase for text pages
-        if _NO_LABELS:
-            new_chunks = []
-        else:
-            new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
+        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
         if self._delta_index._source_offset != self.endpoint:
             raise AssertionError('_source_offset != endpoint'
                 ' somehow the DeltaIndex got out of sync with'
@@ -327,28 +328,18 @@
         if (delta is None):
             # We can't delta (perhaps source_text is empty)
             # so mark this as an insert
-            if _NO_LABELS:
-                new_chunks = ['f']
-            else:
-                new_chunks.insert(0, 'fulltext\n')
-                new_chunks.append('len:%s\n' % (input_len,))
-            unadded_bytes = sum(map(len, new_chunks))
-            self._delta_index.add_source(bytes, unadded_bytes)
-            new_chunks.append(bytes)
+            self._block.add_entry(key, type='fulltext', sha1=sha1,
+                                  start=self.endpoint, length=len(bytes))
+            self._delta_index.add_source(bytes, 0)
+            new_chunks = [bytes]
         else:
-            if _NO_LABELS:
-                new_chunks = ['d']
-            else:
-                new_chunks.insert(0, 'delta\n')
-                new_chunks.append('len:%s\n' % (len(delta),))
+            self._block.add_entry(key, type='delta', sha1=sha1,
+                                  start=self.endpoint, length=len(delta))
+            new_chunks = [delta]
             if _FAST:
-                new_chunks.append(delta)
-                unadded_bytes = sum(map(len, new_chunks))
-                self._delta_index._source_offset += unadded_bytes
+                self._delta_index._source_offset += len(delta)
             else:
-                unadded_bytes = sum(map(len, new_chunks))
-                self._delta_index.add_delta_source(delta, unadded_bytes)
-                new_chunks.append(delta)
+                self._delta_index.add_delta_source(delta, 0)
         delta_start = (self.endpoint, len(self.lines))
         self.output_chunks(new_chunks)
         self.input_bytes += input_len
@@ -368,19 +359,18 @@
         """
         delta_details = self.labels_deltas[key]
         delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
-        action, label, sha1, delta = parse(''.join(delta_chunks))
-        if not _NO_LABELS and label != key:
-            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
-        if action == 'fulltext':
-            bytes = delta
+        stored_bytes = ''.join(delta_chunks)
+        # TODO: Fix this, we shouldn't really be peeking here
+        entry = self._block._entries[key]
+        if entry.type == 'fulltext':
+            bytes = stored_bytes
         else:
-            source = ''.join(self.lines[delta_details[0][0]])
+            assert entry.type == 'delta'
+            # XXX: This is inefficient at best
+            source = ''.join(self.lines)
             bytes = _groupcompress_pyx.apply_delta(source, delta)
-        if _NO_LABELS:
-            sha1 = sha_string(bytes)
-        else:
-            assert sha1 == sha_string(bytes)
-        return [bytes], sha1
+            assert entry.sha1 == sha_string(bytes)
+        return bytes, entry.sha1
 
     def output_chunks(self, new_chunks):
         """Output some chunks.
@@ -573,11 +563,11 @@
                     result[key] = self._unadded_refs[key]
         return result
 
-    def _get_group_and_delta_bytes(self, index_memo):
+    def _get_block(self, index_memo):
         read_memo = index_memo[0:3]
         # get the group:
         try:
-            plain = self._group_cache[read_memo]
+            block = self._group_cache[read_memo]
         except KeyError:
             # read the group
             zdata = self._access.get_raw_records([read_memo]).next()
@@ -585,14 +575,14 @@
             # permits caching. We might want to store the partially
             # decompressed group and decompress object, so that recent
             # texts are not penalised by big groups.
-            plain = zlib.decompress(zdata) #, index_memo[4])
-            self._group_cache[read_memo] = plain
+            block = GroupCompressBlock.from_bytes(zdata)
+            self._group_cache[read_memo] = block
         # cheapo debugging:
         # print len(zdata), len(plain)
         # parse - requires split_lines, better to have byte offsets
         # here (but not by much - we only split the region for the
         # recipe, and we often want to end up with lines anyway.
-        return plain, plain[index_memo[3]:index_memo[4]]
+        return block
 
     def get_missing_compression_parent_keys(self):
         """Return the keys of missing compression parents.
@@ -671,26 +661,12 @@
                 parents = self._unadded_refs[key]
             else:
                 index_memo, _, parents, (method, _) = locations[key]
-                plain, delta_bytes = self._get_group_and_delta_bytes(index_memo)
-                action, label, sha1, delta = parse(delta_bytes)
-                if not _NO_LABELS and label != key:
-                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
-                if action == 'fulltext':
-                    chunks = [delta]
-                else:
-                    # TODO: relax apply_delta so that it can allow source to be
-                    #       longer than expected
-                    bytes = _groupcompress_pyx.apply_delta(plain, delta)
-                    if bytes is None:
-                        import pdb; pdb.set_trace()
-                    chunks = [bytes]
-                    del bytes
-                if _NO_LABELS:
-                    sha1 = sha_strings(chunks)
-                else:
-                    if not _FAST and sha_strings(chunks) != sha1:
-                        raise AssertionError('sha1 sum did not match')
-            yield ChunkedContentFactory(key, parents, sha1, chunks)
+                block = self._get_block(index_memo)
+                entry, bytes = block.extract(key)
+                sha1 = entry.sha1
+                if not _FAST and sha_string(bytes) != sha1:
+                    raise AssertionError('sha1 sum did not match')
+            yield FulltextContentFactory(key, parents, sha1, bytes)
 
     def get_sha1s(self, keys):
         """See VersionedFiles.get_sha1s()."""
@@ -742,9 +718,18 @@
         basis_end = 0
         groups = 1
         def flush():
+            # TODO: we may want to have the header compressed in the same chain
+            #       as the data, or we may not; evaluate it.
+            #       Having them compressed together is probably a win for
+            #       revisions and the 'inv' portion of chk inventories, as the
+            #       label in the header is duplicated in the text.
+            #       For chk pages and real bytes, I would guess this is not
+            #       true.
+            header = self._compressor._block.to_bytes()
             compressed = zlib.compress(''.join(self._compressor.lines))
+            out = header + compressed
             index, start, length = self._access.add_raw_records(
-                [(None, len(compressed))], compressed)[0]
+                [(None, len(out))], out)[0]
             nodes = []
             for key, reads, refs in keys_to_add:
                 nodes.append((key, "%d %d %s" % (start, length, reads), refs))
@@ -1024,44 +1009,6 @@
         return node[0], start, stop, basis_end, delta_end
 
 
-def _get_longest_match(equivalence_table, pos, max_pos, locations):
-    """Get the longest possible match for the current position."""
-    range_start = pos
-    range_len = 0
-    copy_ends = None
-    while pos < max_pos:
-        if locations is None:
-            locations = equivalence_table.get_idx_matches(pos)
-        if locations is None:
-            # No more matches, just return whatever we have, but we know that
-            # this last position is not going to match anything
-            pos += 1
-            break
-        else:
-            if copy_ends is None:
-                # We are starting a new range
-                copy_ends = [loc + 1 for loc in locations]
-                range_len = 1
-                locations = None # Consumed
-            else:
-                # We are currently in the middle of a match
-                next_locations = set(copy_ends).intersection(locations)
-                if len(next_locations):
-                    # range continues
-                    copy_ends = [loc + 1 for loc in next_locations]
-                    range_len += 1
-                    locations = None # Consumed
-                else:
-                    # But we are done with this match, we should be
-                    # starting a new one, though. We will pass back 'locations'
-                    # so that we don't have to do another lookup.
-                    break
-        pos += 1
-    if copy_ends is None:
-        return None, pos, locations
-    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
-
-
 try:
     from bzrlib.plugins.groupcompress import _groupcompress_pyx
 except ImportError:

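Condensing the flush() hunk above: the write path now amounts to roughly the
following sketch. It is a paraphrase of the diff rather than new API, and it
elides the keys_to_add/index bookkeeping; add_raw_records() returns the
(index, start, length) memo used there.

    import zlib

    def assemble_group(compressor, access):
        # Header first: the key/sha1/type/start/length records kept by the
        # compressor's GroupCompressBlock ...
        header = compressor._block.to_bytes()
        # ... then the zlib-compressed concatenation of the raw texts/deltas.
        body = zlib.compress(''.join(compressor.lines))
        out = header + body
        # Store the combined bytes; GroupCompressBlock.from_bytes() splits
        # them apart again on the read side (see _get_block above).
        return access.add_raw_records([(None, len(out))], out)[0]
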
=== modified file 'repofmt.py'
--- a/repofmt.py	2009-03-04 16:01:55 +0000
+++ b/repofmt.py	2009-03-04 21:06:22 +0000
@@ -627,7 +627,9 @@
         """A hashed CHK+group compress pack repository."""
 
         repository_class = GCCHKPackRepository
-        rich_root_data = True
+        # For right now, setting this to True gives us InterModel1And2 rather
+        # than InterDifferingSerializer
+        rich_root_data = False
 
         def get_format_string(self):
             """See RepositoryFormat.get_format_string()."""
@@ -641,7 +643,7 @@
 
 def pack_incompatible(source, target, orig_method=InterPackRepo.is_compatible):
     """Be incompatible with the regular fetch code."""
-    formats = (RepositoryFormatPackGC,)
+    formats = (RepositoryFormatPackGCPlain,)
     if chk_support:
         formats = formats + (RepositoryFormatPackGCCHK16,
                              RepositoryFormatPackGCCHK255)

=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py	2009-03-04 18:31:31 +0000
+++ b/tests/test_groupcompress.py	2009-03-04 21:06:22 +0000
@@ -157,15 +157,18 @@
         # Knit fetching will try to reconstruct texts locally which results in
         # reading something that is in the compressor stream already.
         compressor = groupcompress.GroupCompressor(True)
-        sha_1,  _ = compressor.compress(('label',), 'strange\ncommon\n', None)
-        sha_2, _ = compressor.compress(('newlabel',),
-            'common\ndifferent\nmoredifferent\n', None)
+        sha1_1, _ = compressor.compress(('label',),
+            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
+        expected_lines = list(compressor.lines)
+        sha1_2, end_point = compressor.compress(('newlabel',),
+            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         # get the first out
-        self.assertEqual((['strange\ncommon\n'], sha_1),
+        self.assertEqual(('strange\ncommon long line\nthat needs a 16 byte match\n', sha1_1),
             compressor.extract(('label',)))
         # and the second
-        self.assertEqual((['common\ndifferent\nmoredifferent\n'],
-            sha_2), compressor.extract(('newlabel',)))
+        self.assertEqual(('common long line\nthat needs a 16 byte match\n'
+                          'different\n', sha1_2),
+                         compressor.extract(('newlabel',)))
 
 
 class TestBase128Int(tests.TestCase):
@@ -214,7 +217,7 @@
         self.assertEqual({}, block._entries)
 
     def test_from_bytes(self):
-        block = groupcompress.GroupCompressBlock.from_bytes(
+        z_header_bytes = (
             'gcb1z\n' # group compress block v1 plain
             '76\n' # Length of zlib bytes
             '183\n' # Length of all meta-info
@@ -231,6 +234,10 @@
             'start:0\n'
             'length:100\n'
             '\n'))
+        block = groupcompress.GroupCompressBlock.from_bytes(
+            z_header_bytes)
+        self.assertIs(None, block._content)
+        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
         self.assertEqual([('bing',), ('foo', 'bar')], sorted(block._entries))
         bing = block._entries[('bing',)]
         self.assertEqual(('bing',), bing.key)

