Rev 3925: Get rid of the entries dict in GroupCompressBlock. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

John Arbash Meinel john at arbash-meinel.com
Fri Mar 27 22:05:54 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

------------------------------------------------------------
revno: 3925
revision-id: john at arbash-meinel.com-20090327220537-loj7fdr9hi360qc3
parent: john at arbash-meinel.com-20090327214708-sy13r2m4cu0qn72k
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: vilajam
timestamp: Fri 2009-03-27 17:05:37 -0500
message:
  Get rid of the entries dict in GroupCompressBlock.
  We weren't making use of it, and it was overhead to update it.
  This simplifies the code a bit more.
  The only difference now between the Python and Pyrex compressors
  is the __init__ and _compress functions.
  If it weren't for circular import issues, I would be tempted
  to move them one step further, into the _groupcompress_* modules.
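
To make the data-structure change concrete: labels_deltas now keeps one flat
(start_byte, start_chunk, end_byte, end_chunk) tuple per key, and extraction
reads the record kind from the stored bytes themselves ('f' for fulltext, 'd'
for delta) instead of consulting a GroupCompressBlockEntry. Below is a minimal
sketch of that lookup, written as if it lived inside bzrlib/groupcompress.py so
that decode_base128_int, apply_delta and osutils are already in scope; the name
_extract_sketch is made up for illustration, and the real _extract in the diff
below additionally validates the encoded lengths and raises ValueError on a
mismatch.

    def _extract_sketch(self, key):
        # labels_deltas[key] is now a flat 4-tuple; it used to be a pair of
        # (byte_offset, chunk_index) tuples.
        start_byte, start_chunk, end_byte, end_chunk = self.labels_deltas[key]
        stored_bytes = ''.join(self.chunks[start_chunk:end_chunk])
        if stored_bytes[0] == 'f':
            # Fulltext record: 'f', a base128-encoded length, then the text.
            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
            content = stored_bytes[offset + 1:offset + 1 + fulltext_len]
        else:
            # Delta record: 'd', a base128-encoded length, then delta bytes.
            # Only the chunks written before this entry form the delta source.
            source = ''.join(self.chunks[:start_chunk])
            delta_len, offset = decode_base128_int(stored_bytes[1:10])
            content = apply_delta(source,
                                  stored_bytes[offset + 1:offset + 1 + delta_len])
        # The sha1 is recomputed from the extracted bytes rather than compared
        # against a value recorded per entry.
        return content, osutils.sha_string(content)
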
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-27 21:47:08 +0000
+++ b/bzrlib/groupcompress.py	2009-03-27 22:05:37 +0000
@@ -83,29 +83,6 @@
     return present_keys
 
 
-class GroupCompressBlockEntry(object):
-    """Track the information about a single object inside a GC group.
-
-    This is generally just the dumb data structure.
-    """
-
-    def __init__(self, key, type, sha1, start, length):
-        self.key = key
-        self.type = type # delta, fulltext, external?
-        self.sha1 = sha1 # Sha1 of content
-        self.start = start # Byte offset to start of data
-        self.length = length # Length of content
-
-    def __repr__(self):
-        return '%s(%s, %s, %s, %s, %s)' % (
-            self.__class__.__name__,
-            self.key, self.type, self.sha1, self.start, self.length
-            )
-
-    @property
-    def end(self):
-        return self.start + self.length
-
 # The max zlib window size is 32kB, so if we set 'max_size' output of the
 # decompressor to the requested bytes + 32kB, then we should guarantee
 # num_bytes coming out.
@@ -123,7 +100,6 @@
 
     def __init__(self):
         # map by key? or just order in file?
-        self._entries = {}
         self._compressor_name = None
         self._z_content = None
         self._z_content_decompressor = None
@@ -279,22 +255,6 @@
             bytes = apply_delta(self._content, content)
         return bytes
 
-    def add_entry(self, key, type, sha1, start, length):
-        """Add new meta info about an entry.
-
-        :param key: The key for the new content
-        :param type: Whether this is a delta or fulltext entry (external?)
-        :param sha1: sha1sum of the fulltext of this entry
-        :param start: where the encoded bytes start
-        :param length: total number of bytes in the encoded form
-        :return: The entry?
-        """
-        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
-        if key in self._entries:
-            raise ValueError('Duplicate key found: %s' % (key,))
-        self._entries[key] = entry
-        return entry
-
     def set_content(self, content):
         """Set the content of this block."""
         self._content_length = len(content)
@@ -643,9 +603,6 @@
         if not bytes: # empty, like a dir entry, etc
             if nostore_sha == _null_sha1:
                 raise errors.ExistingContent()
-            self._block.add_entry(key, type='empty',
-                                  sha1=None, start=0,
-                                  length=0)
             return _null_sha1, 0, 0, 'fulltext', 0
         # we assume someone knew what they were doing when they passed it in
         if expected_sha is not None:
@@ -688,42 +645,32 @@
         :param key: The key to extract.
         :return: An iterable over bytes and the sha1.
         """
-        delta_details = self.labels_deltas[key]
-        delta_chunks = self.chunks[delta_details[0][1]:delta_details[1][1]]
+        (start_byte, start_chunk, end_byte, end_chunk) = self.labels_deltas[key]
+        delta_chunks = self.chunks[start_chunk:end_chunk]
         stored_bytes = ''.join(delta_chunks)
-        # TODO: Fix this, we shouldn't really be peeking here
-        entry = self._block._entries[key]
-        if entry.type == 'fulltext':
-            if stored_bytes[0] != 'f':
-                raise ValueError('Index claimed fulltext, but stored bytes'
-                                 ' indicate %s' % (stored_bytes[0],))
+        if stored_bytes[0] == 'f':
             fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
-            if fulltext_len + 1 + offset != len(stored_bytes):
+            data_len = fulltext_len + 1 + offset
+            if data_len != len(stored_bytes):
                 raise ValueError('Index claimed fulltext len, but stored bytes'
                                  ' claim %s != %s'
-                                 % (len(stored_bytes),
-                                    fulltext_len + 1 + offset))
+                                 % (len(stored_bytes), data_len))
             bytes = stored_bytes[offset + 1:]
         else:
-            if entry.type != 'delta':
-                raise ValueError('Unknown entry type: %s' % (entry.type,))
             # XXX: This is inefficient at best
-            source = ''.join(self.chunks)
+            source = ''.join(self.chunks[:start_chunk])
             if stored_bytes[0] != 'd':
-                raise ValueError('Entry type claims delta, bytes claim %s'
+                raise ValueError('Unknown content kind, bytes claim %s'
                                  % (stored_bytes[0],))
             delta_len, offset = decode_base128_int(stored_bytes[1:10])
-            if delta_len + 1 + offset != len(stored_bytes):
+            data_len = delta_len + 1 + offset
+            if data_len != len(stored_bytes):
                 raise ValueError('Index claimed delta len, but stored bytes'
                                  ' claim %s != %s'
-                                 % (len(stored_bytes),
-                                    delta_len + 1 + offset))
+                                 % (len(stored_bytes), data_len))
             bytes = apply_delta(source, stored_bytes[offset + 1:])
         bytes_sha1 = osutils.sha_string(bytes)
-        if entry.sha1 != bytes_sha1:
-            raise ValueError('Recorded sha1 != measured %s != %s'
-                             % (entry.sha1, bytes_sha1))
-        return bytes, entry.sha1
+        return bytes, bytes_sha1
 
     def flush(self):
         """Finish this group, creating a formatted stream.
@@ -786,15 +733,14 @@
             # Update the delta_length to include those two encoded integers
             out_lines[1] = encode_base128_int(delta_length)
             out_length = len(out_lines[3]) + 1 + delta_length
-        self._block.add_entry(key, type=type, sha1=sha1,
-                              start=self.endpoint, length=out_length)
         start = self.endpoint # Before insertion
-        delta_start = (start, len(self._delta_index.lines))
+        chunk_start = len(self._delta_index.lines)
         self._delta_index.extend_lines(out_lines, index_lines)
         self.endpoint = self._delta_index.endpoint
         self.input_bytes += bytes_length
-        delta_end = (self.endpoint, len(self._delta_index.lines))
-        self.labels_deltas[key] = (delta_start, delta_end)
+        chunk_end = len(self._delta_index.lines)
+        self.labels_deltas[key] = (start, chunk_start,
+                                   self.endpoint, chunk_end)
         return sha1, start, self.endpoint, type, out_length
 
 
@@ -849,14 +795,15 @@
             length = len(delta) + len_mini_header
             new_chunks = ['d', enc_length, delta]
             self._delta_index.add_delta_source(delta, len_mini_header)
-        self._block.add_entry(key, type=type, sha1=sha1,
-                              start=self.endpoint, length=length)
-        start = self.endpoint # Before insertion
-        delta_start = (self.endpoint, len(self.chunks))
+        # Before insertion
+        start = self.endpoint
+        chunk_start = len(self.chunks)
+        # Now output these bytes
         self._output_chunks(new_chunks)
         self.input_bytes += input_len
-        delta_end = (self.endpoint, len(self.chunks))
-        self.labels_deltas[key] = (delta_start, delta_end)
+        chunk_end = len(self.chunks)
+        self.labels_deltas[key] = (start, chunk_start,
+                                   self.endpoint, chunk_end)
         if not self._delta_index._source_offset == self.endpoint:
             raise AssertionError('the delta index is out of sync'
                 'with the output lines %s != %s'

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-27 21:47:08 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-27 22:05:37 +0000
@@ -287,12 +287,13 @@
         start = 0
         for key in sorted(key_to_text):
             compressor.compress(key, key_to_text[key], None)
+        locs = dict((key, (start, end)) for key, (start, _, end, _)
+                    in compressor.labels_deltas.iteritems())
         block = compressor.flush()
-        entries = block._entries
+        raw_bytes = block.to_bytes()
         # Go through from_bytes(to_bytes()) so that we start with a compressed
         # content object
-        return entries, groupcompress.GroupCompressBlock.from_bytes(
-            block.to_bytes())
+        return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)
 
     def test_from_empty_bytes(self):
         self.assertRaises(ValueError,
@@ -302,7 +303,6 @@
         block = groupcompress.GroupCompressBlock.from_bytes(
             'gcb1z\n0\n0\n')
         self.assertIsInstance(block, groupcompress.GroupCompressBlock)
-        self.assertEqual({}, block._entries)
         self.assertIs(None, block._content)
         self.assertEqual('', block._z_content)
         block._ensure_content()
@@ -329,22 +329,10 @@
         self.assertEqual(z_content, block._z_content)
         self.assertEqual(content, block._content)
 
-    def test_add_entry(self):
-        gcb = groupcompress.GroupCompressBlock()
-        e = gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
-        self.assertIsInstance(e, groupcompress.GroupCompressBlockEntry)
-        self.assertEqual(('foo', 'bar'), e.key)
-        self.assertEqual('fulltext', e.type)
-        self.assertEqual('abcd'*10, e.sha1)
-        self.assertEqual(0, e.start)
-        self.assertEqual(100, e.length)
-
     def test_to_bytes(self):
         content = ('this is some content\n'
                    'this content will be compressed\n')
         gcb = groupcompress.GroupCompressBlock()
-        gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
-        gcb.add_entry(('bing',), 'fulltext', 'abcd'*10, 100, 100)
         gcb.set_content(content)
         bytes = gcb.to_bytes()
         self.assertEqual(gcb._z_content_length, len(gcb._z_content))
@@ -612,20 +600,21 @@
         start = 0
         for key in sorted(key_to_text):
             compressor.compress(key, key_to_text[key], None)
+        locs = dict((key, (start, end)) for key, (start, _, end, _)
+                    in compressor.labels_deltas.iteritems())
         block = compressor.flush()
-        entries = block._entries
         raw_bytes = block.to_bytes()
-        return entries, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)
+        return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)
 
-    def add_key_to_manager(self, key, entries, block, manager):
-        entry = entries[key]
-        manager.add_factory(entry.key, (), entry.start, entry.end)
+    def add_key_to_manager(self, key, locations, block, manager):
+        start, end = locations[key]
+        manager.add_factory(key, (), start, end)
 
     def test_get_fulltexts(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
-        self.add_key_to_manager(('key1',), entries, block, manager)
-        self.add_key_to_manager(('key2',), entries, block, manager)
+        self.add_key_to_manager(('key1',), locations, block, manager)
+        self.add_key_to_manager(('key2',), locations, block, manager)
         result_order = []
         for record in manager.get_record_stream():
             result_order.append(record.key)
@@ -636,8 +625,8 @@
         # If we build the manager in the opposite order, we should get them
         # back in the opposite order
         manager = groupcompress._LazyGroupContentManager(block)
-        self.add_key_to_manager(('key2',), entries, block, manager)
-        self.add_key_to_manager(('key1',), entries, block, manager)
+        self.add_key_to_manager(('key2',), locations, block, manager)
+        self.add_key_to_manager(('key1',), locations, block, manager)
         result_order = []
         for record in manager.get_record_stream():
             result_order.append(record.key)
@@ -646,7 +635,7 @@
         self.assertEqual([('key2',), ('key1',)], result_order)
 
     def test__wire_bytes_no_keys(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
         wire_bytes = manager._wire_bytes()
         block_length = len(block.to_bytes())
@@ -665,10 +654,10 @@
                          wire_bytes)
 
     def test__wire_bytes(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
-        self.add_key_to_manager(('key1',), entries, block, manager)
-        self.add_key_to_manager(('key4',), entries, block, manager)
+        self.add_key_to_manager(('key1',), locations, block, manager)
+        self.add_key_to_manager(('key4',), locations, block, manager)
         block_bytes = block.to_bytes()
         wire_bytes = manager._wire_bytes()
         (storage_kind, z_header_len, header_len,
@@ -683,8 +672,8 @@
         z_header = rest[:z_header_len]
         header = zlib.decompress(z_header)
         self.assertEqual(header_len, len(header))
-        entry1 = entries[('key1',)]
-        entry4 = entries[('key4',)]
+        entry1 = locations[('key1',)]
+        entry4 = locations[('key4',)]
         self.assertEqualDiff('key1\n'
                              '\n'  # no parents
                              '%d\n' # start offset
@@ -693,17 +682,17 @@
                              '\n'
                              '%d\n'
                              '%d\n'
-                             % (entry1.start, entry1.end,
-                                entry4.start, entry4.end),
+                             % (entry1[0], entry1[1],
+                                entry4[0], entry4[1]),
                             header)
         z_block = rest[z_header_len:]
         self.assertEqual(block_bytes, z_block)
 
     def test_from_bytes(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
-        self.add_key_to_manager(('key1',), entries, block, manager)
-        self.add_key_to_manager(('key4',), entries, block, manager)
+        self.add_key_to_manager(('key1',), locations, block, manager)
+        self.add_key_to_manager(('key4',), locations, block, manager)
         wire_bytes = manager._wire_bytes()
         self.assertStartsWith(wire_bytes, 'groupcompress-block\n')
         manager = groupcompress._LazyGroupContentManager.from_bytes(wire_bytes)
@@ -718,21 +707,21 @@
         self.assertEqual([('key1',), ('key4',)], result_order)
 
     def test__check_rebuild_no_changes(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
         # Request all the keys, which ensures that we won't rebuild
-        self.add_key_to_manager(('key1',), entries, block, manager)
-        self.add_key_to_manager(('key2',), entries, block, manager)
-        self.add_key_to_manager(('key3',), entries, block, manager)
-        self.add_key_to_manager(('key4',), entries, block, manager)
+        self.add_key_to_manager(('key1',), locations, block, manager)
+        self.add_key_to_manager(('key2',), locations, block, manager)
+        self.add_key_to_manager(('key3',), locations, block, manager)
+        self.add_key_to_manager(('key4',), locations, block, manager)
         manager._check_rebuild_block()
         self.assertIs(block, manager._block)
 
     def test__check_rebuild_only_one(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
         # Request just the first key, which should trigger a 'strip' action
-        self.add_key_to_manager(('key1',), entries, block, manager)
+        self.add_key_to_manager(('key1',), locations, block, manager)
         manager._check_rebuild_block()
         self.assertIsNot(block, manager._block)
         self.assertTrue(block._content_length > manager._block._content_length)
@@ -744,10 +733,10 @@
                              record.get_bytes_as('fulltext'))
 
     def test__check_rebuild_middle(self):
-        entries, block = self.make_block(self._texts)
+        locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
         # Request a small key in the middle should trigger a 'rebuild'
-        self.add_key_to_manager(('key4',), entries, block, manager)
+        self.add_key_to_manager(('key4',), locations, block, manager)
         manager._check_rebuild_block()
         self.assertIsNot(block, manager._block)
         self.assertTrue(block._content_length > manager._block._content_length)
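
On the test side, the helpers now take byte ranges straight from
compressor.labels_deltas instead of peeking at block._entries; condensed from
the test changes above (illustrative only, assuming a compressor that has
already had the test texts fed to it):

    # Build a key -> (start_byte, end_byte) map from the new 4-tuple layout.
    locs = dict((key, (start, end))
                for key, (start, _, end, _)
                in compressor.labels_deltas.iteritems())
    block = groupcompress.GroupCompressBlock.from_bytes(
        compressor.flush().to_bytes())
    manager = groupcompress._LazyGroupContentManager(block)
    start, end = locs[('key1',)]
    manager.add_factory(('key1',), (), start, end)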


