Rev 4672: (robertc) Fix bug 402652: recompress badly packed groups during fetch in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Fri Sep 4 03:43:11 BST 2009


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 4672 [merge]
revision-id: pqm at pqm.ubuntu.com-20090904024310-7a3uqlf6iruxvz6m
parent: pqm at pqm.ubuntu.com-20090904002239-lnn3vwtab1tcbqfg
parent: robertc at robertcollins.net-20090904014431-2x6fris6dyw42gfg
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2009-09-04 03:43:10 +0100
message:
  (robertc) Fix bug 402652: recompress badly packed groups during
  	fetch. (John Arbash Meinel, Robert Collins)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
  bzrlib/tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
  bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
=== modified file 'NEWS'
--- a/NEWS	2009-09-03 23:43:16 +0000
+++ b/NEWS	2009-09-04 00:49:55 +0000
@@ -33,6 +33,13 @@
 * Don't restrict the command name used to run the test suite.
   (Vincent Ladeuil, #419950)
 
+* Fetches from 2a to 2a are now again requested in 'groupcompress' order.
+  Groups that are seen as 'underutilized' will be repacked on-the-fly.
+  This means that when the source is fully packed, there is minimal
+  overhead during the fetch, but if the source is poorly packed the result
+  is a fairly well packed repository (not as good as 'bzr pack' but
+  good enough). (Robert Collins, John Arbash Meinel, #402652)
+
 * Network streams now decode adjacent records of the same type into a
   single stream, reducing layering churn. (Robert Collins)
 
@@ -109,6 +116,7 @@
 * The main table of contents now provides links to the new Migration Docs
   and Plugins Guide. (Ian Clatworthy)
 
+
 bzr 2.0rc1
 ##########
 

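The NEWS entry above describes the behaviour in prose; the thresholds it relies on are the class attributes added to _LazyGroupContentManager in the next file. As a rough, illustrative sketch only (the helper below is not part of the patch, and the real check also consults _check_rebuild_action() and the per-key prefixes), the decision amounts to:

    # Illustrative sketch: mirrors the thresholds this patch adds to
    # _LazyGroupContentManager; it is not the actual implementation.
    MAX_CUT_FRACTION = 0.75                        # reuse only if >=75% of the block is used
    FULL_ENOUGH_BLOCK_SIZE = 3 * 1024 * 1024       # 3MB of same-file content
    FULL_ENOUGH_MIXED_BLOCK_SIZE = 2 * 768 * 1024  # ~1.5MB of mixed content

    def is_well_utilized(block_size, bytes_used, num_entries, mixed_content):
        """Should a fetched group be copied as-is, or recompressed?"""
        if num_entries == 1:
            # Single-entry groups are always recombined with their neighbours.
            return False
        if bytes_used < block_size * MAX_CUT_FRACTION:
            # More than 25% of the block would be trimmed away: under-utilized.
            return False
        if block_size >= FULL_ENOUGH_BLOCK_SIZE:
            return True
        if mixed_content and block_size >= FULL_ENOUGH_MIXED_BLOCK_SIZE:
            return True
        return False
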
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-08-30 21:34:42 +0000
+++ b/bzrlib/groupcompress.py	2009-09-04 01:44:31 +0000
@@ -457,7 +457,6 @@
                 # There are code paths that first extract as fulltext, and then
                 # extract as storage_kind (smart fetch). So we don't break the
                 # refcycle here, but instead in manager.get_record_stream()
-                # self._manager = None
             if storage_kind == 'fulltext':
                 return self._bytes
             else:
@@ -469,6 +468,14 @@
 class _LazyGroupContentManager(object):
     """This manages a group of _LazyGroupCompressFactory objects."""
 
+    _max_cut_fraction = 0.75 # We allow a block to be trimmed to 75% of
+                             # current size, and still be considered
+                             # reusable
+    _full_block_size = 4*1024*1024
+    _full_mixed_block_size = 2*1024*1024
+    _full_enough_block_size = 3*1024*1024 # size at which we won't repack
+    _full_enough_mixed_block_size = 2*768*1024 # 1.5MB
+
     def __init__(self, block):
         self._block = block
         # We need to preserve the ordering
@@ -546,22 +553,23 @@
         # time (self._block._content) is a little expensive.
         self._block._ensure_content(self._last_byte)
 
-    def _check_rebuild_block(self):
+    def _check_rebuild_action(self):
         """Check to see if our block should be repacked."""
         total_bytes_used = 0
         last_byte_used = 0
         for factory in self._factories:
             total_bytes_used += factory._end - factory._start
-            last_byte_used = max(last_byte_used, factory._end)
-        # If we are using most of the bytes from the block, we have nothing
-        # else to check (currently more that 1/2)
+            if last_byte_used < factory._end:
+                last_byte_used = factory._end
+        # If we are using more than half of the bytes from the block, we have
+        # nothing else to check
         if total_bytes_used * 2 >= self._block._content_length:
-            return
-        # Can we just strip off the trailing bytes? If we are going to be
-        # transmitting more than 50% of the front of the content, go ahead
+            return None, last_byte_used, total_bytes_used
+        # We are using less than 50% of the content. Is the content we are
+        # using at the beginning of the block? If so, we can just trim the
+        # tail, rather than rebuilding from scratch.
         if total_bytes_used * 2 > last_byte_used:
-            self._trim_block(last_byte_used)
-            return
+            return 'trim', last_byte_used, total_bytes_used
 
         # We are using a small amount of the data, and it isn't just packed
         # nicely at the front, so rebuild the content.
@@ -574,7 +582,80 @@
         #       expanding many deltas into fulltexts, as well.
         #       If we build a cheap enough 'strip', then we could try a strip,
         #       if that expands the content, we then rebuild.
-        self._rebuild_block()
+        return 'rebuild', last_byte_used, total_bytes_used
+
+    def check_is_well_utilized(self):
+        """Is the current block considered 'well utilized'?
+
+        This heuristic asks if the current block considers itself to be a fully
+        developed group, rather than just a loose collection of data.
+        """
+        if len(self._factories) == 1:
+            # A block of length 1 could be improved by combining with other
+            # groups - don't look deeper. Even larger than max size groups
+            # groups - don't look deeper. Even groups larger than the maximum
+            # size could compress well with adjacent versions of the same thing.
+        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
+        block_size = self._block._content_length
+        if total_bytes_used < block_size * self._max_cut_fraction:
+            # This block wants to trim itself small enough that we want to
+            # consider it under-utilized.
+            return False
+        # TODO: This code is meant to be the twin of _insert_record_stream's
+        #       'start_new_block' logic. It would probably be better to factor
+        #       out that logic into a shared location, so that it stays
+        #       together better
+        # We currently assume a block is properly utilized whenever it is >75%
+        # of the size of a 'full' block. In normal operation, a block is
+        # considered full when it hits 4MB of same-file content. So any block
+        # >3MB is 'full enough'.
+        # The only time this isn't true is when a given block has large-object
+        # content. (a single file >4MB, etc.)
+        # Under these circumstances, we allow a block to grow to
+        # 2 x largest_content.  Which means that if a given block had a large
+        # object, it may actually be under-utilized. However, given that this
+        # is 'pack-on-the-fly' it is probably reasonable to not repack large
+        # content blobs on-the-fly. Note that because we return False for all
+        # 1-item blobs, we will repack them; we may wish to reevaluate our
+        # treatment of large object blobs in the future.
+        if block_size >= self._full_enough_block_size:
+            return True
+        # If a block is <3MB, it may still be considered 'full' if it contains
+        # mixed content. The current rule is that 2MB of mixed content counts
+        # as full, so the 'full enough' threshold is 1.5MB. Check whether this
+        # block contains mixed content and set the threshold appropriately.
+        common_prefix = None
+        for factory in self._factories:
+            prefix = factory.key[:-1]
+            if common_prefix is None:
+                common_prefix = prefix
+            elif prefix != common_prefix:
+                # Mixed content, check the size appropriately
+                if block_size >= self._full_enough_mixed_block_size:
+                    return True
+                break
+        # The content failed both the mixed check and the single-content check
+        # so obviously it is not fully utilized
+        # TODO: there is one other constraint that isn't being checked
+        #       namely, that the entries in the block are in the appropriate
+        #       order. For example, you could insert the entries in exactly
+        #       reverse groupcompress order, and we would think that is ok.
+        #       (all the right objects are in one group, and it is fully
+        #       utilized, etc.) For now, we assume that case is rare,
+        #       especially since we should always fetch in 'groupcompress'
+        #       order.
+        return False
+
+    def _check_rebuild_block(self):
+        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
+        if action is None:
+            return
+        if action == 'trim':
+            self._trim_block(last_byte_used)
+        elif action == 'rebuild':
+            self._rebuild_block()
+        else:
+            raise ValueError('unknown rebuild action: %r' % (action,))
 
     def _wire_bytes(self):
         """Return a byte stream suitable for transmitting over the wire."""
@@ -1570,6 +1651,7 @@
         block_length = None
         # XXX: TODO: remove this, it is just for safety checking for now
         inserted_keys = set()
+        reuse_this_block = reuse_blocks
         for record in stream:
             # Raise an error when a record is missing.
             if record.storage_kind == 'absent':
@@ -1583,10 +1665,20 @@
             if reuse_blocks:
                 # If the reuse_blocks flag is set, check to see if we can just
                 # copy a groupcompress block as-is.
+                # We only check on the first record (groupcompress-block), not
+                # on every (groupcompress-block-ref) entry. The resulting
+                # reuse_this_block flag applies until the next block record.
+                if record.storage_kind == 'groupcompress-block':
+                    # Check to see if we really want to re-use this block
+                    insert_manager = record._manager
+                    reuse_this_block = insert_manager.check_is_well_utilized()
+            else:
+                reuse_this_block = False
+            if reuse_this_block:
+                # We still want to reuse this block
                 if record.storage_kind == 'groupcompress-block':
                     # Insert the raw block into the target repo
                     insert_manager = record._manager
-                    insert_manager._check_rebuild_block()
                     bytes = record._manager._block.to_bytes()
                     _, start, length = self._access.add_raw_records(
                         [(None, len(bytes))], bytes)[0]
@@ -1597,6 +1689,11 @@
                                            'groupcompress-block-ref'):
                     if insert_manager is None:
                         raise AssertionError('No insert_manager set')
+                    if insert_manager is not record._manager:
+                        raise AssertionError('insert_manager does not match'
+                            ' the current record, we cannot be positive'
+                            ' that the appropriate content was inserted.'
+                            )
                     value = "%d %d %d %d" % (block_start, block_length,
                                              record._start, record._end)
                     nodes = [(record.key, value, (record.parents,))]

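To summarise the flow added above: the well-utilized check runs once per 'groupcompress-block' record, and the resulting decision is inherited by the 'groupcompress-block-ref' records that follow it. A simplified, hypothetical sketch of that control flow (ignoring offset bookkeeping and index updates, and not part of the patch):

    def plan_block_reuse(stream, reuse_blocks=True):
        # Decide, record by record, whether to copy the source block as-is
        # or to fall through to normal insertion (which recompresses).
        reuse_this_block = reuse_blocks
        for record in stream:
            if reuse_blocks and record.storage_kind == 'groupcompress-block':
                # One decision per block; block-ref records inherit it.
                reuse_this_block = record._manager.check_is_well_utilized()
            if reuse_this_block and record.storage_kind in (
                    'groupcompress-block', 'groupcompress-block-ref'):
                yield record, 'copy-raw'      # insert the existing block bytes as-is
            else:
                yield record, 'recompress'    # normal insertion path
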
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-08-24 19:34:13 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-09-01 06:10:24 +0000
@@ -932,7 +932,7 @@
         super(GroupCHKStreamSource, self).__init__(from_repository, to_format)
         self._revision_keys = None
         self._text_keys = None
-        # self._text_fetch_order = 'unordered'
+        self._text_fetch_order = 'groupcompress'
         self._chk_id_roots = None
         self._chk_p_id_roots = None
 
@@ -949,7 +949,7 @@
             p_id_roots_set = set()
             source_vf = self.from_repository.inventories
             stream = source_vf.get_record_stream(inventory_keys,
-                                                 'unordered', True)
+                                                 'groupcompress', True)
             for record in stream:
                 if record.storage_kind == 'absent':
                     if allow_absent:

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-09-02 18:07:58 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-09-04 00:49:55 +0000
@@ -538,7 +538,7 @@
                     'as-requested', False)]
         self.assertEqual([('b',), ('a',), ('d',), ('c',)], keys)
 
-    def test_insert_record_stream_re_uses_blocks(self):
+    def test_insert_record_stream_reuses_blocks(self):
         vf = self.make_test_vf(True, dir='source')
         def grouped_stream(revision_ids, first_parents=()):
             parents = first_parents
@@ -582,8 +582,14 @@
         vf2 = self.make_test_vf(True, dir='target')
         # ordering in 'groupcompress' order, should actually swap the groups in
         # the target vf, but the groups themselves should not be disturbed.
-        vf2.insert_record_stream(vf.get_record_stream(
-            [(r,) for r in 'abcdefgh'], 'groupcompress', False))
+        def small_size_stream():
+            for record in vf.get_record_stream([(r,) for r in 'abcdefgh'],
+                                               'groupcompress', False):
+                record._manager._full_enough_block_size = \
+                    record._manager._block._content_length
+                yield record
+
+        vf2.insert_record_stream(small_size_stream())
         stream = vf2.get_record_stream([(r,) for r in 'abcdefgh'],
                                        'groupcompress', False)
         vf2.writer.end()
@@ -594,6 +600,44 @@
                              record._manager._block._z_content)
         self.assertEqual(8, num_records)
 
+    def test_insert_record_stream_packs_on_the_fly(self):
+        vf = self.make_test_vf(True, dir='source')
+        def grouped_stream(revision_ids, first_parents=()):
+            parents = first_parents
+            for revision_id in revision_ids:
+                key = (revision_id,)
+                record = versionedfile.FulltextContentFactory(
+                    key, parents, None,
+                    'some content that is\n'
+                    'identical except for\n'
+                    'revision_id:%s\n' % (revision_id,))
+                yield record
+                parents = (key,)
+        # One group, a-d
+        vf.insert_record_stream(grouped_stream(['a', 'b', 'c', 'd']))
+        # Second group, e-h
+        vf.insert_record_stream(grouped_stream(['e', 'f', 'g', 'h'],
+                                               first_parents=(('d',),)))
+        # Now copy the blocks into another vf, and see that the
+        # insert_record_stream rebuilt a new block on-the-fly because of
+        # under-utilization
+        vf2 = self.make_test_vf(True, dir='target')
+        vf2.insert_record_stream(vf.get_record_stream(
+            [(r,) for r in 'abcdefgh'], 'groupcompress', False))
+        stream = vf2.get_record_stream([(r,) for r in 'abcdefgh'],
+                                       'groupcompress', False)
+        vf2.writer.end()
+        num_records = 0
+        # All of the records should be recombined into a single block
+        block = None
+        for record in stream:
+            num_records += 1
+            if block is None:
+                block = record._manager._block
+            else:
+                self.assertIs(block, record._manager._block)
+        self.assertEqual(8, num_records)
+
     def test__insert_record_stream_no_reuse_block(self):
         vf = self.make_test_vf(True, dir='source')
         def grouped_stream(revision_ids, first_parents=()):
@@ -811,15 +855,19 @@
 
     _texts = {
         ('key1',): "this is a text\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
         ('key2',): "another text\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
         ('key3',): "yet another text which won't be extracted\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
         ('key4',): "this will be extracted\n"
                    "but references most of its bytes from\n"
                    "yet another text which won't be extracted\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
     }
     def make_block(self, key_to_text):
         """Create a GroupCompressBlock, filling it with the given texts."""
@@ -837,6 +885,13 @@
         start, end = locations[key]
         manager.add_factory(key, (), start, end)
 
+    def make_block_and_full_manager(self, texts):
+        locations, block = self.make_block(texts)
+        manager = groupcompress._LazyGroupContentManager(block)
+        for key in sorted(texts):
+            self.add_key_to_manager(key, locations, block, manager)
+        return block, manager
+
     def test_get_fulltexts(self):
         locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
@@ -893,8 +948,8 @@
         header_len = int(header_len)
         block_len = int(block_len)
         self.assertEqual('groupcompress-block', storage_kind)
-        self.assertEqual(33, z_header_len)
-        self.assertEqual(25, header_len)
+        self.assertEqual(34, z_header_len)
+        self.assertEqual(26, header_len)
         self.assertEqual(len(block_bytes), block_len)
         z_header = rest[:z_header_len]
         header = zlib.decompress(z_header)
@@ -934,13 +989,7 @@
         self.assertEqual([('key1',), ('key4',)], result_order)
 
     def test__check_rebuild_no_changes(self):
-        locations, block = self.make_block(self._texts)
-        manager = groupcompress._LazyGroupContentManager(block)
-        # Request all the keys, which ensures that we won't rebuild
-        self.add_key_to_manager(('key1',), locations, block, manager)
-        self.add_key_to_manager(('key2',), locations, block, manager)
-        self.add_key_to_manager(('key3',), locations, block, manager)
-        self.add_key_to_manager(('key4',), locations, block, manager)
+        block, manager = self.make_block_and_full_manager(self._texts)
         manager._check_rebuild_block()
         self.assertIs(block, manager._block)
 
@@ -971,3 +1020,50 @@
             self.assertEqual(('key4',), record.key)
             self.assertEqual(self._texts[record.key],
                              record.get_bytes_as('fulltext'))
+
+    def test_check_is_well_utilized_all_keys(self):
+        block, manager = self.make_block_and_full_manager(self._texts)
+        self.assertFalse(manager.check_is_well_utilized())
+        # Though we can fake it by changing the recommended minimum size
+        manager._full_enough_block_size = block._content_length
+        self.assertTrue(manager.check_is_well_utilized())
+        # Setting it just above causes it to fail
+        manager._full_enough_block_size = block._content_length + 1
+        self.assertFalse(manager.check_is_well_utilized())
+        # Setting the mixed-block size doesn't do anything, because the content
+        # is considered to not be 'mixed'
+        manager._full_enough_mixed_block_size = block._content_length
+        self.assertFalse(manager.check_is_well_utilized())
+
+    def test_check_is_well_utilized_mixed_keys(self):
+        texts = {}
+        f1k1 = ('f1', 'k1')
+        f1k2 = ('f1', 'k2')
+        f2k1 = ('f2', 'k1')
+        f2k2 = ('f2', 'k2')
+        texts[f1k1] = self._texts[('key1',)]
+        texts[f1k2] = self._texts[('key2',)]
+        texts[f2k1] = self._texts[('key3',)]
+        texts[f2k2] = self._texts[('key4',)]
+        block, manager = self.make_block_and_full_manager(texts)
+        self.assertFalse(manager.check_is_well_utilized())
+        manager._full_enough_block_size = block._content_length
+        self.assertTrue(manager.check_is_well_utilized())
+        manager._full_enough_block_size = block._content_length + 1
+        self.assertFalse(manager.check_is_well_utilized())
+        manager._full_enough_mixed_block_size = block._content_length
+        self.assertTrue(manager.check_is_well_utilized())
+
+    def test_check_is_well_utilized_partial_use(self):
+        locations, block = self.make_block(self._texts)
+        manager = groupcompress._LazyGroupContentManager(block)
+        manager._full_enough_block_size = block._content_length
+        self.add_key_to_manager(('key1',), locations, block, manager)
+        self.add_key_to_manager(('key2',), locations, block, manager)
+        # Just using the content from key1 and 2 is not enough to be considered
+        # 'complete'
+        self.assertFalse(manager.check_is_well_utilized())
+        # However if we also add key4, then we have enough, as we only require
+        # 75% consumption
+        self.add_key_to_manager(('key4',), locations, block, manager)
+        self.assertTrue(manager.check_is_well_utilized())

=== modified file 'bzrlib/tests/test_repository.py'
--- a/bzrlib/tests/test_repository.py	2009-09-03 04:51:46 +0000
+++ b/bzrlib/tests/test_repository.py	2009-09-04 00:49:55 +0000
@@ -683,6 +683,28 @@
 
 class Test2a(tests.TestCaseWithMemoryTransport):
 
+    def test_fetch_combines_groups(self):
+        builder = self.make_branch_builder('source', format='2a')
+        builder.start_series()
+        builder.build_snapshot('1', None, [
+            ('add', ('', 'root-id', 'directory', '')),
+            ('add', ('file', 'file-id', 'file', 'content\n'))])
+        builder.build_snapshot('2', ['1'], [
+            ('modify', ('file-id', 'content-2\n'))])
+        builder.finish_series()
+        source = builder.get_branch()
+        target = self.make_repository('target', format='2a')
+        target.fetch(source.repository)
+        target.lock_read()
+        self.addCleanup(target.unlock)
+        details = target.texts._index.get_build_details(
+            [('file-id', '1',), ('file-id', '2',)])
+        file_1_details = details[('file-id', '1')]
+        file_2_details = details[('file-id', '2')]
+        # The index, and what to read off disk, should be the same for both
+        # versions of the file.
+        self.assertEqual(file_1_details[0][:3], file_2_details[0][:3])
+
     def test_format_pack_compresses_True(self):
         repo = self.make_repository('repo', format='2a')
         self.assertTrue(repo._format.pack_compresses)



