Rev 4672: We needed a bit more data to actually get groups doing delta-compression. in http://bazaar.launchpad.net/~jameinel/bzr/2.1b1-pack-on-the-fly

John Arbash Meinel john at arbash-meinel.com
Wed Sep 2 20:54:55 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/2.1b1-pack-on-the-fly

------------------------------------------------------------
revno: 4672
revision-id: john at arbash-meinel.com-20090902195448-1xueel7882fbuvx3
parent: john at arbash-meinel.com-20090902192442-xacg1ky4pz9mryvd
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1b1-pack-on-the-fly
timestamp: Wed 2009-09-02 14:54:48 -0500
message:
  We needed a bit more data to actually get groups doing delta-compression.
  We also now allow a given group to be trimmed down to 75% of its current bytes
  before we consider it to be 'under-utilized'.
  This matches the 75% threshold for 'fully utilized' that we use later on.
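As a rough illustration of the rules this change ends up with (a sketch only; the
helper function and names below are invented for this example and are not bzrlib
API): a block is treated as under-utilized once the bytes actually referenced drop
below 75% of its current content length, and it only counts as 'full enough' to
skip repacking at 3MB, or 1.5MB when it holds content with mixed key prefixes.

# Illustrative sketch of the thresholds described above, not the bzrlib code.
MAX_CUT_FRACTION = 0.75                     # may be trimmed to 75% and stay reusable
FULL_ENOUGH_BLOCK_SIZE = 3*1024*1024        # 3MB: big enough, don't repack
FULL_ENOUGH_MIXED_BLOCK_SIZE = 2*768*1024   # 1.5MB for mixed-prefix content

def is_well_utilized(content_length, bytes_used, mixed_prefixes):
    """Return True if a block of content_length bytes, of which bytes_used
    are actually referenced, should be reused as-is."""
    if bytes_used < content_length * MAX_CUT_FRACTION:
        # Would be trimmed by more than 25%: under-utilized.
        return False
    if content_length >= FULL_ENOUGH_BLOCK_SIZE:
        return True
    if mixed_prefixes and content_length >= FULL_ENOUGH_MIXED_BLOCK_SIZE:
        return True
    return False

# For example, a 4MB block with only 2.5MB referenced would be rebuilt
# (2.5MB < 0.75 * 4MB), while one with 3.2MB referenced would be kept.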
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-09-02 19:24:42 +0000
+++ b/bzrlib/groupcompress.py	2009-09-02 19:54:48 +0000
@@ -468,6 +468,14 @@
 class _LazyGroupContentManager(object):
     """This manages a group of _LazyGroupCompressFactory objects."""
 
+    _max_cut_fraction = 0.75 # We allow a block to be trimmed to 75% of
+                             # current size, and still be considered
+                             # reusable
+    _full_block_size = 4*1024*1024
+    _full_mixed_block_size = 2*1024*1024
+    _full_enough_block_size = 3*1024*1024 # size at which we won't repack
+    _full_enough_mixed_block_size = 2*768*1024 # 1.5MB
+
     def __init__(self, block):
         self._block = block
         # We need to preserve the ordering
@@ -587,10 +595,10 @@
             # A block of length 1 is never considered 'well utilized' :)
             return False
         action, last_byte_used, total_bytes_used = self._check_rebuild_action()
-        if action is not None or total_bytes_used < self._block._content_length:
-            # This block wants to trim itself somehow, which inherently means
-            # that it is under-utilized, since it holds data that isn't being
-            # referenced
+        block_size = self._block._content_length
+        if total_bytes_used < block_size * self._max_cut_fraction:
+            # This block would trim itself down far enough that we
+            # consider it under-utilized.
             return False
         # TODO: This code is meant to be the twin of _insert_record_stream's
         #       'start_new_block' logic. It would probably be better to factor
@@ -607,7 +615,7 @@
         # object, it may actually be under-utilized. However, given that this
         # is 'pack-on-the-fly' it is probably reasonable to not repack large
         # content blobs on-the-fly.
-        if self._block._content_length >= 3*1024*1024:
+        if block_size >= self._full_enough_block_size:
             return True
         # If a block is <3MB, it still may be considered 'full' if it contains
         # mixed content. The current rule is 2MB of mixed content is considered
@@ -620,7 +628,7 @@
                 common_prefix = prefix
             elif prefix != common_prefix:
                 # Mixed content, check the size appropriately
-                if self._block._content_length >= 2*768*1024: #1.5MB
+                if block_size >= self._full_enough_mixed_block_size:
                     return True
                 break
         # The content failed both the mixed check and the single-content check

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-08-28 02:37:00 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-09-02 19:54:48 +0000
@@ -811,15 +811,19 @@
 
     _texts = {
         ('key1',): "this is a text\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
         ('key2',): "another text\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
         ('key3',): "yet another text which won't be extracted\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
         ('key4',): "this will be extracted\n"
                    "but references most of its bytes from\n"
                    "yet another text which won't be extracted\n"
-                   "with a reasonable amount of compressible bytes\n",
+                   "with a reasonable amount of compressible bytes\n"
+                   "which can be shared between various other texts\n",
     }
     def make_block(self, key_to_text):
         """Create a GroupCompressBlock, filling it with the given texts."""
@@ -837,6 +841,13 @@
         start, end = locations[key]
         manager.add_factory(key, (), start, end)
 
+    def make_block_and_full_manager(self, texts):
+        locations, block = self.make_block(texts)
+        manager = groupcompress._LazyGroupContentManager(block)
+        for key in sorted(texts):
+            self.add_key_to_manager(key, locations, block, manager)
+        return block, manager
+
     def test_get_fulltexts(self):
         locations, block = self.make_block(self._texts)
         manager = groupcompress._LazyGroupContentManager(block)
@@ -934,13 +945,7 @@
         self.assertEqual([('key1',), ('key4',)], result_order)
 
     def test__check_rebuild_no_changes(self):
-        locations, block = self.make_block(self._texts)
-        manager = groupcompress._LazyGroupContentManager(block)
-        # Request all the keys, which ensures that we won't rebuild
-        self.add_key_to_manager(('key1',), locations, block, manager)
-        self.add_key_to_manager(('key2',), locations, block, manager)
-        self.add_key_to_manager(('key3',), locations, block, manager)
-        self.add_key_to_manager(('key4',), locations, block, manager)
+        block, manager = self.make_block_and_full_manager(self._texts)
         manager._check_rebuild_block()
         self.assertIs(block, manager._block)
 
@@ -971,3 +976,50 @@
             self.assertEqual(('key4',), record.key)
             self.assertEqual(self._texts[record.key],
                              record.get_bytes_as('fulltext'))
+
+    def test_check_is_well_utilized_all_keys(self):
+        block, manager = self.make_block_and_full_manager(self._texts)
+        self.assertFalse(manager.check_is_well_utilized())
+        # Though we can fake it by changing the recommended minimum size
+        manager._full_enough_block_size = block._content_length
+        self.assertTrue(manager.check_is_well_utilized())
+        # Setting it just above causes it to fail
+        manager._full_enough_block_size = block._content_length + 1
+        self.assertFalse(manager.check_is_well_utilized())
+        # Setting the mixed-block size doesn't do anything, because the content
+        # is considered to not be 'mixed'
+        manager._full_enough_mixed_block_size = block._content_length
+        self.assertFalse(manager.check_is_well_utilized())
+
+    def test_check_is_well_utilized_mixed_keys(self):
+        texts = {}
+        f1k1 = ('f1', 'k1')
+        f1k2 = ('f1', 'k2')
+        f2k1 = ('f2', 'k1')
+        f2k2 = ('f2', 'k2')
+        texts[f1k1] = self._texts[('key1',)]
+        texts[f1k2] = self._texts[('key2',)]
+        texts[f2k1] = self._texts[('key3',)]
+        texts[f2k2] = self._texts[('key4',)]
+        block, manager = self.make_block_and_full_manager(texts)
+        self.assertFalse(manager.check_is_well_utilized())
+        manager._full_enough_block_size = block._content_length
+        self.assertTrue(manager.check_is_well_utilized())
+        manager._full_enough_block_size = block._content_length + 1
+        self.assertFalse(manager.check_is_well_utilized())
+        manager._full_enough_mixed_block_size = block._content_length
+        self.assertTrue(manager.check_is_well_utilized())
+
+    def test_check_is_well_utilized_partial_use(self):
+        locations, block = self.make_block(self._texts)
+        manager = groupcompress._LazyGroupContentManager(block)
+        manager._full_enough_block_size = block._content_length
+        self.add_key_to_manager(('key1',), locations, block, manager)
+        self.add_key_to_manager(('key2',), locations, block, manager)
+        # Just using the content from key1 and 2 is not enough to be considered
+        # 'complete'
+        self.assertFalse(manager.check_is_well_utilized())
+        # However, if we also add key4, then we have enough, as we only
+        # require 75% consumption
+        self.add_key_to_manager(('key4',), locations, block, manager)
+        self.assertTrue(manager.check_is_well_utilized())


