Rev 4672: We needed a bit more data to actually get groups doing delta-compression. in http://bazaar.launchpad.net/~jameinel/bzr/2.1b1-pack-on-the-fly
John Arbash Meinel
john at arbash-meinel.com
Wed Sep 2 20:54:55 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1b1-pack-on-the-fly
------------------------------------------------------------
revno: 4672
revision-id: john at arbash-meinel.com-20090902195448-1xueel7882fbuvx3
parent: john at arbash-meinel.com-20090902192442-xacg1ky4pz9mryvd
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1b1-pack-on-the-fly
timestamp: Wed 2009-09-02 14:54:48 -0500
message:
We needed a bit more data to actually get groups doing delta-compression.
We also now allow a given group to be trimmed down to 75% of its current bytes
before we consider it to be 'under-utilized'.
This is to match the 75% 'full enough' thresholds we use later on (3MB of a 4MB
full block, 1.5MB of a 2MB mixed block).
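
To make the cut-fraction rule concrete, here is a minimal standalone sketch
(illustrative Python only, not the bzrlib implementation; the helper name is
made up for this example): a group only counts as under-utilized once the
referenced bytes fall below 75% of the block's current content length.

    # Illustrative sketch; mirrors _LazyGroupContentManager._max_cut_fraction.
    MAX_CUT_FRACTION = 0.75

    def is_under_utilized(total_bytes_used, block_content_length):
        # Under-utilized means trimming would drop more than 25% of the block.
        return total_bytes_used < block_content_length * MAX_CUT_FRACTION

    # A 4MB block with 3.2MB referenced (80%) is still considered reusable...
    assert not is_under_utilized(3200000, 4000000)
    # ...while one with only 2MB referenced (50%) is under-utilized.
    assert is_under_utilized(2000000, 4000000)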
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-09-02 19:24:42 +0000
+++ b/bzrlib/groupcompress.py 2009-09-02 19:54:48 +0000
@@ -468,6 +468,14 @@
class _LazyGroupContentManager(object):
"""This manages a group of _LazyGroupCompressFactory objects."""
+ _max_cut_fraction = 0.75 # We allow a block to be trimmed to 75% of
+ # current size, and still be considered
+ # reusable
+ _full_block_size = 4*1024*1024
+ _full_mixed_block_size = 2*1024*1024
+ _full_enough_block_size = 3*1024*1024 # size at which we won't repack
+ _full_enough_mixed_block_size = 2*768*1024 # 1.5MB
+
def __init__(self, block):
self._block = block
# We need to preserve the ordering
@@ -587,10 +595,10 @@
# A block of length 1 is never considered 'well utilized' :)
return False
action, last_byte_used, total_bytes_used = self._check_rebuild_action()
- if action is not None or total_bytes_used < self._block._content_length:
- # This block wants to trim itself somehow, which inherently means
- # that it is under-utilized, since it holds data that isn't being
- # referenced
+ block_size = self._block._content_length
+ if total_bytes_used < block_size * self._max_cut_fraction:
+ # This block wants to trim itself small enough that we want to
+ # consider it under-utilized.
return False
# TODO: This code is meant to be the twin of _insert_record_stream's
# 'start_new_block' logic. It would probably be better to factor
@@ -607,7 +615,7 @@
# object, it may actually be under-utilized. However, given that this
# is 'pack-on-the-fly' it is probably reasonable to not repack large
# content blobs on-the-fly.
- if self._block._content_length >= 3*1024*1024:
+ if block_size >= self._full_enough_block_size:
return True
# If a block is <3MB, it still may be considered 'full' if it contains
# mixed content. The current rule is 2MB of mixed content is considered
@@ -620,7 +628,7 @@
common_prefix = prefix
elif prefix != common_prefix:
# Mixed content, check the size appropriately
- if self._block._content_length >= 2*768*1024: #1.5MB
+ if block_size >= self._full_enough_mixed_block_size:
return True
break
# The content failed both the mixed check and the single-content check
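
For readers skimming the patch, the decision implemented across the hunks above
roughly reduces to the following free-standing sketch (illustrative Python only,
not the real check_is_well_utilized method; it omits the _check_rebuild_action
plumbing and the per-prefix scan that detects mixed content):

    # Threshold values copied from the class attributes added above.
    FULL_ENOUGH_BLOCK_SIZE = 3*1024*1024        # _full_enough_block_size
    FULL_ENOUGH_MIXED_BLOCK_SIZE = 2*768*1024   # _full_enough_mixed_block_size (1.5MB)
    MAX_CUT_FRACTION = 0.75                     # _max_cut_fraction

    def looks_well_utilized(num_entries, total_bytes_used, block_size, is_mixed):
        if num_entries <= 1:
            return False    # a block of length 1 is never 'well utilized'
        if total_bytes_used < block_size * MAX_CUT_FRACTION:
            return False    # the block wants to trim itself too much
        if block_size >= FULL_ENOUGH_BLOCK_SIZE:
            return True     # large content blobs are not repacked on the fly
        if is_mixed and block_size >= FULL_ENOUGH_MIXED_BLOCK_SIZE:
            return True     # mixed content gets the lower 1.5MB bar
        return False        # otherwise, allow it to be repacked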
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-08-28 02:37:00 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-09-02 19:54:48 +0000
@@ -811,15 +811,19 @@
_texts = {
('key1',): "this is a text\n"
- "with a reasonable amount of compressible bytes\n",
+ "with a reasonable amount of compressible bytes\n"
+ "which can be shared between various other texts\n",
('key2',): "another text\n"
- "with a reasonable amount of compressible bytes\n",
+ "with a reasonable amount of compressible bytes\n"
+ "which can be shared between various other texts\n",
('key3',): "yet another text which won't be extracted\n"
- "with a reasonable amount of compressible bytes\n",
+ "with a reasonable amount of compressible bytes\n"
+ "which can be shared between various other texts\n",
('key4',): "this will be extracted\n"
"but references most of its bytes from\n"
"yet another text which won't be extracted\n"
- "with a reasonable amount of compressible bytes\n",
+ "with a reasonable amount of compressible bytes\n"
+ "which can be shared between various other texts\n",
}
def make_block(self, key_to_text):
"""Create a GroupCompressBlock, filling it with the given texts."""
@@ -837,6 +841,13 @@
start, end = locations[key]
manager.add_factory(key, (), start, end)
+ def make_block_and_full_manager(self, texts):
+ locations, block = self.make_block(texts)
+ manager = groupcompress._LazyGroupContentManager(block)
+ for key in sorted(texts):
+ self.add_key_to_manager(key, locations, block, manager)
+ return block, manager
+
def test_get_fulltexts(self):
locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
@@ -934,13 +945,7 @@
self.assertEqual([('key1',), ('key4',)], result_order)
def test__check_rebuild_no_changes(self):
- locations, block = self.make_block(self._texts)
- manager = groupcompress._LazyGroupContentManager(block)
- # Request all the keys, which ensures that we won't rebuild
- self.add_key_to_manager(('key1',), locations, block, manager)
- self.add_key_to_manager(('key2',), locations, block, manager)
- self.add_key_to_manager(('key3',), locations, block, manager)
- self.add_key_to_manager(('key4',), locations, block, manager)
+ block, manager = self.make_block_and_full_manager(self._texts)
manager._check_rebuild_block()
self.assertIs(block, manager._block)
@@ -971,3 +976,50 @@
self.assertEqual(('key4',), record.key)
self.assertEqual(self._texts[record.key],
record.get_bytes_as('fulltext'))
+
+ def test_check_is_well_utilized_all_keys(self):
+ block, manager = self.make_block_and_full_manager(self._texts)
+ self.assertFalse(manager.check_is_well_utilized())
+ # Though we can fake it by changing the recommended minimum size
+ manager._full_enough_block_size = block._content_length
+ self.assertTrue(manager.check_is_well_utilized())
+ # Setting it just above causes it to fail
+ manager._full_enough_block_size = block._content_length + 1
+ self.assertFalse(manager.check_is_well_utilized())
+ # Setting the mixed-block size doesn't do anything, because the content
+ # is considered to not be 'mixed'
+ manager._full_enough_mixed_block_size = block._content_length
+ self.assertFalse(manager.check_is_well_utilized())
+
+ def test_check_is_well_utilized_mixed_keys(self):
+ texts = {}
+ f1k1 = ('f1', 'k1')
+ f1k2 = ('f1', 'k2')
+ f2k1 = ('f2', 'k1')
+ f2k2 = ('f2', 'k2')
+ texts[f1k1] = self._texts[('key1',)]
+ texts[f1k2] = self._texts[('key2',)]
+ texts[f2k1] = self._texts[('key3',)]
+ texts[f2k2] = self._texts[('key4',)]
+ block, manager = self.make_block_and_full_manager(texts)
+ self.assertFalse(manager.check_is_well_utilized())
+ manager._full_enough_block_size = block._content_length
+ self.assertTrue(manager.check_is_well_utilized())
+ manager._full_enough_block_size = block._content_length + 1
+ self.assertFalse(manager.check_is_well_utilized())
+ manager._full_enough_mixed_block_size = block._content_length
+ self.assertTrue(manager.check_is_well_utilized())
+
+ def test_check_is_well_utilized_partial_use(self):
+ locations, block = self.make_block(self._texts)
+ manager = groupcompress._LazyGroupContentManager(block)
+ manager._full_enough_block_size = block._content_length
+ self.add_key_to_manager(('key1',), locations, block, manager)
+ self.add_key_to_manager(('key2',), locations, block, manager)
+ # Just using the content from key1 and 2 is not enough to be considered
+ # 'complete'
+ self.assertFalse(manager.check_is_well_utilized())
+ # However, once we also add key4, we have enough, as we only require 75%
+ # consumption
+ self.add_key_to_manager(('key4',), locations, block, manager)
+ self.assertTrue(manager.check_is_well_utilized())