Rev 4670: Work out a heuristic about when a block is well utilized in http://bazaar.launchpad.net/~jameinel/bzr/2.1b1-pack-on-the-fly
John Arbash Meinel
john at arbash-meinel.com
Tue Sep 1 22:58:20 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1b1-pack-on-the-fly
------------------------------------------------------------
revno: 4670
revision-id: john at arbash-meinel.com-20090901215814-5x0804myuqf42j87
parent: john at arbash-meinel.com-20090901214127-gs4xwl6t5q81k0hh
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1b1-pack-on-the-fly
timestamp: Tue 2009-09-01 16:58:14 -0500
message:
Work out a heuristic about when a block is well utilized
which mirrors the code that decides when to generate a new block.
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-09-01 21:41:27 +0000
+++ b/bzrlib/groupcompress.py 2009-09-01 21:58:14 +0000
@@ -556,12 +556,12 @@
# If we are using more than half of the bytes from the block, we have
# nothing else to check
if total_bytes_used * 2 >= self._block._content_length:
- return None, last_byte_used
+ return None, last_byte_used, total_bytes_used
# We are using less than 50% of the content. Is the content we are
# using at the beginning of the block? If so, we can just trim the
# tail, rather than rebuilding from scratch.
if total_bytes_used * 2 > last_byte_used:
- return 'trim', last_byte_used
+ return 'trim', last_byte_used, total_bytes_used
# We are using a small amount of the data, and it isn't just packed
# nicely at the front, so rebuild the content.
@@ -574,10 +574,51 @@
# expanding many deltas into fulltexts, as well.
# If we build a cheap enough 'strip', then we could try a strip,
# if that expands the content, we then rebuild.
- return 'rebuild', last_byte_used
+ return 'rebuild', last_byte_used, total_bytes_used
+
+ def check_is_well_utilized(self):
+ """Is the current block considered 'well utilized'?
+
+ This is a bit of a heuristic, but it basically asks if the current
+ block considers itself to be a fully developed group, rather than just
+ a loose collection of data.
+ """
+ if len(self._factories) == 1:
+ # A block of length 1 is never considered 'well utilized' :)
+ return False
+ action, last_byte_used, total_bytes_used = self._check_rebuild_action()
+ if action is not None or total_bytes_used < self._block._content_length:
+ # This block wants to trim itself somehow, which inherently means
+ # that it is under-utilized, since it holds data that isn't being
+ # referenced
+ return False
+ # TODO: This code is meant to be the twin of _insert_record_stream's
+ # 'start_new_block' logic. It would probably be better to factor
+ # out that logic into a shared location, so that it stays
+ # together better
+ if self._block._content_length >= 4*1024*1024:
+ # This only violates the 'large content grows to 2x single content
+ # size' rule. However most of that is probably caught by the
+ # 'len(self._factories) == 1' check.
+ return True
+ # TODO: We can get the raw content's real size from the stored data. We
+ # have to zlib.decompress it, but we don't have to apply the deltas.
+ common_prefix = None
+ for factory in self._factories:
+ prefix = factory.key[:-1]
+ if common_prefix is None:
+ common_prefix = prefix
+ elif prefix != common_prefix:
+ # No common prefix
+ common_prefix = None
+ break
+ if common_prefix is None and self._block._content_length >= 2*1024*1024:
+ # Mixed content blocks are capped at 2MB
+ return True
+ return False
def _check_rebuild_block(self):
- action, last_byte_used = self._check_rebuild_action()
+ action, last_byte_used, total_bytes_used = self._check_rebuild_action()
if action is None:
return
if action == 'trim':
More information about the bazaar-commits mailing list