Rev 3905: Try splitting the chk stream based on prefix, rather than just 'newness'. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack
John Arbash Meinel
john at arbash-meinel.com
Wed Mar 25 14:53:24 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack
------------------------------------------------------------
revno: 3905
revision-id: john at arbash-meinel.com-20090325145300-gycg1qqm2mtbm7tq
parent: john at arbash-meinel.com-20090324215215-ceot294mf4q57qh8
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: split_pack
timestamp: Wed 2009-03-25 09:53:00 -0500
message:
Try splitting the chk stream based on prefix, rather than just 'newness'.
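In brief: rather than emitting one record stream per tree level (and walking
'recent' and 'old' roots in separate passes), the traversal now buckets each
internal node's child keys by the search prefix that reaches them, and emits
one stream per bucket, so pages that are neighbours in the CHK trie come out
adjacent in the pack. A minimal sketch of that bucketing step follows; the
items dict and key tuples are illustrative stand-ins for
chk_map.InternalNode._items and real CHK page keys, not bzrlib APIs:

    # Hypothetical stand-in for handle_internal_node in the patch below:
    # queue each child page once, keyed by the prefix that reaches it,
    # and skip pages that live outside the packs being combined.
    def bucket_children(items, remaining, queued, buckets):
        for prefix, key in items.items():
            if key not in queued and key in remaining:
                queued.add(key)
                buckets.setdefault(prefix, []).append(key)

    buckets, queued = {}, set()
    remaining = set([('sha1:aa',), ('sha1:ab',), ('sha1:b0',)])
    bucket_children({'aa': ('sha1:aa',), 'ab': ('sha1:ab',)},
                    remaining, queued, buckets)
    bucket_children({'b': ('sha1:b0',)}, remaining, queued, buckets)
    # buckets == {'aa': [('sha1:aa',)], 'ab': [('sha1:ab',)],
    #             'b': [('sha1:b0',)]}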
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py 2009-03-24 21:52:15 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py 2009-03-25 14:53:00 +0000
@@ -251,75 +251,63 @@
inv = inventory.CHKInventory(None)
self._text_refs = set()
def _get_referenced_stream(root_keys, parse_leaf_nodes=False):
- cur_keys = root_keys
- while cur_keys:
- keys_by_search_prefix = {}
- remaining_keys.difference_update(cur_keys)
+ keys_by_search_prefix = {'':root_keys}
+ def handle_internal_node(node):
+ for prefix, value in node._items.iteritems():
+ # We don't want to request the same key twice, and we
+ # want to order it by the first time it is seen.
+ # Even further, we don't want to request a key which is
+ # not in this group of pack files (it should be in the
+ # repo, but it doesn't have to be in the group being
+ # packed.)
+ # TODO: consider how to treat externally referenced chk
+ # pages as 'external_references' so that we
+ # always fill them in for stacked branches
+ if value not in next_keys and value in remaining_keys:
+ keys_by_search_prefix.setdefault(prefix,
+ []).append(value)
+ next_keys.add(value)
+ def handle_leaf_node(node):
+ # Store is None, because we know we have a LeafNode, and we
+ # just want its entries
+ for file_id, bytes in node.iteritems(None):
+ entry = inv._bytes_to_entry(bytes)
+ self._text_refs.add((entry.file_id, entry.revision))
+ def next_stream(subkeys):
+ remaining_keys.difference_update(subkeys)
+ stream = source_vf.get_record_stream(subkeys,
+ 'as-requested', True)
+ for record in stream:
+ bytes = record.get_bytes_as('fulltext')
+ # We don't care about search_key_func for this code,
+ # because we only care about external references.
+ node = chk_map._deserialise(bytes, record.key,
+ search_key_func=None)
+ common_base = node._search_prefix
+ if isinstance(node, chk_map.InternalNode):
+ handle_internal_node(node)
+ elif parse_leaf_nodes:
+ handle_leaf_node(node)
+ counter[0] += 1
+ if pb is not None:
+ pb.update('chk node', counter[0], total_keys)
+ yield record
+ while keys_by_search_prefix:
next_keys = set()
- def handle_internal_node(node):
- for prefix, value in node._items.iteritems():
- # We don't want to request the same key twice, and we
- # want to order it by the first time it is seen.
- # Even further, we don't want to request a key which is
- # not in this group of pack files (it should be in the
- # repo, but it doesn't have to be in the group being
- # packed.)
- # TODO: consider how to treat externally referenced chk
- # pages as 'external_references' so that we
- # always fill them in for stacked branches
- if value not in next_keys and value in remaining_keys:
- keys_by_search_prefix.setdefault(prefix,
- []).append(value)
- next_keys.add(value)
- def handle_leaf_node(node):
- # Store is None, because we know we have a LeafNode, and we
- # just want its entries
- for file_id, bytes in node.iteritems(None):
- entry = inv._bytes_to_entry(bytes)
- self._text_refs.add((entry.file_id, entry.revision))
- def next_stream():
- stream = source_vf.get_record_stream(cur_keys,
- 'as-requested', True)
- for record in stream:
- bytes = record.get_bytes_as('fulltext')
- # We don't care about search_key_func for this code,
- # because we only care about external references.
- node = chk_map._deserialise(bytes, record.key,
- search_key_func=None)
- common_base = node._search_prefix
- if isinstance(node, chk_map.InternalNode):
- handle_internal_node(node)
- elif parse_leaf_nodes:
- handle_leaf_node(node)
- counter[0] += 1
- if pb is not None:
- pb.update('chk node', counter[0], total_keys)
- yield record
- yield next_stream()
- # Double check that we won't be emitting any keys twice
- # If we get rid of the pre-calculation of all keys, we could
- # turn this around and do
- # next_keys.difference_update(seen_keys)
- # However, we also may have references to chk pages in another
- # pack file during autopack. We filter earlier, so we should no
- # longer need to do this
- # next_keys = next_keys.intersection(remaining_keys)
- cur_keys = []
+ todo = []
+ # Empty the current keys_by_search_prefix by popping it into a
+ # todo list. Processing those nodes will then fill it up again
+ # with the next level of keys.
for prefix in sorted(keys_by_search_prefix):
- cur_keys.extend(keys_by_search_prefix.pop(prefix))
- recent_roots = self._chk_id_roots[:self._RECENT_HORIZON]
- old_roots = self._chk_id_roots[self._RECENT_HORIZON:]
- del self._chk_id_roots
+ todo.append(keys_by_search_prefix.pop(prefix))
+ for subkeys in todo:
+ yield next_stream(subkeys)
# Grab the text keys that are referenced by recent commits, so we can
# prioritize those as well
- for stream in _get_referenced_stream(recent_roots,
- self._gather_text_refs):
- yield stream
- del recent_roots
- for stream in _get_referenced_stream(old_roots,
+ for stream in _get_referenced_stream(self._chk_id_roots,
self._gather_text_refs):
yield stream
- del old_roots
+ del self._chk_id_roots
# while it isn't really possible for chk_id_roots to not be in the
# local group of packs, it is possible that the tree shape has not
# changed recently, so we need to filter _chk_p_id_roots by the
@@ -327,14 +315,9 @@
chk_p_id_roots = [key for key in self._chk_p_id_roots
if key in remaining_keys]
del self._chk_p_id_roots
- recent_pid_roots = chk_p_id_roots[:self._RECENT_HORIZON]
- old_pid_roots = chk_p_id_roots[self._RECENT_HORIZON:]
- for stream in _get_referenced_stream(recent_pid_roots, False):
- yield stream
- del recent_pid_roots
- for stream in _get_referenced_stream(old_pid_roots, False):
- yield stream
- del old_pid_roots
+ for stream in _get_referenced_stream(chk_p_id_roots, False):
+ yield stream
+ del chk_p_id_roots
if remaining_keys:
trace.mutter('There were %d keys in the chk index, %d of which'
' were not referenced', total_keys,
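For orientation, the control flow after this patch amounts to: drain the
prefix buckets in sorted order, fetch each bucket as its own 'as-requested'
stream, and let each fetched internal node refill the buckets with the next
level of keys. A compact, hedged sketch of that shape, in which fetch_records
and parse_children are stand-ins for source_vf.get_record_stream plus
chk_map._deserialise, and a single 'seen' set simplifies the
next_keys/remaining_keys bookkeeping above:

    def stream_by_prefix(root_keys, remaining, fetch_records, parse_children):
        # Page keys still to fetch, keyed by the prefix that reaches them.
        buckets = {'': list(root_keys)}
        seen = set(root_keys)
        while buckets:
            # Snapshot the current buckets; fetching them refills
            # 'buckets' with the next level of keys.
            todo = [buckets.pop(prefix) for prefix in sorted(buckets)]
            for subkeys in todo:
                remaining.difference_update(subkeys)
                for record in fetch_records(subkeys):
                    for prefix, key in parse_children(record):
                        # Queue each page once, and only pages that live
                        # in the packs being combined.
                        if key not in seen and key in remaining:
                            seen.add(key)
                            buckets.setdefault(prefix, []).append(key)
                    yield record

    # Toy usage: two roots fanning out under prefixes 'a' and 'b'; a
    # 'record' here is just its key.
    pages = {('root1',): [('a', ('K-a',))], ('root2',): [('b', ('K-b',))],
             ('K-a',): [], ('K-b',): []}
    ordered = stream_by_prefix([('root1',), ('root2',)], set(pages),
                               iter, lambda record: pages[record])
    # list(ordered) -> [('root1',), ('root2',), ('K-a',), ('K-b',)]

Since CHK pages under the same prefix tend to share content across revisions,
grouping them this way should hand the groupcompress delta compressor more
similar texts than a purely newness-ordered stream, which appears to be the
point of the experiment the commit message describes.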