Rev 3764: Some hack code to investigate where we spend all of our time during 'generate_text_key_index' in http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/generate_text_index_investigation
John Arbash Meinel
john at arbash-meinel.com
Wed Oct 8 20:58:37 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/generate_text_index_investigation
------------------------------------------------------------
revno: 3764
revision-id: john at arbash-meinel.com-20081008195805-j836b8kjeh7n60y4
parent: pqm at pqm.ubuntu.com-20081002172844-d6df1l8dzpsqzyup
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: generate_text_index_investigation
timestamp: Wed 2008-10-08 14:58:05 -0500
message:
Some hack code to investigate where we spend all of our time during 'generate_text_key_index'
-------------- next part --------------
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py 2008-07-16 18:14:23 +0000
+++ b/bzrlib/graph.py 2008-10-08 19:58:05 +0000
@@ -1085,6 +1085,8 @@
def __init__(self, graph):
self.graph = graph
self._heads = {}
+ self._try = 0
+ self._miss = 0
def heads(self, keys):
"""Return the heads of keys.
@@ -1100,8 +1102,10 @@
"""
keys = frozenset(keys)
try:
+ self._try += 1
return set(self._heads[keys])
except KeyError:
+ self._miss += 1
heads = self.graph.heads(keys)
self._heads[keys] = heads
return set(heads)
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2008-10-01 05:40:45 +0000
+++ b/bzrlib/repository.py 2008-10-08 19:58:05 +0000
@@ -1408,37 +1408,67 @@
# a cache of the text keys to allow reuse; costs a dict of all the
# keys, but saves a 2-tuple for every child of a given key.
text_key_cache = {}
+ per_file_indexes = {}
for text_key, valid in text_key_references.iteritems():
if not valid:
invalid_keys.add(text_key)
else:
revision_keys[text_key[1]].add(text_key)
text_key_cache[text_key] = text_key
+ per_file_indexes[text_key[0]] = {}
del text_key_references
text_index = {}
text_graph = graph.Graph(graph.DictParentsProvider(text_index))
+ ancestor_graph = graph.HeadsCache(graph.Graph(
+ graph.DictParentsProvider(ancestors)))
NULL_REVISION = _mod_revision.NULL_REVISION
# Set a cache with a size of 10 - this suffices for bzr.dev but may be
# too small for large or very branchy trees. However, for 55K path
# trees, it would be easy to use too much memory trivially. Ideally we
# could gauge this by looking at available real memory etc, but this is
# always a tricky proposition.
- inventory_cache = lru_cache.LRUCache(10)
- batch_size = 10 # should be ~150MB on a 55K path tree
+ inventory_cache = lru_cache.LRUCache(1000)
+ batch_size = 400 # should be ~150MB on a 55K path tree
batch_count = len(revision_order) / batch_size + 1
processed_texts = 0
pb.update("Calculating text parents.", processed_texts, text_count)
+ check_parent_hit = 0
+ check_parent_miss = 0
+ inv_hit = 0
+ inv_miss = 0
+ tstart = time.clock()
+ t_rev_tree = 0.0
for offset in xrange(batch_count):
to_query = revision_order[offset * batch_size:(offset + 1) *
batch_size]
if not to_query:
break
- for rev_tree in self.revision_trees(to_query):
+ start = time.clock()
+ rev_trees = list(self.revision_trees(to_query))
+ end = time.clock()
+ t_rev_tree += (end - start)
+ for rev_tree in rev_trees:
revision_id = rev_tree.get_revision_id()
+ inventory_cache[revision_id] = rev_tree.inventory
parent_ids = ancestors[revision_id]
for text_key in revision_keys[revision_id]:
+ file_id, file_rev_id = text_key
pb.update("Calculating text parents.", processed_texts)
processed_texts += 1
+ if processed_texts & 0xFF == 0:
+ tnow = time.clock()
+ pb.note('%5.1fs %5.1fs %s/%s:'
+ # ' heads t/m/r: %s/%s/%.1f%%'
+ ' cp h/m %s/%s'
+ ' inv h/m %s/%s',
+ tnow - tstart, t_rev_tree,
+ processed_texts, text_count,
+ # ancestor_graph._try, ancestor_graph._miss,
+ # ancestor_graph._miss * 100.0 /
+ # float(ancestor_graph._try),
+ check_parent_hit, check_parent_miss,
+ inv_hit, inv_miss)
+
candidate_parents = []
for parent_id in parent_ids:
parent_text_key = (text_key[0], parent_id)
@@ -1454,25 +1484,39 @@
# look at the parent commit details inventories to
# determine possible candidates in the per file graph.
# TODO: cache here.
+ check_parent_miss += 1
try:
inv = inventory_cache[parent_id]
+ inv_hit += 1
except KeyError:
inv = self.revision_tree(parent_id).inventory
inventory_cache[parent_id] = inv
+ inv_miss += 1
parent_entry = inv._byid.get(text_key[0], None)
if parent_entry is not None:
parent_text_key = (
text_key[0], parent_entry.revision)
else:
parent_text_key = None
+ else:
+ check_parent_hit += 1
if parent_text_key is not None:
candidate_parents.append(
text_key_cache[parent_text_key])
+ # pfi_graph = graph.Graph(graph.DictParentsProvider(
+ # per_file_indexes[file_id]))
parent_heads = text_graph.heads(candidate_parents)
+ # parent_heads = pfi_graph.heads(candidate_parents)
+ # alt_parent_heads = text_graph.heads(candidate_parents)
+ # assert parent_heads == alt_parent_heads
new_parents = list(parent_heads)
new_parents.sort(key=lambda x:candidate_parents.index(x))
if new_parents == []:
- new_parents = [NULL_REVISION]
+ new_parents = (NULL_REVISION,)
+ # else:
+ # new_parents = tuple([(file_id, p_id) for p_id
+ # in new_parents])
+ per_file_indexes[file_id][text_key] = new_parents
text_index[text_key] = new_parents
for text_key in invalid_keys:
More information about the bazaar-commits
mailing list