Rev 3764: Some hack code to investigate where we spend all of our time during 'generate_text_key_index' in http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/generate_text_index_investigation

Wed Oct 8 20:58:37 BST 2008

At http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/generate_text_index_investigation

------------------------------------------------------------
revno: 3764
revision-id: john at arbash-meinel.com-20081008195805-j836b8kjeh7n60y4
parent: pqm at pqm.ubuntu.com-20081002172844-d6df1l8dzpsqzyup
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: generate_text_index_investigation
timestamp: Wed 2008-10-08 14:58:05 -0500
message:
  Some hack code to investigate where we spend all of our time during 'generate_text_key_index'
-------------- next part --------------
=== modified file 'bzrlib/graph.py'

--- a/bzrlib/graph.py	2008-07-16 18:14:23 +0000
+++ b/bzrlib/graph.py	2008-10-08 19:58:05 +0000
@@ -1085,6 +1085,8 @@
     def __init__(self, graph):
         self.graph = graph
         self._heads = {}
+        self._try = 0
+        self._miss = 0
 
     def heads(self, keys):
         """Return the heads of keys.
@@ -1100,8 +1102,10 @@
         """
         keys = frozenset(keys)
         try:
+            self._try += 1
             return set(self._heads[keys])
         except KeyError:
+            self._miss += 1
             heads = self.graph.heads(keys)
             self._heads[keys] = heads
             return set(heads)

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2008-10-01 05:40:45 +0000
+++ b/bzrlib/repository.py	2008-10-08 19:58:05 +0000
@@ -1408,37 +1408,67 @@
         # a cache of the text keys to allow reuse; costs a dict of all the
         # keys, but saves a 2-tuple for every child of a given key.
         text_key_cache = {}
+        per_file_indexes = {}
         for text_key, valid in text_key_references.iteritems():
             if not valid:
                 invalid_keys.add(text_key)
             else:
                 revision_keys[text_key[1]].add(text_key)
             text_key_cache[text_key] = text_key
+            per_file_indexes[text_key[0]] = {}
         del text_key_references
         text_index = {}
         text_graph = graph.Graph(graph.DictParentsProvider(text_index))
+        ancestor_graph = graph.HeadsCache(graph.Graph(
+            graph.DictParentsProvider(ancestors)))
         NULL_REVISION = _mod_revision.NULL_REVISION
         # Set a cache with a size of 10 - this suffices for bzr.dev but may be
         # too small for large or very branchy trees. However, for 55K path
         # trees, it would be easy to use too much memory trivially. Ideally we
         # could gauge this by looking at available real memory etc, but this is
         # always a tricky proposition.
-        inventory_cache = lru_cache.LRUCache(10)
-        batch_size = 10 # should be ~150MB on a 55K path tree
+        inventory_cache = lru_cache.LRUCache(1000)
+        batch_size = 400 # should be ~150MB on a 55K path tree
         batch_count = len(revision_order) / batch_size + 1
         processed_texts = 0
         pb.update("Calculating text parents.", processed_texts, text_count)
+        check_parent_hit = 0
+        check_parent_miss = 0
+        inv_hit = 0
+        inv_miss = 0
+        tstart = time.clock()
+        t_rev_tree = 0.0
         for offset in xrange(batch_count):
             to_query = revision_order[offset * batch_size:(offset + 1) *
                 batch_size]
             if not to_query:
                 break
-            for rev_tree in self.revision_trees(to_query):
+            start = time.clock()
+            rev_trees = list(self.revision_trees(to_query))
+            end = time.clock()
+            t_rev_tree += (end - start)
+            for rev_tree in rev_trees:
                 revision_id = rev_tree.get_revision_id()
+                inventory_cache[revision_id] = rev_tree.inventory
                 parent_ids = ancestors[revision_id]
                 for text_key in revision_keys[revision_id]:
+                    file_id, file_rev_id = text_key
                     pb.update("Calculating text parents.", processed_texts)
                     processed_texts += 1
+                    if processed_texts & 0xFF == 0:
+                        tnow = time.clock()
+                        pb.note('%5.1fs %5.1fs %s/%s:'
+                                # ' heads t/m/r: %s/%s/%.1f%%'
+                                '  cp h/m %s/%s'
+                                '  inv h/m %s/%s',
+                                tnow - tstart, t_rev_tree,
+                                processed_texts, text_count,
+                                # ancestor_graph._try, ancestor_graph._miss,
+                                # ancestor_graph._miss * 100.0 /
+                                #     float(ancestor_graph._try),
+                                check_parent_hit, check_parent_miss,
+                                inv_hit, inv_miss)
+
                     candidate_parents = []
                     for parent_id in parent_ids:
                         parent_text_key = (text_key[0], parent_id)
@@ -1454,25 +1484,39 @@
                             # look at the parent commit details inventories to
                             # determine possible candidates in the per file graph.
                             # TODO: cache here.
+                            check_parent_miss += 1
                             try:
                                 inv = inventory_cache[parent_id]
+                                inv_hit += 1
                             except KeyError:
                                 inv = self.revision_tree(parent_id).inventory
                                 inventory_cache[parent_id] = inv
+                                inv_miss += 1
                             parent_entry = inv._byid.get(text_key[0], None)
                             if parent_entry is not None:
                                 parent_text_key = (
                                     text_key[0], parent_entry.revision)
                             else:
                                 parent_text_key = None
+                        else:
+                            check_parent_hit += 1
                         if parent_text_key is not None:
                             candidate_parents.append(
                                 text_key_cache[parent_text_key])
+                    # pfi_graph = graph.Graph(graph.DictParentsProvider(
+                    #     per_file_indexes[file_id]))
                     parent_heads = text_graph.heads(candidate_parents)
+                    # parent_heads = pfi_graph.heads(candidate_parents)
+                    # alt_parent_heads = text_graph.heads(candidate_parents)
+                    # assert parent_heads == alt_parent_heads
                     new_parents = list(parent_heads)
                     new_parents.sort(key=lambda x:candidate_parents.index(x))
                     if new_parents == []:
-                        new_parents = [NULL_REVISION]
+                        new_parents = (NULL_REVISION,)
+                    # else:
+                    #     new_parents = tuple([(file_id, p_id) for p_id
+                    #                          in new_parents])
+                    per_file_indexes[file_id][text_key] = new_parents
                     text_index[text_key] = new_parents
 
         for text_key in invalid_keys: