Rev 29: Tremendously faster, but gives different results wrt ghosts. in http://bzr.arbash-meinel.com/plugins/history_db

Fri Apr 2 23:44:32 BST 2010

At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 29
revision-id: john at arbash-meinel.com-20100402224358-yk2z3ukcwo504v66
parent: john at arbash-meinel.com-20100402222713-05ufbktsvladp3lm
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-02 17:43:58 -0500
message:
  Tremendously faster, but gives different results wrt ghosts.
  
  Specifically, the merge_sort result omits ghosts, so they aren't in the dotted_revno
  table. However the *time* spent is great. Down to 45ms to get the entire
  ancestry for an arbitrary branch. (vs 161ms using the mainline-then-merged
  method, and 197ms for the walk-db-parent table method.)
-------------- next part --------------
=== modified file '__init__.py'

--- a/__init__.py	2010-04-02 22:27:13 +0000
+++ b/__init__.py	2010-04-02 22:43:58 +0000
@@ -156,6 +156,7 @@
 _ancestry_walk_types.register('db-rev-id', None)
 _ancestry_walk_types.register('db-db-id', None)
 _ancestry_walk_types.register('db-range', None)
+_ancestry_walk_types.register('db-dotted', None)
 _ancestry_walk_types.register('bzr-iter-anc', None)
 _ancestry_walk_types.register('bzr-kg', None)
 
@@ -182,21 +183,23 @@
         if method.startswith('db'):
             query = history_db.Querier(db, b)
             if method == 'db-db-id':
-                count = query.walk_ancestry_db_ids()
+                ancestors = query.walk_ancestry_db_ids()
             elif method == 'db-rev-id':
-                count = query.walk_ancestry()
+                ancestors = query.walk_ancestry()
+            elif method == 'db-dotted':
+                ancestors = query.walk_ancestry_range_and_dotted()
             else:
                 assert method == 'db-range'
-                count = query.walk_ancestry_range()
+                ancestors = query.walk_ancestry_range()
             trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
         elif method == 'bzr-iter-anc':
             g = b.repository.get_graph()
-            count = len(list(g.iter_ancestry([b.last_revision()])))
+            ancestors = list(g.iter_ancestry([b.last_revision()]))
         elif method == 'bzr-kg':
             kg = b.repository.revisions.get_known_graph_ancestry(
                 [(b.last_revision(),)])
-            count = len(kg._nodes)
-        self.outf.write('Found %d ancestors\n' % (count,))
+            ancestors = len(kg._nodes)
+        self.outf.write('Found %d ancestors\n' % (len(ancestors),))
         trace.note('Time: %.3fs' % (time.time() - t,))
 
 commands.register_command(cmd_create_history_db)

=== modified file 'history_db.py'
--- a/history_db.py	2010-04-02 22:27:13 +0000
+++ b/history_db.py	2010-04-02 22:43:58 +0000
@@ -414,7 +414,7 @@
             next_parents = [p[0] for p in parents if p[0] not in all]
             all.update(next_parents)
             remaining.extend(next_parents)
-        return len(all)
+        return all
 
     def walk_ancestry_db_ids(self):
         _exec = self._cursor.execute
@@ -424,14 +424,14 @@
         remaining = [db_id]
         while remaining:
             self._stats['num_steps'] += 1
-            next = remaining[:1000]
+            next = remaining[:100]
             remaining = remaining[len(next):]
             res = _exec("SELECT parent FROM parent WHERE child in (%s)"
                         % (', '.join('?'*len(next))), tuple(next))
             next_p = [p[0] for p in res if p[0] not in all_ancestors]
             all_ancestors.update(next_p)
             remaining.extend(next_p)
-        return len(all_ancestors)
+        return all_ancestors
 
     def walk_ancestry_range(self):
         """Walk the whole ancestry.
@@ -439,20 +439,68 @@
         Use the mainline_parent_range/mainline_parent table to speed things up.
         """
         _exec = self._cursor.execute
-        all_ancestors = set()
-        db_id = self._get_db_id(self._branch_tip_rev_id)
-        all_ancestors.add(db_id)
-        remaining = [db_id]
+        # All we are doing is pre-seeding the search with all the mainline
+        # revisions, we could probably do more with interleaving calls to
+        # mainline with calls to parents but this is easier to write :)
+        all_mainline = self.walk_mainline_using_ranges()
+        t = time.time()
+        all_ancestors = set(all_mainline)
+        remaining = list(all_mainline)
         while remaining:
             self._stats['num_steps'] += 1
-            next = remaining[:1000]
+            next = remaining[:100]
             remaining = remaining[len(next):]
             res = _exec("SELECT parent FROM parent WHERE child in (%s)"
                         % (', '.join('?'*len(next))), tuple(next))
             next_p = [p[0] for p in res if p[0] not in all_ancestors]
             all_ancestors.update(next_p)
             remaining.extend(next_p)
-        return len(all_ancestors)
+        self._stats['query_time'] += (time.time() - t)
+        # Using this shortcut to grab the mainline first helps, but not a lot.
+        # Probably because the limiting factor is the 'child in (...)' step,
+        # which is 100 entries or so. (note that setting the range to :1000
+        # shows a failure, which indicates the old code path was definitely
+        # capped at a maximum range.)
+        # 1.719s walk_ancestry       
+        # 0.198s walk_ancestry_db_ids
+        # 0.164s walk_mainline_using_ranges
+        return all_ancestors
+
+    def walk_ancestry_range_and_dotted(self):
+        """Walk the whole ancestry.
+
+        Use the information from the dotted_revno table and the mainline_parent
+        table to speed things up.
+        """
+        db_id = self._get_db_id(self._branch_tip_rev_id)
+        all_ancestors = set()
+        t = time.time()
+        while db_id is not None:
+            self._stats['num_steps'] += 1
+            range_res = self._cursor.execute(
+                "SELECT pkey, tail"
+                "  FROM mainline_parent_range"
+                " WHERE head = ?"
+                " ORDER BY count DESC LIMIT 1",
+                (db_id,)).fetchone()
+            if range_res is None:
+                next_db_id = self._get_lh_parent_db_id(db_id)
+                merged_revs = self._cursor.execute(
+                    "SELECT merged_revision FROM dotted_revno"
+                    " WHERE tip_revision = ?",
+                    (db_id,)).fetchall()
+                all_ancestors.update([r[0] for r in merged_revs])
+            else:
+                pkey, next_db_id = range_res
+                merged_revs = self._cursor.execute(
+                    "SELECT merged_revision FROM dotted_revno, mainline_parent"
+                    " WHERE tip_revision = mainline_parent.revision"
+                    "   AND mainline_parent.range = ?",
+                    (pkey,)).fetchall()
+                all_ancestors.update([r[0] for r in merged_revs])
+            db_id = next_db_id
+        self._stats['query_time'] += (time.time() - t)
+        return all_ancestors
 
     def heads(self, revision_ids):
         """Compute Graph.heads() on the given data."""