Rev 30: Do similar work to optimize the dotted revno lookups. in http://bzr.arbash-meinel.com/plugins/history_db

John Arbash Meinel john at arbash-meinel.com
Sun Apr 4 05:37:00 BST 2010


At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 30
revision-id: john at arbash-meinel.com-20100404043654-hlxk78v319ldf5lc
parent: john at arbash-meinel.com-20100402224358-yk2z3ukcwo504v66
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Sat 2010-04-03 23:36:54 -0500
message:
  Do similar work to optimize the dotted revno lookups.
  1.487s  bzr
  0.587s  db-rev-id
  0.507s  db-db-id
  0.031s  db-range
  0.026s  db-range-multi
  
  It would be good to look at the multi-way query plan and how it makes it so fast.
  What indices is it using, what order, what joins, etc.
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py	2010-04-02 22:43:58 +0000
+++ b/__init__.py	2010-04-04 04:36:54 +0000
@@ -50,6 +50,13 @@
         trace.note('Stats:\n%s' % (pprint.pformat(dict(importer._stats)),))
 
 
+_dotted_revno_walk_types = registry.Registry()
+_dotted_revno_walk_types.register('db-rev-id', None)
+_dotted_revno_walk_types.register('db-db-id', None)
+_dotted_revno_walk_types.register('db-range', None)
+_dotted_revno_walk_types.register('db-range-multi', None)
+_dotted_revno_walk_types.register('bzr', None)
+
 class cmd_get_dotted_revno(commands.Command):
     """Query the db for a dotted revno.
     """
@@ -59,11 +66,13 @@
                      option.Option('directory', type=unicode, short_name='d',
                         help='Import this location instead of "."'),
                      'revision',
-                     option.Option('use-db-ids',
-                        help='Do the queries using database ids'),
+                     option.RegistryOption('method',
+                        help='How do you want to do the walking.',
+                        converter=str, registry=_dotted_revno_walk_types)
                     ]
 
-    def run(self, directory='.', db=None, revision=None, use_db_ids=False):
+    def run(self, directory='.', db=None, revision=None,
+            method=None):
         import pprint
         from bzrlib.plugins.history_db import history_db
         from bzrlib import branch
@@ -73,13 +82,35 @@
         b.lock_read()
         try:
             rev_ids = [rspec.as_revision_id(b) for rspec in revision]
-            query = history_db.Querier(db, b)
-            if use_db_ids:
-                revnos = [(query.get_dotted_revno_db_ids(rev_id), rev_id)
-                          for rev_id in rev_ids]
+            t = time.time()
+            if method == 'bzr':
+                # Make sure to use a different branch, because the existing one
+                # has already cached the mainline, etc in decoding the supplied
+                # revision ids.
+                b2 = b.bzrdir.open_branch()
+                b2.lock_read()
+                try:
+                    revnos = [(b2.revision_id_to_dotted_revno(r), r)
+                              for r in rev_ids]
+                finally:
+                    b2.unlock()
             else:
-                revnos = [(query.get_dotted_revno(rev_id), rev_id)
-                          for rev_id in rev_ids]
+                query = history_db.Querier(db, b)
+                if method == 'db-db-id':
+                    revnos = [(query.get_dotted_revno_db_ids(rev_id), rev_id)
+                              for rev_id in rev_ids]
+                elif method == 'db-rev-id':
+                    revnos = [(query.get_dotted_revno(rev_id), rev_id)
+                              for rev_id in rev_ids]
+                elif method == 'db-range':
+                    revnos = [(query.get_dotted_revno_range(rev_id), rev_id)
+                              for rev_id in rev_ids]
+                else:
+                    assert method == 'db-range-multi'
+                    revno_map = query.get_dotted_revno_range_multi(rev_ids)
+                    revnos = [(revno_map[rev_id], rev_id) for rev_id in rev_ids]
+                trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
+            tdelta = time.time() - t
             revno_strs = []
             max_len = 0
             for revno, rev_id in revnos:
@@ -94,7 +125,7 @@
                                      for s, r in revno_strs]))
         finally:
             b.unlock()
-        trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
+        trace.note('Time: %.3fs' % (tdelta,))
 
 
 _mainline_walk_types = registry.Registry()
@@ -198,7 +229,7 @@
         elif method == 'bzr-kg':
             kg = b.repository.revisions.get_known_graph_ancestry(
                 [(b.last_revision(),)])
-            ancestors = len(kg._nodes)
+            ancestors = kg._nodes
         self.outf.write('Found %d ancestors\n' % (len(ancestors),))
         trace.note('Time: %.3fs' % (time.time() - t,))
 

=== modified file 'history_db.py'
--- a/history_db.py	2010-04-02 22:43:58 +0000
+++ b/history_db.py	2010-04-04 04:36:54 +0000
@@ -333,6 +333,87 @@
         self._stats['query_time'] += (time.time() - t)
         return None
 
+    def get_dotted_revno_range(self, revision_id):
+        """Determine the dotted revno, using the range info, etc."""
+        t = time.time()
+        tip_db_id = self._get_db_id(self._branch_tip_rev_id)
+        rev_db_id = self._get_db_id(revision_id)
+        revno = None
+        while tip_db_id is not None:
+            self._stats['num_steps'] += 1
+            range_res = self._cursor.execute(
+                "SELECT pkey, tail"
+                "  FROM mainline_parent_range"
+                " WHERE head = ?"
+                " ORDER BY count DESC LIMIT 1",
+                (tip_db_id,)).fetchone()
+            if range_res is None:
+                revno_res = self._cursor.execute(
+                    "SELECT revno FROM dotted_revno"
+                    " WHERE tip_revision = ? AND merged_revision = ?",
+                    (tip_db_id, rev_db_id)).fetchone()
+                next_db_id = self._get_lh_parent_db_id(tip_db_id)
+            else:
+                pkey, next_db_id = range_res
+                revno_res = self._cursor.execute(
+                    "SELECT revno FROM dotted_revno, mainline_parent"
+                    " WHERE tip_revision = mainline_parent.revision"
+                    "   AND mainline_parent.range = ?"
+                    "   AND merged_revision = ?",
+                    (pkey, rev_db_id)).fetchone()
+            tip_db_id = next_db_id
+            if revno_res is not None:
+                revno = tuple(map(int, revno_res[0].split('.')))
+                break
+        self._stats['query_time'] += (time.time() - t)
+        return revno
+
+    def get_dotted_revno_range_multi(self, revision_ids):
+        """Determine the dotted revno, using the range info, etc."""
+        t = time.time()
+        rev_id_to_db_id = {}
+        need_ids = [self._branch_tip_rev_id]
+        need_ids.extend(revision_ids)
+        schema.ensure_revisions(self._cursor, need_ids,
+                                rev_id_to_db_id, graph=None)
+        tip_db_id = rev_id_to_db_id[self._branch_tip_rev_id]
+        db_id_to_rev_id = dict((d, r) for r, d in rev_id_to_db_id.iteritems())
+        db_ids = set([rev_id_to_db_id[r] for r in revision_ids])
+        revnos = {}
+        while tip_db_id is not None and db_ids:
+            self._stats['num_steps'] += 1
+            range_res = self._cursor.execute(
+                "SELECT pkey, tail"
+                "  FROM mainline_parent_range"
+                " WHERE head = ?"
+                " ORDER BY count DESC LIMIT 1",
+                (tip_db_id,)).fetchone()
+            if range_res is None:
+                revno_res = self._cursor.execute(
+                    "SELECT merged_revision, revno FROM dotted_revno"
+                    " WHERE tip_revision = ?"
+                    "   AND merged_revision IN (%s)"
+                    % (', '.join('?'*len(db_ids))),
+                    [tip_db_id] + list(db_ids)).fetchall()
+                next_db_id = self._get_lh_parent_db_id(tip_db_id)
+            else:
+                pkey, next_db_id = range_res
+                revno_res = self._cursor.execute(
+                    "SELECT merged_revision, revno"
+                    "  FROM dotted_revno, mainline_parent"
+                    " WHERE tip_revision = mainline_parent.revision"
+                    "   AND mainline_parent.range = ?"
+                    "   AND merged_revision IN (%s)"
+                    % (', '.join('?'*len(db_ids))),
+                    [pkey] + list(db_ids)).fetchall()
+            tip_db_id = next_db_id
+            for db_id, revno in revno_res:
+                db_ids.discard(db_id)
+                revnos[db_id_to_rev_id[db_id]] = tuple(map(int,
+                    revno.split('.')))
+        self._stats['query_time'] += (time.time() - t)
+        return revnos
+
     def walk_mainline(self):
         """Walk the db, and grab all the mainline identifiers."""
         t = time.time()
@@ -463,7 +544,7 @@
         # capped at a maximum range.)
         # 1.719s walk_ancestry       
         # 0.198s walk_ancestry_db_ids
-        # 0.164s walk_mainline_using_ranges
+        # 0.164s walk_ancestry_range
         return all_ancestors
 
     def walk_ancestry_range_and_dotted(self):



More information about the bazaar-commits mailing list