Rev 24: Some updates to the mainline_parent tables, and a new query that uses it. in http://bzr.arbash-meinel.com/plugins/history_db

John Arbash Meinel john at arbash-meinel.com
Fri Apr 2 22:53:48 BST 2010


At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 24
revision-id: john at arbash-meinel.com-20100402215313-a2k8n1nbpan8n8cs
parent: john at arbash-meinel.com-20100402211723-8c8s905eifd0olgj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-02 16:53:13 -0500
message:
  Some updates to the mainline_parent tables, and a new query that uses it.
  
  Turns out that partial denormalization makes a big difference.
  Specifically, the time to walk the bzr mainline drops from ~300ms down to
  120ms, using only 70 round trips (vs 5000 walking one-by-one).
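  
  A minimal illustration of where the round trips go, assuming an sqlite3
  cursor (cur) on the history db and a mainline revision's db_id already in
  hand; the table and column names are the ones the patch below uses:
  
    row = cur.execute(
        "SELECT pkey, tail FROM mainline_parent_range"
        " WHERE head = ? ORDER BY count DESC LIMIT 1",
        (db_id,)).fetchone()
    if row is not None:
        range_key, tail_db_id = row
        # One more query returns the whole run of mainline revisions in
        # order, instead of one SELECT per left-hand parent.
        db_ids = [r[0] for r in cur.execute(
            "SELECT revision FROM mainline_parent"
            " WHERE range = ? ORDER BY dist ASC",
            (range_key,)).fetchall()]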
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py	2010-04-02 20:17:33 +0000
+++ b/__init__.py	2010-04-02 21:53:13 +0000
@@ -97,6 +97,12 @@
         trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
 
 
+_mainline_walk_types = registry.Registry()
+_mainline_walk_types.register('db-rev-id', None)
+_mainline_walk_types.register('db-db-id', None)
+_mainline_walk_types.register('db-range', None)
+_mainline_walk_types.register('bzr', None)
+
 class cmd_walk_mainline(commands.Command):
     """Walk the mainline of the branch."""
 
@@ -104,37 +110,43 @@
                         help='Use this as the database for storage'),
                      option.Option('directory', type=unicode, short_name='d',
                         help='Import this location instead of "."'),
-                     option.Option('use-db-ids',
-                        help='Do the queries using database ids'),
-                     option.Option('in-bzr', help="Use the bzr graph."),
+                     option.RegistryOption('method',
+                        help='How do you want to do the walking.',
+                        converter=str, registry=_mainline_walk_types)
                     ]
 
-    def run(self, directory='.', db=None, in_bzr=False, use_db_ids=False):
+    def run(self, directory='.', db=None, method='bzr'):
         from bzrlib.plugins.history_db import history_db
         from bzrlib import branch
+        import pprint
+        import time
+        t = time.time()
         b = branch.Branch.open(directory)
         b.lock_read()
         try:
-            if in_bzr:
-                import time
-                t = time.time()
-                b.revision_history()
-                trace.note('Time: %.3fs' % (time.time() - t,))
-            else:
+            if method.startswith('db'):
                 query = history_db.Querier(db, b)
-                if use_db_ids:
-                    query.walk_mainline_db_ids()
+                if method == 'db-db-id':
+                    mainline = query.walk_mainline_db_ids()
+                elif method == 'db-rev-id':
+                    mainline = query.walk_mainline()
                 else:
-                    query.walk_mainline()
-                import pprint
+                    assert method == 'db-range'
+                    mainline = query.walk_mainline_using_ranges()
                 trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
+            else:
+                assert method == 'bzr'
+                mainline = b.revision_history()
         finally:
             b.unlock()
+        self.outf.write('Found %d revs\n' % (len(mainline),))
+        trace.note('Time: %.3fs' % (time.time() - t,))
         # Time to walk bzr mainline
-        #  bzr 31packs  683ms
-        #  bzr 1pack    320ms
-        #  db rev_ids   295ms
-        #  db db_ids    236ms
+        #  bzr 13packs  646ms
+        #  bzr 1pack    406ms
+        #  db rev_ids   381ms
+        #  db db_ids    331ms
+        #  db range     118ms
 
 
 _ancestry_walk_types = registry.Registry()

=== modified file 'history_db.py'
--- a/history_db.py	2010-04-02 21:17:23 +0000
+++ b/history_db.py	2010-04-02 21:53:13 +0000
@@ -183,9 +183,17 @@
         # Note that inserting head and tail into mainline_parent is redundant,
         # since the data is available. But I'm sure it will make the *queries*
         # much easier.
+        # TODO: So far the queries actually seem worse being inclusive, because
+        #       you have to worry about the overlap. Needs a bit more time to
+        #       figure out whether making head or tail exclusive is better. ATM
+        #       I'm thinking tail, but I haven't figured out how to handle that
+        #       first revision. One option is to allow tail to be NULL,
+        #       indicating the end of the whole range. Or to give it something
+        #       like 'null:'...
         self._cursor.executemany(
-            "INSERT INTO mainline_parent (range, revision)"
-            " VALUES (?, ?)", [(range_key, d) for d in range_db_ids])
+            "INSERT INTO mainline_parent (range, revision, dist)"
+            " VALUES (?, ?, ?)",
+            [(range_key, d, idx) for idx, d in enumerate(range_db_ids)])
 
     def build_mainline_cache(self):
         """Given the current branch, cache mainline information."""
@@ -249,6 +257,11 @@
         self._branch_tip_rev_id = a_branch.last_revision()
         self._stats = defaultdict(lambda: 0)
 
+    def _get_db_id(self, revision_id):
+        return self._cursor.execute('SELECT db_id FROM revision'
+                                    ' WHERE revision_id = ?',
+                                    (revision_id,)).fetchone()[0]
+
     def _get_lh_parent_rev_id(self, revision_id):
         parent_res = self._cursor.execute("""
             SELECT p.revision_id
@@ -347,20 +360,50 @@
             all_ids.append(cur_id)
             cur_id = self._get_lh_parent_rev_id(cur_id)
         self._stats['query_time'] += (time.time() - t)
-        return
+        return all_ids
 
     def walk_mainline_db_ids(self):
         """Walk the db, and grab all the mainline identifiers."""
         t = time.time()
-        db_id = self._cursor.execute('SELECT db_id FROM revision'
-                                     ' WHERE revision_id = ?',
-                                     (self._branch_tip_rev_id,)).fetchone()[0]
+        db_id = self._get_db_id(self._branch_tip_rev_id)
         all_ids = []
         while db_id is not None:
             all_ids.append(db_id)
             db_id = self._get_lh_parent_db_id(db_id)
         self._stats['query_time'] += (time.time() - t)
-        return
+        return all_ids
+
+    def walk_mainline_using_ranges(self):
+        t = time.time()
+        db_id = self._get_db_id(self._branch_tip_rev_id)
+        all_ids = []
+        while db_id is not None:
+            self._stats['num_steps'] += 1
+            all_ids.append(db_id)
+            range_res = self._cursor.execute(
+                "SELECT pkey, tail"
+                "  FROM mainline_parent_range"
+                " WHERE head = ?"
+                " ORDER BY count DESC LIMIT 1",
+                (db_id,)).fetchone()
+            if range_res is None:
+                # No range, so switch to using by-parent search
+                db_id = self._get_lh_parent_db_id(db_id)
+            else:
+                # We have a range, so read in the whole range. db_id is
+                # already in all_ids, and tail_db_id starts the next iteration.
+                range_key, tail_db_id = range_res
+                range_db_ids = self._cursor.execute(
+                    "SELECT revision FROM mainline_parent"
+                    " WHERE range = ? ORDER BY dist ASC",
+                    (range_key,)).fetchall()
+                db_ids = [r[0] for r in range_db_ids]
+                assert db_ids[0] == db_id
+                assert db_ids[-1] == tail_db_id
+                all_ids.extend(db_ids[1:-1])
+                db_id = tail_db_id
+        self._stats['query_time'] += (time.time() - t)
+        return all_ids
 
     def walk_ancestry(self):
         """Walk all parents of the given revision."""
@@ -384,8 +427,7 @@
     def walk_ancestry_db_ids(self):
         _exec = self._cursor.execute
         all_ancestors = set()
-        db_id = _exec("SELECT db_id FROM revision WHERE revision_id = ?",
-                      (self._branch_tip_rev_id,)).fetchone()[0]
+        db_id = self._get_db_id(self._branch_tip_rev_id)
         all_ancestors.add(db_id)
         remaining = [db_id]
         while remaining:

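The TODO in history_db.py above floats making the stored range exclusive of
its tail (with a NULL tail marking the end of history). A rough sketch of how
the walk might look under that convention, not what this patch implements;
cur, get_lh_parent_db_id and the tip's db_id are assumed from the Querier
code:

    all_ids = []
    db_id = tip_db_id
    while db_id is not None:
        row = cur.execute(
            "SELECT pkey, tail FROM mainline_parent_range"
            " WHERE head = ? ORDER BY count DESC LIMIT 1",
            (db_id,)).fetchone()
        if row is None:
            # No cached range starts here; fall back to one parent per query.
            all_ids.append(db_id)
            db_id = get_lh_parent_db_id(db_id)
        else:
            range_key, tail_db_id = row
            rows = cur.execute(
                "SELECT revision FROM mainline_parent"
                " WHERE range = ? ORDER BY dist ASC",
                (range_key,)).fetchall()
            # The range covers head down to (but not including) tail, so no
            # [1:-1] slicing is needed and nothing is stored twice.
            all_ids.extend(r[0] for r in rows)
            db_id = tail_db_id  # NULL (None) here means end of the mainline
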
=== modified file 'history_denormalization.txt'
--- a/history_denormalization.txt	2010-04-02 20:17:33 +0000
+++ b/history_denormalization.txt	2010-04-02 21:53:13 +0000
@@ -95,7 +95,8 @@
     ON mainline_parent_range (head);
   CREATE TABLE mainline_parent (
     range INTEGER REFERENCES mainline_parent_range NOT NULL,
-    revision INTEGER REFERENCES revision NOT NULL
+    revision INTEGER REFERENCES revision NOT NULL,
+    dist INTEGER NOT NULL -- Offset from head, so we know the order
     -- Not adding the constraint at this time, but it is logically there
     -- CONSTRAINT mainline_parent_rev_unique UNIQUE (range, revision)
   );

=== modified file 'schema.py'
--- a/schema.py	2010-04-02 20:17:33 +0000
+++ b/schema.py	2010-04-02 21:53:13 +0000
@@ -92,7 +92,10 @@
 mainline_parent_t = """
 CREATE TABLE mainline_parent (
     range INTEGER REFERENCES mainline_parent_range NOT NULL,
-    revision INTEGER REFERENCES revision NOT NULL
+    revision INTEGER REFERENCES revision NOT NULL,
+    dist INTEGER NOT NULL -- Offset from head, so we preserve the order
+    -- Not adding the constraint at this time, but it is logically there
+    -- CONSTRAINT mainline_parent_rev_unique UNIQUE (range, revision)
 );
 """
 _create_statements.append(mainline_parent_t)


