Rev 101: Change how lookups for dotted revno<=>revision_id work. in http://bzr.arbash-meinel.com/branches/bzr/history_db/trunk

John Arbash Meinel john at arbash-meinel.com
Thu Apr 15 20:10:42 BST 2010


At http://bzr.arbash-meinel.com/branches/bzr/history_db/trunk

------------------------------------------------------------
revno: 101
revision-id: john at arbash-meinel.com-20100415191015-ttshezx11mr3w4fd
parent: john at arbash-meinel.com-20100415173653-ikarwcl4xou56wa4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Thu 2010-04-15 14:10:15 -0500
message:
  Change how lookups for dotted revno<=>revision_id work.
  
  Instead of just searching until you find an entry, find the range where
  the entry is present, and then load all revnos from that range.
  This has a very positive effect on the 'files' page, dropping it from 10+s
  down to 3.8s.
  It has a negative effect on 'bzr log' times, as we now page in more stuff
  than we use. (Mostly because iter_merge_sorted gets called if we actually
  need any of those merged revisions.)
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py	2010-04-15 17:11:43 +0000
+++ b/__init__.py	2010-04-15 19:10:15 +0000
@@ -445,15 +445,16 @@
     t1 = time.clock()
     revision_id_map = query.get_dotted_revno_range_multi([revision_id])
     t2 = time.clock()
-    trace.note('history_db rev=>dotted %s took %.3fs, %.3fs to init,'
-               ' %.3fs to query' % (revision_id_map.values(),
-                                    t2-t0, t1-t0, t2-t1))
     self._partial_revision_id_to_revno_cache.update(revision_id_map)
 
     if revision_id not in revision_id_map:
         trace.mutter('history_db failed to find a mapping for {%s},'
                      'falling back' % (revision_id,))
-        return _orig_do_rev_id_to_dotted(self, revno)
+        return _orig_do_rev_id_to_dotted(self, revision_id)
+    revno = revision_id_map[revision_id]
+    trace.note('history_db rev=>dotted %s added %s took %.3fs, %.3fs to init,'
+               ' %.3fs to query' % (revno, len(revision_id_map),
+                                    t2-t0, t1-t0, t2-t1))
     return revision_id_map[revision_id]
 
 

=== modified file 'history_db.py'
--- a/history_db.py	2010-04-15 17:36:53 +0000
+++ b/history_db.py	2010-04-15 19:10:15 +0000
@@ -1310,48 +1310,88 @@
         t = time.time()
         tip_db_id = self._get_db_id(self._branch_tip_rev_id)
         db_ids = set()
-        db_id_to_rev_id = {}
+        # TODO: We could probably do this cheaper, such as using
+        #       ensure_revisions, or some other N-way query
         for rev_id in revision_ids:
             db_id = self._get_db_id(rev_id)
             if db_id is None:
                 import pdb; pdb.set_trace()
             db_ids.add(db_id)
-            db_id_to_rev_id[db_id] = rev_id
         revnos = {}
         while tip_db_id is not None and db_ids:
-            self._stats['num_steps'] += 1
-            range_res = self._cursor.execute(
-                "SELECT pkey, tail"
-                "  FROM mainline_parent_range"
-                " WHERE head = ?"
-                " ORDER BY count DESC LIMIT 1",
-                (tip_db_id,)).fetchone()
-            if range_res is None:
-                revno_res = self._cursor.execute(_add_n_params(
-                    "SELECT merged_revision, revno FROM dotted_revno"
+            tip_db_id, next_db_id, range_key = self._find_next_range(tip_db_id,
+                list(db_ids), 'merged_revision')
+            if range_key is None:
+                revno_res = self._cursor.execute(
+                    "SELECT merged_revision, revision_id, revno"
+                    "  FROM dotted_revno, revision"
                     " WHERE tip_revision = ?"
-                    "   AND merged_revision IN (%s)",
-                    len(db_ids)), 
-                    [tip_db_id] + list(db_ids)).fetchall()
+                    "   AND merged_revision = db_id",
+                    [tip_db_id]).fetchall()
                 next_db_id = self._get_lh_parent_db_id(tip_db_id)
             else:
-                pkey, next_db_id = range_res
-                revno_res = self._cursor.execute(_add_n_params(
-                    "SELECT merged_revision, revno"
-                    "  FROM dotted_revno, mainline_parent"
+                revno_res = self._cursor.execute(
+                    "SELECT merged_revision, revision_id, revno"
+                    "  FROM dotted_revno, revision, mainline_parent"
                     " WHERE tip_revision = mainline_parent.revision"
-                    "   AND mainline_parent.range = ?"
-                    "   AND merged_revision IN (%s)",
-                    len(db_ids)), 
-                    [pkey] + list(db_ids)).fetchall()
+                    "   AND merged_revision = db_id"
+                    "   AND mainline_parent.range = ?",
+                    [range_key]).fetchall()
             tip_db_id = next_db_id
-            for db_id, revno in revno_res:
+            self._stats['dotted_revno_loaded'] += len(revno_res)
+            for db_id, rev_id, revno in revno_res:
                 db_ids.discard(db_id)
-                revnos[db_id_to_rev_id[db_id]] = tuple(map(int,
-                    revno.split('.')))
+                revnos[rev_id] = tuple(map(int, revno.split('.')))
         self._stats['query_time'] += (time.time() - t)
         return revnos
 
+    def _find_next_range(self, tip_db_id, look_for, query_field):
+        """Step back through mainline, finding any child entries.
+
+        You can search for revnos or revision_ids this way. For example,
+        look_for can be a list of revno_strs, and then query_field would be
+        "revno". Or look_for could be db_ids, and then query_field would
+        be "merged_revision".
+        
+        :param tip_db_id: The current search tip
+        :param look_for: A list of things we want to find
+        :param query_field: The field name that we are looking for 'look_for'
+        :return: (tip_db_id, next_db_id, range_key)
+        """
+        # This doesn't actually pull out the data, it just walks quickly until
+        # it finds something
+        while tip_db_id is not None:
+            self._stats['num_steps'] += 1
+            range_res = self._cursor.execute(
+                "SELECT pkey, tail"
+                "  FROM mainline_parent_range"
+                " WHERE head = ?"
+                " ORDER BY count DESC LIMIT 1",
+                (tip_db_id,)).fetchone()
+            if range_res is None:
+                range_key = None
+                search_res = self._cursor.execute(_add_n_params(
+                    "SELECT 1"
+                    "  FROM dotted_revno"
+                    " WHERE tip_revision = ?"
+                    " AND " + query_field + " IN (%s)"
+                    " LIMIT 1", len(look_for)),
+                    [tip_db_id] + look_for).fetchone()
+                next_db_id = self._get_lh_parent_db_id(tip_db_id)
+            else:
+                range_key, next_db_id = range_res
+                search_res = self._cursor.execute(_add_n_params(
+                    "SELECT 1"
+                    "  FROM dotted_revno, mainline_parent"
+                    " WHERE tip_revision = mainline_parent.revision"
+                    "   AND mainline_parent.range = ?"
+                    " AND " + query_field + " IN (%s)"
+                    " LIMIT 1", len(look_for)),
+                    [range_key] + look_for).fetchone()
+            if search_res is not None:
+                return tip_db_id, next_db_id, range_key
+            tip_db_id = next_db_id
+
     def get_revision_ids(self, revnos):
         """Map from a dotted-revno back into a revision_id."""
         t = time.time()
@@ -1361,33 +1401,28 @@
         revno_strs = set(['.'.join(map(str, revno)) for revno in revnos])
         revno_map = {}
         while tip_db_id is not None and revno_strs:
-            self._stats['num_steps'] += 1
-            range_res = self._cursor.execute(
-                "SELECT pkey, tail"
-                "  FROM mainline_parent_range"
-                " WHERE head = ?"
-                " ORDER BY count DESC LIMIT 1",
-                (tip_db_id,)).fetchone()
-            if range_res is None:
-                revision_res = self._cursor.execute(_add_n_params(
+            tip_db_id, next_db_id, range_key = self._find_next_range(tip_db_id,
+                list(revno_strs), 'revno')
+            # We found a tip/range that contains the entries we are interested
+            # in. Load-it-up
+            if range_key is None:
+                revision_res = self._cursor.execute(
                     "SELECT revision_id, revno"
                     "  FROM dotted_revno, revision"
                     " WHERE merged_revision = revision.db_id"
-                    "   tip_revision = ?"
-                    "   AND revno IN (%s)", len(revno_strs)),
-                    [tip_db_id] + list(revno_strs)).fetchall()
+                    "   tip_revision = ?",
+                    [tip_db_id]).fetchall()
                 next_db_id = self._get_lh_parent_db_id(tip_db_id)
             else:
-                pkey, next_db_id = range_res
-                revision_res = self._cursor.execute(_add_n_params(
+                revision_res = self._cursor.execute(
                     "SELECT revision_id, revno"
                     "  FROM dotted_revno, mainline_parent, revision"
                     " WHERE tip_revision = mainline_parent.revision"
                     "   AND merged_revision = revision.db_id"
-                    "   AND mainline_parent.range = ?"
-                    "   AND revno IN (%s)", len(revno_strs)),
-                    [pkey] + list(revno_strs)).fetchall()
+                    "   AND mainline_parent.range = ?",
+                    [range_key]).fetchall()
             tip_db_id = next_db_id
+            self._stats['get-rev-imported'] += len(revision_res)
             for revision_id, revno_str in revision_res:
                 dotted = tuple(map(int, revno_str.split('.')))
                 revno_strs.discard(revno_str)



More information about the bazaar-commits mailing list