Rev 31: Support expanding all nodes in the db into their own dotted_revno info. in http://bzr.arbash-meinel.com/plugins/history_db

John Arbash Meinel john at arbash-meinel.com
Sun Apr 4 16:32:52 BST 2010


At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 31
revision-id: john at arbash-meinel.com-20100404153249-lqy4j3f4dcxwzfll
parent: john at arbash-meinel.com-20100404043654-hlxk78v319ldf5lc
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Sun 2010-04-04 10:32:49 -0500
message:
  Support expanding all nodes in the db into their own dotted_revno info.
  
  This is meant to represent the extreme case where every revision is its own branch.
  Note that a linear run of revisions still counts as only 1 branch. Even so, when
  importing bzr.dev we end up with 4,706 'mainlines' versus ~30,000 revisions.
  This is quite slow, and it expands the data a lot.
  Specific numbers for bzr.dev:
    Time to just import bzr.dev: 5.642s
    Time to import and expand: 3m22s
    Num expanded nodes: 4706
    Avg time per node: 0.043s
  
  I haven't profiled it, but my guess is that most of that ~40ms per node is
  'merge_sort' time. merge_sort is pretty fast, but obviously not that fast when
  you are doing ~5k of them.
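
  Something like this would confirm that guess (a sketch, untested; 'importer'
  here is an Importer wired up the same way as in __init__.py below):

    import cProfile, pstats

    # Profile one full expand-all pass, then restrict the report to
    # frames whose name matches 'merge_sort'.
    profiler = cProfile.Profile()
    profiler.enable()
    importer.do_import(expand_all=True)
    profiler.disable()
    pstats.Stats(profiler).sort_stats('cumulative').print_stats('merge_sort')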
  
   entries in dotted_revno for bzr.dev: 30,496
   entries in dotted_revno after expansion: 821,488
   avg extra dotted_revno entries per expand: 168
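
  Those counts come straight out of the cache db; a sketch of reproducing the
  average, assuming the sqlite file that was passed via --db:

    import sqlite3

    conn = sqlite3.connect('history.db')  # whatever path was given to --db
    total = conn.execute("SELECT count(*) FROM dotted_revno").fetchone()[0]
    # 'avg extra per expand' is just (after - before) / nodes_expanded:
    print (total - 30496) / 4706.0        # ~168 for bzr.dev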
  
  168 new dotted revnos per branch was a bit surprising to me. My guess is that it
  primarily stems from all the times when trunk was merged back into dev branches.
  (Consider all the times when you have to merge a lot of stuff back just to
  resolve NEWS conflicts.)
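
  A tiny illustration of where the duplicate entries come from, going through
  the pure-Python KnownGraph (a made-up 5-revision graph: trunk is A-B-E,
  feature C-D branches from A and merges back at E):

    from bzrlib import _known_graph_py

    parent_map = {
        ('A',): (),
        ('B',): (('A',),),
        ('C',): (('A',),),
        ('D',): (('C',),),
        ('E',): (('B',), ('D',)),
    }
    kg = _known_graph_py.KnownGraph(parent_map)
    # Seen from E's mainline, C and D are numbered 1.1.1 and 1.1.2 ...
    print [(n.key[0], n.revno) for n in kg.merge_sort(('E',))]
    # ... but with D expanded as its own tip they become 2 and 3, so the
    # same revisions get a second set of dotted_revno rows.
    print [(n.key[0], n.revno) for n in kg.merge_sort(('D',))]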
  
  I think it does validate the design, at least for bzr. Yes, 800k entries is a
  lot, but it represents almost 5k branches. Compare that with the idea of using
  a 'whole ancestry' table, which would be far closer to 5k*30k = 141M entries;
  that makes this a savings of roughly 175:1.
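
  Spelling that arithmetic out with the exact numbers:

    tips, revs, actual = 4706, 30496, 821488
    whole_ancestry = tips * revs           # every tip x every revision
    print whole_ancestry                   # 143,514,176
    print whole_ancestry / float(actual)   # ~174.7, call it 175:1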
  Now to try with stuff like mysql...
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py	2010-04-04 04:36:54 +0000
+++ b/__init__.py	2010-04-04 15:32:49 +0000
@@ -33,9 +33,10 @@
                         help='Use this as the database for storage'),
                      option.Option('directory', type=unicode, short_name='d',
                         help='Import this location instead of "."'),
+                     option.Option('expand-all', help='Expand all revnos'),
                     ]
 
-    def run(self, directory='.', db=None):
+    def run(self, directory='.', db=None, expand_all=False):
         import pprint
         from bzrlib.plugins.history_db import history_db
         from bzrlib import branch
@@ -43,7 +44,7 @@
         b.lock_read()
         try:
             importer = history_db.Importer(db, b)
-            importer.do_import()
+            importer.do_import(expand_all=expand_all)
             importer.build_mainline_cache()
         finally:
             b.unlock()

=== modified file 'history_db.py'
--- a/history_db.py	2010-04-04 04:36:54 +0000
+++ b/history_db.py	2010-04-04 15:32:49 +0000
@@ -61,17 +61,20 @@
         self._graph = repo.revisions.get_known_graph_ancestry(
             [self._branch_tip_key])
 
-    def _insert_nodes(self, tip_rev_id, nodes):
-        """Insert all of the nodes mentioned into the database."""
-        self._stats['_insert_node_calls'] += 1
-        self._ensure_revisions([n.key[0] for n in nodes])
+    def _is_imported(self, tip_rev_id):
         res = self._cursor.execute(
             "SELECT count(*) FROM dotted_revno JOIN revision"
             "    ON dotted_revno.tip_revision = revision.db_id"
             " WHERE revision_id = ?"
             "   AND tip_revision = merged_revision",
             (tip_rev_id,)).fetchone()
-        if res[0] > 0:
+        return res[0] > 0
+
+    def _insert_nodes(self, tip_rev_id, nodes):
+        """Insert all of the nodes mentioned into the database."""
+        self._stats['_insert_node_calls'] += 1
+        self._ensure_revisions([n.key[0] for n in nodes])
+        if self._is_imported(tip_rev_id):
             # Not importing anything because the data is already present
             return False
         self._stats['total_nodes_inserted'] += len(nodes)
@@ -103,14 +106,45 @@
                                  "  (child, parent, parent_idx)"
                                  "VALUES (?, ?, ?)", data)
 
-    def do_import(self):
-        merge_sorted = self._graph.merge_sort(self._branch_tip_key)
+    def do_import(self, expand_all=False):
+        merge_sorted = self._import_tip(self._branch_tip_rev_id)
+        if not expand_all:
+            return
+        self._stats['nodes_expanded'] += 0 # create an entry
+        # We want to expand every possible mainline into a dotted_revno cache.
+        # We don't really want to have to compute all the ones we have already
+        # cached. And we want to compute as much as possible per pass. So we
+        # start again at the tip, and just skip all the ones that already have
+        # db entries.
+        pb = ui.ui_factory.nested_progress_bar()
+        for idx, node in enumerate(merge_sorted):
+            pb.update('expanding', idx, len(merge_sorted))
+            # This progress is very non-linear; the first few nodes are
+            # expected to be slow, and the last few very fast.
+            tip_rev_id = node.key[0]
+            if self._is_imported(tip_rev_id):
+                # This node's info is already imported
+                continue
+            self._stats['nodes_expanded'] += 1
+            # Note: Suppressing the commit until we are finished saves a fair
+            #       amount of time. expanding all of bzr.dev goes from 4m37s
+            #       down to 3m21s.
+            self._import_tip(tip_rev_id, suppress_progress_and_commit=True)
+        self._db_conn.commit()
+        pb.finished()
+
+    def _import_tip(self, tip_revision_id, suppress_progress_and_commit=False):
+        merge_sorted = self._graph.merge_sort((tip_revision_id,))
         try:
-            pb = ui.ui_factory.nested_progress_bar()
+            if suppress_progress_and_commit:
+                pb = None
+            else:
+                pb = ui.ui_factory.nested_progress_bar()
             last_mainline_rev_id = None
             new_nodes = []
             for idx, node in enumerate(merge_sorted):
-                pb.update('importing', idx, len(merge_sorted))
+                if pb is not None:
+                    pb.update('importing', idx, len(merge_sorted))
                 if last_mainline_rev_id is None:
                     assert not new_nodes
                     assert node.merge_depth == 0, \
@@ -136,6 +170,8 @@
                     last_mainline_rev_id = node.key[0]
                     new_nodes = []
                 new_nodes.append(node)
+            if pb is not None:
+                pb.finished()
             if new_nodes:
                 assert last_mainline_rev_id is not None
                 self._insert_nodes(last_mainline_rev_id, new_nodes)
@@ -144,7 +180,9 @@
             self._db_conn.rollback()
             raise
         else:
-            self._db_conn.commit()
+            if not suppress_progress_and_commit:
+                self._db_conn.commit()
+        return merge_sorted
 
     def _check_range_exists_for(self, head_db_id):
         """Does the given head_db_id already have a range defined using it."""


