Rev 72: Initial results showed caching parent_map *slowed* things down. in http://bzr.arbash-meinel.com/plugins/history_db

John Arbash Meinel john@arbash-meinel.com
Fri Apr 9 18:53:06 BST 2010


At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 72
revision-id: john@arbash-meinel.com-20100409175248-u3fd9q70lwvyv7ns
parent: john@arbash-meinel.com-20100409170157-w9d7yz1x3iejmzwn
committer: John Arbash Meinel <john@arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-09 12:52:48 -0500
message:
  Initial results showed caching parent_map *slowed* things down.
  
  On the hunch that it was a 'gc' overhead issue, I switched to caching with
  StaticTuple, which gave a fair improvement (assuming the numbers are
  accurate):
    4m    no cache
    4m30s cache (plain tuples)
    3m42s cache (StaticTuple)
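
For context on the 'gc' hunch: CPython's cycle collector walks every tracked
container on each collection pass, so a cache holding millions of plain
tuples makes each pass more expensive. A minimal sketch of the difference,
assuming the compiled static_tuple extension is available (gc.is_tracked
needs Python 2.6 or newer):

    import gc

    from bzrlib import static_tuple

    # A freshly built plain tuple is tracked by the cycle collector,
    # so every cached parent entry adds to the cost of each gc pass.
    plain = (10, 20, 30)
    print gc.is_tracked(plain)    # True

    # StaticTuple is immutable and limited to simple members, so the
    # C implementation can skip cycle tracking entirely.
    st = static_tuple.StaticTuple.from_sequence([10, 20, 30])
    print gc.is_tracked(st)       # False with the compiled extension

(The pure-Python fallback subclasses tuple, so it stays tracked; the win
depends on the compiled extension.)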
-------------- next part --------------
=== modified file 'history_db.py'
--- a/history_db.py	2010-04-09 17:01:57 +0000
+++ b/history_db.py	2010-04-09 17:52:48 +0000
@@ -26,6 +26,7 @@
 
 from bzrlib import (
     revision,
+    static_tuple,
     trace,
     ui,
     )
@@ -93,6 +94,10 @@
         self._rev_id_to_db_id = {}
         self._db_id_to_rev_id = {}
         self._stats = defaultdict(lambda: 0)
+        # A cache of entries in the dotted_revno table
+        self._dotted_revno_cache = {}
+        # Map child_id => [parent_db_ids]
+        self._db_parent_map = {}
 
     def _ensure_schema(self):
         if not schema.is_initialized(self._db_conn, dbapi2.OperationalError):
@@ -159,9 +164,16 @@
         self._ensure_revisions(rev_ids)
         data = []
         r_to_d = self._rev_id_to_db_id
+        stuple = static_tuple.StaticTuple.from_sequence
         for rev_id, parent_ids in parent_map.iteritems():
-            for idx, parent_id in enumerate(parent_ids):
-                data.append((r_to_d[rev_id], r_to_d[parent_id], idx))
+            db_id = r_to_d[rev_id]
+            if db_id in self._db_parent_map:
+                # This has already been imported, skip it
+                continue
+            parent_db_ids = stuple([r_to_d[p_id] for p_id in parent_ids])
+            self._db_parent_map[db_id] = parent_db_ids
+            for idx, parent_db_id in enumerate(parent_db_ids):
+                data.append((db_id, parent_db_id, idx))
         self._cursor.executemany("INSERT OR IGNORE INTO parent"
                                  "  (child, parent, parent_idx)"
                                  "VALUES (?, ?, ?)", data)
@@ -545,7 +557,7 @@
         self._scheduled_stack = None
         self._seen_parents = None
         # Map from db_id => parent_ids
-        self._parent_map = {}
+        self._parent_map = self._importer._db_parent_map
 
         # We just populate all known ghosts here.
         # TODO: Ghosts are expected to be rare. If we find a case where probing
@@ -879,7 +891,8 @@
             parent_res = self._cursor.execute(
                         "SELECT parent FROM parent WHERE child = ?"
                         " ORDER BY parent_idx", (db_id,)).fetchall()
-            parent_ids = tuple([r[0] for r in parent_res])
+            parent_ids = static_tuple.StaticTuple.from_sequence(
+                [r[0] for r in parent_res])
             self._parent_map[db_id] = parent_ids
         return parent_ids
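
Isolated from the importer, the lookup in that last hunk is a plain
memoize-on-miss pattern. A sketch against the same parent table schema,
with get_parents as a hypothetical free-standing helper:

    from bzrlib import static_tuple

    def get_parents(cursor, parent_map, db_id):
        # Hypothetical standalone version of the cached lookup above:
        # return cached parents if present, otherwise read them from
        # the parent table and cache them as a StaticTuple.
        try:
            return parent_map[db_id]
        except KeyError:
            rows = cursor.execute(
                "SELECT parent FROM parent WHERE child = ?"
                " ORDER BY parent_idx", (db_id,)).fetchall()
            parents = static_tuple.StaticTuple.from_sequence(
                [r[0] for r in rows])
            parent_map[db_id] = parents
            return parents

Because _parent_map is pointed at the importer's _db_parent_map in the
earlier hunk, rows cached while inserting parents are reused here without
another SELECT.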
 


