Rev 108: Simplify further. in http://bzr.arbash-meinel.com/branches/bzr/history_db/tip_numbering

Fri Apr 16 22:23:40 BST 2010

At http://bzr.arbash-meinel.com/branches/bzr/history_db/tip_numbering

------------------------------------------------------------
revno: 108
revision-id: john at arbash-meinel.com-20100416212316-niq43zadt5mziuwo
parent: john at arbash-meinel.com-20100416205756-1jlpjufjle2jt3hz
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: tip_numbering
timestamp: Fri 2010-04-16 16:23:16 -0500
message:
  Simplify further.
  
  We don't care about the details of revisions that are already merged, so
  only track the fact that they are merged, rather than also tracking their
  dotted-revno info (revno, end_of_merge, merge_depth).
  
  New data for bzr.dev:
  Stats:
  {'_insert_node_calls': 30621,
   'gdfo hit': 3869,
   'gdfo miss': 811978,
   'is interesting': 804941,
   'nodes_expanded': 4734,
   'not interesting already merged': 32713,
   'not interesting is mainline': 5456,
   'not interesting new numbered': 228158,
   'not interesting step already merged': 726,
   'pushed': 835562,
   'ranges_inserted': 52,
   'revs_in_ranges': 5159,
   'step mainline': 830845,
   'step mainline added': 5002192,
   'step mainline unknown': 830845,
   'total_nodes_inserted': 835562}
  real    1m13.472s
  
  MySQL:
  Stats:
  {'_insert_node_calls': 68814,
   'gdfo hit': 1181,
   'gdfo miss': 615292,
   'is interesting': 592684,
   'nodes_expanded': 17059,
   'not interesting already merged': 73125,
   'not interesting is mainline': 4877,
   'not interesting new numbered': 244676,
   'not interesting step already merged': 1853,
   'pushed': 661498,
   'ranges_inserted': 28,
   'revs_in_ranges': 2778,
   'step mainline': 1950421,
   'step mainline added': 56149882,
   'step mainline unknown': 1950421,
   'total_nodes_inserted': 661498}
  real    2m23.279s
  
  Quite a bit faster when we don't load the dotted_revno content for
  already-merged revisions (this probably also reduces the size of the
  dicts, etc., reducing GC pressure?)
-------------- next part --------------
=== modified file 'history_db.py'

--- a/history_db.py	2010-04-16 20:57:56 +0000
+++ b/history_db.py	2010-04-16 21:23:16 +0000
@@ -571,15 +571,15 @@
         self._cursor = importer._cursor
         self._stats = importer._stats
 
-        # db_id => gdfo
-        self._known_gdfo = {}
         # db_ids that we know are ancestors of mainline_db_ids that are not
         # ancestors of pre_mainline_id
         self._interesting_ancestor_ids = set()
 
-        # Information from the dotted_revno table for revisions that are in the
-        # already-imported mainline.
-        self._imported_dotted_revno = {}
+        # db_ids that we know are already merged
+        self._already_merged = set()
+
+        # new dotted revnos that we generate
+        self._new_dotted_revno = {}
         # What dotted revnos have been loaded
         ## self._known_dotted = set()
         # This is the gdfo of the current mainline revision search tip. This is
@@ -654,29 +654,12 @@
         # TODO: We could probably use a gdfo hint to determine if we want to
         #       step-by-one, or step-by-many
         self._stats['step mainline'] += 1
-        if self._imported_mainline_id in self._importer._dotted_revno_cache:
-            self._stats['step mainline cached'] += 1
-            dotted_info = self._importer._dotted_revno_cache[
-                                self._imported_mainline_id]
-        else:
-            res = self._cursor.execute(
-                "SELECT merged_revision, revno, end_of_merge, merge_depth"
-                "  FROM dotted_revno WHERE tip_revision = ? ORDER BY dist",
-                [self._imported_mainline_id]).fetchall()
-            stuple = static_tuple.StaticTuple.from_sequence
-            st = static_tuple.StaticTuple
-            dotted_info = [st(r[0], st(stuple(map(int, r[1].split('.'))),
-                                       r[2], r[3]))
-                           for r in res]
-            self._stats['step mainline cache missed'] += 1
-            self._importer._dotted_revno_cache[self._imported_mainline_id] = \
-                dotted_info
-        self._stats['step mainline added'] += len(dotted_info)
-        self._update_info_from_dotted_revno(dotted_info)
-        # TODO: We could remove search tips that show up as newly merged
-        #       though that can wait until the next
-        #       _split_search_tips_by_gdfo
-        # new_merged_ids = [r[0] for r in res]
+        res = self._cursor.execute(
+            "SELECT merged_revision"
+            "  FROM dotted_revno WHERE tip_revision = ?",
+            [self._imported_mainline_id]).fetchall()
+        self._already_merged.update([r[0] for r in res])
+        self._stats['step mainline added'] += len(res)
         res = self._cursor.execute("SELECT parent, gdfo"
                                    "  FROM parent, revision"
                                    " WHERE parent = db_id"
@@ -685,44 +668,10 @@
                                    [self._imported_mainline_id]).fetchone()
         if res is None:
             # Walked off the mainline...
-            # TODO: Make sure this stuff is tested
             self._imported_mainline_id = None
             self._imported_gdfo = 0
         else:
             self._imported_mainline_id, self._imported_gdfo = res
-            self._known_gdfo[self._imported_mainline_id] = self._imported_gdfo
-
-    def _step_search_tips(self):
-        """Move the search tips to their parents."""
-        self._stats['step search tips'] += 1
-        res = _get_result_for_many(self._cursor,
-            "SELECT parent, gdfo FROM parent, revision"
-            " WHERE parent=db_id AND child IN (%s)",
-            list(self._search_tips))
-        # TODO: We could use this time to fill out _parent_map, rather than
-        #       waiting until _push_node and duplicating a request to the
-        #       parent table. It may be reasonable to wait on gdfo also...
-
-        # Filter out search tips that we've already searched via a different
-        # path. By construction, if we are stepping the search tips, we know
-        # that all previous search tips are either in
-        # self._imported_dotted_revno or in self._interesting_ancestor_ids.
-        # _imported_dotted_revno will be filtered in the first
-        # _split_search_tips_by_gdfo call, so we just filter out already
-        # interesting ones.
-        interesting = self._interesting_ancestor_ids
-        self._search_tips = set([r[0] for r in res if r[0] not in interesting])
-        # TODO: For search tips we will be removing, we don't need to join
-        #       against revision since we should already have them. There may
-        #       be other ways that we already know gdfo. It may be cheaper to
-        #       check first.
-        self._stats['num_search_tips'] += len(self._search_tips)
-        self._known_gdfo.update(res)
-
-    def _update_info_from_dotted_revno(self, dotted_info):
-        """Update info like 'child_seen' from the dotted_revno info."""
-        self._imported_dotted_revno.update(dotted_info)
-        ## self._known_dotted.update([i[1][0] for i in dotted_info])
 
     def _get_parents(self, db_id):
         if db_id in self._parent_map:
@@ -765,11 +714,11 @@
         if node.revno is None:
             last_dot = 1
             if (node._left_parent is not None
-                and node._left_parent in self._imported_dotted_revno):
+                and node._left_parent in self._new_dotted_revno):
                 # If we haven't loaded the left parent, then we know we
                 # won't be numbering from it, it is outside the
                 # 'interesting' ancestry
-                parent_revno = self._imported_dotted_revno[node._left_parent][0]
+                parent_revno = self._new_dotted_revno[node._left_parent][0]
                 if (parent_revno[0] == node._base_revno
                     and parent_revno[1] == node._branch_num):
                     last_dot = parent_revno[-1] + 1
@@ -793,30 +742,28 @@
             else:
                 end_of_merge = False
         node.end_of_merge = end_of_merge
-        self._imported_dotted_revno[node.key] = (
+        self._new_dotted_revno[node.key] = (
             node.revno, end_of_merge, node.merge_depth)
         ## self._known_dotted.add(node.revno)
         node._pending_parents = None
         self._scheduled_stack.append(node)
 
     def _get_gdfo(self, db_id):
-        if db_id in self._known_gdfo:
-            self._stats['gdfo hit'] += 1
-            return self._known_gdfo[db_id]
-        self._stats['gdfo miss'] += 1
         res = self._cursor.execute("SELECT gdfo"
                                    "  FROM revision WHERE db_id = ?",
                                    [db_id]).fetchone()
         assert res is not None
         gdfo = res[0]
-        self._known_gdfo[db_id] = gdfo
         return gdfo
 
     def _is_interesting(self, db_id):
         """We are considering pushing this db_id to be numbered. Do we want to?
         """
-        if db_id in self._imported_dotted_revno:
-            self._stats['not interesting known imported'] +=1
+        if db_id in self._already_merged:
+            self._stats['not interesting already merged'] +=1
+            return False
+        if db_id in self._new_dotted_revno:
+            self._stats['not interesting new numbered'] += 1
             return False
         gdfo = self._get_gdfo(db_id)
         # If this gdfo > the mainline gdfo, then we know it cannot have been
@@ -825,12 +772,12 @@
         # TODO: Track interesting_ancestor_ids and use the interesting_children
         #       trick
         while (gdfo < self._imported_gdfo
-                and db_id not in self._imported_dotted_revno):
+                and db_id not in self._already_merged):
             # We don't know if this is interesting or not. 
             self._stats['step mainline unknown'] += 1
             self._step_mainline()
-        if db_id in self._imported_dotted_revno:
-            self._stats['not interesting imported'] +=1
+        if db_id in self._already_merged:
+            self._stats['not interesting step already merged'] +=1
             return False
         if (gdfo == self._imported_gdfo
             and db_id == self._imported_mainline_id):