Rev 53: A couple of fixes to the _find_interesting_ancestry code. in http://bzr.arbash-meinel.com/plugins/history_db

John Arbash Meinel john at arbash-meinel.com
Wed Apr 7 17:47:37 BST 2010


At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 53
revision-id: john at arbash-meinel.com-20100407164722-9obvhf939r5nfw58
parent: john at arbash-meinel.com-20100407150745-2t3a9o66annnsxdo
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Wed 2010-04-07 11:47:22 -0500
message:
  A couple of fixes to the _find_interesting_ancestry code.
  
  1) Always step the mainline first. We know we need to step it at least
  once, so we might as well do it first and get the extra filtering.
  2) When finished, we may have filtered out ancestors without having to
  load their parent info, but we still need that info in the end. So add
  a loop to bring in all LH parents' dotted_revno info.
  
  We still need to fix some of the other merge_sort issues, where it
  needs to bring in more mainline ancestry to avoid 'race' conditions,
  etc. But we are at least back in working order.
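  
  Fix (2) boils down to the loop below, a condensed sketch of the new
  _ensure_lh_parent_info helper in the diff that follows (same attribute
  names, with the early-exit checks folded together):
  
      def _ensure_lh_parent_info(self):
          # Every LH parent must either already be imported (so its
          # dotted_revno is known) or be scheduled to be numbered here.
          missing = set()
          for db_id in self._interesting_ancestor_ids:
              parents = self._get_parents(db_id)
              if not parents:        # root (or ghost-rooted) revision
                  continue
              lh_parent = parents[0]
              if (lh_parent in self._interesting_ancestor_ids
                      or lh_parent in self._imported_dotted_revno):
                  continue
              missing.add(lh_parent)
          # Walk the already-imported mainline until every missing LH
          # parent has its dotted_revno information loaded.
          while missing:
              self._step_mainline()
              missing -= set(self._imported_dotted_revno)
  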
-------------- next part --------------
=== modified file 'history_db.py'
--- a/history_db.py	2010-04-07 15:07:45 +0000
+++ b/history_db.py	2010-04-07 16:47:22 +0000
@@ -490,14 +490,11 @@
             "   AND child IN (%s)", len(self._mainline_db_ids)),
             self._mainline_db_ids).fetchall()
         self._search_tips = set([r[0] for r in res])
-        known_gdfo = self._known_gdfo
-        known_gdfo.update(res)
-        res = self._cursor.execute(
-            "SELECT gdfo FROM revision WHERE db_id = ?",
-            [self._imported_mainline_id]).fetchone()
-        imported_gdfo = res[0]
-        self._imported_gdfo = imported_gdfo
-        known_gdfo[self._imported_mainline_id] = imported_gdfo
+        self._known_gdfo.update(res)
+        # We know that we will eventually need at least 1 step of the mainline
+        # (it gives us the basis for numbering everything). We do it now,
+        # because it increases the 'cheap' filtering we can do right away.
+        self._step_mainline()
 
     def _is_imported_db_id(self, tip_db_id):
         res = self._cursor.execute(
@@ -642,6 +639,10 @@
             "SELECT parent, gdfo FROM parent, revision"
             " WHERE parent=db_id AND child IN (%s)",
             len(self._search_tips)), list(self._search_tips)).fetchall()
+        # TODO: We could use this time to fill out _parent_map, rather than
+        #       waiting until _push_node and duplicating a request to the
+        #       parent table. It may be reasonable to wait on gdfo also...
+
         # Filter out search tips that we've already searched via a different
         # path. By construction, if we are stepping the search tips, we know
         # that all previous search tips are either in
@@ -657,9 +658,33 @@
         #       check first.
         self._known_gdfo.update(res)
 
+    def _ensure_lh_parent_info(self):
+        """LH parents of interesting_ancestor_ids is either present or pending.
+
+        Either the data should be in _imported_dotted_revno, or the lh parent
+        should be in interesting_ancestor_ids (meaning we will number it).
+        """
+        pmap = self._parent_map
+        missing_parent_ids = set()
+        for db_id in self._interesting_ancestor_ids:
+            parent_ids = self._get_parents(db_id)
+            if not parent_ids: # no parents, nothing to add
+                continue
+            lh_parent = parent_ids[0]
+            if lh_parent in self._interesting_ancestor_ids:
+                continue
+            if lh_parent in self._imported_dotted_revno:
+                continue
+            missing_parent_ids.add(lh_parent)
+        while missing_parent_ids:
+            self._step_mainline()
+            missing_parent_ids = missing_parent_ids.difference(
+                                    self._imported_dotted_revno)
+
     def _find_interesting_ancestry(self):
         self._find_needed_mainline()
         self._get_initial_search_tips()
+        self._step_mainline()
         while self._search_tips:
             # We don't know whether these search tips are known interesting, or
             # known uninteresting
@@ -685,10 +710,10 @@
             # All search_tips are known to either be interesting or
             # uninteresting. Walk any search tips that remain.
             self._step_search_tips()
-        # Once we get to here, we should have loaded enough of
-        # _imported_dotted_revno to be able to create the merge_sort graph.
-        # Also all of the new pending revisions should be in
-        # self._interesting_ancestor_ids
+        # We're now sure we have all of the now-interesting revisions. To
+        # number them, we need their left-hand parents to be in
+        # _imported_dotted_revno
+        self._ensure_lh_parent_info()
 
     def _update_info_from_dotted_revno(self):
         """Update info like 'child_seen' from the dotted_revno info."""
@@ -737,14 +762,20 @@
         # If we got this far, it doesn't appear to have been seen.
         return True
 
-    # XXX: For now, we just re-implement some of the merge_sort logic locally
+    def _get_parents(self, db_id):
+        if db_id in self._parent_map:
+            parent_ids = self._parent_map[db_id]
+        else:
+            parent_res = self._cursor.execute(
+                        "SELECT parent FROM parent WHERE child = ?"
+                        " ORDER BY parent_idx", (db_id,)).fetchall()
+            parent_ids = tuple([r[0] for r in parent_res])
+            self._parent_map[db_id] = parent_ids
+        return parent_ids
+
     def _push_node(self, db_id, merge_depth):
         # TODO: Check if db_id is a ghost (not allowed on the stack)
-        parent_res = self._cursor.execute(
-                    "SELECT parent FROM parent WHERE child = ?"
-                    " ORDER BY parent_idx", (db_id,)).fetchall()
-        parent_ids = tuple([r[0] for r in parent_res])
-        self._parent_map[db_id] = parent_ids
+        parent_ids = self._get_parents(db_id)
         if len(parent_ids) <= 0:
             left_parent = None
             # We are dealing with a 'new root' possibly because of a ghost,
@@ -755,7 +786,7 @@
         else:
             left_parent = parent_ids[0]
             is_first = self._is_first_child(left_parent)
-        pending_parents = parent_ids[1:]
+        pending_parents = tuple(parent_ids[1:])
         # v- logically probably better as a tuple or object. We currently
         # modify it in place, so we use a list
         self._depth_first_stack.append([db_id, merge_depth, left_parent,
@@ -786,6 +817,7 @@
             else:
                 # we need a new branch number. To get this correct, we have to
                 # make sure that the beginning of this branch has been loaded
+                ## XXX:
                 ## branch_root = parent_revno[:2] + (1,)
                 ## while branch_root not in self._dotted_to_db_id:
                 ##     self._step_mainline()
@@ -797,6 +829,7 @@
         else:
             # New Root. To get the numbering correct, we have to walk the
             # mainline back to the beginning... :(
+            ## XXX:
             ## while self._imported_mainline_id is not None:
             ##     self._step_mainline()
             branch_count = self._revno_to_branch_count.get(0, -1) + 1

=== modified file 'test_importer.py'
--- a/test_importer.py	2010-04-07 15:07:45 +0000
+++ b/test_importer.py	2010-04-07 16:47:22 +0000
@@ -232,90 +232,97 @@
 
     def test__incremental_importer_step_by_step(self):
         b = self.make_interesting_branch()
-        b._tip_revision = 'D' # Something older
+        b._tip_revision = 'G' # Something older
         importer = history_db.Importer(':memory:', b, incremental=False)
         importer.do_import()
         self.assertEqual(1, importer._cursor.execute(
             "SELECT count(*) FROM dotted_revno, revision"
             " WHERE tip_revision = merged_revision"
             "   AND tip_revision = db_id"
-            "   AND revision_id = ?", ('D',)).fetchone()[0])
+            "   AND revision_id = ?", ('G',)).fetchone()[0])
         # Now work on just importing G
-        importer._update_ancestry('G')
+        importer._update_ancestry('N')
         self.grab_interesting_ids(importer._rev_id_to_db_id)
-        inc_importer = history_db._IncrementalImporter(importer, self.G_id)
+        inc_importer = history_db._IncrementalImporter(importer, self.N_id)
         inc_importer._find_needed_mainline()
-        self.assertEqual([self.G_id], inc_importer._mainline_db_ids)
-        self.assertEqual(self.D_id, inc_importer._imported_mainline_id)
-        self.assertEqual(set([self.G_id]), inc_importer._interesting_ancestor_ids)
+        self.assertEqual([self.N_id, self.I_id], inc_importer._mainline_db_ids)
+        self.assertEqual(self.G_id, inc_importer._imported_mainline_id)
+        self.assertEqual(set([self.N_id, self.I_id]),
+                         inc_importer._interesting_ancestor_ids)
         # This should populate ._search_tips, as well as gdfo information
         inc_importer._get_initial_search_tips()
-        self.assertEqual(set([self.F_id]), inc_importer._search_tips)
+        self.assertEqual(set([self.J_id, self.H_id]), inc_importer._search_tips)
+        # We should have taken a single step on the mainline
+        self.assertEqual(self.D_id, inc_importer._imported_mainline_id)
         self.assertEqual(4, inc_importer._imported_gdfo)
-        self.assertEqual(self.D_id, inc_importer._imported_mainline_id)
-        self.assertEqual({self.F_id: 4, self.D_id: 4}, inc_importer._known_gdfo)
-        # D_id has gdfo >= F_id, so we know it isn't merged. So
-        # _split_search_tips_by_gdfo should return nothing, and update
-        # _interesting_ancestor_ids
-        self.assertEqual([], inc_importer._split_search_tips_by_gdfo([self.F_id]))
-        self.assertEqual(set([self.G_id, self.F_id]),
-                         inc_importer._interesting_ancestor_ids)
-        # Since we know that all search tips are interesting, we walk them
-        inc_importer._step_search_tips()
-        self.assertEqual(set([self.E_id]), inc_importer._search_tips)
-        # Now when we go to _split_search_tips_by_gdfo, we aren't positive that
-        # E wasn't merged, it should tell us so.
-        self.assertEqual([self.E_id],
-                         inc_importer._split_search_tips_by_gdfo([self.E_id]))
-        # And it should not have updatde search tips or ancestor_ids
-        self.assertEqual(set([self.G_id, self.F_id]),
-                         inc_importer._interesting_ancestor_ids)
-        self.assertEqual(set([self.E_id]), inc_importer._search_tips)
-        # Checking children should notice that no children have gdfo < F, so E
-        # is auto-marked as interesting.
+        self.assertEqual({self.D_id: 4, self.J_id: 5, self.H_id: 3},
+                         inc_importer._known_gdfo)
+        # J has gdfo > D, so it is quickly removed. H does not, so it is
+        # left behind.
+        self.assertEqual([self.H_id],
+             inc_importer._split_search_tips_by_gdfo([self.J_id, self.H_id]))
+        self.assertEqual(set([self.N_id, self.I_id, self.J_id]),
+                         inc_importer._interesting_ancestor_ids)
+        # Checking children should notice that all known children are fine, so
+        # H now gets marked interesting
         self.assertEqual([],
-                         inc_importer._split_interesting_using_children([self.E_id]))
-        self.assertEqual(set([self.E_id, self.G_id, self.F_id]),
+                 inc_importer._split_interesting_using_children([self.H_id]))
+        self.assertEqual(set([self.N_id, self.I_id, self.J_id, self.H_id]),
                          inc_importer._interesting_ancestor_ids)
         # Since we know all search tips are interesting, step again.
         inc_importer._step_search_tips()
-        self.assertEqual(set([self.B_id]), inc_importer._search_tips)
-        self.assertEqual(set([self.E_id, self.G_id, self.F_id]),
+        self.assertEqual(set([self.B_id, self.F_id]), inc_importer._search_tips)
+        self.assertEqual(set([self.N_id, self.I_id, self.J_id, self.H_id]),
                          inc_importer._interesting_ancestor_ids)
-        # B is merged, so these two steps should not filter it out
-        self.assertEqual([self.B_id],
-                         inc_importer._split_search_tips_by_gdfo([self.B_id]))
-        self.assertEqual([self.B_id],
-                         inc_importer._split_interesting_using_children([self.B_id]))
-        # At this point, we have to step the mainline, to find out if we can
-        # filter out this search tip. After stepping, _imported_dotted_revno
-        # should be filled with the next mainline step
-        inc_importer._step_mainline()
+        # F is known-merged, so the first step filters it from the unknowns
+        # and removes it from the search tips.
+        # B is not known yet, and has gdfo < D (since it was merged in), so
+        # the gdfo step cannot resolve it.
+        # However, E is a child of B and is already known to be merged, so B
+        # gets filtered out in the child step and removed as a search tip.
+        self.assertEqual([self.B_id],
+             inc_importer._split_search_tips_by_gdfo([self.B_id, self.F_id]))
+        self.assertEqual(set([self.B_id]), inc_importer._search_tips)
+        self.assertEqual([],
+                     inc_importer._split_interesting_using_children([self.B_id]))
+        self.assertEqual(set([]), inc_importer._search_tips)
+        # At this point, we will stop searching. XXX: B's info has not been
+        # loaded yet...
+        self.assertEqual(self.D_id, inc_importer._imported_mainline_id)
+        self.assertEqual(4, inc_importer._imported_gdfo)
+        self.assertEqual({self.E_id: ((1,2,1), 1, 1),
+                          self.F_id: ((1,2,2), 0, 1),
+                          self.G_id: ((3,), 0, 0),
+                         }, inc_importer._imported_dotted_revno)
+        self.assertEqual({(1,2,1): self.E_id, (1,2,2): self.F_id,
+                          (3,): self.G_id,
+                         }, inc_importer._dotted_to_db_id)
+        # At this point, B_id isn't in _imported_dotted_revno, so we loop to
+        # ensure we have enough dotted_revno data
+        inc_importer._ensure_lh_parent_info()
         self.assertEqual(self.A_id, inc_importer._imported_mainline_id)
         self.assertEqual(1, inc_importer._imported_gdfo)
-        self.assertEqual({self.D_id: ((2,), 0, 0), self.C_id: ((1,1,2), 0, 1),
-                          self.B_id: ((1,1,1), 1, 1),
+        self.assertEqual({self.B_id: ((1,1,1), 1, 1),
+                          self.C_id: ((1,1,2), 0, 1),
+                          self.D_id: ((2,), 0, 0),
+                          self.E_id: ((1,2,1), 1, 1),
+                          self.F_id: ((1,2,2), 0, 1),
+                          self.G_id: ((3,), 0, 0),
                          }, inc_importer._imported_dotted_revno)
-        self.assertEqual({(2,): self.D_id, (1,1,2): self.C_id,
-                          (1,1,1): self.B_id}, inc_importer._dotted_to_db_id)
-        # Search tips is not yet changed
-        self.assertEqual(set([self.B_id]), inc_importer._search_tips)
-        # And now when we check gdfo again, it should remove B_id from the
-        # search_tips, because it sees it in _imported_dotted_revno
-        self.assertEqual([], inc_importer._split_search_tips_by_gdfo([self.B_id]))
-        self.assertEqual(set([]), inc_importer._search_tips)
         inc_importer._update_info_from_dotted_revno()
-        self.assertEqual({1: 1}, inc_importer._revno_to_branch_count)
-        self.assertEqual({0: 2, (1, 1): 2}, inc_importer._branch_to_child_count)
+        self.assertEqual({1: 2}, inc_importer._revno_to_branch_count)
+        self.assertEqual({0: 3, (1, 1): 2, (1, 2): 2},
+                         inc_importer._branch_to_child_count)
         inc_importer._compute_merge_sort()
-        self.assertEqual([(self.E_id, (1, 2, 1), True, 1),
-                          (self.F_id, (1, 2, 2), False, 1),
-                          (self.G_id, (3,), False, 0),
+        self.assertEqual([(self.H_id, (1, 3, 1), True, 1),
+                          (self.I_id, (4,), False, 0),
+                          (self.J_id, (1, 2, 3), True, 1),
+                          (self.N_id, (5,), False, 0),
                          ], inc_importer._scheduled_stack)
 
     def test__incremental_find_interesting_ancestry(self):
         b = self.make_interesting_branch()
-        b._tip_revision = 'D' # Something older
+        b._tip_revision = 'G' # Something older
         importer = history_db.Importer(':memory:', b, incremental=False)
         importer.do_import()
         importer._update_ancestry('O')
@@ -324,21 +331,26 @@
         # This should step through the ancestry, and load in the necessary
         # data. Check the final state
         inc_importer._find_interesting_ancestry()
-        self.assertEqual([self.O_id, self.N_id, self.I_id, self.G_id],
+        self.assertEqual([self.O_id, self.N_id, self.I_id],
                          inc_importer._mainline_db_ids)
         # We should stop loading A, I need to figure out why it gets loaded
         self.assertEqual(self.A_id, inc_importer._imported_mainline_id)
         self.assertEqual(1, inc_importer._imported_gdfo)
-        self.assertEqual(set([self.E_id, self.F_id, self.G_id, self.H_id,
-                              self.I_id, self.J_id, self.K_id, self.L_id,
-                              self.M_id, self.N_id, self.O_id]),
+        self.assertEqual(set([self.H_id, self.I_id, self.J_id, self.K_id,
+                              self.L_id, self.M_id, self.N_id, self.O_id]),
                          inc_importer._interesting_ancestor_ids)
         self.assertEqual(set([]), inc_importer._search_tips)
-        self.assertEqual({self.D_id: ((2,), 0, 0), self.C_id: ((1,1,2), 0, 1),
-                          self.B_id: ((1,1,1), 1, 1),
+        self.assertEqual({self.B_id: ((1,1,1), 1, 1),
+                          self.C_id: ((1,1,2), 0, 1),
+                          self.D_id: ((2,), 0, 0),
+                          self.E_id: ((1,2,1), 1, 1),
+                          self.F_id: ((1,2,2), 0, 1),
+                          self.G_id: ((3,), 0, 0),
                          }, inc_importer._imported_dotted_revno)
-        self.assertEqual({(2,): self.D_id, (1,1,2): self.C_id,
-                          (1,1,1): self.B_id}, inc_importer._dotted_to_db_id)
+        self.assertEqual({(1,1,1): self.B_id, (1,1,2): self.C_id,
+                          (2,): self.D_id, (1,2,1): self.E_id,
+                          (1,2,2): self.F_id, (3,): self.G_id,
+                         }, inc_importer._dotted_to_db_id)
 
     def test__incremental_step_skips_already_seen(self):
         # Simpler graph:
@@ -383,3 +395,24 @@
         # It should want to include D_id, but it should know that we've already
         # been there
         self.assertEqual(set([self.C_id, self.E_id]), inc_importer._search_tips)
+
+    def test__incremental_branch_counter_correct(self):
+        # The trick is that if we have already imported to N, then we will be
+        # hiding the correct branch counter for revno 1. We will see it as 2
+        # from the revisions we've loaded, but really it is 3 because of the H
+        # branch.
+        b = self.make_interesting_branch()
+        b._tip_revision = 'N'
+        importer = history_db.Importer(':memory:', b, incremental=False)
+        importer.do_import()
+        importer._update_ancestry('O')
+        self.grab_interesting_ids(importer._rev_id_to_db_id)
+        inc_importer = history_db._IncrementalImporter(importer, self.O_id)
+        inc_importer._find_interesting_ancestry()
+        inc_importer._update_info_from_dotted_revno()
+        inc_importer._compute_merge_sort()
+        self.assertEqual([(self.K_id, (1, 2, 4), True, 1),
+                          (self.L_id, (1, 4, 1), True, 2),
+                          (self.M_id, (1, 2, 5), False, 1),
+                          (self.O_id, (6,), False, 0),
+                         ], inc_importer._scheduled_stack)


