Rev 51: Basic implementation of merge_sort using limited information. in http://bzr.arbash-meinel.com/plugins/history_db

John Arbash Meinel john at arbash-meinel.com
Tue Apr 6 22:55:40 BST 2010


At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 51
revision-id: john at arbash-meinel.com-20100406215530-ellqwfhrk1n2qy55
parent: john at arbash-meinel.com-20100406200231-zbu5l4hhn1xdumi1
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Tue 2010-04-06 16:55:30 -0500
message:
  Basic implementation of merge_sort using limited information.
  
  There are still edge cases that I'm probably getting wrong. But at least I
  get the basic case right. \o/
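
For reference, the revno-numbering rules that _pop_node (in the patch below)
applies can be condensed into a small standalone sketch. This is illustrative
only and not part of the patch; child_revno and revno_to_branch_count are
hypothetical names. It assumes the usual merge_sort conventions: the first
child of a parent extends the parent's number, later children open a new
branch under the parent's base revno, and extra roots get 0-based numbers.

def child_revno(parent_revno, is_first_child, revno_to_branch_count):
    """Return the dotted revno for a child of ``parent_revno``.

    ``revno_to_branch_count`` maps a base revno to the highest branch
    number already allocated under it, and is updated in place.
    """
    if parent_revno is None:
        # A new root: the first one is the mainline (1,), later roots
        # become (0, N, 1).
        branch_count = revno_to_branch_count.get(0, -1) + 1
        revno_to_branch_count[0] = branch_count
        if branch_count == 0:
            return (1,)
        return (0, branch_count, 1)
    if is_first_child:
        if len(parent_revno) == 1:
            # Mainline parent, e.g. (2,) => (3,)
            return (parent_revno[0] + 1,)
        # Dotted parent, e.g. (1, 1, 1) => (1, 1, 2)
        return parent_revno[:2] + (parent_revno[2] + 1,)
    # Not the first child: open a new branch under the parent's base revno,
    # e.g. another child of (1,) becomes (1, 2, 1).
    base_revno = parent_revno[0]
    branch_count = revno_to_branch_count.get(base_revno, 0) + 1
    revno_to_branch_count[base_revno] = branch_count
    return (base_revno, branch_count, 1)

counts = {1: 1}  # branch 1.1.x already allocated under base revno 1
child_revno((2,), True, counts)       # => (3,)
child_revno((1, 1, 1), True, counts)  # => (1, 1, 2)
child_revno((1,), False, counts)      # => (1, 2, 1)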
-------------- next part --------------
=== modified file 'history_db.py'
--- a/history_db.py	2010-04-05 22:01:03 +0000
+++ b/history_db.py	2010-04-06 21:55:30 +0000
@@ -441,6 +441,8 @@
         # Information from the dotted_revno table for revisions that are in the
         # already-imported mainline.
         self._imported_dotted_revno = {}
+        # Map from dotted_revno (a tuple) => db_id
+        self._dotted_to_db_id = {}
         # This is the gdfo of the current mainline revision search tip. This is
         # the threshold such that 
         self._imported_gdfo = None
@@ -453,6 +455,12 @@
         # (revno, branch_num) => oldest seen child
         self._branch_to_child_count = {}
 
+        self._depth_first_stack = None
+        self._scheduled_stack = None
+        self._seen_parents = None
+        # Map from db_id => parent_ids
+        self._parent_map = {}
+
     def _find_needed_mainline(self):
         """Find mainline revisions that need to be filled out.
         
@@ -605,9 +613,10 @@
             "SELECT merged_revision, revno, end_of_merge, merge_depth"
             "  FROM dotted_revno WHERE tip_revision = ?",
             [self._imported_mainline_id]).fetchall()
-        self._imported_dotted_revno.update(
-            [(r[0], (tuple(map(int, r[1].split('.'))), r[2], r[3]))
-             for r in res])
+        dotted_info = [(r[0], (tuple(map(int, r[1].split('.'))), r[2], r[3]))
+                       for r in res]
+        self._imported_dotted_revno.update(dotted_info)
+        self._dotted_to_db_id.update([(i[1][0], i[0]) for i in dotted_info])
         # TODO: We could remove search tips that show up as newly merged
         #       though that can wait until the next
         #       _split_search_tips_by_gdfo
@@ -680,6 +689,8 @@
 
     def _update_info_from_dotted_revno(self):
         """Update info like 'child_seen' from the dotted_revno info."""
+        # TODO: We can move this iterator into a parameter, and have it
+        #       continuously updated from _step_mainline()
         iterator = self._imported_dotted_revno.iteritems()
         for db_id, (revno, eom, depth) in iterator:
             if len(revno) > 1: # dotted revno, make sure branch count is right
@@ -688,9 +699,159 @@
                     or revno[1] > self._revno_to_branch_count[base_revno]):
                     self._revno_to_branch_count[base_revno] = revno[1]
                 branch_key = revno[:2]
-                if (branch_key not in self._branch_to_child_count
-                    or revno[2] > self._branch_to_child_count[branch_key]):
-                    self._branch_to_child_count[branch_key] = revno[2]
+                mini_revno = revno[2]
+            else:
+                # *mainline* branch
+                branch_key = 0
+                mini_revno = revno[0]
+            if (branch_key not in self._branch_to_child_count
+                or mini_revno > self._branch_to_child_count[branch_key]):
+                self._branch_to_child_count[branch_key] = mini_revno
+
+    def _is_first_child(self, parent_id):
+        """Is this the first child seen for the given parent?"""
+        if parent_id in self._seen_parents:
+            return False
+        # We haven't seen this while walking, but perhaps the already merged
+        # stuff has.
+        self._seen_parents.add(parent_id)
+        if parent_id not in self._imported_dotted_revno:
+            # Haven't seen this parent merged before, so we can't have seen
+            # children of it
+            return True
+        revno = self._imported_dotted_revno[parent_id][0]
+        if len(revno) > 1:
+            branch_key = revno[:2]
+            mini_revno = revno[2]
+        else:
+            branch_key = 0
+            mini_revno = revno[0]
+        if self._branch_to_child_count.get(branch_key, 0) > mini_revno:
+            # This revision shows up in the graph, but another revision in this
+            # branch shows up later, so this revision must have already been
+            # seen
+            return False
+        # If we got this far, it doesn't appear to have been seen.
+        return True
+
+    # XXX: For now, we just re-implement some of the merge_sort logic locally
+    def _push_node(self, db_id, merge_depth):
+        # TODO: Check if db_id is a ghost (not allowed on the stack)
+        parent_res = self._cursor.execute(
+                    "SELECT parent FROM parent WHERE child = ?"
+                    " ORDER BY parent_idx", (db_id,)).fetchall()
+        parent_ids = tuple([r[0] for r in parent_res])
+        self._parent_map[db_id] = parent_ids
+        if len(parent_ids) <= 0:
+            left_parent = None
+            # We are dealing with a 'new root' possibly because of a ghost,
+            # possibly because of merging a new ancestry.
+            # KnownGraph.merge_sort just always says True here, so stick with
+            # that
+            is_first = True
+        else:
+            left_parent = parent_ids[0]
+            is_first = self._is_first_child(left_parent)
+        pending_parents = parent_ids[1:]
+        # The stack entry below would logically be better as a tuple or an
+        # object, but we currently modify it in place, so we use a list
+        self._depth_first_stack.append([db_id, merge_depth, left_parent,
+                                        left_parent, pending_parents,
+                                        is_first])
+
+    def _pop_node(self):
+        """Move the last node from the _depth_first_stack to _scheduled_stack.
+
+        This is the most left-hand node that we are able to find.
+        """
+        (db_id, merge_depth, left_parent_id, _, _,
+         is_first) = self._depth_first_stack.pop()
+        if left_parent_id is not None:
+            parent_revno = self._imported_dotted_revno[left_parent_id][0]
+            if is_first: # We simply number as parent + 1
+                if len(parent_revno) == 1:
+                    mini_revno = parent_revno[0] + 1
+                    revno = (mini_revno,)
+                    # TODO: Do we need to increment
+                    #       self._branch_to_child_count[0]
+                    #       I think we do, but it is currently only used by
+                    #       _is_first_child, which should already be setting
+                    #       the 'seen' counters...
+                    # self._branch_to_child_count[0] = revno / max(revno, ...)
+                else:
+                    revno = parent_revno[:2] + (parent_revno[2] + 1,)
+            else:
+                # we need a new branch number. To get this correct, we have to
+                # make sure that the beginning of this branch has been loaded
+                ## branch_root = parent_revno[:2] + (1,)
+                ## while branch_root not in self._dotted_to_db_id:
+                ##     self._step_mainline()
+                base_revno = parent_revno[0]
+                branch_count = (
+                    self._revno_to_branch_count.get(base_revno, 0) + 1)
+                self._revno_to_branch_count[base_revno] = branch_count
+                revno = (base_revno, branch_count, 1)
+        else:
+            # New Root. To get the numbering correct, we have to walk the
+            # mainline back to the beginning... :(
+            ## while self._imported_mainline_id is not None:
+            ##     self._step_mainline()
+            branch_count = self._revno_to_branch_count.get(0, -1) + 1
+            self._revno_to_branch_count[0] = branch_count
+            if branch_count == 0: # This is the mainline
+                revno = (1,)
+            else:
+                revno = (0, branch_count, 1)
+        # XXX: This isn't correct any more, we need to look at the parent,
+        #      what *would* have been scheduled if we weren't doing partial
+        #      scheduling. Then again, maybe it is correct for all but the
+        #      mainline (first entry on the stack). Because we always break
+        #      apart the dotted_revno cache based on what has gotten merged...
+        if not self._scheduled_stack:
+            end_of_merge = True
+        else:
+            prev_db_id, prev_revno, _, prev_depth = self._scheduled_stack[-1]
+            if prev_depth < merge_depth:
+                end_of_merge = True
+            elif (prev_depth == merge_depth
+                  and prev_db_id not in self._parent_map[db_id]):
+                # Next node is not a direct parent
+                end_of_merge = True
+            else:
+                end_of_merge = False
+        self._imported_dotted_revno[db_id] = (revno, end_of_merge, merge_depth)
+        self._scheduled_stack.append((db_id, revno, end_of_merge, merge_depth))
+
+    def _compute_merge_sort(self):
+        self._depth_first_stack = []
+        self._scheduled_stack = []
+        self._seen_parents = set()
+        self._push_node(self._mainline_db_ids[0], 0)
+
+        while self._depth_first_stack:
+            last = self._depth_first_stack[-1]
+            if last[3] is None and not last[4]:
+                # The parents have been processed, pop the node
+                self._pop_node()
+                continue
+            while last[3] is not None or last[4]:
+                if last[3] is not None:
+                    # Push on the left-hand-parent
+                    next_db_id = last[3]
+                    last[3] = None
+                else:
+                    pending_parents = last[4]
+                    next_db_id = pending_parents[-1]
+                    last[4] = pending_parents[:-1]
+                if next_db_id in self._imported_dotted_revno:
+                    continue
+                if next_db_id == last[2]: # Is this the left-hand parent?
+                    next_merge_depth = last[1]
+                else:
+                    next_merge_depth = last[1] + 1
+                self._push_node(next_db_id, next_merge_depth)
+                # And switch to the outer loop
+                break
 
     def do_import(self):
         self._find_interesting_ancestry()

=== modified file 'test_importer.py'
--- a/test_importer.py	2010-04-06 19:53:43 +0000
+++ b/test_importer.py	2010-04-06 21:55:30 +0000
@@ -299,6 +299,8 @@
         self.assertEqual({self.D_id: ((2,), 0, 0), self.C_id: ((1,1,2), 0, 1),
                           self.B_id: ((1,1,1), 1, 1),
                          }, inc_importer._imported_dotted_revno)
+        self.assertEqual({(2,): self.D_id, (1,1,2): self.C_id,
+                          (1,1,1): self.B_id}, inc_importer._dotted_to_db_id)
         # Search tips is not yet changed
         self.assertEqual(set([self.B_id]), inc_importer._search_tips)
         # And now when we check gdfo again, it should remove B_id from the
@@ -307,7 +309,12 @@
         self.assertEqual(set([]), inc_importer._search_tips)
         inc_importer._update_info_from_dotted_revno()
         self.assertEqual({1: 1}, inc_importer._revno_to_branch_count)
-        self.assertEqual({(1, 1): 2}, inc_importer._branch_to_child_count)
+        self.assertEqual({0: 2, (1, 1): 2}, inc_importer._branch_to_child_count)
+        inc_importer._compute_merge_sort()
+        self.assertEqual([(self.E_id, (1, 2, 1), True, 1),
+                          (self.F_id, (1, 2, 2), False, 1),
+                          (self.G_id, (3,), False, 0),
+                         ], inc_importer._scheduled_stack)
 
     def test__incremental_find_interesting_ancestry(self):
         b = self.make_interesting_branch()
@@ -333,3 +340,5 @@
         self.assertEqual({self.D_id: ((2,), 0, 0), self.C_id: ((1,1,2), 0, 1),
                           self.B_id: ((1,1,1), 1, 1),
                          }, inc_importer._imported_dotted_revno)
+        self.assertEqual({(2,): self.D_id, (1,1,2): self.C_id,
+                          (1,1,1): self.B_id}, inc_importer._dotted_to_db_id)
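
As a rough, self-contained illustration of what _compute_merge_sort is
building (ignoring the incremental-import machinery, the database lookups,
and ghosts), a merge_sort over a plain in-memory parent map could be sketched
as below. merge_sort_sketch and its helpers are hypothetical names, not part
of the plugin; the sketch mirrors the push/pop logic from the patch and
returns (node, revno, end_of_merge, merge_depth) tuples in the same order as
_scheduled_stack.

def merge_sort_sketch(parent_map, tip):
    """Return [(node, dotted_revno, end_of_merge, merge_depth)] oldest-first.

    ``parent_map`` maps each node to a tuple of parents, left-hand
    (mainline) parent first.
    """
    numbered = {}             # node => (revno, end_of_merge, merge_depth)
    scheduled = []            # same ordering as _scheduled_stack
    seen_parents = set()
    revno_to_branch_count = {}
    # Stack entries: [node, depth, left_parent, pending_left,
    #                 pending_other_parents, is_first_child]
    stack = []

    def push(node, depth):
        parents = parent_map.get(node, ())
        if parents:
            left = parents[0]
            is_first = left not in seen_parents
            seen_parents.add(left)
        else:
            left, is_first = None, True
        stack.append([node, depth, left, left, list(parents[1:]), is_first])

    def pop():
        node, depth, left, _, _, is_first = stack.pop()
        if left is not None:
            parent_revno = numbered[left][0]
            if is_first:
                if len(parent_revno) == 1:      # e.g. (2,) => (3,)
                    revno = (parent_revno[0] + 1,)
                else:                           # e.g. (1,1,1) => (1,1,2)
                    revno = parent_revno[:2] + (parent_revno[2] + 1,)
            else:                               # new branch, e.g. (1,2,1)
                base = parent_revno[0]
                branch = revno_to_branch_count.get(base, 0) + 1
                revno_to_branch_count[base] = branch
                revno = (base, branch, 1)
        else:                                   # new root
            branch = revno_to_branch_count.get(0, -1) + 1
            revno_to_branch_count[0] = branch
            revno = (1,) if branch == 0 else (0, branch, 1)
        if not scheduled:
            end_of_merge = True
        else:
            prev_node, _, _, prev_depth = scheduled[-1]
            end_of_merge = (prev_depth < depth
                            or (prev_depth == depth
                                and prev_node not in parent_map.get(node, ())))
        numbered[node] = (revno, end_of_merge, depth)
        scheduled.append((node, revno, end_of_merge, depth))

    push(tip, 0)
    while stack:
        last = stack[-1]
        if last[3] is None and not last[4]:
            pop()
            continue
        while last[3] is not None or last[4]:
            if last[3] is not None:
                next_node, last[3] = last[3], None
            else:
                next_node = last[4].pop()
            if next_node in numbered:
                continue
            depth = last[1] if next_node == last[2] else last[1] + 1
            push(next_node, depth)
            break
    return scheduled

# A: root, B and C: children of A, D: merge of C into B's line.
parent_map = {'A': (), 'B': ('A',), 'C': ('A',), 'D': ('B', 'C')}
merge_sort_sketch(parent_map, 'D')
# => [('A', (1,), True, 0), ('B', (2,), False, 0),
#     ('C', (1, 1, 1), True, 1), ('D', (3,), False, 0)]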


