Rev 3378: Bring in some of the changes from graph_update and graph_optimization in http://bzr.arbash-meinel.com/branches/bzr/1.4-dev/find_differences

John Arbash Meinel john at arbash-meinel.com
Tue Apr 22 19:55:17 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.4-dev/find_differences

------------------------------------------------------------
revno: 3378
revision-id: john at arbash-meinel.com-20080422184821-wd5n1y365ev60lyq
parent: pqm at pqm.ubuntu.com-20080422120059-sony5sthnlewabge
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: find_differences
timestamp: Tue 2008-04-22 13:48:21 -0500
message:
  Bring in some of the changes from graph_update and graph_optimization
  
  _BreadthFirstSearcher.find_seen_ancestors takes a list/set instead of a single node.
  find_difference() is still broken, but mostly just in ways that it was already broken.
modified:
  bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
  bzrlib/tests/test_graph.py     test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
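
For readers following the algorithm, the semantics find_difference() is
converging on can be pinned down with a brute-force oracle: compute the full
ancestor set of each side and take the set differences. A minimal standalone
sketch, assuming a plain dict-based parent map (the helper names are
illustrative, not bzrlib API):

    def all_ancestors(parent_map, revision):
        # Collect revision plus everything reachable through parent links.
        seen = set()
        pending = [revision]
        while pending:
            rev = pending.pop()
            if rev in seen or rev not in parent_map:
                continue  # already walked, or a ghost/NULL_REVISION
            seen.add(rev)
            pending.extend(parent_map[rev])
        return seen

    def exhaustive_difference(parent_map, left, right):
        # The answer find_difference() should give, computed the slow way.
        left_ancestors = all_ancestors(parent_map, left)
        right_ancestors = all_ancestors(parent_map, right)
        return (left_ancestors - right_ancestors,
                right_ancestors - left_ancestors)

    # d -> b -> a on one side, c -> a on the other.
    parents = {'a': [], 'b': ['a'], 'c': ['a'], 'd': ['b']}
    assert exhaustive_difference(parents, 'd', 'c') == (set(['d', 'b']),
                                                        set(['c']))

The searcher-based code in the patch below is an incremental approximation of
this oracle; the new _search_for_extra_common() pass exists because stopping
the searchers at the first border ancestors can leave common nodes
misclassified as unique.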
-------------- next part --------------
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2008-03-15 13:51:09 +0000
+++ b/bzrlib/graph.py	2008-04-22 18:48:21 +0000
@@ -199,10 +199,12 @@
 
     def find_difference(self, left_revision, right_revision):
         """Determine the graph difference between two revisions"""
-        border, common, (left, right) = self._find_border_ancestors(
+        border, common, searchers = self._find_border_ancestors(
             [left_revision, right_revision])
-        return (left.difference(right).difference(common),
-                right.difference(left).difference(common))
+        self._search_for_extra_common(common, searchers)
+        left = searchers[0].seen
+        right = searchers[1].seen
+        return (left.difference(right), right.difference(left))
 
     @symbol_versioning.deprecated_method(symbol_versioning.one_one)
     def get_parents(self, revisions):
@@ -261,15 +263,14 @@
         border_ancestors = set()
         def update_common(searcher, revisions):
             w_seen_ancestors = searcher.find_seen_ancestors(
-                revision)
+                [revision])
             stopped = searcher.stop_searching_any(w_seen_ancestors)
             common_ancestors.update(w_seen_ancestors)
             common_searcher.start_searching(stopped)
 
         while True:
             if len(active_searchers) == 0:
-                return border_ancestors, common_ancestors, [s.seen for s in
-                                                            searchers]
+                return border_ancestors, common_ancestors, searchers
             try:
                 new_common = common_searcher.next()
                 common_ancestors.update(new_common)
@@ -387,7 +388,7 @@
                         new_common.add(ancestor)
                         for searcher in searchers.itervalues():
                             seen_ancestors =\
-                                searcher.find_seen_ancestors(ancestor)
+                                searcher.find_seen_ancestors([ancestor])
                             searcher.stop_searching_any(seen_ancestors)
             common_walker.start_searching(new_common)
         return candidate_heads
@@ -469,6 +470,158 @@
         return set([candidate_descendant]) == self.heads(
             [candidate_ancestor, candidate_descendant])
 
+    def _search_for_extra_common(self, common, searchers):
+        """Make sure that unique nodes are genuinely unique.
+
+        After a simple search, we end up with genuine common nodes, but some
+        uncommon nodes might actually be descended from common nodes, and we
+        just didn't search far enough.
+
+        We know that we have searched enough when all common search tips are
+        descended from all unique (uncommon) nodes because we know that a node
+        cannot be an ancestor of its own ancestor.
+
+        :param common: A set of common nodes
+        :param searchers: The searchers returned from _find_border_ancestors
+        :return: None
+        """
+        # Basic algorithm...
+        #   A) The passed in searchers should all be on the same tips, thus
+        #      they should be considered the "common" searchers.
+        #   B) We find the difference between the searchers; these are the
+        #      "unique" nodes for each side.
+        #   C) We do a quick culling so that we only start searching from the
+        #      more interesting unique nodes. (A unique ancestor is more
+        #      interesting than any of its children.)
+        #   D) We start searching for ancestors common to all unique nodes.
+        #   E) We have the common searchers stop searching any ancestors of
+        #      nodes found by (D)
+        #   F) When there are no more common search tips, we stop
+        assert len(searchers) == 2, (
+            "Algorithm not yet implemented for > 2 searchers")
+        common_searchers = searchers
+        left_searcher = searchers[0]
+        right_searcher = searchers[1]
+        unique = left_searcher.seen.symmetric_difference(right_searcher.seen)
+        unique = self._remove_simple_descendants(unique,
+                    self.get_parent_map(unique))
+        # TODO: jam 20071214 Would it be possible to seed these searchers with
+        #       the revisions that we have already seen on each side?
+        #       Maybe something like:
+        #       unique_searchers = []
+        #       for left_unique in left.difference(right):
+        #          l_searcher = self._make_breadth_first_searcher(left_unique)
+        #          l_searcher.seen.update(left.seen)
+        #       ... (same for right.difference(left))
+        #       This might also be easier to set up for the case of >2
+        #       searchers.
+        unique_searchers = [self._make_breadth_first_searcher([r])
+                            for r in unique]
+        # Aggregate all of the searchers into a single common searcher; would
+        # it be okay to do this?
+        # common_searcher = self._make_breadth_first_searcher([])
+        # for searcher in searchers:
+        #     common_searcher.start_searching(searcher.will_search())
+        #     common_searcher.seen.update(searcher.seen)
+        common_ancestors_unique = set()
+
+        while True: # If we have no more nodes we have nothing to do
+            # XXX: Any nodes here which don't match between searchers indicate
+            #      that we have found a genuinely unique node, which would not
+            #      have been found by the other searching techniques
+            newly_seen_common = set()
+            for searcher in common_searchers:
+                newly_seen_common.update(searcher.step())
+            newly_seen_unique = set()
+            for searcher in unique_searchers:
+                newly_seen_unique.update(searcher.step())
+            new_common_unique = set()
+            for revision in newly_seen_unique:
+                if revision in common_ancestors_unique:
+                    # TODO: Do we need to add it to new_common_unique, since it
+                    #       seems to have already been found... ?
+                    new_common_unique.add(revision)
+                    continue
+                for searcher in unique_searchers:
+                    if revision not in searcher.seen:
+                        break
+                else:
+                    # This is a border because it is the first common node we
+                    # see after walking for a while.
+                    new_common_unique.add(revision)
+            if newly_seen_common:
+                # These are nodes found by one of the 'common' searchers.
+                # Make sure all searchers are on the same page.
+                for searcher in common_searchers:
+                    newly_seen_common.update(searcher.find_seen_ancestors(newly_seen_common))
+                for searcher in common_searchers:
+                    searcher.start_searching(newly_seen_common)
+            if new_common_unique:
+                # We found some ancestors that are common; jump all the way to
+                # their most ancestral node that we have already seen.
+                for searcher in unique_searchers:
+                    new_common_unique.update(searcher.find_seen_ancestors(new_common_unique))
+                # Since these are common, we can grab another set of ancestors
+                # that we have seen
+                for searcher in common_searchers:
+                    new_common_unique.update(searcher.find_seen_ancestors(new_common_unique))
+
+                # Now we have a complete set of common nodes which are
+                # ancestors of the unique nodes.
+                # We can tell all of the unique searchers to start at these
+                # nodes, and tell all of the common searchers to *stop*
+                # searching these nodes
+                for searcher in unique_searchers:
+                    searcher.start_searching(new_common_unique)
+                for searcher in common_searchers:
+                    searcher.stop_searching_any(new_common_unique)
+                common_ancestors_unique.update(new_common_unique)
+            if not newly_seen_common: # No new common nodes, we are done
+                break
+            if not newly_seen_unique: # No new unique nodes, we are done
+                break
+
+
+    def _remove_simple_descendants(self, revisions, parent_map):
+        """remove revisions which are children of other ones in the set
+
+        This doesn't do any graph searching, it just checks the immediate
+        parent_map to find if there are any children which can be removed.
+
+        :param revisions: A set of revision_ids
+        :return: A set of revision_ids with the children removed
+        """
+        simple_ancestors = revisions.copy()
+        # TODO: jam 20071214 we *could* restrict it to searching only the
+        #       parent_map of revisions already present in 'revisions', but
+        #       considering the general use case, I think this is actually
+        #       better.
+
+        # This is the same as the following loop. I don't know that it is any
+        # faster.
+        ## simple_ancestors.difference_update(r for r, p_ids in parent_map.iteritems()
+        ##     if p_ids is not None and revisions.intersection(p_ids))
+        ## return simple_ancestors
+
+        # Yet Another Way, invert the parent map (which can be cached)
+        ## descendants = {}
+        ## for revision_id, parent_ids in parent_map.iteritems():
+        ##   for p_id in parent_ids:
+        ##       descendants.setdefault(p_id, []).append(revision_id)
+        ## for revision in revisions.intersection(descendants):
+        ##   simple_ancestors.difference_update(descendants[revision])
+        ## return simple_ancestors
+        for revision, parent_ids in parent_map.iteritems():
+            if parent_ids is None:
+                continue
+            for parent_id in parent_ids:
+                if parent_id in revisions:
+                    # This node has a parent present in the set, so we can
+                    # remove it
+                    simple_ancestors.discard(revision)
+                    break
+        return simple_ancestors
+
 
 class HeadsCache(object):
     """A cache of results for graph heads calls."""
@@ -582,6 +735,12 @@
         return SearchResult(self._started_keys, excludes, len(included_keys),
             included_keys)
 
+    def step(self):
+        try:
+            return self.next()
+        except StopIteration:
+            return ()
+
     def next(self):
         """Return the next ancestors of this revision.
 
@@ -667,16 +826,18 @@
     def __iter__(self):
         return self
 
-    def find_seen_ancestors(self, revision):
-        """Find ancestors of this revision that have already been seen."""
-        searcher = _BreadthFirstSearcher([revision], self._parents_provider)
+    def find_seen_ancestors(self, revisions):
+        """Find ancestors of these revisions that have already been seen."""
+        searcher = _BreadthFirstSearcher(revisions, self._parents_provider)
         seen_ancestors = set()
         for ancestors in searcher:
+            stop_nodes = set()
             for ancestor in ancestors:
                 if ancestor not in self.seen:
-                    searcher.stop_searching_any([ancestor])
+                    stop_nodes.add(ancestor)
                 else:
                     seen_ancestors.add(ancestor)
+            searcher.stop_searching_any(stop_nodes)
         return seen_ancestors
 
     def stop_searching_any(self, revisions):

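A note on step (C) of the algorithm above: _remove_simple_descendants()
consults only the immediate parent_map, discarding any candidate whose parent
is also a candidate, so that only the most ancestral representatives seed the
unique searchers. The same culling loop as a standalone sketch (rewritten here
for illustration, with dict.items() in place of iteritems):

    def remove_simple_descendants(revisions, parent_map):
        # Drop any revision whose parent is itself in the candidate set.
        # No graph searching happens; only direct parent links are checked.
        simple_ancestors = set(revisions)
        for revision, parent_ids in parent_map.items():
            if parent_ids is None:
                continue  # parents unknown (ghost); nothing to check
            if any(parent_id in revisions for parent_id in parent_ids):
                simple_ancestors.discard(revision)
        return simple_ancestors

    # Mirrors test__remove_simple_descendants_chain: rev3 -> rev2a -> rev1
    parent_map = {'rev1': [], 'rev2a': ['rev1'], 'rev3': ['rev2a']}
    assert remove_simple_descendants(
        set(['rev1', 'rev2a', 'rev3']), parent_map) == set(['rev1'])
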
=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py	2008-03-10 15:10:47 +0000
+++ b/bzrlib/tests/test_graph.py	2008-04-22 18:48:21 +0000
@@ -164,8 +164,8 @@
 
 # Complex shortcut
 # This has a failure mode in that a shortcut will find some nodes in common,
 # but the common searcher won't have time to find that one branch is actually
-# in common. The extra nodes at the top are because we want to avoid
+# in common. The extra nodes at the beginning are because we want to avoid
 # walking off the graph. Specifically, node G should be considered common, but
 # is likely to be seen by M long before the common searcher finds it.
 #
@@ -181,6 +181,30 @@
 #     |\
 #     e f
 #     | |\
+#     | g h
+#     |/| |
+#     i j |
+#     | | |
+#     | k |
+#     | | |
+#     | l |
+#     |/|/
+#     m n
+complex_shortcut = {'a':[NULL_REVISION], 'b':['a'], 'c':['b'], 'd':['c'],
+                    'e':['d'], 'f':['d'], 'g':['f'], 'h':['f'],
+                    'i':['e', 'g'], 'j':['g'], 'k':['j'],
+                    'l':['k'], 'm':['i', 'l'], 'n':['l', 'h']}
+
+# NULL_REVISION
+#     |
+#     d
+#     |\
+#     x |
+#     | |
+#     y |
+#     | |
+#     e f
+#     | |\
 #     i | h
 #     |\| |
 #     | g |
@@ -192,7 +216,7 @@
 #     | l |
 #     |/|/
 #     m n
-complex_shortcut = {'d':[NULL_REVISION],
+complex_shortcut2 = {'d':[NULL_REVISION],
                     'x':['d'], 'y':['x'],
                     'e':['y'], 'f':['d'], 'g':['f', 'i'], 'h':['f'],
                     'i':['e'], 'j':['g'], 'k':['j'],
@@ -357,6 +381,31 @@
                          graph.find_unique_lca('rev2a', 'rev2b',
                          count_steps=True))
 
+    def assertRemoveDescendants(self, expected, graph, revisions):
+        parents = graph.get_parent_map(revisions)
+        self.assertEqual(expected,
+                         graph._remove_simple_descendants(revisions, parents))
+
+    def test__remove_simple_descendants(self):
+        graph = self.make_graph(ancestry_1)
+        self.assertRemoveDescendants(set(['rev1']), graph,
+            set(['rev1', 'rev2a', 'rev2b', 'rev3', 'rev4']))
+
+    def test__remove_simple_descendants_disjoint(self):
+        graph = self.make_graph(ancestry_1)
+        self.assertRemoveDescendants(set(['rev1', 'rev3']), graph,
+            set(['rev1', 'rev3']))
+
+    def test__remove_simple_descendants_chain(self):
+        graph = self.make_graph(ancestry_1)
+        self.assertRemoveDescendants(set(['rev1']), graph,
+            set(['rev1', 'rev2a', 'rev3']))
+
+    def test__remove_simple_descendants_siblings(self):
+        graph = self.make_graph(ancestry_1)
+        self.assertRemoveDescendants(set(['rev2a', 'rev2b']), graph,
+            set(['rev2a', 'rev2b', 'rev3']))
+
     def test_unique_lca_criss_cross(self):
         """Ensure we don't pick non-unique lcas in a criss-cross"""
         graph = self.make_graph(criss_cross)
@@ -427,9 +476,9 @@
 
     def test_graph_difference_extended_history(self):
         graph = self.make_graph(extended_history_shortcut)
-        self.expectFailure('find_difference cannot handle shortcuts',
-            self.assertEqual, (set(['e']), set(['f'])),
-                graph.find_difference('e', 'f'))
+        # self.expectFailure('find_difference cannot handle shortcuts',
+        #     self.assertEqual, (set(['e']), set(['f'])),
+        #         graph.find_difference('e', 'f'))
         self.assertEqual((set(['e']), set(['f'])),
                          graph.find_difference('e', 'f'))
         self.assertEqual((set(['f']), set(['e'])),
@@ -442,17 +491,22 @@
 
     def test_graph_difference_complex_shortcut(self):
         graph = self.make_graph(complex_shortcut)
-        self.expectFailure('find_difference cannot handle shortcuts',
-            self.assertEqual, (set(['m']), set(['h', 'n'])),
-                graph.find_difference('m', 'n'))
+        # self.expectFailure('find_difference cannot handle shortcuts',
+        #     self.assertEqual, (set(['m']), set(['h', 'n'])),
+        #         graph.find_difference('m', 'n'))
+        self.assertEqual((set(['m', 'i', 'e']), set(['n', 'h'])),
+                         graph.find_difference('m', 'n'))
+
+    def test_graph_difference_complex_shortcut2(self):
+        graph = self.make_graph(complex_shortcut2)
         self.assertEqual((set(['m']), set(['h', 'n'])),
                          graph.find_difference('m', 'n'))
 
     def test_graph_difference_shortcut_extra_root(self):
         graph = self.make_graph(shortcut_extra_root)
-        self.expectFailure('find_difference cannot handle shortcuts',
-            self.assertEqual, (set(['e']), set(['f', 'g'])),
-                graph.find_difference('e', 'f'))
+        # self.expectFailure('find_difference cannot handle shortcuts',
+        #     self.assertEqual, (set(['e']), set(['f', 'g'])),
+        #         graph.find_difference('e', 'f'))
         self.assertEqual((set(['e']), set(['f', 'g'])),
                          graph.find_difference('e', 'f'))
 


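The updated expectations in test_graph_difference_complex_shortcut can be
sanity-checked by walking the ancestry dict exhaustively. A self-contained
snippet (NULL_REVISION is written out literally so the example does not depend
on bzrlib):

    NULL_REVISION = 'null:'

    complex_shortcut = {'a': [NULL_REVISION], 'b': ['a'], 'c': ['b'],
                        'd': ['c'], 'e': ['d'], 'f': ['d'], 'g': ['f'],
                        'h': ['f'], 'i': ['e', 'g'], 'j': ['g'], 'k': ['j'],
                        'l': ['k'], 'm': ['i', 'l'], 'n': ['l', 'h']}

    def ancestors(parent_map, revision):
        # Iterative walk over every parent link, including revision itself.
        seen = set()
        pending = [revision]
        while pending:
            rev = pending.pop()
            if rev not in seen and rev in parent_map:
                seen.add(rev)
                pending.extend(parent_map[rev])
        return seen

    m_ancs = ancestors(complex_shortcut, 'm')
    n_ancs = ancestors(complex_shortcut, 'n')
    # Matches the new assertion: (set(['m', 'i', 'e']), set(['n', 'h']))
    assert m_ancs - n_ancs == set(['m', 'i', 'e'])
    assert n_ancs - m_ancs == set(['n', 'h'])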

More information about the bazaar-commits mailing list