Rev 3425: Lots of refactoring for find_unique_ancestors. in http://bzr.arbash-meinel.com/branches/bzr/1.4-dev/find_unique_ancestors
John Arbash Meinel
john at arbash-meinel.com
Wed May 21 00:10:31 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.4-dev/find_unique_ancestors
------------------------------------------------------------
revno: 3425
revision-id: john at arbash-meinel.com-20080520231020-wh8g3pq0whu2p78i
parent: john at arbash-meinel.com-20080513221726-vqpvwthr1tncpb29
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: find_unique_ancestors
timestamp: Tue 2008-05-20 18:10:20 -0500
message:
Lots of refactoring for find_unique_ancestors.
Split up the function into a bunch of helper functions.
modified:
bzrlib/graph.py graph_walker.py-20070525030359-y852guab65d4wtn0-1
-------------- next part --------------
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py 2008-05-13 22:17:26 +0000
+++ b/bzrlib/graph.py 2008-05-20 23:10:20 +0000
@@ -240,19 +240,49 @@
# information you have so far.
# 5) Continue searching, stopping the common searches when the search
# tip is an ancestor of all unique nodes.
- # 6) Search is done when all common searchers have completed.
-
+ # 6) Aggregate together unique searchers when they are searching the
+ # same tips. When all unique searchers are searching the same node,
+ # move it to a single 'all_unique_searcher'.
+ # 7) The 'all_unique_searcher' represents the very 'tip' of searching.
+ # Most of the time this produces very little important information.
+ # So don't step it as quickly as the other searchers.
+ # 8) Search is done when all common searchers have completed.
+
+ unique_searcher, common_searcher = self._find_initial_unique_nodes(
+ [unique_revision], common_revisions)
+
+ unique_nodes = unique_searcher.seen.difference(common_searcher.seen)
+ if not unique_nodes:
+ return unique_nodes
+
+ (all_unique_searcher,
+ unique_tip_searchers) = self._make_unique_searchers(unique_nodes,
+ unique_searcher, common_searcher)
+
+ self._refine_unique_nodes(unique_searcher, all_unique_searcher,
+ unique_tip_searchers, common_searcher)
+ true_unique_nodes = unique_nodes.difference(common_searcher.seen)
if 'graph' in debug.debug_flags:
- _mutter = trace.mutter
- else:
- def _mutter(*args, **kwargs):
- pass
-
- unique_searcher = self._make_breadth_first_searcher([unique_revision])
- # we know that unique_revision isn't in common_revisions
+ trace.mutter('Found %s truly unique nodes out of %s',
+ len(true_unique_nodes), len(unique_nodes))
+ return true_unique_nodes
+
+ def _find_initial_unique_nodes(self, unique_revisions, common_revisions):
+ """Steps 1-3 of find_unique_ancestors.
+
+ Find the maximal set of unique nodes. Some of these might actually
+ still be common, but we are sure that there are no other unique nodes.
+
+ :return: (unique_searcher, common_searcher)
+ """
+
+ unique_searcher = self._make_breadth_first_searcher(unique_revisions)
+ # we know that unique_revisions aren't in common_revisions, so skip
+ # past them.
unique_searcher.next()
common_searcher = self._make_breadth_first_searcher(common_revisions)
+ # As long as we are still finding unique nodes, keep searching
while unique_searcher._next_query:
next_unique_nodes = set(unique_searcher.step())
next_common_nodes = set(common_searcher.step())
@@ -264,27 +294,36 @@
unique_are_common_nodes.update(
next_common_nodes.intersection(unique_searcher.seen))
if unique_are_common_nodes:
+ ancestors = unique_searcher.find_seen_ancestors(
+ unique_are_common_nodes)
# TODO: This is a bit overboard, we only really care about
# the ancestors of the tips because the rest we
# already know. This is *correct* but causes us to
- # search far too much ancestry.
- ancestors = unique_searcher.find_seen_ancestors(
- unique_are_common_nodes)
+ # search too much ancestry.
ancestors.update(common_searcher.find_seen_ancestors(ancestors))
unique_searcher.stop_searching_any(ancestors)
common_searcher.start_searching(ancestors)
- unique_nodes = unique_searcher.seen.difference(common_searcher.seen)
- if not unique_nodes:
- return unique_nodes
+ return unique_searcher, common_searcher
+
+
+ def _make_unique_searchers(self, unique_nodes, unique_searcher,
+ common_searcher):
+ """Create a searcher for all the unique search tips (step 4).
+
+ As a side effect, the common_searcher will stop searching any nodes
+ that are ancestors of the unique searcher tips.
+
+ :return: (all_unique_searcher, unique_tip_searchers)
+ """
unique_tips = self._remove_simple_descendants(unique_nodes,
self.get_parent_map(unique_nodes))
if len(unique_tips) == 1:
- unique_searchers = []
+ unique_tip_searchers = []
ancestor_all_unique = unique_searcher.find_seen_ancestors(unique_tips)
else:
- unique_searchers = []
+ unique_tip_searchers = []
for tip in unique_tips:
revs_to_search = unique_searcher.find_seen_ancestors([tip])
revs_to_search.update(
@@ -293,18 +332,21 @@
# We don't care about the starting nodes.
searcher._label = tip
searcher.step()
- unique_searchers.append(searcher)
+ unique_tip_searchers.append(searcher)
ancestor_all_unique = None
- for searcher in unique_searchers:
+ for searcher in unique_tip_searchers:
if ancestor_all_unique is None:
ancestor_all_unique = set(searcher.seen)
else:
ancestor_all_unique = ancestor_all_unique.intersection(
searcher.seen)
# Collapse all the common nodes into a single searcher
- all_unique_searcher = self._make_breadth_first_searcher(ancestor_all_unique)
+ all_unique_searcher = self._make_breadth_first_searcher(
+ ancestor_all_unique)
if ancestor_all_unique:
+ # We've seen these nodes in all the searchers, so we'll just go to
+ # the next tips
all_unique_searcher.step()
# Stop any search tips that are already known as ancestors of the
@@ -313,55 +355,137 @@
common_searcher.find_seen_ancestors(ancestor_all_unique))
total_stopped = 0
- for searcher in unique_searchers:
+ for searcher in unique_tip_searchers:
total_stopped += len(searcher.stop_searching_any(
searcher.find_seen_ancestors(ancestor_all_unique)))
- _mutter('For %s unique nodes, created %s + 1 unique searchers'
- ' (%s stopped search tips, %s common ancestors'
- ' (%s stopped common)',
- len(unique_nodes), len(unique_searchers), total_stopped,
- len(ancestor_all_unique), len(stopped_common))
- del ancestor_all_unique, stopped_common
-
- # We step the ancestor_all_unique searcher only every other step
- step_all_unique = 0
+ if 'graph' in debug.debug_flags:
+ trace.mutter('For %s unique nodes, created %s + 1 unique searchers'
+ ' (%s stopped search tips, %s common ancestors'
+ ' (%s stopped common)',
+ len(unique_nodes), len(unique_tip_searchers),
+ total_stopped, len(ancestor_all_unique),
+ len(stopped_common))
+ return all_unique_searcher, unique_tip_searchers
+
+ def _step_unique_and_common_searchers(self, common_searcher,
+ unique_tip_searchers,
+ unique_searcher):
+ newly_seen_common = set(common_searcher.step())
+ newly_seen_unique = set()
+ for searcher in unique_tip_searchers:
+ next = set(searcher.step())
+ next.update(unique_searcher.find_seen_ancestors(next))
+ next.update(common_searcher.find_seen_ancestors(next))
+ for alt_searcher in unique_tip_searchers:
+ if alt_searcher is searcher:
+ continue
+ next.update(alt_searcher.find_seen_ancestors(next))
+ searcher.start_searching(next)
+ newly_seen_unique.update(next)
+ return newly_seen_common, newly_seen_unique,
+
+ def _find_nodes_common_to_all_unique(self, unique_tip_searchers,
+ all_unique_searcher,
+ newly_seen_unique, step_all_unique):
+ """Find nodes that are common to all unique_tip_searchers.
+
+ If it is time, step the all_unique_searcher, and add its nodes to the
+ result.
+ """
+ unique_are_common_nodes = newly_seen_unique.copy()
+ for searcher in unique_tip_searchers:
+ unique_are_common_nodes.intersection_update(searcher.seen)
+ unique_are_common_nodes.intersection_update(
+ all_unique_searcher.seen)
+ # Step all-unique less frequently than the other searchers.
+ # In the common case, we don't need to spider out far here, so
+ # avoid doing extra work.
+ if not step_all_unique:
+ tstart = time.clock()
+ nodes = all_unique_searcher.step()
+ unique_are_common_nodes.update(nodes)
+ tdelta = time.clock() - tstart
+ if 'graph' in debug.debug_flags:
+ trace.mutter('all_unique_searcher step() took %.3fs'
+ 'for %d nodes (%d total), iteration: %s',
+ tdelta, len(nodes), len(all_unique_searcher.seen),
+ all_unique_searcher._iterations)
+ return unique_are_common_nodes
+
+ def _collapse_unique_searchers(self, unique_tip_searchers,
+ unique_are_common_nodes):
+ """Combine searchers that are searching the same tips.
+
+ When two searchers are searching the same tips, we can stop one of the
+ searchers. We also know that the maximal set of common ancestors is the
+ intersection of the two original searchers.
+
+ :return: A list of searchers that are searching unique nodes.
+ """
+ # Filter out searchers that don't actually search different
+ # nodes. We already have the ancestry intersection for them
+ unique_search_tips = {}
+ for searcher in unique_tip_searchers:
+ stopped = searcher.stop_searching_any(unique_are_common_nodes)
+ will_search_set = frozenset(searcher._next_query)
+ if not will_search_set:
+ if 'graph' in debug.debug_flags:
+ trace.mutter('Unique searcher %s was stopped.'
+ ' (%s iterations) %d nodes stopped',
+ searcher._label,
+ searcher._iterations,
+ len(stopped))
+ elif will_search_set not in unique_search_tips:
+ # This searcher is searching a unique set of nodes, let it
+ unique_search_tips[will_search_set] = [searcher]
+ else:
+ unique_search_tips[will_search_set].append(searcher)
+ # TODO: it might be possible to collapse searchers faster when they
+ # only have *some* search tips in common.
+ next_unique_searchers = []
+ for searchers in unique_search_tips.itervalues():
+ if len(searchers) == 1:
+ # Searching unique tips, go for it
+ next_unique_searchers.append(searchers[0])
+ else:
+ # These searchers have started searching the same tips, we
+ # don't need them to cover the same ground. The
+ # intersection of their ancestry won't change, so create a
+ # new searcher, combining their histories.
+ next_searcher = searchers[0]
+ for searcher in searchers[1:]:
+ next_searcher.seen.intersection_update(searcher.seen)
+ if 'graph' in debug.debug_flags:
+ trace.mutter('Combining %s searchers into a single'
+ ' searcher searching %s nodes with'
+ ' %s ancestry',
+ len(searchers),
+ len(next_searcher._next_query),
+ len(next_searcher.seen))
+ next_unique_searchers.append(next_searcher)
+ return next_unique_searchers
+
+ def _refine_unique_nodes(self, unique_searcher, all_unique_searcher,
+ unique_tip_searchers, common_searcher):
+ """Steps 5-8 of find_unique_ancestors.
+
+ This function returns when common_searcher has stopped searching for
+ more nodes.
+ """
+ # We step the ancestor_all_unique searcher only every
+ # STEP_UNIQUE_SEARCHER_EVERY steps.
+ step_all_unique_counter = 0
# While we still have common nodes to search
while common_searcher._next_query:
- newly_seen_common = set(common_searcher.step())
- newly_seen_unique = set()
- def step_searchers():
- for searcher in unique_searchers:
- next = set(searcher.step())
- next.update(unique_searcher.find_seen_ancestors(next))
- next.update(common_searcher.find_seen_ancestors(next))
- for alt_searcher in unique_searchers:
- if alt_searcher is searcher:
- continue
- next.update(alt_searcher.find_seen_ancestors(next))
- searcher.start_searching(next)
- newly_seen_unique.update(next)
- step_searchers()
+ (newly_seen_common,
+ newly_seen_unique) = self._step_unique_and_common_searchers(
+ common_searcher, unique_tip_searchers, unique_searcher)
# These nodes are common ancestors of all unique nodes
- unique_are_common_nodes = newly_seen_unique.copy()
- def intersect_searchers():
- for searcher in unique_searchers:
- unique_are_common_nodes.intersection_update(searcher.seen)
- unique_are_common_nodes.intersection_update(
- all_unique_searcher.seen)
- # Step the all-unique less frequently than the other searchers.
- # In the common case, we don't need to spider out far here, so
- # avoid doing extra work.
- if not step_all_unique:
- tstart = time.clock()
- nodes = all_unique_searcher.step()
- unique_are_common_nodes.update(nodes)
- tdelta = time.clock() - tstart
- _mutter('all_unique_searcher step() took %.3fs for %d nodes'
- ' (%d total), iteration: %s', tdelta,
- len(nodes), len(all_unique_searcher.seen),
- all_unique_searcher._iterations)
- intersect_searchers()
- step_all_unique = (step_all_unique + 1) % STEP_UNIQUE_SEARCHER_EVERY
+ unique_are_common_nodes = self._find_nodes_common_to_all_unique(
+ unique_tip_searchers, all_unique_searcher, newly_seen_unique,
+ step_all_unique_counter==0)
+ step_all_unique_counter = ((step_all_unique_counter + 1)
+ % STEP_UNIQUE_SEARCHER_EVERY)
if newly_seen_common:
# If a 'common' node is an ancestor of all unique searchers, we
@@ -376,56 +500,16 @@
all_unique_searcher.start_searching(unique_are_common_nodes)
common_searcher.stop_searching_any(unique_are_common_nodes)
- def filter_searchers():
- # Filter out searchers that don't actually search different
- # nodes. We already have the ancestry intersection for them
- unique_search_tips = {}
- for searcher in unique_searchers:
- stopped = searcher.stop_searching_any(unique_are_common_nodes)
- will_search_set = frozenset(searcher._next_query)
- if not will_search_set:
- _mutter('Unique searcher %s was stopped.'
- ' (%s iterations) %d nodes stopped',
- searcher._label,
- searcher._iterations,
- len(stopped))
- elif will_search_set not in unique_search_tips:
- # This searcher is searching a unique set of nodes, let it
- unique_search_tips[will_search_set] = [searcher]
- else:
- unique_search_tips[will_search_set].append(searcher)
- next_unique_searchers = []
- for searchers in unique_search_tips.itervalues():
- if len(searchers) == 1:
- # Searching unique tips, go for it
- next_unique_searchers.append(searchers[0])
- else:
- # These searchers have started searching the same tips, we
- # don't need them to cover the same ground. The
- # intersection of their ancestry won't change, so create a
- # new searcher, combining their histories.
- next_searcher = searchers[0]
- ancestor_intersection = next_searcher.seen
- for searcher in searchers[1:]:
- ancestor_intersection = ancestor_intersection.intersection(searcher.seen)
- next_searcher.seen = ancestor_intersection
- _mutter('Combining %s searchers into a single searcher'
- ' searching %s nodes with %s ancestry',
- len(searchers), len(next_searcher._next_query),
- len(next_searcher.seen))
- next_unique_searchers.append(next_searcher)
- return next_unique_searchers
- next_unique_searchers = filter_searchers()
- if len(unique_searchers) != len(next_unique_searchers):
- _mutter('Collapsed %s unique searchers => %s'
- ' at %s iterations',
- len(unique_searchers), len(next_unique_searchers),
- all_unique_searcher._iterations)
- unique_searchers = next_unique_searchers
- true_unique_nodes = unique_nodes.difference(common_searcher.seen)
- _mutter('Found %s truly unique nodes out of %s',
- len(true_unique_nodes), len(unique_nodes))
- return true_unique_nodes
+ next_unique_searchers = self._collapse_unique_searchers(
+ unique_tip_searchers, unique_are_common_nodes)
+ if len(unique_tip_searchers) != len(next_unique_searchers):
+ if 'graph' in debug.debug_flags:
+ trace.mutter('Collapsed %s unique searchers => %s'
+ ' at %s iterations',
+ len(unique_tip_searchers),
+ len(next_unique_searchers),
+ all_unique_searcher._iterations)
+ unique_tip_searchers = next_unique_searchers
@symbol_versioning.deprecated_method(symbol_versioning.one_one)
def get_parents(self, revisions):
@@ -1095,7 +1179,16 @@
return self
def find_seen_ancestors(self, revisions):
- """Find ancestors of these revisions that have already been seen."""
+ """Find ancestors of these revisions that have already been seen.
+
+ This function generally makes the assumption that querying for the
+ parents of a node that has already been queried is reasonably cheap.
+ (eg, not a round trip to a remote host).
+ """
+ # TODO: Often we might ask one searcher for its seen ancestors, and
+ # then ask another searcher the same question. This can result in
+ # searching the same revisions repeatedly if the two searchers
+ # have a lot of overlap.
all_seen = self.seen
pending = set(revisions).intersection(all_seen)
seen_ancestors = set(pending)
@@ -1130,6 +1223,9 @@
None of the specified revisions are required to be present in the
search list. In this case, the call is a no-op.
"""
+ # TODO: does this help performance?
+ # if not revisions:
+ # return set()
revisions = frozenset(revisions)
if self._returning == 'next':
stopped = self._next_query.intersection(revisions)
More information about the bazaar-commits
mailing list