[MERGE][1.0] Graph.heads() optimization... Now 1770x

Tue Dec 18 03:26:16 GMT 2007

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

John Arbash Meinel wrote:

Sorry to take so long getting to this review.  This graph stuff is
always challenging for me.

> So I went ahead and change the functions so that step() actually takes a
> dictionary of parents that it is going to care about. And made
> "preload_parents" just a member of Graph. Which allows me to make it private,
> and doesn't involve running into a bzr-svn implementation that isn't providing it.

It would be great if you could rename that to something clearer, like
get_parents_map

> The reason the original code couldn't was because _BreadthFirstSearcher was
> defined as an iterator, and there was nowhere you could supply a parameter. But
> making "step()" a separate function it was easy to do, I just needed to take
> that step.

That looks like a reasonable change.

=== modified file 'bzrlib/graph.py'
- --- bzrlib/graph.py	2007-12-04 00:34:34 +0000
+++ bzrlib/graph.py	2007-12-07 20:10:53 +0000
@@ -88,6 +88,51 @@
         return [found.get(r, None) for r in revision_ids]


+class CachingParentsProvider(object):
+    """A parents provider which will cache the revision => parents in a
dict.
+
+    This is useful for providers that have an expensive lookup.
+    """
+
+    def __init__(self, parent_provider):
+        self._real_provider = parent_provider
+        # Theoretically we could use an LRUCache here
+        self._cache = {}
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self._real_provider)
+
+    def get_parents(self, revision_ids):
+        """Find revision ids of the parents of a list of revisions
+
+        A list is returned of the same length as the input.  Each entry
+        is a list of parent ids for the corresponding input revision.
+
+        [NULL_REVISION] is used as the parent of the first user-committed
+        revision.  Its parent list is empty.
+
+        If the revision is not present (i.e. a ghost), None is used in
place
+        of the list of parents.
+        """
+        needed = set()

^^^ I think this docstring should just reference the
StackedParentsProvider one, like the others do.


+        # TODO: is it worth having the dict here, or is it better to
just load
+        # the cache, and then return requests from it?
+        parents = {}
+        cache = self._cache

^^^ I don't see the need for the parents dict.

+        for revision_id in revision_ids:
+            if revision_id in cache:
+                parents[revision_id] = cache[revision_id]
+            else:
+                needed.add(revision_id)

^^^ this would just simplify to
if revision_id not in cache:
    needed.add(revision_id)

+        if needed:
+            new_parents = dict(zip(needed,
+
self._real_provider.get_parents(needed)))

^^^ I prefer more explicit tests, like "if len(needed) > 0".  Still, I
wouldn't demand such a change.

 class Graph(object):
     """Provide incremental access to revision graphs.

@@ -183,40 +228,36 @@
         active_searchers = searchers[:]
         border_ancestors = set()
         def update_common(searcher, revisions):
- -            w_seen_ancestors = searcher.find_seen_ancestors(
- -                revision)
+            w_seen_ancestors = searcher.find_seen_ancestors(revisions)

^^^ Is this a longstanding bug?


@@ -224,7 +265,19 @@
                 else:
                     border_ancestors.add(revision)
                     for searcher in searchers:
- -                        update_common(searcher, revision)
+                        update_common(searcher, [revision])
+            next_to_search = set(common_searcher.will_search())
+            for searcher in active_searchers:
+                next_to_search.update(searcher.will_search())
+
+    def _preload_parents(self, keys):
+        """Call get_parents() for all of the keys we will need.
+
+        :return: A dictionary mapping key => parents
+        """
+        # TODO: Use izip? the number of keys here will probably always be
+        #       relatively small, it may be faster to just use zip
+        return dict(zip(keys, self.get_parents(keys)))

^^^ This API should match get_parents_map, whatever API you've decided
to use for it.

@@ -295,9 +350,7 @@
                 if ancestor in common_walker.seen:
                     # some searcher has encountered our known common nodes:
                     # just stop it
- -                    ancestor_set = set([ancestor])
- -                    for searcher in searchers.itervalues():
- -                        searcher.stop_searching_any(ancestor_set)
+                    new_common.add(ancestor)


^^^ "just stop it" comment no longer makes sense.

+    def will_search(self):
+        """Give the list of nodes that will be searched next."""
+        if self._search_revisions is None:
+            return self._start

^^^ I don't think this is really accurate.  The next iteration will
yield  the _start revisions.  It won't traverse from them into their
parents.  It's mostly harmless, but it's probably causing an extra
Graph._preload_parents call.


     def __iter__(self):
         return self

- -    def find_seen_ancestors(self, revision):
- -        """Find ancestors of this revision that have already been
seen."""
- -        searcher = _BreadthFirstSearcher([revision],
self._parents_provider)
+    def find_seen_ancestors(self, revisions):
+        """Find ancestors of these revisions that have already been
seen."""
+        revisions = self.seen.intersection(revisions)
         seen_ancestors = set()
- -        for ancestors in searcher:
- -            for ancestor in ancestors:
- -                if ancestor not in self.seen:
- -                    searcher.stop_searching_any([ancestor])
- -                else:
- -                    seen_ancestors.add(ancestor)
+        while revisions:
+            seen_ancestors.update(revisions)
+            next_revisions = set()
+            for parents in self._parents_provider.get_parents(revisions):
+                if parents is None:
+                    continue
+                next_revisions.update(self.seen.intersection(parents))
+            revisions = next_revisions
         return seen_ancestors

^^^ I don't understand the reasons for this change.  You're doing a
breadth-first traversal; why not use a breadth-first searcher?

=== modified file 'bzrlib/tests/test_graph.py'
- --- bzrlib/tests/test_graph.py	2007-12-04 00:34:34 +0000
+++ bzrlib/tests/test_graph.py	2007-12-07 17:32:16 +0000
@@ -296,6 +317,16 @@
         self.assertEqual((set([]), set(['rev3b', 'rev2b'])),
                          graph.find_difference('rev2a', 'rev3b'))

+    def test_graph_difference_extended_history(self):
+        graph = self.make_graph(extended_history_shortcut)
+        self.expectFailure('find_difference is confused by shortcuts',
+            self.assertEqual, (set(['e']), set(['f'])),
+            graph.find_difference('e', 'f'))
+        self.assertEqual((set(['e']), set(['f'])),
+                         graph.find_difference('e', 'f'))
+        self.assertEqual((set(['f']), set(['e'])),
+                         graph.find_difference('f', 'e'))
+

^^^ Thanks for adding this.

bb:tweak

The key things I'd like to see are:
1. update preload_parents to get_parents_map API
2. comment updates.

I'd also appreciate an explanation of why BreadthFirstSearcher can't be
used efficiently for find_seen_ancestors, but I trust you have measured
an efficiency gain there.

Aaron
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQFHZz3Y0F+nu1YWqI0RApYXAJ9zNXjOgd38XcGbEDm+gtYGYd05MACeKglo
MtxB53Dgbu661NjB/HHiREA=
=fM34
-----END PGP SIGNATURE-----