[MERGE][1.0] Graph.heads() optimization... Now 1770x
Aaron Bentley
aaron.bentley at utoronto.ca
Tue Dec 18 03:26:16 GMT 2007
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
John Arbash Meinel wrote:
Sorry to take so long getting to this review. This graph stuff is
always challenging for me.
> So I went ahead and change the functions so that step() actually takes a
> dictionary of parents that it is going to care about. And made
> "preload_parents" just a member of Graph. Which allows me to make it private,
> and doesn't involve running into a bzr-svn implementation that isn't providing it.
It would be great if you could rename that to something clearer, like
get_parents_map
> The reason the original code couldn't was because _BreadthFirstSearcher was
> defined as an iterator, and there was nowhere you could supply a parameter. But
> making "step()" a separate function it was easy to do, I just needed to take
> that step.
That looks like a reasonable change.
=== modified file 'bzrlib/graph.py'
- --- bzrlib/graph.py 2007-12-04 00:34:34 +0000
+++ bzrlib/graph.py 2007-12-07 20:10:53 +0000
@@ -88,6 +88,51 @@
return [found.get(r, None) for r in revision_ids]
+class CachingParentsProvider(object):
+ """A parents provider which will cache the revision => parents in a
dict.
+
+ This is useful for providers that have an expensive lookup.
+ """
+
+ def __init__(self, parent_provider):
+ self._real_provider = parent_provider
+ # Theoretically we could use an LRUCache here
+ self._cache = {}
+
+ def __repr__(self):
+ return "%s(%r)" % (self.__class__.__name__, self._real_provider)
+
+ def get_parents(self, revision_ids):
+ """Find revision ids of the parents of a list of revisions
+
+ A list is returned of the same length as the input. Each entry
+ is a list of parent ids for the corresponding input revision.
+
+ [NULL_REVISION] is used as the parent of the first user-committed
+ revision. Its parent list is empty.
+
+ If the revision is not present (i.e. a ghost), None is used in
place
+ of the list of parents.
+ """
+ needed = set()
^^^ I think this docstring should just reference the
StackedParentsProvider one, like the others do.
+ # TODO: is it worth having the dict here, or is it better to
just load
+ # the cache, and then return requests from it?
+ parents = {}
+ cache = self._cache
^^^ I don't see the need for the parents dict.
+ for revision_id in revision_ids:
+ if revision_id in cache:
+ parents[revision_id] = cache[revision_id]
+ else:
+ needed.add(revision_id)
^^^ this would just simplify to
if revision_id not in cache:
needed.add(revision_id)
+ if needed:
+ new_parents = dict(zip(needed,
+
self._real_provider.get_parents(needed)))
^^^ I prefer more explicit tests, like "if len(needed) > 0". Still, I
wouldn't demand such a change.
class Graph(object):
"""Provide incremental access to revision graphs.
@@ -183,40 +228,36 @@
active_searchers = searchers[:]
border_ancestors = set()
def update_common(searcher, revisions):
- - w_seen_ancestors = searcher.find_seen_ancestors(
- - revision)
+ w_seen_ancestors = searcher.find_seen_ancestors(revisions)
^^^ Is this a longstanding bug?
@@ -224,7 +265,19 @@
else:
border_ancestors.add(revision)
for searcher in searchers:
- - update_common(searcher, revision)
+ update_common(searcher, [revision])
+ next_to_search = set(common_searcher.will_search())
+ for searcher in active_searchers:
+ next_to_search.update(searcher.will_search())
+
+ def _preload_parents(self, keys):
+ """Call get_parents() for all of the keys we will need.
+
+ :return: A dictionary mapping key => parents
+ """
+ # TODO: Use izip? the number of keys here will probably always be
+ # relatively small, it may be faster to just use zip
+ return dict(zip(keys, self.get_parents(keys)))
^^^ This API should match get_parents_map, whatever API you've decided
to use for it.
@@ -295,9 +350,7 @@
if ancestor in common_walker.seen:
# some searcher has encountered our known common nodes:
# just stop it
- - ancestor_set = set([ancestor])
- - for searcher in searchers.itervalues():
- - searcher.stop_searching_any(ancestor_set)
+ new_common.add(ancestor)
^^^ "just stop it" comment no longer makes sense.
+ def will_search(self):
+ """Give the list of nodes that will be searched next."""
+ if self._search_revisions is None:
+ return self._start
^^^ I don't think this is really accurate. The next iteration will
yield the _start revisions. It won't traverse from them into their
parents. It's mostly harmless, but it's probably causing an extra
Graph._preload_parents call.
def __iter__(self):
return self
- - def find_seen_ancestors(self, revision):
- - """Find ancestors of this revision that have already been
seen."""
- - searcher = _BreadthFirstSearcher([revision],
self._parents_provider)
+ def find_seen_ancestors(self, revisions):
+ """Find ancestors of these revisions that have already been
seen."""
+ revisions = self.seen.intersection(revisions)
seen_ancestors = set()
- - for ancestors in searcher:
- - for ancestor in ancestors:
- - if ancestor not in self.seen:
- - searcher.stop_searching_any([ancestor])
- - else:
- - seen_ancestors.add(ancestor)
+ while revisions:
+ seen_ancestors.update(revisions)
+ next_revisions = set()
+ for parents in self._parents_provider.get_parents(revisions):
+ if parents is None:
+ continue
+ next_revisions.update(self.seen.intersection(parents))
+ revisions = next_revisions
return seen_ancestors
^^^ I don't understand the reasons for this change. You're doing a
breadth-first traversal; why not use a breadth-first searcher?
=== modified file 'bzrlib/tests/test_graph.py'
- --- bzrlib/tests/test_graph.py 2007-12-04 00:34:34 +0000
+++ bzrlib/tests/test_graph.py 2007-12-07 17:32:16 +0000
@@ -296,6 +317,16 @@
self.assertEqual((set([]), set(['rev3b', 'rev2b'])),
graph.find_difference('rev2a', 'rev3b'))
+ def test_graph_difference_extended_history(self):
+ graph = self.make_graph(extended_history_shortcut)
+ self.expectFailure('find_difference is confused by shortcuts',
+ self.assertEqual, (set(['e']), set(['f'])),
+ graph.find_difference('e', 'f'))
+ self.assertEqual((set(['e']), set(['f'])),
+ graph.find_difference('e', 'f'))
+ self.assertEqual((set(['f']), set(['e'])),
+ graph.find_difference('f', 'e'))
+
^^^ Thanks for adding this.
bb:tweak
The key things I'd like to see are:
1. update preload_parents to get_parents_map API
2. comment updates.
I'd also appreciate an explanation of why BreadthFirstSearcher can't be
used efficiently for find_seen_ancestors, but I trust you have measured
an efficiency gain there.
Aaron
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org
iD8DBQFHZz3Y0F+nu1YWqI0RApYXAJ9zNXjOgd38XcGbEDm+gtYGYd05MACeKglo
MtxB53Dgbu661NjB/HHiREA=
=fM34
-----END PGP SIGNATURE-----
More information about the bazaar
mailing list