Rev 3100: Implement get_parent_map for ParentProviders in http://bzr.arbash-meinel.com/branches/bzr/1.1-dev/graph_optimization
John Arbash Meinel
john at arbash-meinel.com
Tue Dec 18 17:07:16 GMT 2007
At http://bzr.arbash-meinel.com/branches/bzr/1.1-dev/graph_optimization
------------------------------------------------------------
revno: 3100
revision-id:john at arbash-meinel.com-20071218170642-nls9ory76cmh4r6y
parent: pqm at pqm.ubuntu.com-20071210120611-a3j02d26cbzvlyju
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: graph_optimization
timestamp: Tue 2007-12-18 11:06:42 -0600
message:
Implement get_parent_map for ParentProviders
Add a CachingParentProvider for PackRepository to use.
Add some XFAIL tests for the find_difference algorithm.
modified:
bzrlib/graph.py graph_walker.py-20070525030359-y852guab65d4wtn0-1
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/repofmt/knitrepo.py knitrepo.py-20070206081537-pyy4a00xdas0j4pf-1
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/repository.py rev_storage.py-20051111201905-119e9401e46257e3
bzrlib/tests/test_graph.py test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
-------------- next part --------------
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py 2007-12-07 05:31:54 +0000
+++ b/bzrlib/graph.py 2007-12-18 17:06:42 +0000
@@ -55,6 +55,11 @@
def get_parents(self, revisions):
return [self.ancestry.get(r, None) for r in revisions]
+ def get_parent_map(self, keys):
+ """See _StackedParentsProvider.get_parent_map"""
+ ancestry = self.ancestry
+ return dict((k, ancestry[k]) for k in keys if k in ancestry)
+
class _StackedParentsProvider(object):
@@ -76,17 +81,76 @@
If the revision is not present (i.e. a ghost), None is used in place
of the list of parents.
"""
+ found = self.get_parent_map(revision_ids)
+ return [found.get(r, None) for r in revision_ids]
+
+ def get_parent_map(self, keys):
+ """Get a mapping of keys => parents
+
+ A dictionary is returned with an entry for each key present in this
+ source. If this source doesn't have information about a key, it should
+ not include an entry.
+
+ [NULL_REVISION] is used as the parent of the first user-committed
+ revision. Its parent list is empty.
+
+ :param keys: An iterable returning keys to check (eg revision_ids)
+ :return: A dictionary mapping each key to its parents
+ """
found = {}
+ remaining = set(keys)
for parents_provider in self._parent_providers:
- pending_revisions = [r for r in revision_ids if r not in found]
- parent_list = parents_provider.get_parents(pending_revisions)
- new_found = dict((k, v) for k, v in zip(pending_revisions,
- parent_list) if v is not None)
+ new_found = parents_provider.get_parent_map(remaining)
found.update(new_found)
- if len(found) == len(revision_ids):
+ remaining.difference_update(new_found)
+ if not remaining:
break
+ return found
+
+
+class CachingParentsProvider(object):
+ """A parents provider which will cache the revision => parents in a dict.
+
+ This is useful for providers that have an expensive lookup.
+ """
+
+ def __init__(self, parent_provider):
+ self._real_provider = parent_provider
+ # Theoretically we could use an LRUCache here
+ self._cache = {}
+
+ def __repr__(self):
+ return "%s(%r)" % (self.__class__.__name__, self._real_provider)
+
+ def get_parents(self, revision_ids):
+ """See _StackedParentsProvider.get_parents"""
+ found = self.get_parent_map(revision_ids)
return [found.get(r, None) for r in revision_ids]
+ def get_parent_map(self, keys):
+ """See _StackedParentsProvider.get_parent_map"""
+ needed = set()
+ # If the _real_provider doesn't have a key, we cache a value of None,
+ # which we then later use to realize we cannot provide a value for that
+ # key.
+ parent_map = {}
+ cache = self._cache
+ for key in keys:
+ if key in cache:
+ value = cache[key]
+ if value is not None:
+ parent_map[key] = value
+ else:
+ needed.add(key)
+
+ if needed:
+ new_parents = self._real_provider.get_parent_map(needed)
+ cache.update(new_parents)
+ parent_map.update(new_parents)
+ needed.difference_update(new_parents)
+ cache.update(dict.fromkeys(needed, None))
+ return parent_map
+
class Graph(object):
"""Provide incremental access to revision graphs.
@@ -106,6 +170,7 @@
conforming to the behavior of StackedParentsProvider.get_parents
"""
self.get_parents = parents_provider.get_parents
+ self.get_parent_map = parents_provider.get_parent_map
self._parents_provider = parents_provider
def __repr__(self):
@@ -432,11 +497,15 @@
self._start = set(revisions)
self._search_revisions = None
self.seen = set(revisions)
- self._parents_provider = parents_provider
+ self._parents_provider = parents_provider
def __repr__(self):
- return ('_BreadthFirstSearcher(self._search_revisions=%r,'
- ' self.seen=%r)' % (self._search_revisions, self.seen))
+ if self._search_revisions is not None:
+ search = 'searching=%r' % (list(self._search_revisions),)
+ else:
+ search = 'starting=%r' % (list(self._start),)
+ return ('_BreadthFirstSearcher(%s,'
+ ' seen=%r)' % (search, list(self.seen)))
def next(self):
"""Return the next ancestors of this revision.
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2007-11-28 00:59:30 +0000
+++ b/bzrlib/index.py 2007-12-18 17:06:42 +0000
@@ -996,7 +996,7 @@
', '.join(map(repr, self._indices)))
def get_parents(self, revision_ids):
- """See StackedParentsProvider.get_parents.
+ """See graph._StackedParentsProvider.get_parents.
This implementation thunks the graph.Graph.get_parents api across to
GraphIndex.
@@ -1008,21 +1008,23 @@
* (NULL_REVISION,) when the key has no parents.
* (parent_key, parent_key...) otherwise.
"""
- search_keys = set(revision_ids)
- search_keys.discard(NULL_REVISION)
- found_parents = {NULL_REVISION:[]}
+ parent_map = self.get_parent_map(revision_ids)
+ return [parent_map.get(r, None) for r in revision_ids]
+
+ def get_parent_map(self, keys):
+ """See graph._StackedParentsProvider.get_parent_map"""
+ search_keys = set(keys)
+ if NULL_REVISION in search_keys:
+ search_keys.discard(NULL_REVISION)
+ found_parents = {NULL_REVISION:[]}
+ else:
+ found_parents = {}
for index, key, value, refs in self.iter_entries(search_keys):
parents = refs[0]
if not parents:
parents = (NULL_REVISION,)
found_parents[key] = parents
- result = []
- for key in revision_ids:
- try:
- result.append(found_parents[key])
- except KeyError:
- result.append(None)
- return result
+ return found_parents
def insert_index(self, pos, index):
"""Insert a new index in the list of indices to query.
=== modified file 'bzrlib/repofmt/knitrepo.py'
--- a/bzrlib/repofmt/knitrepo.py 2007-11-18 18:42:17 +0000
+++ b/bzrlib/repofmt/knitrepo.py 2007-12-18 17:06:42 +0000
@@ -59,20 +59,26 @@
return 'KnitParentsProvider(%r)' % self._knit
def get_parents(self, revision_ids):
- parents_list = []
- for revision_id in revision_ids:
+ """See graph._StackedParentsProvider.get_parents"""
+ parent_map = self.get_parent_map(revision_ids)
+ return [parent_map.get(r, None) for r in revision_ids]
+
+ def get_parent_map(self, keys):
+ """See graph._StackedParentsProvider.get_parent_map"""
+ parent_map = {}
+ for revision_id in keys:
if revision_id == _mod_revision.NULL_REVISION:
- parents = []
+ parent_map[revision_id] = []
else:
try:
parents = self._knit.get_parents_with_ghosts(revision_id)
except errors.RevisionNotPresent:
- parents = None
+ pass
else:
if len(parents) == 0:
parents = [_mod_revision.NULL_REVISION]
- parents_list.append(parents)
- return parents_list
+ parent_map[revision_id] = parents
+ return parent_map
class KnitRepository(MetaDirRepository):
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2007-12-03 19:26:40 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2007-12-18 17:06:42 +0000
@@ -23,10 +23,10 @@
from bzrlib import (
debug,
+ graph,
pack,
ui,
)
-from bzrlib.graph import Graph
from bzrlib.index import (
GraphIndex,
GraphIndexBuilder,
@@ -81,7 +81,7 @@
CommitBuilder.__init__(self, repository, parents, config,
timestamp=timestamp, timezone=timezone, committer=committer,
revprops=revprops, revision_id=revision_id)
- self._file_graph = Graph(
+ self._file_graph = graph.Graph(
repository._pack_collection.text_index.combined_index)
def _add_text_to_weave(self, file_id, new_lines, parents, nostore_sha):
@@ -107,7 +107,7 @@
CommitBuilder.__init__(self, repository, parents, config,
timestamp=timestamp, timezone=timezone, committer=committer,
revprops=revprops, revision_id=revision_id)
- self._file_graph = Graph(
+ self._file_graph = graph.Graph(
repository._pack_collection.text_index.combined_index)
def _add_text_to_weave(self, file_id, new_lines, parents, nostore_sha):
@@ -1828,18 +1828,25 @@
return result
def get_parents(self, revision_ids):
- """See StackedParentsProvider.get_parents.
-
+ """See graph._StackedParentsProvider.get_parents."""
+ parent_map = self.get_parent_map(revision_ids)
+ return [parent_map.get(r, None) for r in revision_ids]
+
+ def get_parent_map(self, keys):
+ """See graph._StackedParentsProvider.get_parent_map
+
This implementation accesses the combined revision index to provide
answers.
"""
self._pack_collection.ensure_loaded()
index = self._pack_collection.revision_index.combined_index
- search_keys = set()
- for revision_id in revision_ids:
- if revision_id != _mod_revision.NULL_REVISION:
- search_keys.add((revision_id,))
- found_parents = {_mod_revision.NULL_REVISION:[]}
+ keys = set(keys)
+ if _mod_revision.NULL_REVISION in keys:
+ keys.discard(_mod_revision.NULL_REVISION)
+ found_parents = {_mod_revision.NULL_REVISION:[]}
+ else:
+ found_parents = {}
+ search_keys = set((revision_id,) for revision_id in keys)
for index, key, value, refs in index.iter_entries(search_keys):
parents = refs[0]
if not parents:
@@ -1847,16 +1854,10 @@
else:
parents = tuple(parent[0] for parent in parents)
found_parents[key[0]] = parents
- result = []
- for revision_id in revision_ids:
- try:
- result.append(found_parents[revision_id])
- except KeyError:
- result.append(None)
- return result
+ return found_parents
def _make_parents_provider(self):
- return self
+ return graph.CachingParentsProvider(self)
def _refresh_data(self):
if self._write_lock_count == 1 or (
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2007-12-03 06:16:05 +0000
+++ b/bzrlib/repository.py 2007-12-18 17:06:42 +0000
@@ -1645,20 +1645,25 @@
def get_parents(self, revision_ids):
"""See StackedParentsProvider.get_parents"""
- parents_list = []
- for revision_id in revision_ids:
+ parent_map = self.get_parent_map(revision_ids)
+ return [parent_map.get(r, None) for r in revision_ids]
+
+ def get_parent_map(self, keys):
+ """See graph._StackedParentsProvider.get_parent_map"""
+ parent_map = {}
+ for revision_id in keys:
if revision_id == _mod_revision.NULL_REVISION:
- parents = []
+ parent_map[revision_id] = []
else:
try:
- parents = self.get_revision(revision_id).parent_ids
+ parent_ids = self.get_revision(revision_id).parent_ids
except errors.NoSuchRevision:
- parents = None
+ pass
else:
- if len(parents) == 0:
- parents = [_mod_revision.NULL_REVISION]
- parents_list.append(parents)
- return parents_list
+ if len(parent_ids) == 0:
+ parent_ids = [_mod_revision.NULL_REVISION]
+ parent_map[revision_id] = parent_ids
+ return parent_map
def _make_parents_provider(self):
return self
=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py 2007-12-04 00:34:34 +0000
+++ b/bzrlib/tests/test_graph.py 2007-12-18 17:06:42 +0000
@@ -17,6 +17,7 @@
from bzrlib import (
errors,
graph as _mod_graph,
+ tests,
)
from bzrlib.revision import NULL_REVISION
from bzrlib.tests import TestCaseWithMemoryTransport
@@ -115,11 +116,113 @@
# / \ \
# rev2a rev2b rev2c
# | / \ /
-# rev3a reveb
+# rev3a rev3b
history_shortcut = {'rev1': [NULL_REVISION], 'rev2a': ['rev1'],
'rev2b': ['rev1'], 'rev2c': ['rev1'],
'rev3a': ['rev2a', 'rev2b'], 'rev3b': ['rev2b', 'rev2c']}
+# Extended history shortcut
+# NULL_REVISION
+# |
+# a
+# |\
+# b |
+# | |
+# c |
+# | |
+# d |
+# |\|
+# e f
+extended_history_shortcut = {'a': [NULL_REVISION],
+ 'b': ['a'],
+ 'c': ['b'],
+ 'd': ['c'],
+ 'e': ['d'],
+ 'f': ['a', 'd'],
+ }
+
+# Double shortcut
+# Both sides will see 'A' first, even though it is actually a descendant of a
+# different common revision.
+#
+# NULL_REVISION
+# |
+# a
+# /|\
+# / b \
+# / | \
+# | c |
+# | / \ |
+# | d e |
+# |/ \|
+# f g
+
+double_shortcut = {'a':[NULL_REVISION], 'b':['a'], 'c':['b'],
+ 'd':['c'], 'e':['c'], 'f':['a', 'd'],
+ 'g':['a', 'e']}
+
+# Complex shortcut
+# This has a failure mode in that a shortcut will find some nodes in common,
+# but the common searcher won't have time to find that one branch is actually
+# in common. The extra nodes at the top are because we want to avoid
+# walking off the graph. Specifically, node G should be considered common, but
+# is likely to be seen by M long before the common searcher finds it.
+#
+# NULL_REVISION
+# |
+# a
+# |
+# b
+# |
+# c
+# |
+# d
+# |\
+# e f
+# | |\
+# i | h
+# |\| |
+# | g |
+# | | |
+# | j |
+# | | |
+# | k |
+# | | |
+# | l |
+# |/|/
+# m n
+complex_shortcut = {'d':[NULL_REVISION],
+ 'x':['d'], 'y':['x'],
+ 'e':['y'], 'f':['d'], 'g':['f', 'i'], 'h':['f'],
+ 'i':['e'], 'j':['g'], 'k':['j'],
+ 'l':['k'], 'm':['i', 's'], 'n':['s', 'h'],
+ 'o':['l'], 'p':['o'], 'q':['p'],
+ 'r':['q'], 's':['r'],
+ }
+
+# Shortcut with extra root
+# We have a long history shortcut, and an extra root, which is why we can't
+# stop searchers based on seeing NULL_REVISION
+# NULL_REVISION
+# | |
+# a |
+# |\ |
+# b | |
+# | | |
+# c | |
+# | | |
+# d | g
+# |\|/
+# e f
+shortcut_extra_root = {'a': [NULL_REVISION],
+ 'b': ['a'],
+ 'c': ['b'],
+ 'd': ['c'],
+ 'e': ['d'],
+ 'f': ['a', 'd', 'g'],
+ 'g': [NULL_REVISION],
+ }
+
# NULL_REVISION
# |
# f
@@ -144,10 +247,17 @@
self.calls.extend(nodes)
return self._real_parents_provider.get_parents(nodes)
+ def get_parent_map(self, nodes):
+ self.calls.extend(nodes)
+ return self._real_parents_provider.get_parent_map(nodes)
+
class TestGraph(TestCaseWithMemoryTransport):
def make_graph(self, ancestors):
+ # XXX: This seems valid, is there a reason to actually create a
+ # repository and put things in it?
+ return _mod_graph.Graph(_mod_graph.DictParentsProvider(ancestors))
tree = self.prepare_memory_tree('.')
self.build_ancestry(tree, ancestors)
self.addCleanup(tree.unlock)
@@ -261,6 +371,10 @@
self.assertEqual(NULL_REVISION,
graph.find_unique_lca('rev4a', 'rev1b'))
+ def test_lca_double_shortcut(self):
+ graph = self.make_graph(double_shortcut)
+ self.assertEqual('c', graph.find_unique_lca('f', 'g'))
+
def test_common_ancestor_two_repos(self):
"""Ensure we do unique_lca using data from two repos"""
mainline_tree = self.prepare_memory_tree('mainline')
@@ -289,6 +403,14 @@
self.assertEqual((set(['rev4', 'rev3', 'rev2a']), set()),
graph.find_difference('rev4', 'rev2b'))
+ def test_graph_difference_separate_ancestry(self):
+ graph = self.make_graph(ancestry_2)
+ self.assertEqual((set(['rev1a']), set(['rev1b'])),
+ graph.find_difference('rev1a', 'rev1b'))
+ self.assertEqual((set(['rev1a', 'rev2a', 'rev3a', 'rev4a']),
+ set(['rev1b'])),
+ graph.find_difference('rev4a', 'rev1b'))
+
def test_graph_difference_criss_cross(self):
graph = self.make_graph(criss_cross)
self.assertEqual((set(['rev3a']), set(['rev3b'])),
@@ -296,6 +418,37 @@
self.assertEqual((set([]), set(['rev3b', 'rev2b'])),
graph.find_difference('rev2a', 'rev3b'))
+ def test_graph_difference_extended_history(self):
+ graph = self.make_graph(extended_history_shortcut)
+ self.expectFailure('find_difference cannot handle shortcuts',
+ self.assertEqual, (set(['e']), set(['f'])),
+ graph.find_difference('e', 'f'))
+ self.assertEqual((set(['e']), set(['f'])),
+ graph.find_difference('e', 'f'))
+ self.assertEqual((set(['f']), set(['e'])),
+ graph.find_difference('f', 'e'))
+
+ def test_graph_difference_double_shortcut(self):
+ graph = self.make_graph(double_shortcut)
+ self.assertEqual((set(['d', 'f']), set(['e', 'g'])),
+ graph.find_difference('f', 'g'))
+
+ def test_graph_difference_complex_shortcut(self):
+ graph = self.make_graph(complex_shortcut)
+ self.expectFailure('find_difference cannot handle shortcuts',
+ self.assertEqual, (set(['m']), set(['h', 'n'])),
+ graph.find_difference('m', 'n'))
+ self.assertEqual((set(['m']), set(['h', 'n'])),
+ graph.find_difference('m', 'n'))
+
+ def test_graph_difference_shortcut_extra_root(self):
+ graph = self.make_graph(shortcut_extra_root)
+ self.expectFailure('find_difference cannot handle shortcuts',
+ self.assertEqual, (set(['e']), set(['f', 'g'])),
+ graph.find_difference('e', 'f'))
+ self.assertEqual((set(['e']), set(['f', 'g'])),
+ graph.find_difference('e', 'f'))
+
def test_stacked_parents_provider(self):
parents1 = _mod_graph.DictParentsProvider({'rev2': ['rev3']})
@@ -463,8 +616,16 @@
self.fail('key deeper was accessed')
result.append(graph_dict[key])
return result
+ def get_parent_map(keys):
+ result = {}
+ for key in keys:
+ if key == 'deeper':
+ self.fail('key deeper was accessed')
+ result[key] = graph_dict[key]
+ return result
an_obj = stub()
an_obj.get_parents = get_parents
+ an_obj.get_parent_map = get_parent_map
graph = _mod_graph.Graph(an_obj)
return graph.heads(search)
@@ -502,3 +663,47 @@
}
self.assertEqual(set(['h1', 'h2']),
self._run_heads_break_deeper(graph_dict, ['h1', 'h2']))
+
+
+class TestCachingParentsProvider(tests.TestCase):
+
+ def setUp(self):
+ super(TestCachingParentsProvider, self).setUp()
+ dict_pp = _mod_graph.DictParentsProvider({'a':('b',)})
+ self.inst_pp = InstrumentedParentsProvider(dict_pp)
+ self.caching_pp = _mod_graph.CachingParentsProvider(self.inst_pp)
+
+ def test_get_parents(self):
+ """Requesting the same revision should be returned from cache"""
+ self.assertEqual({}, self.caching_pp._cache)
+ self.assertEqual([('b',)], self.caching_pp.get_parents(['a']))
+ self.assertEqual(['a'], self.inst_pp.calls)
+ self.assertEqual([('b',)], self.caching_pp.get_parents(['a']))
+ # No new call, as it should have been returned from the cache
+ self.assertEqual(['a'], self.inst_pp.calls)
+ self.assertEqual({'a':('b',)}, self.caching_pp._cache)
+
+ def test_get_parents_not_present(self):
+ """The cache should also track when a revision doesn't exist"""
+ self.assertEqual([None], self.caching_pp.get_parents(['b']))
+ self.assertEqual(['b'], self.inst_pp.calls)
+ self.assertEqual([None], self.caching_pp.get_parents(['b']))
+ # No new calls
+ self.assertEqual(['b'], self.inst_pp.calls)
+ self.assertEqual({'b':None}, self.caching_pp._cache)
+
+ def test_get_parents_mixed(self):
+ """Anything that can be returned from cache, should be"""
+ self.assertEqual([None], self.caching_pp.get_parents(['b']))
+ self.assertEqual(['b'], self.inst_pp.calls)
+ self.assertEqual([('b',), None],
+ self.caching_pp.get_parents(['a', 'b']))
+ self.assertEqual(['b', 'a'], self.inst_pp.calls)
+
+ def test_get_parents_repeated(self):
+ """Asking for the same parent 2x will only forward 1 request."""
+ self.assertEqual([None, ('b',), None],
+ self.caching_pp.get_parents(['b', 'a', 'b']))
+ # Use sorted because we don't care about the order, just that each is
+ # only present 1 time.
+ self.assertEqual(['a', 'b'], sorted(self.inst_pp.calls))
More information about the bazaar-commits
mailing list