Rev 6099: (jameinel) Bug #388269, when walking to find revisions to transmit, in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Thu Aug 25 07:51:42 UTC 2011
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 6099 [merge]
revision-id: pqm at pqm.ubuntu.com-20110825075137-9h77a9wtrnh14hkc
parent: pqm at pqm.ubuntu.com-20110825054210-ynb198gs8dkiuv5p
parent: john at arbash-meinel.com-20110824090144-udyd40lmdlcwssyv
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Thu 2011-08-25 07:51:37 +0000
message:
(jameinel) Bug #388269, when walking to find revisions to transmit,
only send a local part of the search graph,
not the whole graph walked for every step. (John A Meinel)
modified:
bzrlib/graph.py graph_walker.py-20070525030359-y852guab65d4wtn0-1
bzrlib/remote.py remote.py-20060720103555-yeeg2x51vn0rbtdp-1
bzrlib/tests/test_graph.py test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
bzrlib/tests/test_remote.py test_remote.py-20060720103555-yeeg2x51vn0rbtdp-2
bzrlib/vf_repository.py vf_repository.py-20110502151858-yh9nnoxpokg86unk-1
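
For orientation: a parent_map maps each child revision id to a tuple of its
parent revision ids, and before this change the client re-sent the entire
walked map as the search recipe on every Repository.get_parent_map round
trip. Below is a minimal sketch of the child-map inversion the patch builds
on (mirroring the invert_parent_map helper added to bzrlib/graph.py below;
the toy revision ids are illustrative only):

    def invert_parent_map(parent_map):
        # Given child => parents, build parent => children.
        child_map = {}
        for child, parents in parent_map.items():
            for p in parents:
                # Tuples keep per-parent memory low; most parents have
                # only one or two children.
                child_map[p] = child_map.get(p, ()) + (child,)
        return child_map

    # Toy history: 'B' and 'C' are both children of 'A'.
    print(invert_parent_map({'B': ('A',), 'C': ('A',)}))
    # -> {'A': ('B', 'C')}  (tuple order may vary)
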
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py 2011-06-20 11:03:53 +0000
+++ b/bzrlib/graph.py 2011-08-24 09:01:44 +0000
@@ -61,7 +61,7 @@
def get_parent_map(self, keys):
"""See StackedParentsProvider.get_parent_map"""
ancestry = self.ancestry
- return dict((k, ancestry[k]) for k in keys if k in ancestry)
+ return dict([(k, ancestry[k]) for k in keys if k in ancestry])
class StackedParentsProvider(object):
@@ -1419,13 +1419,14 @@
parents_of_found = set()
# revisions may contain nodes that point to other nodes in revisions:
# we want to filter them out.
- self.seen.update(revisions)
+ seen = self.seen
+ seen.update(revisions)
parent_map = self._parents_provider.get_parent_map(revisions)
found_revisions.update(parent_map)
for rev_id, parents in parent_map.iteritems():
if parents is None:
continue
- new_found_parents = [p for p in parents if p not in self.seen]
+ new_found_parents = [p for p in parents if p not in seen]
if new_found_parents:
# Calling set.update() with an empty generator is actually
# rather expensive.
@@ -1891,6 +1892,160 @@
limit=self.limit)
+def invert_parent_map(parent_map):
+ """Given a map from child => parents, create a map of parent=>children"""
+ child_map = {}
+ for child, parents in parent_map.iteritems():
+ for p in parents:
+ # Any given parent is likely to have only a small handful of
+ # children; many will have only one. So we avoid the memory overhead
+ # of a list, in exchange for extra copying of tuples.
+ if p not in child_map:
+ child_map[p] = (child,)
+ else:
+ child_map[p] = child_map[p] + (child,)
+ return child_map
+
+
+def _find_possible_heads(parent_map, tip_keys, depth):
+ """Walk backwards (towards children) through the parent_map.
+
+ This finds 'heads' that will hopefully succinctly describe our search
+ graph.
+ """
+ child_map = invert_parent_map(parent_map)
+ heads = set()
+ current_roots = tip_keys
+ walked = set(current_roots)
+ while current_roots and depth > 0:
+ depth -= 1
+ children = set()
+ children_update = children.update
+ for p in current_roots:
+ # Is it better to pre- or post- filter the children?
+ try:
+ children_update(child_map[p])
+ except KeyError:
+ heads.add(p)
+ # If we've seen a key before, we don't want to walk it again. Note that
+ # 'children' stays relatively small while 'walked' grows large. So
+ # don't use 'difference_update' here which has to walk all of 'walked'.
+ # '.difference' is smart enough to walk only children and compare it to
+ # walked.
+ children = children.difference(walked)
+ walked.update(children)
+ current_roots = children
+ if current_roots:
+ # We walked to the end of depth, so these are the new tips.
+ heads.update(current_roots)
+ return heads
+
+
+def _run_search(parent_map, heads, exclude_keys):
+ """Given a parent map, run a _BreadthFirstSearcher on it.
+
+ Start at heads, walk until you hit exclude_keys. As a further improvement,
+ watch for any heads that you encounter while walking; finding one means it
+ was not actually a head of the search.
+
+ This is mostly used to generate a succinct recipe for how to walk through
+ most of parent_map.
+
+ :return: (_BreadthFirstSearcher, set(heads_encountered_by_walking))
+ """
+ g = Graph(DictParentsProvider(parent_map))
+ s = g._make_breadth_first_searcher(heads)
+ found_heads = set()
+ while True:
+ try:
+ next_revs = s.next()
+ except StopIteration:
+ break
+ for parents in s._current_parents.itervalues():
+ f_heads = heads.intersection(parents)
+ if f_heads:
+ found_heads.update(f_heads)
+ stop_keys = exclude_keys.intersection(next_revs)
+ if stop_keys:
+ s.stop_searching_any(stop_keys)
+ for parents in s._current_parents.itervalues():
+ f_heads = heads.intersection(parents)
+ if f_heads:
+ found_heads.update(f_heads)
+ return s, found_heads
+
+
+def limited_search_result_from_parent_map(parent_map, missing_keys, tip_keys,
+ depth):
+ """Transform a parent_map that is searching 'tip_keys' into an
+ approximate SearchResult.
+
+ We should be able to generate a SearchResult from a given set of starting
+ keys that covers a subset of parent_map whose last step points at
+ tip_keys. This handles the case that very long searches shouldn't be
+ restarted from scratch on each get_parent_map request, while we *do* want
+ to filter out keys that we've already seen, so we don't receive
+ information that we already know about on every request.
+
+ The server will validate the search (that starting at start_keys and
+ stopping at stop_keys yields the exact key_count), so we have to be careful
+ to give an exact recipe.
+
+ Basic algorithm is:
+ 1) Invert parent_map to get child_map (todo: have it cached and pass it
+ in)
+ 2) Starting at tip_keys, walk towards children for 'depth' steps.
+ 3) At that point, we have the 'start' keys.
+ 4) Start walking parent_map from 'start' keys, counting how many keys
+ are seen, and generating stop_keys for anything that would walk
+ outside of the parent_map.
+
+ :param parent_map: A map from {child_id: (parent_ids,)}
+ :param missing_keys: parent_ids that we know are unavailable
+ :param tip_keys: the revision_ids that we are searching
+ :param depth: How far back to walk.
+ """
+ if not parent_map:
+ # No search to send, because we haven't done any searching yet.
+ return [], [], 0
+ heads = _find_possible_heads(parent_map, tip_keys, depth)
+ s, found_heads = _run_search(parent_map, heads, set(tip_keys))
+ _, start_keys, exclude_keys, key_count = s.get_result().get_recipe()
+ if found_heads:
+ # Anything in found_heads is a redundant start key: we hit it while
+ # walking, so we can exclude it from the start list.
+ start_keys = set(start_keys).difference(found_heads)
+ return start_keys, exclude_keys, key_count
+
+
+def search_result_from_parent_map(parent_map, missing_keys):
+ """Transform a parent_map into SearchResult information."""
+ if not parent_map:
+ # parent_map is empty or None, simple search result
+ return [], [], 0
+ # start_set is all the keys in the cache
+ start_set = set(parent_map)
+ # result set is all the references to keys in the cache
+ result_parents = set()
+ for parents in parent_map.itervalues():
+ result_parents.update(parents)
+ stop_keys = result_parents.difference(start_set)
+ # We don't need to send ghosts back to the server as a position to
+ # stop either.
+ stop_keys.difference_update(missing_keys)
+ key_count = len(parent_map)
+ if (revision.NULL_REVISION in result_parents
+ and revision.NULL_REVISION in missing_keys):
+ # If we pruned NULL_REVISION from the stop_keys because it's also
+ # in our cache of "missing" keys we need to increment our key count
+ # by 1, because the reconstituted SearchResult on the server will
+ # still consider NULL_REVISION to be an included key.
+ key_count += 1
+ included_keys = start_set.intersection(result_parents)
+ start_set.difference_update(included_keys)
+ return start_set, stop_keys, key_count
+
+
def collapse_linear_regions(parent_map):
"""Collapse regions of the graph that are 'linear'.
=== modified file 'bzrlib/remote.py'
--- a/bzrlib/remote.py 2011-08-17 01:19:17 +0000
+++ b/bzrlib/remote.py 2011-08-25 07:51:37 +0000
@@ -48,6 +48,9 @@
from bzrlib.trace import mutter, note, warning
+_DEFAULT_SEARCH_DEPTH = 100
+
+
class _RpcHelper(object):
"""Mixin class that helps with issuing RPCs."""
@@ -1745,26 +1748,15 @@
if parents_map is None:
# Repository is not locked, so there's no cache.
parents_map = {}
- # start_set is all the keys in the cache
- start_set = set(parents_map)
- # result set is all the references to keys in the cache
- result_parents = set()
- for parents in parents_map.itervalues():
- result_parents.update(parents)
- stop_keys = result_parents.difference(start_set)
- # We don't need to send ghosts back to the server as a position to
- # stop either.
- stop_keys.difference_update(self._unstacked_provider.missing_keys)
- key_count = len(parents_map)
- if (NULL_REVISION in result_parents
- and NULL_REVISION in self._unstacked_provider.missing_keys):
- # If we pruned NULL_REVISION from the stop_keys because it's also
- # in our cache of "missing" keys we need to increment our key count
- # by 1, because the reconsitituted SearchResult on the server will
- # still consider NULL_REVISION to be an included key.
- key_count += 1
- included_keys = start_set.intersection(result_parents)
- start_set.difference_update(included_keys)
+ if _DEFAULT_SEARCH_DEPTH <= 0:
+ (start_set, stop_keys,
+ key_count) = graph.search_result_from_parent_map(
+ parents_map, self._unstacked_provider.missing_keys)
+ else:
+ (start_set, stop_keys,
+ key_count) = graph.limited_search_result_from_parent_map(
+ parents_map, self._unstacked_provider.missing_keys,
+ keys, depth=_DEFAULT_SEARCH_DEPTH)
recipe = ('manual', start_set, stop_keys, key_count)
body = self._serialise_search_recipe(recipe)
path = self.bzrdir._path_for_remote_call(self._client)
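
For contrast, the unlimited variant that this hunk factors out of
RemoteRepository (now search_result_from_parent_map in graph.py) reduces
the whole cached parent map to one recipe. A condensed sketch of that
reduction, leaving out the NULL_REVISION/key_count adjustment shown in the
diff:

    def full_recipe(parent_map, missing_keys=()):
        # start with every key we have cached
        start_set = set(parent_map)
        # collect every key referenced as a parent
        result_parents = set()
        for parents in parent_map.values():
            result_parents.update(parents)
        # stop at references that leave the cache, except known ghosts
        stop_keys = result_parents - start_set
        stop_keys.difference_update(missing_keys)
        # start keys are cached keys nothing else in the cache reaches
        return (start_set - result_parents, stop_keys, len(parent_map))

    pm = {'B': ('A',), 'C': ('B',), 'D': ('C',)}
    print(full_recipe(pm))
    # start {'D'}, stop {'A'}, count 3: the recipe covers the whole map.

With _DEFAULT_SEARCH_DEPTH set to 100, the limited variant is what actually
runs; the full reduction is kept for the depth <= 0 case.
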
=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py 2011-06-20 11:03:53 +0000
+++ b/bzrlib/tests/test_graph.py 2011-08-24 09:01:44 +0000
@@ -1744,3 +1744,74 @@
self.assertEqual(set([NULL_REVISION, 'tip', 'tag', 'mid']), recipe[2])
self.assertEqual(0, recipe[3])
self.assertTrue(result.is_empty())
+
+
+class TestSearchResultFromParentMap(TestGraphBase):
+
+ def assertSearchResult(self, start_keys, stop_keys, key_count, parent_map,
+ missing_keys=()):
+ (start, stop, count) = _mod_graph.search_result_from_parent_map(
+ parent_map, missing_keys)
+ self.assertEqual((sorted(start_keys), sorted(stop_keys), key_count),
+ (sorted(start), sorted(stop), count))
+
+ def test_no_parents(self):
+ self.assertSearchResult([], [], 0, {})
+ self.assertSearchResult([], [], 0, None)
+
+ def test_ancestry_1(self):
+ self.assertSearchResult(['rev4'], [NULL_REVISION], len(ancestry_1),
+ ancestry_1)
+
+ def test_ancestry_2(self):
+ self.assertSearchResult(['rev1b', 'rev4a'], [NULL_REVISION],
+ len(ancestry_2), ancestry_2)
+ self.assertSearchResult(['rev1b', 'rev4a'], [],
+ len(ancestry_2)+1, ancestry_2,
+ missing_keys=[NULL_REVISION])
+
+ def test_partial_search(self):
+ parent_map = dict((k,extended_history_shortcut[k])
+ for k in ['e', 'f'])
+ self.assertSearchResult(['e', 'f'], ['d', 'a'], 2,
+ parent_map)
+ parent_map.update((k,extended_history_shortcut[k])
+ for k in ['d', 'a'])
+ self.assertSearchResult(['e', 'f'], ['c', NULL_REVISION], 4,
+ parent_map)
+ parent_map['c'] = extended_history_shortcut['c']
+ self.assertSearchResult(['e', 'f'], ['b'], 6,
+ parent_map, missing_keys=[NULL_REVISION])
+ parent_map['b'] = extended_history_shortcut['b']
+ self.assertSearchResult(['e', 'f'], [], 7,
+ parent_map, missing_keys=[NULL_REVISION])
+
+
+class TestLimitedSearchResultFromParentMap(TestGraphBase):
+
+ def assertSearchResult(self, start_keys, stop_keys, key_count, parent_map,
+ missing_keys, tip_keys, depth):
+ (start, stop, count) = _mod_graph.limited_search_result_from_parent_map(
+ parent_map, missing_keys, tip_keys, depth)
+ self.assertEqual((sorted(start_keys), sorted(stop_keys), key_count),
+ (sorted(start), sorted(stop), count))
+
+ def test_empty_ancestry(self):
+ self.assertSearchResult([], [], 0, {}, (), ['tip-rev-id'], 10)
+
+ def test_ancestry_1(self):
+ self.assertSearchResult(['rev4'], ['rev1'], 4,
+ ancestry_1, (), ['rev1'], 10)
+ self.assertSearchResult(['rev2a', 'rev2b'], ['rev1'], 2,
+ ancestry_1, (), ['rev1'], 1)
+
+
+ def test_multiple_heads(self):
+ self.assertSearchResult(['e', 'f'], ['a'], 5,
+ extended_history_shortcut, (), ['a'], 10)
+ # Note that even though we only take 1 step back, we find 'f', which
+ # means the described search will still find d and c.
+ self.assertSearchResult(['f'], ['a'], 4,
+ extended_history_shortcut, (), ['a'], 1)
+ self.assertSearchResult(['f'], ['a'], 4,
+ extended_history_shortcut, (), ['a'], 2)
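
The fixtures ancestry_1 and extended_history_shortcut are defined near the
top of test_graph.py and are not shown in this diff. For reference, graph
shapes consistent with the assertions above would be (reconstructed here;
see the file itself for the canonical definitions):

    NULL_REVISION = 'null:'

    # rev1, then parallel rev2a/rev2b, rev3 on the 2a side, and rev4
    # merging rev3 and rev2b.
    ancestry_1 = {'rev1': [NULL_REVISION],
                  'rev2a': ['rev1'], 'rev2b': ['rev1'],
                  'rev3': ['rev2a'], 'rev4': ['rev3', 'rev2b']}

    # a linear chain a..e, plus a shortcut f whose parents are a and d
    extended_history_shortcut = {'a': [NULL_REVISION],
                                 'b': ['a'], 'c': ['b'], 'd': ['c'],
                                 'e': ['d'], 'f': ['a', 'd']}

This is why depth=1 from 'a' already reaches 'f' in test_multiple_heads:
'f' is a direct child of 'a', and the search described by starting at 'f'
still walks through 'd' and 'c'.
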
=== modified file 'bzrlib/tests/test_remote.py'
--- a/bzrlib/tests/test_remote.py 2011-08-12 09:49:24 +0000
+++ b/bzrlib/tests/test_remote.py 2011-08-25 07:51:37 +0000
@@ -2147,8 +2147,8 @@
parents = repo.get_parent_map([rev_id])
self.assertEqual(
[('call_with_body_bytes_expecting_body',
- 'Repository.get_parent_map', ('quack/', 'include-missing:',
- rev_id), '\n\n0'),
+ 'Repository.get_parent_map',
+ ('quack/', 'include-missing:', rev_id), '\n\n0'),
('disconnect medium',),
('call_expecting_body', 'Repository.get_revision_graph',
('quack/', ''))],
=== modified file 'bzrlib/vf_repository.py'
--- a/bzrlib/vf_repository.py 2011-08-21 21:03:57 +0000
+++ b/bzrlib/vf_repository.py 2011-08-25 07:51:37 +0000
@@ -70,7 +70,7 @@
)
from bzrlib.trace import (
- mutter,
+ mutter
)