Rev 6099: (jameinel) Bug #388269, when walking to find revisions to transmit, in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Thu Aug 25 07:51:42 UTC 2011


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 6099 [merge]
revision-id: pqm at pqm.ubuntu.com-20110825075137-9h77a9wtrnh14hkc
parent: pqm at pqm.ubuntu.com-20110825054210-ynb198gs8dkiuv5p
parent: john at arbash-meinel.com-20110824090144-udyd40lmdlcwssyv
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Thu 2011-08-25 07:51:37 +0000
message:
  (jameinel) Bug #388269, when walking to find revisions to transmit,
   only send a local part of the search graph,
   not the whole graph walked for every step. (John A Meinel)
modified:
  bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
  bzrlib/remote.py               remote.py-20060720103555-yeeg2x51vn0rbtdp-1
  bzrlib/tests/test_graph.py     test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
  bzrlib/tests/test_remote.py    test_remote.py-20060720103555-yeeg2x51vn0rbtdp-2
  bzrlib/vf_repository.py        vf_repository.py-20110502151858-yh9nnoxpokg86unk-1
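
For context: before this change, every Repository.get_parent_map RPC
converted the client's entire cached parents_map into a search recipe, so
the request body (and the server-side walk that validates it) grew with
every step of a long search. The conversion being replaced is the block
deleted from remote.py below; as a rough standalone paraphrase (plain
Python, illustrative names, NULL_REVISION handling omitted):

  def unbounded_recipe(parent_map, missing_keys=()):
      # Every key already in the client cache is a candidate start key.
      start_set = set(parent_map)
      # Collect every parent referenced by the cache.
      result_parents = set()
      for parents in parent_map.values():
          result_parents.update(parents)
      # Stop keys are edges leaving the cache, minus known ghosts.
      stop_keys = result_parents.difference(start_set)
      stop_keys.difference_update(missing_keys)
      # Keep only true heads: keys nothing else in the cache points at.
      start_set.difference_update(result_parents)
      return start_set, stop_keys, len(parent_map)

The key_count is the full size of the cache, which is the heart of bug
#388269: the fix below caps the recipe to a depth-limited window around
the keys currently being requested.
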
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2011-06-20 11:03:53 +0000
+++ b/bzrlib/graph.py	2011-08-24 09:01:44 +0000
@@ -61,7 +61,7 @@
     def get_parent_map(self, keys):
         """See StackedParentsProvider.get_parent_map"""
         ancestry = self.ancestry
-        return dict((k, ancestry[k]) for k in keys if k in ancestry)
+        return dict([(k, ancestry[k]) for k in keys if k in ancestry])
 
 
 class StackedParentsProvider(object):
@@ -1419,13 +1419,14 @@
         parents_of_found = set()
         # revisions may contain nodes that point to other nodes in revisions:
         # we want to filter them out.
-        self.seen.update(revisions)
+        seen = self.seen
+        seen.update(revisions)
         parent_map = self._parents_provider.get_parent_map(revisions)
         found_revisions.update(parent_map)
         for rev_id, parents in parent_map.iteritems():
             if parents is None:
                 continue
-            new_found_parents = [p for p in parents if p not in self.seen]
+            new_found_parents = [p for p in parents if p not in seen]
             if new_found_parents:
                 # Calling set.update() with an empty generator is actually
                 # rather expensive.
@@ -1891,6 +1892,160 @@
             limit=self.limit)
 
 
+def invert_parent_map(parent_map):
+    """Given a map from child => parents, create a map of parent=>children"""
+    child_map = {}
+    for child, parents in parent_map.iteritems():
+        for p in parents:
+            # Any given parent is likely to have only a small handful of
+            # children; many will have only one. So we avoid the memory
+            # overhead of a list, in exchange for extra copying of tuples.
+            if p not in child_map:
+                child_map[p] = (child,)
+            else:
+                child_map[p] = child_map[p] + (child,)
+    return child_map
+
+
+def _find_possible_heads(parent_map, tip_keys, depth):
+    """Walk backwards (towards children) through the parent_map.
+
+    This finds 'heads' that will hopefully succinctly describe our search
+    graph.
+    """
+    child_map = invert_parent_map(parent_map)
+    heads = set()
+    current_roots = tip_keys
+    walked = set(current_roots)
+    while current_roots and depth > 0:
+        depth -= 1
+        children = set()
+        children_update = children.update
+        for p in current_roots:
+            # Is it better to pre- or post- filter the children?
+            try:
+                children_update(child_map[p])
+            except KeyError:
+                heads.add(p)
+        # If we've seen a key before, we don't want to walk it again. Note that
+        # 'children' stays relatively small while 'walked' grows large. So
+        # don't use 'difference_update' here, which has to walk all of
+        # 'walked'. '.difference' is smart enough to walk only children and
+        # compare them to 'walked'.
+        children = children.difference(walked)
+        walked.update(children)
+        current_roots = children
+    if current_roots:
+        # We walked to the end of depth, so these are the new tips.
+        heads.update(current_roots)
+    return heads
+
+
+def _run_search(parent_map, heads, exclude_keys):
+    """Given a parent map, run a _BreadthFirstSearcher on it.
+
+    Start at heads and walk until you hit exclude_keys. As a further
+    improvement, watch for any of the given heads encountered while walking;
+    those were reachable from another head, so they were not true heads.
+
+    This is mostly used to generate a succinct recipe for how to walk through
+    most of parent_map.
+
+    :return: (_BreadthFirstSearcher, set(heads_encountered_by_walking))
+    """
+    g = Graph(DictParentsProvider(parent_map))
+    s = g._make_breadth_first_searcher(heads)
+    found_heads = set()
+    while True:
+        try:
+            next_revs = s.next()
+        except StopIteration:
+            break
+        for parents in s._current_parents.itervalues():
+            f_heads = heads.intersection(parents)
+            if f_heads:
+                found_heads.update(f_heads)
+        stop_keys = exclude_keys.intersection(next_revs)
+        if stop_keys:
+            s.stop_searching_any(stop_keys)
+    for parents in s._current_parents.itervalues():
+        f_heads = heads.intersection(parents)
+        if f_heads:
+            found_heads.update(f_heads)
+    return s, found_heads
+
+
+def limited_search_result_from_parent_map(parent_map, missing_keys, tip_keys,
+                                          depth):
+    """Transform a parent_map that is searching 'tip_keys' into an
+    approximate SearchResult.
+
+    We should be able to generate a SearchResult from a given set of starting
+    keys that covers a subset of parent_map, with the last step pointing at
+    tip_keys. This handles the case where really long searches shouldn't be
+    restarted from scratch on each get_parent_map request, while we *do* want
+    to filter out keys that we've already seen, so that each request doesn't
+    resend information we already know about.
+
+    The server will validate the search (that starting at start_keys and
+    stopping at stop_keys yields the exact key_count), so we have to be careful
+    to give an exact recipe.
+
+    Basic algorithm is:
+        1) Invert parent_map to get child_map (todo: have it cached and pass it
+           in)
+        2) Starting at tip_keys, walk towards children for 'depth' steps.
+        3) At that point, we have the 'start' keys.
+        4) Start walking parent_map from 'start' keys, counting how many keys
+           are seen, and generating stop_keys for anything that would walk
+           outside of the parent_map.
+
+    :param parent_map: A map from {child_id: (parent_ids,)}
+    :param missing_keys: parent_ids that we know are unavailable
+    :param tip_keys: the revision_ids that we are searching
+    :param depth: How far back to walk.
+    """
+    if not parent_map:
+        # No search to send, because we haven't done any searching yet.
+        return [], [], 0
+    heads = _find_possible_heads(parent_map, tip_keys, depth)
+    s, found_heads = _run_search(parent_map, heads, set(tip_keys))
+    _, start_keys, exclude_keys, key_count = s.get_result().get_recipe()
+    if found_heads:
+        # Anything in found_heads are redundant start_keys, we hit them while
+        # walking, so we can exclude them from the start list.
+        start_keys = set(start_keys).difference(found_heads)
+    return start_keys, exclude_keys, key_count
+
+
+def search_result_from_parent_map(parent_map, missing_keys):
+    """Transform a parent_map into SearchResult information."""
+    if not parent_map:
+        # parent_map is empty or None, simple search result
+        return [], [], 0
+    # start_set is all the keys in the cache
+    start_set = set(parent_map)
+    # result set is all the references to keys in the cache
+    result_parents = set()
+    for parents in parent_map.itervalues():
+        result_parents.update(parents)
+    stop_keys = result_parents.difference(start_set)
+    # We don't need to send ghosts back to the server as a position to
+    # stop either.
+    stop_keys.difference_update(missing_keys)
+    key_count = len(parent_map)
+    if (revision.NULL_REVISION in result_parents
+        and revision.NULL_REVISION in missing_keys):
+        # If we pruned NULL_REVISION from the stop_keys because it's also
+        # in our cache of "missing" keys, we need to increment our key count
+        # by 1, because the reconstituted SearchResult on the server will
+        # still consider NULL_REVISION to be an included key.
+        key_count += 1
+    included_keys = start_set.intersection(result_parents)
+    start_set.difference_update(included_keys)
+    return start_set, stop_keys, key_count
+
+
 def collapse_linear_regions(parent_map):
     """Collapse regions of the graph that are 'linear'.
 

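To see the new helpers in action, here is a sketch (assuming Python 2 with
this revision of bzrlib on sys.path; the five-revision graph mirrors the
ancestry_1 fixture used in the tests below, and 'null:' is the value of
NULL_REVISION):

  from bzrlib.graph import (
      invert_parent_map,
      limited_search_result_from_parent_map,
      )

  parent_map = {'rev1': ('null:',),
                'rev2a': ('rev1',), 'rev2b': ('rev1',),
                'rev3': ('rev2a',),
                'rev4': ('rev3', 'rev2b')}

  # child => parents inverted into parent => children:
  print invert_parent_map(parent_map)
  # e.g. {'null:': ('rev1',), 'rev1': ('rev2a', 'rev2b'), ...}

  # Describe only the neighbourhood within 'depth' steps of the tips:
  start, stop, count = limited_search_result_from_parent_map(
      parent_map, set(), tip_keys=['rev1'], depth=1)
  print sorted(start), sorted(stop), count
  # ['rev2a', 'rev2b'] ['rev1'] 2

The last result matches test_ancestry_1 in the new
TestLimitedSearchResultFromParentMap tests further down.
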
=== modified file 'bzrlib/remote.py'
--- a/bzrlib/remote.py	2011-08-17 01:19:17 +0000
+++ b/bzrlib/remote.py	2011-08-25 07:51:37 +0000
@@ -48,6 +48,9 @@
 from bzrlib.trace import mutter, note, warning
 
 
+_DEFAULT_SEARCH_DEPTH = 100
+
+
 class _RpcHelper(object):
     """Mixin class that helps with issuing RPCs."""
 
@@ -1745,26 +1748,15 @@
         if parents_map is None:
             # Repository is not locked, so there's no cache.
             parents_map = {}
-        # start_set is all the keys in the cache
-        start_set = set(parents_map)
-        # result set is all the references to keys in the cache
-        result_parents = set()
-        for parents in parents_map.itervalues():
-            result_parents.update(parents)
-        stop_keys = result_parents.difference(start_set)
-        # We don't need to send ghosts back to the server as a position to
-        # stop either.
-        stop_keys.difference_update(self._unstacked_provider.missing_keys)
-        key_count = len(parents_map)
-        if (NULL_REVISION in result_parents
-            and NULL_REVISION in self._unstacked_provider.missing_keys):
-            # If we pruned NULL_REVISION from the stop_keys because it's also
-            # in our cache of "missing" keys we need to increment our key count
-            # by 1, because the reconsitituted SearchResult on the server will
-            # still consider NULL_REVISION to be an included key.
-            key_count += 1
-        included_keys = start_set.intersection(result_parents)
-        start_set.difference_update(included_keys)
+        if _DEFAULT_SEARCH_DEPTH <= 0:
+            (start_set, stop_keys,
+             key_count) = graph.search_result_from_parent_map(
+                parents_map, self._unstacked_provider.missing_keys)
+        else:
+            (start_set, stop_keys,
+             key_count) = graph.limited_search_result_from_parent_map(
+                parents_map, self._unstacked_provider.missing_keys,
+                keys, depth=_DEFAULT_SEARCH_DEPTH)
         recipe = ('manual', start_set, stop_keys, key_count)
         body = self._serialise_search_recipe(recipe)
         path = self.bzrdir._path_for_remote_call(self._client)

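The payload effect is easiest to see on a long linear history (same sketch
assumptions as above; the revision ids are made up):

  from bzrlib import graph

  # The client has already walked r999..r1 and now asks for 'r0'.
  parent_map = dict(('r%d' % i, ('r%d' % (i - 1),))
                    for i in range(1, 1000))

  # Old behaviour: the recipe's key_count covers the whole cache.
  print graph.search_result_from_parent_map(parent_map, set())[2]
  # 999

  # New behaviour: bounded by depth (_DEFAULT_SEARCH_DEPTH is 100).
  start, stop, count = graph.limited_search_result_from_parent_map(
      parent_map, set(), tip_keys=['r0'], depth=100)
  print sorted(start), sorted(stop), count
  # ['r100'] ['r0'] 100

Since the server validates a recipe by re-walking it, capping key_count
keeps each get_parent_map round trip roughly O(depth) instead of
O(everything walked so far).
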
=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py	2011-06-20 11:03:53 +0000
+++ b/bzrlib/tests/test_graph.py	2011-08-24 09:01:44 +0000
@@ -1744,3 +1744,74 @@
         self.assertEqual(set([NULL_REVISION, 'tip', 'tag', 'mid']), recipe[2])
         self.assertEqual(0, recipe[3])
         self.assertTrue(result.is_empty())
+
+
+class TestSearchResultFromParentMap(TestGraphBase):
+
+    def assertSearchResult(self, start_keys, stop_keys, key_count, parent_map,
+                           missing_keys=()):
+        (start, stop, count) = _mod_graph.search_result_from_parent_map(
+            parent_map, missing_keys)
+        self.assertEqual((sorted(start_keys), sorted(stop_keys), key_count),
+                         (sorted(start), sorted(stop), count))
+
+    def test_no_parents(self):
+        self.assertSearchResult([], [], 0, {})
+        self.assertSearchResult([], [], 0, None)
+
+    def test_ancestry_1(self):
+        self.assertSearchResult(['rev4'], [NULL_REVISION], len(ancestry_1),
+                                ancestry_1)
+
+    def test_ancestry_2(self):
+        self.assertSearchResult(['rev1b', 'rev4a'], [NULL_REVISION],
+                                len(ancestry_2), ancestry_2)
+        self.assertSearchResult(['rev1b', 'rev4a'], [],
+                                len(ancestry_2)+1, ancestry_2,
+                                missing_keys=[NULL_REVISION])
+
+    def test_partial_search(self):
+        parent_map = dict((k,extended_history_shortcut[k])
+                          for k in ['e', 'f'])
+        self.assertSearchResult(['e', 'f'], ['d', 'a'], 2,
+                                parent_map)
+        parent_map.update((k,extended_history_shortcut[k])
+                          for k in ['d', 'a'])
+        self.assertSearchResult(['e', 'f'], ['c', NULL_REVISION], 4,
+                                parent_map)
+        parent_map['c'] = extended_history_shortcut['c']
+        self.assertSearchResult(['e', 'f'], ['b'], 6,
+                                parent_map, missing_keys=[NULL_REVISION])
+        parent_map['b'] = extended_history_shortcut['b']
+        self.assertSearchResult(['e', 'f'], [], 7,
+                                parent_map, missing_keys=[NULL_REVISION])
+
+
+class TestLimitedSearchResultFromParentMap(TestGraphBase):
+
+    def assertSearchResult(self, start_keys, stop_keys, key_count, parent_map,
+                           missing_keys, tip_keys, depth):
+        (start, stop, count) = _mod_graph.limited_search_result_from_parent_map(
+            parent_map, missing_keys, tip_keys, depth)
+        self.assertEqual((sorted(start_keys), sorted(stop_keys), key_count),
+                         (sorted(start), sorted(stop), count))
+
+    def test_empty_ancestry(self):
+        self.assertSearchResult([], [], 0, {}, (), ['tip-rev-id'], 10)
+
+    def test_ancestry_1(self):
+        self.assertSearchResult(['rev4'], ['rev1'], 4,
+                                ancestry_1, (), ['rev1'], 10)
+        self.assertSearchResult(['rev2a', 'rev2b'], ['rev1'], 2,
+                                ancestry_1, (), ['rev1'], 1)
+
+
+    def test_multiple_heads(self):
+        self.assertSearchResult(['e', 'f'], ['a'], 5,
+                                extended_history_shortcut, (), ['a'], 10)
+        # Note that even though we only take 1 step back, we find 'f', which
+        # means the described search will still find d and c.
+        self.assertSearchResult(['f'], ['a'], 4,
+                                extended_history_shortcut, (), ['a'], 1)
+        self.assertSearchResult(['f'], ['a'], 4,
+                                extended_history_shortcut, (), ['a'], 2)

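For reference, the two fixtures these tests lean on are defined near the
top of bzrlib/tests/test_graph.py; abridged copies (NULL_REVISION is
'null:'):

  # Two branches from rev1, merged again at rev4.
  ancestry_1 = {'rev1': [NULL_REVISION],
                'rev2a': ['rev1'], 'rev2b': ['rev1'],
                'rev3': ['rev2a'],
                'rev4': ['rev3', 'rev2b']}

  # A linear chain a..e, plus 'f' whose parents are 'a' and 'd', so a
  # search can shortcut past b and c.
  extended_history_shortcut = {'a': [NULL_REVISION],
                               'b': ['a'], 'c': ['b'], 'd': ['c'],
                               'e': ['d'], 'f': ['a', 'd']}

In test_multiple_heads, walking one step of children from 'a' yields 'b'
and 'f'; 'b' is then rediscovered while walking f => d => c => b, so it is
dropped from the start keys, leaving just ['f'].
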
=== modified file 'bzrlib/tests/test_remote.py'
--- a/bzrlib/tests/test_remote.py	2011-08-12 09:49:24 +0000
+++ b/bzrlib/tests/test_remote.py	2011-08-25 07:51:37 +0000
@@ -2147,8 +2147,8 @@
         parents = repo.get_parent_map([rev_id])
         self.assertEqual(
             [('call_with_body_bytes_expecting_body',
-              'Repository.get_parent_map', ('quack/', 'include-missing:',
-              rev_id), '\n\n0'),
+              'Repository.get_parent_map',
+              ('quack/', 'include-missing:', rev_id), '\n\n0'),
              ('disconnect medium',),
              ('call_expecting_body', 'Repository.get_revision_graph',
               ('quack/', ''))],

=== modified file 'bzrlib/vf_repository.py'
--- a/bzrlib/vf_repository.py	2011-08-21 21:03:57 +0000
+++ b/bzrlib/vf_repository.py	2011-08-25 07:51:37 +0000
@@ -70,7 +70,7 @@
     )
 
 from bzrlib.trace import (
-    mutter,
+    mutter
     )
 
 



