Rev 4613: (jam) Introduce CombinedGraphIndex.find_ancestry and BTreeGraphIndex._find_ancestors() in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Fri Aug 14 20:14:30 BST 2009


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 4613 [merge]
revision-id: pqm at pqm.ubuntu.com-20090814191423-zs4ej0tde7yhamha
parent: pqm at pqm.ubuntu.com-20090814164557-05bpjwxjr318ptaf
parent: john at arbash-meinel.com-20090814173603-m2bfkj5haqgvzl7r
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2009-08-14 20:14:23 +0100
message:
  (jam) Introduce CombinedGraphIndex.find_ancestry and
  	BTreeGraphIndex._find_ancestors()
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/btree_index.py          index.py-20080624222253-p0x5f92uyh5hw734-7
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'NEWS'
--- a/NEWS	2009-08-14 16:45:57 +0000
+++ b/NEWS	2009-08-14 17:36:03 +0000
@@ -81,6 +81,13 @@
 * RemoteBranch.open now honours ignore_fallbacks correctly on bzr-v2
   protocols. (Robert Collins)
 
+* The index code now has some specialized routines to extract the full
+  ancestry of a set of keys more efficiently; see
+  ``CombinedGraphIndex.find_ancestry()``. This is not fully exposed to the
+  higher levels yet, but it has the potential to speed up fetching the full
+  ancestry tremendously. (Time to get the ancestry for bzr.dev drops from
+  1.5s to 300ms; for OpenOffice.org from 33s to 10.5s.) (John Arbash Meinel)
+
 Testing
 *******
 

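As a quick orientation, here is a minimal sketch of driving the new API; the index setup is assumed (any list of GraphIndex-like objects whose reference list 0 holds parent references), and the revision id is hypothetical:

    from bzrlib.index import CombinedGraphIndex

    # Assumed setup: 'indices' is a list of GraphIndex objects whose first
    # reference list (ref_list_num=0) holds parent references.
    c_index = CombinedGraphIndex(indices)
    # Keys are 1-tuples; the revision id here is hypothetical.
    parent_map, missing_keys = c_index.find_ancestry([('rev-id',)], 0)
    # parent_map maps every reachable key to its parent keys; missing_keys
    # holds keys (queried or discovered) absent from all indexes.
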
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2009-07-01 10:51:47 +0000
+++ b/bzrlib/btree_index.py	2009-08-13 19:56:26 +0000
@@ -586,13 +586,19 @@
 class _LeafNode(object):
     """A leaf node for a serialised B+Tree index."""
 
-    __slots__ = ('keys',)
+    __slots__ = ('keys', 'min_key', 'max_key')
 
     def __init__(self, bytes, key_length, ref_list_length):
         """Parse bytes to create a leaf node object."""
         # splitlines mangles the \r delimiters.. don't use it.
-        self.keys = dict(_btree_serializer._parse_leaf_lines(bytes,
-            key_length, ref_list_length))
+        key_list = _btree_serializer._parse_leaf_lines(bytes,
+            key_length, ref_list_length)
+        if key_list:
+            self.min_key = key_list[0][0]
+            self.max_key = key_list[-1][0]
+        else:
+            self.min_key = self.max_key = None
+        self.keys = dict(key_list)
 
 
 class _InternalNode(object):
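The new min_key/max_key attributes record the bounds of each leaf page, so a caller can tell whether an absent key could even belong on that page. A minimal sketch of the bounds check, assuming 'leaf' is a parsed _LeafNode (the helper itself is illustrative, not part of the patch):

    def key_could_be_on_page(leaf, key):
        if leaf.min_key is None:
            # The leaf parsed no keys at all, so nothing can live on it.
            return False
        # Keys are tuples, so this is a lexicographic comparison.
        return leaf.min_key <= key <= leaf.max_key

The _find_ancestors code further down uses exactly this kind of comparison to decide whether an unresolved parent is definitely missing or merely on another page.
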
@@ -1039,6 +1045,39 @@
             output.append(cur_out)
         return output
 
+    def _walk_through_internal_nodes(self, keys):
+        """Take the given set of keys, and find the corresponding LeafNodes.
+
+        :param keys: An unsorted iterable of keys to search for
+        :return: (nodes, keys_at_index)
+            nodes is a dict mapping {index: LeafNode}
+            keys_at_index is a list of [(index, [keys for LeafNode])] tuples
+        """
+        # 6 seconds were spent in miss_torture on the sorted() call below.
+        # Even with out-of-order disk IO it may be faster not to sort when
+        # large queries are being made, but for now the keys are sorted.
+        keys_at_index = [(0, sorted(keys))]
+
+        for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
+            node_indexes = [idx for idx, s_keys in keys_at_index]
+            nodes = self._get_internal_nodes(node_indexes)
+
+            next_nodes_and_keys = []
+            for node_index, sub_keys in keys_at_index:
+                node = nodes[node_index]
+                positions = self._multi_bisect_right(sub_keys, node.keys)
+                node_offset = next_row_start + node.offset
+                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
+                                           for pos, s_keys in positions])
+            keys_at_index = next_nodes_and_keys
+        # We should now be at the _LeafNodes
+        node_indexes = [idx for idx, s_keys in keys_at_index]
+
+        # TODO: We may *not* want to always read all the nodes in one
+        #       big go. Consider setting a max size on this.
+        nodes = self._get_leaf_nodes(node_indexes)
+        return nodes, keys_at_index
+
     def iter_entries(self, keys):
         """Iterate over keys within the index.
 
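The traversal above relies on _multi_bisect_right to route each batch of sorted search keys to the child slot it would occupy in an internal node. A simplified stand-in for that contract (the real method lives on BTreeGraphIndex and may differ in detail, but it honors the same return shape):

    import bisect

    def multi_bisect_right_sketch(sorted_keys, node_keys):
        # Group each search key by the child position it belongs to;
        # returns [(position, [keys])], matching the caller above.
        positions = {}
        for key in sorted_keys:
            pos = bisect.bisect_right(node_keys, key)
            positions.setdefault(pos, []).append(key)
        return sorted(positions.items())
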
@@ -1082,32 +1121,7 @@
         needed_keys = keys
         if not needed_keys:
             return
-        # 6 seconds spent in miss_torture using the sorted() line.
-        # Even with out of order disk IO it seems faster not to sort it when
-        # large queries are being made.
-        needed_keys = sorted(needed_keys)
-
-        nodes_and_keys = [(0, needed_keys)]
-
-        for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
-            node_indexes = [idx for idx, s_keys in nodes_and_keys]
-            nodes = self._get_internal_nodes(node_indexes)
-
-            next_nodes_and_keys = []
-            for node_index, sub_keys in nodes_and_keys:
-                node = nodes[node_index]
-                positions = self._multi_bisect_right(sub_keys, node.keys)
-                node_offset = next_row_start + node.offset
-                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
-                                           for pos, s_keys in positions])
-            nodes_and_keys = next_nodes_and_keys
-        # We should now be at the _LeafNodes
-        node_indexes = [idx for idx, s_keys in nodes_and_keys]
-
-        # TODO: We may *not* want to always read all the nodes in one
-        #       big go. Consider setting a max size on this.
-
-        nodes = self._get_leaf_nodes(node_indexes)
+        nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
         for node_index, sub_keys in nodes_and_keys:
             if not sub_keys:
                 continue
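For reference, when the index was built with reference lists, iter_entries yields 4-tuples, which is how the ancestry code consumes it (the key below is hypothetical):

    # 'index' is an opened BTreeGraphIndex built with ref_lists >= 1.
    for idx, key, value, refs in index.iter_entries([('some-rev-id',)]):
        parent_keys = refs[0]  # entries from the first reference list
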
@@ -1120,6 +1134,133 @@
                     else:
                         yield (self, next_sub_key, value)
 
+    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
+        """Find the parent_map information for the set of keys.
+
+        This populates the parent_map dict and missing_keys set based on the
+        queried keys. It also can fill out an arbitrary number of parents that
+        it finds while searching for the supplied keys.
+
+        It is unlikely that you want to call this directly. See
+        "CombinedGraphIndex.find_ancestry()" for a more appropriate API.
+
+        :param keys: The keys whose ancestry we want to return.
+            Every key will either end up in 'parent_map' or 'missing_keys'.
+        :param ref_list_num: This index in the ref_lists is the parents we
+            care about.
+        :param parent_map: {key: parent_keys} for keys that are present in this
+            index. This may contain more entries than were in 'keys', that are
+            reachable ancestors of the keys requested.
+        :param missing_keys: keys which are known to be missing in this index.
+            This may include parents that were not directly requested, but we
+            were able to determine that they are not present in this index.
+        :return: search_keys, the parents that were found but not yet queried
+            to know if they are missing or present. Callers can re-query this
+            index for those keys, and they will be placed into parent_map or
+            missing_keys.
+        if not self.key_count():
+            # We use key_count() to trigger reading the root node and
+            # determining info about this BTreeGraphIndex
+            # If we don't have any keys, then everything is missing
+            missing_keys.update(keys)
+            return set()
+        if ref_list_num >= self.node_ref_lists:
+            raise ValueError('No ref list %d, index has %d ref lists'
+                % (ref_list_num, self.node_ref_lists))
+
+        # The main trick we are trying to accomplish is that when we find a
+        # key listing its parents, we expect the parent keys to sit on the
+        # same page, allowing us to expand the ancestry quickly without
+        # suffering the full cost of bisecting down the tree again.
+        nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
+
+        # These are parent keys which could not be immediately resolved on the
+        # page where the child was present. Note that we may already be
+        # searching for that key, and it may actually be present [or known
+        # missing] on one of the other pages we are reading.
+        # TODO:
+        #   We could try searching for them in the immediate previous or next
+        #   page. If they occur "later" we could put them in a pending lookup
+        #   set, and then for each node we read thereafter we could check to
+        #   see if they are present.
+        #   However, we don't know the cost of keeping such a pending list
+        #   around and checking it against every node we read from here on
+        #   out.
+        #   It doesn't handle the case when the parent key is missing on a
+        #   page that we *don't* read. So we already have to handle being
+        #   re-entrant for that.
+        #   Since most keys contain a date string, they are more likely to be
+        #   found earlier in the file than later, but we would know that right
+        #   away (key < min_key), and wouldn't keep searching it on every other
+        #   page that we read.
+        #   Mostly, it is an idea, one which should be benchmarked.
+        parents_not_on_page = set()
+
+        for node_index, sub_keys in nodes_and_keys:
+            if not sub_keys:
+                continue
+            # sub_keys is all of the keys we are looking for that should
+            # exist on this page; if they aren't here, they won't be found
+            node = nodes[node_index]
+            node_keys = node.keys
+            parents_to_check = set()
+            for next_sub_key in sub_keys:
+                if next_sub_key not in node_keys:
+                    # This one is just not present in the index at all
+                    missing_keys.add(next_sub_key)
+                else:
+                    value, refs = node_keys[next_sub_key]
+                    parent_keys = refs[ref_list_num]
+                    parent_map[next_sub_key] = parent_keys
+                    parents_to_check.update(parent_keys)
+            # Don't look for things we've already found
+            parents_to_check = parents_to_check.difference(parent_map)
+            # this can be used to test the benefit of having the check loop
+            # inlined.
+            # parents_not_on_page.update(parents_to_check)
+            # continue
+            while parents_to_check:
+                next_parents_to_check = set()
+                for key in parents_to_check:
+                    if key in node_keys:
+                        value, refs = node_keys[key]
+                        parent_keys = refs[ref_list_num]
+                        parent_map[key] = parent_keys
+                        next_parents_to_check.update(parent_keys)
+                    else:
+                        # This parent is either genuinely missing, or should
+                        # be found on another page. Perf testing whether it is
+                        # better to check if this key would fit on this page:
+                        # in the 'everything-in-one-pack' scenario, *not*
+                        # doing the check is 237ms vs 243ms with it.
+                        # So slightly better, but I assume the standard 'lots
+                        # of packs' case will show a reasonable improvement
+                        # from the check, because it avoids 'going around
+                        # again' for everything that is in another index
+                        # parents_not_on_page.add(key)
+                        # Missing for some reason
+                        if key < node.min_key:
+                            # in the case of bzr.dev, 3.4k/5.3k misses are
+                            # 'earlier' misses (65%)
+                            parents_not_on_page.add(key)
+                        elif key > node.max_key:
+                            # This parent key would be present on a different
+                            # LeafNode
+                            parents_not_on_page.add(key)
+                        else:
+                            # assert key != node.min_key and key != node.max_key
+                            # If it was going to be present, it would be on
+                            # *this* page, so mark it missing.
+                            missing_keys.add(key)
+                parents_to_check = next_parents_to_check.difference(parent_map)
+                # Might want to do another .difference() from missing_keys
+        # parents_not_on_page could have been found on a different page, or be
+        # known to be missing. So cull out everything that has already been
+        # found.
+        search_keys = parents_not_on_page.difference(
+            parent_map).difference(missing_keys)
+        return search_keys
+
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.
 

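Because _find_ancestors only resolves what it can see on the leaf pages it reads, callers are expected to feed the returned search_keys back in until nothing is left, as the tests further down do. A minimal driver sketch (the starting key is hypothetical):

    parent_map = {}
    missing_keys = set()
    search_keys = set([('some-rev-id',)])
    while search_keys:
        # Each pass resolves, or rules out, the previous pass's parents.
        search_keys = index._find_ancestors(search_keys, 0, parent_map,
                                            missing_keys)
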
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2009-07-01 10:53:08 +0000
+++ b/bzrlib/index.py	2009-08-13 19:56:26 +0000
@@ -702,6 +702,23 @@
                 # the last thing looked up was a terminal element
                 yield (self, ) + key_dict
 
+    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
+        """See BTreeIndex._find_ancestors."""
+        # The API can be implemented as a trivial overlay on top of
+        # iter_entries; it is not an efficient implementation, but it at
+        # least gets the job done.
+        found_keys = set()
+        search_keys = set()
+        for index, key, value, refs in self.iter_entries(keys):
+            parent_keys = refs[ref_list_num]
+            found_keys.add(key)
+            parent_map[key] = parent_keys
+            search_keys.update(parent_keys)
+        # Figure out what, if anything, was missing
+        missing_keys.update(set(keys).difference(found_keys))
+        search_keys = search_keys.difference(parent_map)
+        return search_keys
+
     def key_count(self):
         """Return an estimate of the number of keys in this index.
 
@@ -1297,6 +1314,69 @@
             except errors.NoSuchFile:
                 self._reload_or_raise()
 
+    def find_ancestry(self, keys, ref_list_num):
+        """Find the complete ancestry for the given set of keys.
+
+        Note that this is a whole-ancestry request, so it should be used
+        sparingly.
+
+        :param keys: An iterable of keys to look for
+        :param ref_list_num: The reference list which references the parents
+            we care about.
+        :return: (parent_map, missing_keys)
+        """
+        missing_keys = set()
+        parent_map = {}
+        keys_to_lookup = set(keys)
+        generation = 0
+        while keys_to_lookup:
+            # keys that *all* indexes claim are missing, stop searching them
+            generation += 1
+            all_index_missing = None
+            # print 'gen\tidx\tsub\tn_keys\tn_pmap\tn_miss'
+            # print '%4d\t\t\t%4d\t%5d\t%5d' % (generation, len(keys_to_lookup),
+            #                                   len(parent_map),
+            #                                   len(missing_keys))
+            for index_idx, index in enumerate(self._indices):
+                # TODO: we should probably be doing something with
+                #       'missing_keys' since we've already determined that
+                #       those revisions have not been found anywhere
+                index_missing_keys = set()
+                # Find all of the ancestry we can from this index.
+                # Keep looking until the search_keys set is empty, at which
+                # point everything we didn't find is in index_missing_keys.
+                search_keys = keys_to_lookup
+                sub_generation = 0
+                # print '    \t%2d\t\t%4d\t%5d\t%5d' % (
+                #     index_idx, len(search_keys),
+                #     len(parent_map), len(index_missing_keys))
+                while search_keys:
+                    sub_generation += 1
+                    # TODO: ref_list_num should really be a parameter, since
+                    #       CombinedGraphIndex does not know what the ref lists
+                    #       mean.
+                    search_keys = index._find_ancestors(search_keys,
+                        ref_list_num, parent_map, index_missing_keys)
+                    # print '    \t  \t%2d\t%4d\t%5d\t%5d' % (
+                    #     sub_generation, len(search_keys),
+                    #     len(parent_map), len(index_missing_keys))
+                # Now set whatever was missing to be searched in the next index
+                keys_to_lookup = index_missing_keys
+                if all_index_missing is None:
+                    all_index_missing = set(index_missing_keys)
+                else:
+                    all_index_missing.intersection_update(index_missing_keys)
+                if not keys_to_lookup:
+                    break
+            if all_index_missing is None:
+                # There were no indexes, so all search keys are 'missing'
+                missing_keys.update(keys_to_lookup)
+                keys_to_lookup = None
+            else:
+                missing_keys.update(all_index_missing)
+                keys_to_lookup.difference_update(all_index_missing)
+        return parent_map, missing_keys
+
     def key_count(self):
         """Return an estimate of the number of keys in this index.
 

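To see how the generations converge across indexes, here is a hypothetical trace of the scenario used by test_find_ancestors_across_indexes below:

    # index1 holds {key1: (), key2: (key1,)}; index2 holds {key3: (key2,)}.
    # find_ancestry([key3], 0) proceeds as:
    #   generation 1: index1 misses key3; index2 maps key3 -> (key2,) but
    #     then misses key2, so keys_to_lookup becomes set([key2])
    #   generation 2: index1 maps key2 -> (key1,) and, in a sub-generation,
    #     key1 -> (); keys_to_lookup empties and the loop exits
    # Result: parent_map covers all three keys, missing_keys stays empty.
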
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2009-06-22 12:52:39 +0000
+++ b/bzrlib/tests/test_btree_index.py	2009-08-13 19:56:26 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2008 Canonical Ltd
+# Copyright (C) 2008, 2009 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -23,6 +23,7 @@
 from bzrlib import (
     btree_index,
     errors,
+    osutils,
     tests,
     )
 from bzrlib.tests import (
@@ -980,6 +981,140 @@
             ])
         self.assertEqual(set([]), index.external_references(0))
 
+    def test__find_ancestors_one_page(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([],)),
+            ])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key1], 0, parent_map, missing_keys)
+        self.assertEqual({key1: (key2,), key2: ()}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set(), search_keys)
+
+    def test__find_ancestors_one_page_w_missing(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([],)),
+            ])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key2, key3], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual({key2: ()}, parent_map)
+        # we know that key3 is missing because we read the page that it would
+        # otherwise be on
+        self.assertEqual(set([key3]), missing_keys)
+        self.assertEqual(set(), search_keys)
+
+    def test__find_ancestors_one_parent_missing(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([key3],)),
+            ])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key1], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual({key1: (key2,), key2: (key3,)}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        # All we know is that key3 wasn't present on the page we read; the
+        # last key on that page is key2, which sorts before key3, so we
+        # don't know whether key3 would land on this page or not.
+        self.assertEqual(set([key3]), search_keys)
+        search_keys = index._find_ancestors(search_keys, 0, parent_map,
+                                            missing_keys)
+        # passing it back in, we are sure it is 'missing'
+        self.assertEqual({key1: (key2,), key2: (key3,)}, parent_map)
+        self.assertEqual(set([key3]), missing_keys)
+        self.assertEqual(set([]), search_keys)
+
+    def test__find_ancestors_dont_search_known(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([key3],)),
+            (key3, 'value', ([],)),
+            ])
+        # We already know about key2, so we won't try to search for key3
+        parent_map = {key2: (key3,)}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key1], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual({key1: (key2,), key2: (key3,)}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set(), search_keys)
+
+    def test__find_ancestors_multiple_pages(self):
+        # We need to use enough keys that we actually cause a split
+        start_time = 1249671539
+        email = "joebob at example.com"
+        nodes = []
+        ref_lists = ((),)
+        rev_keys = []
+        for i in xrange(400):
+            rev_id = '%s-%s-%s' % (email,
+                                   osutils.compact_date(start_time + i),
+                                   osutils.rand_chars(16))
+            rev_key = (rev_id,)
+            nodes.append((rev_key, 'value', ref_lists))
+            # Each node has a single ref 'list', holding one parent key:
+            # the previous revision in the chain
+            ref_lists = ((rev_key,),)
+            rev_keys.append(rev_key)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=nodes)
+        self.assertEqual(400, index.key_count())
+        self.assertEqual(3, len(index._row_offsets))
+        nodes = dict(index._read_nodes([1, 2]))
+        l1 = nodes[1]
+        l2 = nodes[2]
+        min_l2_key = l2.min_key
+        max_l1_key = l1.max_key
+        self.assertTrue(max_l1_key < min_l2_key)
+        parents_min_l2_key = l2.keys[min_l2_key][1][0]
+        self.assertEqual((l1.max_key,), parents_min_l2_key)
+        # Now, selecting any key that falls on the second page should give
+        # us all of the parents up to the page break
+        key_idx = rev_keys.index(min_l2_key)
+        next_key = rev_keys[key_idx+1]
+        # So now when we get the parent map, we should get the key we are
+        # looking for, min_l2_key, and then a reference to go look for the
+        # parent of that key
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([next_key], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual([min_l2_key, next_key], sorted(parent_map))
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set([max_l1_key]), search_keys)
+        parent_map = {}
+        search_keys = index._find_ancestors([max_l1_key], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual(sorted(l1.keys), sorted(parent_map))
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set(), search_keys)
+
+    def test__find_ancestors_empty_index(self):
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([('one',), ('two',)], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual(set(), search_keys)
+        self.assertEqual({}, parent_map)
+        self.assertEqual(set([('one',), ('two',)]), missing_keys)
+
 
 class TestBTreeNodes(BTreeTestCase):
 

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2009-03-23 14:59:43 +0000
+++ b/bzrlib/tests/test_index.py	2009-08-13 19:56:26 +0000
@@ -953,6 +953,59 @@
             ])
         self.assertEqual(set([]), index.external_references(0))
 
+    def test__find_ancestors(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([],)),
+            ])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key1], 0, parent_map, missing_keys)
+        self.assertEqual({key1: (key2,)}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set([key2]), search_keys)
+        search_keys = index._find_ancestors(search_keys, 0, parent_map,
+                                            missing_keys)
+        self.assertEqual({key1: (key2,), key2: ()}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set(), search_keys)
+
+    def test__find_ancestors_w_missing(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([],)),
+            ])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key2, key3], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual({key2: ()}, parent_map)
+        self.assertEqual(set([key3]), missing_keys)
+        self.assertEqual(set(), search_keys)
+
+    def test__find_ancestors_dont_search_known(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        index = self.make_index(ref_lists=1, key_elements=1, nodes=[
+            (key1, 'value', ([key2],)),
+            (key2, 'value', ([key3],)),
+            (key3, 'value', ([],)),
+            ])
+        # We already know about key2, so we won't try to search for key3
+        parent_map = {key2: (key3,)}
+        missing_keys = set()
+        search_keys = index._find_ancestors([key1], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual({key1: (key2,), key2: (key3,)}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        self.assertEqual(set(), search_keys)
+
 
 class TestCombinedGraphIndex(TestCaseWithMemoryTransport):
 
@@ -1271,6 +1324,86 @@
                                     ['1', '2', '3'])
         self.assertRaises(errors.NoSuchFile, index.validate)
 
+    def test_find_ancestors_across_indexes(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        key4 = ('key-4',)
+        index1 = self.make_index('12', ref_lists=1, nodes=[
+            (key1, 'value', ([],)),
+            (key2, 'value', ([key1],)),
+            ])
+        index2 = self.make_index('34', ref_lists=1, nodes=[
+            (key3, 'value', ([key2],)),
+            (key4, 'value', ([key3],)),
+            ])
+        c_index = CombinedGraphIndex([index1, index2])
+        parent_map, missing_keys = c_index.find_ancestry([key1], 0)
+        self.assertEqual({key1: ()}, parent_map)
+        self.assertEqual(set(), missing_keys)
+        # Now look for a key from index2 which requires us to find the key in
+        # the second index, and then continue searching for parents in the
+        # first index
+        parent_map, missing_keys = c_index.find_ancestry([key3], 0)
+        self.assertEqual({key1: (), key2: (key1,), key3: (key2,)}, parent_map)
+        self.assertEqual(set(), missing_keys)
+
+    def test_find_ancestors_missing_keys(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        key4 = ('key-4',)
+        index1 = self.make_index('12', ref_lists=1, nodes=[
+            (key1, 'value', ([],)),
+            (key2, 'value', ([key1],)),
+            ])
+        index2 = self.make_index('34', ref_lists=1, nodes=[
+            (key3, 'value', ([key2],)),
+            ])
+        c_index = CombinedGraphIndex([index1, index2])
+        # Searching for a key which is actually not present at all should
+        # eventually converge
+        parent_map, missing_keys = c_index.find_ancestry([key4], 0)
+        self.assertEqual({}, parent_map)
+        self.assertEqual(set([key4]), missing_keys)
+
+    def test_find_ancestors_no_indexes(self):
+        c_index = CombinedGraphIndex([])
+        key1 = ('key-1',)
+        parent_map, missing_keys = c_index.find_ancestry([key1], 0)
+        self.assertEqual({}, parent_map)
+        self.assertEqual(set([key1]), missing_keys)
+
+    def test_find_ancestors_ghost_parent(self):
+        key1 = ('key-1',)
+        key2 = ('key-2',)
+        key3 = ('key-3',)
+        key4 = ('key-4',)
+        index1 = self.make_index('12', ref_lists=1, nodes=[
+            (key1, 'value', ([],)),
+            (key2, 'value', ([key1],)),
+            ])
+        index2 = self.make_index('34', ref_lists=1, nodes=[
+            (key4, 'value', ([key2, key3],)),
+            ])
+        c_index = CombinedGraphIndex([index1, index2])
+        # Searching for a key whose parent list includes a ghost should
+        # still converge, reporting the ghost as missing
+        parent_map, missing_keys = c_index.find_ancestry([key4], 0)
+        self.assertEqual({key4: (key2, key3), key2: (key1,), key1: ()},
+                         parent_map)
+        self.assertEqual(set([key3]), missing_keys)
+
+    def test__find_ancestors_empty_index(self):
+        index = self.make_index('test', ref_lists=1, key_elements=1, nodes=[])
+        parent_map = {}
+        missing_keys = set()
+        search_keys = index._find_ancestors([('one',), ('two',)], 0, parent_map,
+                                            missing_keys)
+        self.assertEqual(set(), search_keys)
+        self.assertEqual({}, parent_map)
+        self.assertEqual(set([('one',), ('two',)]), missing_keys)
+
 
 class TestInMemoryGraphIndex(TestCaseWithMemoryTransport):
 
