Rev 5141: (andrew) Optimise index lookups in repositories with many pack files. in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Thu Apr 8 09:49:05 BST 2010
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5141 [merge]
revision-id: pqm at pqm.ubuntu.com-20100408084859-lbi26gvsu2rtz370
parent: pqm at pqm.ubuntu.com-20100408073248-aj7k8qkvbv4nzlxd
parent: andrew.bennetts at canonical.com-20100408070923-pp4d8xephvwtcsw2
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Thu 2010-04-08 09:48:59 +0100
message:
(andrew) Optimise index lookups in repositories with many pack files.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/tests/per_pack_repository.py test_pack_repository-20080801043947-eaw0e6h2gu75kwmy-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'NEWS'
--- a/NEWS 2010-04-08 07:32:48 +0000
+++ b/NEWS 2010-04-08 08:48:59 +0000
@@ -38,6 +38,12 @@
generated by a template and not edited by the user.
(Robert Collins, #530265)
+* Index lookups in pack repositories search recently hit pack files first.
+ In repositories with many pack files this can greatly reduce the
+ number of files accessed, the number of bytes read, and the number of
+ read calls. An incremental pull via plain HTTP takes half the time and
+ bytes for a moderately large repository. (Andrew Bennetts)
+
* Less code is loaded at startup. (Cold-cache start time is about 10-20%
less.)
(Martin Pool, #553017)
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2010-03-05 17:56:55 +0000
+++ b/bzrlib/index.py 2010-04-08 07:01:10 +0000
@@ -1245,10 +1245,15 @@
static data.
Queries against the combined index will be made against the first index,
- and then the second and so on. The order of index's can thus influence
+ and then the second and so on. The order of indices can thus influence
performance significantly. For example, if one index is on local disk and a
second on a remote server, the local disk index should be before the other
in the index list.
+
+ Also, queries tend to need results from the same indices as previous
+ queries. So the indices will be reordered after every query to put the
+ indices that had the result(s) of that query first (while otherwise
+ preserving the relative ordering).
"""
def __init__(self, indices, reload_func=None):
@@ -1261,6 +1266,13 @@
"""
self._indices = indices
self._reload_func = reload_func
+ # Sibling indices are other CombinedGraphIndex that we should call
+ # _move_to_front_by_name on when we auto-reorder ourselves.
+ self._sibling_indices = []
+ # A list of names that corresponds to the instances in self._indices,
+ # so _index_names[0] is always the name for _indices[0], etc. Sibling
+ # indices must all use the same set of names as each other.
+ self._index_names = [None] * len(self._indices)
def __repr__(self):
return "%s(%s)" % (
@@ -1289,13 +1301,17 @@
has_key = _has_key_from_parent_map
- def insert_index(self, pos, index):
+ def insert_index(self, pos, index, name=None):
"""Insert a new index in the list of indices to query.
:param pos: The position to insert the index.
:param index: The index to insert.
+ :param name: a name for this index, e.g. a pack name. These names can
+ be used to reflect index reorderings to related CombinedGraphIndex
+ instances that use the same names. (see set_sibling_indices)
"""
self._indices.insert(pos, index)
+ self._index_names.insert(pos, name)
def iter_all_entries(self):
"""Iterate over all keys within the index
@@ -1326,22 +1342,28 @@
value and are only reported once.
:param keys: An iterable providing the keys to be retrieved.
- :return: An iterable of (index, key, reference_lists, value). There is no
- defined order for the result iteration - it will be in the most
+ :return: An iterable of (index, key, reference_lists, value). There is
+ no defined order for the result iteration - it will be in the most
efficient order for the index.
"""
keys = set(keys)
+ hit_indices = []
while True:
try:
for index in self._indices:
if not keys:
- return
+ break
+ index_hit = False
for node in index.iter_entries(keys):
keys.remove(node[1])
yield node
- return
+ index_hit = True
+ if index_hit:
+ hit_indices.append(index)
+ break
except errors.NoSuchFile:
self._reload_or_raise()
+ self._move_to_front(hit_indices)
def iter_entries_prefix(self, keys):
"""Iterate over keys within the index using prefix matching.
@@ -1367,17 +1389,77 @@
if not keys:
return
seen_keys = set()
+ hit_indices = []
while True:
try:
for index in self._indices:
+ index_hit = False
for node in index.iter_entries_prefix(keys):
if node[1] in seen_keys:
continue
seen_keys.add(node[1])
yield node
- return
+ index_hit = True
+ if index_hit:
+ hit_indices.append(index)
+ break
except errors.NoSuchFile:
self._reload_or_raise()
+ self._move_to_front(hit_indices)
+
+ def _move_to_front(self, hit_indices):
+ """Rearrange self._indices so that hit_indices are first.
+
+ Order is maintained as much as possible, e.g. the first unhit index
+ will be the first index in _indices after the hit_indices, and the
+ hit_indices will be present in exactly the order they are passed to
+ _move_to_front.
+
+ _move_to_front propagates to all objects in self._sibling_indices by
+ calling _move_to_front_by_name.
+ """
+ hit_names = self._move_to_front_by_index(hit_indices)
+ for sibling_idx in self._sibling_indices:
+ sibling_idx._move_to_front_by_name(hit_names)
+
+ def _move_to_front_by_index(self, hit_indices):
+ """Core logic for _move_to_front.
+
+ Returns a list of names corresponding to the hit_indices param.
+ """
+ indices_info = zip(self._index_names, self._indices)
+ if 'index' in debug.debug_flags:
+ mutter('CombinedGraphIndex reordering: currently %r, promoting %r',
+ indices_info, hit_indices)
+ hit_indices_info = []
+ hit_names = []
+ unhit_indices_info = []
+ for name, idx in indices_info:
+ if idx in hit_indices:
+ info = hit_indices_info
+ hit_names.append(name)
+ else:
+ info = unhit_indices_info
+ info.append((name, idx))
+ final_info = hit_indices_info + unhit_indices_info
+ self._indices = [idx for (name, idx) in final_info]
+ self._index_names = [name for (name, idx) in final_info]
+ if 'index' in debug.debug_flags:
+ mutter('CombinedGraphIndex reordered: %r', self._indices)
+ return hit_names
+
+ def _move_to_front_by_name(self, hit_names):
+ """Moves indices named by 'hit_names' to front of the search order, as
+ described in _move_to_front.
+ """
+ # Translate names to index instances, and then call
+ # _move_to_front_by_index.
+ indices_info = zip(self._index_names, self._indices)
+ hit_indices = []
+ for name, idx in indices_info:
+ if name in hit_names:
+ hit_indices.append(idx)
+ self._move_to_front_by_index(hit_indices)
def find_ancestry(self, keys, ref_list_num):
"""Find the complete ancestry for the given set of keys.
@@ -1390,6 +1472,7 @@
we care about.
:return: (parent_map, missing_keys)
"""
+ # XXX: make this call _move_to_front?
missing_keys = set()
parent_map = {}
keys_to_lookup = set(keys)
@@ -1475,6 +1558,11 @@
' Raising original exception.')
raise exc_type, exc_value, exc_traceback
+ def set_sibling_indices(self, sibling_combined_graph_indices):
+ """Set the CombinedGraphIndex objects to reorder after reordering self.
+ """
+ self._sibling_indices = sibling_combined_graph_indices
+
def validate(self):
"""Validate that everything in the index can be accessed."""
while True:
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2010-02-12 11:58:21 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2010-03-17 11:37:38 +0000
@@ -587,26 +587,6 @@
flush_func=flush_func)
self.add_callback = None
- def replace_indices(self, index_to_pack, indices):
- """Replace the current mappings with fresh ones.
-
- This should probably not be used eventually, rather incremental add and
- removal of indices. It has been added during refactoring of existing
- code.
-
- :param index_to_pack: A mapping from index objects to
- (transport, name) tuples for the pack file data.
- :param indices: A list of indices.
- """
- # refresh the revision pack map dict without replacing the instance.
- self.index_to_pack.clear()
- self.index_to_pack.update(index_to_pack)
- # XXX: API break - clearly a 'replace' method would be good?
- self.combined_index._indices[:] = indices
- # the current add nodes callback for the current writable index if
- # there is one.
- self.add_callback = None
-
def add_index(self, index, pack):
"""Add index to the aggregate, which is an index for Pack pack.
@@ -619,7 +599,7 @@
# expose it to the index map
self.index_to_pack[index] = pack.access_tuple()
# put it at the front of the linear index list
- self.combined_index.insert_index(0, index)
+ self.combined_index.insert_index(0, index, pack.name)
def add_writable_index(self, index, pack):
"""Add an index which is able to have data added to it.
@@ -645,6 +625,7 @@
self.data_access.set_writer(None, None, (None, None))
self.index_to_pack.clear()
del self.combined_index._indices[:]
+ del self.combined_index._index_names[:]
self.add_callback = None
def remove_index(self, index):
@@ -653,7 +634,9 @@
:param index: An index from the pack parameter.
"""
del self.index_to_pack[index]
- self.combined_index._indices.remove(index)
+ pos = self.combined_index._indices.index(index)
+ del self.combined_index._indices[pos]
+ del self.combined_index._index_names[pos]
if (self.add_callback is not None and
getattr(index, 'add_nodes', None) == self.add_callback):
self.add_callback = None
@@ -1415,11 +1398,20 @@
self.inventory_index = AggregateIndex(self.reload_pack_names, flush)
self.text_index = AggregateIndex(self.reload_pack_names, flush)
self.signature_index = AggregateIndex(self.reload_pack_names, flush)
+ all_indices = [self.revision_index, self.inventory_index,
+ self.text_index, self.signature_index]
if use_chk_index:
self.chk_index = AggregateIndex(self.reload_pack_names, flush)
+ all_indices.append(self.chk_index)
else:
# used to determine if we're using a chk_index elsewhere.
self.chk_index = None
+ # Tell all the CombinedGraphIndex objects about each other, so they can
+ # share hints about which pack names to search first.
+ all_combined = [agg_idx.combined_index for agg_idx in all_indices]
+ for combined_idx in all_combined:
+ combined_idx.set_sibling_indices(
+ set(all_combined).difference([combined_idx]))
# resumed packs
self._resumed_packs = []
=== modified file 'bzrlib/tests/per_pack_repository.py'
--- a/bzrlib/tests/per_pack_repository.py 2010-02-23 07:43:11 +0000
+++ b/bzrlib/tests/per_pack_repository.py 2010-03-18 05:24:02 +0000
@@ -288,6 +288,23 @@
repo._pack_collection._clear_obsolete_packs()
self.assertTrue(repo_transport.has('obsolete_packs/.nfsblahblah'))
+ def test_pack_collection_sets_sibling_indices(self):
+ """The CombinedGraphIndex objects in the pack collection are all
+ siblings of each other, so that search-order reorderings will be copied
+ to each other.
+ """
+ repo = self.make_repository('repo')
+ pack_coll = repo._pack_collection
+ indices = set([pack_coll.revision_index, pack_coll.inventory_index,
+ pack_coll.text_index, pack_coll.signature_index])
+ if pack_coll.chk_index is not None:
+ indices.add(pack_coll.chk_index)
+ combined_indices = set(idx.combined_index for idx in indices)
+ for combined_index in combined_indices:
+ self.assertEqual(
+ combined_indices.difference([combined_index]),
+ combined_index._sibling_indices)
+
def test_pack_after_two_commits_packs_everything(self):
format = self.get_format()
tree = self.make_branch_and_tree('.', format=format)
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2010-03-05 17:56:55 +0000
+++ b/bzrlib/tests/test_index.py 2010-04-08 07:01:10 +0000
@@ -1380,6 +1380,50 @@
self.assertListRaises(errors.NoSuchFile, index.iter_entries_prefix,
[('1',)])
+
+ def make_index_with_simple_nodes(self, name, num_nodes=1):
+ """Make an index named after 'name', with keys named after 'name' too.
+
+ Nodes will have a value of '' and no references.
+ """
+ nodes = [
+ (('index-%s-key-%s' % (name, n),), '', ())
+ for n in range(1, num_nodes+1)]
+ return self.make_index('index-%s' % name, 0, nodes=nodes)
+
+ def test_reorder_after_iter_entries(self):
+ # Four indices: [key1] in index1, [key2,key3] in index2, [] in index3,
+ # [key4] in index4.
+ index = CombinedGraphIndex([])
+ index.insert_index(0, self.make_index_with_simple_nodes('1'), '1')
+ index.insert_index(1, self.make_index_with_simple_nodes('2'), '2')
+ index.insert_index(2, self.make_index_with_simple_nodes('3'), '3')
+ index.insert_index(3, self.make_index_with_simple_nodes('4'), '4')
+ index1, index2, index3, index4 = index._indices
+ # Query a key from index4 and index2.
+ self.assertLength(2, list(index.iter_entries(
+ [('index-4-key-1',), ('index-2-key-1',)])))
+ # Now index2 and index4 should be moved to the front (and index1 should
+ # still be before index3).
+ self.assertEqual([index2, index4, index1, index3], index._indices)
+ self.assertEqual(['2', '4', '1', '3'], index._index_names)
+
+ def test_reorder_propagates_to_siblings(self):
+ # Two CombinedGraphIndex objects, with the same number of indices with
+ # matching names.
+ cgi1 = CombinedGraphIndex([])
+ cgi2 = CombinedGraphIndex([])
+ cgi1.insert_index(0, self.make_index_with_simple_nodes('1-1'), 'one')
+ cgi1.insert_index(1, self.make_index_with_simple_nodes('1-2'), 'two')
+ cgi2.insert_index(0, self.make_index_with_simple_nodes('2-1'), 'one')
+ cgi2.insert_index(1, self.make_index_with_simple_nodes('2-2'), 'two')
+ index2_1, index2_2 = cgi2._indices
+ cgi1.set_sibling_indices([cgi2])
+ # Trigger a reordering in cgi1. cgi2 will be reordered as well.
+ list(cgi1.iter_entries([('index-1-2-key-1',)]))
+ self.assertEqual([index2_2, index2_1], cgi2._indices)
+ self.assertEqual(['two', 'one'], cgi2._index_names)
+
def test_validate_reloads(self):
index, reload_counter = self.make_combined_index_with_missing()
index.validate()
More information about the bazaar-commits
mailing list