Rev 5141: (andrew) Optimise index lookups in repositories with many pack files. in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Thu Apr 8 09:49:05 BST 2010
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5141 [merge]
revision-id: pqm at pqm.ubuntu.com-20100408084859-lbi26gvsu2rtz370
parent: pqm at pqm.ubuntu.com-20100408073248-aj7k8qkvbv4nzlxd
parent: andrew.bennetts at canonical.com-20100408070923-pp4d8xephvwtcsw2
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Thu 2010-04-08 09:48:59 +0100
message:
(andrew) Optimise index lookups in repositories with many pack files.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/tests/per_pack_repository.py test_pack_repository-20080801043947-eaw0e6h2gu75kwmy-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'NEWS'
--- a/NEWS 2010-04-08 07:32:48 +0000
+++ b/NEWS 2010-04-08 08:48:59 +0000
@@ -38,6 +38,12 @@
generated by a template and not edited by the user.
(Robert Collins, #530265)
+* Index lookups in pack repositories search recently hit pack files first.
+ In repositories with many pack files this can greatly reduce the
+ number of files accessed, the number of bytes read, and the number of
+ read calls. An incremental pull via plain HTTP takes half the time and
+ bytes for a moderately large repository. (Andrew Bennetts)
+
* Less code is loaded at startup. (Cold-cache start time is about 10-20%
less.)
(Martin Pool, #553017)
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2010-03-05 17:56:55 +0000
+++ b/bzrlib/index.py 2010-04-08 07:01:10 +0000
@@ -1245,10 +1245,15 @@
static data.
Queries against the combined index will be made against the first index,
- and then the second and so on. The order of index's can thus influence
+ and then the second and so on. The order of indices can thus influence
performance significantly. For example, if one index is on local disk and a
second on a remote server, the local disk index should be before the other
in the index list.
+
+ Also, queries tend to need results from the same indices as previous
+ queries. So the indices will be reordered after every query to put the
+ indices that had the result(s) of that query first (while otherwise
+ preserving the relative ordering).
"""
def __init__(self, indices, reload_func=None):
@@ -1261,6 +1266,13 @@
"""
self._indices = indices
self._reload_func = reload_func
+ # Sibling indices are other CombinedGraphIndex that we should call
+ # _move_to_front_by_name on when we auto-reorder ourselves.
+ self._sibling_indices = []
+ # A list of names that corresponds to the instances in self._indices,
+ # so _index_names[0] is always the name for _indices[0], etc. Sibling
+ # indices must all use the same set of names as each other.
+ self._index_names = [None] * len(self._indices)
def __repr__(self):
return "%s(%s)" % (
@@ -1289,13 +1301,17 @@
has_key = _has_key_from_parent_map
- def insert_index(self, pos, index):
+ def insert_index(self, pos, index, name=None):
"""Insert a new index in the list of indices to query.
:param pos: The position to insert the index.
:param index: The index to insert.
+ :param name: a name for this index, e.g. a pack name. These names can
+ be used to reflect index reorderings to related CombinedGraphIndex
+ instances that use the same names. (see set_sibling_indices)
"""
self._indices.insert(pos, index)
+ self._index_names.insert(pos, name)
def iter_all_entries(self):
"""Iterate over all keys within the index
@@ -1326,22 +1342,28 @@
value and are only reported once.
:param keys: An iterable providing the keys to be retrieved.
- :return: An iterable of (index, key, reference_lists, value). There is no
- defined order for the result iteration - it will be in the most
+ :return: An iterable of (index, key, reference_lists, value). There is
+ no defined order for the result iteration - it will be in the most
efficient order for the index.
"""
keys = set(keys)
+ hit_indices = []
while True:
try:
for index in self._indices:
if not keys:
- return
+ break
+ index_hit = False
for node in index.iter_entries(keys):
keys.remove(node[1])
yield node
- return
+ index_hit = True
+ if index_hit:
+ hit_indices.append(index)
+ break
except errors.NoSuchFile:
self._reload_or_raise()
+ self._move_to_front(hit_indices)
def iter_entries_prefix(self, keys):
"""Iterate over keys within the index using prefix matching.
@@ -1367,17 +1389,77 @@
if not keys:
return
seen_keys = set()
+ hit_indices = []
while True:
try:
for index in self._indices:
+ index_hit = False
for node in index.iter_entries_prefix(keys):
if node[1] in seen_keys:
continue
seen_keys.add(node[1])
yield node
- return
+ index_hit = True
+ if index_hit:
+ hit_indices.append(index)
+ break
except errors.NoSuchFile:
self._reload_or_raise()
+ self._move_to_front(hit_indices)
+
+ def _move_to_front(self, hit_indices):
+ """Rearrange self._indices so that hit_indices are first.
+
+ Order is maintained as much as possible, e.g. the first unhit index
+ will be the first index in _indices after the hit_indices, and the
+ hit_indices will be present in exactly the order they are passed to
+ _move_to_front.
+
+ _move_to_front propagates to all objects in self._sibling_indices by
+ calling _move_to_front_by_name.
+ """
+ hit_names = self._move_to_front_by_index(hit_indices)
+ for sibling_idx in self._sibling_indices:
+ sibling_idx._move_to_front_by_name(hit_names)
+
+ def _move_to_front_by_index(self, hit_indices):
+ """Core logic for _move_to_front.
+
+ Returns a list of names corresponding to the hit_indices param.
+ """
+ indices_info = zip(self._index_names, self._indices)
+ if 'index' in debug.debug_flags:
+ mutter('CombinedGraphIndex reordering: currently %r, promoting %r',
+ indices_info, hit_indices)
+ hit_indices_info = []
+ hit_names = []
+ unhit_indices_info = []
+ for name, idx in indices_info:
+ if idx in hit_indices:
+ info = hit_indices_info
+ hit_names.append(name)
+ else:
+ info = unhit_indices_info
+ info.append((name, idx))
+ final_info = hit_indices_info + unhit_indices_info
+ self._indices = [idx for (name, idx) in final_info]
+ self._index_names = [name for (name, idx) in final_info]
+ if 'index' in debug.debug_flags:
+ mutter('CombinedGraphIndex reordered: %r', self._indices)
+ return hit_names
+
+ def _move_to_front_by_name(self, hit_names):
+ """Moves indices named by 'hit_names' to front of the search order, as
+ described in _move_to_front.
+ """
+ # Translate names to index instances, and then call
+ # _move_to_front_by_index.
+ indices_info = zip(self._index_names, self._indices)
+ hit_indices = []
+ for name, idx in indices_info:
+ if name in hit_names:
+ hit_indices.append(idx)
+ self._move_to_front_by_index(hit_indices)
def find_ancestry(self, keys, ref_list_num):
"""Find the complete ancestry for the given set of keys.
@@ -1390,6 +1472,7 @@
we care about.
:return: (parent_map, missing_keys)
"""
+ # XXX: make this call _move_to_front?
missing_keys = set()
parent_map = {}
keys_to_lookup = set(keys)
@@ -1475,6 +1558,11 @@
' Raising original exception.')
raise exc_type, exc_value, exc_traceback
+ def set_sibling_indices(self, sibling_combined_graph_indices):
+ """Set the CombinedGraphIndex objects to reorder after reordering self.
+ """
+ self._sibling_indices = sibling_combined_graph_indices
+
def validate(self):
"""Validate that everything in the index can be accessed."""
while True:
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2010-02-12 11:58:21 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2010-03-17 11:37:38 +0000
@@ -587,26 +587,6 @@
flush_func=flush_func)
self.add_callback = None
- def replace_indices(self, index_to_pack, indices):
- """Replace the current mappings with fresh ones.
-
- This should probably not be used eventually, rather incremental add and
- removal of indices. It has been added during refactoring of existing
- code.
-
- :param index_to_pack: A mapping from index objects to
- (transport, name) tuples for the pack file data.
- :param indices: A list of indices.
- """
- # refresh the revision pack map dict without replacing the instance.
- self.index_to_pack.clear()
- self.index_to_pack.update(index_to_pack)
- # XXX: API break - clearly a 'replace' method would be good?
- self.combined_index._indices[:] = indices
- # the current add nodes callback for the current writable index if
- # there is one.
- self.add_callback = None
-
def add_index(self, index, pack):
"""Add index to the aggregate, which is an index for Pack pack.
@@ -619,7 +599,7 @@
# expose it to the index map
self.index_to_pack[index] = pack.access_tuple()
# put it at the front of the linear index list
- self.combined_index.insert_index(0, index)
+ self.combined_index.insert_index(0, index, pack.name)
def add_writable_index(self, index, pack):
"""Add an index which is able to have data added to it.
@@ -645,6 +625,7 @@
self.data_access.set_writer(None, None, (None, None))
self.index_to_pack.clear()
del self.combined_index._indices[:]
+ del self.combined_index._index_names[:]
self.add_callback = None
def remove_index(self, index):
@@ -653,7 +634,9 @@
:param index: An index from the pack parameter.
"""
del self.index_to_pack[index]
- self.combined_index._indices.remove(index)
+ pos = self.combined_index._indices.index(index)
+ del self.combined_index._indices[pos]
+ del self.combined_index._index_names[pos]
if (self.add_callback is not None and
getattr(index, 'add_nodes', None) == self.add_callback):
self.add_callback = None
@@ -1415,11 +1398,20 @@
self.inventory_index = AggregateIndex(self.reload_pack_names, flush)
self.text_index = AggregateIndex(self.reload_pack_names, flush)
self.signature_index = AggregateIndex(self.reload_pack_names, flush)
+ all_indices = [self.revision_index, self.inventory_index,
+ self.text_index, self.signature_index]
if use_chk_index:
self.chk_index = AggregateIndex(self.reload_pack_names, flush)
+ all_indices.append(self.chk_index)
else:
# used to determine if we're using a chk_index elsewhere.
self.chk_index = None
+ # Tell all the CombinedGraphIndex objects about each other, so they can
+ # share hints about which pack names to search first.
+ all_combined = [agg_idx.combined_index for agg_idx in all_indices]
+ for combined_idx in all_combined:
+ combined_idx.set_sibling_indices(
+ set(all_combined).difference([combined_idx]))
# resumed packs
self._resumed_packs = []
=== modified file 'bzrlib/tests/per_pack_repository.py'
--- a/bzrlib/tests/per_pack_repository.py 2010-02-23 07:43:11 +0000
+++ b/bzrlib/tests/per_pack_repository.py 2010-03-18 05:24:02 +0000
@@ -288,6 +288,23 @@
repo._pack_collection._clear_obsolete_packs()
self.assertTrue(repo_transport.has('obsolete_packs/.nfsblahblah'))
+ def test_pack_collection_sets_sibling_indices(self):
+ """The CombinedGraphIndex objects in the pack collection are all
+ siblings of each other, so that search-order reorderings will be copied
+ to each other.
+ """
+ repo = self.make_repository('repo')
+ pack_coll = repo._pack_collection
+ indices = set([pack_coll.revision_index, pack_coll.inventory_index,
+ pack_coll.text_index, pack_coll.signature_index])
+ if pack_coll.chk_index is not None:
+ indices.add(pack_coll.chk_index)
+ combined_indices = set(idx.combined_index for idx in indices)
+ for combined_index in combined_indices:
+ self.assertEqual(
+ combined_indices.difference([combined_index]),
+ combined_index._sibling_indices)
+
def test_pack_after_two_commits_packs_everything(self):
format = self.get_format()
tree = self.make_branch_and_tree('.', format=format)
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2010-03-05 17:56:55 +0000
+++ b/bzrlib/tests/test_index.py 2010-04-08 07:01:10 +0000
@@ -1380,6 +1380,50 @@
self.assertListRaises(errors.NoSuchFile, index.iter_entries_prefix,
[('1',)])
+
+ def make_index_with_simple_nodes(self, name, num_nodes=1):
+ """Make an index named after 'name', with keys named after 'name' too.
+
+ Nodes will have a value of '' and no references.
+ """
+ nodes = [
+ (('index-%s-key-%s' % (name, n),), '', ())
+ for n in range(1, num_nodes+1)]
+ return self.make_index('index-%s' % name, 0, nodes=nodes)
+
+ def test_reorder_after_iter_entries(self):
+ # Four indices: [key1] in index1, [key2,key3] in index2, [] in index3,
+ # [key4] in index4.
+ index = CombinedGraphIndex([])
+ index.insert_index(0, self.make_index_with_simple_nodes('1'), '1')
+ index.insert_index(1, self.make_index_with_simple_nodes('2'), '2')
+ index.insert_index(2, self.make_index_with_simple_nodes('3'), '3')
+ index.insert_index(3, self.make_index_with_simple_nodes('4'), '4')
+ index1, index2, index3, index4 = index._indices
+ # Query a key from index4 and index2.
+ self.assertLength(2, list(index.iter_entries(
+ [('index-4-key-1',), ('index-2-key-1',)])))
+ # Now index2 and index4 should be moved to the front (and index1 should
+ # still be before index3).
+ self.assertEqual([index2, index4, index1, index3], index._indices)
+ self.assertEqual(['2', '4', '1', '3'], index._index_names)
+
+ def test_reorder_propagates_to_siblings(self):
+ # Two CombinedGraphIndex objects, with the same number of indices with
+ # matching names.
+ cgi1 = CombinedGraphIndex([])
+ cgi2 = CombinedGraphIndex([])
+ cgi1.insert_index(0, self.make_index_with_simple_nodes('1-1'), 'one')
+ cgi1.insert_index(1, self.make_index_with_simple_nodes('1-2'), 'two')
+ cgi2.insert_index(0, self.make_index_with_simple_nodes('2-1'), 'one')
+ cgi2.insert_index(1, self.make_index_with_simple_nodes('2-2'), 'two')
+ index2_1, index2_2 = cgi2._indices
+ cgi1.set_sibling_indices([cgi2])
+ # Trigger a reordering in cgi1. cgi2 will be reordered as well.
+ list(cgi1.iter_entries([('index-1-2-key-1',)]))
+ self.assertEqual([index2_2, index2_1], cgi2._indices)
+ self.assertEqual(['two', 'one'], cgi2._index_names)
+
def test_validate_reloads(self):
index, reload_counter = self.make_combined_index_with_missing()
index.validate()
More information about the bazaar-commits
mailing list