Rev 15: Actually do partial index reads - 10 fold performance improvement on bzr.dev trees. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Mon Jun 9 11:17:17 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 15
revision-id: robertc at robertcollins.net-20080609101716-jic7t43fvbeewazh
parent: robertc at robertcollins.net-20080609081415-uu3fuq592p13j97k
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-06-09 20:17:16 +1000
message:
Actually do partial index reads - 10 fold performance improvement on bzr.dev trees.
modified:
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-08 11:41:20 +0000
+++ b/DESIGN 2008-06-09 10:17:16 +0000
@@ -96,6 +96,10 @@
(type, file_id, versioned_id) -> nothing
+Note that a single index has the full posting list for any document present in
+the index - this allows optimisations where we do not examine document names
+during discarding the document.
+
Terms index
-----------
=== modified file 'index.py'
--- a/index.py 2008-06-09 08:14:15 +0000
+++ b/index.py 2008-06-09 10:17:16 +0000
@@ -263,6 +263,7 @@
self._upload_transport.rename(index_name + suffix,
'../indices/' + index_name + suffix)
self._transport.put_file('names', new_names.finish())
+ self._orig_names[index_name] = (index_value, index)
finally:
self._lock.unlock()
# Add in-memory
@@ -298,7 +299,7 @@
if name not in self._orig_names:
added_names.add(name)
elif name in self._current_names:
- self.same_names.add(name)
+ same_names.add(name)
else:
# in our last read; not in memory anymore:
deleted_names.add(name)
@@ -317,6 +318,7 @@
self._add_index_to_memory(name, value, component)
for name in deleted_names:
self._remove_component_from_memory(name)
+ self._orig_names = new_names
def _remove_component_from_memory(self, name):
"""Remove the component name from the index list in memory."""
@@ -332,20 +334,33 @@
:return: An iterator of SearchResults for documents indexed by all
terms in the termlist.
"""
- # Slow but gets the UI up
- terms = dict(self.all_terms())
- candidate_documents = None
- for term in termlist:
- if term not in terms:
- return
- if candidate_documents is None:
- candidate_documents = set(terms[term])
- else:
- candidate_documents.intersection_update(set(terms[term]))
- for document in candidate_documents:
- if document[0] == 'r':
- # revision
- yield RevisionHit(document[2:3])
+ self._refresh_indices()
+ found_documents = []
+ # Use a set to remove duplicates
+ term_keys = set([(term,) for term in termlist])
+ for term_index in self._term_indices:
+ doc_references = {}
+ for node in term_index.iter_entries(term_keys):
+ doc_references[node[1]] = node[2]
+ if len(doc_references) != len(term_keys):
+ # One or more terms missing
+ continue
+ # Maybe not the cheapest, but certainly easy to read the code for:
+ common_doc_ids = None
+ for reference_string in doc_references.itervalues():
+ reference_set = set(reference_string.split(' '))
+ if common_doc_ids is None:
+ common_doc_ids = reference_set
+ else:
+ common_doc_ids.difference_update(reference_set)
+ found_documents = [(term_index, doc_id) for doc_id in
+ common_doc_ids]
+ for doc_key in self._document_ids_to_keys(found_documents):
+ if doc_key[0] == 'r':
+ # revision
+ yield RevisionHit(doc_key[2:3])
+ else:
+ raise Exception("unknown doc type")
def _terms_for_revs(self, repository, revision_ids):
"""Generate the posting list for the revision texts of revision_ids.
@@ -358,7 +373,7 @@
terms = {}
for revision in repository.get_revisions(revision_ids):
# its a revision, second component is ignored, third is id.
- document_key = ('r', 'x', revision.revision_id)
+ document_key = ('r', '', revision.revision_id)
# components of a revision:
# parents - not indexed (but we could)
# commit message (done)
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-09 07:37:26 +0000
+++ b/tests/test_index.py 2008-06-09 10:17:16 +0000
@@ -101,8 +101,8 @@
# paths present
# content of documents.
expected_terms = {
- 'first':set([('r', 'x', revid)]),
- 'post':set([('r', 'x', revid)]),
+ 'first':set([('r', '', revid)]),
+ 'post':set([('r', '', revid)]),
}
all_terms = {}
for term, posting_list in rev_index.all_terms():
More information about the bazaar-commits
mailing list