Rev 15: Actually do partial index reads - 10 fold performance improvement on bzr.dev trees. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Mon Jun 9 11:17:17 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 15
revision-id: robertc at robertcollins.net-20080609101716-jic7t43fvbeewazh
parent: robertc at robertcollins.net-20080609081415-uu3fuq592p13j97k
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-06-09 20:17:16 +1000
message:
  Actually do partial index reads - 10 fold performance improvement on bzr.dev trees.
modified:
  DESIGN                         design-20080608072426-vjoj110dtykfyb7g-1
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'DESIGN'
--- a/DESIGN	2008-06-08 11:41:20 +0000
+++ b/DESIGN	2008-06-09 10:17:16 +0000
@@ -96,6 +96,10 @@
 
 (type, file_id, versioned_id) -> nothing
 
+Note that a single index has the full posting list for any document present in
+the index - this allows optimisations where we do not examine document names
+when discarding a document.
+
 Terms index
 -----------
 

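The DESIGN note above is the property the new query path relies on: a
document's complete posting list lives in a single index component, so a
multi-term AND query can be evaluated one component at a time and the
per-component answers combined. A minimal sketch of the idea, using plain
dicts and made-up names rather than the plugin's classes:

# Illustrative only: each entry in 'components' maps term -> set of document
# keys, and is complete for every document that component contains.
def query_all(components, terms):
    hits = set()
    for postings in components:
        per_component = None
        for term in terms:
            docs = postings.get(term, set())
            per_component = docs if per_component is None else per_component & docs
        # Documents matching every term in this component; no other component
        # needs to be consulted for them.
        hits.update(per_component or set())
    return hits

# A document indexed in the first component never needs postings from the
# second component to be matched.
components = [
    {'first': set([('r', '', 'rev-1')]), 'post': set([('r', '', 'rev-1')])},
    {'first': set([('r', '', 'rev-2')])},
]
print(query_all(components, ['first', 'post']))    # only rev-1 matches
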
=== modified file 'index.py'
--- a/index.py	2008-06-09 08:14:15 +0000
+++ b/index.py	2008-06-09 10:17:16 +0000
@@ -263,6 +263,7 @@
                 self._upload_transport.rename(index_name + suffix,
                     '../indices/' + index_name + suffix)
             self._transport.put_file('names', new_names.finish())
+            self._orig_names[index_name] = (index_value, index)
         finally:
             self._lock.unlock()
         # Add in-memory
@@ -298,7 +299,7 @@
             if name not in self._orig_names:
                 added_names.add(name)
             elif name in self._current_names:
-                self.same_names.add(name)
+                same_names.add(name)
             else:
                 # in our last read; not in memory anymore:
                 deleted_names.add(name)
@@ -317,6 +318,7 @@
             self._add_index_to_memory(name, value, component)
         for name in deleted_names:
             self._remove_component_from_memory(name)
+        self._orig_names = new_names
 
     def _remove_component_from_memory(self, name):
         """Remove the component name from the index list in memory."""
@@ -332,20 +334,33 @@
         :return: An iterator of SearchResults for documents indexed by all
             terms in the termlist.
         """
-        # Slow but gets the UI up
-        terms = dict(self.all_terms())
-        candidate_documents = None
-        for term in termlist:
-            if term not in terms:
-                return
-            if candidate_documents is None:
-                candidate_documents = set(terms[term])
-            else:
-                candidate_documents.intersection_update(set(terms[term]))
-        for document in candidate_documents:
-            if document[0] == 'r':
-                # revision
-                yield RevisionHit(document[2:3])
+        self._refresh_indices()
+        found_documents = []
+        # Use a set to remove duplicates
+        term_keys = set([(term,) for term in termlist])
+        for term_index in self._term_indices:
+            doc_references = {}
+            for node in term_index.iter_entries(term_keys):
+                doc_references[node[1]] = node[2]
+            if len(doc_references) != len(term_keys):
+                # One or more terms missing
+                continue
+            # Maybe not the cheapest, but certainly easy to read the code for:
+            common_doc_ids = None
+            for reference_string in doc_references.itervalues():
+                reference_set = set(reference_string.split(' '))
+                if common_doc_ids is None:
+                    common_doc_ids = reference_set
+                else:
+                    common_doc_ids.difference_update(reference_set)
+            found_documents = [(term_index, doc_id) for doc_id in
+                common_doc_ids]
+            for doc_key in self._document_ids_to_keys(found_documents):
+                if doc_key[0] == 'r':
+                    # revision
+                    yield RevisionHit(doc_key[2:3])
+                else:
+                    raise Exception("unknown doc type")
 
     def _terms_for_revs(self, repository, revision_ids):
         """Generate the posting list for the revision texts of revision_ids.
@@ -358,7 +373,7 @@
         terms = {}
         for revision in repository.get_revisions(revision_ids):
             # its a revision, second component is ignored, third is id.
-            document_key = ('r', 'x', revision.revision_id)
+            document_key = ('r', '', revision.revision_id)
             # components of a revision:
             # parents - not indexed (but we could)
             # commit message (done)

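The index.py hunk above is where the ten-fold win comes from: instead of
materialising every term with all_terms(), each component's terms index is
asked for just the searched term keys, and only the posting lists for those
keys are pulled into memory. A rough sketch of the per-component lookup,
written against an assumed index object whose iter_entries(keys) yields
(index, key, value) nodes as the hunk uses, and combining the posting lists
with a plain intersection:

# Hedged sketch, not the plugin's actual method: 'term_index' stands in for
# one component's terms index; values are space-separated document ids.
def query_component(term_index, termlist):
    term_keys = set((term,) for term in termlist)
    doc_references = {}
    for node in term_index.iter_entries(term_keys):
        doc_references[node[1]] = node[2]
    if len(doc_references) != len(term_keys):
        # At least one term is missing from this component: no hits here.
        return set()
    # Keep only the document ids that appear in every term's posting list.
    common_doc_ids = None
    for reference_string in doc_references.values():
        reference_set = set(reference_string.split(' '))
        if common_doc_ids is None:
            common_doc_ids = reference_set
        else:
            common_doc_ids.intersection_update(reference_set)
    return common_doc_ids or set()

Requesting specific keys lets the index read only the pages that can contain
them, which is what turns the previous whole-index scan into a partial read.
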
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-09 07:37:26 +0000
+++ b/tests/test_index.py	2008-06-09 10:17:16 +0000
@@ -101,8 +101,8 @@
         # paths present
         # content of documents.
         expected_terms = {
-            'first':set([('r', 'x', revid)]),
-            'post':set([('r', 'x', revid)]),
+            'first':set([('r', '', revid)]),
+            'post':set([('r', '', revid)]),
             }
         all_terms = {}
         for term, posting_list in rev_index.all_terms():


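The test change tracks the new document key convention from _terms_for_revs:
revision documents are keyed ('r', '', revision_id), with the second component
unused for revisions and the third holding the revision id. A purely
illustrative mapping from such keys to hit objects, with a stand-in for the
plugin's RevisionHit:

# Illustrative stand-in, not the plugin's class: the query code passes
# doc_key[2:3], i.e. a one-tuple containing the revision id.
class FakeRevisionHit(object):

    def __init__(self, revision_key):
        self.revision_key = revision_key

def hit_for_key(doc_key):
    if doc_key[0] == 'r':
        # Revision document: the slice keeps the key shape used elsewhere.
        return FakeRevisionHit(doc_key[2:3])
    raise ValueError("unknown doc type: %r" % (doc_key,))

hit = hit_for_key(('r', '', 'revid-1'))
assert hit.revision_key == ('revid-1',)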