Rev 39: Refactoring: reduce code duplication between search() and suggest(). in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Wed Jun 18 01:35:59 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 39
revision-id: robertc at robertcollins.net-20080618003554-ee0vqxi1vy2swr2e
parent: robertc at robertcollins.net-20080617122537-eyv8tt7s1dki5dyw
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-06-18 10:35:54 +1000
message:
  Refactoring: reduce code duplication between search() and suggest().
modified:
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
=== modified file 'index.py'
--- a/index.py	2008-06-17 12:25:37 +0000
+++ b/index.py	2008-06-18 00:35:54 +0000
@@ -425,27 +425,19 @@
         """Core worker logic for performing searches.
         
         :param termlist: An iterable of terms to search for.
-        :return: An iterator over search results in each component.
+        :return: An iterator over (component, normalised_termlist,
+            matching_document_keys). Components where the query does not hit
+            anything are not included in the iterator. Using an empty query
+            results in all components being returned but no document keys being
+            listed for each component.
         """
         _ensure_regexes()
-
-    def search(self, termlist):
-        """Trivial set-based search of the index.
-
-        :param termlist: A list of terms.
-        :return: An iterator of SearchResults for documents indexed by all
-            terms in the termlist.
-        """
-        self._search_work(None)
         self._refresh_indices()
-        found_documents = []
         # Use a set to remove duplicates
         termlist = set(termlist)
         term_keys = [None, set(), set()]
         for term in termlist:
             term_keys[len(term)].add(term)
-        if 0 == (len(term_keys[1]) + len(term_keys[2])):
-            return
 
         for value, component in self._current_names.values():
             term_index = component.term_index
@@ -458,6 +450,9 @@
                     node[2].split(" ")
                 term_info.append((int(posting_count), term_id,
                     int(posting_start), int(posting_length)))
+            if not termlist:
+                yield component, termlist, None
+                continue
             if len(term_info) != len(termlist):
                 # One or more terms missing - no hits are possible.
                 continue
@@ -484,6 +479,19 @@
                 post_index = GraphIndex(view, post_name, posting_length)
                 common_doc_keys = set([node[1] for node in
                     post_index.iter_entries(common_doc_keys)])
+            yield component, termlist, common_doc_keys
+
+    def search(self, termlist):
+        """Trivial set-based search of the index.
+
+        :param termlist: A list of terms.
+        :return: An iterator of SearchResults for documents indexed by all
+            terms in the termlist.
+        """
+        found_documents = []
+        if not termlist:
+            return
+        for component, termlist, common_doc_keys in self._search_work(termlist):
             common_doc_ids = [key[0] for key in common_doc_keys]
             found_documents = [(component, doc_id) for doc_id in
                 common_doc_ids]
@@ -508,73 +516,19 @@
         :return: An iterator of terms that start with the last search term in
             termlist, and match the rest of the search.
         """
-        _ensure_regexes()
-        self._refresh_indices()
         found_documents = []
         if not termlist:
             return
         suggest_term = termlist[-1]
         suggestions = set()
-        # Use a set to remove duplicates
-        termlist = set(termlist[:-1])
-        term_keys = [None, set(), set()]
-        for term in termlist:
-            term_keys[len(term)].add(term)
-
-        for value, component in self._current_names.values():
-            term_index = component.term_index
-            # TODO: push into Component
-            # TODO: use a dequeue?
-            term_info = []
-            for node in chain(term_index.iter_entries(term_keys[1]),
-                component.term_2_index.iter_entries(term_keys[2])):
-                term_id, posting_count, posting_start, posting_length = \
-                    node[2].split(" ")
-                term_info.append((int(posting_count), term_id,
-                    int(posting_start), int(posting_length)))
-            if not termlist:
-                # no terms to search for other than the suggestion:
-                found_documents = []
-            elif len(term_info) != len(termlist):
-                # One or more terms missing - no hits are possible.
-                continue
-            else:
-                # filter down to some specific document ids
-                # load the first document list: 
-                term_info.sort()
-                _, term_id, posting_start, posting_length = term_info.pop(0)
-                posting_stop = posting_start + posting_length
-                post_name = "term_list." + term_id
-                filemap = {post_name:(posting_start, posting_stop)}
-                view = FileView(self._indices_transport, component.name + '.pack',
-                    filemap)
-                post_index = GraphIndex(view, post_name, posting_length)
-                common_doc_keys = set([node[1] for node in
-                    post_index.iter_all_entries()])
-                # Now we whittle down the nodes we need - still going in sorted
-                # order. (possibly doing concurrent reduction would be better).
-                while common_doc_keys and term_info:
-                    _, term_id, posting_start, posting_length = term_info.pop(0)
-                    posting_stop = posting_start + posting_length
-                    post_name = "term_list." + term_id
-                    filemap = {post_name:(posting_start, posting_stop)}
-                    view = FileView(self._indices_transport,
-                        component.name + '.pack', filemap)
-                    post_index = GraphIndex(view, post_name, posting_length)
-                    common_doc_keys = set([node[1] for node in
-                        post_index.iter_entries(common_doc_keys)])
-                common_doc_ids = [key[0] for key in common_doc_keys]
-                found_documents = [(component, doc_id) for doc_id in
-                    common_doc_ids]
+        for component, termlist, common_doc_keys in self._search_work(termlist[:-1]):
             if len(suggest_term) == 1:
-                suggest_index = term_index
+                suggest_index = component.term_index
             else:
                 suggest_index = component.term_2_index
             for node in suggest_index.iter_entries_starts_with(suggest_term):
                 suggestion = node[1]
-                if found_documents:
-                    # Friction: why don't we keep them as keys
-                    common_doc_keys = [(doc[1],) for doc in found_documents]
+                if common_doc_keys:
                     term_id, posting_count, posting_start, posting_length = \
                         node[2].split(" ")
                     posting_count = int(posting_count)




More information about the bazaar-commits mailing list