Rev 39: Refactoring: reduce code duplication between search() and suggest(). in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Wed Jun 18 01:35:59 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 39
revision-id: robertc at robertcollins.net-20080618003554-ee0vqxi1vy2swr2e
parent: robertc at robertcollins.net-20080617122537-eyv8tt7s1dki5dyw
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-06-18 10:35:54 +1000
message:
Refactoring: reduce code duplication between search() and suggest().
modified:
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
=== modified file 'index.py'
--- a/index.py 2008-06-17 12:25:37 +0000
+++ b/index.py 2008-06-18 00:35:54 +0000
@@ -425,27 +425,19 @@
"""Core worker logic for performing searches.
:param termlist: An iterable of terms to search for.
- :return: An iterator over search results in each component.
+ :return: An iterator over (component, normalised_termlist,
+ matching_document_keys). Components where the query does not hit
+ anything are not included in the iterator. Using an empty query
+ results in all components being returned but no document keys being
+ listed for each component.
"""
_ensure_regexes()
-
- def search(self, termlist):
- """Trivial set-based search of the index.
-
- :param termlist: A list of terms.
- :return: An iterator of SearchResults for documents indexed by all
- terms in the termlist.
- """
- self._search_work(None)
self._refresh_indices()
- found_documents = []
# Use a set to remove duplicates
termlist = set(termlist)
term_keys = [None, set(), set()]
for term in termlist:
term_keys[len(term)].add(term)
- if 0 == (len(term_keys[1]) + len(term_keys[2])):
- return
for value, component in self._current_names.values():
term_index = component.term_index
@@ -458,6 +450,9 @@
node[2].split(" ")
term_info.append((int(posting_count), term_id,
int(posting_start), int(posting_length)))
+ if not termlist:
+ yield component, termlist, None
+ continue
if len(term_info) != len(termlist):
# One or more terms missing - no hits are possible.
continue
@@ -484,6 +479,19 @@
post_index = GraphIndex(view, post_name, posting_length)
common_doc_keys = set([node[1] for node in
post_index.iter_entries(common_doc_keys)])
+ yield component, termlist, common_doc_keys
+
+ def search(self, termlist):
+ """Trivial set-based search of the index.
+
+ :param termlist: A list of terms.
+ :return: An iterator of SearchResults for documents indexed by all
+ terms in the termlist.
+ """
+ found_documents = []
+ if not termlist:
+ return
+ for component, termlist, common_doc_keys in self._search_work(termlist):
common_doc_ids = [key[0] for key in common_doc_keys]
found_documents = [(component, doc_id) for doc_id in
common_doc_ids]
@@ -508,73 +516,19 @@
:return: An iterator of terms that start with the last search term in
termlist, and match the rest of the search.
"""
- _ensure_regexes()
- self._refresh_indices()
found_documents = []
if not termlist:
return
suggest_term = termlist[-1]
suggestions = set()
- # Use a set to remove duplicates
- termlist = set(termlist[:-1])
- term_keys = [None, set(), set()]
- for term in termlist:
- term_keys[len(term)].add(term)
-
- for value, component in self._current_names.values():
- term_index = component.term_index
- # TODO: push into Component
- # TODO: use a dequeue?
- term_info = []
- for node in chain(term_index.iter_entries(term_keys[1]),
- component.term_2_index.iter_entries(term_keys[2])):
- term_id, posting_count, posting_start, posting_length = \
- node[2].split(" ")
- term_info.append((int(posting_count), term_id,
- int(posting_start), int(posting_length)))
- if not termlist:
- # no terms to search for other than the suggestion:
- found_documents = []
- elif len(term_info) != len(termlist):
- # One or more terms missing - no hits are possible.
- continue
- else:
- # filter down to some specific document ids
- # load the first document list:
- term_info.sort()
- _, term_id, posting_start, posting_length = term_info.pop(0)
- posting_stop = posting_start + posting_length
- post_name = "term_list." + term_id
- filemap = {post_name:(posting_start, posting_stop)}
- view = FileView(self._indices_transport, component.name + '.pack',
- filemap)
- post_index = GraphIndex(view, post_name, posting_length)
- common_doc_keys = set([node[1] for node in
- post_index.iter_all_entries()])
- # Now we whittle down the nodes we need - still going in sorted
- # order. (possibly doing concurrent reduction would be better).
- while common_doc_keys and term_info:
- _, term_id, posting_start, posting_length = term_info.pop(0)
- posting_stop = posting_start + posting_length
- post_name = "term_list." + term_id
- filemap = {post_name:(posting_start, posting_stop)}
- view = FileView(self._indices_transport,
- component.name + '.pack', filemap)
- post_index = GraphIndex(view, post_name, posting_length)
- common_doc_keys = set([node[1] for node in
- post_index.iter_entries(common_doc_keys)])
- common_doc_ids = [key[0] for key in common_doc_keys]
- found_documents = [(component, doc_id) for doc_id in
- common_doc_ids]
+ for component, termlist, common_doc_keys in self._search_work(termlist[:-1]):
if len(suggest_term) == 1:
- suggest_index = term_index
+ suggest_index = component.term_index
else:
suggest_index = component.term_2_index
for node in suggest_index.iter_entries_starts_with(suggest_term):
suggestion = node[1]
- if found_documents:
- # Friction: why don't we keep them as keys
- common_doc_keys = [(doc[1],) for doc in found_documents]
+ if common_doc_keys:
term_id, posting_count, posting_start, posting_length = \
node[2].split(" ")
posting_count = int(posting_count)
More information about the bazaar-commits
mailing list