Rev 64: Get format 2 using btree indices (but suggestion will fail at this point). in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Tue Dec 2 21:06:34 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 64
revision-id: robertc at robertcollins.net-20081202210630-zn6zbcm7lfkqi61x
parent: robertc at robertcollins.net-20081202205604-307gpsaw7whse1b2
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-12-03 08:06:30 +1100
message:
Get format 2 using btree indices (but suggestion will fail at this point).
modified:
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'index.py'
--- a/index.py 2008-12-02 20:56:04 +0000
+++ b/index.py 2008-12-02 21:06:30 +0000
@@ -71,11 +71,8 @@
_FORMAT_1 = 'bzr-search search folder 1\n'
_FORMAT_2 = 'bzr-search search folder 2\n'
-_FORMATS = {
- # format: index builder, index reader, index deletes
- _FORMAT_1:(InMemoryGraphIndex, GraphIndex, False),
- _FORMAT_2:(BTreeBuilder, BTreeGraphIndex, True)
- }
+# _FORMATS definitions are at the end of the module, so that they can use index
+# subclasses.
_tokeniser_re = None
@@ -407,7 +404,7 @@
raise Exception(
"md5 collision with concurrent writer! rad! %s" % index_name)
# Serialise the index list
- new_names = InMemoryGraphIndex(0, 1)
+ new_names = self._format[0](0, 1)
new_names.add_node((index_name,), index_value, ())
for name, (value, index) in self._current_names.items():
new_names.add_node((name,), value, ())
@@ -456,7 +453,8 @@
# B) we don't want to regress infinitely; a flag to _add_index would do
# this.
# C) We need to remove components too.
- combiner = ComponentCombiner(components, self._upload_transport)
+ combiner = ComponentCombiner(self._format, components,
+ self._upload_transport)
self._add_index(combiner, to_remove=components, allow_pack=False)
def _add_index_to_memory(self, name, value, index):
@@ -477,7 +475,7 @@
:param to_remove: An optional list of components to remove from memory
even if they are still listed on disk.
"""
- names = GraphIndex(self._transport, 'names', None)
+ names = self._format[1](self._transport, 'names', None)
new_names = {}
merged_names = {}
deleted_names = set()
@@ -578,7 +576,7 @@
filemap = {post_name:(posting_start, posting_stop)}
view = FileView(self._indices_transport, component.name + '.pack',
filemap)
- post_index = GraphIndex(view, post_name, posting_length)
+ post_index = self._format[1](view, post_name, posting_length)
common_doc_keys = set([node[1] for node in
post_index.iter_all_entries()])
# Now we whittle down the nodes we need - still going in sorted
@@ -637,7 +635,7 @@
filemap = {post_name:(posting_start, posting_stop)}
view = FileView(self._indices_transport,
component.name + '.pack', filemap)
- post_index = GraphIndex(view, post_name, posting_length)
+ post_index = self._format[1](view, post_name, posting_length)
return set([node[1] for node in post_index.iter_entries(key_filter)])
def suggest(self, termlist):
@@ -670,7 +668,7 @@
filemap = {post_name:(posting_start, posting_stop)}
view = FileView(self._indices_transport,
component.name + '.pack', filemap)
- post_index = GraphIndex(view, post_name, posting_length)
+ post_index = self._format[1](view, post_name, posting_length)
common_doc_keys = set([node[1] for node in
post_index.iter_entries(common_doc_keys)])
if len(common_doc_keys):
@@ -907,18 +905,18 @@
"documents": (lengths[4], lengths[4] + lengths[5]),
"terms_2": (lengths[6], lengths[6] + lengths[7]),
}
+ self._format = format
view = FileView(transport, name + '.pack', filemap)
- rev_index = GraphIndex(view, "revisions", lengths[1])
- term_index = SuggestableGraphIndex(view, "terms", lengths[3])
- term_2_index = GraphIndex(view, "terms_2", lengths[7])
- doc_index = GraphIndex(view, "documents", lengths[5])
+ rev_index = self._format[1](view, "revisions", lengths[1])
+ term_index = self._format[1](view, "terms", lengths[3])
+ term_2_index = self._format[1](view, "terms_2", lengths[7])
+ doc_index = self._format[1](view, "documents", lengths[5])
self.revision_index = rev_index
self.term_index = term_index
self.term_2_index = term_2_index
self.document_index = doc_index
self.name = name
self.transport = transport
- self.format = format
def all_terms(self):
"""As per Index, but for a single component."""
@@ -936,7 +934,7 @@
post_name = "term_list." + term_id
filemap = {post_name:(posting_start, posting_stop)}
view = FileView(self.transport, self.name + '.pack', filemap)
- post_index = GraphIndex(view, post_name, posting_length)
+ post_index = self._format[1](view, post_name, posting_length)
doc_ids = set([node[1] for node in
post_index.iter_all_entries()])
posting_list = set(self._document_ids_to_keys(doc_ids))
@@ -1003,12 +1001,12 @@
"""Creates a component index."""
def __init__(self, format):
- self.document_index = InMemoryGraphIndex(0, 1)
+ self.document_index = format[0](0, 1)
self._document_ids = {}
self.terms = {}
- self.revision_index = InMemoryGraphIndex(0, 1)
+ self.revision_index = format[0](0, 1)
self.posting_lists = {}
- self.format = format
+ self._format = format
def add_term(self, term, posting_list):
"""Add a term to the index.
@@ -1082,11 +1080,11 @@
# generate a new term index with the length of the serialised posting
# lists.
term_indices = {}
- term_indices[1] = InMemoryGraphIndex(0, 1)
- term_indices[2] = InMemoryGraphIndex(0, 2)
+ term_indices[1] = self._format[0](0, 1)
+ term_indices[2] = self._format[0](0, 2)
for term, term_id in self.terms.iteritems():
posting_list = self.posting_lists[term_id]
- post_index = InMemoryGraphIndex(0, 1)
+ post_index = self._format[0](0, 1)
for doc_id in posting_list:
post_index.add_node((doc_id,), "", ())
posting_name = "term_list." + term_id
@@ -1119,9 +1117,10 @@
class ComponentCombiner(ComponentCreator):
"""Combines components into a new single larger component."""
- def __init__(self, components, transport):
+ def __init__(self, format, components, transport):
"""Create a combiner.
+ :param format: The format of component to create.
:param components: An iterable of components.
:param transport: A transport to upload the combined component to.
:return: A tuple - the component name, the value for the names file,
@@ -1129,6 +1128,7 @@
"""
self.components = list(components)
self.transport = transport
+ self._format = format
def _copy_documents(self):
"""Copy the document references from components to a new component.
@@ -1137,7 +1137,7 @@
component's document ids to the output document ids.
"""
self._document_ids = {}
- self.document_index = InMemoryGraphIndex(0, 1)
+ self.document_index = self._format[0](0, 1)
self.component_docids = {}
for component in self.components:
component_docs = {}
@@ -1161,8 +1161,8 @@
and self.terms to determine what to copy from.
It populates self.term_index as it progresses.
"""
- term_indices = {1:InMemoryGraphIndex(0, 1),
- 2:InMemoryGraphIndex(0, 2)
+ term_indices = {1:self._format[0](0, 1),
+ 2:self._format[0](0, 2)
}
for term, posting_lists in self.terms.iteritems():
posting_list = set()
@@ -1176,11 +1176,11 @@
filemap = {post_name:(posting_start, posting_stop)}
view = FileView(component.transport,
component.name + '.pack', filemap)
- post_index = GraphIndex(view, post_name, posting_length)
+ post_index = self._format[1](view, post_name, posting_length)
doc_mapping = self.component_docids[component]
for node in post_index.iter_all_entries():
posting_list.add(doc_mapping[node[1]])
- post_index = InMemoryGraphIndex(0, 1)
+ post_index = self._format[0](0, 1)
for doc_id in posting_list:
post_index.add_node((doc_id,), '', ())
term_id = str(term_indices[1].key_count() +
@@ -1207,7 +1207,7 @@
for component in self.components:
for node in component.revision_index.iter_all_entries():
revisions.add(node[1])
- revision_index = InMemoryGraphIndex(0, 1)
+ revision_index = self._format[0](0, 1)
for revision in revisions:
revision_index.add_node(revision, '', ())
index_file = revision_index.finish()
@@ -1442,3 +1442,10 @@
if _tokeniser_re.search(regex):
return None
return [(regex,)]
+
+
+_FORMATS = {
+ # format: index builder, index reader, index deletes
+ _FORMAT_1:(InMemoryGraphIndex, SuggestableGraphIndex, False),
+ _FORMAT_2:(BTreeBuilder, BTreeGraphIndex, True)
+ }
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-12-02 20:56:04 +0000
+++ b/tests/test_index.py 2008-12-02 21:06:30 +0000
@@ -561,7 +561,7 @@
name, value, elements = builder.upload_index(transport)
component2 = index.ComponentIndex(self.format, name, value, transport)
components.append(component2)
- combiner = index.ComponentCombiner(components, transport)
+ combiner = index.ComponentCombiner(self.format, components, transport)
name, value, elements = combiner.combine()
combined = index.ComponentIndex(self.format, name, value, transport)
terms = {}
@@ -589,7 +589,7 @@
name, value, elements = builder.upload_index(transport)
component2 = index.ComponentIndex(self.format, name, value, transport)
components.append(component2)
- combiner = index.ComponentCombiner(components, transport)
+ combiner = index.ComponentCombiner(self.format, components, transport)
name, value, elements = combiner.combine()
combined = index.ComponentIndex(self.format, name, value, transport)
terms = {('file', 'revid'): set([('p', '', 'file path')])}
More information about the bazaar-commits
mailing list