Rev 33: New format bump, index the path of found documents to allow presenting the path on file text hits. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sat Jun 14 02:41:49 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 33
revision-id: robertc at robertcollins.net-20080614014142-qrfj2rgsqeqjdxv5
parent: robertc at robertcollins.net-20080613114812-oi86vamc6qh8tkt4
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-14 11:41:42 +1000
message:
New format bump, index the path of found documents to allow presenting the path on file text hits.
modified:
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
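For orientation, a minimal sketch (not part of the commit) of what the new path indexing provides: each indexed file text gains a two-element term (file_id, revision_id) whose posting list holds a path document ('p', '', path), so a file-text hit can resolve its display path by searching for that term. The helper name path_for_text below is hypothetical; search() and PathHit.document_name() are from the diff that follows.

def path_for_text(search_index, file_id, revision_id):
    """Resolve the indexed path for (file_id, revision_id), if any."""
    # search() takes an iterable of terms; the (file_id, revision_id)
    # term's only documents are path documents, which come back as PathHit.
    for hit in search_index.search([(file_id, revision_id)]):
        return hit.document_name()
    return None
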
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-13 11:48:12 +0000
+++ b/DESIGN 2008-06-14 01:41:42 +0000
@@ -47,7 +47,7 @@
* Mapping fileid, revisionid search hits back to paths for display during hits
on documents. This appears to need a concordance of the entire inventory
- state.
+ state for every file hit we can produce.
* Searching for a path ("show me the file README.txt")
* Accelerating per-file log (by identifying path-change events which the
file graph index elides, and-or
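The _terms_for_file_terms method added below implements this concordance by loading inventories in groups. A rough standalone sketch (hypothetical helper, not part of the commit) of the same idea, using the repository.iter_inventories and inventory.id2path calls the diff itself relies on:

def paths_for_hits(repository, fileids_by_revision):
    """Map (file_id, revision_id) -> path for every hit.

    fileids_by_revision: dict of revision_id -> set of file_ids, so each
    inventory is loaded once per revision rather than once per hit.
    """
    paths = {}
    for inventory in repository.iter_inventories(fileids_by_revision.keys()):
        revision_id = inventory.revision_id
        for file_id in fileids_by_revision[revision_id]:
            paths[(file_id, revision_id)] = inventory.id2path(file_id)
    return paths
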
=== modified file 'index.py'
--- a/index.py 2008-06-13 11:48:12 +0000
+++ b/index.py 2008-06-14 01:41:42 +0000
@@ -17,6 +17,7 @@
"""The core logic for search."""
+from itertools import chain
import math
import md5
import re
@@ -149,7 +150,6 @@
self._orig_names = {}
self._current_names = {}
self._revision_indices = []
- self._term_indices = []
self._term_doc_indices = {}
self._revision_index = CombinedGraphIndex(self._revision_indices)
# because terms may occur in many component indices, we don't use a
@@ -251,6 +251,9 @@
graph = locked_branch.repository.get_graph()
parent_map = graph.get_parent_map(revisions_to_index)
order = topo_sort(parent_map)
+ order_dict = {}
+ for pos, revid in enumerate(order):
+ order_dict[revid] = pos
# 5000 uses 1GB on a mysql tree.
group_size = 2500
for offset in range(len(order) / group_size + 1):
@@ -262,6 +265,9 @@
terms = self._terms_for_texts(locked_branch.repository,
revision_group)
self._add_terms(builder, terms)
+ terms = self._terms_for_file_terms(locked_branch.repository,
+ terms, order_dict)
+ self._add_terms(builder, terms)
terms = self._terms_for_revs(locked_branch.repository,
revision_group)
self._add_terms(builder, terms)
@@ -345,8 +351,7 @@
"""Add an index (with meta-value value) to the in-memory index list."""
self._current_names[name] = (value, index)
self._revision_indices.append(index.revision_index)
- self._term_indices.append(index.term_index)
- self._term_doc_indices[index.term_index] = index.document_index
+ self._term_doc_indices[index] = index.document_index
def indexed_revisions(self):
"""Return the revision_keys that this index contains terms for."""
@@ -394,8 +399,7 @@
def _remove_component_from_memory(self, name):
"""Remove the component name from the index list in memory."""
index = self._current_names[name][1]
- del self._term_doc_indices[index.term_index]
- self._term_indices.remove(index.term_index)
+ del self._term_doc_indices[index]
self._revision_indices.remove(index.revision_index)
del self._current_names[name]
@@ -409,20 +413,25 @@
self._refresh_indices()
found_documents = []
# Use a set to remove duplicates
- term_keys = set(termlist)
- if len(term_keys) == 0:
+ termlist = set(termlist)
+ term_keys = [None, set(), set()]
+ for term in termlist:
+ term_keys[len(term)].add(term)
+ if 0 == (len(term_keys[1]) + len(term_keys[2])):
return
+
for value, component in self._current_names.values():
term_index = component.term_index
# TODO: push into Component
# TODO: use a dequeue?
term_info = []
- for node in term_index.iter_entries(term_keys):
+ for node in chain(term_index.iter_entries(term_keys[1]),
+ component.term_2_index.iter_entries(term_keys[2])):
term_id, posting_count, posting_start, posting_length = \
node[2].split(" ")
term_info.append((int(posting_count), term_id,
int(posting_start), int(posting_length)))
- if len(term_info) != len(term_keys):
+ if len(term_info) != len(termlist):
# One or more terms missing - no hits are possible.
continue
# load the first document list:
@@ -448,18 +457,60 @@
common_doc_keys = set([node[1] for node in
post_index.iter_entries(common_doc_keys)])
common_doc_ids = [key[0] for key in common_doc_keys]
- found_documents = [(term_index, doc_id) for doc_id in
+ found_documents = [(component, doc_id) for doc_id in
common_doc_ids]
for doc_key in self._document_ids_to_keys(found_documents):
if doc_key[0] == 'f':
# file text
- yield FileTextHit(self._branch.repository, doc_key[1:3])
+ yield FileTextHit(self, self._branch.repository,
+ doc_key[1:3])
elif doc_key[0] == 'r':
# revision
yield RevisionHit(self._branch.repository, doc_key[2:3])
+ elif doc_key[0] == 'p':
+ # path
+ yield PathHit(doc_key[2])
else:
raise Exception("unknown doc type %r" % (doc_key,))
+ def _terms_for_file_terms(self, repository, file_terms, order_dict):
+ """Generate terms for the path of every file_id, revision_id in terms.
+
+ :param repository: The repository to access inventories from.
+ :param terms: Text terms than have been inserted.
+ :param order_dict: A mapping from revision id to order from the
+ topological sort prepared for the indexing operation.
+ :return: An iterable of (term, posting_list) for the file_id,
+ revision_id pairs mentioned in terms.
+ """
+ terms = {}
+ # What revisions do we need inventories for:
+ revision_ids = {}
+ for term, posting_list in file_terms:
+ for post in posting_list:
+ if post[0] != 'f':
+ raise ValueError("Unknown post type for %r" % post)
+ fileids = revision_ids.setdefault(post[2], set())
+ fileids.add(post[1])
+ order = list(revision_ids)
+ order.sort(key=lambda revid:order_dict[revid])
+ group_size = 100
+ for offset in range(len(order) / group_size + 1):
+ inventory_group = order[offset * group_size:(offset + 1) * group_size]
+ #texts = repository.get_inventory_weave().get_texts(inventory_group)
+ #for text in inventory_group:
+ # pass
+ # for text, revision_id in zip(texts, revision_ids):
+ # For VersionedFiles:
+ # for xml in repository._iter_inventory_xmls(inventory_group):
+ # pass
+ for inventory in repository.iter_inventories(inventory_group):
+ revision_id = inventory.revision_id
+ for file_id in revision_ids[revision_id]:
+ path = inventory.id2path(file_id)
+ terms[(file_id, revision_id)] = [('p', '', path)]
+ return terms.iteritems()
+
def _terms_for_revs(self, repository, revision_ids):
"""Generate the posting list for the revision texts of revision_ids.
@@ -515,31 +566,54 @@
continue
posting_list = terms.setdefault((term,), set())
posting_list.add(document_key)
- return terms.iteritems()
+ return terms.items()
class FileTextHit(object):
"""A match found during a search in a file text."""
- def __init__(self, repository, text_key):
+ def __init__(self, index, repository, text_key):
"""Create a FileTextHit.
+ :param index: The index the search result is from, to look up the path
+ of the hit. NB
:param repository: A repository to extract revisions from.
:param text_key: The text_key that was hit.
"""
+ self.index = index
self.repository = repository
self.text_key = text_key
def document_name(self):
"""The name of the document found, for human consumption."""
# Perhaps need to utf_decode this?
- return "File id '%s', revision '%s'." % self.text_key
+ path = self.index.search((self.text_key,)).next()
+ return "%s in revision '%s'." % (path.document_name(), self.text_key[1])
def summary(self):
"""Get a summary of the hit, for display to users."""
return "No summaries yet."
+class PathHit(object):
+ """A match found during a search in a file path."""
+
+ def __init__(self, path_utf8):
+ """Create a PathHit.
+
+ :param path_utf8: The path (utf8 encoded).
+ """
+ self.path_utf8 = path_utf8
+
+ def document_name(self):
+ """The name of the document found, for human consumption."""
+ return self.path_utf8.decode("utf8")
+
+ def summary(self):
+ """Get a summary of the hit."""
+ return self.document_name()
+
+
class RevisionHit(object):
"""A match found during a search in a revision object."""
@@ -588,13 +662,16 @@
"revisions": (lengths[0], lengths[0] + lengths[1]),
"terms": (lengths[2], lengths[2] + lengths[3]),
"documents": (lengths[4], lengths[4] + lengths[5]),
+ "terms_2": (lengths[6], lengths[6] + lengths[7]),
}
view = FileView(transport, name + '.pack', filemap)
rev_index = GraphIndex(view, "revisions", lengths[1])
term_index = GraphIndex(view, "terms", lengths[3])
+ term_2_index = GraphIndex(view, "terms_2", lengths[7])
doc_index = GraphIndex(view, "documents", lengths[5])
self.revision_index = rev_index
self.term_index = term_index
+ self.term_2_index = term_2_index
self.document_index = doc_index
self.name = name
self.transport = transport
@@ -602,7 +679,8 @@
def all_terms(self):
"""As per Index, but for a single component."""
result = {}
- for node in self.term_index.iter_all_entries():
+ for node in chain(self.term_index.iter_all_entries(),
+ self.term_2_index.iter_all_entries()):
# XXX: Duplicated logic with search().
term = node[1]
term_id, posting_count, posting_start, posting_length = \
@@ -717,7 +795,7 @@
Unindexed terms return an empty iterator.
"""
- term_id = self.term_id((term,))
+ term_id = self.term_id(term)
if term_id is None:
return []
else:
@@ -754,7 +832,9 @@
del index_bytes
# generate a new term index with the length of the serialised posting
# lists.
- term_index = InMemoryGraphIndex(0, 1)
+ term_indices = {}
+ term_indices[1] = InMemoryGraphIndex(0, 1)
+ term_indices[2] = InMemoryGraphIndex(0, 2)
for term, term_id in self.terms.iteritems():
posting_list = self.posting_lists[term_id]
post_index = InMemoryGraphIndex(0, 1)
@@ -771,14 +851,18 @@
# read the pack later.
term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
length)
- term_index.add_node(term, term_value, ())
- term_start, term_length = self._add_index_to_pack(term_index, "terms", writer)
+ term_indices[len(term)].add_node(term, term_value, ())
+ term_start, term_length = self._add_index_to_pack(term_indices[1],
+ "terms", writer)
+ term_2_start, term_2_length = self._add_index_to_pack(term_indices[2],
+ "terms2", writer)
doc_start, doc_length = self._add_index_to_pack(self.document_index,
"documents", writer)
writer.end()
write_stream.close()
- index_value = "%d %d %d %d %d %d" % (rev_start, rev_length, term_start,
- term_length, doc_start, doc_length)
+ index_value = "%d %d %d %d %d %d %d %d" % (rev_start, rev_length,
+ term_start, term_length, doc_start, doc_length, term_2_start,
+ term_2_length)
elements = [index_name + ".pack"]
return index_name, index_value, elements
@@ -828,7 +912,9 @@
and self.terms to determine what to copy from.
It populates self.term_index as it progresses.
"""
- term_index = InMemoryGraphIndex(0, 1)
+ term_indices = {1:InMemoryGraphIndex(0, 1),
+ 2:InMemoryGraphIndex(0, 2)
+ }
for term, posting_lists in self.terms.iteritems():
posting_list = set()
for component, posting_line in posting_lists:
@@ -848,15 +934,16 @@
post_index = InMemoryGraphIndex(0, 1)
for doc_id in posting_list:
post_index.add_node((doc_id,), '', ())
- term_id = str(term_index.key_count())
+ term_id = str(term_indices[1].key_count() +
+ term_indices[2].key_count())
start, length = self._add_index_to_pack(
post_index, "term_list." + term_id, self.writer)
# How many document ids, and the range for the file view when we
# read the pack later.
term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
length)
- term_index.add_node(term, term_value, ())
- self.term_index = term_index
+ term_indices[len(term)].add_node(term, term_value, ())
+ self.term_indices = term_indices
# Clear out used objects
del self.terms
del self.component_docids
@@ -892,11 +979,14 @@
self._scan_terms()
self._copy_posting_lists()
self.term_start, self.term_length = self._add_index_to_pack(
- self.term_index, "terms", self.writer)
+ self.term_indices[1], "terms", self.writer)
+ self.term_2_start, self.term_2_length = self._add_index_to_pack(
+ self.term_indices[2], "terms2", self.writer)
self.writer.end()
self.write_stream.close()
- index_value = "%d %d %d %d %d %d" % (self.rev_start, self.rev_length,
- self.term_start, self.term_length, self.doc_start, self.doc_length)
+ index_value = "%d %d %d %d %d %d %d %d" % (self.rev_start,
+ self.rev_length, self.term_start, self.term_length, self.doc_start,
+ self.doc_length, self.term_2_start, self.term_2_length)
elements = [self.index_name + ".pack"]
return self.index_name, index_value, elements
@@ -904,7 +994,8 @@
"""Scan the terms in all components to prepare to copy posting lists."""
self.terms = {}
for component in self.components:
- for node in component.term_index.iter_all_entries():
+ for node in chain(component.term_index.iter_all_entries(),
+ component.term_2_index.iter_all_entries()):
term = node[1]
posting_info = node[2]
term_set = self.terms.setdefault(term, set())
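The format bump widens a component's index_value from six to eight integers, appending the start and length of the new two-word term index. A sketch (hypothetical helper, not part of the commit) of how those eight fields map back onto the filemap built in ComponentIndex above:

def parse_index_value(value):
    """Turn the eight-field index_value string into filemap ranges."""
    lengths = [int(x) for x in value.split()]
    return {
        "revisions": (lengths[0], lengths[0] + lengths[1]),
        "terms": (lengths[2], lengths[2] + lengths[3]),
        "documents": (lengths[4], lengths[4] + lengths[5]),
        "terms_2": (lengths[6], lengths[6] + lengths[7]),
    }
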
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-13 11:48:12 +0000
+++ b/tests/test_index.py 2008-06-14 01:41:42 +0000
@@ -111,6 +111,8 @@
# 3 should map to 2 packs, as should 4 (but with 2 discard)
# To test: we create four revisions:
tree = self.make_branch_and_tree('foo')
+ tree.add(['README.txt'], ['an-id'], ['file'])
+ tree.put_file_bytes_non_atomic('an-id', "file")
revid1 = tree.commit('1')
revid2 = tree.commit('2')
revid3 = tree.commit('3')
@@ -150,6 +152,16 @@
self.assertEqual(2, len(obsolete_names))
new_names = set(get_names()) - set(names)
self.assertEqual(1, len(new_names))
+ self.assertEqual({
+ ("1",):set([('r', '', revid1)]),
+ ("2",):set([('r', '', revid2)]),
+ ("3",):set([('r', '', revid3)]),
+ ("4",):set([('r', '', revid4)]),
+ ('an-id', revid1):set([('p', '', 'README.txt')]),
+ ("file",):set([('f', 'an-id', revid1)]),
+ }, dict(rev_index.all_terms()))
+ self.assertEqual(set([(revid1,), (revid2,), (revid3,), (revid4,)]),
+ set(rev_index.indexed_revisions()))
class TestIndexRevisions(TestCaseWithTransport):
@@ -186,6 +198,7 @@
("this",): set([('f', 'an-id', revid)]),
("working",): set([('f', 'an-id', revid)]),
("tree",): set([('f', 'an-id', revid)]),
+ ('an-id', revid): set([('p', '', 'README.txt')]),
}
all_terms = {}
for term, posting_list in rev_index.all_terms():
@@ -218,16 +231,19 @@
def test_TextHit(self):
tree = self.make_branch_and_tree('tree')
+ search_index = index.init_index(tree.branch)
tree.add(['README.txt'], ['an-id'], ['file'])
tree.put_file_bytes_non_atomic('an-id',
"This is the first commit to this working tree.\n"
)
rev_id1 = tree.commit('commit')
- result = index.FileTextHit(tree.branch.repository, ('an-id', rev_id1))
+ search_index.index_branch(tree.branch, rev_id1)
+ result = index.FileTextHit(search_index, tree.branch.repository,
+ ('an-id', rev_id1))
tree.lock_read()
self.addCleanup(tree.unlock)
self.assertEqualDiff(
- u"File id '%s', revision '%s'." % ('an-id', rev_id1),
+ u"README.txt in revision '%s'." % (rev_id1),
result.document_name())
self.assertEqual(('an-id', rev_id1), result.text_key)
self.assertEqual('No summaries yet.', result.summary())
@@ -266,7 +282,7 @@
self.assertEqual([(doc_index, ("0",), "r revid"),
(doc_index, ("1",), "r other-revid")], nodes)
# and the term refers to document ids
- self.assertEqual(set(["0", "1"]), set(builder.posting_list("term1")))
+ self.assertEqual(set(["0", "1"]), set(builder.posting_list(("term1",))))
# adding a new term adds unique documents
builder.add_term(("term2",), [('r', '', 'revid'),
('r', '', 'third-revid')])
@@ -275,8 +291,8 @@
self.assertEqual([(doc_index, ("0",), "r revid"),
(doc_index, ("1",), "r other-revid"),
(doc_index, ("2",), "r third-revid")], nodes)
- self.assertEqual(set(["0", "1"]), set(builder.posting_list("term1")))
- self.assertEqual(set(["0", "2"]), set(builder.posting_list("term2")))
+ self.assertEqual(set(["0", "1"]), set(builder.posting_list(("term1",))))
+ self.assertEqual(set(["0", "2"]), set(builder.posting_list(("term2",))))
# adding a term twice extends the posting list rather than replacing it
# or erroring.
builder.add_term(("term1",), [('r', '', 'revid'),
@@ -288,8 +304,52 @@
(doc_index, ("2",), "r third-revid"),
(doc_index, ("3",), "r fourth-revid"),
], nodes)
- self.assertEqual(set(["0", "1", "3"]), set(builder.posting_list("term1")))
- self.assertEqual(set(["0", "2"]), set(builder.posting_list("term2")))
+ self.assertEqual(set(["0", "1", "3"]),
+ set(builder.posting_list(("term1",))))
+ self.assertEqual(set(["0", "2"]), set(builder.posting_list(("term2",))))
+
+ def test_2_term_posting_list(self):
+ builder = index.ComponentIndexBuilder()
+ # adding a term adds its documents
+ builder.add_term(("term1", "term12"), [('r', '', 'revid'),
+ ('r', '', 'other-revid')])
+ doc_index = builder.document_index
+ nodes = sorted(list(doc_index.iter_all_entries()))
+ self.assertEqual([(doc_index, ("0",), "r revid"),
+ (doc_index, ("1",), "r other-revid")], nodes)
+ # and the term refers to document ids
+ self.assertEqual(set(["0", "1"]),
+ set(builder.posting_list(("term1", "term12"))))
+ # adding a new term adds unique documents
+ builder.add_term(("term2", "term12"), [('r', '', 'revid'),
+ ('r', '', 'third-revid')])
+ nodes = sorted(list(doc_index.iter_all_entries()))
+ # and refers to the correct ids
+ self.assertEqual([(doc_index, ("0",), "r revid"),
+ (doc_index, ("1",), "r other-revid"),
+ (doc_index, ("2",), "r third-revid")], nodes)
+ self.assertEqual(set(["0", "1"]),
+ set(builder.posting_list(("term1", "term12"))))
+ self.assertEqual(set(["0", "2"]),
+ set(builder.posting_list(("term2", "term12"))))
+ # adding a term twice extends the posting list rather than replacing it
+ # or erroring.
+ builder.add_term(("term1", "term12"), [('r', '', 'revid'),
+ ('r', '', 'fourth-revid')])
+ nodes = sorted(list(doc_index.iter_all_entries()))
+ # and refers to the correct ids
+ self.assertEqual([(doc_index, ("0",), "r revid"),
+ (doc_index, ("1",), "r other-revid"),
+ (doc_index, ("2",), "r third-revid"),
+ (doc_index, ("3",), "r fourth-revid"),
+ ], nodes)
+ self.assertEqual(set(["0", "1", "3"]),
+ set(builder.posting_list(("term1", "term12"))))
+ self.assertEqual(set(["0", "2"]),
+ set(builder.posting_list(("term2", "term12"))))
+ # Single-element terms are not erroneously being used
+ self.assertEqual(set(), set(builder.posting_list(("term1",))))
+ self.assertEqual(set(), set(builder.posting_list(("term2",))))
def test_add_revision(self):
builder = index.ComponentIndexBuilder()
@@ -316,6 +376,7 @@
('r', '', 'rev-common')])
builder.add_term(("term-common",), [('r', '', 'rev1'),
('r', '', 'rev-common')])
+ builder.add_term(("term", "complex"), [('f', 'foo', 'rev1')])
name, value, elements = builder.upload_index(transport)
component1 = index.ComponentIndex(name, value, transport)
components.append(component1)
@@ -336,6 +397,7 @@
('r', '', 'rev1'), ('r', '', 'rev2')])
terms[('term1',)] = set([('r', '', 'rev-common'), ('r', '', 'rev1')])
terms[('term2',)] = set([('r', '', 'other-revid'), ('r', '', 'rev2')])
+ terms[('term', 'complex')] = set([('f', 'foo', 'rev1')])
self.assertEqual(terms, combined.all_terms())
self.assertEqual(set([('rev1',), ('rev2',), ('rev-common',)]),
set(combined.indexed_revisions()))
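
End to end, following the updated test_TextHit above, a file-text search can now report the hit's path. A usage sketch, not part of the commit: the plugin import path and the exact tokenization of the file content into the term ("first",) are assumptions.

from bzrlib import workingtree
from bzrlib.plugins.search import index  # assumed plugin module path

tree = workingtree.WorkingTree.open('tree')
search_index = index.init_index(tree.branch)
search_index.index_branch(tree.branch, tree.last_revision())
for hit in search_index.search([("first",)]):
    # A FileTextHit now renders e.g. "README.txt in revision '<revid>'."
    print hit.document_name()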