Rev 32: Work towards indexing pathid, revisionid phrases to allow nice summaries in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Fri Jun 13 12:48:44 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 32
revision-id: robertc at robertcollins.net-20080613114812-oi86vamc6qh8tkt4
parent: robertc at robertcollins.net-20080613061537-uz0no2h1jc9lkku3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Fri 2008-06-13 21:48:12 +1000
message:
Work towards indexing pathid,revisionid phrases to allow nice summaries
for full text hits. (Robert Collins)
modified:
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
commands.py commands.py-20080608052041-z5bahsl8kwl0uf4x-5
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_blackbox.py test_blackbox.py-20080608052041-z5bahsl8kwl0uf4x-9
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-12 04:08:43 +0000
+++ b/DESIGN 2008-06-13 11:48:12 +0000
@@ -40,6 +40,40 @@
files can generate term-lists. This relates to the 'index deltas' question in
that external filters are likely n
+Indexing paths
+==============
+
+There are several use cases for path indices:
+
+ * Mapping fileid, revisionid search hits back to paths for display during hits
+ on documents. This appears to need a concordance of the entire inventory
+ state.
+ * Searching for a path ("show me the file README.txt")
+ * Accelerating per-file log (by identifying path-change events which the
+ file graph index elides, and-or
+
+Mapping fileid, revisionids to paths
+------------------------------------
+
+If we insert path ('p', '', PATH) as a document, and as terms insert the
+fileid, revisionid pair as a phrase, then we can search for the phrase
+fileid, revisionid to return the paths that match - and by virtue of our data
+model that will be unique.
+
+Indexing Phrases
+================
+
+For now, a naive approach is being used: There is a phrase-index for N-term
+phrases. It contains tuples of terms, mapping to a posting list for the phrase.
+For generic any-N phrase support we need a N->phrase_N_index mapping. However
+to bootstrap the process the simplest approach of adding a start,length tuple
+to the names value has been used.
+
+Possible future improvements to this are to:
+
+ * Use termid tuples rather than terms (to save space)
+
+
How to index
++++++++++++
=== modified file 'NEWS'
--- a/NEWS 2008-06-11 08:29:50 +0000
+++ b/NEWS 2008-06-13 11:48:12 +0000
@@ -48,3 +48,7 @@
* New modules: ``commands``, ``errors``, ``index``. These contain the
console ui, exceptions, and the search index core respectively.
(Robert Collins)
+
+ * New module ``transport`` containing ``FileView`` to map a packs contents
+ as a transport object, allowing bzr indices to be stored in a pack.
+ (Robert Collins)
=== modified file 'commands.py'
--- a/commands.py 2008-06-10 05:48:29 +0000
+++ b/commands.py 2008-06-13 11:48:12 +0000
@@ -54,8 +54,9 @@
trans = get_transport('.')
index = _mod_index.open_index_url(trans.base)
# XXX: Have a query translator etc.
+ query = [(query_item,) for query_item in query_list]
seen_count = 0
- for result in index.search(query_list):
+ for result in index.search(query):
self.outf.write(result.document_name())
self.outf.write(" Summary: '%s'\n" % result.summary())
seen_count += 1
=== modified file 'index.py'
--- a/index.py 2008-06-13 06:15:37 +0000
+++ b/index.py 2008-06-13 11:48:12 +0000
@@ -160,7 +160,7 @@
def _add_terms(self, index, terms):
"""Add a set of term posting lists to a in progress index.
- A term is a single index key suffix (e.g. 'first').
+ A term is a single index key (e.g. ('first',)).
A posting list is an iterable of full index keys (e.g.
('r', '', REVID) for a revision, or ('t', FILEID, REVID) for a file
text.)
@@ -409,7 +409,7 @@
self._refresh_indices()
found_documents = []
# Use a set to remove duplicates
- term_keys = set([(term,) for term in termlist])
+ term_keys = set(termlist)
if len(term_keys) == 0:
return
for value, component in self._current_names.values():
@@ -484,7 +484,7 @@
for term in commit_terms:
if not term:
continue
- posting_list = terms.setdefault(term,set())
+ posting_list = terms.setdefault((term,), set())
posting_list.add(document_key)
return terms.iteritems()
@@ -513,7 +513,7 @@
for term in line_terms:
if not term:
continue
- posting_list = terms.setdefault(term, set())
+ posting_list = terms.setdefault((term,), set())
posting_list.add(document_key)
return terms.iteritems()
@@ -604,7 +604,7 @@
result = {}
for node in self.term_index.iter_all_entries():
# XXX: Duplicated logic with search().
- term = node[1][0]
+ term = node[1]
term_id, posting_count, posting_start, posting_length = \
node[2].split(" ")
posting_count = int(posting_count)
@@ -688,13 +688,17 @@
def add_term(self, term, posting_list):
"""Add a term to the index.
- :param term: A term, e.g. 'foo'.
+ :param term: A term, e.g. ('foo',).
:param posting_list: A list of the document_key's that this term
indexes.
:return: None.
"""
- if type(term) != str:
- raise ValueError("terms must be bytestrings at this layer %r" % term)
+ if type(term) != tuple:
+ raise ValueError("terms need to be tuples %r" % term)
+ for component in term:
+ if type(component) != str:
+ raise ValueError(
+ "terms must be bytestrings at this layer %r" % term)
term_id = self.term_id(term)
if term_id is None:
term_id = str(len(self.terms))
@@ -713,7 +717,7 @@
Unindexed terms return an empty iterator.
"""
- term_id = self.term_id(term)
+ term_id = self.term_id((term,))
if term_id is None:
return []
else:
@@ -767,7 +771,7 @@
# read the pack later.
term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
length)
- term_index.add_node((term,), term_value, ())
+ term_index.add_node(term, term_value, ())
term_start, term_length = self._add_index_to_pack(term_index, "terms", writer)
doc_start, doc_length = self._add_index_to_pack(self.document_index,
"documents", writer)
@@ -851,7 +855,7 @@
# read the pack later.
term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
length)
- term_index.add_node((term,), term_value, ())
+ term_index.add_node(term, term_value, ())
self.term_index = term_index
# Clear out used objects
del self.terms
@@ -901,7 +905,7 @@
self.terms = {}
for component in self.components:
for node in component.term_index.iter_all_entries():
- term = node[1][0]
+ term = node[1]
posting_info = node[2]
term_set = self.terms.setdefault(term, set())
term_set.add((component, posting_info))
=== modified file 'tests/test_blackbox.py'
--- a/tests/test_blackbox.py 2008-06-10 06:15:03 +0000
+++ b/tests/test_blackbox.py 2008-06-13 11:48:12 +0000
@@ -73,7 +73,7 @@
out, error = self.run_bzr(['index', 'a-branch'])
self.assertEqual('', error)
self.assertEqual('', out)
- self.assertSubset(set(['a', 'search', 'term']),
+ self.assertSubset(set([('a',), ('search',), ('term',)]),
set(dict(open_index_url('a-branch').all_terms())))
def test_index_no_branch(self):
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-13 06:15:37 +0000
+++ b/tests/test_index.py 2008-06-13 11:48:12 +0000
@@ -176,16 +176,16 @@
# paths present
# content of documents.
expected_terms = {
- 'first': set([('r', '', revid), ('f', 'an-id', revid)]),
- 'post': set([('r', '', revid)]),
- "This": set([('f', 'an-id', revid)]),
- "is": set([('f', 'an-id', revid)]),
- "the": set([('f', 'an-id', revid)]),
- "commit": set([('f', 'an-id', revid)]),
- "to": set([('f', 'an-id', revid)]),
- "this": set([('f', 'an-id', revid)]),
- "working": set([('f', 'an-id', revid)]),
- "tree": set([('f', 'an-id', revid)]),
+ ('first',): set([('r', '', revid), ('f', 'an-id', revid)]),
+ ('post',): set([('r', '', revid)]),
+ ("This",): set([('f', 'an-id', revid)]),
+ ("is",): set([('f', 'an-id', revid)]),
+ ("the",): set([('f', 'an-id', revid)]),
+ ("commit",): set([('f', 'an-id', revid)]),
+ ("to",): set([('f', 'an-id', revid)]),
+ ("this",): set([('f', 'an-id', revid)]),
+ ("working",): set([('f', 'an-id', revid)]),
+ ("tree",): set([('f', 'an-id', revid)]),
}
all_terms = {}
for term, posting_list in rev_index.all_terms():
@@ -200,7 +200,7 @@
rev_index = index.init_index(tree.branch)
# No exception because its a generator (and thus not guaranteed to run
# to completion).
- self.assertEqual([], list(rev_index.search(['missing_term'])))
+ self.assertEqual([], list(rev_index.search([('missing_term',)])))
def test_search_trivial(self):
tree = self.make_branch_and_tree('tree')
@@ -208,7 +208,7 @@
# The double-space is a cheap smoke test for the tokeniser.
revid = tree.commit('first post')
rev_index.index_revisions(tree.branch, [revid])
- results = list(rev_index.search(['post']))
+ results = list(rev_index.search([('post',)]))
self.assertEqual(1, len(results))
self.assertIsInstance(results[0], index.RevisionHit)
self.assertEqual((revid,), results[0].revision_key)
@@ -259,7 +259,8 @@
def test_posting_list(self):
builder = index.ComponentIndexBuilder()
# adding a term adds its documents
- builder.add_term("term1", [('r', '', 'revid'), ('r', '', 'other-revid')])
+ builder.add_term(("term1",), [('r', '', 'revid'),
+ ('r', '', 'other-revid')])
doc_index = builder.document_index
nodes = sorted(list(doc_index.iter_all_entries()))
self.assertEqual([(doc_index, ("0",), "r revid"),
@@ -267,7 +268,8 @@
# and the term refers to document ids
self.assertEqual(set(["0", "1"]), set(builder.posting_list("term1")))
# adding a new term adds unique documents
- builder.add_term("term2", [('r', '', 'revid'), ('r', '', 'third-revid')])
+ builder.add_term(("term2",), [('r', '', 'revid'),
+ ('r', '', 'third-revid')])
nodes = sorted(list(doc_index.iter_all_entries()))
# and refers to the correct ids
self.assertEqual([(doc_index, ("0",), "r revid"),
@@ -277,7 +279,8 @@
self.assertEqual(set(["0", "2"]), set(builder.posting_list("term2")))
# adding a term twice extends the posting list rather than replacing it
# or erroring.
- builder.add_term("term1", [('r', '', 'revid'), ('r', '', 'fourth-revid')])
+ builder.add_term(("term1",), [('r', '', 'revid'),
+ ('r', '', 'fourth-revid')])
nodes = sorted(list(doc_index.iter_all_entries()))
# and refers to the correct ids
self.assertEqual([(doc_index, ("0",), "r revid"),
@@ -309,16 +312,19 @@
builder = index.ComponentIndexBuilder()
builder.add_revision('rev1')
builder.add_revision('rev-common')
- builder.add_term("term1", [('r', '', 'rev1'), ('r', '', 'rev-common')])
- builder.add_term("term-common", [('r', '', 'rev1'), ('r', '', 'rev-common')])
+ builder.add_term(("term1",), [('r', '', 'rev1'),
+ ('r', '', 'rev-common')])
+ builder.add_term(("term-common",), [('r', '', 'rev1'),
+ ('r', '', 'rev-common')])
name, value, elements = builder.upload_index(transport)
component1 = index.ComponentIndex(name, value, transport)
components.append(component1)
builder = index.ComponentIndexBuilder()
builder.add_revision('rev-common')
builder.add_revision('rev2')
- builder.add_term("term-common", [('r', '', 'rev2'), ('r', '', 'rev-common')])
- builder.add_term("term2", [('r', '', 'rev2'), ('r', '', 'other-revid')])
+ builder.add_term(("term-common",), [('r', '', 'rev2'),
+ ('r', '', 'rev-common')])
+ builder.add_term(("term2",), [('r', '', 'rev2'), ('r', '', 'other-revid')])
name, value, elements = builder.upload_index(transport)
component2 = index.ComponentIndex(name, value, transport)
components.append(component2)
@@ -326,10 +332,10 @@
name, value, elements = combiner.combine()
combined = index.ComponentIndex(name, value, transport)
terms = {}
- terms['term-common'] = set([('r', '', 'rev-common'), ('r', '', 'rev1'),
- ('r', '', 'rev2')])
- terms['term1'] = set([('r', '', 'rev-common'), ('r', '', 'rev1')])
- terms['term2'] = set([('r', '', 'other-revid'), ('r', '', 'rev2')])
+ terms[('term-common',)] = set([('r', '', 'rev-common'),
+ ('r', '', 'rev1'), ('r', '', 'rev2')])
+ terms[('term1',)] = set([('r', '', 'rev-common'), ('r', '', 'rev1')])
+ terms[('term2',)] = set([('r', '', 'other-revid'), ('r', '', 'rev2')])
self.assertEqual(terms, combined.all_terms())
self.assertEqual(set([('rev1',), ('rev2',), ('rev-common',)]),
set(combined.indexed_revisions()))
More information about the bazaar-commits
mailing list