Rev 32: Work towards indexing pathid, revisionid phrases to allow nice summaries in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Fri Jun 13 12:48:44 BST 2008

At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 32
revision-id: robertc at robertcollins.net-20080613114812-oi86vamc6qh8tkt4
parent: robertc at robertcollins.net-20080613061537-uz0no2h1jc9lkku3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Fri 2008-06-13 21:48:12 +1000
message:
  Work towards indexing pathid,revisionid phrases to allow nice summaries
  for full text hits. (Robert Collins)
modified:
  DESIGN                         design-20080608072426-vjoj110dtykfyb7g-1
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  commands.py                    commands.py-20080608052041-z5bahsl8kwl0uf4x-5
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_blackbox.py         test_blackbox.py-20080608052041-z5bahsl8kwl0uf4x-9
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'DESIGN'

--- a/DESIGN	2008-06-12 04:08:43 +0000
+++ b/DESIGN	2008-06-13 11:48:12 +0000
@@ -40,6 +40,40 @@
 files can generate term-lists. This relates to the 'index deltas' question in
 that external filters are likely n
 
+Indexing paths
+==============
+
+There are several use cases for path indices:
+
+ * Mapping fileid, revisionid search hits back to paths for display during hits
+   on documents. This appears to need a concordance of the entire inventory
+   state.
+ * Searching for a path ("show me the file README.txt")
+ * Accelerating per-file log (by identifying path-change events which the 
+   file graph index elides, and-or
+
+Mapping fileid, revisionids to paths
+------------------------------------
+
+If we insert path ('p', '', PATH) as a document, and as terms insert the
+fileid, revisionid pair as a phrase, then we can search for the phrase
+fileid, revisionid to return the paths that match - and by virtue of our data
+model that will be unique. 
+
+Indexing Phrases
+================
+
+For now, a naive approach is being used: There is a phrase-index for N-term
+phrases. It contains tuples of terms, mapping to a posting list for the phrase.
+For generic any-N phrase support we need a N->phrase_N_index mapping. However
+to bootstrap the process the simplest approach of adding a start,length tuple
+to the names value has been used.
+
+Possible future improvements to this are to:
+ 
+ * Use termid tuples rather than terms (to save space)
+
+
 How to index
 ++++++++++++
 

=== modified file 'NEWS'
--- a/NEWS	2008-06-11 08:29:50 +0000
+++ b/NEWS	2008-06-13 11:48:12 +0000
@@ -48,3 +48,7 @@
     * New modules: ``commands``, ``errors``, ``index``. These contain the
       console ui, exceptions, and the search index core respectively.
       (Robert Collins)
+
+    * New module ``transport`` containing ``FileView`` to map a packs contents
+      as a transport object, allowing bzr indices to be stored in a pack.
+      (Robert Collins)

=== modified file 'commands.py'
--- a/commands.py	2008-06-10 05:48:29 +0000
+++ b/commands.py	2008-06-13 11:48:12 +0000
@@ -54,8 +54,9 @@
         trans = get_transport('.')
         index = _mod_index.open_index_url(trans.base)
         # XXX: Have a query translator etc.
+        query = [(query_item,) for query_item in query_list]
         seen_count = 0
-        for result in index.search(query_list):
+        for result in index.search(query):
             self.outf.write(result.document_name())
             self.outf.write(" Summary: '%s'\n" % result.summary())
             seen_count += 1

=== modified file 'index.py'
--- a/index.py	2008-06-13 06:15:37 +0000
+++ b/index.py	2008-06-13 11:48:12 +0000
@@ -160,7 +160,7 @@
     def _add_terms(self, index, terms):
         """Add a set of term posting lists to a in progress index.
 
-        A term is a single index key suffix (e.g. 'first').
+        A term is a single index key (e.g. ('first',)).
         A posting list is an iterable of full index keys (e.g.
         ('r', '', REVID) for a revision, or ('t', FILEID, REVID) for a file
         text.)
@@ -409,7 +409,7 @@
         self._refresh_indices()
         found_documents = []
         # Use a set to remove duplicates
-        term_keys = set([(term,) for term in termlist])
+        term_keys = set(termlist)
         if len(term_keys) == 0:
             return
         for value, component in self._current_names.values():
@@ -484,7 +484,7 @@
             for term in commit_terms:
                 if not term:
                     continue
-                posting_list = terms.setdefault(term,set())
+                posting_list = terms.setdefault((term,), set())
                 posting_list.add(document_key)
         return terms.iteritems()
 
@@ -513,7 +513,7 @@
                 for term in line_terms:
                     if not term:
                         continue
-                    posting_list = terms.setdefault(term, set())
+                    posting_list = terms.setdefault((term,), set())
                     posting_list.add(document_key)
         return terms.iteritems()
 
@@ -604,7 +604,7 @@
         result = {}
         for node in self.term_index.iter_all_entries():
             # XXX: Duplicated logic with search().
-            term = node[1][0]
+            term = node[1]
             term_id, posting_count, posting_start, posting_length = \
                 node[2].split(" ")
             posting_count = int(posting_count)
@@ -688,13 +688,17 @@
     def add_term(self, term, posting_list):
         """Add a term to the index.
 
-        :param term: A term, e.g. 'foo'.
+        :param term: A term, e.g. ('foo',).
         :param posting_list: A list of the document_key's that this term
             indexes.
         :return: None.
         """
-        if type(term) != str:
-            raise ValueError("terms must be bytestrings at this layer %r" % term)
+        if type(term) != tuple:
+            raise ValueError("terms need to be tuples %r" % term)
+        for component in term:
+            if type(component) != str:
+                raise ValueError(
+                    "terms must be bytestrings at this layer %r" % term)
         term_id = self.term_id(term)
         if term_id is None:
             term_id = str(len(self.terms))
@@ -713,7 +717,7 @@
 
         Unindexed terms return an empty iterator.
         """
-        term_id = self.term_id(term)
+        term_id = self.term_id((term,))
         if term_id is None:
             return []
         else:
@@ -767,7 +771,7 @@
             # read the pack later.
             term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
                 length)
-            term_index.add_node((term,), term_value, ())
+            term_index.add_node(term, term_value, ())
         term_start, term_length = self._add_index_to_pack(term_index, "terms", writer)
         doc_start, doc_length = self._add_index_to_pack(self.document_index,
             "documents", writer)
@@ -851,7 +855,7 @@
             # read the pack later.
             term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
                 length)
-            term_index.add_node((term,), term_value, ())
+            term_index.add_node(term, term_value, ())
         self.term_index = term_index
         # Clear out used objects
         del self.terms
@@ -901,7 +905,7 @@
         self.terms = {}
         for component in self.components:
             for node in component.term_index.iter_all_entries():
-                term = node[1][0]
+                term = node[1]
                 posting_info = node[2]
                 term_set = self.terms.setdefault(term, set())
                 term_set.add((component, posting_info))

=== modified file 'tests/test_blackbox.py'
--- a/tests/test_blackbox.py	2008-06-10 06:15:03 +0000
+++ b/tests/test_blackbox.py	2008-06-13 11:48:12 +0000
@@ -73,7 +73,7 @@
         out, error = self.run_bzr(['index', 'a-branch'])
         self.assertEqual('', error)
         self.assertEqual('', out)
-        self.assertSubset(set(['a', 'search', 'term']),
+        self.assertSubset(set([('a',), ('search',), ('term',)]),
             set(dict(open_index_url('a-branch').all_terms())))
 
     def test_index_no_branch(self):

=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-13 06:15:37 +0000
+++ b/tests/test_index.py	2008-06-13 11:48:12 +0000
@@ -176,16 +176,16 @@
         # paths present
         # content of documents.
         expected_terms = {
-            'first': set([('r', '', revid), ('f', 'an-id', revid)]),
-            'post': set([('r', '', revid)]),
-            "This": set([('f', 'an-id', revid)]),
-            "is": set([('f', 'an-id', revid)]),
-            "the": set([('f', 'an-id', revid)]),
-            "commit": set([('f', 'an-id', revid)]),
-            "to": set([('f', 'an-id', revid)]),
-            "this": set([('f', 'an-id', revid)]),
-            "working": set([('f', 'an-id', revid)]),
-            "tree": set([('f', 'an-id', revid)]),
+            ('first',): set([('r', '', revid), ('f', 'an-id', revid)]),
+            ('post',): set([('r', '', revid)]),
+            ("This",): set([('f', 'an-id', revid)]),
+            ("is",): set([('f', 'an-id', revid)]),
+            ("the",): set([('f', 'an-id', revid)]),
+            ("commit",): set([('f', 'an-id', revid)]),
+            ("to",): set([('f', 'an-id', revid)]),
+            ("this",): set([('f', 'an-id', revid)]),
+            ("working",): set([('f', 'an-id', revid)]),
+            ("tree",): set([('f', 'an-id', revid)]),
             }
         all_terms = {}
         for term, posting_list in rev_index.all_terms():
@@ -200,7 +200,7 @@
         rev_index = index.init_index(tree.branch)
         # No exception because its a generator (and thus not guaranteed to run
         # to completion).
-        self.assertEqual([], list(rev_index.search(['missing_term'])))
+        self.assertEqual([], list(rev_index.search([('missing_term',)])))
 
     def test_search_trivial(self):
         tree = self.make_branch_and_tree('tree')
@@ -208,7 +208,7 @@
         # The double-space is a cheap smoke test for the tokeniser.
         revid = tree.commit('first  post')
         rev_index.index_revisions(tree.branch, [revid])
-        results = list(rev_index.search(['post']))
+        results = list(rev_index.search([('post',)]))
         self.assertEqual(1, len(results))
         self.assertIsInstance(results[0], index.RevisionHit)
         self.assertEqual((revid,), results[0].revision_key)
@@ -259,7 +259,8 @@
     def test_posting_list(self):
         builder = index.ComponentIndexBuilder()
         # adding a term adds its documents
-        builder.add_term("term1", [('r', '', 'revid'), ('r', '', 'other-revid')])
+        builder.add_term(("term1",), [('r', '', 'revid'),
+            ('r', '', 'other-revid')])
         doc_index = builder.document_index
         nodes = sorted(list(doc_index.iter_all_entries()))
         self.assertEqual([(doc_index, ("0",), "r  revid"),
@@ -267,7 +268,8 @@
         # and the term refers to document ids
         self.assertEqual(set(["0", "1"]), set(builder.posting_list("term1")))
         # adding a new term adds unique documents
-        builder.add_term("term2", [('r', '', 'revid'), ('r', '', 'third-revid')])
+        builder.add_term(("term2",), [('r', '', 'revid'),
+            ('r', '', 'third-revid')])
         nodes = sorted(list(doc_index.iter_all_entries()))
         # and refers to the correct ids
         self.assertEqual([(doc_index, ("0",), "r  revid"),
@@ -277,7 +279,8 @@
         self.assertEqual(set(["0", "2"]), set(builder.posting_list("term2")))
         # adding a term twice extends the posting list rather than replacing it
         # or erroring.
-        builder.add_term("term1", [('r', '', 'revid'), ('r', '', 'fourth-revid')])
+        builder.add_term(("term1",), [('r', '', 'revid'),
+            ('r', '', 'fourth-revid')])
         nodes = sorted(list(doc_index.iter_all_entries()))
         # and refers to the correct ids
         self.assertEqual([(doc_index, ("0",), "r  revid"),
@@ -309,16 +312,19 @@
         builder = index.ComponentIndexBuilder()
         builder.add_revision('rev1')
         builder.add_revision('rev-common')
-        builder.add_term("term1", [('r', '', 'rev1'), ('r', '', 'rev-common')])
-        builder.add_term("term-common", [('r', '', 'rev1'), ('r', '', 'rev-common')])
+        builder.add_term(("term1",), [('r', '', 'rev1'),
+            ('r', '', 'rev-common')])
+        builder.add_term(("term-common",), [('r', '', 'rev1'),
+            ('r', '', 'rev-common')])
         name, value, elements = builder.upload_index(transport)
         component1 = index.ComponentIndex(name, value, transport)
         components.append(component1)
         builder = index.ComponentIndexBuilder()
         builder.add_revision('rev-common')
         builder.add_revision('rev2')
-        builder.add_term("term-common", [('r', '', 'rev2'), ('r', '', 'rev-common')])
-        builder.add_term("term2", [('r', '', 'rev2'), ('r', '', 'other-revid')])
+        builder.add_term(("term-common",), [('r', '', 'rev2'),
+            ('r', '', 'rev-common')])
+        builder.add_term(("term2",), [('r', '', 'rev2'), ('r', '', 'other-revid')])
         name, value, elements = builder.upload_index(transport)
         component2 = index.ComponentIndex(name, value, transport)
         components.append(component2)
@@ -326,10 +332,10 @@
         name, value, elements = combiner.combine()
         combined = index.ComponentIndex(name, value, transport)
         terms = {}
-        terms['term-common'] = set([('r', '', 'rev-common'), ('r', '', 'rev1'),
-            ('r', '', 'rev2')])
-        terms['term1'] = set([('r', '', 'rev-common'), ('r', '', 'rev1')])
-        terms['term2'] = set([('r', '', 'other-revid'), ('r', '', 'rev2')])
+        terms[('term-common',)] = set([('r', '', 'rev-common'),
+            ('r', '', 'rev1'), ('r', '', 'rev2')])
+        terms[('term1',)] = set([('r', '', 'rev-common'), ('r', '', 'rev1')])
+        terms[('term2',)] = set([('r', '', 'other-revid'), ('r', '', 'rev2')])
         self.assertEqual(terms, combined.all_terms())
         self.assertEqual(set([('rev1',), ('rev2',), ('rev-common',)]),
             set(combined.indexed_revisions()))