Rev 33: New format bump, index the path of found documents to allow presenting the path on file text hits. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Sat Jun 14 02:41:49 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 33
revision-id: robertc at robertcollins.net-20080614014142-qrfj2rgsqeqjdxv5
parent: robertc at robertcollins.net-20080613114812-oi86vamc6qh8tkt4
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-14 11:41:42 +1000
message:
  New format bump, index the path of found documents to allow presenting the path on file text hits.
modified:
  DESIGN                         design-20080608072426-vjoj110dtykfyb7g-1
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
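
The heart of this change, before the diff: paths become first-class documents in the index. Every file-text posting ('f', file_id, revision_id) now gets a companion two-element term (file_id, revision_id) whose posting list holds a single path document ('p', '', path), so a file text hit can be resolved to a display path with one extra term lookup. A minimal illustrative sketch of that relationship, using plain dicts rather than the plugin's actual classes:

    # Sketch only: model the new path terms with a dict. In the real
    # index these live in a GraphIndex with two-element keys.
    path_terms = {
        ('an-id', 'rev-1'): [('p', '', 'README.txt')],
    }

    def display_name(text_key):
        """Resolve a file-text hit to "path in revision" for display."""
        doc_type, _, path = path_terms[text_key][0]
        assert doc_type == 'p'
        return "%s in revision '%s'." % (path, text_key[1])

    print(display_name(('an-id', 'rev-1')))
    # README.txt in revision 'rev-1'.
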
=== modified file 'DESIGN'
--- a/DESIGN	2008-06-13 11:48:12 +0000
+++ b/DESIGN	2008-06-14 01:41:42 +0000
@@ -47,7 +47,7 @@
 
  * Mapping fileid, revisionid search hits back to paths for display during hits
    on documents. This appears to need a concordance of the entire inventory
-   state.
+   state for every file hit we can produce.
  * Searching for a path ("show me the file README.txt")
  * Accelerating per-file log (by identifying path-change events which the 
    file graph index elides, and-or
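
The DESIGN note above observes that mapping (fileid, revisionid) hits back to paths needs inventory state for every file hit. The indexing code below pays that cost once, at index-build time: it batches the revisions that were hit, reads each inventory once via repository.iter_inventories, and asks inventory.id2path for each hit file id. A hedged distillation of that loop (iter_inventories and id2path are the bzrlib calls the diff itself uses; the scaffolding here is simplified):

    def path_terms_for(repository, revision_to_fileids):
        """Yield ((file_id, revision_id), [('p', '', path)]) pairs.

        revision_to_fileids maps revision_id -> set of hit file_ids.
        """
        terms = {}
        for inventory in repository.iter_inventories(
                list(revision_to_fileids)):
            revision_id = inventory.revision_id
            for file_id in revision_to_fileids[revision_id]:
                # Each inventory is fetched and consulted only once
                # per revision, however many files were hit in it.
                terms[(file_id, revision_id)] = \
                    [('p', '', inventory.id2path(file_id))]
        return terms.iteritems()
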

=== modified file 'index.py'
--- a/index.py	2008-06-13 11:48:12 +0000
+++ b/index.py	2008-06-14 01:41:42 +0000
@@ -17,6 +17,7 @@
 
 """The core logic for search."""
 
+from itertools import chain
 import math
 import md5
 import re
@@ -149,7 +150,6 @@
         self._orig_names = {}
         self._current_names = {}
         self._revision_indices = []
-        self._term_indices = []
         self._term_doc_indices = {}
         self._revision_index = CombinedGraphIndex(self._revision_indices)
         # because terms may occur in many component indices, we don't use a 
@@ -251,6 +251,9 @@
         graph = locked_branch.repository.get_graph()
         parent_map = graph.get_parent_map(revisions_to_index)
         order = topo_sort(parent_map)
+        order_dict = {}
+        for pos, revid in enumerate(order):
+            order_dict[revid] = pos
         # 5000 uses 1GB on a mysql tree.
         group_size = 2500
         for offset in range(len(order) / group_size + 1):
@@ -262,6 +265,9 @@
             terms = self._terms_for_texts(locked_branch.repository,
                 revision_group)
             self._add_terms(builder, terms)
+            terms = self._terms_for_file_terms(locked_branch.repository,
+                terms, order_dict)
+            self._add_terms(builder, terms)
             terms = self._terms_for_revs(locked_branch.repository,
                 revision_group)
             self._add_terms(builder, terms)
@@ -345,8 +351,7 @@
         """Add an index (with meta-value value) to the in-memory index list."""
         self._current_names[name] = (value, index)
         self._revision_indices.append(index.revision_index)
-        self._term_indices.append(index.term_index)
-        self._term_doc_indices[index.term_index] = index.document_index
+        self._term_doc_indices[index] = index.document_index
 
     def indexed_revisions(self):
         """Return the revision_keys that this index contains terms for."""
@@ -394,8 +399,7 @@
     def _remove_component_from_memory(self, name):
         """Remove the component name from the index list in memory."""
         index = self._current_names[name][1]
-        del self._term_doc_indices[index.term_index]
-        self._term_indices.remove(index.term_index)
+        del self._term_doc_indices[index]
         self._revision_indices.remove(index.revision_index)
         del self._current_names[name]
 
@@ -409,20 +413,25 @@
         self._refresh_indices()
         found_documents = []
         # Use a set to remove duplicates
-        term_keys = set(termlist)
-        if len(term_keys) == 0:
+        termlist = set(termlist)
+        term_keys = [None, set(), set()]
+        for term in termlist:
+            term_keys[len(term)].add(term)
+        if 0 == (len(term_keys[1]) + len(term_keys[2])):
             return
+
         for value, component in self._current_names.values():
             term_index = component.term_index
             # TODO: push into Component
             # TODO: use a dequeue?
             term_info = []
-            for node in term_index.iter_entries(term_keys):
+            for node in chain(term_index.iter_entries(term_keys[1]),
+                component.term_2_index.iter_entries(term_keys[2])):
                 term_id, posting_count, posting_start, posting_length = \
                     node[2].split(" ")
                 term_info.append((int(posting_count), term_id,
                     int(posting_start), int(posting_length)))
-            if len(term_info) != len(term_keys):
+            if len(term_info) != len(termlist):
                 # One or more terms missing - no hits are possible.
                 continue
             # load the first document list: 
@@ -448,18 +457,60 @@
                 common_doc_keys = set([node[1] for node in
                     post_index.iter_entries(common_doc_keys)])
             common_doc_ids = [key[0] for key in common_doc_keys]
-            found_documents = [(term_index, doc_id) for doc_id in
+            found_documents = [(component, doc_id) for doc_id in
                 common_doc_ids]
             for doc_key in self._document_ids_to_keys(found_documents):
                 if doc_key[0] == 'f':
                     # file text
-                    yield FileTextHit(self._branch.repository, doc_key[1:3])
+                    yield FileTextHit(self, self._branch.repository,
+                        doc_key[1:3])
                 elif doc_key[0] == 'r':
                     # revision
                     yield RevisionHit(self._branch.repository, doc_key[2:3])
+                elif doc_key[0] == 'p':
+                    # path
+                    yield PathHit(doc_key[2])
                 else:
                     raise Exception("unknown doc type %r" % (doc_key,))
 
+    def _terms_for_file_terms(self, repository, file_terms, order_dict):
+        """Generate terms for the path of every file_id, revision_id in terms.
+
+        :param repository: The repository to access inventories from.
+        :param file_terms: Text terms that have been inserted.
+        :param order_dict: A mapping from revision id to order from the
+            topological sort prepared for the indexing operation.
+        :return: An iterable of (term, posting_list) for the file_id,
+            revision_id pairs mentioned in terms.
+        """
+        terms = {}
+        # What revisions do we need inventories for:
+        revision_ids = {}
+        for term, posting_list in file_terms:
+            for post in posting_list:
+                if post[0] != 'f':
+                    raise ValueError("Unknown post type for %r" % post)
+                fileids = revision_ids.setdefault(post[2], set())
+                fileids.add(post[1])
+        order = list(revision_ids)
+        order.sort(key=lambda revid: order_dict[revid])
+        group_size = 100
+        for offset in range(len(order) / group_size + 1):
+            inventory_group = order[offset * group_size:(offset + 1) * group_size]
+            #texts = repository.get_inventory_weave().get_texts(inventory_group)
+            #for text in inventory_group:
+            #    pass
+            #            for text, revision_id in zip(texts, revision_ids):
+            # For VersionedFiles:
+            # for xml in repository._iter_inventory_xmls(inventory_group):
+            #     pass
+            for inventory in repository.iter_inventories(inventory_group):
+                revision_id = inventory.revision_id
+                for file_id in revision_ids[revision_id]:
+                    path = inventory.id2path(file_id)
+                    terms[(file_id, revision_id)] = [('p', '', path)]
+        return terms.iteritems()
+
     def _terms_for_revs(self, repository, revision_ids):
         """Generate the posting list for the revision texts of revision_ids.
 
@@ -515,31 +566,54 @@
                         continue
                     posting_list = terms.setdefault((term,), set())
                     posting_list.add(document_key)
-        return terms.iteritems()
+        return terms.items()
 
 
 class FileTextHit(object):
     """A match found during a search in a file text."""
 
-    def __init__(self, repository, text_key):
+    def __init__(self, index, repository, text_key):
         """Create a FileTextHit.
 
+        :param index: The index the search result is from, to look up the path
+            of the hit.
         :param repository: A repository to extract revisions from.
         :param text_key: The text_key that was hit.
         """
+        self.index = index
         self.repository = repository
         self.text_key = text_key
 
     def document_name(self):
         """The name of the document found, for human consumption."""
         # Perhaps need to utf_decode this?
-        return "File id '%s', revision '%s'." % self.text_key
+        path = self.index.search((self.text_key,)).next()
+        return "%s in revision '%s'." % (path.document_name(), self.text_key[1])
 
     def summary(self):
         """Get a summary of the hit, for display to users."""
         return "No summaries yet."
 
 
+class PathHit(object):
+    """A match found during a search in a file path."""
+
+    def __init__(self, path_utf8):
+        """Create a PathHit.
+
+        :param path_utf8: The path (utf8 encoded).
+        """
+        self.path_utf8 = path_utf8
+
+    def document_name(self):
+        """The name of the document found, for human consumption."""
+        return self.path_utf8.decode("utf8")
+
+    def summary(self):
+        """Get a summary of the hit."""
+        return self.document_name()
+
+
 class RevisionHit(object):
     """A match found during a search in a revision object."""
 
@@ -588,13 +662,16 @@
             "revisions": (lengths[0], lengths[0] + lengths[1]),
             "terms": (lengths[2], lengths[2] + lengths[3]),
             "documents": (lengths[4], lengths[4] + lengths[5]),
+            "terms_2": (lengths[6], lengths[6] + lengths[7]),
             }
         view = FileView(transport, name + '.pack', filemap)
         rev_index = GraphIndex(view, "revisions", lengths[1])
         term_index = GraphIndex(view, "terms", lengths[3])
+        term_2_index = GraphIndex(view, "terms_2", lengths[7])
         doc_index = GraphIndex(view, "documents", lengths[5])
         self.revision_index = rev_index
         self.term_index = term_index
+        self.term_2_index = term_2_index
         self.document_index = doc_index
         self.name = name
         self.transport = transport
@@ -602,7 +679,8 @@
     def all_terms(self):
         """As per Index, but for a single component."""
         result = {}
-        for node in self.term_index.iter_all_entries():
+        for node in chain(self.term_index.iter_all_entries(),
+            self.term_2_index.iter_all_entries()):
             # XXX: Duplicated logic with search().
             term = node[1]
             term_id, posting_count, posting_start, posting_length = \
@@ -717,7 +795,7 @@
 
         Unindexed terms return an empty iterator.
         """
-        term_id = self.term_id((term,))
+        term_id = self.term_id(term)
         if term_id is None:
             return []
         else:
@@ -754,7 +832,9 @@
         del index_bytes
         # generate a new term index with the length of the serialised posting
         # lists.
-        term_index = InMemoryGraphIndex(0, 1)
+        term_indices = {}
+        term_indices[1] = InMemoryGraphIndex(0, 1)
+        term_indices[2] = InMemoryGraphIndex(0, 2)
         for term, term_id in self.terms.iteritems():
             posting_list = self.posting_lists[term_id]
             post_index = InMemoryGraphIndex(0, 1)
@@ -771,14 +851,18 @@
             # read the pack later.
             term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
                 length)
-            term_index.add_node(term, term_value, ())
-        term_start, term_length = self._add_index_to_pack(term_index, "terms", writer)
+            term_indices[len(term)].add_node(term, term_value, ())
+        term_start, term_length = self._add_index_to_pack(term_indices[1],
+            "terms", writer)
+        term_2_start, term_2_length = self._add_index_to_pack(term_indices[2],
+            "terms2", writer)
         doc_start, doc_length = self._add_index_to_pack(self.document_index,
             "documents", writer)
         writer.end()
         write_stream.close()
-        index_value = "%d %d %d %d %d %d" % (rev_start, rev_length, term_start,
-            term_length, doc_start, doc_length)
+        index_value = "%d %d %d %d %d %d %d %d" % (rev_start, rev_length,
+            term_start, term_length, doc_start, doc_length, term_2_start,
+            term_2_length)
         elements = [index_name + ".pack"]
         return index_name, index_value, elements
 
@@ -828,7 +912,9 @@
         and self.terms to determine what to copy from.
         It populates self.term_index as it progresses.
         """
-        term_index = InMemoryGraphIndex(0, 1)
+        term_indices = {1:InMemoryGraphIndex(0, 1),
+            2:InMemoryGraphIndex(0, 2)
+            }
         for term, posting_lists in self.terms.iteritems():
             posting_list = set()
             for component, posting_line in posting_lists:
@@ -848,15 +934,16 @@
             post_index = InMemoryGraphIndex(0, 1)
             for doc_id in posting_list:
                 post_index.add_node((doc_id,), '', ())
-            term_id = str(term_index.key_count())
+            term_id = str(term_indices[1].key_count() +
+                term_indices[2].key_count())
             start, length = self._add_index_to_pack(
                 post_index, "term_list." + term_id, self.writer)
             # How many document ids, and the range for the file view when we
             # read the pack later.
             term_value = "%s %d %d %d" % (term_id, len(posting_list), start,
                 length)
-            term_index.add_node(term, term_value, ())
-        self.term_index = term_index
+            term_indices[len(term)].add_node(term, term_value, ())
+        self.term_indices = term_indices
         # Clear out used objects
         del self.terms
         del self.component_docids
@@ -892,11 +979,14 @@
         self._scan_terms()
         self._copy_posting_lists()
         self.term_start, self.term_length = self._add_index_to_pack(
-            self.term_index, "terms", self.writer)
+            self.term_indices[1], "terms", self.writer)
+        self.term_2_start, self.term_2_length = self._add_index_to_pack(
+            self.term_indices[2], "terms2", self.writer)
         self.writer.end()
         self.write_stream.close()
-        index_value = "%d %d %d %d %d %d" % (self.rev_start, self.rev_length,
-            self.term_start, self.term_length, self.doc_start, self.doc_length)
+        index_value = "%d %d %d %d %d %d %d %d" % (self.rev_start,
+            self.rev_length, self.term_start, self.term_length, self.doc_start,
+            self.doc_length, self.term_2_start, self.term_2_length)
         elements = [self.index_name + ".pack"]
         return self.index_name, index_value, elements
 
@@ -904,7 +994,8 @@
         """Scan the terms in all components to prepare to copy posting lists."""
         self.terms = {}
         for component in self.components:
-            for node in component.term_index.iter_all_entries():
+            for node in chain(component.term_index.iter_all_entries(),
+                    component.term_2_index.iter_all_entries()):
                 term = node[1]
                 posting_info = node[2]
                 term_set = self.terms.setdefault(term, set())
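
One detail worth calling out from the index.py changes above: bzrlib GraphIndex files have a fixed key width, so single-element terms like ("file",) and the new two-element (file_id, revision_id) path terms cannot share one index. That is the format bump: each pack gains a second "terms2" index, the pack header value grows from six integers to eight, and the builders dispatch on len(term). A minimal sketch of that dispatch, assuming only the InMemoryGraphIndex behaviour the diff relies on:

    from bzrlib.index import InMemoryGraphIndex

    term_indices = {
        1: InMemoryGraphIndex(0, 1),  # ordinary text/revision terms
        2: InMemoryGraphIndex(0, 2),  # (file_id, revision_id) path terms
    }
    # The term_value strings ("id count start length") are placeholders.
    for term, term_value in [(("file",), "0 1 0 10"),
                             (("an-id", "rev-1"), "1 1 10 12")]:
        term_indices[len(term)].add_node(term, term_value, ())
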

=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-13 11:48:12 +0000
+++ b/tests/test_index.py	2008-06-14 01:41:42 +0000
@@ -111,6 +111,8 @@
         # 3 should map to 2 packs, as should 4 (but with 2 discard)
         # To test: we create four revisions:
         tree = self.make_branch_and_tree('foo')
+        tree.add(['README.txt'], ['an-id'], ['file'])
+        tree.put_file_bytes_non_atomic('an-id', "file")
         revid1 = tree.commit('1')
         revid2 = tree.commit('2')
         revid3 = tree.commit('3')
@@ -150,6 +152,16 @@
         self.assertEqual(2, len(obsolete_names))
         new_names = set(get_names()) - set(names)
         self.assertEqual(1, len(new_names))
+        self.assertEqual({
+            ("1",):set([('r', '', revid1)]),
+            ("2",):set([('r', '', revid2)]),
+            ("3",):set([('r', '', revid3)]),
+            ("4",):set([('r', '', revid4)]),
+            ('an-id', revid1):set([('p', '', 'README.txt')]),
+            ("file",):set([('f', 'an-id', revid1)]),
+            }, dict(rev_index.all_terms()))
+        self.assertEqual(set([(revid1,), (revid2,), (revid3,), (revid4,)]),
+            set(rev_index.indexed_revisions()))
 
 
 class TestIndexRevisions(TestCaseWithTransport):
@@ -186,6 +198,7 @@
             ("this",): set([('f', 'an-id', revid)]),
             ("working",): set([('f', 'an-id', revid)]),
             ("tree",): set([('f', 'an-id', revid)]),
+            ('an-id', revid): set([('p', '', 'README.txt')]),
             }
         all_terms = {}
         for term, posting_list in rev_index.all_terms():
@@ -218,16 +231,19 @@
 
     def test_TextHit(self):
         tree = self.make_branch_and_tree('tree')
+        search_index = index.init_index(tree.branch)
         tree.add(['README.txt'], ['an-id'], ['file'])
         tree.put_file_bytes_non_atomic('an-id',
             "This is the first commit to this working tree.\n"
             )
         rev_id1 = tree.commit('commit')
-        result = index.FileTextHit(tree.branch.repository, ('an-id', rev_id1))
+        search_index.index_branch(tree.branch, rev_id1)
+        result = index.FileTextHit(search_index, tree.branch.repository,
+            ('an-id', rev_id1))
         tree.lock_read()
         self.addCleanup(tree.unlock)
         self.assertEqualDiff(
-            u"File id '%s', revision '%s'." % ('an-id', rev_id1),
+            u"README.txt in revision '%s'." % (rev_id1),
             result.document_name())
         self.assertEqual(('an-id', rev_id1), result.text_key)
         self.assertEqual('No summaries yet.', result.summary())
@@ -266,7 +282,7 @@
         self.assertEqual([(doc_index, ("0",), "r  revid"),
             (doc_index, ("1",), "r  other-revid")], nodes)
         # and the term refers to document ids
-        self.assertEqual(set(["0", "1"]), set(builder.posting_list("term1")))
+        self.assertEqual(set(["0", "1"]), set(builder.posting_list(("term1",))))
         # adding a new term adds unique documents
         builder.add_term(("term2",), [('r', '', 'revid'),
             ('r', '', 'third-revid')])
@@ -275,8 +291,8 @@
         self.assertEqual([(doc_index, ("0",), "r  revid"),
             (doc_index, ("1",), "r  other-revid"),
             (doc_index, ("2",), "r  third-revid")], nodes)
-        self.assertEqual(set(["0", "1"]), set(builder.posting_list("term1")))
-        self.assertEqual(set(["0", "2"]), set(builder.posting_list("term2")))
+        self.assertEqual(set(["0", "1"]), set(builder.posting_list(("term1",))))
+        self.assertEqual(set(["0", "2"]), set(builder.posting_list(("term2",))))
         # adding a term twice extends the posting list rather than replacing it
         # or erroring.
         builder.add_term(("term1",), [('r', '', 'revid'),
@@ -288,8 +304,52 @@
             (doc_index, ("2",), "r  third-revid"),
             (doc_index, ("3",), "r  fourth-revid"),
             ], nodes)
-        self.assertEqual(set(["0", "1", "3"]), set(builder.posting_list("term1")))
-        self.assertEqual(set(["0", "2"]), set(builder.posting_list("term2")))
+        self.assertEqual(set(["0", "1", "3"]),
+            set(builder.posting_list(("term1",))))
+        self.assertEqual(set(["0", "2"]), set(builder.posting_list(("term2",))))
+
+    def test_2_term_posting_list(self):
+        builder = index.ComponentIndexBuilder()
+        # adding a term adds its documents
+        builder.add_term(("term1", "term12"), [('r', '', 'revid'),
+            ('r', '', 'other-revid')])
+        doc_index = builder.document_index
+        nodes = sorted(list(doc_index.iter_all_entries()))
+        self.assertEqual([(doc_index, ("0",), "r  revid"),
+            (doc_index, ("1",), "r  other-revid")], nodes)
+        # and the term refers to document ids
+        self.assertEqual(set(["0", "1"]),
+            set(builder.posting_list(("term1", "term12"))))
+        # adding a new term adds unique documents
+        builder.add_term(("term2", "term12"), [('r', '', 'revid'),
+            ('r', '', 'third-revid')])
+        nodes = sorted(list(doc_index.iter_all_entries()))
+        # and refers to the correct ids
+        self.assertEqual([(doc_index, ("0",), "r  revid"),
+            (doc_index, ("1",), "r  other-revid"),
+            (doc_index, ("2",), "r  third-revid")], nodes)
+        self.assertEqual(set(["0", "1"]),
+            set(builder.posting_list(("term1", "term12"))))
+        self.assertEqual(set(["0", "2"]),
+            set(builder.posting_list(("term2", "term12"))))
+        # adding a term twice extends the posting list rather than replacing it
+        # or erroring.
+        builder.add_term(("term1", "term12"), [('r', '', 'revid'),
+            ('r', '', 'fourth-revid')])
+        nodes = sorted(list(doc_index.iter_all_entries()))
+        # and refers to the correct ids
+        self.assertEqual([(doc_index, ("0",), "r  revid"),
+            (doc_index, ("1",), "r  other-revid"),
+            (doc_index, ("2",), "r  third-revid"),
+            (doc_index, ("3",), "r  fourth-revid"),
+            ], nodes)
+        self.assertEqual(set(["0", "1", "3"]),
+            set(builder.posting_list(("term1", "term12"))))
+        self.assertEqual(set(["0", "2"]),
+            set(builder.posting_list(("term2", "term12"))))
+        # Single-element terms are not erroneously being used
+        self.assertEqual(set(), set(builder.posting_list(("term1",))))
+        self.assertEqual(set(), set(builder.posting_list(("term2",))))
 
     def test_add_revision(self):
         builder = index.ComponentIndexBuilder()
@@ -316,6 +376,7 @@
             ('r', '', 'rev-common')])
         builder.add_term(("term-common",), [('r', '', 'rev1'),
             ('r', '', 'rev-common')])
+        builder.add_term(("term", "complex"), [('f', 'foo', 'rev1')])
         name, value, elements = builder.upload_index(transport)
         component1 = index.ComponentIndex(name, value, transport)
         components.append(component1)
@@ -336,6 +397,7 @@
             ('r', '', 'rev1'), ('r', '', 'rev2')])
         terms[('term1',)] = set([('r', '', 'rev-common'), ('r', '', 'rev1')])
         terms[('term2',)] = set([('r', '', 'other-revid'), ('r', '', 'rev2')])
+        terms[('term', 'complex')] = set([('f', 'foo', 'rev1')])
         self.assertEqual(terms, combined.all_terms())
         self.assertEqual(set([('rev1',), ('rev2',), ('rev-common',)]),
             set(combined.indexed_revisions()))
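
Putting it together, the tests above imply the following end-to-end flow. A hedged usage sketch, assuming the plugin is importable as bzrlib.plugins.search and that init_index, index_branch and search keep the signatures the tests exercise:

    from bzrlib.branch import Branch
    from bzrlib.plugins.search import index

    b = Branch.open('foo')
    search_index = index.init_index(b)
    search_index.index_branch(b, b.last_revision())
    for hit in search_index.search([("file",)]):
        # FileTextHit.document_name() now reports the path, e.g.
        # "README.txt in revision '<revid>'."
        print(hit.document_name())
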



