Rev 9: Actually generate indices when 'bzr index URL' is run. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Sun Jun 8 15:37:17 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 9
revision-id: robertc at robertcollins.net-20080608143716-0eq1vm6b2zj8mx2t
parent: robertc at robertcollins.net-20080608133638-j5yazpedr3mlixju
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-06-09 00:37:16 +1000
message:
  Actually generate indices when 'bzr index URL' is run.
modified:
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_blackbox.py         test_blackbox.py-20080608052041-z5bahsl8kwl0uf4x-9
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'NEWS'
--- a/NEWS	2008-06-08 11:41:20 +0000
+++ b/NEWS	2008-06-08 14:37:16 +0000
@@ -13,8 +13,8 @@
 
   FEATURES:
 
-    * New command ``index`` to create a search index for a branch.
-      (Robert Collins)
+    * New command ``index`` to create a search index for a branch. This
+      indexes the revisions (commit message only). (Robert Collins)
 
     * New command ``search`` to search a search index from within bzr.
       (Robert Collins)

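For context, the blackbox tests further down drive the new command as a plain invocation on a branch path or URL (the companion ``search`` command from the same NEWS section then queries the resulting index):

    bzr index a-branch
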
=== modified file 'index.py'
--- a/index.py	2008-06-08 13:36:38 +0000
+++ b/index.py	2008-06-08 14:37:16 +0000
@@ -18,6 +18,7 @@
 """The core logic for search."""
 
 import md5
+import re
 
 from bzrlib import branch as _mod_branch
 from bzrlib.bzrdir import BzrDirMeta1
@@ -28,6 +29,16 @@
 
 
 _FORMAT_1 = 'bzr-search search folder 1\n'
+_tokeniser_re = None
+
+
+def _ensure_regexes():
+    global _tokeniser_re
+    if _tokeniser_re is None:
+        # NB: Perhaps we want to include non-ascii, or is there some unicode
+        # magic to generate good terms? (Known to be a hard problem, but this
+        # is sufficient for an indexer that may not even live a week!)
+        _tokeniser_re = re.compile("[^A-Za-z0-9_]")
 
 
 def init_index(branch):
@@ -59,11 +70,21 @@
     :param url: The url to index.
     :return: The resulting search index.
     """
+    branch = _mod_branch.Branch.open(url)
+    branch.lock_read()
     try:
-        index = open_index_url(url)
-    except errors.NoSearchIndex:
-        branch = _mod_branch.Branch.open(url)
-        index = init_index(branch)
+        try:
+            index = open_index_url(url)
+            # TODO: find the unindexed revisions by searching.
+            # This will index everything in the repo - bad.
+            revs_to_index = branch.repository.all_revision_ids()
+        except errors.NoSearchIndex:
+            index = init_index(branch)
+            # This will index everything in the repo - bad.
+            revs_to_index = branch.repository.all_revision_ids()
+        index.index_revisions(branch, revs_to_index)
+    finally:
+        branch.unlock()
     return index
 
 
@@ -107,20 +128,60 @@
         self._index = CombinedGraphIndex(self._indices)
         self._lock = LockDir(index_transport, 'names-lock')
 
+    def _add_terms(self, index, terms):
+        """Add a set of term posting lists to an in-progress index.
+
+        A term is a single index key suffix (e.g. 'first').
+        A posting list is an iterable of full index keys (e.g.
+        ('r', 'x', REVID) for a revision, or ('t', FILEID, REVID) for a file
+        text).
+
+        :param index: A GraphIndexBuilder.
+        :param terms: An iterable of term -> posting list.
+        """
+        for term, posting_list in terms:
+            # term, ignored, term-value
+            term_key = ('t', 'x', term)
+            index.add_node(term_key, '', (posting_list,))
+
+    def all_terms(self):
+        """Return an iterable of all the terms and their posting lists.
+
+        :return: An iterator of (term, posting list of document keys) pairs.
+        """
+        self._refresh_indices()
+        for node in self._index.iter_entries_prefix([('t', 'x', None)]):
+            term = node[1][2]
+            posting_list = node[3][0]
+            yield term, posting_list
+
     def index_revisions(self, branch, revisions_to_index):
         """Index some revisions from branch.
         
         :param branch: A branch to index.
         :param revisions_to_index: A set of revision ids to index.
         """
+        branch.lock_read()
+        try:
+            return self._index_revisions(branch, revisions_to_index)
+        finally:
+            branch.unlock()
+
+    def _index_revisions(self, locked_branch, revisions_to_index):
+        """Helper for index_revisions."""
         # TODO: split into groups of <reasonable memory size> for which we
         # then:
         # here: index texts
         # here: index inventory/paths
         # here: index revisions
+        _ensure_regexes()
         index = InMemoryGraphIndex(1, 3)
+        terms = self._terms_for_revs(locked_branch.repository,
+            revisions_to_index)
+        self._add_terms(index, terms)
         for rev_id in revisions_to_index:
-            index.add_node(('x', 'r', rev_id), '', ((),))
+            # ('r' = revision document, ignored, revision id)
+            index.add_node(('r', 'x', rev_id), '', ((),))
         # write to disc.
         index_bytes = index.finish().read()
         index_name = md5.new(index_bytes).hexdigest()
@@ -158,7 +219,7 @@
     def indexed_revisions(self):
         """Return the revision_keys that this index contains terms for."""
         self._refresh_indices()
-        for node in self._index.iter_entries_prefix([('x', 'r', None)]):
+        for node in self._index.iter_entries_prefix([('r', 'x', None)]):
             yield node[1][2:3]
 
     def _refresh_indices(self):
@@ -193,3 +254,30 @@
             self._indices.append(index)
         self._current_names = current_names
         self._orig_names = new_names
+
+    def _terms_for_revs(self, repository, revision_ids):
+        """Generate the posting list for the revision texts of revision_ids.
+
+        :param revision_ids: An iterable of revision_ids.
+        :return: An iterable of (term, posting_list) for the revision texts
+            (not the inventories or user texts) of revision_ids.
+        """
+        terms = {}
+        for revision in repository.get_revisions(revision_ids):
+            # it's a revision; the second component is ignored, the third is the id.
+            document_key = ('r', 'x', revision.revision_id)
+            # components of a revision:
+            # parents - not indexed (but we could)
+            # commit message (done)
+            # author (todo)
+            # committer (todo)
+            # properties (todo - names only?)
+            # bugfixes (a property we know how to read)
+            # other filters?
+            commit_terms = _tokeniser_re.split(revision.message)
+            for term in commit_terms:
+                if not term:
+                    continue
+                posting_list = terms.setdefault(term, set())
+                posting_list.add(document_key)
+        return terms.iteritems()

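To make the new index.py pieces above concrete, here is a standalone sketch of how the tokeniser and the ('r', 'x', revid) document keys combine into the term -> posting-list mapping that _terms_for_revs builds. The function name terms_for_message and the sample revision id are illustrative only, not part of this change.

    import re

    # Same pattern as _tokeniser_re above: anything outside [A-Za-z0-9_]
    # separates terms.
    _tokeniser_re = re.compile("[^A-Za-z0-9_]")

    def terms_for_message(revision_id, message):
        """Map each token of a commit message to its posting list."""
        # ('r', 'x', revision_id): 'r' marks a revision document, the second
        # component is ignored, the third is the revision id.
        document_key = ('r', 'x', revision_id)
        terms = {}
        for term in _tokeniser_re.split(message):
            if not term:
                # consecutive separators produce empty strings; skip them
                continue
            terms.setdefault(term, set()).add(document_key)
        return terms

    # terms_for_message('rev-1', 'first  post')
    # => {'first': set([('r', 'x', 'rev-1')]), 'post': set([('r', 'x', 'rev-1')])}
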
=== modified file 'tests/test_blackbox.py'
--- a/tests/test_blackbox.py	2008-06-08 06:42:53 +0000
+++ b/tests/test_blackbox.py	2008-06-08 14:37:16 +0000
@@ -17,6 +17,7 @@
 
 """Tests for the commands supplied by search."""
 
+from bzrlib.plugins.search.index import open_index_url
 from bzrlib.tests import TestCaseWithTransport
 
 
@@ -37,5 +38,14 @@
         self.assertEqual('', error)
         self.assertEqual('', out)
 
+    def test_index_branch_content(self):
+        tree = self.make_branch_and_tree('a-branch')
+        tree.commit('a search term')
+        out, error = self.run_bzr(['index', 'a-branch'])
+        self.assertEqual('', error)
+        self.assertEqual('', out)
+        self.assertSubset(set(['a', 'search', 'term']),
+            set(dict(open_index_url('a-branch').all_terms())))
+
     def test_index_no_branch(self):
         self.run_bzr_error(['Not a branch'], ['index', '.'])

=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-08 13:36:38 +0000
+++ b/tests/test_index.py	2008-06-08 14:37:16 +0000
@@ -75,6 +75,12 @@
         search_index = index.index_url(self.get_url('foo'))
         self.assertIsInstance(search_index, index.Index)
 
+    def test_index_url_does_index(self):
+        tree = self.make_branch_and_tree('foo')
+        revid = tree.commit('first post')
+        rev_index = index.index_url(self.get_url('foo'))
+        self.assertEqual(set([(revid,)]), set(rev_index.indexed_revisions()))
+
 
 class TestIndexRevisions(TestCaseWithTransport):
     """Tests for indexing of a set of revisions."""
@@ -82,9 +88,23 @@
     def test_empty_one_revision(self):
         tree = self.make_branch_and_tree('')
         rev_index = index.init_index(tree.branch)
-        revid = tree.commit('first post')
+        # The double-space is a cheap smoke test for the tokeniser.
+        revid = tree.commit('first  post')
         rev_index.index_revisions(tree.branch, [revid])
         self.assertEqual(set([(revid,)]), set(rev_index.indexed_revisions()))
         # reopen - it should retain the indexed revisions.
         rev_index = index.open_index_url('')
         self.assertEqual(set([(revid,)]), set(rev_index.indexed_revisions()))
+        # The term posting lists for a simple commit should be:
+        # The date (TODO, needs some thought on how to represent a date term)
+        # The committer name, email, commit message, bug fixes, properties
+        # paths present
+        # content of documents.
+        expected_terms = {
+            'first':set([('r', 'x', revid)]),
+            'post':set([('r', 'x', revid)]),
+            }
+        all_terms = {}
+        for term, posting_list in rev_index.all_terms():
+            all_terms[term] = set(posting_list)
+        self.assertEqual(expected_terms, all_terms)



