Rev 9: Actually generate indices when 'bzr index URL' is run. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sun Jun 8 15:37:17 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 9
revision-id: robertc at robertcollins.net-20080608143716-0eq1vm6b2zj8mx2t
parent: robertc at robertcollins.net-20080608133638-j5yazpedr3mlixju
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-06-09 00:37:16 +1000
message:
Actually generate indices when 'bzr index URL' is run.
modified:
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_blackbox.py test_blackbox.py-20080608052041-z5bahsl8kwl0uf4x-9
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'NEWS'
--- a/NEWS 2008-06-08 11:41:20 +0000
+++ b/NEWS 2008-06-08 14:37:16 +0000
@@ -13,8 +13,8 @@
FEATURES:
- * New command ``index`` to create a search index for a branch.
- (Robert Collins)
+ * New command ``index`` to create a search index for a branch. This
+ indexes the revisions (commit message only). (Robert Collins)
* New command ``search`` to search a search index from within bzr.
(Robert Collins)
=== modified file 'index.py'
--- a/index.py 2008-06-08 13:36:38 +0000
+++ b/index.py 2008-06-08 14:37:16 +0000
@@ -18,6 +18,7 @@
"""The core logic for search."""
import md5
+import re
from bzrlib import branch as _mod_branch
from bzrlib.bzrdir import BzrDirMeta1
@@ -28,6 +29,16 @@
_FORMAT_1 = 'bzr-search search folder 1\n'
+_tokeniser_re = None
+
+
+def _ensure_regexes():
+ global _tokeniser_re
+ if _tokeniser_re is None:
+ # NB: Perhaps we want to include non-ascii, or is there some unicode
+ # magic to generate good terms? (Known to be a hard problem, but this
+ # is sufficient for an indexer that may not even live a week!)
+ _tokeniser_re = re.compile("[^A-Za-z0-9_]")
def init_index(branch):
@@ -59,11 +70,21 @@
:param url: The url to index.
:return: The resulting search index.
"""
+ branch = _mod_branch.Branch.open(url)
+ branch.lock_read()
try:
- index = open_index_url(url)
- except errors.NoSearchIndex:
- branch = _mod_branch.Branch.open(url)
- index = init_index(branch)
+ try:
+ index = open_index_url(url)
+ # TODO: find the unindexed revisions by searching.
+ # This will index everything in the repo - bad.
+ revs_to_index = branch.repository.all_revision_ids()
+ except errors.NoSearchIndex:
+ index = init_index(branch)
+ # This will index everything in the repo - bad.
+ revs_to_index = branch.repository.all_revision_ids()
+ index.index_revisions(branch, revs_to_index)
+ finally:
+ branch.unlock()
return index
@@ -107,20 +128,60 @@
self._index = CombinedGraphIndex(self._indices)
self._lock = LockDir(index_transport, 'names-lock')
+ def _add_terms(self, index, terms):
+ """Add a set of term posting lists to an in-progress index.
+
+ A term is a single index key suffix (e.g. 'first').
+ A posting list is an iterable of full index keys (e.g.
+ ('r', 'x', REVID) for a revision, or ('t', FILEID, REVID) for a file
+ text.)
+
+ :param index: A GraphIndexBuilder.
+ :param terms: An iterable of term -> posting list.
+ """
+ for term, posting_list in terms:
+ # term, ignored, term-value
+ term_key = ('t', 'x', term)
+ index.add_node(term_key, '', (posting_list,))
+
+ def all_terms(self):
+ """Return an iterable of all the posting lists in the index.
+
+ :return: An iterator of (term -> document ids).
+ """
+ self._refresh_indices()
+ for node in self._index.iter_entries_prefix([('t', 'x', None)]):
+ term = node[1][2]
+ posting_list = node[3][0]
+ yield term, posting_list
+
def index_revisions(self, branch, revisions_to_index):
"""Index some revisions from branch.
:param branch: A branch to index.
:param revisions_to_index: A set of revision ids to index.
"""
+ branch.lock_read()
+ try:
+ return self._index_revisions(branch, revisions_to_index)
+ finally:
+ branch.unlock()
+
+ def _index_revisions(self, locked_branch, revisions_to_index):
+ """Helper for index_revisions."""
# TODO: split into groups of <reasonable memory size> for which we
# then:
# here: index texts
# here: index inventory/paths
# here: index revisions
+ _ensure_regexes()
index = InMemoryGraphIndex(1, 3)
+ terms = self._terms_for_revs(locked_branch.repository,
+ revisions_to_index)
+ self._add_terms(index, terms)
for rev_id in revisions_to_index:
- index.add_node(('x', 'r', rev_id), '', ((),))
+ # (metadata field, revision, id)
+ index.add_node(('r', 'x', rev_id), '', ((),))
# write to disc.
index_bytes = index.finish().read()
index_name = md5.new(index_bytes).hexdigest()
@@ -158,7 +219,7 @@
def indexed_revisions(self):
"""Return the revision_keys that this index contains terms for."""
self._refresh_indices()
- for node in self._index.iter_entries_prefix([('x', 'r', None)]):
+ for node in self._index.iter_entries_prefix([('r', 'x', None)]):
yield node[1][2:3]
def _refresh_indices(self):
@@ -193,3 +254,30 @@
self._indices.append(index)
self._current_names = current_names
self._orig_names = new_names
+
+ def _terms_for_revs(self, repository, revision_ids):
+ """Generate the posting list for the revision texts of revision_ids.
+
+ :param revision_ids: An iterable of revision_ids.
+ :return: An iterable of (term, posting_list) for the revision texts
+ (not the inventories or user texts) of revision_ids.
+ """
+ terms = {}
+ for revision in repository.get_revisions(revision_ids):
+ # it's a revision; the second component is ignored, the third is the id.
+ document_key = ('r', 'x', revision.revision_id)
+ # components of a revision:
+ # parents - not indexed (but we could)
+ # commit message (done)
+ # author (todo)
+ # committer (todo)
+ # properties (todo - names only?)
+ # bugfixes (a property we know how to read)
+ # other filters?
+ commit_terms = _tokeniser_re.split(revision.message)
+ for term in commit_terms:
+ if not term:
+ continue
+ posting_list = terms.setdefault(term,set())
+ posting_list.add(document_key)
+ return terms.iteritems()
=== modified file 'tests/test_blackbox.py'
--- a/tests/test_blackbox.py 2008-06-08 06:42:53 +0000
+++ b/tests/test_blackbox.py 2008-06-08 14:37:16 +0000
@@ -17,6 +17,7 @@
"""Tests for the commands supplied by search."""
+from bzrlib.plugins.search.index import open_index_url
from bzrlib.tests import TestCaseWithTransport
@@ -37,5 +38,14 @@
self.assertEqual('', error)
self.assertEqual('', out)
+ def test_index_branch_content(self):
+ tree = self.make_branch_and_tree('a-branch')
+ tree.commit('a search term')
+ out, error = self.run_bzr(['index', 'a-branch'])
+ self.assertEqual('', error)
+ self.assertEqual('', out)
+ self.assertSubset(set(['a', 'search', 'term']),
+ set(dict(open_index_url('a-branch').all_terms())))
+
def test_index_no_branch(self):
self.run_bzr_error(['Not a branch'], ['index', '.'])
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-08 13:36:38 +0000
+++ b/tests/test_index.py 2008-06-08 14:37:16 +0000
@@ -75,6 +75,12 @@
search_index = index.index_url(self.get_url('foo'))
self.assertIsInstance(search_index, index.Index)
+ def test_index_url_does_index(self):
+ tree = self.make_branch_and_tree('foo')
+ revid = tree.commit('first post')
+ rev_index = index.index_url(self.get_url('foo'))
+ self.assertEqual(set([(revid,)]), set(rev_index.indexed_revisions()))
+
class TestIndexRevisions(TestCaseWithTransport):
"""Tests for indexing of a set of revisions."""
@@ -82,9 +88,23 @@
def test_empty_one_revision(self):
tree = self.make_branch_and_tree('')
rev_index = index.init_index(tree.branch)
- revid = tree.commit('first post')
+ # The double-space is a cheap smoke test for the tokeniser.
+ revid = tree.commit('first post')
rev_index.index_revisions(tree.branch, [revid])
self.assertEqual(set([(revid,)]), set(rev_index.indexed_revisions()))
# reopen - it should retain the indexed revisions.
rev_index = index.open_index_url('')
self.assertEqual(set([(revid,)]), set(rev_index.indexed_revisions()))
+ # The terms posting-lists for a simple commit should be:
+ # The date (TODO, needs some thought on how to represent a date term)
+ # The committer name, email, commit message, bug fixes, properties
+ # paths present
+ # content of documents.
+ expected_terms = {
+ 'first':set([('r', 'x', revid)]),
+ 'post':set([('r', 'x', revid)]),
+ }
+ all_terms = {}
+ for term, posting_list in rev_index.all_terms():
+ all_terms[term] = set(posting_list)
+ self.assertEqual(expected_terms, all_terms)
More information about the bazaar-commits
mailing list