Rev 21: Index file content (although in a fairly crude manner). in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Wed Jun 11 09:29:55 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 21
revision-id: robertc at robertcollins.net-20080611082950-3abaodt5wpm4c5ac
parent: robertc at robertcollins.net-20080611051632-qqi4nd9g1500702j
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-06-11 18:29:50 +1000
message:
Index file content (although in a fairly crude manner).
modified:
BUGS bugs-20080609101902-m23i5z2ojdgkeyof-1
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'BUGS'
--- a/BUGS 2008-06-11 05:16:32 +0000
+++ b/BUGS 2008-06-11 08:29:50 +0000
@@ -3,6 +3,9 @@
 Some key caveats though (not bugs per se):
- - scaling: The current disk format creates thousands of small files on disk.
-   This may impede performance on some file systems. When these are put into
-   a single pack this will be rectified.
+ - disk scaling: The current disk format creates many thousands of small files
+   on disk. This may impede performance on some file systems. When these are
+   put into a single pack this will be rectified.
+ - memory scaling: Full text indexing currently requires very large amounts of
+   memory. Indexing the history of 'bzr' takes nearly 600MB (revno 3494).
+   Larger trees are exceedingly likely to require as much or more.
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-11 05:16:32 +0000
+++ b/DESIGN 2008-06-11 08:29:50 +0000
@@ -6,6 +6,10 @@
I plan to use a single namespace for all documents: revisions, inventories and
file texts, branch tags etc.
+Currently this is:
+('r', '', revision_id)
+('f', file_id, revision_id)
+
What to index
+++++++++++++
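
For illustration only, here is a minimal sketch of how the two key forms above
can be built and told apart. The helper names (revision_document_key,
file_text_document_key, describe) are invented for this example and are not
part of the plugin:

    def revision_document_key(revision_id):
        # Revisions use an empty second component.
        return ('r', '', revision_id)

    def file_text_document_key(file_id, revision_id):
        # File texts carry the file id in the second component.
        return ('f', file_id, revision_id)

    def describe(doc_key):
        # Dispatch on the key prefix, as the search-hit code does.
        if doc_key[0] == 'r':
            return "revision %s" % doc_key[2]
        elif doc_key[0] == 'f':
            return "file %s in revision %s" % (doc_key[1], doc_key[2])
        raise ValueError("unknown doc type %r" % (doc_key,))

    print(describe(revision_document_key('rev-1')))
    print(describe(file_text_document_key('README-id', 'rev-1')))

The prefix dispatch mirrors the doc_key handling added to index.py further
down in this change.
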
@@ -20,6 +24,11 @@
some care to avoid either being pathologically slow, or blowing out memory on
large trees.
+The current file text indexing solution uses iter_lines_added_or_present_in -
+which means that either more or fewer hits than are interesting (depending on
+what you consider interesting) are returned. Indexing bzr.dev spikes at nearly
+600MB. Indexing very large trees will likely thrash.
+
Indexing non-text
=================
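
Stripped of the bzrlib plumbing, the shape of that file-text indexing pass is
roughly the sketch below. Assumptions: a plain \W+ split stands in for the
plugin's _tokeniser_re, a hand-written list of (line, version) pairs stands in
for the iter_lines_added_or_present_in_versions output, and terms_for_lines is
an invented name:

    import re

    _TOKENISER = re.compile(r'\W+')

    def terms_for_lines(file_id, line_version_pairs):
        # Map each token to the set of ('f', file_id, revision) document keys.
        terms = {}
        for line, version in line_version_pairs:
            document_key = ('f', file_id, version)
            for term in _TOKENISER.split(line):
                if not term:
                    continue
                terms.setdefault(term, set()).add(document_key)
        return terms

    sample = [("first  post\n", 'rev-1'), ("second post\n", 'rev-2')]
    for term, posting_list in sorted(terms_for_lines('an-id', sample).items()):
        print("%s -> %s" % (term, sorted(posting_list)))

Every term ends up with a posting list of document keys; the _terms_for_texts
method added to index.py below builds the same kind of mapping, per file id,
before handing it to the index builder.
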
=== modified file 'NEWS'
--- a/NEWS 2008-06-08 22:19:48 +0000
+++ b/NEWS 2008-06-11 08:29:50 +0000
@@ -14,7 +14,8 @@
FEATURES:
 * New command ``index`` to create a search index for a branch. This
-  indexes the revisions (commit message only). (Robert Collins)
+  indexes the revisions (commit message only) and file texts.
+  (Robert Collins)
 * New command ``search`` to search a search index from within bzr.
   (Robert Collins)
=== modified file 'index.py'
--- a/index.py 2008-06-11 05:16:32 +0000
+++ b/index.py 2008-06-11 08:29:50 +0000
@@ -216,11 +216,14 @@
"""Helper for indexed_revisions."""
# TODO: split into groups of <reasonable memory size> for which we
# then:
+ _ensure_regexes()
+ builder = ComponentIndexBuilder()
# here: index texts
# here: index inventory/paths
# here: index revisions
- _ensure_regexes()
- builder = ComponentIndexBuilder()
+ terms = self._terms_for_texts(locked_branch.repository,
+ revisions_to_index)
+ self._add_terms(builder, terms)
terms = self._terms_for_revs(locked_branch.repository,
revisions_to_index)
self._add_terms(builder, terms)
@@ -357,11 +360,14 @@
             found_documents = [(term_index, doc_id) for doc_id in
                 common_doc_ids]
             for doc_key in self._document_ids_to_keys(found_documents):
-                if doc_key[0] == 'r':
+                if doc_key[0] == 'f':
+                    # file text
+                    yield FileTextHit(self._branch.repository, doc_key[1:3])
+                elif doc_key[0] == 'r':
                     # revision
                     yield RevisionHit(self._branch.repository, doc_key[2:3])
                 else:
-                    raise Exception("unknown doc type")
+                    raise Exception("unknown doc type %r" % (doc_key,))
     def _terms_for_revs(self, repository, revision_ids):
         """Generate the posting list for the revision texts of revision_ids.
@@ -370,7 +376,6 @@
         :return: An iterable of (term, posting_list) for the revision texts
             (not the inventories or user texts) of revision_ids.
         """
-        self._refresh_indices()
         terms = {}
         for revision in repository.get_revisions(revision_ids):
             # its a revision, second component is ignored, third is id.
@@ -392,6 +397,57 @@
                 posting_list.add(document_key)
         return terms.iteritems()
+    def _terms_for_texts(self, repository, revision_ids):
+        """Generate the posting list for the file texts of revision_ids.
+
+        :param revision_ids: An iterable of revision_ids.
+        :return: An iterable of (term, posting_list) for the file texts
+            (not the revisions or inventories) of revision_ids.
+ """
+ terms = {}
+ files = {}
+ for item in repository.item_keys_introduced_by(revision_ids):
+ if item[0] != 'file':
+ continue
+ # partitions the files by id, to avoid serious memory overload.
+ file_versions = files.setdefault(item[1], set())
+ file_versions.update((item[2]))
+ transaction = repository.get_transaction()
+ for file_id, file_versions in files.iteritems():
+ vf = repository.weave_store.get_weave(file_id, transaction)
+ for line, version in vf.iter_lines_added_or_present_in_versions(
+ file_versions):
+ document_key = ('f', file_id, version)
+ line_terms = _tokeniser_re.split(line)
+ for term in line_terms:
+ if not term:
+ continue
+ posting_list = terms.setdefault(term, set())
+ posting_list.add(document_key)
+ return terms.iteritems()
+
+
+class FileTextHit(object):
+ """A match found during a search in a file text."""
+
+ def __init__(self, repository, text_key):
+ """Create a FileTextHit.
+
+ :param repository: A repository to extract revisions from.
+ :param text_key: The text_key that was hit.
+ """
+ self.repository = repository
+ self.text_key = text_key
+
+ def document_name(self):
+ """The name of the document found, for human consumption."""
+ # Perhaps need to utf_decode this?
+ return "File id '%s', revision '%s'." % self.text_key
+
+ def summary(self):
+ """Get a summary of the hit, for display to users."""
+ return "No summaries yet."
+
class RevisionHit(object):
"""A match found during a search in a revision object."""
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-11 05:16:32 +0000
+++ b/tests/test_index.py 2008-06-11 08:29:50 +0000
@@ -86,7 +86,12 @@
"""Tests for indexing of a set of revisions."""
def test_empty_one_revision(self):
+ # Hugish smoke test - really want smaller units of testing...
tree = self.make_branch_and_tree('')
+ tree.add(['README.txt'], ['an-id'], ['file'])
+ tree.put_file_bytes_non_atomic('an-id',
+ "This is the first commit to this working tree.\n"
+ )
rev_index = index.init_index(tree.branch)
# The double-space is a cheap smoke test for the tokeniser.
revid = tree.commit('first post')
@@ -101,8 +106,16 @@
         # paths present
         # content of documents.
         expected_terms = {
-            'first':set([('r', '', revid)]),
-            'post':set([('r', '', revid)]),
+            'first': set([('r', '', revid), ('f', 'an-id', revid)]),
+            'post': set([('r', '', revid)]),
+            "This": set([('f', 'an-id', revid)]),
+            "is": set([('f', 'an-id', revid)]),
+            "the": set([('f', 'an-id', revid)]),
+            "commit": set([('f', 'an-id', revid)]),
+            "to": set([('f', 'an-id', revid)]),
+            "this": set([('f', 'an-id', revid)]),
+            "working": set([('f', 'an-id', revid)]),
+            "tree": set([('f', 'an-id', revid)]),
             }
         all_terms = {}
         for term, posting_list in rev_index.all_terms():
@@ -133,6 +146,22 @@
 class TestResults(TestCaseWithTransport):
+    def test_TextHit(self):
+        tree = self.make_branch_and_tree('tree')
+        tree.add(['README.txt'], ['an-id'], ['file'])
+        tree.put_file_bytes_non_atomic('an-id',
+            "This is the first commit to this working tree.\n"
+            )
+        rev_id1 = tree.commit('commit')
+        result = index.FileTextHit(tree.branch.repository, ('an-id', rev_id1))
+        tree.lock_read()
+        self.addCleanup(tree.unlock)
+        self.assertEqualDiff(
+            u"File id '%s', revision '%s'." % ('an-id', rev_id1),
+            result.document_name())
+        self.assertEqual(('an-id', rev_id1), result.text_key)
+        self.assertEqual('No summaries yet.', result.summary())
+
     def test_RevisionHit(self):
         tree = self.make_branch_and_tree('tree')
         rev_id1 = tree.commit('a multi\nline message')