Rev 42: Use mpdiffs to generate indices, which should increase precision. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sun Jun 22 06:45:18 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 42
revision-id: robertc at robertcollins.net-20080622054515-k4qwckj2gfgoip8j
parent: robertc at robertcollins.net-20080621130301-hkrps8jew9h1d0v3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sun 2008-06-22 15:45:15 +1000
message:
Use mpdiffs to generate indices, which should increase precision. (Robert Collins)
modified:
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-14 01:41:42 +0000
+++ b/DESIGN 2008-06-22 05:45:15 +0000
@@ -24,14 +24,14 @@
 some care to avoid either being pathologically slow, or blowing out memory on
 large trees.

-The current file text indexing solution uses iter_lines_added_or_present_in -
-which means either that more, or less, hits than are interesting (depending on
-what you consider interesting) are returned. Indexing bzr.dev stabilises at
-180MB for me. (Very) large trees have been observed to need 1GB of ram.
-If thrashing occurs, change the group size in inde.py down from 2500 revisions
-- the memory drop is roughly proportional to that figure. More index components
-causes incipient problems though, due to no component combining being done, so
-each component is searched sequentially.
+The current file text indexing solution uses make_mpdiffs. This should be
+efficient, as it has been tuned for bundle use, and it allows us to index
+only terms that are new in a version (though still at line granularity).
+
+Indexing bzr.dev stabilises at 180MB for me. (Very) large trees have been
+observed to need 1GB of RAM. If thrashing occurs, reduce the group size in
+index.py from 2500 revisions - the memory drop is roughly proportional to
+that figure.

 Indexing non-text
 =================
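To make the new DESIGN text concrete: a multi-parent diff represents a version
as NewText hunks (lines introduced in that version) and ParentText hunks (line
ranges copied from a parent), so only NewText content needs tokenising. A
minimal sketch, assuming the bzrlib.multiparent API as it stood in bzr.dev at
the time:

    from bzrlib.multiparent import MultiParent, NewText

    # One parent version, and a child that rewrites everything except "content\n".
    parent_lines = ["small\n", "content\n"]
    child_lines = ["other\n", "other\n", "content\n"]

    # Diff the child against its single parent.
    diff = MultiParent.from_lines(child_lines, [parent_lines])

    new_lines = []
    for hunk in diff.hunks:
        if isinstance(hunk, NewText):
            # Only lines introduced in the child appear here; "content\n"
            # is referenced from the parent via a ParentText hunk.
            new_lines.extend(hunk.lines)
    assert new_lines == ["other\n", "other\n"]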
=== modified file 'index.py'
--- a/index.py 2008-06-18 13:09:15 +0000
+++ b/index.py 2008-06-22 05:45:15 +0000
@@ -33,6 +33,7 @@
 from bzrlib.plugins.search import errors
 from bzrlib.plugins.search.inventory import paths_from_ids
 from bzrlib.plugins.search.transport import FileView
+from bzrlib.multiparent import NewText
 from bzrlib.revision import NULL_REVISION
 from bzrlib.tsort import topo_sort

@@ -643,15 +644,18 @@
     transaction = repository.get_transaction()
     for file_id, file_versions in files.iteritems():
         vf = repository.weave_store.get_weave(file_id, transaction)
-        for line, version in vf.iter_lines_added_or_present_in_versions(
+        for diff, version in zip(vf.make_mpdiffs(file_versions),
             file_versions):
             document_key = ('f', file_id, version)
-            line_terms = _tokeniser_re.split(line)
-            for term in line_terms:
-                if not term:
-                    continue
-                posting_list = terms.setdefault((term,), set())
-                posting_list.add(document_key)
+            for hunk in diff.hunks:
+                if type(hunk) == NewText:
+                    for line in hunk.lines:
+                        line_terms = _tokeniser_re.split(line)
+                        for term in line_terms:
+                            if not term:
+                                continue
+                            posting_list = terms.setdefault((term,), set())
+                            posting_list.add(document_key)
     return terms.items()
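Taken in isolation, the tokenising step in the new loop reduces to the
following self-contained sketch; the regex here is a stand-in assumption,
since the real _tokeniser_re is defined elsewhere in index.py and not shown
in this hunk:

    import re

    # Stand-in for index.py's _tokeniser_re (the real pattern may differ).
    _tokeniser_re = re.compile(r'\W+')

    def terms_from_new_lines(new_lines, document_key, terms):
        # Split each new line into terms and add document_key to each
        # term's posting list, skipping the empty strings re.split produces.
        for line in new_lines:
            for term in _tokeniser_re.split(line):
                if not term:
                    continue
                terms.setdefault((term,), set()).add(document_key)

    terms = {}
    terms_from_new_lines(["other\n", "other\n"], ('f', 'an-id', 'rev-3'), terms)
    assert terms == {('other',): set([('f', 'an-id', 'rev-3')])}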
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-18 13:09:15 +0000
+++ b/tests/test_index.py 2008-06-22 05:45:15 +0000
@@ -205,6 +205,44 @@
             all_terms[term] = set(posting_list)
         self.assertEqual(expected_terms, all_terms)

+    def test_knit_snapshots_not_indexed(self):
+        # Knit snapshots are a contributing factor to getting too many hits;
+        # only lines that are new in a version should be considered.
+        # Setup: knits do not expose where snapshots occur, so to test this
+        # we create three versions of a file which differ almost entirely
+        # between serial versions. This should trigger the heuristics on
+        # aggregate size, causing the third version to be stored as a
+        # snapshot; it should not be indexed with content matching the lines
+        # carried across from the first or second commits.
+        tree = self.make_branch_and_tree('')
+        tree.add(['README.txt'], ['an-id'], ['file'])
+        tree.put_file_bytes_non_atomic('an-id',
+            "small\ncontent\n")
+        rev_index = index.init_index(tree.branch)
+        tree.commit('')
+        tree.put_file_bytes_non_atomic('an-id',
+            "more\nmore\ncontent\nmore\nmore\nmore\n")
+        tree.commit('')
+        tree.put_file_bytes_non_atomic('an-id',
+            "other\nother\ncontent\nother\nother\nother\n")
+        revid3 = tree.commit('')
+        tree.lock_read()
+        self.assertEqual('fulltext',
+            tree.branch.repository.weave_store.get_weave(
+                'an-id', None)._index.get_method(revid3))
+        tree.unlock()
+        rev_index.index_revisions(tree.branch, [revid3])
+        self.assertEqual(set([(revid3,)]), set(rev_index.indexed_revisions()))
+        rev_index = index.open_index_url('')
+        expected_terms = {
+            ('an-id', revid3): set([('p', '', 'README.txt')]),
+            ('other',): set([('f', 'an-id', revid3)]),
+            }
+        all_terms = {}
+        for term, posting_list in rev_index.all_terms():
+            all_terms[term] = set(posting_list)
+        self.assertEqual(expected_terms, all_terms)
+

 class TestSearching(TestCaseWithTransport):
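The property the new test depends on can also be checked directly: even when
the knit stores revid3 as a 'fulltext' snapshot, make_mpdiffs() still computes
a logical diff against the parent version, so carried-over lines never reach
the index. A hedged sketch reusing the test's names (illustrative only,
assuming the VersionedFile.make_mpdiffs API of the time):

    from bzrlib.multiparent import NewText

    tree.lock_read()
    try:
        vf = tree.branch.repository.weave_store.get_weave('an-id', None)
        # One mpdiff per requested version; take the diff for revid3.
        diff = vf.make_mpdiffs([revid3])[0]
    finally:
        tree.unlock()

    new_lines = []
    for hunk in diff.hunks:
        if isinstance(hunk, NewText):
            new_lines.extend(hunk.lines)
    # "content\n" was carried across from earlier versions, so despite the
    # fulltext storage it does not appear as new text for revid3.
    assert "content\n" not in new_lines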