Rev 42: Use mpdiffs to generate indices, which should increase precision. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sun Jun 22 06:45:18 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 42
revision-id: robertc at robertcollins.net-20080622054515-k4qwckj2gfgoip8j
parent: robertc at robertcollins.net-20080621130301-hkrps8jew9h1d0v3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sun 2008-06-22 15:45:15 +1000
message:
Use mpdiffs to generate indices, which should increase precision. (Robert Collins)
modified:
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-14 01:41:42 +0000
+++ b/DESIGN 2008-06-22 05:45:15 +0000
@@ -24,14 +24,14 @@
 some care to avoid either being pathologically slow, or blowing out memory on
 large trees.

-The current file text indexing solution uses iter_lines_added_or_present_in -
-which means either that more, or less, hits than are interesting (depending on
-what you consider interesting) are returned. Indexing bzr.dev stabilises at
-180MB for me. (Very) large trees have been observed to need 1GB of ram.
-If thrashing occurs, change the group size in inde.py down from 2500 revisions
-- the memory drop is roughly proportional to that figure. More index components
-causes incipient problems though, due to no component combining being done, so
-each component is searched sequentially.
+The current file text indexing solution uses make_mpdiffs. This should be
+efficient, as it has been tuned for bundle use, and it allows us to index
+only terms that are new in a version (though still at line granularity).
+
+Indexing bzr.dev stabilises at 180MB for me. (Very) large trees have been
+observed to need 1GB of RAM. If thrashing occurs, reduce the group size in
+index.py from 2500 revisions - the memory drop is roughly proportional to
+that figure.

 Indexing non-text
 =================
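To make the new DESIGN text concrete: a multi-parent diff represents a version
as NewText hunks (lines introduced in that version) and ParentText hunks (line
ranges copied from a parent), so only NewText content needs tokenising. A
minimal sketch, assuming the bzrlib.multiparent API as it stood in bzr.dev at
the time:

    from bzrlib.multiparent import MultiParent, NewText

    # One parent version, and a child that rewrites everything except "content\n".
    parent_lines = ["small\n", "content\n"]
    child_lines = ["other\n", "other\n", "content\n"]

    # Diff the child against its single parent.
    diff = MultiParent.from_lines(child_lines, [parent_lines])

    new_lines = []
    for hunk in diff.hunks:
        if isinstance(hunk, NewText):
            # Only lines introduced in the child appear here; "content\n"
            # is referenced from the parent via a ParentText hunk.
            new_lines.extend(hunk.lines)
    assert new_lines == ["other\n", "other\n"]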
=== modified file 'index.py'
--- a/index.py 2008-06-18 13:09:15 +0000
+++ b/index.py 2008-06-22 05:45:15 +0000
@@ -33,6 +33,7 @@
 from bzrlib.plugins.search import errors
 from bzrlib.plugins.search.inventory import paths_from_ids
 from bzrlib.plugins.search.transport import FileView
+from bzrlib.multiparent import NewText
 from bzrlib.revision import NULL_REVISION
 from bzrlib.tsort import topo_sort

@@ -643,15 +644,18 @@
     transaction = repository.get_transaction()
     for file_id, file_versions in files.iteritems():
         vf = repository.weave_store.get_weave(file_id, transaction)
-        for line, version in vf.iter_lines_added_or_present_in_versions(
+        for diff, version in zip(vf.make_mpdiffs(file_versions),
             file_versions):
             document_key = ('f', file_id, version)
-            line_terms = _tokeniser_re.split(line)
-            for term in line_terms:
-                if not term:
-                    continue
-                posting_list = terms.setdefault((term,), set())
-                posting_list.add(document_key)
+            for hunk in diff.hunks:
+                if type(hunk) == NewText:
+                    for line in hunk.lines:
+                        line_terms = _tokeniser_re.split(line)
+                        for term in line_terms:
+                            if not term:
+                                continue
+                            posting_list = terms.setdefault((term,), set())
+                            posting_list.add(document_key)
     return terms.items()
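Taken in isolation, the tokenising step in the new loop reduces to the
following self-contained sketch; the regex here is a stand-in assumption,
since the real _tokeniser_re is defined elsewhere in index.py and not shown
in this hunk:

    import re

    # Stand-in for index.py's _tokeniser_re (the real pattern may differ).
    _tokeniser_re = re.compile(r'\W+')

    def terms_from_new_lines(new_lines, document_key, terms):
        # Split each new line into terms and add document_key to each
        # term's posting list, skipping the empty strings re.split produces.
        for line in new_lines:
            for term in _tokeniser_re.split(line):
                if not term:
                    continue
                terms.setdefault((term,), set()).add(document_key)

    terms = {}
    terms_from_new_lines(["other\n", "other\n"], ('f', 'an-id', 'rev-3'), terms)
    assert terms == {('other',): set([('f', 'an-id', 'rev-3')])}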
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-18 13:09:15 +0000
+++ b/tests/test_index.py 2008-06-22 05:45:15 +0000
@@ -205,6 +205,44 @@
             all_terms[term] = set(posting_list)
         self.assertEqual(expected_terms, all_terms)

+    def test_knit_snapshots_not_indexed(self):
+        # Knit snapshots are a contributing factor to getting too many hits;
+        # only lines that are new in a version should be considered.
+        # Setup: knits do not expose where snapshots occur, so to test this
+        # we create three versions of a file which differ almost entirely
+        # between serial versions. This should trigger the heuristics on
+        # aggregate size, causing the third version to be stored as a
+        # snapshot; it should not be indexed with content matching the lines
+        # carried across from the first or second commits.
+        tree = self.make_branch_and_tree('')
+        tree.add(['README.txt'], ['an-id'], ['file'])
+        tree.put_file_bytes_non_atomic('an-id',
+            "small\ncontent\n")
+        rev_index = index.init_index(tree.branch)
+        tree.commit('')
+        tree.put_file_bytes_non_atomic('an-id',
+            "more\nmore\ncontent\nmore\nmore\nmore\n")
+        tree.commit('')
+        tree.put_file_bytes_non_atomic('an-id',
+            "other\nother\ncontent\nother\nother\nother\n")
+        revid3 = tree.commit('')
+        tree.lock_read()
+        self.assertEqual('fulltext',
+            tree.branch.repository.weave_store.get_weave(
+                'an-id', None)._index.get_method(revid3))
+        tree.unlock()
+        rev_index.index_revisions(tree.branch, [revid3])
+        self.assertEqual(set([(revid3,)]), set(rev_index.indexed_revisions()))
+        rev_index = index.open_index_url('')
+        expected_terms = {
+            ('an-id', revid3): set([('p', '', 'README.txt')]),
+            ('other',): set([('f', 'an-id', revid3)]),
+            }
+        all_terms = {}
+        for term, posting_list in rev_index.all_terms():
+            all_terms[term] = set(posting_list)
+        self.assertEqual(expected_terms, all_terms)
+

 class TestSearching(TestCaseWithTransport):
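The property the new test depends on can also be checked directly: even when
the knit stores revid3 as a 'fulltext' snapshot, make_mpdiffs() still computes
a logical diff against the parent version, so carried-over lines never reach
the index. A hedged sketch reusing the test's names (illustrative only,
assuming the VersionedFile.make_mpdiffs API of the time):

    from bzrlib.multiparent import NewText

    tree.lock_read()
    try:
        vf = tree.branch.repository.weave_store.get_weave('an-id', None)
        # One mpdiff per requested version; take the diff for revid3.
        diff = vf.make_mpdiffs([revid3])[0]
    finally:
        tree.unlock()

    new_lines = []
    for hunk in diff.hunks:
        if isinstance(hunk, NewText):
            new_lines.extend(hunk.lines)
    # "content\n" was carried across from earlier versions, so despite the
    # fulltext storage it does not appear as new text for revid3.
    assert "content\n" not in new_lines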