Rev 21: Index file content (although in a fairly crude manner). in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Wed Jun 11 09:29:55 BST 2008

At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 21
revision-id: robertc at robertcollins.net-20080611082950-3abaodt5wpm4c5ac
parent: robertc at robertcollins.net-20080611051632-qqi4nd9g1500702j
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-06-11 18:29:50 +1000
message:
  Index file content (although in a fairly crude manner).
modified:
  BUGS                           bugs-20080609101902-m23i5z2ojdgkeyof-1
  DESIGN                         design-20080608072426-vjoj110dtykfyb7g-1
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'BUGS'

--- a/BUGS	2008-06-11 05:16:32 +0000
+++ b/BUGS	2008-06-11 08:29:50 +0000
@@ -3,6 +3,9 @@
 
 Some key caveats though (not bugs per se):
 
- - scaling: The current disk format creates thousands of small files on disk.
-   This may impede performance on some file systems. When these are put into
-   a single pack this will be rectified.
+ - disk scaling: The current disk format creates many thousands of small files
+   on disk.  This may impede performance on some file systems. When these are
+   put into a single pack this will be rectified.
+ - memory scaling: Full text indexing currently requires very large amounts of
+   memory. To index the history of 'bzr' requires nearly 600MB of memory (revno
+   3494). Larger trees are exceedingly likely to require as-much or more.

=== modified file 'DESIGN'
--- a/DESIGN	2008-06-11 05:16:32 +0000
+++ b/DESIGN	2008-06-11 08:29:50 +0000
@@ -6,6 +6,10 @@
 I plan to use a single namespace for all documents: revisions, inventories and
 file texts, branch tags etc.
 
+Currently this is:
+('r', '', revision_id)
+('f', file_id, revision_id)
+
 What to index
 +++++++++++++
 
@@ -20,6 +24,11 @@
 some care to avoid either being pathologically slow, or blowing out memory on
 large trees.
 
+The current file text indexing solution uses iter_lines_added_or_present_in -
+which means either that more, or less, hits than are interesting (depending on
+what you consider interesting) are returned. Indexing bzr.dev spikes at nearly
+600MB. Indexing very large trees will likely thrash.
+
 Indexing non-text
 =================
 

=== modified file 'NEWS'
--- a/NEWS	2008-06-08 22:19:48 +0000
+++ b/NEWS	2008-06-11 08:29:50 +0000
@@ -14,7 +14,8 @@
   FEATURES:
 
     * New command ``index`` to create a search index for a branch. This
-      indexes the revisions (commit message only). (Robert Collins)
+      indexes the revisions (commit message only) and file texts.
+      (Robert Collins)
 
     * New command ``search`` to search a search index from within bzr.
       (Robert Collins)

=== modified file 'index.py'
--- a/index.py	2008-06-11 05:16:32 +0000
+++ b/index.py	2008-06-11 08:29:50 +0000
@@ -216,11 +216,14 @@
         """Helper for indexed_revisions."""
         # TODO: split into groups of <reasonable memory size> for which we
         # then:
+        _ensure_regexes()
+        builder = ComponentIndexBuilder()
         # here: index texts
         # here: index inventory/paths
         # here: index revisions
-        _ensure_regexes()
-        builder = ComponentIndexBuilder()
+        terms = self._terms_for_texts(locked_branch.repository,
+            revisions_to_index)
+        self._add_terms(builder, terms)
         terms = self._terms_for_revs(locked_branch.repository,
             revisions_to_index)
         self._add_terms(builder, terms)
@@ -357,11 +360,14 @@
             found_documents = [(term_index, doc_id) for doc_id in
                 common_doc_ids]
             for doc_key in self._document_ids_to_keys(found_documents):
-                if doc_key[0] == 'r':
+                if doc_key[0] == 'f':
+                    # file text
+                    yield FileTextHit(self._branch.repository, doc_key[1:3])
+                elif doc_key[0] == 'r':
                     # revision
                     yield RevisionHit(self._branch.repository, doc_key[2:3])
                 else:
-                    raise Exception("unknown doc type")
+                    raise Exception("unknown doc type %r" % (doc_key,))
 
     def _terms_for_revs(self, repository, revision_ids):
         """Generate the posting list for the revision texts of revision_ids.
@@ -370,7 +376,6 @@
         :return: An iterable of (term, posting_list) for the revision texts
             (not the inventories or user texts) of revision_ids.
         """
-        self._refresh_indices()
         terms = {}
         for revision in repository.get_revisions(revision_ids):
             # its a revision, second component is ignored, third is id.
@@ -392,6 +397,57 @@
                 posting_list.add(document_key)
         return terms.iteritems()
 
+    def _terms_for_texts(self, repository, revision_ids):
+        """Generate the posting list for the file texts of revision_ids.
+
+        :param revision_ids: An iterable of revision_ids.
+        :return: An iterable of (term, posting_list) for the revision texts
+            (not the inventories or user texts) of revision_ids.
+        """
+        terms = {}
+        files = {}
+        for item in repository.item_keys_introduced_by(revision_ids):
+            if item[0] != 'file':
+                continue
+            # partitions the files by id, to avoid serious memory overload.
+            file_versions = files.setdefault(item[1], set())
+            file_versions.update((item[2]))
+        transaction = repository.get_transaction()
+        for file_id, file_versions in files.iteritems():
+            vf = repository.weave_store.get_weave(file_id, transaction)
+            for line, version in vf.iter_lines_added_or_present_in_versions(
+                file_versions):
+                document_key = ('f', file_id, version)
+                line_terms = _tokeniser_re.split(line)
+                for term in line_terms:
+                    if not term:
+                        continue
+                    posting_list = terms.setdefault(term, set())
+                    posting_list.add(document_key)
+        return terms.iteritems()
+
+
+class FileTextHit(object):
+    """A match found during a search in a file text."""
+
+    def __init__(self, repository, text_key):
+        """Create a FileTextHit.
+
+        :param repository: A repository to extract revisions from.
+        :param text_key: The text_key that was hit.
+        """
+        self.repository = repository
+        self.text_key = text_key
+
+    def document_name(self):
+        """The name of the document found, for human consumption."""
+        # Perhaps need to utf_decode this?
+        return "File id '%s', revision '%s'." % self.text_key
+
+    def summary(self):
+        """Get a summary of the hit, for display to users."""
+        return "No summaries yet."
+
 
 class RevisionHit(object):
     """A match found during a search in a revision object."""

=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-11 05:16:32 +0000
+++ b/tests/test_index.py	2008-06-11 08:29:50 +0000
@@ -86,7 +86,12 @@
     """Tests for indexing of a set of revisions."""
 
     def test_empty_one_revision(self):
+        # Hugish smoke test - really want smaller units of testing...
         tree = self.make_branch_and_tree('')
+        tree.add(['README.txt'], ['an-id'], ['file'])
+        tree.put_file_bytes_non_atomic('an-id',
+            "This is the first commit to this working tree.\n"
+            )
         rev_index = index.init_index(tree.branch)
         # The double-space is a cheap smoke test for the tokeniser.
         revid = tree.commit('first  post')
@@ -101,8 +106,16 @@
         # paths present
         # content of documents.
         expected_terms = {
-            'first':set([('r', '', revid)]),
-            'post':set([('r', '', revid)]),
+            'first': set([('r', '', revid), ('f', 'an-id', revid)]),
+            'post': set([('r', '', revid)]),
+            "This": set([('f', 'an-id', revid)]),
+            "is": set([('f', 'an-id', revid)]),
+            "the": set([('f', 'an-id', revid)]),
+            "commit": set([('f', 'an-id', revid)]),
+            "to": set([('f', 'an-id', revid)]),
+            "this": set([('f', 'an-id', revid)]),
+            "working": set([('f', 'an-id', revid)]),
+            "tree": set([('f', 'an-id', revid)]),
             }
         all_terms = {}
         for term, posting_list in rev_index.all_terms():
@@ -133,6 +146,22 @@
 
 class TestResults(TestCaseWithTransport):
 
+    def test_TextHit(self):
+        tree = self.make_branch_and_tree('tree')
+        tree.add(['README.txt'], ['an-id'], ['file'])
+        tree.put_file_bytes_non_atomic('an-id',
+            "This is the first commit to this working tree.\n"
+            )
+        rev_id1 = tree.commit('commit')
+        result = index.FileTextHit(tree.branch.repository, ('an-id', rev_id1))
+        tree.lock_read()
+        self.addCleanup(tree.unlock)
+        self.assertEqualDiff(
+            u"File id '%s', revision '%s'." % ('an-id', rev_id1),
+            result.document_name())
+        self.assertEqual(('an-id', rev_id1), result.text_key)
+        self.assertEqual('No summaries yet.', result.summary())
+
     def test_RevisionHit(self):
         tree = self.make_branch_and_tree('tree')
         rev_id1 = tree.commit('a multi\nline message')