Rev 37: Add FileTextHit summaries, crude - only grabbing the first match, but better than nothing. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Sat Jun 14 10:45:11 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 37
revision-id: robertc at robertcollins.net-20080614094510-tow920w1dq381gz2
parent: robertc at robertcollins.net-20080614075117-x05en683cw32vzzf
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-14 19:45:10 +1000
message:
  Add FileTextHit summaries, crude - only grabbing the first match, but better than nothing.
modified:
  BUGS                           bugs-20080609101902-m23i5z2ojdgkeyof-1
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  commands.py                    commands.py-20080608052041-z5bahsl8kwl0uf4x-5
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'BUGS'
--- a/BUGS	2008-06-12 04:19:22 +0000
+++ b/BUGS	2008-06-14 09:45:10 +0000
@@ -3,10 +3,6 @@
 
 Some key caveats though (not bugs per se):
 
- - disk scaling: The current disk format creates a single per index component,
-   but does not combine components. Each component has 2500 revisions indexed
-   within it. This places a lower limit on the latency involved in a search due
-   to having to search (total revisions / 2500) separate indices.
  - memory scaling: Full text indexing currently requires a significant amount
    of memory.  To index the history of 'bzr' requires nearly 200MB of memory
    (revno 3494).  Larger trees are exceedingly likely to require as-much or

=== modified file 'NEWS'
--- a/NEWS	2008-06-13 11:48:12 +0000
+++ b/NEWS	2008-06-14 09:45:10 +0000
@@ -49,6 +49,11 @@
       console ui, exceptions, and the search index core respectively.
       (Robert Collins)
 
+    * New module ``inventory`` containing ``paths_from_ids``, a helper
+      for efficient extraction of paths from inventory xml files without
+      creating a full Inventory object. This is approximately 5 times
+      faster than creating the full object. (Robert Collins)
+
     * New module ``transport`` containing ``FileView`` to map a pack's contents
       as a transport object, allowing bzr indices to be stored in a pack.
       (Robert Collins)
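
The ``paths_from_ids`` helper described above avoids deserialising a full
Inventory just to resolve paths. A minimal sketch of the idea follows; the
function name, signature and XML handling are illustrative assumptions based
on bzr's inventory serialisation, not the plugin's actual code:

    # Sketch only: resolve paths for selected file ids straight from the
    # inventory XML, without constructing Inventory/InventoryEntry objects.
    from xml.etree import cElementTree

    def paths_from_ids_sketch(inventory_xml, file_ids):
        """Return a dict mapping each requested file id to its path."""
        entries = {}
        # Single pass: record only the parent pointer and basename of each
        # entry - much cheaper than building full entry objects.
        for element in cElementTree.fromstring(inventory_xml):
            entries[element.get('file_id')] = (
                element.get('parent_id'), element.get('name'))
        result = {}
        for file_id in file_ids:
            segments = []
            current = file_id
            # Walk parent pointers to the root, collecting basenames.
            while current in entries:
                parent_id, name = entries[current]
                if name:
                    segments.append(name)
                current = parent_id
            result[file_id] = '/'.join(reversed(segments))
        return result

Only the requested ids pay the path-joining cost, which is where the rough
5x speedup over full deserialisation would come from.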

=== modified file 'commands.py'
--- a/commands.py	2008-06-13 11:48:12 +0000
+++ b/commands.py	2008-06-14 09:45:10 +0000
@@ -56,9 +56,13 @@
         # XXX: Have a query translator etc.
         query = [(query_item,) for query_item in query_list]
         seen_count = 0
-        for result in index.search(query):
-            self.outf.write(result.document_name())
-            self.outf.write(" Summary: '%s'\n" % result.summary())
-            seen_count += 1
+        index._branch.lock_read()
+        try:
+            for result in index.search(query):
+                self.outf.write(result.document_name())
+                self.outf.write(" Summary: '%s'\n" % result.summary())
+                seen_count += 1
+        finally:
+            index._branch.unlock()
         if seen_count == 0:
             raise errors.NoMatch(query_list)
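
The read lock around the result loop matters because results are now lazy:
``FileTextHit.summary()`` (below) reads file texts out of the repository
while the caller iterates. bzrlib also ships a ``needs_read_lock`` decorator
for the same try/finally shape; a hypothetical sketch, assuming an index
wrapper that grew lock_read/unlock methods delegating to its branch:

    from bzrlib.decorators import needs_read_lock

    class LockedIndex(object):
        """Hypothetical wrapper illustrating the decorator idiom."""

        def __init__(self, index):
            self._index = index

        def lock_read(self):
            self._index._branch.lock_read()

        def unlock(self):
            self._index._branch.unlock()

        @needs_read_lock
        def search_all(self, query):
            # The decorator calls self.lock_read(), runs the method, and
            # unlocks in a finally block - the same shape as the explicit
            # try/finally in cmd_search above. Materialising the results
            # with list() keeps all repository reads inside the lock.
            return list(self._index.search(query))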

=== modified file 'index.py'
--- a/index.py	2008-06-14 06:37:30 +0000
+++ b/index.py	2008-06-14 09:45:10 +0000
@@ -28,6 +28,7 @@
 from bzrlib.errors import NotBranchError, NoSuchFile, UnknownFormatError
 from bzrlib.index import CombinedGraphIndex, GraphIndex, InMemoryGraphIndex
 from bzrlib.lockdir import LockDir
+from bzrlib.osutils import split_lines
 from bzrlib.pack import ContainerWriter
 from bzrlib.plugins.search import errors
 from bzrlib.plugins.search.inventory import paths_from_ids
@@ -427,6 +428,7 @@
         :return: An iterator of SearchResults for documents indexed by all
             terms in the termlist.
         """
+        _ensure_regexes()
         self._refresh_indices()
         found_documents = []
         # Use a set to remove duplicates
@@ -480,7 +482,7 @@
                 if doc_key[0] == 'f':
                     # file text
                     yield FileTextHit(self, self._branch.repository,
-                        doc_key[1:3])
+                        doc_key[1:3], termlist)
                 elif doc_key[0] == 'r':
                     # revision
                     yield RevisionHit(self._branch.repository, doc_key[2:3])
@@ -599,17 +601,20 @@
 class FileTextHit(object):
     """A match found during a search in a file text."""
 
-    def __init__(self, index, repository, text_key):
+    def __init__(self, index, repository, text_key, termlist):
         """Create a FileTextHit.
 
         :param index: The index the search result is from, to look up the path
             of the hit.
         :param repository: A repository to extract revisions from.
         :param text_key: The text_key that was hit.
+        :param termlist: The query that was issued, used for generating
+            summaries.
         """
         self.index = index
         self.repository = repository
         self.text_key = text_key
+        self.termlist = termlist
 
     def document_name(self):
         """The name of the document found, for human consumption."""
@@ -619,7 +624,21 @@
 
     def summary(self):
         """Get a summary of the hit, for display to users."""
-        return "No summaries yet."
+        lines = self.repository.iter_files_bytes([
+            (self.text_key[0], self.text_key[1], "")]).next()[1]
+        if not isinstance(lines, list):
+            # We got bytes back, not lines (which the contract supports).
+            lines = split_lines(lines)
+        # We could look for the best match, add context, report line numbers
+        # and so on, but that is complex - e.g. what if 'foo' matches on
+        # line 1 and 'bar' on line 54?
+        # NB: This does not handle phrases correctly yet - first make it work.
+        flattened_terms = set([' '.join(term) for term in self.termlist])
+        for line in lines:
+            line_terms = set(_tokeniser_re.split(line))
+            if len(line_terms.intersection(flattened_terms)) > 0:
+                return line[:-1]
+        raise ValueError("no match? wtf? %r" % lines)
 
 
 class PathHit(object):
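
The summary algorithm above is deliberately simple: tokenise each line with
the same regexp the indexer uses and return the first line that shares a
token with the query. A standalone sketch of that logic; the tokeniser
regexp is an assumption (the plugin builds its real ``_tokeniser_re`` in
``_ensure_regexes``):

    import re

    # Assumed stand-in for the plugin's tokeniser: split on runs of
    # non-word characters.
    _tokeniser_re = re.compile(r'\W+')

    def first_match_summary(lines, termlist):
        """Return the first line containing any query term, sans newline.

        :param lines: The text as a list of newline-terminated strings.
        :param termlist: Term tuples as passed to search(), e.g.
            [('first',), ('commit',)].
        """
        # Phrases are flattened to single space-joined strings, so a
        # multi-word phrase only matches if a whole line tokenises to
        # exactly that string - the limitation noted in the diff above.
        flattened_terms = set(' '.join(term) for term in termlist)
        for line in lines:
            if set(_tokeniser_re.split(line)) & flattened_terms:
                return line.rstrip('\n')
        raise ValueError("no query term found in %r" % (lines,))

For example, given lines ['This is the \n', 'first commit \n'] and the query
[('commit',)], this returns 'first commit ' (trailing space preserved, only
the newline stripped), matching the expectation in the test below.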

=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-14 01:41:42 +0000
+++ b/tests/test_index.py	2008-06-14 09:45:10 +0000
@@ -234,19 +234,20 @@
         search_index = index.init_index(tree.branch)
         tree.add(['README.txt'], ['an-id'], ['file'])
         tree.put_file_bytes_non_atomic('an-id',
-            "This is the first commit to this working tree.\n"
+            "This is the \nfirst commit \nto this working tree.\n"
             )
         rev_id1 = tree.commit('commit')
         search_index.index_branch(tree.branch, rev_id1)
+        query = [('commit',)]
         result = index.FileTextHit(search_index, tree.branch.repository,
-            ('an-id', rev_id1))
+            ('an-id', rev_id1), query)
         tree.lock_read()
         self.addCleanup(tree.unlock)
         self.assertEqualDiff(
             u"README.txt in revision '%s'." % (rev_id1),
             result.document_name())
         self.assertEqual(('an-id', rev_id1), result.text_key)
-        self.assertEqual('No summaries yet.', result.summary())
+        self.assertEqual('first commit ', result.summary())
 
     def test_RevisionHit(self):
         tree = self.make_branch_and_tree('tree')



