Rev 37: Add FileTextHit summaries (crude - only grabbing the first match, but better than nothing) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sat Jun 14 10:45:11 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 37
revision-id: robertc at robertcollins.net-20080614094510-tow920w1dq381gz2
parent: robertc at robertcollins.net-20080614075117-x05en683cw32vzzf
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-14 19:45:10 +1000
message:
Add FileTextHit summaries, crude - only grabbing the first match, but better than nothing.
modified:
BUGS bugs-20080609101902-m23i5z2ojdgkeyof-1
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
commands.py commands.py-20080608052041-z5bahsl8kwl0uf4x-5
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'BUGS'
--- a/BUGS 2008-06-12 04:19:22 +0000
+++ b/BUGS 2008-06-14 09:45:10 +0000
@@ -3,10 +3,6 @@
Some key caveats though (not bugs per se):
- - disk scaling: The current disk format creates a single per index component,
- but does not combine components. Each component has 2500 revisions indexed
- within it. This places a lower limit on the latency involved in a search due
- to having to search (total revisions / 2500) separate indices.
- memory scaling: Full text indexing currently requires a significant amount
of memory. To index the history of 'bzr' requires nearly 200MB of memory
(revno 3494). Larger trees are exceedingly likely to require as-much or
=== modified file 'NEWS'
--- a/NEWS 2008-06-13 11:48:12 +0000
+++ b/NEWS 2008-06-14 09:45:10 +0000
@@ -49,6 +49,11 @@
console ui, exceptions, and the search index core respectively.
(Robert Collins)
+ * New module ``inventory`` containing ``paths_from_ids``, a helper
+ for efficient extraction of paths from inventory xml files without
+ creating a full Inventory object. This is approximately 5 times
+ faster than creating the full object. (Robert Collins)
+
* New module ``transport`` containing ``FileView`` to map a packs contents
as a transport object, allowing bzr indices to be stored in a pack.
(Robert Collins)
=== modified file 'commands.py'
--- a/commands.py 2008-06-13 11:48:12 +0000
+++ b/commands.py 2008-06-14 09:45:10 +0000
@@ -56,9 +56,13 @@
# XXX: Have a query translator etc.
query = [(query_item,) for query_item in query_list]
seen_count = 0
- for result in index.search(query):
- self.outf.write(result.document_name())
- self.outf.write(" Summary: '%s'\n" % result.summary())
- seen_count += 1
+ index._branch.lock_read()
+ try:
+ for result in index.search(query):
+ self.outf.write(result.document_name())
+ self.outf.write(" Summary: '%s'\n" % result.summary())
+ seen_count += 1
+ finally:
+ index._branch.unlock()
if seen_count == 0:
raise errors.NoMatch(query_list)
=== modified file 'index.py'
--- a/index.py 2008-06-14 06:37:30 +0000
+++ b/index.py 2008-06-14 09:45:10 +0000
@@ -28,6 +28,7 @@
from bzrlib.errors import NotBranchError, NoSuchFile, UnknownFormatError
from bzrlib.index import CombinedGraphIndex, GraphIndex, InMemoryGraphIndex
from bzrlib.lockdir import LockDir
+from bzrlib.osutils import split_lines
from bzrlib.pack import ContainerWriter
from bzrlib.plugins.search import errors
from bzrlib.plugins.search.inventory import paths_from_ids
@@ -427,6 +428,7 @@
:return: An iterator of SearchResults for documents indexed by all
terms in the termlist.
"""
+ _ensure_regexes()
self._refresh_indices()
found_documents = []
# Use a set to remove duplicates
@@ -480,7 +482,7 @@
if doc_key[0] == 'f':
# file text
yield FileTextHit(self, self._branch.repository,
- doc_key[1:3])
+ doc_key[1:3], termlist)
elif doc_key[0] == 'r':
# revision
yield RevisionHit(self._branch.repository, doc_key[2:3])
@@ -599,17 +601,20 @@
class FileTextHit(object):
"""A match found during a search in a file text."""
- def __init__(self, index, repository, text_key):
+ def __init__(self, index, repository, text_key, termlist):
"""Create a FileTextHit.
:param index: The index the search result is from, to look up the path
of the hit. NB
:param repository: A repository to extract revisions from.
:param text_key: The text_key that was hit.
+ :param termlist: The query that was issued, used for generating
+ summaries.
"""
self.index = index
self.repository = repository
self.text_key = text_key
+ self.termlist = termlist
def document_name(self):
"""The name of the document found, for human consumption."""
@@ -619,7 +624,21 @@
def summary(self):
"""Get a summary of the hit, for display to users."""
- return "No summaries yet."
+ lines = self.repository.iter_files_bytes([
+ (self.text_key[0], self.text_key[1], "")]).next()[1]
+ if not isinstance(lines, list):
+ # We got bytes back, not lines (which the contract supports).
+ lines = split_lines(lines)
+ # We could look for the best match, try to get context, line numbers
+ # etc. This is complex - what if 'foo' is on line 1 and 'bar' on line
+ # 54.
+ # NB: This does not handle phrases correctly - but - make it work.
+ flattened_terms = set([' '.join(term) for term in self.termlist])
+ for line in lines:
+ line_terms = set(_tokeniser_re.split(line))
+ if len(line_terms.intersection(flattened_terms)) > 0:
+ return line[:-1]
+ raise ValueError("no match? wtf? %r" % lines)
class PathHit(object):
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-14 01:41:42 +0000
+++ b/tests/test_index.py 2008-06-14 09:45:10 +0000
@@ -234,19 +234,20 @@
search_index = index.init_index(tree.branch)
tree.add(['README.txt'], ['an-id'], ['file'])
tree.put_file_bytes_non_atomic('an-id',
- "This is the first commit to this working tree.\n"
+ "This is the \nfirst commit \nto this working tree.\n"
)
rev_id1 = tree.commit('commit')
search_index.index_branch(tree.branch, rev_id1)
+ query = [('commit',)]
result = index.FileTextHit(search_index, tree.branch.repository,
- ('an-id', rev_id1))
+ ('an-id', rev_id1), query)
tree.lock_read()
self.addCleanup(tree.unlock)
self.assertEqualDiff(
u"README.txt in revision '%s'." % (rev_id1),
result.document_name())
self.assertEqual(('an-id', rev_id1), result.text_key)
- self.assertEqual('No summaries yet.', result.summary())
+ self.assertEqual('first commit ', result.summary())
def test_RevisionHit(self):
tree = self.make_branch_and_tree('tree')
More information about the bazaar-commits mailing list