Rev 24: Change the disk format to create a single pack with the posting lists for a component. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Thu Jun 12 02:30:04 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 24
revision-id: robertc at robertcollins.net-20080612013002-kl2rmh8imu8bknum
parent: robertc at robertcollins.net-20080611134613-r91btjp60gizwlpt
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Thu 2008-06-12 11:30:02 +1000
message:
  Change the disk format to create a single pack with the posting lists for a component.
added:
  tests/test_transport.py        test_transport.py-20080612012941-c32db8zkpo1mhde8-3
  transport.py                   transport.py-20080612012941-c32db8zkpo1mhde8-2
modified:
  BUGS                           bugs-20080609101902-m23i5z2ojdgkeyof-1
  DESIGN                         design-20080608072426-vjoj110dtykfyb7g-1
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/__init__.py              __init__.py-20080608052041-z5bahsl8kwl0uf4x-8
=== modified file 'BUGS'
--- a/BUGS	2008-06-11 08:29:50 +0000
+++ b/BUGS	2008-06-12 01:30:02 +0000
@@ -3,9 +3,10 @@
 
 Some key caveats though (not bugs per se):
 
- - disk scaling: The current disk format creates many thousands of small files
-   on disk.  This may impede performance on some file systems. When these are
-   put into a single pack this will be rectified.
- - memory scaling: Full text indexing currently requires very large amounts of
-   memory. To index the history of 'bzr' requires nearly 600MB of memory (revno
-   3494). Larger trees are exceedingly likely to require as-much or more.
+ - disk scaling: The current disk format creates a number of files per index
+   component, but does not combine components. Each component has 2500
+   revisions indexed within it. This places a lower limit on the latency
+   involved in a search.
+ - memory scaling: Full text indexing currently requires very significant memory.
+   To index the history of 'bzr' requires nearly 200MB of memory (revno 3494).
+   Larger trees are exceedingly likely to require as much or more.

=== modified file 'DESIGN'
--- a/DESIGN	2008-06-11 08:29:50 +0000
+++ b/DESIGN	2008-06-12 01:30:02 +0000
@@ -76,8 +76,8 @@
 
 A names file listing the indices.
 
-Second cut - TODO
-=================
+Second cut - in progress
+========================
 
 Each name in the names file is a pack, its length, and two read vectors for the
 terms and revisions lists. The pack contains a series of posting lists trailed

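To make the second-cut layout concrete: each node in a component's term index now records the byte range of that term's posting list within the component's single .pack file, and search turns that range into a FileView file map (see index.py below). A minimal sketch of the decode step, assuming the whitespace-separated value format used there; the sample value and component name are purely illustrative:

def decode_term_value(value):
    # value is "<term_id> <posting_count> <posting_start> <posting_length>"
    term_id, posting_count, posting_start, posting_length = value.split(" ")
    posting_start = int(posting_start)
    posting_stop = posting_start + int(posting_length)
    return term_id, int(posting_count), posting_start, posting_stop

# Illustrative: term id 7, posting count 3, 214 bytes starting at byte 128.
term_id, count, start, stop = decode_term_value("7 3 128 214")
file_map = {"component-name.7": (start, stop)}
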
=== modified file 'index.py'
--- a/index.py	2008-06-11 13:46:13 +0000
+++ b/index.py	2008-06-12 01:30:02 +0000
@@ -25,7 +25,9 @@
 from bzrlib.errors import NotBranchError, NoSuchFile, UnknownFormatError
 from bzrlib.index import CombinedGraphIndex, GraphIndex, InMemoryGraphIndex
 from bzrlib.lockdir import LockDir
+from bzrlib.pack import ContainerWriter
 from bzrlib.plugins.search import errors
+from bzrlib.plugins.search.transport import FileView
 from bzrlib.revision import NULL_REVISION
 from bzrlib.tsort import topo_sort
 
@@ -172,12 +174,17 @@
             for node in index.iter_all_entries():
                 # XXX: Duplicated logic with search().
                 term = node[1][0]
-                term_id, posting_count, posting_length = node[2].split(" ")
+                term_id, posting_count, posting_start, posting_length = \
+                    node[2].split(" ")
                 posting_count = int(posting_count)
+                posting_start = int(posting_start)
                 posting_length = int(posting_length)
+                posting_stop = posting_start + posting_length
                 post_name = component.name + "." + term_id
-                post_index = GraphIndex(self._indices_transport, post_name,
-                    posting_length)
+                filemap = {post_name:(posting_start, posting_stop)}
+                view = FileView(self._indices_transport,
+                    component.name + '.pack', filemap)
+                post_index = GraphIndex(view, post_name, posting_length)
                 doc_ids = set([node[1][0] for node in
                     post_index.iter_all_entries()])
                 doc_ids = [(index, doc_id) for doc_id in doc_ids]
@@ -342,26 +349,33 @@
             # TODO: use a dequeue?
             term_info = []
             for node in term_index.iter_entries(term_keys):
-                term_id, posting_count, posting_length = node[2].split(" ")
+                term_id, posting_count, posting_start, posting_length = \
+                    node[2].split(" ")
                 term_info.append((int(posting_count), term_id,
-                    int(posting_length)))
+                    int(posting_start), int(posting_length)))
             if len(term_info) != len(term_keys):
                 # One or more terms missing - no hits are possible.
                 continue
             # load the first document list: 
-            _, term_id, posting_length = term_info.pop(0)
+            _, term_id, posting_start, posting_length = term_info.pop(0)
+            posting_stop = posting_start + posting_length
             post_name = component.name + "." + term_id
-            post_index = GraphIndex(self._indices_transport, post_name,
-                posting_length)
+            filemap = {post_name:(posting_start, posting_stop)}
+            view = FileView(self._indices_transport, component.name + '.pack',
+                filemap)
+            post_index = GraphIndex(view, post_name, posting_length)
             common_doc_keys = set([node[1] for node in
                 post_index.iter_all_entries()])
             # Now we whittle down the nodes we need - still going in sorted
             # order. (possibly doing concurrent reduction would be better).
             while common_doc_keys and term_info:
-                _, term_id, posting_length = term_info.pop(0)
+                _, term_id, posting_start, posting_length = term_info.pop(0)
+                posting_stop = posting_start + posting_length
                 post_name = component.name + "." + term_id
-                post_index = GraphIndex(self._indices_transport, post_name,
-                    posting_length)
+                filemap = {post_name:(posting_start, posting_stop)}
+                view = FileView(self._indices_transport,
+                    component.name + '.pack', filemap)
+                post_index = GraphIndex(view, post_name, posting_length)
                 common_doc_keys = set([node[1] for node in
                     post_index.iter_entries(common_doc_keys)])
             common_doc_ids = [key[0] for key in common_doc_keys]
@@ -601,6 +615,10 @@
         # lists.
         elements = []
         term_index = InMemoryGraphIndex(0, 1)
+        write_stream = upload_transport.open_write_stream(index_name + ".pack")
+        elements.append(index_name + ".pack")
+        writer = ContainerWriter(write_stream.write)
+        writer.begin()
         for term, term_id in self.terms.iteritems():
             posting_list = self.posting_lists[term_id]
             post_index = InMemoryGraphIndex(0, 1)
@@ -608,14 +626,20 @@
                 post_index.add_node((doc_id,), "", ())
             posting_bytes = post_index.finish().read()
             posting_name = index_name + "." + term_id
-            upload_transport.put_bytes_non_atomic(posting_name, posting_bytes)
-            elements.append(posting_name)
-            # How many document ids, and how long the file is (for use when we
-            # put these in packs).
-            term_value = "%s %d %d" % (term_id, len(str(posting_list)),
-                len(posting_bytes))
+            pos, size = writer.add_bytes_record(posting_bytes, [(posting_name,)])
+            length = len(posting_bytes)
+            offset = size - length
+            start = pos + offset
+            ### upload_transport.put_bytes_non_atomic(posting_name, posting_bytes)
+            ### elements.append(posting_name)
+            # How many document ids, and the range for the file view when we
+            # read the pack later.
+            term_value = "%s %d %d %d" % (term_id, len(str(posting_list)),
+                start, length)
+            del posting_bytes
             term_index.add_node((term,), term_value, ())
-            del posting_bytes
+        writer.end()
+        write_stream.close()
         index_bytes = term_index.finish().read()
         term_length = len(index_bytes)
         upload_transport.put_bytes_non_atomic(

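As the start/offset arithmetic above implies, add_bytes_record returns the offset of the whole container record and its total size including the record header, so the start of the posting bytes themselves is the record offset plus the header overhead (total size minus payload length). A standalone sketch of that arithmetic with illustrative numbers (no real ContainerWriter involved):

def body_range(record_offset, record_size, payload_length):
    # record_size covers the pack record header plus the payload; the
    # payload is the trailing payload_length bytes of the record.
    header_length = record_size - payload_length
    start = record_offset + header_length
    return start, payload_length

# e.g. a record written at offset 10, 58 bytes long in total and carrying a
# 42-byte posting list, has its payload at bytes [26, 68) of the pack.
start, length = body_range(10, 58, 42)
assert (start, length) == (26, 42)
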
=== modified file 'tests/__init__.py'
--- a/tests/__init__.py	2008-06-08 05:57:24 +0000
+++ b/tests/__init__.py	2008-06-12 01:30:02 +0000
@@ -24,6 +24,7 @@
         'blackbox',
         'errors',
         'index',
+        'transport',
         ]
     standard_tests.addTests(loader.loadTestsFromModuleNames(
         ['bzrlib.plugins.search.tests.test_' + name for 

=== added file 'tests/test_transport.py'
--- a/tests/test_transport.py	1970-01-01 00:00:00 +0000
+++ b/tests/test_transport.py	2008-06-12 01:30:02 +0000
@@ -0,0 +1,55 @@
+# search, a bzr plugin for searching within bzr branches/repositories.
+# Copyright (C) 2008 Robert Collins
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+# 
+
+"""Tests for the transport layer."""
+
+from bzrlib.plugins.search import transport
+from bzrlib.tests import TestCaseWithTransport
+
+
+class TestFileView(TestCaseWithTransport):
+    
+    def get_bulk_and_view_data(self):
+        """Get sample data for a view on a file."""
+        bulk_data = []
+        for count in range(4096):
+            bulk_data.append(str(count))
+        bulk_data = ":".join(bulk_data)
+        view_data = bulk_data[400:1600]
+        file_map = {"Foo.1": (400, 1600)}
+        base_transport = self.get_transport(".")
+        base_transport.put_bytes("foo.pack", bulk_data)
+        return bulk_data, view_data, file_map
+
+    def test_get(self):
+        bulk_data, view_data, file_map = self.get_bulk_and_view_data()
+        base_transport = self.get_transport(".")
+        view = transport.FileView(base_transport, "foo.pack", file_map)
+        # Doing a get() returns a file which only contains the view_data.
+        visible_bytes = view.get("Foo.1").read()
+        self.assertEqual(visible_bytes, view_data)
+
+    def test_readv(self):
+        bulk_data, view_data, file_map = self.get_bulk_and_view_data()
+        base_transport = self.get_transport(".")
+        view = transport.FileView(base_transport, "foo.pack", file_map)
+        # A readv on the view is trimmed to the data between offsets 400 and
+        # 1600 of the backing file.
+        for offset, data in view.readv('Foo.1', [(0, 10), (700, 100)], True,
+            800):
+            matching_data = view_data[offset:offset + len(data)]
+            self.assertEqual(matching_data, data)

=== added file 'transport.py'
--- a/transport.py	1970-01-01 00:00:00 +0000
+++ b/transport.py	2008-06-12 01:30:02 +0000
@@ -0,0 +1,87 @@
+# search, a bzr plugin for searching within bzr branches/repositories.
+# Copyright (C) 2008 Robert Collins
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+# 
+
+"""Transport facilities to support the index engine.
+
+The primary class here is FileView, an adapter for exposing a number of files 
+in a pack (with identity encoding only!) such that they can be accessed via
+readv.
+"""
+
+from cStringIO import StringIO
+
+
+class FileView(object):
+    """An adapter from a pack file to multiple smaller readvable files.
+
+    A typical use for this is to embed GraphIndex objects in a pack and then
+    use this to allow the GraphIndex logic to readv while actually reading
+    from the pack.
+
+    Currently only the get and readv methods are supported; all the rest of the
+    transport interface will raise AttributeError - this is deliberate to catch
+    unexpected uses.
+    """
+
+    def __init__(self, backing_transport, backing_file, file_map):
+        """Create a FileView.
+
+        :param backing_transport: The transport the pack file is located on.
+        :param backing_file: The url fragment name of the pack file.
+        :param file_map: A dict from file url fragments to byte ranges in
+            the pack file. Pack file header and trailer overhead should not
+            be included in these ranges.
+        """
+        self._backing_transport = backing_transport
+        self._backing_file = backing_file
+        self._file_map = file_map
+
+    def get(self, relpath):
+        """See Transport.get."""
+        start, stop = self._file_map[relpath]
+        length = stop - start
+        _, bytes = self._backing_transport.readv(self._backing_file,
+            [(start, length)]).next()
+        return StringIO(bytes)
+
+    def readv(self, relpath, offsets, adjust_for_latency=False,
+        upper_limit=None):
+        """See Transport.readv.
+
+        This adapter will clip results back to the range defined by the
+        file_map.
+        """
+        base, upper_limit = self._file_map[relpath]
+        # adjust offsets
+        new_offsets = []
+        for offset, length in offsets:
+            new_offsets.append((offset + base, length))
+        for offset, data in self._backing_transport.readv(self._backing_file,
+            new_offsets, adjust_for_latency=adjust_for_latency,
+            upper_limit=upper_limit):
+            if offset + len(data) > upper_limit:
+                upper_trim = len(data) + offset - upper_limit
+            else:
+                upper_trim = None
+            if offset < base:
+                lower_trim = base - offset
+                offset = base
+            else:
+                lower_trim = 0
+            data = data[lower_trim:upper_trim]
+            offset = offset - base
+            yield offset, data

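For completeness, the view behaves like this from a caller's perspective: get() serves only the mapped range, and readv() rebases the caller's offsets onto the pack and trims anything the backing transport over-reads back to that range. A self-contained illustration using a minimal in-memory stand-in for the backing transport (FakeTransport exists only for this example and is not part of the plugin):

from bzrlib.plugins.search.transport import FileView


class FakeTransport(object):
    """Serve readv requests from an in-memory string (example only)."""

    def __init__(self, data):
        self._data = data

    def readv(self, relpath, offsets, adjust_for_latency=False,
        upper_limit=None):
        # A real transport may coalesce or over-read when adjust_for_latency
        # is set; FileView's trimming copes with that.
        for offset, length in offsets:
            yield offset, self._data[offset:offset + length]


pack_data = "HEADER" + "x" * 100 + "TRAILER"
# Expose only the 100 'x' bytes as the virtual file "post.1".
view = FileView(FakeTransport(pack_data), "ignored.pack", {"post.1": (6, 106)})
assert view.get("post.1").read() == "x" * 100
for offset, data in view.readv("post.1", [(0, 10), (50, 20)]):
    assert data == "x" * len(data)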