Rev 3670: Move the point at which we 'buffer_all' if we've read >50% of the index. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer

John Arbash Meinel john at arbash-meinel.com
Tue Sep 2 18:52:03 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer

------------------------------------------------------------
revno: 3670
revision-id: john at arbash-meinel.com-20080902175200-nge9qgk0gklkd5ew
parent: john at arbash-meinel.com-20080829185848-svqlofrr394rp9x9
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: graph_index_autobuffer
timestamp: Tue 2008-09-02 12:52:00 -0500
message:
  Move the point at which we 'buffer_all' if we've read >50% of the index.
  
  We were doing it as soon as you entered 'iter_entries', but often you may already have enough
  info to return results. And for small, mostly local ops, we don't need to buffer all.
  (This happens mostly with moderate-size indexes, where the first read of the header
  is enough to give you the data you need, but happens to be >50% of the whole file.)
-------------- next part --------------
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2008-08-29 18:58:48 +0000
+++ b/bzrlib/index.py	2008-09-02 17:52:00 +0000
@@ -480,11 +480,6 @@
         if self._size is None and self._nodes is None:
             self._buffer_all()
 
-        if self._nodes is None and self._bytes_read * 2 >= self._size:
-            # We've already read more than 50% of the file, go ahead and buffer
-            # the whole thing
-            self._buffer_all()
-
         # We fit about 20 keys per minimum-read (4K), so if we are looking for
         # more than 1/20th of the index its likely (assuming homogenous key
         # spread) that we'll read the entire index. If we're going to do that,
@@ -714,9 +709,15 @@
             if length > 0:
                 readv_ranges.append((location, length))
         self._read_and_parse(readv_ranges)
+        if self._nodes is not None:
+            # The _read_and_parse triggered a _buffer_all, grab the data and
+            # return it
+            for location, key in pending_references:
+                value, refs = self._nodes[key]
+                result.append(((location, key), (self, key, value, refs)))
+            return result
         for location, key in pending_references:
             # answer key references we had to look-up-late.
-            index = self._parsed_key_index(key)
             value, refs = self._bisect_nodes[key]
             result.append(((location, key), (self, key,
                 value, self._resolve_references(refs))))
@@ -992,25 +993,32 @@
 
         :param readv_ranges: A prepared readv range list.
         """
-        if readv_ranges:
-            readv_data = self._transport.readv(self._name, readv_ranges, True,
-                self._size)
-            # parse
-            for offset, data in readv_data:
-                self._bytes_read += len(data)
-                if offset == 0 and len(data) == self._size:
-                    # We read the whole range, most likely because the
-                    # Transport upcast our readv ranges into one long request
-                    # for enough total data to grab the whole index.
-                    self._buffer_all(StringIO(data))
-                    return
-                if self._bisect_nodes is None:
-                    # this must be the start
-                    if not (offset == 0):
-                        raise AssertionError()
-                    offset, data = self._parse_header_from_bytes(data)
-                # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
-                self._parse_region(offset, data)
+        if not readv_ranges:
+            return
+        if self._nodes is None and self._bytes_read * 2 >= self._size:
+            # We've already read more than 50% of the file and we are about to
+            # request more data, just _buffer_all() and be done
+            self._buffer_all()
+            return
+
+        readv_data = self._transport.readv(self._name, readv_ranges, True,
+            self._size)
+        # parse
+        for offset, data in readv_data:
+            self._bytes_read += len(data)
+            if offset == 0 and len(data) == self._size:
+                # We read the whole range, most likely because the
+                # Transport upcast our readv ranges into one long request
+                # for enough total data to grab the whole index.
+                self._buffer_all(StringIO(data))
+                return
+            if self._bisect_nodes is None:
+                # this must be the start
+                if not (offset == 0):
+                    raise AssertionError()
+                offset, data = self._parse_header_from_bytes(data)
+            # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
+            self._parse_region(offset, data)
 
     def _signature(self):
         """The file signature for this index type."""

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2008-08-29 17:38:47 +0000
+++ b/bzrlib/tests/test_index.py	2008-09-02 17:52:00 +0000
@@ -595,14 +595,51 @@
         # generate a big enough index that we only read some of it on a typical
         # bisection lookup.
         nodes = []
+        for counter in range(99):
+            nodes.append((self.make_key(counter), self.make_value(counter),
+                ((self.make_key(counter + 20),),)  ))
+        index = self.make_index(ref_lists=1, nodes=nodes)
+        # lookup a key in the middle that does not exist, so that when we can
+        # check that the referred-to-keys are not accessed automatically.
+        index_size = index._size
+        index_center = index_size // 2
+        result = index._lookup_keys_via_location(
+            [(index_center, ('40', ))])
+        # check the parse map - only the start and middle should have been
+        # parsed.
+        self.assertEqual([(0, 4027), (10198, 14028)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(17)),
+                          (self.make_key(44), self.make_key(5))],
+            index._parsed_key_map)
+        # and check the transport activity likewise.
+        self.assertEqual(
+            [('readv', 'index', [(index_center, 800), (0, 200)], True,
+                                  index_size)],
+            index._transport._activity)
+        # reset the transport log for testing the reference lookup
+        del index._transport._activity[:]
+        # now looking up a key in the portion of the file already parsed should
+        # only perform IO to resolve its key references.
+        result = index._lookup_keys_via_location([(11000, self.make_key(45))])
+        self.assertEqual(
+            [((11000, self.make_key(45)),
+              (index, self.make_key(45), self.make_value(45),
+               ((self.make_key(65),),)))],
+            result)
+        self.assertEqual([('readv', 'index', [(15093, 800)], True, index_size)],
+            index._transport._activity)
+
+    def test_lookup_key_can_buffer_all(self):
+        nodes = []
         for counter in range(64):
             nodes.append((self.make_key(counter), self.make_value(counter),
                 ((self.make_key(counter + 20),),)  ))
         index = self.make_index(ref_lists=1, nodes=nodes)
         # lookup a key in the middle that does not exist, so that when we can
         # check that the referred-to-keys are not accessed automatically.
-        result =index._lookup_keys_via_location(
-            [(index._size // 2, ('40', ))])
+        index_size = index._size
+        index_center = index_size // 2
+        result = index._lookup_keys_via_location([(index_center, ('40', ))])
         # check the parse map - only the start and middle should have been
         # parsed.
         self.assertEqual([(0, 3890), (6444, 10274)], index._parsed_byte_map)
@@ -611,20 +648,22 @@
             index._parsed_key_map)
         # and check the transport activity likewise.
         self.assertEqual(
-            [('readv', 'index', [(7906, 800), (0, 200)], True, 15813)],
+            [('readv', 'index', [(index_center, 800), (0, 200)], True,
+                                  index_size)],
             index._transport._activity)
         # reset the transport log for testing the reference lookup
         del index._transport._activity[:]
         # now looking up a key in the portion of the file already parsed should
         # only perform IO to resolve its key references.
-        result = index._lookup_keys_via_location([(4000, self.make_key(40))])
+        result = index._lookup_keys_via_location([(7000, self.make_key(40))])
         self.assertEqual(
-            [((4000, self.make_key(40)),
+            [((7000, self.make_key(40)),
               (index, self.make_key(40), self.make_value(40),
                ((self.make_key(60),),)))],
             result)
-        self.assertEqual([('readv', 'index', [(11976, 800)], True, 15813)],
-            index._transport._activity)
+        # Resolving the references would have required more data read, and we
+        # are already above the 50% threshold, so it triggered a _buffer_all
+        self.assertEqual([('get', 'index')], index._transport._activity)
 
     def test_iter_all_entries_empty(self):
         index = self.make_index()
@@ -682,8 +721,13 @@
         list(index.iter_entries([self.make_key(40)]))
         self.assertIs(None, index._nodes)
         self.assertEqual(8192, index._bytes_read)
-        # But on the next pass, we will trigger buffer all
+        # On the next pass, we will not trigger buffer all if the key is
+        # available without reading more
         list(index.iter_entries([self.make_key(32)]))
+        self.assertIs(None, index._nodes)
+        # But if we *would* need to read more to resolve it, then we will
+        # buffer all.
+        list(index.iter_entries([self.make_key(60)]))
         self.assertIsNot(None, index._nodes)
 
     def test_iter_entries_references_resolved(self):



More information about the bazaar-commits mailing list