Rev 3221: Reduce round trips on index reads by resolving references in parallel with finding keys that have not yet been found. in http://people.ubuntu.com/~robertc/baz2.0/index.range_map

Robert Collins robertc at robertcollins.net
Fri Feb 8 01:56:57 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/index.range_map

------------------------------------------------------------
revno: 3221
revision-id:robertc at robertcollins.net-20080208015653-igb6l64cr7mewh8v
parent: robertc at robertcollins.net-20080208004454-02go73ur2lwtnwqi
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.range_map
timestamp: Fri 2008-02-08 12:56:53 +1100
message:
  Reduce round trips on index reads by resolving references in parallel with finding keys that have not yet been found.
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
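For readers skimming the patch: the change gives _lookup_keys an opaque `state` argument and returns it alongside the results, so the extra reads needed to resolve key references are issued together with the next batch of key lookups rather than as a separate round trip. A minimal sketch of the resulting caller loop (illustrative names, not code from this commit):

    # Hypothetical driver, assuming `index` exposes the new
    # _lookup_keys(keys, state) -> (results, state) API from the diff below.
    def lookup_all(index, keys):
        found = {}
        search_keys = list(keys)
        state = None
        while search_keys:
            search_results, state = index._lookup_keys(search_keys, state)
            search_keys = []
            for key, status in search_results:
                if status is True:
                    # Still searching (or waiting on references); ask again,
                    # passing the state back so the pending reference readv
                    # rides along with the next batch of lookups.
                    search_keys.append(key)
                elif status is False:
                    found[key] = None            # definitely not present
                else:
                    found[key] = status          # (index, key, value[, refs])
        return found
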
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2008-02-08 00:44:54 +0000
+++ b/bzrlib/index.py	2008-02-08 01:56:53 +0000
@@ -473,8 +473,9 @@
         """
         size = self._size
         search_keys = keys
+        state = None
         while search_keys:
-            search_results = self._lookup_keys(search_keys)
+            search_results, state = self._lookup_keys(search_keys, state)
             search_keys = []
             for key, status in search_results:
                 if status == 1:
@@ -571,7 +572,7 @@
             self._read_and_parse([_HEADER_READV])
         return self._key_count
 
-    def _lookup_keys(self, keys):
+    def _lookup_keys(self, keys, state=None):
         """Public interface for implementing partial disk reading.
 
         If _buffer_all has been called, then all the data for the index is in
@@ -587,21 +588,32 @@
         the next batch.
 
         :param keys: A list of keys.
+        :param state: The current lookup state - opaque to this method, use
+            None for the first call in a series of lookups.
         :return: A list of (key, status) tuples where status is False for
             not-present, True for still searching, or otherwise the result.
         """
-        # Possible improvements:
-        #  - only bisect lookup each key once
-        #  - sort the keys first, and use that to reduce the bisection window
-        # ----- 
         # this progresses in three parts:
         # read data
         # parse it
         # attempt to answer the question from the now in memory data.
+        # ---
         # build the readv request
         # for each location, ask for 800 bytes - much more than rows we've seen
         # anywhere.
         readv_ranges = []
+        if state is not None:
+            # Currently state is just a set of extra readv's we need to do to
+            # answer key references.
+            for location in state:
+                length = 800
+                if location + length > self._size:
+                    length = self._size - location
+                # TODO: trim out parsed locations (e.g. if the 800 is into the
+                # parsed region trim it, and dont use the adjust_for_latency
+                # facility)
+                assert length > 0
+                readv_ranges.append((location, length))
         range_map = self._parsed_map
         # keys we know we can answer in the negative
         missing_keys = set()
@@ -642,7 +654,6 @@
         #  - result present references so we can return them.
         result = []
         # keys that we cannot answer until we resolve references
-        pending_references = []
         pending_locations = set()
         for key in keys:
             # can we answer from cache?
@@ -660,7 +671,8 @@
                                 wanted_locations.append(ref)
                     if wanted_locations:
                         pending_locations.update(wanted_locations)
-                        pending_references.append((0, key))
+                        # Cannot answer this key yet
+                        result.append((key, True))
                         continue
                     result.append((key, (self, key,
                         value, self._resolve_references(refs))))
@@ -681,25 +693,7 @@
                 continue
             # The key is neither absent nor parsed. Keep it in flight.
             result.append((key, True))
-        readv_ranges = []
-        # lookup data to resolve references
-        for location in pending_locations:
-            length = 800
-            if location + length > self._size:
-                length = self._size - location
-            # TODO: trim out parsed locations (e.g. if the 800 is into the
-            # parsed region trim it, and dont use the adjust_for_latency
-            # facility)
-            assert length > 0
-            readv_ranges.append((location, length))
-        self._read_and_parse(readv_ranges)
-        for location, key in pending_references:
-            # answer key references we had to look-up-late.
-            index = self._parsed_key_index(key)
-            value, refs = self._bisect_nodes[key]
-            result.append((key, (self, key,
-                value, self._resolve_references(refs))))
-        return result
+        return result, pending_locations
 
     def _parse_header_from_bytes(self, bytes):
         """Parse the header from a region of bytes.

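The `state` handed back above is just the set of byte offsets that still need to be fetched to answer key references. A rough standalone sketch of the range construction the new branch performs, assuming `size` is the total index length (the TODO about trimming already-parsed regions applies here too; this is an illustration, not the bzrlib code):

    # Turn pending reference locations into (offset, length) readv ranges,
    # probing 800 bytes per location and clamping at the end of the file.
    def ranges_for_pending(pending_locations, size, probe=800):
        readv_ranges = []
        for location in sorted(pending_locations):
            length = min(probe, size - location)
            # A location at or past the end of the file would be a bug in
            # whatever recorded the pending reference.
            assert length > 0
            readv_ranges.append((location, length))
        return readv_ranges
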
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2008-02-08 00:44:54 +0000
+++ b/bzrlib/tests/test_index.py	2008-02-08 01:56:53 +0000
@@ -378,8 +378,7 @@
         del index._transport._activity[:]
         # do a _lookup_keys call for the middle of the file, which
         # is what bisection uses.
-        result = index._lookup_keys(
-            [('missing', )])
+        result, state = index._lookup_keys([('missing', )], None)
         # this should have asked for a readv request, with adjust_for_latency,
         # and two regions: the header, and half-way into the file.
         self.assertEqual([
@@ -403,7 +402,7 @@
         index = self.make_index(nodes=[(('name', ), 'data', ())])
         # reset the transport log
         del index._transport._activity[:]
-        result = index._lookup_keys([('missing', )])
+        result, state = index._lookup_keys([('missing', )], None)
         # this should have asked for a readv request, with adjust_for_latency,
         # and two regions: the header, and half-way into the file.
         self.assertEqual([
@@ -427,7 +426,7 @@
         for counter in range(64):
             nodes.append((make_key(counter), 'Y'*100, ()))
         index = self.make_index(nodes=nodes)
-        result = index._lookup_keys([('40', )])
+        result, state = index._lookup_keys([('40', )], None)
         # and the result should be that the key cannot be present, because key is
         # in the middle of the observed data from a 4K read - the smallest transport
         # will do today with this api.
@@ -463,8 +462,7 @@
         for counter in range(62):
             nodes.append((make_key(counter), make_value(counter), ()))
         index = self.make_index(nodes=nodes)
-        result = index._lookup_keys(
-            [('40', )])
+        result, state = index._lookup_keys([('40', )], None)
         # and we should have a parse map that includes the header and the
         # region that was parsed after trimming.
         expected_map = SortedKeyRangeMap()
@@ -474,8 +472,7 @@
             (12823, KeyEndSentinel()), (12824, KeyEndSentinel()))
         self.assertEqual(expected_map, index._parsed_map)
         # now ask for two keys, which are at the next bisection steps before and after the parsed region
-        result = index._lookup_keys(
-            [make_key(28), make_key(48)])
+        result, state = index._lookup_keys([make_key(28), make_key(48)], None)
         self.assertEqual([
             (make_key(28), (index, make_key(28), make_value(28))),
             (make_key(48), (index, make_key(48), make_value(48))),
@@ -498,7 +495,7 @@
             nodes.append((make_key(counter), 'Y'*100, ()))
         index = self.make_index(nodes=nodes)
         # lookup the keys in the middle of the file
-        result = index._lookup_keys([('40', )])
+        result, state = index._lookup_keys([('40', )], None)
         # check the parse map, this determines the test validity
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (3972, make_key(26)))
@@ -513,7 +510,7 @@
         # be in the index) - even when the byte location we ask for is outside
         # the parsed region
         # 
-        result = index._lookup_keys([('40', )])
+        result, state = index._lookup_keys([('40', )], None)
         self.assertEqual([(('40', ), False)], result)
         self.assertEqual([], index._transport._activity)
 
@@ -529,8 +526,7 @@
             nodes.append((make_key(counter), make_value(counter), ()))
         index = self.make_index(nodes=nodes)
         # lookup the keys in the middle of the file
-        result =index._lookup_keys(
-            [(index._size // 2, ('40', ))])
+        result, state = index._lookup_keys([('40', )], None)
         # check the parse map, this determines the test validity
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (4008, make_key(26)))
@@ -545,7 +541,7 @@
         # be in the index) - even when the byte location we ask for is outside
         # the parsed region
         # 
-        result = index._lookup_keys([make_key(40)])
+        result, state = index._lookup_keys([make_key(40)], None)
         self.assertEqual(
             [(make_key(40), (index, make_key(40), make_value(40)))],
             result)
@@ -564,7 +560,7 @@
         # asking for the middle) we want to be told that the key is below the
         # point we asked for (because its in the unparsed region after the
         # header and before the middle.)
-        result =index._lookup_keys([('27', )])
+        result, state = index._lookup_keys([('27', )], None)
         # We should have parsed the header and the middle of the file.
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (3972, make_key(26)))
@@ -586,7 +582,7 @@
         # ask for the key near the end via the bisection interface (but
         # asking for the middle) - we want to be told that the key is after the
         # point we asked for (because its in the unparsed region at the end.)
-        result =index._lookup_keys([('50', )])
+        result, state = index._lookup_keys([('50', )], None)
         # We should have parsed the header and the middle of the file.
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (3972, make_key(26)))
@@ -610,8 +606,7 @@
         index = self.make_index(ref_lists=1, nodes=nodes)
         # lookup a key in the middle that does not exist, so that when we can
         # check that the referred-to-keys are not accessed automatically.
-        result =index._lookup_keys(
-            [(index._size // 2, ('40', ))])
+        result, state = index._lookup_keys([('40', )], None)
         # check the parse map - only the start and middle should have been
         # parsed.
         expected_map = SortedKeyRangeMap()
@@ -627,8 +622,13 @@
         # reset the transport log for testing the reference lookup
         del index._transport._activity[:]
         # now looking up a key in the portion of the file already parsed should
-        # only perform IO to resolve its key references.
-        result = index._lookup_keys([make_key(40)])
+        # only perform IO to resolve its key references - and should do that on
+        # the second pass to eliminate unneeded readv calls.
+        result, state = index._lookup_keys([make_key(40)], None)
+        # state is opaque.
+        self.assertEqual([(make_key(40), True)], result)
+        # Try again, this time it should resolve:
+        result, state = index._lookup_keys([make_key(40)], state)
         self.assertEqual(
             [(make_key(40),
               (index, make_key(40), make_value(40), ((make_key(60),),)))],

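From the caller's point of view, the updated reference-resolution test above amounts to a two-pass conversation with the index; roughly, reusing the test's own make_key/make_value helpers:

    # Pass 1: the key's value is already parsed but its references are not,
    # so the lookup reports "still searching" and hands back the outstanding
    # read locations as opaque state.
    result, state = index._lookup_keys([make_key(40)], None)
    # result == [(make_key(40), True)]
    # Pass 2: the pending reads are issued with this batch, so the references
    # resolve and the full entry is returned.
    result, state = index._lookup_keys([make_key(40)], state)
    # result == [(make_key(40),
    #             (index, make_key(40), make_value(40), ((make_key(60),),)))]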

