Rev 3220: Simplify the interface for lookup up index keys incrementally - no longer use bisect_multi_bytes. in http://people.ubuntu.com/~robertc/baz2.0/index.range_map

Fri Feb 8 00:45:00 GMT 2008

At http://people.ubuntu.com/~robertc/baz2.0/index.range_map

------------------------------------------------------------
revno: 3220
revision-id:robertc at robertcollins.net-20080208004454-02go73ur2lwtnwqi
parent: robertc at robertcollins.net-20080207054031-icggsivf6box0v15
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.range_map
timestamp: Fri 2008-02-08 11:44:54 +1100
message:
  Simplify the interface for lookup up index keys incrementally - no longer use bisect_multi_bytes.
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'

--- a/bzrlib/index.py	2008-02-07 05:40:31 +0000
+++ b/bzrlib/index.py	2008-02-08 00:44:54 +0000
@@ -31,7 +31,6 @@
 from bzrlib.lazy_import import lazy_import
 lazy_import(globals(), """
 from bzrlib import trace
-from bzrlib.bisect_multi import bisect_multi_bytes
 from bzrlib.rangemap import KeyEndSentinel, SortedKeyRangeMap
 from bzrlib.revision import NULL_REVISION
 from bzrlib.trace import mutter
@@ -463,8 +462,28 @@
         if self._nodes is not None:
             return self._iter_entries_from_total_buffer(keys)
         else:
-            return (result[1] for result in bisect_multi_bytes(
-                self._lookup_keys_via_location, self._size, keys))
+            return self._iter_entries_from_disk(keys)
+
+    def _iter_entries_from_disk(self, keys):
+        """Iterate over keys loading from disk as needed.
+        
+        :param keys: The keys to lookup.
+        :return: A generator which yields the keys that have been found. No I/O
+            is outstanding when each yield is performed.
+        """
+        size = self._size
+        search_keys = keys
+        while search_keys:
+            search_results = self._lookup_keys(search_keys)
+            search_keys = []
+            for key, status in search_results:
+                if status == 1:
+                    search_keys.append(key)
+                elif status == False:
+                    # not present, stop searching
+                    continue
+                else:
+                    yield status
 
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.
@@ -552,21 +571,24 @@
             self._read_and_parse([_HEADER_READV])
         return self._key_count
 
-    def _lookup_keys_via_location(self, location_keys):
-        """Public interface for implementing bisection.
+    def _lookup_keys(self, keys):
+        """Public interface for implementing partial disk reading.
 
         If _buffer_all has been called, then all the data for the index is in
         memory, and this method should not be called, as it uses a separate
         cache because it cannot pre-resolve all indices, which buffer_all does
         for performance.
 
-        Note that the location component of location_keys is entirely ignored.
-        Instead we read mid-way between the nearest parsed ranges, which
-        reduces lookups.
+        We probe for each key mid-way between the nearest parsed ranges, which
+        produces a bisection based disk lookup pattern. A small defect in this
+        is that to resolve key references we perform a second IO, an
+        improvement on this is to return the key as still pending and have some
+        state in the lookup process allowing us to issue a read for the key in
+        the next batch.
 
-        :param location_keys: A list of location(byte offset), key tuples.
-        :return: A list of (location_key, result) tuples as expected by
-            bzrlib.bisect_multi.bisect_multi_bytes.
+        :param keys: A list of keys.
+        :return: A list of (key, status) tuples where status is False for
+            not-present, True for still searching, or otherwise the result.
         """
         # Possible improvements:
         #  - only bisect lookup each key once
@@ -583,7 +605,7 @@
         range_map = self._parsed_map
         # keys we know we can answer in the negative
         missing_keys = set()
-        for _, key in location_keys:
+        for key in keys:
             # can we answer from cache?
             if self._bisect_nodes and key in self._bisect_nodes:
                 # We have the key parsed.
@@ -622,7 +644,7 @@
         # keys that we cannot answer until we resolve references
         pending_references = []
         pending_locations = set()
-        for _, key in location_keys:
+        for key in keys:
             # can we answer from cache?
             if key in self._bisect_nodes:
                 # the key has been parsed, and may or may not have parsed the
@@ -640,10 +662,10 @@
                         pending_locations.update(wanted_locations)
                         pending_references.append((0, key))
                         continue
-                    result.append(((0, key), (self, key,
+                    result.append((key, (self, key,
                         value, self._resolve_references(refs))))
                 else:
-                    result.append(((0, key),
+                    result.append((key,
                         (self, key, self._bisect_nodes[key])))
                 continue
             if key in missing_keys:
@@ -655,10 +677,10 @@
                 in_range, key_range = range_map.get_key_range(key)
             if in_range:
                 # We have parsed the region for the key, so its missing.
-                result.append(((0, key), False))
+                result.append((key, False))
                 continue
             # The key is neither absent nor parsed. Keep it in flight.
-            result.append(((0, key), 1))
+            result.append((key, True))
         readv_ranges = []
         # lookup data to resolve references
         for location in pending_locations:
@@ -668,14 +690,14 @@
             # TODO: trim out parsed locations (e.g. if the 800 is into the
             # parsed region trim it, and dont use the adjust_for_latency
             # facility)
-            if length > 0:
-                readv_ranges.append((location, length))
+            assert length > 0
+            readv_ranges.append((location, length))
         self._read_and_parse(readv_ranges)
         for location, key in pending_references:
             # answer key references we had to look-up-late.
             index = self._parsed_key_index(key)
             value, refs = self._bisect_nodes[key]
-            result.append(((location, key), (self, key,
+            result.append((key, (self, key,
                 value, self._resolve_references(refs))))
         return result
 

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2008-02-07 05:40:31 +0000
+++ b/bzrlib/tests/test_index.py	2008-02-08 00:44:54 +0000
@@ -372,14 +372,14 @@
         index = self.make_index()
         self.assertEqual(SortedKeyRangeMap(), index._parsed_map)
 
-    def test_first_lookup_key_via_location(self):
+    def test_first_lookup_key(self):
         index = self.make_index()
         # reset the transport log
         del index._transport._activity[:]
-        # do a _lookup_keys_via_location call for the middle of the file, which
+        # do a _lookup_keys call for the middle of the file, which
         # is what bisection uses.
-        result = index._lookup_keys_via_location(
-            [(index._size // 2, ('missing', ))])
+        result = index._lookup_keys(
+            [('missing', )])
         # this should have asked for a readv request, with adjust_for_latency,
         # and two regions: the header, and half-way into the file.
         self.assertEqual([
@@ -388,7 +388,7 @@
             index._transport._activity)
         # and the result should be that the key cannot be present, because this
         # is a trivial index.
-        self.assertEqual([((0, ('missing', )), False)], result)
+        self.assertEqual([(('missing', ), False)], result)
         # And the regions of the file that have been parsed - in this case the
         # entire file - should be in the parsed region map.
         expected_map = SortedKeyRangeMap()
@@ -403,8 +403,7 @@
         index = self.make_index(nodes=[(('name', ), 'data', ())])
         # reset the transport log
         del index._transport._activity[:]
-        result = index._lookup_keys_via_location(
-            [(index._size // 2, ('missing', ))])
+        result = index._lookup_keys([('missing', )])
         # this should have asked for a readv request, with adjust_for_latency,
         # and two regions: the header, and half-way into the file.
         self.assertEqual([
@@ -413,7 +412,7 @@
             index._transport._activity)
         # and the result should be that the key cannot be present, because this
         # is a trivial index and we should not have to do more round trips.
-        self.assertEqual([((0, ('missing', )), False)], result)
+        self.assertEqual([(('missing', ), False)], result)
         # The whole file should be parsed at this point.
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (73, KeyEndSentinel()))
@@ -428,13 +427,11 @@
         for counter in range(64):
             nodes.append((make_key(counter), 'Y'*100, ()))
         index = self.make_index(nodes=nodes)
-        result = index._lookup_keys_via_location(
-            [(index._size // 2, ('40', ))])
+        result = index._lookup_keys([('40', )])
         # and the result should be that the key cannot be present, because key is
         # in the middle of the observed data from a 4K read - the smallest transport
         # will do today with this api.
-        self.assertEqual([((0, ('40', )), False)],
-            result)
+        self.assertEqual([(('40', ), False)], result)
         # and we should have a parse map that includes the header and the
         # region that was parsed after trimming.
         expected_map = SortedKeyRangeMap()
@@ -466,8 +463,8 @@
         for counter in range(62):
             nodes.append((make_key(counter), make_value(counter), ()))
         index = self.make_index(nodes=nodes)
-        result = index._lookup_keys_via_location(
-            [(index._size // 2, ('40', ))])
+        result = index._lookup_keys(
+            [('40', )])
         # and we should have a parse map that includes the header and the
         # region that was parsed after trimming.
         expected_map = SortedKeyRangeMap()
@@ -477,11 +474,11 @@
             (12823, KeyEndSentinel()), (12824, KeyEndSentinel()))
         self.assertEqual(expected_map, index._parsed_map)
         # now ask for two keys, which are at the next bisection steps before and after the parsed region
-        result = index._lookup_keys_via_location(
-            [(11450, make_key(28)), (15534, make_key(48))])
+        result = index._lookup_keys(
+            [make_key(28), make_key(48)])
         self.assertEqual([
-            ((0, make_key(28)), (index, make_key(28), make_value(28))),
-            ((0, make_key(48)), (index, make_key(48), make_value(48))),
+            (make_key(28), (index, make_key(28), make_value(28))),
+            (make_key(48), (index, make_key(48), make_value(48))),
             ],
             result)
         expected_map = SortedKeyRangeMap()
@@ -501,8 +498,7 @@
             nodes.append((make_key(counter), 'Y'*100, ()))
         index = self.make_index(nodes=nodes)
         # lookup the keys in the middle of the file
-        result =index._lookup_keys_via_location(
-            [(index._size // 2, ('40', ))])
+        result = index._lookup_keys([('40', )])
         # check the parse map, this determines the test validity
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (3972, make_key(26)))
@@ -517,10 +513,8 @@
         # be in the index) - even when the byte location we ask for is outside
         # the parsed region
         # 
-        result = index._lookup_keys_via_location(
-            [(4000, ('40', ))])
-        self.assertEqual([((0, ('40', )), False)],
-            result)
+        result = index._lookup_keys([('40', )])
+        self.assertEqual([(('40', ), False)], result)
         self.assertEqual([], index._transport._activity)
 
     def test_lookup_present_key_answers_without_io_when_map_permits(self):
@@ -535,7 +529,7 @@
             nodes.append((make_key(counter), make_value(counter), ()))
         index = self.make_index(nodes=nodes)
         # lookup the keys in the middle of the file
-        result =index._lookup_keys_via_location(
+        result =index._lookup_keys(
             [(index._size // 2, ('40', ))])
         # check the parse map, this determines the test validity
         expected_map = SortedKeyRangeMap()
@@ -551,9 +545,9 @@
         # be in the index) - even when the byte location we ask for is outside
         # the parsed region
         # 
-        result = index._lookup_keys_via_location([(4000, make_key(40))])
+        result = index._lookup_keys([make_key(40)])
         self.assertEqual(
-            [((0, make_key(40)), (index, make_key(40), make_value(40)))],
+            [(make_key(40), (index, make_key(40), make_value(40)))],
             result)
         self.assertEqual([], index._transport._activity)
 
@@ -570,8 +564,7 @@
         # asking for the middle) we want to be told that the key is below the
         # point we asked for (because its in the unparsed region after the
         # header and before the middle.)
-        result =index._lookup_keys_via_location(
-            [(index._size // 2, ('27', ))])
+        result =index._lookup_keys([('27', )])
         # We should have parsed the header and the middle of the file.
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (3972, make_key(26)))
@@ -579,7 +572,7 @@
         expected_map.add_range(
             (13235, KeyEndSentinel()), (13236, KeyEndSentinel()))
         self.assertEqual(expected_map, index._parsed_map)
-        self.assertEqual([((0, ('27', )), 1)], result)
+        self.assertEqual([(('27', ), True)], result)
 
     def test_lookup_key_above_probed_area(self):
         # generate a big enough index that we only read some of it on a typical
@@ -593,8 +586,7 @@
         # ask for the key near the end via the bisection interface (but
         # asking for the middle) - we want to be told that the key is after the
         # point we asked for (because its in the unparsed region at the end.)
-        result =index._lookup_keys_via_location(
-            [(index._size // 2, ('50', ))])
+        result =index._lookup_keys([('50', )])
         # We should have parsed the header and the middle of the file.
         expected_map = SortedKeyRangeMap()
         expected_map.add_range((0, None), (3972, make_key(26)))
@@ -602,7 +594,7 @@
         expected_map.add_range(
             (13235, KeyEndSentinel()), (13236, KeyEndSentinel()))
         self.assertEqual(expected_map, index._parsed_map)
-        self.assertEqual([((0, ('50', )), +1)], result)
+        self.assertEqual([(('50', ), True)], result)
 
     def test_lookup_key_resolves_references(self):
         # generate a big enough index that we only read some of it on a typical
@@ -618,7 +610,7 @@
         index = self.make_index(ref_lists=1, nodes=nodes)
         # lookup a key in the middle that does not exist, so that when we can
         # check that the referred-to-keys are not accessed automatically.
-        result =index._lookup_keys_via_location(
+        result =index._lookup_keys(
             [(index._size // 2, ('40', ))])
         # check the parse map - only the start and middle should have been
         # parsed.
@@ -636,9 +628,9 @@
         del index._transport._activity[:]
         # now looking up a key in the portion of the file already parsed should
         # only perform IO to resolve its key references.
-        result = index._lookup_keys_via_location([(4000, make_key(40))])
+        result = index._lookup_keys([make_key(40)])
         self.assertEqual(
-            [((0, make_key(40)),
+            [(make_key(40),
               (index, make_key(40), make_value(40), ((make_key(60),),)))],
             result)
         self.assertEqual([('readv', 'index', [(11976, 800)], True, 15813)],