Rev 3220: Simplify the interface for looking up index keys incrementally - no longer use bisect_multi_bytes. in http://people.ubuntu.com/~robertc/baz2.0/index.range_map
Robert Collins
robertc at robertcollins.net
Fri Feb 8 00:45:00 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/index.range_map
------------------------------------------------------------
revno: 3220
revision-id:robertc at robertcollins.net-20080208004454-02go73ur2lwtnwqi
parent: robertc at robertcollins.net-20080207054031-icggsivf6box0v15
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.range_map
timestamp: Fri 2008-02-08 11:44:54 +1100
message:
Simplify the interface for looking up index keys incrementally - no longer use bisect_multi_bytes.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2008-02-07 05:40:31 +0000
+++ b/bzrlib/index.py 2008-02-08 00:44:54 +0000
@@ -31,7 +31,6 @@
from bzrlib.lazy_import import lazy_import
lazy_import(globals(), """
from bzrlib import trace
-from bzrlib.bisect_multi import bisect_multi_bytes
from bzrlib.rangemap import KeyEndSentinel, SortedKeyRangeMap
from bzrlib.revision import NULL_REVISION
from bzrlib.trace import mutter
@@ -463,8 +462,28 @@
if self._nodes is not None:
return self._iter_entries_from_total_buffer(keys)
else:
- return (result[1] for result in bisect_multi_bytes(
- self._lookup_keys_via_location, self._size, keys))
+ return self._iter_entries_from_disk(keys)
+
+ def _iter_entries_from_disk(self, keys):
+ """Iterate over keys loading from disk as needed.
+
+ :param keys: The keys to lookup.
+ :return: A generator which yields the keys that have been found. No I/O
+ is outstanding when each yield is performed.
+ """
+ search_keys = keys
+ while search_keys:
+ search_results = self._lookup_keys(search_keys)
+ search_keys = []
+ for key, status in search_results:
+ if status is True:
+ # still pending - retry in the next pass
+ search_keys.append(key)
+ elif status is False:
+ # not present in the index
+ continue
+ else:
+ # status is the resolved entry tuple
+ yield status
def iter_entries_prefix(self, keys):
"""Iterate over keys within the index using prefix matching.
@@ -552,21 +571,24 @@
self._read_and_parse([_HEADER_READV])
return self._key_count
- def _lookup_keys_via_location(self, location_keys):
- """Public interface for implementing bisection.
+ def _lookup_keys(self, keys):
+ """Public interface for implementing partial disk reading.
If _buffer_all has been called, then all the data for the index is in
memory, and this method should not be called, as it uses a separate
cache because it cannot pre-resolve all indices, which buffer_all does
for performance.
- Note that the location component of location_keys is entirely ignored.
- Instead we read mid-way between the nearest parsed ranges, which
- reduces lookups.
+ We probe for each key mid-way between the nearest parsed ranges, which
+ produces a bisection-based disk lookup pattern. A small defect is that
+ resolving key references requires a second IO; an improvement would be
+ to return the key as still pending and keep enough state in the lookup
+ process to issue a read for the referenced data in the next batch.
- :param location_keys: A list of location(byte offset), key tuples.
- :return: A list of (location_key, result) tuples as expected by
- bzrlib.bisect_multi.bisect_multi_bytes.
+ :param keys: A list of keys.
+ :return: A list of (key, status) tuples where status is False for
+ not-present, True for still searching, or otherwise the result.
"""
# Possible improvements:
# - only bisect lookup each key once
@@ -583,7 +605,7 @@
range_map = self._parsed_map
# keys we know we can answer in the negative
missing_keys = set()
- for _, key in location_keys:
+ for key in keys:
# can we answer from cache?
if self._bisect_nodes and key in self._bisect_nodes:
# We have the key parsed.
@@ -622,7 +644,7 @@
# keys that we cannot answer until we resolve references
pending_references = []
pending_locations = set()
- for _, key in location_keys:
+ for key in keys:
# can we answer from cache?
if key in self._bisect_nodes:
# the key has been parsed, and may or may not have parsed the
@@ -640,10 +662,10 @@
pending_locations.update(wanted_locations)
pending_references.append((0, key))
continue
- result.append(((0, key), (self, key,
+ result.append((key, (self, key,
value, self._resolve_references(refs))))
else:
- result.append(((0, key),
+ result.append((key,
(self, key, self._bisect_nodes[key])))
continue
if key in missing_keys:
@@ -655,10 +677,10 @@
in_range, key_range = range_map.get_key_range(key)
if in_range:
# We have parsed the region for the key, so its missing.
- result.append(((0, key), False))
+ result.append((key, False))
continue
# The key is neither absent nor parsed. Keep it in flight.
- result.append(((0, key), 1))
+ result.append((key, True))
readv_ranges = []
# lookup data to resolve references
for location in pending_locations:
@@ -668,14 +690,14 @@
# TODO: trim out parsed locations (e.g. if the 800 is into the
# parsed region trim it, and dont use the adjust_for_latency
# facility)
- if length > 0:
- readv_ranges.append((location, length))
+ assert length > 0
+ readv_ranges.append((location, length))
self._read_and_parse(readv_ranges)
for location, key in pending_references:
# answer key references we had to look-up-late.
index = self._parsed_key_index(key)
value, refs = self._bisect_nodes[key]
- result.append(((location, key), (self, key,
+ result.append((key, (self, key,
value, self._resolve_references(refs))))
return result
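For readers following along, here is a minimal sketch (not part of the patch) of how a caller can drive the new _lookup_keys contract described in the docstring above; the find_entries helper and the index variable are hypothetical:

    # Sketch only: repeatedly call _lookup_keys until every key is either
    # resolved to an entry or known to be absent.
    def find_entries(index, keys):
        found = []
        pending = list(keys)
        while pending:
            results = index._lookup_keys(pending)
            pending = []
            for key, status in results:
                if status is True:
                    # not enough of the file parsed yet - ask again next round
                    pending.append(key)
                elif status is False:
                    # definitely not present in the index
                    continue
                else:
                    # status is the resolved entry: (index, key, value[, refs])
                    found.append(status)
        return found
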
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2008-02-07 05:40:31 +0000
+++ b/bzrlib/tests/test_index.py 2008-02-08 00:44:54 +0000
@@ -372,14 +372,14 @@
index = self.make_index()
self.assertEqual(SortedKeyRangeMap(), index._parsed_map)
- def test_first_lookup_key_via_location(self):
+ def test_first_lookup_key(self):
index = self.make_index()
# reset the transport log
del index._transport._activity[:]
- # do a _lookup_keys_via_location call for the middle of the file, which
+ # do a _lookup_keys call for a missing key; the probe reads the middle of the file, which
# is what bisection uses.
- result = index._lookup_keys_via_location(
- [(index._size // 2, ('missing', ))])
+ result = index._lookup_keys(
+ [('missing', )])
# this should have asked for a readv request, with adjust_for_latency,
# and two regions: the header, and half-way into the file.
self.assertEqual([
@@ -388,7 +388,7 @@
index._transport._activity)
# and the result should be that the key cannot be present, because this
# is a trivial index.
- self.assertEqual([((0, ('missing', )), False)], result)
+ self.assertEqual([(('missing', ), False)], result)
# And the regions of the file that have been parsed - in this case the
# entire file - should be in the parsed region map.
expected_map = SortedKeyRangeMap()
@@ -403,8 +403,7 @@
index = self.make_index(nodes=[(('name', ), 'data', ())])
# reset the transport log
del index._transport._activity[:]
- result = index._lookup_keys_via_location(
- [(index._size // 2, ('missing', ))])
+ result = index._lookup_keys([('missing', )])
# this should have asked for a readv request, with adjust_for_latency,
# and two regions: the header, and half-way into the file.
self.assertEqual([
@@ -413,7 +412,7 @@
index._transport._activity)
# and the result should be that the key cannot be present, because this
# is a trivial index and we should not have to do more round trips.
- self.assertEqual([((0, ('missing', )), False)], result)
+ self.assertEqual([(('missing', ), False)], result)
# The whole file should be parsed at this point.
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (73, KeyEndSentinel()))
@@ -428,13 +427,11 @@
for counter in range(64):
nodes.append((make_key(counter), 'Y'*100, ()))
index = self.make_index(nodes=nodes)
- result = index._lookup_keys_via_location(
- [(index._size // 2, ('40', ))])
+ result = index._lookup_keys([('40', )])
# and the result should be that the key cannot be present, because key is
# in the middle of the observed data from a 4K read - the smallest transport
# will do today with this api.
- self.assertEqual([((0, ('40', )), False)],
- result)
+ self.assertEqual([(('40', ), False)], result)
# and we should have a parse map that includes the header and the
# region that was parsed after trimming.
expected_map = SortedKeyRangeMap()
@@ -466,8 +463,8 @@
for counter in range(62):
nodes.append((make_key(counter), make_value(counter), ()))
index = self.make_index(nodes=nodes)
- result = index._lookup_keys_via_location(
- [(index._size // 2, ('40', ))])
+ result = index._lookup_keys(
+ [('40', )])
# and we should have a parse map that includes the header and the
# region that was parsed after trimming.
expected_map = SortedKeyRangeMap()
@@ -477,11 +474,11 @@
(12823, KeyEndSentinel()), (12824, KeyEndSentinel()))
self.assertEqual(expected_map, index._parsed_map)
# now ask for two keys, which are at the next bisection steps before and after the parsed region
- result = index._lookup_keys_via_location(
- [(11450, make_key(28)), (15534, make_key(48))])
+ result = index._lookup_keys(
+ [make_key(28), make_key(48)])
self.assertEqual([
- ((0, make_key(28)), (index, make_key(28), make_value(28))),
- ((0, make_key(48)), (index, make_key(48), make_value(48))),
+ (make_key(28), (index, make_key(28), make_value(28))),
+ (make_key(48), (index, make_key(48), make_value(48))),
],
result)
expected_map = SortedKeyRangeMap()
@@ -501,8 +498,7 @@
nodes.append((make_key(counter), 'Y'*100, ()))
index = self.make_index(nodes=nodes)
# lookup the keys in the middle of the file
- result =index._lookup_keys_via_location(
- [(index._size // 2, ('40', ))])
+ result = index._lookup_keys([('40', )])
# check the parse map, this determines the test validity
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (3972, make_key(26)))
@@ -517,10 +513,8 @@
# be in the index) - even when the byte location we ask for is outside
# the parsed region
#
- result = index._lookup_keys_via_location(
- [(4000, ('40', ))])
- self.assertEqual([((0, ('40', )), False)],
- result)
+ result = index._lookup_keys([('40', )])
+ self.assertEqual([(('40', ), False)], result)
self.assertEqual([], index._transport._activity)
def test_lookup_present_key_answers_without_io_when_map_permits(self):
@@ -535,7 +529,7 @@
nodes.append((make_key(counter), make_value(counter), ()))
index = self.make_index(nodes=nodes)
# lookup the keys in the middle of the file
- result =index._lookup_keys_via_location(
- [(index._size // 2, ('40', ))])
+ result = index._lookup_keys([('40', )])
# check the parse map, this determines the test validity
expected_map = SortedKeyRangeMap()
@@ -551,9 +545,9 @@
# be in the index) - even when the byte location we ask for is outside
# the parsed region
#
- result = index._lookup_keys_via_location([(4000, make_key(40))])
+ result = index._lookup_keys([make_key(40)])
self.assertEqual(
- [((0, make_key(40)), (index, make_key(40), make_value(40)))],
+ [(make_key(40), (index, make_key(40), make_value(40)))],
result)
self.assertEqual([], index._transport._activity)
@@ -570,8 +564,7 @@
# asking for the middle) we want to be told that the key is below the
# point we asked for (because its in the unparsed region after the
# header and before the middle.)
- result =index._lookup_keys_via_location(
- [(index._size // 2, ('27', ))])
+ result = index._lookup_keys([('27', )])
# We should have parsed the header and the middle of the file.
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (3972, make_key(26)))
@@ -579,7 +572,7 @@
expected_map.add_range(
(13235, KeyEndSentinel()), (13236, KeyEndSentinel()))
self.assertEqual(expected_map, index._parsed_map)
- self.assertEqual([((0, ('27', )), 1)], result)
+ self.assertEqual([(('27', ), True)], result)
def test_lookup_key_above_probed_area(self):
# generate a big enough index that we only read some of it on a typical
@@ -593,8 +586,7 @@
# ask for the key near the end via the bisection interface (but
# asking for the middle) - we want to be told that the key is after the
# point we asked for (because its in the unparsed region at the end.)
- result =index._lookup_keys_via_location(
- [(index._size // 2, ('50', ))])
+ result = index._lookup_keys([('50', )])
# We should have parsed the header and the middle of the file.
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (3972, make_key(26)))
@@ -602,7 +594,7 @@
expected_map.add_range(
(13235, KeyEndSentinel()), (13236, KeyEndSentinel()))
self.assertEqual(expected_map, index._parsed_map)
- self.assertEqual([((0, ('50', )), +1)], result)
+ self.assertEqual([(('50', ), True)], result)
def test_lookup_key_resolves_references(self):
# generate a big enough index that we only read some of it on a typical
@@ -618,7 +610,7 @@
index = self.make_index(ref_lists=1, nodes=nodes)
# lookup a key in the middle that does not exist, so that when we can
# check that the referred-to-keys are not accessed automatically.
- result =index._lookup_keys_via_location(
- [(index._size // 2, ('40', ))])
+ result = index._lookup_keys([('40', )])
# check the parse map - only the start and middle should have been
# parsed.
@@ -636,9 +628,9 @@
del index._transport._activity[:]
# now looking up a key in the portion of the file already parsed should
# only perform IO to resolve its key references.
- result = index._lookup_keys_via_location([(4000, make_key(40))])
+ result = index._lookup_keys([make_key(40)])
self.assertEqual(
- [((0, make_key(40)),
+ [(make_key(40),
(index, make_key(40), make_value(40), ((make_key(60),),)))],
result)
self.assertEqual([('readv', 'index', [(11976, 800)], True, 15813)],
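
As the updated tests show, callers now pass plain key tuples rather than (byte offset, key) pairs. A minimal usage sketch, assuming a make_index helper like the one in the test suite:

    # Sketch only: the simplified calling convention.
    index = make_index(nodes=[(('name', ), 'data', ())])
    result = index._lookup_keys([('name', ), ('missing', )])
    # result holds (key, status) pairs: False means absent, True means the
    # key is still pending further reads, anything else is the entry itself.
    for key, status in result:
        print(key, status)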