Rev 3221: Reduce round trips on index reads by resolving references in parallel with finding keys that have not yet been found. in http://people.ubuntu.com/~robertc/baz2.0/index.range_map
Robert Collins
robertc at robertcollins.net
Fri Feb 8 01:56:57 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/index.range_map
------------------------------------------------------------
revno: 3221
revision-id:robertc at robertcollins.net-20080208015653-igb6l64cr7mewh8v
parent: robertc at robertcollins.net-20080208004454-02go73ur2lwtnwqi
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.range_map
timestamp: Fri 2008-02-08 12:56:53 +1100
message:
Reduce round trips on index reads by resolving references in parallel with finding keys that have not yet been found.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2008-02-08 00:44:54 +0000
+++ b/bzrlib/index.py 2008-02-08 01:56:53 +0000
@@ -473,8 +473,9 @@
"""
size = self._size
search_keys = keys
+ state = None
while search_keys:
- search_results = self._lookup_keys(search_keys)
+ search_results, state = self._lookup_keys(search_keys, state)
search_keys = []
for key, status in search_results:
if status == 1:
@@ -571,7 +572,7 @@
self._read_and_parse([_HEADER_READV])
return self._key_count
- def _lookup_keys(self, keys):
+ def _lookup_keys(self, keys, state=None):
"""Public interface for implementing partial disk reading.
If _buffer_all has been called, then all the data for the index is in
@@ -587,21 +588,32 @@
the next batch.
:param keys: A list of keys.
+ :param state: The current lookup state - opaque to this method, use
+ None for the first call in a series of lookups.
:return: A list of (key, status) tuples where status is False for
not-present, True for still searching, or otherwise the result.
"""
- # Possible improvements:
- # - only bisect lookup each key once
- # - sort the keys first, and use that to reduce the bisection window
- # -----
# this progresses in three parts:
# read data
# parse it
# attempt to answer the question from the now in memory data.
+ # ---
# build the readv request
# for each location, ask for 800 bytes - much more than rows we've seen
# anywhere.
readv_ranges = []
+ if state is not None:
+ # Currently state is just a set of extra readv's we need to do to
+ # answer key references.
+ for location in state:
+ length = 800
+ if location + length > self._size:
+ length = self._size - location
+ # TODO: trim out parsed locations (e.g. if the 800 is into the
+ # parsed region trim it, and dont use the adjust_for_latency
+ # facility)
+ assert length > 0
+ readv_ranges.append((location, length))
range_map = self._parsed_map
# keys we know we can answer in the negative
missing_keys = set()
@@ -642,7 +654,6 @@
# - result present references so we can return them.
result = []
# keys that we cannot answer until we resolve references
- pending_references = []
pending_locations = set()
for key in keys:
# can we answer from cache?
@@ -660,7 +671,8 @@
wanted_locations.append(ref)
if wanted_locations:
pending_locations.update(wanted_locations)
- pending_references.append((0, key))
+ # Cannot answer this key yet
+ result.append((key, True))
continue
result.append((key, (self, key,
value, self._resolve_references(refs))))
@@ -681,25 +693,7 @@
continue
# The key is neither absent nor parsed. Keep it in flight.
result.append((key, True))
- readv_ranges = []
- # lookup data to resolve references
- for location in pending_locations:
- length = 800
- if location + length > self._size:
- length = self._size - location
- # TODO: trim out parsed locations (e.g. if the 800 is into the
- # parsed region trim it, and dont use the adjust_for_latency
- # facility)
- assert length > 0
- readv_ranges.append((location, length))
- self._read_and_parse(readv_ranges)
- for location, key in pending_references:
- # answer key references we had to look-up-late.
- index = self._parsed_key_index(key)
- value, refs = self._bisect_nodes[key]
- result.append((key, (self, key,
- value, self._resolve_references(refs))))
- return result
+ return result, pending_locations
def _parse_header_from_bytes(self, bytes):
"""Parse the header from a region of bytes.
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2008-02-08 00:44:54 +0000
+++ b/bzrlib/tests/test_index.py 2008-02-08 01:56:53 +0000
@@ -378,8 +378,7 @@
del index._transport._activity[:]
# do a _lookup_keys call for the middle of the file, which
# is what bisection uses.
- result = index._lookup_keys(
- [('missing', )])
+ result, state = index._lookup_keys([('missing', )], None)
# this should have asked for a readv request, with adjust_for_latency,
# and two regions: the header, and half-way into the file.
self.assertEqual([
@@ -403,7 +402,7 @@
index = self.make_index(nodes=[(('name', ), 'data', ())])
# reset the transport log
del index._transport._activity[:]
- result = index._lookup_keys([('missing', )])
+ result, state = index._lookup_keys([('missing', )], None)
# this should have asked for a readv request, with adjust_for_latency,
# and two regions: the header, and half-way into the file.
self.assertEqual([
@@ -427,7 +426,7 @@
for counter in range(64):
nodes.append((make_key(counter), 'Y'*100, ()))
index = self.make_index(nodes=nodes)
- result = index._lookup_keys([('40', )])
+ result, state = index._lookup_keys([('40', )], None)
# and the result should be that the key cannot be present, because key is
# in the middle of the observed data from a 4K read - the smallest transport
# will do today with this api.
@@ -463,8 +462,7 @@
for counter in range(62):
nodes.append((make_key(counter), make_value(counter), ()))
index = self.make_index(nodes=nodes)
- result = index._lookup_keys(
- [('40', )])
+ result, state = index._lookup_keys([('40', )], None)
# and we should have a parse map that includes the header and the
# region that was parsed after trimming.
expected_map = SortedKeyRangeMap()
@@ -474,8 +472,7 @@
(12823, KeyEndSentinel()), (12824, KeyEndSentinel()))
self.assertEqual(expected_map, index._parsed_map)
# now ask for two keys, which are at the next bisection steps before and after the parsed region
- result = index._lookup_keys(
- [make_key(28), make_key(48)])
+ result, state = index._lookup_keys([make_key(28), make_key(48)], None)
self.assertEqual([
(make_key(28), (index, make_key(28), make_value(28))),
(make_key(48), (index, make_key(48), make_value(48))),
@@ -498,7 +495,7 @@
nodes.append((make_key(counter), 'Y'*100, ()))
index = self.make_index(nodes=nodes)
# lookup the keys in the middle of the file
- result = index._lookup_keys([('40', )])
+ result, state = index._lookup_keys([('40', )], None)
# check the parse map, this determines the test validity
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (3972, make_key(26)))
@@ -513,7 +510,7 @@
# be in the index) - even when the byte location we ask for is outside
# the parsed region
#
- result = index._lookup_keys([('40', )])
+ result, state = index._lookup_keys([('40', )], None)
self.assertEqual([(('40', ), False)], result)
self.assertEqual([], index._transport._activity)
@@ -529,8 +526,7 @@
nodes.append((make_key(counter), make_value(counter), ()))
index = self.make_index(nodes=nodes)
# lookup the keys in the middle of the file
- result =index._lookup_keys(
- [(index._size // 2, ('40', ))])
+ result, state = index._lookup_keys([('40', )], None)
# check the parse map, this determines the test validity
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (4008, make_key(26)))
@@ -545,7 +541,7 @@
# be in the index) - even when the byte location we ask for is outside
# the parsed region
#
- result = index._lookup_keys([make_key(40)])
+ result, state = index._lookup_keys([make_key(40)], None)
self.assertEqual(
[(make_key(40), (index, make_key(40), make_value(40)))],
result)
@@ -564,7 +560,7 @@
# asking for the middle) we want to be told that the key is below the
# point we asked for (because its in the unparsed region after the
# header and before the middle.)
- result =index._lookup_keys([('27', )])
+ result, state = index._lookup_keys([('27', )], None)
# We should have parsed the header and the middle of the file.
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (3972, make_key(26)))
@@ -586,7 +582,7 @@
# ask for the key near the end via the bisection interface (but
# asking for the middle) - we want to be told that the key is after the
# point we asked for (because its in the unparsed region at the end.)
- result =index._lookup_keys([('50', )])
+ result, state = index._lookup_keys([('50', )], None)
# We should have parsed the header and the middle of the file.
expected_map = SortedKeyRangeMap()
expected_map.add_range((0, None), (3972, make_key(26)))
@@ -610,8 +606,7 @@
index = self.make_index(ref_lists=1, nodes=nodes)
# lookup a key in the middle that does not exist, so that when we can
# check that the referred-to-keys are not accessed automatically.
- result =index._lookup_keys(
- [(index._size // 2, ('40', ))])
+ result, state = index._lookup_keys([('40', )], None)
# check the parse map - only the start and middle should have been
# parsed.
expected_map = SortedKeyRangeMap()
@@ -627,8 +622,13 @@
# reset the transport log for testing the reference lookup
del index._transport._activity[:]
# now looking up a key in the portion of the file already parsed should
- # only perform IO to resolve its key references.
- result = index._lookup_keys([make_key(40)])
+ # only perform IO to resolve its key references - and should do that on
+ # the second pass to eliminate unneeded readv calls.
+ result, state = index._lookup_keys([make_key(40)], None)
+ # state is opaque.
+ self.assertEqual([(make_key(40), True)], result)
+ # Try again, this time it should resolve:
+ result, state = index._lookup_keys([make_key(40)], state)
self.assertEqual(
[(make_key(40),
(index, make_key(40), make_value(40), ((make_key(60),),)))],
More information about the bazaar-commits
mailing list