Rev 2809: First round of index fixes. in http://people.ubuntu.com/~robertc/baz2.0/repository
Robert Collins
robertc at robertcollins.net
Thu Oct 11 03:31:19 BST 2007
At http://people.ubuntu.com/~robertc/baz2.0/repository
------------------------------------------------------------
revno: 2809
revision-id: robertc at robertcollins.net-20071011023106-1khgb5vi0yl4ti67
parent: robertc at robertcollins.net-20071010083158-zqmai921u4wiv28q
parent: robertc at robertcollins.net-20071011022646-fxw9pt0ohs7662sf
committer: Robert Collins <robertc at robertcollins.net>
branch nick: repository
timestamp: Thu 2007-10-11 12:31:06 +1000
message:
First round of index fixes.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
bzrlib/transport/__init__.py transport.py-20050711165921-4978aa7ce1285ad5
------------------------------------------------------------
revno: 2592.1.25.2.7.1.28.1.6.1.3.1.9.2.1.3.74.1.31.3.18.1.9.1.2.1.12.1.8.1.46.1.18.1.15
revision-id: robertc at robertcollins.net-20071011022646-fxw9pt0ohs7662sf
parent: robertc at robertcollins.net-20071011021626-p917pq7ytv8o7woz
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Thu 2007-10-11 12:26:46 +1000
message:
Parse more than one segment of data from a single readv response if needed.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
------------------------------------------------------------
revno: 2592.1.25.2.7.1.28.1.6.1.3.1.9.2.1.3.74.1.31.3.18.1.9.1.2.1.12.1.8.1.46.1.18.1.14
revision-id: robertc at robertcollins.net-20071011021626-p917pq7ytv8o7woz
parent: robertc at robertcollins.net-20071008045131-n60qsvujlkg00oyy
parent: robertc at robertcollins.net-20071009015950-oiq91zspjpoeiz6t
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Thu 2007-10-11 12:16:26 +1000
message:
Merge readv fixes.
modified:
bzrlib/tests/test_transport_implementations.py test_transport_implementations.py-20051227111451-f97c5c7d5c49fce7
bzrlib/transport/__init__.py transport.py-20050711165921-4978aa7ce1285ad5
------------------------------------------------------------
revno: 2592.1.25.2.7.1.28.1.6.1.3.1.9.2.1.3.74.1.31.3.18.1.9.1.2.1.12.1.8.1.46.1.18.1.1.1.3
revision-id: robertc at robertcollins.net-20071009015950-oiq91zspjpoeiz6t
parent: robertc at robertcollins.net-20071008044749-07yl1rtr3v9iw62o
committer: Robert Collins <robertc at robertcollins.net>
branch nick: readv
timestamp: Tue 2007-10-09 11:59:50 +1000
message:
Review feedback and discussion with Martin - split out the readv offset adjustment into a new helper and document where the design might/should go next.
modified:
bzrlib/transport/__init__.py transport.py-20050711165921-4978aa7ce1285ad5
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2007-10-10 05:48:02 +0000
+++ b/bzrlib/index.py 2007-10-11 02:31:06 +0000
@@ -736,12 +736,27 @@
# trim the data.
# end first:
end = offset + len(data)
- index = self._parsed_byte_index(offset)
+ while True:
+ index = self._parsed_byte_index(offset)
+ # Trivial test - if the current index's end is within the
+ # low-matching parsed range, we're done.
+ if end < self._parsed_byte_map[index][1]:
+ return
+ if self._parse_segment(offset, data, end, index):
+ return
+
+ def _parse_segment(self, offset, data, end, index):
+ """Parse one segment of data.
+
+ :param offset: Where 'data' begins in the file.
+ :param data: Some data to parse a segment of.
:param end: Where the data ends.
+ :param index: The current index into the parsed bytes map.
+ :return: True if the parsed segment is the last possible one in the
+ range of data.
+ """
# default is to use all data
trim_end = None
- # trivial check for entirely parsed data:
- if end < self._parsed_byte_map[index][1]:
- return
# accommodate overlap with data before this.
if offset < self._parsed_byte_map[index][1]:
# overlaps the lower parsed region
@@ -767,30 +782,35 @@
trim_end = None
# do not strip to the last \n
end_adjacent = True
+ last_segment = True
elif index + 1 == len(self._parsed_byte_map):
# at the end of the parsed data
# use it all
trim_end = None
# but strip to the last \n
end_adjacent = False
+ last_segment = True
elif end == self._parsed_byte_map[index + 1][0]:
# butts up against the next parsed region
# use it all
trim_end = None
# do not strip to the last \n
end_adjacent = True
+ last_segment = True
elif end > self._parsed_byte_map[index + 1][0]:
# overlaps into the next parsed region
# only consider the unparsed data
trim_end = self._parsed_byte_map[index + 1][0] - offset
# do not strip to the last \n as we know it's an entire record
end_adjacent = True
+ last_segment = end < self._parsed_byte_map[index + 1][1]
else:
# does not overlap into the next region
# use it all
trim_end = None
# but strip to the last \n
end_adjacent = False
+ last_segment = True
# now find bytes to discard if needed
if not start_adjacent:
# work around python bug in rfind
@@ -850,6 +870,7 @@
self._bisect_nodes[key] = node_value
# print "parsed ", key
self._parsed_bytes(offset, first_key, offset + len(trimmed_data), key)
+ return last_segment
def _parsed_bytes(self, start, start_key, end, end_key):
"""Mark the bytes from start to end as parsed.
@@ -915,8 +936,8 @@
# this must be the start
assert offset == 0
offset, data = self._parse_header_from_bytes(data)
+ # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
self._parse_region(offset, data)
- # print offset, len(data), data
def _signature(self):
"""The file signature for this index type."""
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2007-10-07 23:37:29 +0000
+++ b/bzrlib/tests/test_index.py 2007-10-11 02:26:46 +0000
@@ -441,6 +441,44 @@
self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
index._parsed_key_map)
+ def test_parsing_data_handles_parsed_contained_regions(self):
+ # the following pattern creates a parsed region that is wholly within a
+ # single result from the readv layer:
+ # .... single-read (readv-minimum-size) ...
+ # which then trims the start and end so the parsed size is < readv
+ # minimum.
+ # then a dual lookup (or a reference lookup for that matter) which
+ # abuts or overlaps the parsed region on both sides will need to
+ # discard the data in the middle, but parse the end as well.
+ #
+ # we test this by doing a single lookup to seed the data, then
+ # a lookup for two keys that are present and adjacent -
+ # we expect both to be found, and the parsed byte map to include the
+ # locations of both keys.
+ nodes = []
+ def make_key(number):
+ return (str(number) + 'X'*100,)
+ def make_value(number):
+ return 'Y'*100
+ for counter in range(64):
+ nodes.append((make_key(counter), make_value(counter), ()))
+ index = self.make_index(nodes=nodes)
+ result = index.lookup_keys_via_location(
+ [(index._size // 2, ('40', ))])
+ # and we should have a parse map that includes the header and the
+ # region that was parsed after trimming.
+ self.assertEqual([(0, 3972), (5001, 8914)], index._parsed_byte_map)
+ self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
+ index._parsed_key_map)
+ # now ask for two keys, right before and after the parsed region
+ result = index.lookup_keys_via_location(
+ [(4900, make_key(30)), (8914, make_key(49))])
+ self.assertEqual([
+ ((4900, make_key(30)), (index, make_key(30), make_value(30))),
+ ((8914, make_key(49)), (index, make_key(49), make_value(49))),
+ ],
+ result)
+
def test_lookup_missing_key_answers_without_io_when_map_permits(self):
# generate a big enough index that we only read some of it on a typical
# bisection lookup.
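
To make the scenario the new test exercises concrete, a rough standalone
model follows. The numbers mirror the test's assertions; treating each map
entry as a half-open [start, end) byte range is an assumption made here for
illustration.

# After the seeding lookup at index._size // 2, the test asserts two
# disjoint parsed ranges: the header region and a trimmed middle region.
parsed_byte_map = [(0, 3972), (5001, 8914)]

def is_parsed(offset):
    # Assume each entry is a half-open [start, end) byte range.
    return any(start <= offset < end for start, end in parsed_byte_map)

# The follow-up lookup asks at offsets 4900 and 8914 - just before and
# just after the middle region. Each request abuts the parsed region,
# so a single readv response can span parsed and unparsed bytes, which
# is exactly the case the new multi-segment parse loop handles.
assert not is_parsed(4900)
assert not is_parsed(8914)
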
=== modified file 'bzrlib/transport/__init__.py'
--- a/bzrlib/transport/__init__.py 2007-10-08 04:53:50 +0000
+++ b/bzrlib/transport/__init__.py 2007-10-11 02:31:06 +0000
@@ -662,54 +662,16 @@
:return: A list or generator of (offset, data) tuples
"""
if adjust_for_latency:
- offsets = sorted(offsets)
- # short circuit empty requests
- if len(offsets) == 0:
- def empty_yielder():
- # Quick thunk to stop this function becoming a generator
- # itself; rather, we return a generator that has nothing to
- # yield.
- if False:
- yield None
- return empty_yielder()
- # expand by page size at either end
- maximum_expansion = self.recommended_page_size()
- new_offsets = []
- for offset, length in offsets:
- expansion = maximum_expansion - length
- if expansion < 0:
- # we're asking for more than the minimum read anyway.
- expansion = 0
- reduction = expansion / 2
- new_offset = offset - reduction
- new_length = length + expansion
- if new_offset < 0:
- # don't ask for anything < 0
- new_offset = 0
- if (upper_limit is not None and
- new_offset + new_length > upper_limit):
- new_length = upper_limit - new_offset
- new_offsets.append((new_offset, new_length))
- # combine the expanded offsets
- offsets = []
- current_offset, current_length = new_offsets[0]
- current_finish = current_length + current_offset
- for offset, length in new_offsets[1:]:
- finish = offset + length
- if offset > current_finish:
- # there is a gap, output the current accumulator and start
- # a new one for the region we're examining.
- offsets.append((current_offset, current_length))
- current_offset = offset
- current_length = length
- current_finish = finish
- continue
- if finish > current_finish:
- # extend the current accumulator to the end of the region
- # we're examining.
- current_finish = finish
- current_length = finish - current_offset
- offsets.append((current_offset, current_length))
+ # Design note: We may wish to have different algorithms for the
+ # expansion of the offsets per-transport. E.g. for local disk to
+ # use page-aligned expansion. If that is the case consider the
+ # following structure:
+ # - a test that transport.readv uses self._offset_expander or some
+ #   similar attribute, to do the expansion
+ # - a test for each transport that it has some known-good offset
+ #   expander
+ # - unit tests for each offset expander
+ # - a set of tests for the offset expander interface, giving
+ #   baseline behaviour (which the current transport
+ #   adjust_for_latency tests could be repurposed to).
+ offsets = self._sort_expand_and_combine(offsets, upper_limit)
return self._readv(relpath, offsets)
def _readv(self, relpath, offsets):
@@ -766,6 +728,65 @@
yield cur_offset_and_size[0], this_data
cur_offset_and_size = offset_stack.next()
+ def _sort_expand_and_combine(self, offsets, upper_limit):
+ """Helper for readv.
+
+ :param offsets: A readv vector - (offset, length) tuples.
+ :param upper_limit: The highest byte offset that may be requested.
+ :return: A readv vector that will read all the regions requested by
+ offsets, in start-to-end order, with no duplicated regions,
+ expanded by the transport's recommended page size.
+ """
+ offsets = sorted(offsets)
+ # short circuit empty requests
+ if len(offsets) == 0:
+ def empty_yielder():
+ # Quick thunk to stop this function becoming a generator
+ # itself; rather, we return a generator that has nothing to
+ # yield.
+ if False:
+ yield None
+ return empty_yielder()
+ # expand by page size at either end
+ maximum_expansion = self.recommended_page_size()
+ new_offsets = []
+ for offset, length in offsets:
+ expansion = maximum_expansion - length
+ if expansion < 0:
+ # we're asking for more than the minimum read anyway.
+ expansion = 0
+ reduction = expansion / 2
+ new_offset = offset - reduction
+ new_length = length + expansion
+ if new_offset < 0:
+ # don't ask for anything < 0
+ new_offset = 0
+ if (upper_limit is not None and
+ new_offset + new_length > upper_limit):
+ new_length = upper_limit - new_offset
+ new_offsets.append((new_offset, new_length))
+ # combine the expanded offsets
+ offsets = []
+ current_offset, current_length = new_offsets[0]
+ current_finish = current_length + current_offset
+ for offset, length in new_offsets[1:]:
+ finish = offset + length
+ if offset > current_finish:
+ # there is a gap, output the current accumulator and start
+ # a new one for the region we're examining.
+ offsets.append((current_offset, current_length))
+ current_offset = offset
+ current_length = length
+ current_finish = finish
+ continue
+ if finish > current_finish:
+ # extend the current accumulator to the end of the region
+ # we're examining.
+ current_finish = finish
+ current_length = finish - current_offset
+ offsets.append((current_offset, current_length))
+ return offsets
+
@staticmethod
def _coalesce_offsets(offsets, limit, fudge_factor):
"""Yield coalesced offsets.