Rev 3053: Merge http readv issuing multiple requests since that fixes bug #172701 in file:///v/home/vila/src/bzr/bugs/172701/
Vincent Ladeuil
v.ladeuil+lp at free.fr
Thu Nov 29 21:59:49 GMT 2007
At file:///v/home/vila/src/bzr/bugs/172701/
------------------------------------------------------------
revno: 3053
revision-id:v.ladeuil+lp at free.fr-20071129215945-qwzjfzz6kygljh4r
parent: pqm at pqm.ubuntu.com-20071129184101-u9506rihe4zbzyyz
parent: v.ladeuil+lp at free.fr-20071129154333-ddra1mbf487rlofw
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 172701
timestamp: Thu 2007-11-29 22:59:45 +0100
message:
Merge http readv issuing multiple requests since that fixes bug #172701
modified:
bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
------------------------------------------------------------
revno: 3024.2.3
revision-id:v.ladeuil+lp at free.fr-20071129154333-ddra1mbf487rlofw
parent: v.ladeuil+lp at free.fr-20071127093537-gruuxzmso2r6c5pg
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 165061
timestamp: Thu 2007-11-29 16:43:33 +0100
message:
Rewrite http_readv to allow several GET requests. Smoke tested against the branch reported in the bug.
* bzrlib/transport/http/__init__.py:
(HttpTransportBase._readv): Issue several GET requests if too many
ranges are requested.
modified:
bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
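For readers skimming the patch below: the core of the fix is that, when the
server caps the number of ranges it accepts in one request, the coalesced
offsets are split into batches and one GET is issued per batch. A minimal
standalone sketch of that idea (iter_range_batches, fetch_all and http_get
are illustrative names, not bzrlib API):

    def iter_range_batches(coalesced, max_ranges):
        """Yield slices of at most max_ranges coalesced offsets."""
        for start in xrange(0, len(coalesced), max_ranges):
            yield coalesced[start:start + max_ranges]

    def fetch_all(http_get, relpath, coalesced, max_ranges=200):
        """Issue one GET per batch instead of one oversized request."""
        for batch in iter_range_batches(coalesced, max_ranges):
            # 200 ranges per request stays well below Apache's
            # default ~400-range limit
            yield http_get(relpath, batch)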
=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py 2007-11-27 08:26:00 +0000
+++ b/bzrlib/transport/http/__init__.py 2007-11-29 15:43:33 +0000
@@ -208,9 +208,9 @@
self._range_hint = None
mutter('Retry "%s" without ranges' % relpath)
else:
- # We tried all the tricks, but nothing worked. We re-raise original
- # exception; the 'mutter' calls above will indicate that further
- # tries were unsuccessful
+ # We tried all the tricks, but nothing worked. We re-raise the
+ # original exception; the 'mutter' calls above will indicate that
+ # further tries were unsuccessful
raise exc_info[0], exc_info[1], exc_info[2]
def _get_ranges_hinted(self, relpath, ranges):
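Aside: the three-argument raise kept by the hunk above is the Python 2 idiom
for re-raising an exception without losing the original traceback, which
matters here because _degrade_range_hint receives exc_info from its caller's
except block rather than raising in place. A hedged sketch (risky_read and
can_degrade are hypothetical helpers):

    import sys

    def read_with_fallback(relpath):
        try:
            return risky_read(relpath)
        except IOError:
            exc_info = sys.exc_info()
            if can_degrade(relpath):
                return risky_read(relpath)
            # Re-raise the original exception with its traceback intact;
            # 'raise exc_info[1]' alone would start a fresh traceback.
            raise exc_info[0], exc_info[1], exc_info[2]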
@@ -251,6 +251,9 @@
# No limit on the offset number that get combined into one, we are trying
# to avoid downloading the whole file.
_max_readv_combine = 0
+ # By default Apache has a limit of ~400 ranges before replying with a 400
+ # Bad Request. So we stay well below that limit to be safe.
+ _max_get_ranges = 200
def _readv(self, relpath, offsets):
"""Get parts of the file at the given relative path.
@@ -258,41 +261,80 @@
:param offsets: A list of (offset, size) tuples.
:param return: A list or generator of (offset, data) tuples
"""
- sorted_offsets = sorted(list(offsets))
- fudge = self._bytes_to_read_before_seek
- coalesced = self._coalesce_offsets(sorted_offsets,
- limit=self._max_readv_combine,
- fudge_factor=fudge)
- coalesced = list(coalesced)
- mutter('http readv of %s offsets => %s collapsed %s',
- relpath, len(offsets), len(coalesced))
-
- f = self._get_ranges_hinted(relpath, coalesced)
- for start, size in offsets:
- try_again = True
- while try_again:
- try_again = False
- f.seek(start, ((start < 0) and 2) or 0)
- start = f.tell()
- try:
- data = f.read(size)
- if len(data) != size:
- raise errors.ShortReadvError(relpath, start, size,
- actual=len(data))
- except errors.ShortReadvError, e:
- self._degrade_range_hint(relpath, coalesced, sys.exc_info())
-
- # Since the offsets and the ranges may not be in the same
- # order, we don't try to calculate a restricted single
- # range encompassing unprocessed offsets.
-
- # Note: we replace 'f' here, it may need cleaning one day
- # before being thrown that way.
- f = self._get_ranges_hinted(relpath, coalesced)
- try_again = True
-
- # After one or more tries, we get the data.
- yield start, data
+
+ # offsets may be a generator; we will iterate it several times, so
+ # build a list
+ offsets = list(offsets)
+
+ try_again = True
+ while try_again:
+ try_again = False
+
+ # Coalesce the offsets to minimize the GET requests issued
+ sorted_offsets = sorted(offsets)
+ coalesced = self._coalesce_offsets(
+ sorted_offsets, limit=self._max_readv_combine,
+ fudge_factor=self._bytes_to_read_before_seek)
+
+ # Turn it into a list; we will iterate it several times
+ coalesced = list(coalesced)
+ mutter('http readv of %s offsets => %s collapsed %s',
+ relpath, len(offsets), len(coalesced))
+
+ # Cache the data read, but only until it's been used
+ data_map = {}
+ # We will iterate on the data received from the GET requests and
+ # serve the corresponding offsets respecting the initial order. We
+ # need an offset iterator for that.
+ iter_offsets = iter(offsets)
+ cur_offset_and_size = iter_offsets.next()
+
+ try:
+ for cur_coal, file in self._coalesce_readv(relpath, coalesced):
+ # Split the received chunk
+ for offset, size in cur_coal.ranges:
+ start = cur_coal.start + offset
+ key = (start, size)
+ file.seek(start, 0)
+ data = file.read(size)
+ data_len = len(data)
+ if data_len != size:
+ raise errors.ShortReadvError(relpath, start, size,
+ actual=data_len)
+ data_map[key] = data
+
+ # Yield everything we can
+ while cur_offset_and_size in data_map:
+ # Drop the cached data once it has been served
+ # XXX: will break if offsets contains duplicates --
+ # vila20071129
+ this_data = data_map.pop(cur_offset_and_size)
+ yield cur_offset_and_size[0], this_data
+ cur_offset_and_size = iter_offsets.next()
+
+ except (errors.ShortReadvError, errors.InvalidRange), e:
+ self._degrade_range_hint(relpath, coalesced, sys.exc_info())
+ # Some offsets may have already been processed, so we retry
+ # only the unsuccessful ones.
+ offsets = [cur_offset_and_size] + list(iter_offsets)
+ try_again = True
+
+ def _coalesce_readv(self, relpath, coalesced):
+ """Issue several GET requests to satisfy the coalesced offsets"""
+ total = len(coalesced)
+ if self._range_hint == 'multi':
+ max_ranges = self._max_get_ranges
+ elif self._range_hint == 'single':
+ max_ranges = total
+ else:
+ # The whole file will be downloaded anyway
+ max_ranges = total
+ for group in xrange(0, total, max_ranges):
+ ranges = coalesced[group:group + max_ranges]
+ # Note that the following may raise errors.InvalidRange. It's the
+ # caller's responsibility to decide how to retry since it may
+ # provide different coalesced offsets.
+ file = self._get(relpath, ranges)
+ for range in ranges:
+ yield range, file
def recommended_page_size(self):
"""See Transport.recommended_page_size().
@@ -447,7 +489,7 @@
"""Prepare a HTTP Range header at a level the server should accept"""
if self._range_hint == 'multi':
- # Nothing to do here
+ # Generate the header describing all offsets
return self._range_header(offsets, tail_amount)
elif self._range_hint == 'single':
# Combine all the requested ranges into a single
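For reference, the 'multi' branch above emits one Range header listing every
coalesced range, e.g. 'Range: bytes=0-99,200-299'. A hedged sketch of such a
header value builder (bzrlib's _range_header works on coalesced offset
objects plus a tail_amount; this one takes plain (start, length) pairs):

    def range_header_value(ranges, tail_amount=0):
        """Build an HTTP/1.1 Range value from (start, length) pairs."""
        strings = ['%d-%d' % (start, start + length - 1)
                   for start, length in ranges]
        if tail_amount:
            # 'bytes=-N' asks for the last N bytes of the file
            strings.append('-%d' % tail_amount)
        return 'bytes=' + ','.join(strings)

    # range_header_value([(0, 100), (200, 100)]) == 'bytes=0-99,200-299'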
=== modified file 'bzrlib/transport/http/_urllib2_wrappers.py'
--- a/bzrlib/transport/http/_urllib2_wrappers.py 2007-11-05 13:21:30 +0000
+++ b/bzrlib/transport/http/_urllib2_wrappers.py 2007-11-29 15:43:33 +0000
@@ -101,7 +101,8 @@
# having issued the response headers (even if the
# headers indicate a Content-Type...)
body = self.fp.read(self.length)
- if self.debuglevel > 0:
+ if self.debuglevel > 3:
+ # This one can be huge and is generally not interesting
print "Consumed body: [%s]" % body
self.close()
elif self.status == 200:
@@ -1285,7 +1286,7 @@
)
self.open = self._opener.open
- if DEBUG >= 2:
+ if DEBUG >= 3:
# When dealing with handler order, it's easy to mess
# things up, the following will help understand which
# handler is used, when and for what.
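Both _urllib2_wrappers changes follow the same pattern: reserve the noisiest
output for the highest verbosity levels. A trivial illustration (DEBUG here
is a stand-in for the module-level flag, not the real wiring):

    DEBUG = 2  # hypothetical verbosity knob

    def report(level, message):
        # Levels 1-2: request lines and handler setup; 3 and above:
        # full response bodies, which can be huge and rarely useful
        if DEBUG >= level:
            print message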