Rev 2522: Fix #115209 by issuing a single range request on 400: Bad Request in file:///v/home/vila/src/bugs/115209/
Vincent Ladeuil
v.ladeuil+lp at free.fr
Wed Jun 20 14:56:23 BST 2007
At file:///v/home/vila/src/bugs/115209/
------------------------------------------------------------
revno: 2522
revision-id: v.ladeuil+lp at free.fr-20070620135621-x43c0hnmzu0iuo6m
parent: v.ladeuil+lp at free.fr-20070612144857-1swhz5c3qo8n76l8
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 115209
timestamp: Wed 2007-06-20 15:56:21 +0200
message:
Fix #115209 by issuing a single range request on 400: Bad Request
* bzrlib/transport/http/response.py:
(handle_response): Consider 400 as an indication that too many
ranges were specified.
* bzrlib/transport/http/_urllib2_wrappers.py:
(Request): Add an 'accepted_errors' parameter describing what
error codes the caller will handle.
(HTTPErrorProcessor): Mention that Request-specific accepted error
codes take precedence.
(HTTPDefaultErrorHandler.http_error_default): Remove dead code.
* bzrlib/transport/http/_urllib.py:
(HttpTransport_urllib._get): Add 400 as an accepted error iff
ranges are specified.
(HttpTransport_urllib._head): Restrict accepted errors.
* bzrlib/transport/http/__init__.py:
(HttpTransportBase._degrade_range_hint,
HttpTransportBase._get_ranges_hinted): Replace _retry_get.
(HttpTransportBase.readv): Simplified and avoid the spurious _get()
issued when _get was successful.
* bzrlib/tests/test_http.py:
(TestLimitedRangeRequestServer,
TestLimitedRangeRequestServer_urllib,
TestLimitedRangeRequestServer_pycurl): Bug #115209 specific tests.
* bzrlib/tests/HTTPTestUtil.py:
(LimitedRangeRequestHandler, LimitedRangeHTTPServer): New test
classes to emulate apache throwing 400: Bad Request when too many
ranges are specified.
(AuthRequestHandler.do_GET): Remove dead code. Yeah, I know,
not related to the bug :-/
modified:
bzrlib/tests/HTTPTestUtil.py HTTPTestUtil.py-20050914180604-247d3aafb7a43343
bzrlib/tests/test_http.py testhttp.py-20051018020158-b2eef6e867c514d9
bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
bzrlib/transport/http/_urllib.py _urlgrabber.py-20060113083826-0bbf7d992fbf090c
bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
-------------- next part --------------
=== modified file 'bzrlib/tests/HTTPTestUtil.py'
--- a/bzrlib/tests/HTTPTestUtil.py 2007-05-06 06:59:57 +0000
+++ b/bzrlib/tests/HTTPTestUtil.py 2007-06-20 13:56:21 +0000
@@ -150,6 +150,37 @@
self.wfile.write(out_buffer.getvalue())
+class LimitedRangeRequestHandler(TestingHTTPRequestHandler):
+ """Errors out when range specifiers exceed the limit"""
+
+ def get_multiple_ranges(self, file, file_size, ranges):
+ """Refuses the multiple ranges request"""
+ tcs = self.server.test_case_server
+ #import pdb; pdb.set_trace()
+ if tcs.range_limit is not None and len(ranges) > tcs.range_limit:
+ file.close()
+ # Emulate apache behavior
+ self.send_error(400, "Bad Request")
+ return
+ return TestingHTTPRequestHandler.get_multiple_ranges(self, file,
+ file_size, ranges)
+
+ def do_GET(self):
+ tcs = self.server.test_case_server
+ tcs.GET_request_nb += 1
+ return TestingHTTPRequestHandler.do_GET(self)
+
+
+class LimitedRangeHTTPServer(HttpServer):
+ """An HttpServer erroring out on requests with too many range specifiers"""
+
+ def __init__(self, request_handler=LimitedRangeRequestHandler,
+ range_limit=None):
+ HttpServer.__init__(self, request_handler)
+ self.range_limit = range_limit
+ self.GET_request_nb = 0
+
+
class SingleRangeRequestHandler(TestingHTTPRequestHandler):
"""Always reply to range request as if they were single.
@@ -337,8 +368,6 @@
self.end_headers()
return
- TestingHTTPRequestHandler.do_GET(self)
-
class BasicAuthRequestHandler(AuthRequestHandler):
"""Implements the basic authentication of a request"""
=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py 2007-06-12 14:48:57 +0000
+++ b/bzrlib/tests/test_http.py 2007-06-20 13:56:21 +0000
@@ -53,6 +53,7 @@
HTTPDigestAuthServer,
HTTPServerRedirecting,
InvalidStatusRequestHandler,
+ LimitedRangeHTTPServer,
NoRangeRequestHandler,
ProxyBasicAuthServer,
ProxyDigestAuthServer,
@@ -714,6 +715,67 @@
"""Tests range requests refusing server for pycurl implementation"""
+class TestLimitedRangeRequestServer(object):
+ """Tests readv requests against server that errors out on too many ranges.
+
+ This MUST be used by daughter classes that also inherit from
+ TestCaseWithWebserver.
+
+ We can't inherit directly from TestCaseWithWebserver or the
+ test framework will try to create an instance which cannot
+ run, its implementation being incomplete.
+ """
+
+ range_limit = 3
+
+ def create_transport_readonly_server(self):
+ # Requests with more range specifiers will error out
+ return LimitedRangeHTTPServer(range_limit=self.range_limit)
+
+ def get_transport(self):
+ return self._transport(self.get_readonly_server().get_url())
+
+ def setUp(self):
+ TestCaseWithWebserver.setUp(self)
+ # We need to manipulate ranges that correspond to real chunks in the
+ # response, so we build a content appropriately.
+ filler = ''.join(['abcdefghij' for _ in range(102)])
+ content = ''.join(['%04d' % v + filler for v in range(16)])
+ self.build_tree_contents([('a', content)],)
+
+ def test_few_ranges(self):
+ t = self.get_transport()
+ l = list(t.readv('a', ((0, 4), (1024, 4), )))
+ self.assertEqual(l[0], (0, '0000'))
+ self.assertEqual(l[1], (1024, '0001'))
+ self.assertEqual(1, self.get_readonly_server().GET_request_nb)
+
+ def test_a_lot_of_ranges(self):
+ t = self.get_transport()
+ l = list(t.readv('a', ((0, 4), (1024, 4), (4096, 4), (8192, 4))))
+ self.assertEqual(l[0], (0, '0000'))
+ self.assertEqual(l[1], (1024, '0001'))
+ self.assertEqual(l[2], (4096, '0004'))
+ self.assertEqual(l[3], (8192, '0008'))
+ # The server will refuse to serve the first request (too many ranges),
+ # a second request will succeed.
+ self.assertEqual(2, self.get_readonly_server().GET_request_nb)
+
+
+class TestLimitedRangeRequestServer_urllib(TestLimitedRangeRequestServer,
+ TestCaseWithWebserver):
+ """Tests limited range requests server for urllib implementation"""
+
+ _transport = HttpTransport_urllib
+
+
+class TestLimitedRangeRequestServer_pycurl(TestWithTransport_pycurl,
+ TestLimitedRangeRequestServer,
+ TestCaseWithWebserver):
+ """Tests limited range requests server for pycurl implementation"""
+
+
+
class TestHttpProxyWhiteBox(TestCase):
"""Whitebox test proxy http authorization.
=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py 2007-06-12 14:48:57 +0000
+++ b/bzrlib/transport/http/__init__.py 2007-06-20 13:56:21 +0000
@@ -255,45 +255,56 @@
"""
return self
- def _retry_get(self, relpath, ranges, exc_info):
- """A GET request have failed, let's retry with a simpler request."""
-
- try_again = False
- # The server does not gives us enough data or
- # a bogus-looking result, let's try again with
- # a simpler request if possible.
+ def _degrade_range_hint(self, relpath, ranges, exc_info):
if self._range_hint == 'multi':
self._range_hint = 'single'
- mutter('Retry %s with single range request' % relpath)
- try_again = True
+ mutter('Retry "%s" with single range request' % relpath)
elif self._range_hint == 'single':
self._range_hint = None
- mutter('Retry %s without ranges' % relpath)
- try_again = True
- if try_again:
- # Note that since the offsets and the ranges may not
- # be in the same order, we don't try to calculate a
- # restricted single range encompassing unprocessed
- # offsets.
- code, f = self._get(relpath, ranges)
- return try_again, code, f
+ mutter('Retry "%s" without ranges' % relpath)
else:
- # We tried all the tricks, but nothing worked. We
- # re-raise original exception; the 'mutter' calls
- # above will indicate that further tries were
- # unsuccessful
+ # We tried all the tricks, but nothing worked. We re-raise original
+ # exception; the 'mutter' calls above will indicate that further
+ # tries were unsuccessful
raise exc_info[0], exc_info[1], exc_info[2]
- # Having to round trip to the server means waiting for a response, so it is
- # better to download extra bytes. We have two constraints to satisfy when
- # choosing the right value: a request with too much ranges will make some
- # servers issue a '400: Bad request' error (when we have too much ranges,
- # we exceed the apache header max size (8190 by default) for example) ;
- # coalescing too much ranges will increasing the data transferred and
- # degrade performances.
- _bytes_to_read_before_seek = 512
- # No limit on the offset number combined into one, we are trying to avoid
- # downloading the whole file.
+ def _get_ranges_hinted(self, relpath, ranges):
+ """Issue a ranged GET request taking server capabilities into account.
+
+ Depending on the errors returned by the server, we try several GET
+ requests, trying to minimize the data transferred.
+
+ :param relpath: Path relative to transport base URL
+ :param ranges: None to get the whole file;
+ or a list of _CoalescedOffset to fetch parts of a file.
+ :returns: A file handle containing at least the requested ranges.
+ """
+ exc_info = None
+ try_again = True
+ while try_again:
+ try_again = False
+ try:
+ code, f = self._get(relpath, ranges)
+ except errors.InvalidRange, e:
+ if exc_info is None:
+ exc_info = sys.exc_info()
+ self._degrade_range_hint(relpath, ranges, exc_info)
+ try_again = True
+ return f
+
+ # _coalesce_offsets is a helper for readv, it tries to combine ranges without
+ # degrading readv performance. _bytes_to_read_before_seek is the value
+ # used for the limit parameter and has been tuned for other transports. For
+ # HTTP, the name is inappropriate but the parameter is still useful and
+ # helps reduce the number of chunks in the response. The overhead for a
+ # chunk (headers, length, footer around the data itself is variable but
+ # around 50 bytes. We use 128 to reduce the range specifiers that appear in
+ # the header, some servers (notably Apache) enforce a maximum length for a
+ # header and issue a '400: Bad request' error when too many ranges are
+ # specified.
+ _bytes_to_read_before_seek = 128
+ # No limit on the offset number that gets combined into one, we are trying
+ # to avoid downloading the whole file.
_max_readv_combined = 0
def readv(self, relpath, offsets):
@@ -311,15 +322,7 @@
mutter('http readv of %s offsets => %s collapsed %s',
relpath, len(offsets), len(coalesced))
- try_again = True
- while try_again:
- try_again = False
- try:
- code, f = self._get(relpath, coalesced)
- except (errors.InvalidRange, errors.ShortReadvError), e:
- try_again, code, f = self._retry_get(relpath, coalesced,
- sys.exc_info())
-
+ f = self._get_ranges_hinted(relpath, coalesced)
for start, size in offsets:
try_again = True
while try_again:
@@ -331,12 +334,18 @@
if len(data) != size:
raise errors.ShortReadvError(relpath, start, size,
actual=len(data))
- except (errors.InvalidRange, errors.ShortReadvError), e:
- # Note that we replace 'f' here and that it
- # may need cleaning one day before being
- # thrown that way.
- try_again, code, f = self._retry_get(relpath, coalesced,
- sys.exc_info())
+ except errors.ShortReadvError, e:
+ self._degrade_range_hint(relpath, coalesced, sys.exc_info())
+
+ # Since the offsets and the ranges may not be in the same
+ # order, we don't try to calculate a restricted single
+ # range encompassing unprocessed offsets.
+
+ # Note: we replace 'f' here, it may need cleaning one day
+ # before being thrown that way.
+ f = self._get_ranges_hinted(relpath, coalesced)
+ try_again = True
+
# After one or more tries, we get the data.
yield start, data
=== modified file 'bzrlib/transport/http/_urllib.py'
--- a/bzrlib/transport/http/_urllib.py 2007-06-12 14:48:57 +0000
+++ b/bzrlib/transport/http/_urllib.py 2007-06-20 13:56:21 +0000
@@ -118,13 +118,17 @@
abspath = self._real_abspath(relpath)
headers = {}
+ accepted_errors = [200, 404]
if offsets or tail_amount:
range_header = self._attempted_range_header(offsets, tail_amount)
if range_header is not None:
+ accepted_errors.append(206)
+ accepted_errors.append(400)
bytes = 'bytes=' + range_header
headers = {'Range': bytes}
- request = Request('GET', abspath, None, headers)
+ request = Request('GET', abspath, None, headers,
+ accepted_errors=accepted_errors)
response = self._perform(request)
code = response.code
@@ -157,7 +161,8 @@
Performs the request and leaves callers handle the results.
"""
abspath = self._real_abspath(relpath)
- request = Request('HEAD', abspath)
+ request = Request('HEAD', abspath,
+ accepted_errors=[200, 404])
response = self._perform(request)
self._connection.fake_close()
@@ -172,7 +177,6 @@
if code == 200: # "ok",
return True
else:
- assert(code == 404, 'Only 200 or 404 are correct')
return False
=== modified file 'bzrlib/transport/http/_urllib2_wrappers.py'
--- a/bzrlib/transport/http/_urllib2_wrappers.py 2007-04-22 16:32:04 +0000
+++ b/bzrlib/transport/http/_urllib2_wrappers.py 2007-06-20 13:56:21 +0000
@@ -153,11 +153,13 @@
def __init__(self, method, url, data=None, headers={},
origin_req_host=None, unverifiable=False,
- connection=None, parent=None,):
+ connection=None, parent=None,
+ accepted_errors=None):
urllib2.Request.__init__(self, url, data, headers,
origin_req_host, unverifiable)
self.method = method
self.connection = connection
+ self.accepted_errors = accepted_errors
# To handle redirections
self.parent = parent
self.redirected_to = None
@@ -1133,13 +1135,24 @@
instead, we leave our Transport handle them.
"""
+ accepted_errors = [200, # Ok
+ 206, # Partial content
+ 404, # Not found
+ ]
+ """The error codes the caller will handle.
+
+ This can be specialized in the request on a case-by-case basis, but the
+ common cases are covered here.
+ """
+
def http_response(self, request, response):
code, msg, hdrs = response.code, response.msg, response.info()
- if code not in (200, # Ok
- 206, # Partial content
- 404, # Not found
- ):
+ accepted_errors = request.accepted_errors
+ if accepted_errors is None:
+ accepted_errors = self.accepted_errors
+
+ if code not in accepted_errors:
response = self.parent.error('http', request, response,
code, msg, hdrs)
return response
@@ -1151,12 +1164,7 @@
"""Translate common errors into bzr Exceptions"""
def http_error_default(self, req, fp, code, msg, hdrs):
- if code == 404:
- raise errors.NoSuchFile(req.get_selector(),
- extra=HTTPError(req.get_full_url(),
- code, msg,
- hdrs, fp))
- elif code == 403:
+ if code == 403:
raise errors.TransportError('Server refuses to fullfil the request')
elif code == 416:
# We don't know which, but one of the ranges we
@@ -1168,6 +1176,7 @@
'Unable to handle http code %d: %s'
% (code, msg))
+
class Opener(object):
"""A wrapper around urllib2.build_opener
@@ -1191,6 +1200,7 @@
HTTPSHandler,
HTTPDefaultErrorHandler,
)
+
self.open = self._opener.open
if DEBUG >= 2:
# When dealing with handler order, it's easy to mess
=== modified file 'bzrlib/transport/http/response.py'
--- a/bzrlib/transport/http/response.py 2006-12-11 08:10:33 +0000
+++ b/bzrlib/transport/http/response.py 2007-06-20 13:56:21 +0000
@@ -279,7 +279,9 @@
return data
elif code == 404:
raise errors.NoSuchFile(url)
- elif code == 416:
+ # Some servers will return "400: Bad Request" when too many ranges are
+ # specified
+ elif code in (400, 416):
# We don't know which, but one of the ranges we specified
# was wrong. So we raise with 0 for a lack of a better
# magic value.
More information about the bazaar-commits
mailing list