Rev 2522: Fix #115209 by issuing a single range request on 400: Bad Request in file:///v/home/vila/src/bugs/115209/

Vincent Ladeuil v.ladeuil+lp at free.fr
Wed Jun 20 14:56:23 BST 2007


At file:///v/home/vila/src/bugs/115209/

------------------------------------------------------------
revno: 2522
revision-id: v.ladeuil+lp at free.fr-20070620135621-x43c0hnmzu0iuo6m
parent: v.ladeuil+lp at free.fr-20070612144857-1swhz5c3qo8n76l8
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 115209
timestamp: Wed 2007-06-20 15:56:21 +0200
message:
  Fix #115209 by issuing a single range request on 400: Bad Request
  
  * bzrlib/transport/http/response.py:
  (handle_response): Consider 400 as an indication that too many
  ranges were specified.
  
  * bzrlib/transport/http/_urllib2_wrappers.py:
  (Request): Add an 'accepted_errors' parameter describing what
  error codes the caller will handle.
  (HTTPErrorProcessor): Mention that Request-specific accepted error
  codes take precedence.
  (HTTPDefaultErrorHandler.http_error_default): Remove dead code.
  
  * bzrlib/transport/http/_urllib.py:
  (HttpTransport_urllib._get): Add 400 as an accepted error iff
  ranges are specified.
  (HttpTransport_urllib._head): Restrict accepted errors.
  
  * bzrlib/transport/http/__init__.py:
  (HttpTransportBase._degrade_range_hint,
  HttpTransportBase._get_ranges_hinted): Replace _retry_get.
  (HttpTransportBase.readv): Simplified and avoid the spurious _get()
  issued when _get was successful.
  
  * bzrlib/tests/test_http.py:
  (TestLimitedRangeRequestServer,
  TestLimitedRangeRequestServer_urllib,
  TestLimitedRangeRequestServer_pycurl): Bug #115209 specific tests.
  
  * bzrlib/tests/HTTPTestUtil.py:
  (LimitedRangeRequestHandler, LimitedRangeHTTPServer): New test
  classes to emulate apache throwing 400: Bad Request when too many
  ranges are specified.
  (AuthRequestHandler.do_GET): Remove dead code. Yeah, I know,
  not related to the bug :-/
modified:
  bzrlib/tests/HTTPTestUtil.py   HTTPTestUtil.py-20050914180604-247d3aafb7a43343
  bzrlib/tests/test_http.py      testhttp.py-20051018020158-b2eef6e867c514d9
  bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
  bzrlib/transport/http/_urllib.py _urlgrabber.py-20060113083826-0bbf7d992fbf090c
  bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
  bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
-------------- next part --------------
=== modified file 'bzrlib/tests/HTTPTestUtil.py'
--- a/bzrlib/tests/HTTPTestUtil.py	2007-05-06 06:59:57 +0000
+++ b/bzrlib/tests/HTTPTestUtil.py	2007-06-20 13:56:21 +0000
@@ -150,6 +150,37 @@
         self.wfile.write(out_buffer.getvalue())
 
 
+class LimitedRangeRequestHandler(TestingHTTPRequestHandler):
+    """Errors out when range specifiers exceed the limit"""
+
+    def get_multiple_ranges(self, file, file_size, ranges):
+        """Refuses the multiple ranges request"""
+        tcs = self.server.test_case_server
+        #import pdb; pdb.set_trace()
+        if tcs.range_limit is not None and len(ranges) > tcs.range_limit:
+            file.close()
+            # Emulate apache behavior
+            self.send_error(400, "Bad Request")
+            return
+        return TestingHTTPRequestHandler.get_multiple_ranges(self, file,
+                                                             file_size, ranges)
+
+    def do_GET(self):
+        tcs = self.server.test_case_server
+        tcs.GET_request_nb += 1
+        return TestingHTTPRequestHandler.do_GET(self)
+
+
+class LimitedRangeHTTPServer(HttpServer):
+    """An HttpServer erroring out on requests with too much range specifiers"""
+
+    def __init__(self, request_handler=LimitedRangeRequestHandler,
+                 range_limit=None):
+        HttpServer.__init__(self, request_handler)
+        self.range_limit = range_limit
+        self.GET_request_nb = 0
+
+
 class SingleRangeRequestHandler(TestingHTTPRequestHandler):
     """Always reply to range request as if they were single.
 
@@ -337,8 +368,6 @@
             self.end_headers()
             return
 
-        TestingHTTPRequestHandler.do_GET(self)
-
 
 class BasicAuthRequestHandler(AuthRequestHandler):
     """Implements the basic authentication of a request"""

=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py	2007-06-12 14:48:57 +0000
+++ b/bzrlib/tests/test_http.py	2007-06-20 13:56:21 +0000
@@ -53,6 +53,7 @@
     HTTPDigestAuthServer,
     HTTPServerRedirecting,
     InvalidStatusRequestHandler,
+    LimitedRangeHTTPServer,
     NoRangeRequestHandler,
     ProxyBasicAuthServer,
     ProxyDigestAuthServer,
@@ -714,6 +715,67 @@
     """Tests range requests refusing server for pycurl implementation"""
 
 
+class TestLimitedRangeRequestServer(object):
+    """Tests readv requests against server that errors out on too much ranges.
+
+    This MUST be used by daughter classes that also inherit from
+    TestCaseWithWebserver.
+
+    We can't inherit directly from TestCaseWithWebserver or the
+    test framework will try to create an instance which cannot
+    run, its implementation being incomplete.
+    """
+
+    range_limit = 3
+
+    def create_transport_readonly_server(self):
+        # Requests with more range specifiers will error out
+        return LimitedRangeHTTPServer(range_limit=self.range_limit)
+
+    def get_transport(self):
+        return self._transport(self.get_readonly_server().get_url())
+
+    def setUp(self):
+        TestCaseWithWebserver.setUp(self)
+        # We need to manipulate ranges that correspond to real chunks in the
+        # response, so we build a content appropriately.
+        filler = ''.join(['abcdefghij' for _ in range(102)])
+        content = ''.join(['%04d' % v + filler for v in range(16)])
+        self.build_tree_contents([('a', content)],)
+
+    def test_few_ranges(self):
+        t = self.get_transport()
+        l = list(t.readv('a', ((0, 4), (1024, 4), )))
+        self.assertEqual(l[0], (0, '0000'))
+        self.assertEqual(l[1], (1024, '0001'))
+        self.assertEqual(1, self.get_readonly_server().GET_request_nb)
+
+    def test_a_lot_of_ranges(self):
+        t = self.get_transport()
+        l = list(t.readv('a', ((0, 4), (1024, 4), (4096, 4), (8192, 4))))
+        self.assertEqual(l[0], (0, '0000'))
+        self.assertEqual(l[1], (1024, '0001'))
+        self.assertEqual(l[2], (4096, '0004'))
+        self.assertEqual(l[3], (8192, '0008'))
+        # The server will refuse to serve the first request (too much ranges),
+        # a second request will succeeds.
+        self.assertEqual(2, self.get_readonly_server().GET_request_nb)
+
+
+class TestLimitedRangeRequestServer_urllib(TestLimitedRangeRequestServer,
+                                          TestCaseWithWebserver):
+    """Tests limited range requests server for urllib implementation"""
+
+    _transport = HttpTransport_urllib
+
+
+class TestLimitedRangeRequestServer_pycurl(TestWithTransport_pycurl,
+                                          TestLimitedRangeRequestServer,
+                                          TestCaseWithWebserver):
+    """Tests limited range requests server for pycurl implementation"""
+
+
+
 class TestHttpProxyWhiteBox(TestCase):
     """Whitebox test proxy http authorization.
 

=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py	2007-06-12 14:48:57 +0000
+++ b/bzrlib/transport/http/__init__.py	2007-06-20 13:56:21 +0000
@@ -255,45 +255,56 @@
         """
         return self
 
-    def _retry_get(self, relpath, ranges, exc_info):
-        """A GET request have failed, let's retry with a simpler request."""
-
-        try_again = False
-        # The server does not gives us enough data or
-        # a bogus-looking result, let's try again with
-        # a simpler request if possible.
+    def _degrade_range_hint(self, relpath, ranges, exc_info):
         if self._range_hint == 'multi':
             self._range_hint = 'single'
-            mutter('Retry %s with single range request' % relpath)
-            try_again = True
+            mutter('Retry "%s" with single range request' % relpath)
         elif self._range_hint == 'single':
             self._range_hint = None
-            mutter('Retry %s without ranges' % relpath)
-            try_again = True
-        if try_again:
-            # Note that since the offsets and the ranges may not
-            # be in the same order, we don't try to calculate a
-            # restricted single range encompassing unprocessed
-            # offsets.
-            code, f = self._get(relpath, ranges)
-            return try_again, code, f
+            mutter('Retry "%s" without ranges' % relpath)
         else:
-            # We tried all the tricks, but nothing worked. We
-            # re-raise original exception; the 'mutter' calls
-            # above will indicate that further tries were
-            # unsuccessful
+            # We tried all the tricks, but nothing worked. We re-raise original
+            # exception; the 'mutter' calls above will indicate that further
+            # tries were unsuccessful
             raise exc_info[0], exc_info[1], exc_info[2]
 
-    # Having to round trip to the server means waiting for a response, so it is
-    # better to download extra bytes. We have two constraints to satisfy when
-    # choosing the right value: a request with too much ranges will make some
-    # servers issue a '400: Bad request' error (when we have too much ranges,
-    # we exceed the apache header max size (8190 by default) for example) ;
-    # coalescing too much ranges will increasing the data transferred and
-    # degrade performances.
-    _bytes_to_read_before_seek = 512
-    # No limit on the offset number combined into one, we are trying to avoid
-    # downloading the whole file.
+    def _get_ranges_hinted(self, relpath, ranges):
+        """Issue a ranged GET request taking server capabilities into account.
+
+        Depending of the errors returned by the server, we try several GET
+        requests, trying to minimize the data transferred.
+
+        :param relpath: Path relative to transport base URL
+        :param ranges: None to get the whole file;
+            or  a list of _CoalescedOffset to fetch parts of a file.
+        :returns: A file handle containing at least the requested ranges.
+        """
+        exc_info = None
+        try_again = True
+        while try_again:
+            try_again = False
+            try:
+                code, f = self._get(relpath, ranges)
+            except errors.InvalidRange, e:
+                if exc_info is None:
+                    exc_info = sys.exc_info()
+                self._degrade_range_hint(relpath, ranges, exc_info)
+                try_again = True
+        return f
+
+    # _coalesce_offsets is a helper for readv, it try to combine ranges without
+    # degrading readv performances. _bytes_to_read_before_seek is the value
+    # used for the limit parameter and has been tuned for other transports. For
+    # HTTP, the name is inappropriate but the parameter is still useful and
+    # helps reduce the number of chunks in the response. The overhead for a
+    # chunk (headers, length, footer around the data itself is variable but
+    # around 50 bytes. We use 128 to reduce the range specifiers that appear in
+    # the header, some servers (notably Apache) enforce a maximum length for a
+    # header and issue a '400: Bad request' error when too much ranges are
+    # specified.
+    _bytes_to_read_before_seek = 128
+    # No limit on the offset number that get combined into one, we are trying
+    # to avoid downloading the whole file.
     _max_readv_combined = 0
 
     def readv(self, relpath, offsets):
@@ -311,15 +322,7 @@
         mutter('http readv of %s  offsets => %s collapsed %s',
                 relpath, len(offsets), len(coalesced))
 
-        try_again = True
-        while try_again:
-            try_again = False
-            try:
-                code, f = self._get(relpath, coalesced)
-            except (errors.InvalidRange, errors.ShortReadvError), e:
-                try_again, code, f = self._retry_get(relpath, coalesced,
-                                                     sys.exc_info())
-
+        f = self._get_ranges_hinted(relpath, coalesced)
         for start, size in offsets:
             try_again = True
             while try_again:
@@ -331,12 +334,18 @@
                     if len(data) != size:
                         raise errors.ShortReadvError(relpath, start, size,
                                                      actual=len(data))
-                except (errors.InvalidRange, errors.ShortReadvError), e:
-                    # Note that we replace 'f' here and that it
-                    # may need cleaning one day before being
-                    # thrown that way.
-                    try_again, code, f = self._retry_get(relpath, coalesced,
-                                                         sys.exc_info())
+                except errors.ShortReadvError, e:
+                    self._degrade_range_hint(relpath, coalesced, sys.exc_info())
+
+                    # Since the offsets and the ranges may not be in the same
+                    # order, we don't try to calculate a restricted single
+                    # range encompassing unprocessed offsets.
+
+                    # Note: we replace 'f' here, it may need cleaning one day
+                    # before being thrown that way.
+                    f = self._get_ranges_hinted(relpath, coalesced)
+                    try_again = True
+
             # After one or more tries, we get the data.
             yield start, data
 

=== modified file 'bzrlib/transport/http/_urllib.py'
--- a/bzrlib/transport/http/_urllib.py	2007-06-12 14:48:57 +0000
+++ b/bzrlib/transport/http/_urllib.py	2007-06-20 13:56:21 +0000
@@ -118,13 +118,17 @@
 
         abspath = self._real_abspath(relpath)
         headers = {}
+        accepted_errors = [200, 404]
         if offsets or tail_amount:
             range_header = self._attempted_range_header(offsets, tail_amount)
             if range_header is not None:
+                accepted_errors.append(206)
+                accepted_errors.append(400)
                 bytes = 'bytes=' + range_header
                 headers = {'Range': bytes}
 
-        request = Request('GET', abspath, None, headers)
+        request = Request('GET', abspath, None, headers,
+                          accepted_errors=accepted_errors)
         response = self._perform(request)
 
         code = response.code
@@ -157,7 +161,8 @@
         Performs the request and leaves callers handle the results.
         """
         abspath = self._real_abspath(relpath)
-        request = Request('HEAD', abspath)
+        request = Request('HEAD', abspath,
+                          accepted_errors=[200, 404])
         response = self._perform(request)
 
         self._connection.fake_close()
@@ -172,7 +177,6 @@
         if code == 200: # "ok",
             return True
         else:
-            assert(code == 404, 'Only 200 or 404 are correct')
             return False
 
 

=== modified file 'bzrlib/transport/http/_urllib2_wrappers.py'
--- a/bzrlib/transport/http/_urllib2_wrappers.py	2007-04-22 16:32:04 +0000
+++ b/bzrlib/transport/http/_urllib2_wrappers.py	2007-06-20 13:56:21 +0000
@@ -153,11 +153,13 @@
 
     def __init__(self, method, url, data=None, headers={},
                  origin_req_host=None, unverifiable=False,
-                 connection=None, parent=None,):
+                 connection=None, parent=None,
+                 accepted_errors=None):
         urllib2.Request.__init__(self, url, data, headers,
                                  origin_req_host, unverifiable)
         self.method = method
         self.connection = connection
+        self.accepted_errors = accepted_errors
         # To handle redirections
         self.parent = parent
         self.redirected_to = None
@@ -1133,13 +1135,24 @@
     instead, we leave our Transport handle them.
     """
 
+    accepted_errors = [200, # Ok
+                       206, # Partial content
+                       404, # Not found
+                       ]
+    """The error codes the caller will handle.
+
+    This can be specialized in the request on a case-by case basis, but the
+    common cases are covered here.
+    """
+
     def http_response(self, request, response):
         code, msg, hdrs = response.code, response.msg, response.info()
 
-        if code not in (200, # Ok
-                        206, # Partial content
-                        404, # Not found
-                        ):
+        accepted_errors = request.accepted_errors
+        if accepted_errors is None:
+            accepted_errors = self.accepted_errors
+
+        if code not in accepted_errors:
             response = self.parent.error('http', request, response,
                                          code, msg, hdrs)
         return response
@@ -1151,12 +1164,7 @@
     """Translate common errors into bzr Exceptions"""
 
     def http_error_default(self, req, fp, code, msg, hdrs):
-        if code == 404:
-            raise errors.NoSuchFile(req.get_selector(),
-                                    extra=HTTPError(req.get_full_url(),
-                                                    code, msg,
-                                                    hdrs, fp))
-        elif code == 403:
+        if code == 403:
             raise errors.TransportError('Server refuses to fullfil the request')
         elif code == 416:
             # We don't know which, but one of the ranges we
@@ -1168,6 +1176,7 @@
                                              'Unable to handle http code %d: %s'
                                              % (code, msg))
 
+
 class Opener(object):
     """A wrapper around urllib2.build_opener
 
@@ -1191,6 +1200,7 @@
             HTTPSHandler,
             HTTPDefaultErrorHandler,
             )
+
         self.open = self._opener.open
         if DEBUG >= 2:
             # When dealing with handler order, it's easy to mess

=== modified file 'bzrlib/transport/http/response.py'
--- a/bzrlib/transport/http/response.py	2006-12-11 08:10:33 +0000
+++ b/bzrlib/transport/http/response.py	2007-06-20 13:56:21 +0000
@@ -279,7 +279,9 @@
         return data
     elif code == 404:
         raise errors.NoSuchFile(url)
-    elif code == 416:
+    # Some servers will retun "400: Bad Request" when too much ranges are
+    # specified
+    elif code in (400, 416):
         # We don't know which, but one of the ranges we specified
         # was wrong. So we raise with 0 for a lack of a better
         # magic value.



More information about the bazaar-commits mailing list