Rev 3148: Fix #179368 by keeping the current range hint on ShortReadvErrors. in file:///v/home/vila/src/bzr/bugs/179368/

Wed Jan 2 14:14:00 GMT 2008

At file:///v/home/vila/src/bzr/bugs/179368/

------------------------------------------------------------
revno: 3148
revision-id:v.ladeuil+lp at free.fr-20080102141355-k20yfjo6i1dasuny
parent: v.ladeuil+lp at free.fr-20071230124033-gk9iv67553aq5kw2
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 179368
timestamp: Wed 2008-01-02 15:13:55 +0100
message:
  Fix #179368 by keeping the current range hint on ShortReadvErrors.
  
  * response.py:
  (RangeFile._checked_read): Avoid huge buffering when huge seeks
  are required.
  
  * _urllib2_wrappers.py:
  (Response.finish): Check for end-of-file or we'll loop if the
  server lied about Content-Length.
  
  * __init__.py:
  (HttpTransportBase._readv): When a ShortReadvError occurs, try
  again, staying in multiple range mode and degrades only if the
  error occurs again for the same offset.
  
  * test_http_response.py:
  (TestRangeFileMultipleRanges): Add a test to exercise the buffer
  overflow protection code.
  
  * test_http.py:
  (TestMultipleRangeWithoutContentLengthServer): Emulate lighttpd
  behavior regarding bug #179368.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/tests/test_http.py      testhttp.py-20051018020158-b2eef6e867c514d9
  bzrlib/tests/test_http_response.py test_http_response.py-20060628233143-950b2a482a32505d
  bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
  bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
  bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
-------------- next part --------------
=== modified file 'NEWS'

--- a/NEWS	2007-12-30 12:40:33 +0000
+++ b/NEWS	2008-01-02 14:13:55 +0000
@@ -58,6 +58,9 @@
 
   BUGFIXES:
 
+   * Better handle short reads when processing multiple range requests.
+     (Vincent Ladeuil, #179368)
+
    * build_tree acceleration uses the correct path when a file has been moved.
      (Aaron Bentley)
 

=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py	2007-12-28 16:51:30 +0000
+++ b/bzrlib/tests/test_http.py	2008-01-02 14:13:55 +0000
@@ -905,6 +905,88 @@
 
     _req_handler_class = MultipleRangeWithoutContentLengthRequestHandler
 
+
+class TruncatedMultipleRangeRequestHandler(
+    http_server.TestingHTTPRequestHandler):
+    """Reply to multiple range requests truncating the last ones.
+
+    This server generates responses whose Content-Length describes all the
+    ranges, but fail to include the last ones leading to client short reads.
+    This has been observed randomly with lighttpd (bug #179368).
+    """
+
+    _truncated_ranges = 2
+
+    def get_multiple_ranges(self, file, file_size, ranges):
+        self.send_response(206)
+        self.send_header('Accept-Ranges', 'bytes')
+        boundary = 'tagada'
+        self.send_header('Content-Type',
+                         'multipart/byteranges; boundary=%s' % boundary)
+        boundary_line = '--%s\r\n' % boundary
+        # Calculate the Content-Length
+        content_length = 0
+        for (start, end) in ranges:
+            content_length += len(boundary_line)
+            content_length += self._header_line_length(
+                'Content-type', 'application/octet-stream')
+            content_length += self._header_line_length(
+                'Content-Range', 'bytes %d-%d/%d' % (start, end, file_size))
+            content_length += len('\r\n') # end headers
+            content_length += end - start # + 1
+        content_length += len(boundary_line)
+        self.send_header('Content-length', content_length)
+        self.end_headers()
+
+        # Send the multipart body
+        cur = 0
+        for (start, end) in ranges:
+            self.wfile.write(boundary_line)
+            self.send_header('Content-type', 'application/octet-stream')
+            self.send_header('Content-Range', 'bytes %d-%d/%d'
+                             % (start, end, file_size))
+            self.end_headers()
+            if cur + self._truncated_ranges >= len(ranges):
+                # Abruptly ends the response and close the connection
+                self.close_connection = 1
+                return
+            self.send_range_content(file, start, end - start + 1)
+            cur += 1
+        # No final boundary
+        self.wfile.write(boundary_line)
+
+
+class TestTruncatedMultipleRangeServer(TestSpecificRequestHandler):
+
+    _req_handler_class = TruncatedMultipleRangeRequestHandler
+
+    def setUp(self):
+        super(TestTruncatedMultipleRangeServer, self).setUp()
+        self.build_tree_contents([('a', '0123456789')],)
+
+    def test_readv_with_short_reads(self):
+        server = self.get_readonly_server()
+        t = self._transport(server.get_url())
+        # Force separate ranges for each offset
+        t._bytes_to_read_before_seek = 0
+        ireadv = iter(t.readv('a', ((0, 1), (2, 1), (4, 2), (9, 1))))
+        self.assertEqual((0, '0'), ireadv.next())
+        self.assertEqual((2, '2'), ireadv.next())
+        if not self._testing_pycurl():
+            # Only one request have been issued so far (except for pycurl that
+            # try to read the whole response at once)
+            self.assertEqual(1, server.GET_request_nb)
+        self.assertEqual((4, '45'), ireadv.next())
+        self.assertEqual((9, '9'), ireadv.next())
+        # Both implementations issue 3 requests but:
+        # - urllib does two multiple (4 ranges, then 2 ranges) then a single
+        #   range,
+        # - pycurl does two multiple (4 ranges, 4 ranges) then a single range
+        self.assertEqual(3, server.GET_request_nb)
+        # Finally the client have tried a single range request and stays in
+        # that mode
+        self.assertEqual('single', t._range_hint)
+
 class LimitedRangeRequestHandler(http_server.TestingHTTPRequestHandler):
     """Errors out when range specifiers exceed the limit"""
 

=== modified file 'bzrlib/tests/test_http_response.py'
--- a/bzrlib/tests/test_http_response.py	2007-12-13 15:40:42 +0000
+++ b/bzrlib/tests/test_http_response.py	2008-01-02 14:13:55 +0000
@@ -249,7 +249,7 @@
         f._pos = 0 # Force an invalid pos
         self.assertRaises(errors.InvalidRange, f.read, 2)
 
-class TestRangeFilMultipleRanges(tests.TestCase, TestRangeFileMixin):
+class TestRangeFileMultipleRanges(tests.TestCase, TestRangeFileMixin):
     """Test a RangeFile for multiple ranges.
 
     The RangeFile used for the tests contains three ranges:
@@ -263,7 +263,7 @@
     """
 
     def setUp(self):
-        super(TestRangeFilMultipleRanges, self).setUp()
+        super(TestRangeFileMultipleRanges, self).setUp()
 
         boundary = 'separation'
 
@@ -361,6 +361,14 @@
         f.seek(126) # skip the two first ranges
         self.assertEquals('AB', f.read(2))
 
+    def test_checked_read_dont_overflow_buffers(self):
+        f = self._file
+        start = self.first_range_start
+        # We force a very low value to exercise all code paths in _checked_read
+        f._discarded_buf_size = 8
+        f.seek(126) # skip the two first ranges
+        self.assertEquals('AB', f.read(2))
+
     def test_seek_twice_between_ranges(self):
         f = self._file
         start = self.first_range_start

=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py	2007-12-20 14:14:21 +0000
+++ b/bzrlib/transport/http/__init__.py	2008-01-02 14:13:55 +0000
@@ -217,6 +217,7 @@
         offsets = list(offsets)
 
         try_again = True
+        retried_offset = None
         while try_again:
             try_again = False
 
@@ -271,10 +272,20 @@
 
             except (errors.ShortReadvError, errors.InvalidRange,
                     errors.InvalidHttpRange), e:
-                self._degrade_range_hint(relpath, coalesced, sys.exc_info())
+                mutter('Exception %r: %s during http._readv',e, e)
+                if (not isinstance(e, errors.ShortReadvError)
+                    or retried_offset == cur_offset_and_size):
+                    # We don't degrade the range hint for ShortReadvError since
+                    # they do not indicate a problem with the server ability to
+                    # handle ranges. Except when we fail to get back a required
+                    # offset twice in a row. In that case, falling back to
+                    # single range or whole file should help or end up in a
+                    # fatal exception.
+                    self._degrade_range_hint(relpath, coalesced, sys.exc_info())
                 # Some offsets may have been already processed, so we retry
                 # only the unsuccessful ones.
                 offsets = [cur_offset_and_size] + [o for o in iter_offsets]
+                retried_offset = cur_offset_and_size
                 try_again = True
 
     def _coalesce_readv(self, relpath, coalesced):

=== modified file 'bzrlib/transport/http/_urllib2_wrappers.py'
--- a/bzrlib/transport/http/_urllib2_wrappers.py	2007-12-28 16:51:30 +0000
+++ b/bzrlib/transport/http/_urllib2_wrappers.py	2008-01-02 14:13:55 +0000
@@ -134,16 +134,16 @@
         if not self.isclosed():
             # Make sure nothing was left to be read on the socket
             pending = 0
-            while self.length and self.length > self._discarded_buf_size:
+            data = True
+            while (data and self.length
+                   and self.length > self._discarded_buf_size):
                 data = self.read(self._discarded_buf_size)
                 pending += len(data)
-            if self.length:
+            if data and self.length:
                 data = self.read(self.length)
                 pending += len(data)
             if pending:
-                trace.mutter(
-                    "%s bytes left on the HTTP socket",
-                    pending)
+                trace.mutter("%s bytes left on the HTTP socket", pending)
             self.close()
         return pending
 

=== modified file 'bzrlib/transport/http/response.py'
--- a/bzrlib/transport/http/response.py	2007-12-10 10:41:24 +0000
+++ b/bzrlib/transport/http/response.py	2008-01-02 14:13:55 +0000
@@ -144,24 +144,36 @@
                                           'Invalid range, size <= 0')
         self.set_range(start, size)
 
+    # in _checked_read() below, we may have to discard several MB in the worst
+    # case. To avoid buffering that much, we read and discard by chunks
+    # instead. The underlying file is either a socket or a StringIO, so reading
+    # 8k chunks should be fine.
+    _discarded_buf_size = 8192
+
     def _checked_read(self, size):
-        """Read the file checking for short reads"""
+        """Read the file checking for short reads.
+
+        The data read is discarded along the way.
+        """
         pos = self._pos
-        data = self._file.read(size)
-        data_len = len(data)
+        data_len = 0
+        while size - data_len > self._discarded_buf_size:
+            data = self._file.read(self._discarded_buf_size)
+            data_len += len(data)
+        if size - data_len > 0:
+            data = self._file.read(size - data_len)
+            data_len += len(data)
         if size > 0 and data_len != size:
             raise errors.ShortReadvError(self._path, pos, size, data_len)
         self._pos += data_len
-        return data
 
     def _seek_to_next_range(self):
         # We will cross range boundaries
         if self._boundary is None:
             # If we don't have a boundary, we can't find another range
-            raise errors.InvalidRange(
-                self._path, self._pos,
-                "Range (%s, %s) exhausted"
-                % (self._start, self._size))
+            raise errors.InvalidRange(self._path, self._pos,
+                                      "Range (%s, %s) exhausted"
+                                      % (self._start, self._size))
         self.read_boundary()
         self.read_range_definition()