Rev 3148: Fix #179368 by keeping the current range hint on ShortReadvErrors. in file:///v/home/vila/src/bzr/bugs/179368/
Vincent Ladeuil
v.ladeuil+lp at free.fr
Wed Jan 2 14:14:00 GMT 2008
At file:///v/home/vila/src/bzr/bugs/179368/
------------------------------------------------------------
revno: 3148
revision-id:v.ladeuil+lp at free.fr-20080102141355-k20yfjo6i1dasuny
parent: v.ladeuil+lp at free.fr-20071230124033-gk9iv67553aq5kw2
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 179368
timestamp: Wed 2008-01-02 15:13:55 +0100
message:
Fix #179368 by keeping the current range hint on ShortReadvErrors.
* response.py:
(RangeFile._checked_read): Avoid huge buffering when huge seeks
are required.
* _urllib2_wrappers.py:
(Response.finish): Check for end-of-file or we'll loop if the
server lied about Content-Length.
* __init__.py:
(HttpTransportBase._readv): When a ShortReadvError occurs, try
again, staying in multiple range mode and degrades only if the
error occurs again for the same offset.
* test_http_response.py:
(TestRangeFileMultipleRanges): Add a test to exercise the buffer
overflow protection code.
* test_http.py:
(TestMultipleRangeWithoutContentLengthServer): Emulate lighttpd
behavior regarding bug #179368.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/tests/test_http.py testhttp.py-20051018020158-b2eef6e867c514d9
bzrlib/tests/test_http_response.py test_http_response.py-20060628233143-950b2a482a32505d
bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS 2007-12-30 12:40:33 +0000
+++ b/NEWS 2008-01-02 14:13:55 +0000
@@ -58,6 +58,9 @@
BUGFIXES:
+ * Better handle short reads when processing multiple range requests.
+ (Vincent Ladeuil, #179368)
+
* build_tree acceleration uses the correct path when a file has been moved.
(Aaron Bentley)
=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py 2007-12-28 16:51:30 +0000
+++ b/bzrlib/tests/test_http.py 2008-01-02 14:13:55 +0000
@@ -905,6 +905,88 @@
_req_handler_class = MultipleRangeWithoutContentLengthRequestHandler
+
+class TruncatedMultipleRangeRequestHandler(
+ http_server.TestingHTTPRequestHandler):
+ """Reply to multiple range requests truncating the last ones.
+
+ This server generates responses whose Content-Length describes all the
+ ranges, but fail to include the last ones leading to client short reads.
+ This has been observed randomly with lighttpd (bug #179368).
+ """
+
+ _truncated_ranges = 2
+
+ def get_multiple_ranges(self, file, file_size, ranges):
+ self.send_response(206)
+ self.send_header('Accept-Ranges', 'bytes')
+ boundary = 'tagada'
+ self.send_header('Content-Type',
+ 'multipart/byteranges; boundary=%s' % boundary)
+ boundary_line = '--%s\r\n' % boundary
+ # Calculate the Content-Length
+ content_length = 0
+ for (start, end) in ranges:
+ content_length += len(boundary_line)
+ content_length += self._header_line_length(
+ 'Content-type', 'application/octet-stream')
+ content_length += self._header_line_length(
+ 'Content-Range', 'bytes %d-%d/%d' % (start, end, file_size))
+ content_length += len('\r\n') # end headers
+ content_length += end - start # + 1
+ content_length += len(boundary_line)
+ self.send_header('Content-length', content_length)
+ self.end_headers()
+
+ # Send the multipart body
+ cur = 0
+ for (start, end) in ranges:
+ self.wfile.write(boundary_line)
+ self.send_header('Content-type', 'application/octet-stream')
+ self.send_header('Content-Range', 'bytes %d-%d/%d'
+ % (start, end, file_size))
+ self.end_headers()
+ if cur + self._truncated_ranges >= len(ranges):
+ # Abruptly ends the response and close the connection
+ self.close_connection = 1
+ return
+ self.send_range_content(file, start, end - start + 1)
+ cur += 1
+ # No final boundary
+ self.wfile.write(boundary_line)
+
+
+class TestTruncatedMultipleRangeServer(TestSpecificRequestHandler):
+
+ _req_handler_class = TruncatedMultipleRangeRequestHandler
+
+ def setUp(self):
+ super(TestTruncatedMultipleRangeServer, self).setUp()
+ self.build_tree_contents([('a', '0123456789')],)
+
+ def test_readv_with_short_reads(self):
+ server = self.get_readonly_server()
+ t = self._transport(server.get_url())
+ # Force separate ranges for each offset
+ t._bytes_to_read_before_seek = 0
+ ireadv = iter(t.readv('a', ((0, 1), (2, 1), (4, 2), (9, 1))))
+ self.assertEqual((0, '0'), ireadv.next())
+ self.assertEqual((2, '2'), ireadv.next())
+ if not self._testing_pycurl():
+ # Only one request have been issued so far (except for pycurl that
+ # try to read the whole response at once)
+ self.assertEqual(1, server.GET_request_nb)
+ self.assertEqual((4, '45'), ireadv.next())
+ self.assertEqual((9, '9'), ireadv.next())
+ # Both implementations issue 3 requests but:
+ # - urllib does two multiple (4 ranges, then 2 ranges) then a single
+ # range,
+ # - pycurl does two multiple (4 ranges, 4 ranges) then a single range
+ self.assertEqual(3, server.GET_request_nb)
+ # Finally the client have tried a single range request and stays in
+ # that mode
+ self.assertEqual('single', t._range_hint)
+
class LimitedRangeRequestHandler(http_server.TestingHTTPRequestHandler):
"""Errors out when range specifiers exceed the limit"""
=== modified file 'bzrlib/tests/test_http_response.py'
--- a/bzrlib/tests/test_http_response.py 2007-12-13 15:40:42 +0000
+++ b/bzrlib/tests/test_http_response.py 2008-01-02 14:13:55 +0000
@@ -249,7 +249,7 @@
f._pos = 0 # Force an invalid pos
self.assertRaises(errors.InvalidRange, f.read, 2)
-class TestRangeFilMultipleRanges(tests.TestCase, TestRangeFileMixin):
+class TestRangeFileMultipleRanges(tests.TestCase, TestRangeFileMixin):
"""Test a RangeFile for multiple ranges.
The RangeFile used for the tests contains three ranges:
@@ -263,7 +263,7 @@
"""
def setUp(self):
- super(TestRangeFilMultipleRanges, self).setUp()
+ super(TestRangeFileMultipleRanges, self).setUp()
boundary = 'separation'
@@ -361,6 +361,14 @@
f.seek(126) # skip the two first ranges
self.assertEquals('AB', f.read(2))
+ def test_checked_read_dont_overflow_buffers(self):
+ f = self._file
+ start = self.first_range_start
+ # We force a very low value to exercise all code paths in _checked_read
+ f._discarded_buf_size = 8
+ f.seek(126) # skip the two first ranges
+ self.assertEquals('AB', f.read(2))
+
def test_seek_twice_between_ranges(self):
f = self._file
start = self.first_range_start
=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py 2007-12-20 14:14:21 +0000
+++ b/bzrlib/transport/http/__init__.py 2008-01-02 14:13:55 +0000
@@ -217,6 +217,7 @@
offsets = list(offsets)
try_again = True
+ retried_offset = None
while try_again:
try_again = False
@@ -271,10 +272,20 @@
except (errors.ShortReadvError, errors.InvalidRange,
errors.InvalidHttpRange), e:
- self._degrade_range_hint(relpath, coalesced, sys.exc_info())
+ mutter('Exception %r: %s during http._readv',e, e)
+ if (not isinstance(e, errors.ShortReadvError)
+ or retried_offset == cur_offset_and_size):
+ # We don't degrade the range hint for ShortReadvError since
+ # they do not indicate a problem with the server ability to
+ # handle ranges. Except when we fail to get back a required
+ # offset twice in a row. In that case, falling back to
+ # single range or whole file should help or end up in a
+ # fatal exception.
+ self._degrade_range_hint(relpath, coalesced, sys.exc_info())
# Some offsets may have been already processed, so we retry
# only the unsuccessful ones.
offsets = [cur_offset_and_size] + [o for o in iter_offsets]
+ retried_offset = cur_offset_and_size
try_again = True
def _coalesce_readv(self, relpath, coalesced):
=== modified file 'bzrlib/transport/http/_urllib2_wrappers.py'
--- a/bzrlib/transport/http/_urllib2_wrappers.py 2007-12-28 16:51:30 +0000
+++ b/bzrlib/transport/http/_urllib2_wrappers.py 2008-01-02 14:13:55 +0000
@@ -134,16 +134,16 @@
if not self.isclosed():
# Make sure nothing was left to be read on the socket
pending = 0
- while self.length and self.length > self._discarded_buf_size:
+ data = True
+ while (data and self.length
+ and self.length > self._discarded_buf_size):
data = self.read(self._discarded_buf_size)
pending += len(data)
- if self.length:
+ if data and self.length:
data = self.read(self.length)
pending += len(data)
if pending:
- trace.mutter(
- "%s bytes left on the HTTP socket",
- pending)
+ trace.mutter("%s bytes left on the HTTP socket", pending)
self.close()
return pending
=== modified file 'bzrlib/transport/http/response.py'
--- a/bzrlib/transport/http/response.py 2007-12-10 10:41:24 +0000
+++ b/bzrlib/transport/http/response.py 2008-01-02 14:13:55 +0000
@@ -144,24 +144,36 @@
'Invalid range, size <= 0')
self.set_range(start, size)
+ # in _checked_read() below, we may have to discard several MB in the worst
+ # case. To avoid buffering that much, we read and discard by chunks
+ # instead. The underlying file is either a socket or a StringIO, so reading
+ # 8k chunks should be fine.
+ _discarded_buf_size = 8192
+
def _checked_read(self, size):
- """Read the file checking for short reads"""
+ """Read the file checking for short reads.
+
+ The data read is discarded along the way.
+ """
pos = self._pos
- data = self._file.read(size)
- data_len = len(data)
+ data_len = 0
+ while size - data_len > self._discarded_buf_size:
+ data = self._file.read(self._discarded_buf_size)
+ data_len += len(data)
+ if size - data_len > 0:
+ data = self._file.read(size - data_len)
+ data_len += len(data)
if size > 0 and data_len != size:
raise errors.ShortReadvError(self._path, pos, size, data_len)
self._pos += data_len
- return data
def _seek_to_next_range(self):
# We will cross range boundaries
if self._boundary is None:
# If we don't have a boundary, we can't find another range
- raise errors.InvalidRange(
- self._path, self._pos,
- "Range (%s, %s) exhausted"
- % (self._start, self._size))
+ raise errors.InvalidRange(self._path, self._pos,
+ "Range (%s, %s) exhausted"
+ % (self._start, self._size))
self.read_boundary()
self.read_range_definition()
More information about the bazaar-commits
mailing list