Rev 3076: Limit GET requests by body size instead of number of ranges. in file:///v/home/vila/src/bzr/bugs/173010/
Vincent Ladeuil
v.ladeuil+lp at free.fr
Sat Dec 8 23:15:24 GMT 2007
At file:///v/home/vila/src/bzr/bugs/173010/
------------------------------------------------------------
revno: 3076
revision-id:v.ladeuil+lp at free.fr-20071208231518-sj2ui57xyd4mkjra
parent: v.ladeuil+lp at free.fr-20071208162634-j0g3q0xjr1g7abam
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 173010
timestamp: Sun 2007-12-09 00:15:18 +0100
message:
Limit GET requests by body size instead of number of ranges.
* bzrlib/transport/http/response.py:
Fix some error messages.
(RangeFile.read_range_definition): Keep the range headers in the
_headers attribute for easier debugging (especially for remote
debugging).
* bzrlib/transport/http/_pycurl.py:
(PyCurlTransport): Replace _max_readv_combine by _get_max_size
which is more appropriate to the problem.
* bzrlib/transport/http/__init__.py:
(HttpTransportBase): Add a _get_max_size class attribute
corresponding to the max_size parameter of
_coalesce_offsets.
(HttpTransportBase._coalesce_readv): Limit the size of the get
requests if _get_max_size is greater than 0 while still respecting
the maximum number of ranges in a request.
* bzrlib/tests/test_http.py:
(TestRangeRequestServer.test_readv_get_max_size): Test the
_get_max_size parameter.
* bzrlib/transport/__init__.py:
(Transport._coalesce_offsets): Add a max_size parameter limiting
the size of the coalesced offsets.
* bzrlib/tests/test_transport.py:
(TestCoalesceOffsets.check): Add the max_size parameter.
(TestCoalesceOffsets.test_coalesce_max_size,
TestCoalesceOffsets.test_coalesce_no_max_size): Test the max_size
parameter.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/tests/test_http.py testhttp.py-20051018020158-b2eef6e867c514d9
bzrlib/tests/test_transport.py testtransport.py-20050718175618-e5cdb99f4555ddce
bzrlib/transport/__init__.py transport.py-20050711165921-4978aa7ce1285ad5
bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
bzrlib/transport/http/_pycurl.py pycurlhttp.py-20060110060940-4e2a705911af77a6
bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS 2007-12-07 23:03:34 +0000
+++ b/NEWS 2007-12-08 23:15:18 +0000
@@ -76,7 +76,9 @@
(John Arbash Meinel, #165290)
* Give more feedback during long http downloads by making readv deliver data
- as it arrives for urllib, and issue more requests for pycurl.
+ as it arrives for urllib, and issue more requests for pycurl. High latency
+ networks are better handled by urllib, the pycurl implementation give more
+ feedback but also incur more latency.
(Vincent Ladeuil, #173010)
* Fall back to showing e-mail in ``log --short/--line`` if the
=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py 2007-12-08 14:50:52 +0000
+++ b/bzrlib/tests/test_http.py 2007-12-08 23:15:18 +0000
@@ -660,7 +660,22 @@
self.assertEqual(l[2], (3, '34'))
self.assertEqual(l[3], (9, '9'))
# The server should have issued 4 requests
- self.assertEqual(4, self.get_readonly_server().GET_request_nb)
+ self.assertEqual(4, server.GET_request_nb)
+
+ def test_readv_get_max_size(self):
+ server = self.get_readonly_server()
+ t = self._transport(server.get_url())
+ # force transport to issue multiple requests by limiting the number of
+ # bytes by request. Note that this apply to coalesced offsets only, a
+ # single range ill keep its size even if bigger than the limit.
+ t._get_max_size = 2
+ l = list(t.readv('a', ((0, 1), (1, 1), (2, 4), (6, 4))))
+ self.assertEqual(l[0], (0, '0'))
+ self.assertEqual(l[1], (1, '1'))
+ self.assertEqual(l[2], (2, '2345'))
+ self.assertEqual(l[3], (6, '6789'))
+ # The server should have issued 3 requests
+ self.assertEqual(3, server.GET_request_nb)
class TestSingleRangeRequestServer(TestRangeRequestServer):
=== modified file 'bzrlib/tests/test_transport.py'
--- a/bzrlib/tests/test_transport.py 2007-11-19 13:44:25 +0000
+++ b/bzrlib/tests/test_transport.py 2007-12-08 23:15:18 +0000
@@ -152,11 +152,12 @@
class TestCoalesceOffsets(TestCase):
-
- def check(self, expected, offsets, limit=0, fudge=0):
+
+ def check(self, expected, offsets, limit=0, max_size=0, fudge=0):
coalesce = Transport._coalesce_offsets
exp = [_CoalescedOffset(*x) for x in expected]
- out = list(coalesce(offsets, limit=limit, fudge_factor=fudge))
+ out = list(coalesce(offsets, limit=limit, fudge_factor=fudge,
+ max_size=max_size))
self.assertEqual(exp, out)
def test_coalesce_empty(self):
@@ -169,7 +170,7 @@
self.check([(0, 10, [(0, 10)]),
(20, 10, [(0, 10)]),
], [(0, 10), (20, 10)])
-
+
def test_coalesce_unsorted(self):
self.check([(20, 10, [(0, 10)]),
(0, 10, [(0, 10)]),
@@ -179,6 +180,7 @@
self.check([(0, 20, [(0, 10), (10, 10)])],
[(0, 10), (10, 10)])
+ # XXX: scary, http.readv() can't handle that
def test_coalesce_overlapped(self):
self.check([(0, 15, [(0, 10), (5, 10)])],
[(0, 10), (5, 10)])
@@ -208,6 +210,20 @@
], [(10, 10), (30, 10), (100, 10)],
fudge=10
)
+ def test_coalesce_max_size(self):
+ self.check([(10, 20, [(0, 10), (10, 10)]),
+ (30, 50, [(0, 50)]),
+ # If one range is above max_size, it gets its own coalesced
+ # offset
+ (100, 80, [(0, 80),]),],
+ [(10, 10), (20, 10), (30, 50), (100, 80)],
+ max_size=50
+ )
+
+ def test_coalesce_no_max_size(self):
+ self.check([(10, 170, [(0, 10), (10, 10), (20, 50), (70, 100)]),],
+ [(10, 10), (20, 10), (30, 50), (80, 100)],
+ )
class TestMemoryTransport(TestCase):
=== modified file 'bzrlib/transport/__init__.py'
--- a/bzrlib/transport/__init__.py 2007-12-03 08:33:06 +0000
+++ b/bzrlib/transport/__init__.py 2007-12-08 23:15:18 +0000
@@ -792,7 +792,7 @@
return offsets
@staticmethod
- def _coalesce_offsets(offsets, limit, fudge_factor):
+ def _coalesce_offsets(offsets, limit=0, fudge_factor=0, max_size=0):
"""Yield coalesced offsets.
With a long list of neighboring requests, combine them
@@ -801,13 +801,21 @@
Turns [(15, 10), (25, 10)] => [(15, 20, [(0, 10), (10, 10)])]
:param offsets: A list of (start, length) pairs
- :param limit: Only combine a maximum of this many pairs
- Some transports penalize multiple reads more than
- others, and sometimes it is better to return early.
- 0 means no limit
+
+ :param limit: Only combine a maximum of this many pairs Some transports
+ penalize multiple reads more than others, and sometimes it is
+ better to return early.
+ 0 means no limit
+
:param fudge_factor: All transports have some level of 'it is
better to read some more data and throw it away rather
than seek', so collapse if we are 'close enough'
+
+ :param max_size: Create coalesced offsets no bigger than this size.
+ When a single offset is bigger than 'max_size', it will keep
+ its size and be alone in the coalesced offset.
+ 0 means no maximum size.
+
:return: yield _CoalescedOffset objects, which have members for where
to start, how much to read, and how to split those
chunks back up
@@ -817,10 +825,11 @@
for start, size in offsets:
end = start + size
- if (last_end is not None
+ if (last_end is not None
and start <= last_end + fudge_factor
and start >= cur.start
- and (limit <= 0 or len(cur.ranges) < limit)):
+ and (limit <= 0 or len(cur.ranges) < limit)
+ and (max_size <= 0 or end - cur.start <= max_size)):
cur.length = end - cur.start
cur.ranges.append((start-cur.start, size))
else:
=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py 2007-12-08 16:26:34 +0000
+++ b/bzrlib/transport/http/__init__.py 2007-12-08 23:15:18 +0000
@@ -188,12 +188,14 @@
# specified.
_bytes_to_read_before_seek = 128
# No limit on the offset number that get combined into one, we are trying
- # to avoid downloading the whole file. But see _pycurl.py for a different
- # use.
+ # to avoid downloading the whole file.
_max_readv_combine = 0
# By default Apache has a limit of ~400 ranges before replying with a 400
# Bad Request. So we go underneath that amount to be safe.
_max_get_ranges = 200
+ # We impose no limit on the range size. But see _pycurl.py for a different
+ # use.
+ _get_max_size = 0
def _readv(self, relpath, offsets):
"""Get parts of the file at the given relative path.
@@ -214,7 +216,8 @@
sorted_offsets = sorted(offsets)
coalesced = self._coalesce_offsets(
sorted_offsets, limit=self._max_readv_combine,
- fudge_factor=self._bytes_to_read_before_seek)
+ fudge_factor=self._bytes_to_read_before_seek,
+ max_size=self._get_max_size)
# Turn it into a list, we will iterate it several times
coalesced = list(coalesced)
@@ -268,25 +271,49 @@
def _coalesce_readv(self, relpath, coalesced):
"""Issue several GET requests to satisfy the coalesced offsets"""
- total = len(coalesced)
- if self._range_hint == 'multi':
- max_ranges = self._max_get_ranges
- elif self._range_hint == 'single':
- max_ranges = total
+
+ def get_and_yield(relpath, coalesced):
+ if coalesced:
+ # Note that the _get below may raise
+ # errors.InvalidHttpRange. It's the caller's responsability to
+ # decide how to retry since it may provide different coalesced
+ # offsets.
+ code, rfile = self._get(relpath, coalesced)
+ for coal in coalesced:
+ yield coal, rfile
+
+ if self._range_hint is None:
+ # Download whole file
+ for c, rfile in get_and_yield(relpath, coalesced):
+ yield c, rfile
else:
- # The whole file will be downloaded anyway
- max_ranges = total
- # TODO: Some web servers may ignore the range requests and return the
- # whole file, we may want to detect that and avoid further requests.
- # Hint: test_readv_multiple_get_requests will fail once we do that
- for group in xrange(0, len(coalesced), max_ranges):
- ranges = coalesced[group:group+max_ranges]
- # Note that the following may raise errors.InvalidHttpRange. It's
- # the caller's responsability to decide how to retry since it may
- # provide different coalesced offsets.
- code, rfile = self._get(relpath, ranges)
- for range in ranges:
- yield range, rfile
+ total = len(coalesced)
+ if self._range_hint == 'multi':
+ max_ranges = self._max_get_ranges
+ else: # self._range_hint == 'single'
+ max_ranges = total
+ # TODO: Some web servers may ignore the range requests and return
+ # the whole file, we may want to detect that and avoid further
+ # requests.
+ # Hint: test_readv_multiple_get_requests will fail once we do that
+ cumul = 0
+ ranges = []
+ for coal in coalesced:
+ if ((self._get_max_size > 0
+ and cumul + coal.length > self._get_max_size)
+ or len(ranges) >= max_ranges):
+ # Get that much and yield
+ for c, rfile in get_and_yield(relpath, ranges):
+ yield c, rfile
+ # Restart with the current offset
+ ranges = [coal]
+ cumul = coal.length
+ else:
+ ranges.append(coal)
+ cumul += coal.length
+ # Get the rest and yield
+ for c, rfile in get_and_yield(relpath, ranges):
+ yield c, rfile
def recommended_page_size(self):
"""See Transport.recommended_page_size().
@@ -410,7 +437,11 @@
return self.__class__(self.abspath(offset), self)
def _attempted_range_header(self, offsets, tail_amount):
- """Prepare a HTTP Range header at a level the server should accept"""
+ """Prepare a HTTP Range header at a level the server should accept.
+
+ :return: the range header representing offsets/tail_amount or None if
+ no header can be built.
+ """
if self._range_hint == 'multi':
# Generate the header describing all offsets
=== modified file 'bzrlib/transport/http/_pycurl.py'
--- a/bzrlib/transport/http/_pycurl.py 2007-12-06 22:46:16 +0000
+++ b/bzrlib/transport/http/_pycurl.py 2007-12-08 23:15:18 +0000
@@ -212,9 +212,9 @@
# exploit the results as soon as they are received (pycurl limitation) we'd
# better issue more requests and provide a more responsive UI do the cost
# of more latency costs.
- # If you modify this think about modifying the comment in http/__init__.py
+ # If you modify this, think about modifying the comment in http/__init__.py
# too.
- _max_readv_combine = 25
+ _get_max_size = 4 * 1024 * 1024
def _get_ranged(self, relpath, offsets, tail_amount):
"""Make a request for just part of the file."""
=== modified file 'bzrlib/transport/http/response.py'
--- a/bzrlib/transport/http/response.py 2007-12-08 14:31:01 +0000
+++ b/bzrlib/transport/http/response.py 2007-12-08 23:15:18 +0000
@@ -64,6 +64,9 @@
self._path = path
self._file = infile
self._boundary = None
+ # When using multi parts response, this will be set with the headers
+ # associated with the range currently read.
+ self._headers = None
# Default to the whole file of unspecified size
self.set_range(0, -1)
@@ -105,9 +108,9 @@
Parse the headers including the empty line following them so that we
are ready to read the data itself.
"""
- msg = httplib.HTTPMessage(self._file, seekable=0)
+ self._headers = httplib.HTTPMessage(self._file, seekable=0)
# Extract the range definition
- content_range = msg.getheader('content-range', None)
+ content_range = self._headers.getheader('content-range', None)
if content_range is None:
raise errors.InvalidHttpResponse(
self._path,
@@ -115,13 +118,12 @@
self.set_range_from_header(content_range)
def set_range_from_header(self, content_range):
- """Create a new range from its description in the headers"""
+ """Helper to set the new range from its description in the headers"""
try:
rtype, values = content_range.split()
except ValueError:
raise errors.InvalidHttpRange(self._path, content_range,
- "Malformed Content-Range header '%s'"
- % content_range)
+ 'Malformed header')
if rtype != 'bytes':
raise errors.InvalidHttpRange(self._path, content_range,
"Unsupported range type '%s'" % rtype)
@@ -135,11 +137,11 @@
end = int(end)
except ValueError:
raise errors.InvalidHttpRange(self._path, content_range,
- "Invalid range values '%s'" % values)
+ 'Invalid range values')
size = end - start + 1
if size <= 0:
raise errors.InvalidHttpRange(self._path, content_range,
- "Invalid range values '%s'" % values)
+ 'Invalid range, size <= 0')
self.set_range(start, size)
def _checked_read(self, size):
@@ -178,6 +180,7 @@
limited = min(limited, size)
data = self._file.read(limited)
else:
+ # Size of file unknown
data = self._file.read(size)
# Update _pos respecting the data effectively read
self._pos += len(data)
More information about the bazaar-commits
mailing list