Rev 3076: Limit GET requests by body size instead of number of ranges. in file:///v/home/vila/src/bzr/bugs/173010/

Vincent Ladeuil v.ladeuil+lp at free.fr
Sat Dec 8 23:15:24 GMT 2007


At file:///v/home/vila/src/bzr/bugs/173010/

------------------------------------------------------------
revno: 3076
revision-id:v.ladeuil+lp at free.fr-20071208231518-sj2ui57xyd4mkjra
parent: v.ladeuil+lp at free.fr-20071208162634-j0g3q0xjr1g7abam
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 173010
timestamp: Sun 2007-12-09 00:15:18 +0100
message:
  Limit GET requests by body size instead of number of ranges.
  
  * bzrlib/transport/http/response.py:
  Fix some error messages.
  (RangeFile.read_range_definition): Keep the range headers in the
  _headers attribute for easier debugging (especially for remote
  debugging).
  
  * bzrlib/transport/http/_pycurl.py:
  (PyCurlTransport): Replace _max_readv_combine by _get_max_size
  which is more appropriate to the problem.
  
  * bzrlib/transport/http/__init__.py:
  (HttpTransportBase): Add a _get_max_size class attribute
  corresponding to the max_size parameter of _coalesce_offsets.
  (HttpTransportBase._coalesce_readv): Limit the size of the get
  requests if _get_max_size is greater than 0 while still respecting
  the maximum number of ranges in a request.
  
  * bzrlib/tests/test_http.py:
  (TestRangeRequestServer.test_readv_get_max_size): Test the
  _get_max_size parameter.
  
  * bzrlib/transport/__init__.py:
  (Transport._coalesce_offsets): Add a max_size parameter limiting
  the size of the coalesced offsets.
  
  * bzrlib/tests/test_transport.py:
  (TestCoalesceOffsets.check): Add the max_size parameter.
  (TestCoalesceOffsets.test_coalesce_max_size,
  TestCoalesceOffsets.test_coalesce_no_max_size): Test the max_size
  parameter.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/tests/test_http.py      testhttp.py-20051018020158-b2eef6e867c514d9
  bzrlib/tests/test_transport.py testtransport.py-20050718175618-e5cdb99f4555ddce
  bzrlib/transport/__init__.py   transport.py-20050711165921-4978aa7ce1285ad5
  bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
  bzrlib/transport/http/_pycurl.py pycurlhttp.py-20060110060940-4e2a705911af77a6
  bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS	2007-12-07 23:03:34 +0000
+++ b/NEWS	2007-12-08 23:15:18 +0000
@@ -76,7 +76,9 @@
      (John Arbash Meinel, #165290)
 
    * Give more feedback during long http downloads by making readv deliver data
-     as it arrives for urllib, and issue more requests for pycurl.
+     as it arrives for urllib, and issue more requests for pycurl. High latency
+     networks are better handled by urllib; the pycurl implementation gives more
+     feedback but also incurs more latency.
      (Vincent Ladeuil, #173010)
 
    * Fall back to showing e-mail in ``log --short/--line`` if the 

=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py	2007-12-08 14:50:52 +0000
+++ b/bzrlib/tests/test_http.py	2007-12-08 23:15:18 +0000
@@ -660,7 +660,22 @@
         self.assertEqual(l[2], (3, '34'))
         self.assertEqual(l[3], (9, '9'))
         # The server should have issued 4 requests
-        self.assertEqual(4, self.get_readonly_server().GET_request_nb)
+        self.assertEqual(4, server.GET_request_nb)
+
+    def test_readv_get_max_size(self):
+        server = self.get_readonly_server()
+        t = self._transport(server.get_url())
+        # force transport to issue multiple requests by limiting the number of
+        # bytes by request. Note that this applies to coalesced offsets only; a
+        # single range will keep its size even if bigger than the limit.
+        t._get_max_size = 2
+        l = list(t.readv('a', ((0, 1), (1, 1), (2, 4), (6, 4))))
+        self.assertEqual(l[0], (0, '0'))
+        self.assertEqual(l[1], (1, '1'))
+        self.assertEqual(l[2], (2, '2345'))
+        self.assertEqual(l[3], (6, '6789'))
+        # The server should have issued 3 requests
+        self.assertEqual(3, server.GET_request_nb)
 
 
 class TestSingleRangeRequestServer(TestRangeRequestServer):

=== modified file 'bzrlib/tests/test_transport.py'
--- a/bzrlib/tests/test_transport.py	2007-11-19 13:44:25 +0000
+++ b/bzrlib/tests/test_transport.py	2007-12-08 23:15:18 +0000
@@ -152,11 +152,12 @@
 
 
 class TestCoalesceOffsets(TestCase):
-    
-    def check(self, expected, offsets, limit=0, fudge=0):
+
+    def check(self, expected, offsets, limit=0, max_size=0, fudge=0):
         coalesce = Transport._coalesce_offsets
         exp = [_CoalescedOffset(*x) for x in expected]
-        out = list(coalesce(offsets, limit=limit, fudge_factor=fudge))
+        out = list(coalesce(offsets, limit=limit, fudge_factor=fudge,
+                            max_size=max_size))
         self.assertEqual(exp, out)
 
     def test_coalesce_empty(self):
@@ -169,7 +170,7 @@
         self.check([(0, 10, [(0, 10)]),
                     (20, 10, [(0, 10)]),
                    ], [(0, 10), (20, 10)])
-            
+
     def test_coalesce_unsorted(self):
         self.check([(20, 10, [(0, 10)]),
                     (0, 10, [(0, 10)]),
@@ -179,6 +180,7 @@
         self.check([(0, 20, [(0, 10), (10, 10)])],
                    [(0, 10), (10, 10)])
 
+    # XXX: scary, http.readv() can't handle that
     def test_coalesce_overlapped(self):
         self.check([(0, 15, [(0, 10), (5, 10)])],
                    [(0, 10), (5, 10)])
@@ -208,6 +210,20 @@
                    ], [(10, 10), (30, 10), (100, 10)],
                    fudge=10
                   )
+    def test_coalesce_max_size(self):
+        self.check([(10, 20, [(0, 10), (10, 10)]),
+                    (30, 50, [(0, 50)]),
+                    # If one range is above max_size, it gets its own coalesced
+                    # offset
+                    (100, 80, [(0, 80),]),],
+                   [(10, 10), (20, 10), (30, 50), (100, 80)],
+                   max_size=50
+                  )
+
+    def test_coalesce_no_max_size(self):
+        self.check([(10, 170, [(0, 10), (10, 10), (20, 50), (70, 100)]),],
+                   [(10, 10), (20, 10), (30, 50), (80, 100)],
+                  )
 
 
 class TestMemoryTransport(TestCase):

=== modified file 'bzrlib/transport/__init__.py'
--- a/bzrlib/transport/__init__.py	2007-12-03 08:33:06 +0000
+++ b/bzrlib/transport/__init__.py	2007-12-08 23:15:18 +0000
@@ -792,7 +792,7 @@
         return offsets
 
     @staticmethod
-    def _coalesce_offsets(offsets, limit, fudge_factor):
+    def _coalesce_offsets(offsets, limit=0, fudge_factor=0, max_size=0):
         """Yield coalesced offsets.
 
         With a long list of neighboring requests, combine them
@@ -801,13 +801,21 @@
         Turns  [(15, 10), (25, 10)] => [(15, 20, [(0, 10), (10, 10)])]
 
         :param offsets: A list of (start, length) pairs
-        :param limit: Only combine a maximum of this many pairs
-                      Some transports penalize multiple reads more than
-                      others, and sometimes it is better to return early.
-                      0 means no limit
+
+        :param limit: Only combine a maximum of this many pairs Some transports
+                penalize multiple reads more than others, and sometimes it is
+                better to return early.
+                0 means no limit
+
         :param fudge_factor: All transports have some level of 'it is
                 better to read some more data and throw it away rather 
                 than seek', so collapse if we are 'close enough'
+
+        :param max_size: Create coalesced offsets no bigger than this size.
+                When a single offset is bigger than 'max_size', it will keep
+                its size and be alone in the coalesced offset.
+                0 means no maximum size.
+
         :return: yield _CoalescedOffset objects, which have members for where
                 to start, how much to read, and how to split those 
                 chunks back up
@@ -817,10 +825,11 @@
 
         for start, size in offsets:
             end = start + size
-            if (last_end is not None 
+            if (last_end is not None
                 and start <= last_end + fudge_factor
                 and start >= cur.start
-                and (limit <= 0 or len(cur.ranges) < limit)):
+                and (limit <= 0 or len(cur.ranges) < limit)
+                and (max_size <= 0 or end - cur.start <= max_size)):
                 cur.length = end - cur.start
                 cur.ranges.append((start-cur.start, size))
             else:

=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py	2007-12-08 16:26:34 +0000
+++ b/bzrlib/transport/http/__init__.py	2007-12-08 23:15:18 +0000
@@ -188,12 +188,14 @@
     # specified.
     _bytes_to_read_before_seek = 128
     # No limit on the offset number that get combined into one, we are trying
-    # to avoid downloading the whole file. But see _pycurl.py for a different
-    # use.
+    # to avoid downloading the whole file.
     _max_readv_combine = 0
     # By default Apache has a limit of ~400 ranges before replying with a 400
     # Bad Request. So we go underneath that amount to be safe.
     _max_get_ranges = 200
+    # We impose no limit on the range size. But see _pycurl.py for a different
+    # use.
+    _get_max_size = 0
 
     def _readv(self, relpath, offsets):
         """Get parts of the file at the given relative path.
@@ -214,7 +216,8 @@
             sorted_offsets = sorted(offsets)
             coalesced = self._coalesce_offsets(
                 sorted_offsets, limit=self._max_readv_combine,
-                fudge_factor=self._bytes_to_read_before_seek)
+                fudge_factor=self._bytes_to_read_before_seek,
+                max_size=self._get_max_size)
 
             # Turn it into a list, we will iterate it several times
             coalesced = list(coalesced)
@@ -268,25 +271,49 @@
 
     def _coalesce_readv(self, relpath, coalesced):
         """Issue several GET requests to satisfy the coalesced offsets"""
-        total = len(coalesced)
-        if self._range_hint == 'multi':
-             max_ranges = self._max_get_ranges
-        elif self._range_hint == 'single':
-             max_ranges = total
+
+        def get_and_yield(relpath, coalesced):
+            if coalesced:
+                # Note that the _get below may raise
+                # errors.InvalidHttpRange. It's the caller's responsability to
+                # decide how to retry since it may provide different coalesced
+                # offsets.
+                code, rfile = self._get(relpath, coalesced)
+                for coal in coalesced:
+                    yield coal, rfile
+
+        if self._range_hint is None:
+            # Download whole file
+            for c, rfile in get_and_yield(relpath, coalesced):
+                yield c, rfile
         else:
-            # The whole file will be downloaded anyway
-            max_ranges = total
-        # TODO: Some web servers may ignore the range requests and return the
-        # whole file, we may want to detect that and avoid further requests.
-        # Hint: test_readv_multiple_get_requests will fail once we do that
-        for group in xrange(0, len(coalesced), max_ranges):
-            ranges = coalesced[group:group+max_ranges]
-            # Note that the following may raise errors.InvalidHttpRange. It's
-            # the caller's responsability to decide how to retry since it may
-            # provide different coalesced offsets.
-            code, rfile = self._get(relpath, ranges)
-            for range in ranges:
-                yield range, rfile
+            total = len(coalesced)
+            if self._range_hint == 'multi':
+                max_ranges = self._max_get_ranges
+            else: # self._range_hint == 'single'
+                max_ranges = total
+            # TODO: Some web servers may ignore the range requests and return
+            # the whole file, we may want to detect that and avoid further
+            # requests.
+            # Hint: test_readv_multiple_get_requests will fail once we do that
+            cumul = 0
+            ranges = []
+            for coal in coalesced:
+                if ((self._get_max_size > 0
+                     and cumul + coal.length > self._get_max_size)
+                    or len(ranges) >= max_ranges):
+                    # Get that much and yield
+                    for c, rfile in get_and_yield(relpath, ranges):
+                        yield c, rfile
+                    # Restart with the current offset
+                    ranges = [coal]
+                    cumul = coal.length
+                else:
+                    ranges.append(coal)
+                    cumul += coal.length
+            # Get the rest and yield
+            for c, rfile in get_and_yield(relpath, ranges):
+                yield c, rfile
 
     def recommended_page_size(self):
         """See Transport.recommended_page_size().
@@ -410,7 +437,11 @@
             return self.__class__(self.abspath(offset), self)
 
     def _attempted_range_header(self, offsets, tail_amount):
-        """Prepare a HTTP Range header at a level the server should accept"""
+        """Prepare a HTTP Range header at a level the server should accept.
+
+        :return: the range header representing offsets/tail_amount or None if
+            no header can be built.
+        """
 
         if self._range_hint == 'multi':
             # Generate the header describing all offsets

=== modified file 'bzrlib/transport/http/_pycurl.py'
--- a/bzrlib/transport/http/_pycurl.py	2007-12-06 22:46:16 +0000
+++ b/bzrlib/transport/http/_pycurl.py	2007-12-08 23:15:18 +0000
@@ -212,9 +212,9 @@
     # exploit the results as soon as they are received (pycurl limitation) we'd
     # better issue more requests and provide a more responsive UI do the cost
     # of more latency costs.
-    # If you modify this think about modifying the comment in http/__init__.py
+    # If you modify this, think about modifying the comment in http/__init__.py
     # too.
-    _max_readv_combine = 25
+    _get_max_size = 4 * 1024 * 1024
 
     def _get_ranged(self, relpath, offsets, tail_amount):
         """Make a request for just part of the file."""

=== modified file 'bzrlib/transport/http/response.py'
--- a/bzrlib/transport/http/response.py	2007-12-08 14:31:01 +0000
+++ b/bzrlib/transport/http/response.py	2007-12-08 23:15:18 +0000
@@ -64,6 +64,9 @@
         self._path = path
         self._file = infile
         self._boundary = None
+        # When using multi parts response, this will be set with the headers
+        # associated with the range currently read.
+        self._headers = None
         # Default to the whole file of unspecified size
         self.set_range(0, -1)
 
@@ -105,9 +108,9 @@
         Parse the headers including the empty line following them so that we
         are ready to read the data itself.
         """
-        msg = httplib.HTTPMessage(self._file, seekable=0)
+        self._headers = httplib.HTTPMessage(self._file, seekable=0)
         # Extract the range definition
-        content_range = msg.getheader('content-range', None)
+        content_range = self._headers.getheader('content-range', None)
         if content_range is None:
             raise errors.InvalidHttpResponse(
                 self._path,
@@ -115,13 +118,12 @@
         self.set_range_from_header(content_range)
 
     def set_range_from_header(self, content_range):
-        """Create a new range from its description in the headers"""
+        """Helper to set the new range from its description in the headers"""
         try:
             rtype, values = content_range.split()
         except ValueError:
             raise errors.InvalidHttpRange(self._path, content_range,
-                                          "Malformed Content-Range header '%s'"
-                                          % content_range)
+                                          'Malformed header')
         if rtype != 'bytes':
             raise errors.InvalidHttpRange(self._path, content_range,
                                           "Unsupported range type '%s'" % rtype)
@@ -135,11 +137,11 @@
             end = int(end)
         except ValueError:
             raise errors.InvalidHttpRange(self._path, content_range,
-                                          "Invalid range values '%s'" % values)
+                                          'Invalid range values')
         size = end - start + 1
         if size <= 0:
             raise errors.InvalidHttpRange(self._path, content_range,
-                                          "Invalid range values '%s'" % values)
+                                          'Invalid range, size <= 0')
         self.set_range(start, size)
 
     def _checked_read(self, size):
@@ -178,6 +180,7 @@
                 limited = min(limited, size)
             data = self._file.read(limited)
         else:
+            # Size of file unknown
             data = self._file.read(size)
         # Update _pos respecting the data effectively read
         self._pos += len(data)



More information about the bazaar-commits mailing list