Rev 2521: First step to fix #115209 use _coalesce_offsets like other transports. in file:///v/home/vila/src/bugs/115209/

Vincent Ladeuil v.ladeuil+lp at free.fr
Tue Jun 12 15:49:01 BST 2007


At file:///v/home/vila/src/bugs/115209/

------------------------------------------------------------
revno: 2521
revision-id: v.ladeuil+lp at free.fr-20070612144857-1swhz5c3qo8n76l8
parent: pqm at pqm.ubuntu.com-20070608134340-flu6dlpzyo7izrrs
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 115209
timestamp: Tue 2007-06-12 16:48:57 +0200
message:
  First step to fix #115209 use _coalesce_offsets like other transports.
  
  * bzrlib/transport/http/__init__.py:
  (HttpTransportBase._bytes_to_read_before_seek): Setting it to 512
  to minimize additional bytes transferred but still reducing number
  of ranges.
  (HttpTransportBase.readv): _coalesce_offsets does a better job
  than offset_to_ranges/ Reduce code duplication too.
  (HttpTransportBase.offsets_to_ranges): Deprecated.
  (HttpTransportBase._attempted_range_header): Made private. Adapted
  to offsets.
  (HttpTransportBase._range_header): Made private. Adapted to
  offsets.
  
  * bzrlib/transport/http/_urllib.py:
  (HttpTransport_urllib._get): s/ranges/offsets/
  
  * bzrlib/transport/http/_pycurl.py:
  (PyCurlTransport._get, PyCurlTransport._get_ranged): s/ranges/offsets/
  
  * bzrlib/tests/__init__.py:
  (Transport._coalesce_offsets): Fix minor typo.
  
  * bzrlib/tests/test_http.py:
  (TestOffsets): Deleted. We rely on Transport._coalesce_offsets
  only now.
  (TestRangeHeader.check_header): Convert ranges to offets and use
  Transport._coalesce_offsets.
  (TestRangeHeader.test_range_header_single): Use tuples not lists !
  (TestRanges._file_contents): Convert ranges to offets and use
  Transport._coalesce_offsets. And fix indentation too.
  (TestRanges._file_tail): Fix indentation.
  (TestRanges.test_range_header): Use _file_contents so that we can
  still use ranges.
modified:
  bzrlib/tests/test_http.py      testhttp.py-20051018020158-b2eef6e867c514d9
  bzrlib/transport/__init__.py   transport.py-20050711165921-4978aa7ce1285ad5
  bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
  bzrlib/transport/http/_pycurl.py pycurlhttp.py-20060110060940-4e2a705911af77a6
  bzrlib/transport/http/_urllib.py _urlgrabber.py-20060113083826-0bbf7d992fbf090c
-------------- next part --------------
=== modified file 'bzrlib/tests/test_http.py'
--- a/bzrlib/tests/test_http.py	2007-05-06 06:59:57 +0000
+++ b/bzrlib/tests/test_http.py	2007-06-12 14:48:57 +0000
@@ -65,6 +65,7 @@
     WallRequestHandler,
     )
 from bzrlib.transport import (
+    _CoalescedOffset,
     do_catching_redirections,
     get_transport,
     Transport,
@@ -361,30 +362,6 @@
         self.assertIsInstance(t, HttpTransport_urllib)
 
 
-class TestOffsets(TestCase):
-    """Test offsets_to_ranges method"""
-
-    def test_offsets_to_ranges_simple(self):
-        to_range = HttpTransportBase.offsets_to_ranges
-        ranges = to_range([(10, 1)])
-        self.assertEqual([[10, 10]], ranges)
-
-        ranges = to_range([(0, 1), (1, 1)])
-        self.assertEqual([[0, 1]], ranges)
-
-        ranges = to_range([(1, 1), (0, 1)])
-        self.assertEqual([[0, 1]], ranges)
-
-    def test_offset_to_ranges_overlapped(self):
-        to_range = HttpTransportBase.offsets_to_ranges
-
-        ranges = to_range([(10, 1), (20, 2), (22, 5)])
-        self.assertEqual([[10, 10], [20, 26]], ranges)
-
-        ranges = to_range([(10, 1), (11, 2), (22, 5)])
-        self.assertEqual([[10, 12], [22, 26]], ranges)
-
-
 class TestPost(object):
 
     def _test_post_body_is_received(self, scheme):
@@ -427,12 +404,15 @@
     """Test range_header method"""
 
     def check_header(self, value, ranges=[], tail=0):
-        range_header = HttpTransportBase.range_header
-        self.assertEqual(value, range_header(ranges, tail))
+        offsets = [ (start, end - start + 1) for start, end in ranges]
+        coalesce = Transport._coalesce_offsets
+        coalesced = list(coalesce(offsets, limit=0, fudge_factor=0))
+        range_header = HttpTransportBase._range_header
+        self.assertEqual(value, range_header(coalesced, tail))
 
     def test_range_header_single(self):
-        self.check_header('0-9', ranges=[[0,9]])
-        self.check_header('100-109', ranges=[[100,109]])
+        self.check_header('0-9', ranges=[(0,9)])
+        self.check_header('100-109', ranges=[(100,109)])
 
     def test_range_header_tail(self):
         self.check_header('-10', tail=10)
@@ -926,18 +906,21 @@
         server = self.get_readonly_server()
         self.transport = self._transport(server.get_url())
 
-    def _file_contents(self, relpath, ranges, tail_amount=0):
-         code, data = self.transport._get(relpath, ranges)
-         self.assertTrue(code in (200, 206),'_get returns: %d' % code)
-         for start, end in ranges:
-             data.seek(start)
-             yield data.read(end - start + 1)
+    def _file_contents(self, relpath, ranges):
+        offsets = [ (start, end - start + 1) for start, end in ranges]
+        coalesce = self.transport._coalesce_offsets
+        coalesced = list(coalesce(offsets, limit=0, fudge_factor=0))
+        code, data = self.transport._get(relpath, coalesced)
+        self.assertTrue(code in (200, 206),'_get returns: %d' % code)
+        for start, end in ranges:
+            data.seek(start)
+            yield data.read(end - start + 1)
 
     def _file_tail(self, relpath, tail_amount):
-         code, data = self.transport._get(relpath, [], tail_amount)
-         self.assertTrue(code in (200, 206),'_get returns: %d' % code)
-         data.seek(-tail_amount + 1, 2)
-         return data.read(tail_amount)
+        code, data = self.transport._get(relpath, [], tail_amount)
+        self.assertTrue(code in (200, 206),'_get returns: %d' % code)
+        data.seek(-tail_amount + 1, 2)
+        return data.read(tail_amount)
 
     def test_range_header(self):
         # Valid ranges
@@ -946,11 +929,11 @@
         # Tail
         self.assertEqual('789', self._file_tail('a', 3))
         # Syntactically invalid range
-        self.assertRaises(errors.InvalidRange,
-                          self.transport._get, 'a', [(4, 3)])
+        self.assertListRaises(errors.InvalidRange,
+                          self._file_contents, 'a', [(4, 3)])
         # Semantically invalid range
-        self.assertRaises(errors.InvalidRange,
-                          self.transport._get, 'a', [(42, 128)])
+        self.assertListRaises(errors.InvalidRange,
+                          self._file_contents, 'a', [(42, 128)])
 
 
 class TestRanges_urllib(TestRanges, TestCaseWithWebserver):

=== modified file 'bzrlib/transport/__init__.py'
--- a/bzrlib/transport/__init__.py	2007-05-08 20:00:50 +0000
+++ b/bzrlib/transport/__init__.py	2007-06-12 14:48:57 +0000
@@ -575,7 +575,7 @@
         :param fudge_factor: All transports have some level of 'it is
                 better to read some more data and throw it away rather 
                 than seek', so collapse if we are 'close enough'
-        :return: yield _CoalescedOffset objects, which have members for wher
+        :return: yield _CoalescedOffset objects, which have members for where
                 to start, how much to read, and how to split those 
                 chunks back up
         """

=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py	2007-05-06 06:59:57 +0000
+++ b/bzrlib/transport/http/__init__.py	2007-06-12 14:48:57 +0000
@@ -28,8 +28,13 @@
 
 from bzrlib import errors, ui
 from bzrlib.smart import medium
+from bzrlib.symbol_versioning import (
+        deprecated_method,
+        zero_seventeen,
+        )
 from bzrlib.trace import mutter
 from bzrlib.transport import (
+    _CoalescedOffset,
     Transport,
     )
 
@@ -232,7 +237,7 @@
 
         :param relpath: Path relative to transport base URL
         :param ranges: None to get the whole file;
-            or [(start,end)+], a list of tuples to fetch parts of a file.
+            or  a list of _CoalescedOffset to fetch parts of a file.
         :param tail_amount: The amount to get from the end of the file.
 
         :returns: (http_code, result_file)
@@ -279,30 +284,47 @@
             # unsuccessful
             raise exc_info[0], exc_info[1], exc_info[2]
 
+    # Having to round trip to the server means waiting for a response, so it is
+    # better to download extra bytes. We have two constraints to satisfy when
+    # choosing the right value: a request with too much ranges will make some
+    # servers issue a '400: Bad request' error (when we have too much ranges,
+    # we exceed the apache header max size (8190 by default) for example) ;
+    # coalescing too much ranges will increasing the data transferred and
+    # degrade performances.
+    _bytes_to_read_before_seek = 512
+    # No limit on the offset number combined into one, we are trying to avoid
+    # downloading the whole file.
+    _max_readv_combined = 0
+
     def readv(self, relpath, offsets):
         """Get parts of the file at the given relative path.
 
         :param offsets: A list of (offset, size) tuples.
         :param return: A list or generator of (offset, data) tuples
         """
-        ranges = self.offsets_to_ranges(offsets)
-        mutter('http readv of %s collapsed %s offsets => %s',
-                relpath, len(offsets), ranges)
+        sorted_offsets = sorted(list(offsets))
+        fudge = self._bytes_to_read_before_seek
+        coalesced = self._coalesce_offsets(sorted_offsets,
+                                           limit=self._max_readv_combine,
+                                           fudge_factor=fudge)
+        coalesced = list(coalesced)
+        mutter('http readv of %s  offsets => %s collapsed %s',
+                relpath, len(offsets), len(coalesced))
 
         try_again = True
         while try_again:
             try_again = False
             try:
-                code, f = self._get(relpath, ranges)
+                code, f = self._get(relpath, coalesced)
             except (errors.InvalidRange, errors.ShortReadvError), e:
-                try_again, code, f = self._retry_get(relpath, ranges,
+                try_again, code, f = self._retry_get(relpath, coalesced,
                                                      sys.exc_info())
 
         for start, size in offsets:
             try_again = True
             while try_again:
                 try_again = False
-                f.seek(start, (start < 0) and 2 or 0)
+                f.seek(start, ((start < 0) and 2) or 0)
                 start = f.tell()
                 try:
                     data = f.read(size)
@@ -313,12 +335,13 @@
                     # Note that we replace 'f' here and that it
                     # may need cleaning one day before being
                     # thrown that way.
-                    try_again, code, f = self._retry_get(relpath, ranges,
+                    try_again, code, f = self._retry_get(relpath, coalesced,
                                                          sys.exc_info())
             # After one or more tries, we get the data.
             yield start, data
 
     @staticmethod
+    @deprecated_method(zero_seventeen)
     def offsets_to_ranges(offsets):
         """Turn a list of offsets and sizes into a list of byte ranges.
 
@@ -453,37 +476,40 @@
         else:
             return self.__class__(self.abspath(offset), self)
 
-    def attempted_range_header(self, ranges, tail_amount):
+    def _attempted_range_header(self, offsets, tail_amount):
         """Prepare a HTTP Range header at a level the server should accept"""
 
         if self._range_hint == 'multi':
             # Nothing to do here
-            return self.range_header(ranges, tail_amount)
+            return self._range_header(offsets, tail_amount)
         elif self._range_hint == 'single':
             # Combine all the requested ranges into a single
             # encompassing one
-            if len(ranges) > 0:
-                start, ignored = ranges[0]
-                ignored, end = ranges[-1]
+            if len(offsets) > 0:
                 if tail_amount not in (0, None):
-                    # Nothing we can do here to combine ranges
-                    # with tail_amount, just returns None. The
-                    # whole file should be downloaded.
+                    # Nothing we can do here to combine ranges with tail_amount
+                    # in a single range, just returns None. The whole file
+                    # should be downloaded.
                     return None
                 else:
-                    return self.range_header([(start, end)], 0)
+                    start = offsets[0].start
+                    last = offsets[-1]
+                    end = last.start + last.length - 1
+                    whole = self._coalesce_offsets([(start, end - start + 1)],
+                                                   limit=0, fudge_factor=0)
+                    return self._range_header(list(whole), 0)
             else:
                 # Only tail_amount, requested, leave range_header
                 # do its work
-                return self.range_header(ranges, tail_amount)
+                return self._range_header(offsets, tail_amount)
         else:
             return None
 
     @staticmethod
-    def range_header(ranges, tail_amount):
+    def _range_header(ranges, tail_amount):
         """Turn a list of bytes ranges into a HTTP Range header value.
 
-        :param ranges: A list of byte ranges, (start, end).
+        :param ranges: A list of _CoalescedOffset
         :param tail_amount: The amount to get from the end of the file.
 
         :return: HTTP range header string.
@@ -492,8 +518,9 @@
         provided.
         """
         strings = []
-        for start, end in ranges:
-            strings.append('%d-%d' % (start, end))
+        for offset in ranges:
+            strings.append('%d-%d' % (offset.start,
+                                      offset.start + offset.length - 1))
 
         if tail_amount:
             strings.append('-%d' % tail_amount)

=== modified file 'bzrlib/transport/http/_pycurl.py'
--- a/bzrlib/transport/http/_pycurl.py	2007-05-06 06:59:57 +0000
+++ b/bzrlib/transport/http/_pycurl.py	2007-06-12 14:48:57 +0000
@@ -136,10 +136,10 @@
         else:
             self._raise_curl_http_error(curl)
 
-    def _get(self, relpath, ranges, tail_amount=0):
+    def _get(self, relpath, offsets, tail_amount=0):
         # This just switches based on the type of request
-        if ranges is not None or tail_amount not in (0, None):
-            return self._get_ranged(relpath, ranges, tail_amount=tail_amount)
+        if offsets is not None or tail_amount not in (0, None):
+            return self._get_ranged(relpath, offsets, tail_amount=tail_amount)
         else:
             return self._get_full(relpath)
 
@@ -189,12 +189,12 @@
 
         return code, data
 
-    def _get_ranged(self, relpath, ranges, tail_amount):
+    def _get_ranged(self, relpath, offsets, tail_amount):
         """Make a request for just part of the file."""
         curl = self._curl
         abspath, data, header = self._setup_get_request(curl, relpath)
 
-        range_header = self.attempted_range_header(ranges, tail_amount)
+        range_header = self._attempted_range_header(offsets, tail_amount)
         if range_header is None:
             # Forget ranges, the server can't handle them
             return self._get_full(relpath)

=== modified file 'bzrlib/transport/http/_urllib.py'
--- a/bzrlib/transport/http/_urllib.py	2007-04-22 16:32:04 +0000
+++ b/bzrlib/transport/http/_urllib.py	2007-06-12 14:48:57 +0000
@@ -113,13 +113,13 @@
 
         return response
 
-    def _get(self, relpath, ranges, tail_amount=0):
+    def _get(self, relpath, offsets, tail_amount=0):
         """See HttpTransport._get"""
 
         abspath = self._real_abspath(relpath)
         headers = {}
-        if ranges or tail_amount:
-            range_header = self.attempted_range_header(ranges, tail_amount)
+        if offsets or tail_amount:
+            range_header = self._attempted_range_header(offsets, tail_amount)
             if range_header is not None:
                 bytes = 'bytes=' + range_header
                 headers = {'Range': bytes}



More information about the bazaar-commits mailing list