Rev 3053: Merge http readv issuing multiple requests since that fixes bug #172701 in file:///v/home/vila/src/bzr/bugs/172701/

Vincent Ladeuil v.ladeuil+lp at free.fr
Thu Nov 29 21:59:49 GMT 2007


At file:///v/home/vila/src/bzr/bugs/172701/

------------------------------------------------------------
revno: 3053
revision-id:v.ladeuil+lp at free.fr-20071129215945-qwzjfzz6kygljh4r
parent: pqm at pqm.ubuntu.com-20071129184101-u9506rihe4zbzyyz
parent: v.ladeuil+lp at free.fr-20071129154333-ddra1mbf487rlofw
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: 172701
timestamp: Thu 2007-11-29 22:59:45 +0100
message:
  Merge http readv issuing multiple requests since that fixes bug #172701
modified:
  bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
  bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
    ------------------------------------------------------------
    revno: 3024.2.3
    revision-id:v.ladeuil+lp at free.fr-20071129154333-ddra1mbf487rlofw
    parent: v.ladeuil+lp at free.fr-20071127093537-gruuxzmso2r6c5pg
    committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
    branch nick: 165061
    timestamp: Thu 2007-11-29 16:43:33 +0100
    message:
      Rewrite http_readv to allow several GET requests. Smoke tested against the branch reported in the bug.
      
      * bzrlib/transport/http/__init__.py:
      (HttpTransportBase._readv): Issue several GET requests if too many
      ranges are requested.
    modified:
      bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
      bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
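
For illustration, the heart of the fix in the diff below is chunking the
coalesced ranges into groups small enough for the server. A minimal
standalone sketch of that idea (hypothetical names, not the bzrlib API):

    def split_into_requests(coalesced, max_ranges):
        # One GET request per group of at most max_ranges coalesced ranges
        for group in xrange(0, len(coalesced), max_ranges):
            yield coalesced[group:group + max_ranges]

    # e.g. 450 coalesced ranges with a cap of 200 become three GET
    # requests covering 200, 200 and 50 ranges respectively.
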
=== modified file 'bzrlib/transport/http/__init__.py'
--- a/bzrlib/transport/http/__init__.py	2007-11-27 08:26:00 +0000
+++ b/bzrlib/transport/http/__init__.py	2007-11-29 15:43:33 +0000
@@ -208,9 +208,9 @@
             self._range_hint = None
             mutter('Retry "%s" without ranges' % relpath)
         else:
-            # We tried all the tricks, but nothing worked. We re-raise original
-            # exception; the 'mutter' calls above will indicate that further
-            # tries were unsuccessful
+            # We tried all the tricks, but nothing worked. We re-raise the
+            # original exception; the 'mutter' calls above will indicate that
+            # further tries were unsuccessful
             raise exc_info[0], exc_info[1], exc_info[2]
 
     def _get_ranges_hinted(self, relpath, ranges):
@@ -251,6 +251,9 @@
     # No limit on the offset number that get combined into one, we are trying
     # to avoid downloading the whole file.
     _max_readv_combine = 0
+    # By default Apache has a limit of ~400 ranges before replying with a
+    # 400 Bad Request, so we stay safely below that limit.
+    _max_get_ranges = 200
 
     def _readv(self, relpath, offsets):
         """Get parts of the file at the given relative path.
@@ -258,41 +261,82 @@
         :param offsets: A list of (offset, size) tuples.
         :param return: A list or generator of (offset, data) tuples
         """
-        sorted_offsets = sorted(list(offsets))
-        fudge = self._bytes_to_read_before_seek
-        coalesced = self._coalesce_offsets(sorted_offsets,
-                                           limit=self._max_readv_combine,
-                                           fudge_factor=fudge)
-        coalesced = list(coalesced)
-        mutter('http readv of %s  offsets => %s collapsed %s',
-                relpath, len(offsets), len(coalesced))
-
-        f = self._get_ranges_hinted(relpath, coalesced)
-        for start, size in offsets:
-            try_again = True
-            while try_again:
-                try_again = False
-                f.seek(start, ((start < 0) and 2) or 0)
-                start = f.tell()
-                try:
-                    data = f.read(size)
-                    if len(data) != size:
-                        raise errors.ShortReadvError(relpath, start, size,
-                                                     actual=len(data))
-                except errors.ShortReadvError, e:
-                    self._degrade_range_hint(relpath, coalesced, sys.exc_info())
-
-                    # Since the offsets and the ranges may not be in the same
-                    # order, we don't try to calculate a restricted single
-                    # range encompassing unprocessed offsets.
-
-                    # Note: we replace 'f' here, it may need cleaning one day
-                    # before being thrown that way.
-                    f = self._get_ranges_hinted(relpath, coalesced)
-                    try_again = True
-
-            # After one or more tries, we get the data.
-            yield start, data
+
+        # offsets may be a generator; we will iterate it several times, so
+        # build a list
+        offsets = list(offsets)
+
+        try_again = True
+        while try_again:
+            try_again = False
+
+            # Coalesce the offsets to minimize the GET requests issued
+            sorted_offsets = sorted(offsets)
+            coalesced = self._coalesce_offsets(
+                sorted_offsets, limit=self._max_readv_combine,
+                fudge_factor=self._bytes_to_read_before_seek)
+
+            # Turn it into a list; we will iterate it several times
+            coalesced = list(coalesced)
+            mutter('http readv of %s  offsets => %s collapsed %s',
+                    relpath, len(offsets), len(coalesced))
+
+            # Cache the data read, but only until it's been used
+            data_map = {}
+            # We will iterate over the data received from the GET requests
+            # and serve the corresponding offsets respecting the initial
+            # order. We need an offset iterator for that.
+            iter_offsets = iter(offsets)
+            cur_offset_and_size = iter_offsets.next()
+
+            try:
+                for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
+                    # Split the received chunk
+                    for offset, size in cur_coal.ranges:
+                        start = cur_coal.start + offset
+                        key = (start, size)
+                        rfile.seek(start, 0)
+                        data = rfile.read(size)
+                        data_len = len(data)
+                        if data_len != size:
+                            raise errors.ShortReadvError(relpath, start, size,
+                                                         actual=data_len)
+                        data_map[key] = data
+
+                    # Yield everything we can
+                    while cur_offset_and_size in data_map:
+                        # Drop the cached data once it has been used
+                        # XXX: will break if offsets contains duplicates --
+                        # vila20071129
+                        this_data = data_map.pop(cur_offset_and_size)
+                        yield cur_offset_and_size[0], this_data
+                        cur_offset_and_size = iter_offsets.next()
+
+            except (errors.ShortReadvError, errors.InvalidRange), e:
+                self._degrade_range_hint(relpath, coalesced, sys.exc_info())
+                # Some offsets may already have been processed, so we retry
+                # only the unsuccessful ones.
+                offsets = [cur_offset_and_size] + list(iter_offsets)
+                try_again = True
+
+    def _coalesce_readv(self, relpath, coalesced):
+        """Issue several GET requests to satisfy the coalesced offsets"""
+        total = len(coalesced)
+        if self._range_hint == 'multi':
+            max_ranges = self._max_get_ranges
+        elif self._range_hint == 'single':
+            max_ranges = total
+        else:
+            # The whole file will be downloaded anyway
+            max_ranges = total
+        for group in xrange(0, len(coalesced), max_ranges):
+            ranges = coalesced[group:group + max_ranges]
+            # Note that the following may raise errors.InvalidRange. It's the
+            # caller's responsibility to decide how to retry since it may
+            # provide different coalesced offsets.
+            code, rfile = self._get(relpath, ranges)
+            for coal in ranges:
+                yield coal, rfile
 
     def recommended_page_size(self):
         """See Transport.recommended_page_size().
@@ -447,7 +489,7 @@
         """Prepare a HTTP Range header at a level the server should accept"""
 
         if self._range_hint == 'multi':
-            # Nothing to do here
+            # Generate the header describing all offsets
             return self._range_header(offsets, tail_amount)
         elif self._range_hint == 'single':
             # Combine all the requested ranges into a single
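
As a side note on the hunk above: with the 'multi' hint the header lists
every coalesced range, while the 'single' case collapses them into one
range. A rough sketch of what the 'multi' header generation amounts to
(hypothetical helper; (start, size) tuples assumed here instead of
bzrlib's coalesced offset objects):

    def range_header(ranges, tail_amount=0):
        # Build the value of a "Range: bytes=..." header covering all
        # the requested ranges.
        strings = ['%d-%d' % (start, start + size - 1)
                   for start, size in ranges]
        if tail_amount:
            # A suffix range asking for the last tail_amount bytes
            strings.append('-%d' % tail_amount)
        return 'bytes=' + ','.join(strings)

    # range_header([(0, 100), (500, 50)]) == 'bytes=0-99,500-549'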

=== modified file 'bzrlib/transport/http/_urllib2_wrappers.py'
--- a/bzrlib/transport/http/_urllib2_wrappers.py	2007-11-05 13:21:30 +0000
+++ b/bzrlib/transport/http/_urllib2_wrappers.py	2007-11-29 15:43:33 +0000
@@ -101,7 +101,8 @@
                 # having issued the response headers (even if the
                 # headers indicate a Content-Type...)
                 body = self.fp.read(self.length)
-                if self.debuglevel > 0:
+                if self.debuglevel > 3:
+                    # This one can be huge and is generally not interesting
                     print "Consumed body: [%s]" % body
             self.close()
         elif self.status == 200:
@@ -1285,7 +1286,7 @@
             )
 
         self.open = self._opener.open
-        if DEBUG >= 2:
+        if DEBUG >= 3:
             # When dealing with handler order, it's easy to mess
             # things up, the following will help understand which
             # handler is used, when and for what.
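
For completeness, a toy model of the trickiest part of the new _readv:
offsets must be served in the caller's original order even though the
GET requests return data sorted by position, so out-of-order chunks are
parked in a dict until their turn comes (hypothetical names, Python 2
style to match the code above; as the XXX comment in the diff notes,
duplicate offsets would break this):

    def serve_in_order(offsets, fetched):
        # offsets: requested (start, size) tuples, in caller order
        # fetched: iterable of ((start, size), data) in sorted order
        data_map = {}
        iter_offsets = iter(offsets)
        cur = iter_offsets.next()
        for key, data in fetched:
            data_map[key] = data
            # Yield every offset whose data is now available
            while cur in data_map:
                yield cur[0], data_map.pop(cur)
                cur = iter_offsets.next()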


