# Bazaar revision bundle v0.7
#
# message:
#   Let the PyCurl transport read multiple ranges
# committer: Johan Rydberg
# date: Thu 2006-06-08 16:30:12.353727102 +0200

=== modified file bzrlib/transport/http/__init__.py
--- bzrlib/transport/http/__init__.py
+++ bzrlib/transport/http/__init__.py
@@ -192,58 +192,28 @@
         :param offsets: A list of (offset, size) tuples.
         :param return: A list or generator of (offset, data) tuples
         """
-        # Ideally we would pass one big request asking for all the ranges in
-        # one go; however then the server will give a multipart mime response
-        # back, and we can't parse them yet. So instead we just get one range
-        # per region, and try to coallesce the regions as much as possible.
-        #
-        # The read-coallescing code is not quite regular enough to have a
-        # single driver routine and
-        # helper method in Transport.
-        def do_combined_read(combined_offsets):
-            # read one coalesced block
-            total_size = 0
-            for offset, size in combined_offsets:
-                total_size += size
-            mutter('readv coalesced %d reads.', len(combined_offsets))
-            offset = combined_offsets[0][0]
-            byte_range = (offset, offset + total_size - 1)
-            code, result_file = self._get(relpath, [byte_range])
-            if code == 206:
-                for off, size in combined_offsets:
+        if not len(offsets):
+            return
+
+        def do_read(offsets):
+            code, result_file = self._get(relpath, offsets)
+            if code == 206: # we were given a partial content response
+                for offset, size in offsets:
                     result_bytes = result_file.read(size)
-                    assert len(result_bytes) == size
-                    yield off, result_bytes
-            elif code == 200:
-                data = result_file.read(offset + total_size)[offset:offset + total_size]
+                    yield offset, result_bytes
+            elif code == 200: # we got the whole thing, split it up
                 pos = 0
-                for offset, size in combined_offsets:
-                    yield offset, data[pos:pos + size]
+                for offset, size in offsets:
+                    if offset != pos:
+                        result_file.read(offset - pos)
+                        pos = offset
+                    yield offset, result_file.read(size)
                     pos += size
-                del data
-        if not len(offsets):
-            return
-        pending_offsets = deque(offsets)
-        combined_offsets = []
-        while len(pending_offsets):
-            offset, size = pending_offsets.popleft()
-            if not combined_offsets:
-                combined_offsets = [[offset, size]]
-            else:
-                if (len (combined_offsets) < 500 and
-                    combined_offsets[-1][0] + combined_offsets[-1][1] == offset):
-                    # combatible offset:
-                    combined_offsets.append([offset, size])
-                else:
-                    # incompatible, or over the threshold issue a read and yield
-                    pending_offsets.appendleft((offset, size))
-                    for result in do_combined_read(combined_offsets):
-                        yield result
-                    combined_offsets = []
-        # whatever is left is a single coalesced request
-        if len(combined_offsets):
-            for result in do_combined_read(combined_offsets):
-                yield result
+
+        while len(offsets) > 0:
+            for offset, bytes in do_read(offsets[:500]):
+                yield offset, bytes
+            offsets = offsets[500:]

     def put(self, relpath, f, mode=None):
         """Copy the file-like or string object into the location.
@@ -340,6 +310,25 @@
         else:
             return self.__class__(self.abspath(offset))

+    def _range_string(self, offsets):
+        """Turn a list of (offset, size) tuples into an HTTP range value."""
+        str = ""
+
+        start, size = offsets[0]
+        combined = [[start, start + size]]
+
+        for start, size in offsets[1:]:
+            if start == combined[-1][1]:
+                combined[-1][1] = start + size
+            else:
+                combined.append([start, start + size])
+
+        for range in combined:
+            str += "%d-%d," % (range[0], range[1] - 1)
+        return str[:-1]
+
+
+

 #---------------- test server facilities ----------------
 # TODO: load these only when running tests

=== modified file bzrlib/transport/http/_pycurl.py
--- bzrlib/transport/http/_pycurl.py
+++ bzrlib/transport/http/_pycurl.py
@@ -24,6 +24,7 @@
 # It's probably safer to just always revalidate. mbp 20060321

 import os
+import sys
 from StringIO import StringIO

 import bzrlib
@@ -59,6 +60,128 @@

 register_urlparse_netloc_protocol('http+pycurl')

+class ParseError(Exception):
+    pass
+
+
+class ByteRangeFile(object):
+    """A file-like object that parses MIME multipart responses from a
+    web server."""
+
+    def __init__(self, headers, input):
+        self.headers = headers
+        self.input = input
+        self.chunks = []
+
+    def parse(self):
+        encoding = self.headers.get('transfer-encoding', None)
+        assert encoding is None or encoding != 'chunked'
+
+        boundary = self._get_boundary(self.headers['content-type'])
+        data = None
+        range = None
+
+        line = self.input.readline()
+        while line != "":
+            #print >>sys.stderr, line
+            if line.startswith('--%s' % boundary):
+                pass
+            elif line.lower().startswith('content-range:'):
+                offset, size = self._parse_range(line)
+                # XXX hack, we seem to have a blank line here
+                line = self.input.readline()
+                data = self.input.read(size)
+                #mutter('read %d bytes at offset %d', size, offset)
+                self.chunks.append(data)
+
+            line = self.input.readline()
+
+    def read(self, size):
+        assert self.chunks, "no chunks left"
+        chunk = self.chunks[0]
+        if size == len(chunk):
+            del self.chunks[0]
+        else:
+            self.chunks[0] = chunk[size:]
+        data = chunk[:size]
+        assert len(data) == size, "size mismatch, %d != %d" % (len(data), size)
+        return data
+
+    def _parse_range(self, line):
+        # Format is "bytes x-y/z" ie. x to y of total z
+        try:
+            range = line.split(':')[1].strip()
+            words = range.split()
+            if words[0] != 'bytes':
+                raise ParseError("Content range not in bytes!")
+            words = words[1].split('/')
+            range = words[0].split('-')
+            #mutter('range %s', range)
+            offset = int(range[0])
+            size = int(range[1]) - offset + 1
+        except (IndexError, ValueError):
+            raise ParseError("Couldn't parse '%s'" % line)
+
+        return (offset, size)
+
+    def _get_boundary(self, header):
+        try:
+            words = header.split(';')
+            content_type = words.pop(0).strip()
+
+            if content_type == 'multipart/byteranges':
+                param = words.pop(0)
+                name, value = [s.strip() for s in param.split('=')]
+                if name == 'boundary':
+                    boundary = value
+                else:
+                    raise ParseError('Missing boundary definition')
+            else:
+                raise ParseError('Incorrect mimetype "%s"' % content_type)
+            # XXX we get 206 responses that are text/plain and contain the
+            # whole file, yuck yuck yuck
+        except (IndexError, ValueError):
+            raise ParseError("Couldn't parse '%s'" % header)
+
+        return boundary
+
+
+class CurlByteRangeFile(ByteRangeFile):
+    def __init__(self, curl, input):
+        headers = {}
+        headers['content-type'] = curl.getinfo(pycurl.CONTENT_TYPE)
+        #headers['transfer-encoding'] = curl.getinfo(pycurl.???)
+        ByteRangeFile.__init__(self, headers, input)
+
+
+class CurlWrap(object):
+    """Wrapper object for pycurl.Curl objects, that has a few extra
+    options set.
+
+    It is not possible to inherit pycurl.Curl for some reason, so have
+    it as an attribute instead.
+    """
+
+    def __init__(self):
+        self.curl = pycurl.Curl()
+        self.curl.setopt(pycurl.VERBOSE, 0)
+        self._set_options()
+
+    def _set_options(self):
+        """Set options for all requests"""
+        # There's no way in http/1.0 to say "must revalidate"; we
+        # don't want to force it to always retrieve. so just turn off
+        # the default Pragma provided by Curl.
+        headers = ['Cache-control: max-age=0',
+                   'Pragma: no-cache']
+        # TODO: maybe include a summary of the pycurl version
+        ua_str = 'bzr/%s (pycurl)' % (bzrlib.__version__)
+        self.curl.setopt(pycurl.USERAGENT, ua_str)
+        self.curl.setopt(pycurl.HTTPHEADER, headers)
+        # follow redirect responses
+        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+
+
 class PyCurlTransport(HttpTransportBase):
     """http client transport using pycurl

@@ -69,9 +192,10 @@
     set headers to allow caching.
     """

-    def __init__(self, base):
+    def __init__(self, base, curl=None):
         super(PyCurlTransport, self).__init__(base)
         mutter('using pycurl %s' % pycurl.version)
+        self._curl = curl or CurlWrap()

     def should_cache(self):
         """Return True if the data pulled across should be cached locally.
@@ -79,13 +203,13 @@
         return True

     def has(self, relpath):
-        curl = pycurl.Curl()
+        curl = self._curl.curl
         abspath = self._real_abspath(relpath)
         curl.setopt(pycurl.URL, abspath)
         curl.setopt(pycurl.FOLLOWLOCATION, 1) # follow redirect responses
-        self._set_curl_options(curl)
         # don't want the body - ie just do a HEAD request
         curl.setopt(pycurl.NOBODY, 1)
+        curl.setopt(pycurl.HTTPGET, 0)
         self._curl_perform(curl)
         code = curl.getinfo(pycurl.HTTP_CODE)
         if code == 404: # not found
@@ -94,22 +218,28 @@
             return True
         elif code == 0:
             self._raise_curl_connection_error(curl)
+        elif code == 206:
+            url = curl.getinfo(pycurl.EFFECTIVE_URL)
+            raise TransportError('Got 206 doing HEAD?', (code, url))
         else:
             self._raise_curl_http_error(curl)
-
+
     def _get(self, relpath, ranges):
-        curl = pycurl.Curl()
+        curl = self._curl.curl
+        # Right now it is not possible to reset the RANGE option,
+        # setting an empty string results in "Range: bytes=". So set a
+        # range for the whole file.
+        curl.setopt(pycurl.RANGE, '/*-')
+        if ranges is not None:
+            s = self._range_string(ranges)
+            curl.setopt(pycurl.RANGE, self._range_string(ranges))
         abspath = self._real_abspath(relpath)
         sio = StringIO()
         curl.setopt(pycurl.URL, abspath)
-        self._set_curl_options(curl)
         curl.setopt(pycurl.WRITEFUNCTION, sio.write)
+        # XXX jrydberg: Are both these needed?
         curl.setopt(pycurl.NOBODY, 0)
-        if ranges is not None:
-            assert len(ranges) == 1
-            # multiple ranges not supported yet because we can't decode the
-            # response
-            curl.setopt(pycurl.RANGE, '%d-%d' % ranges[0])
+        curl.setopt(pycurl.HTTPGET, 1)
         self._curl_perform(curl)
         code = curl.getinfo(pycurl.HTTP_CODE)
         if code == 404:
@@ -119,7 +249,18 @@
             return code, sio
         elif code == 206 and (ranges is not None):
             sio.seek(0)
-            return code, sio
+            if 'text/plain' in curl.getinfo(pycurl.CONTENT_TYPE):
+                # We got 206's that actually contain the whole file, wtf?
+                file = sio
+            else:
+                file = CurlByteRangeFile(curl, sio)
+                file.parse()
+            return code, file
+        elif code == 206:
+            # Partial contents but without any specified ranges; a
+            # result from the workaround mentioned above.
+            sio.seek(0)
+            return 200, sio
         elif code == 0:
             self._raise_curl_connection_error(curl)
         else:
             self._raise_curl_http_error(curl)
@@ -137,20 +278,6 @@
             raise TransportError('http error %d probing for %s' % (code, url))

-    def _set_curl_options(self, curl):
-        """Set options for all requests"""
-        # There's no way in http/1.0 to say "must revalidate"; we don't want
-        # to force it to always retrieve. so just turn off the default Pragma
-        # provided by Curl.
-        headers = ['Cache-control: max-age=0',
-                   'Pragma: no-cache']
-        ## curl.setopt(pycurl.VERBOSE, 1)
-        # TODO: maybe include a summary of the pycurl version
-        ua_str = 'bzr/%s (pycurl)' % (bzrlib.__version__)
-        curl.setopt(pycurl.USERAGENT, ua_str)
-        curl.setopt(pycurl.HTTPHEADER, headers)
-        curl.setopt(pycurl.FOLLOWLOCATION, 1) # follow redirect responses
-
     def _curl_perform(self, curl):
         """Perform curl operation and translate exceptions."""
         try:
@@ -161,6 +288,13 @@
             # couldn't resolve host
             raise NoSuchFile(curl.getinfo(pycurl.EFFECTIVE_URL), e)

+    def clone(self, offset=None):
+        """See HttpTransportBase.clone."""
+        if offset is None:
+            return self.__class__(self.base, curl=self._curl)
+        else:
+            return self.__class__(self.abspath(offset), curl=self._curl)
+

 class HttpServer_PyCurl(HttpServer):
     """Subclass of HttpServer that gives http+pycurl urls.

# revision id: jrydberg@gnu.org-20060608143012-fe3534eee20ac780
# sha1: 5921069a2d1d0b31f2375cd9ed5d0cf5fb6f9144
# inventory sha1: fbbb850a28c3c50658a1fc2cfb11b8a774f7db21
# parent ids:
#   pqm@pqm.ubuntu.com-20060606161247-44efd292c9de0005
# base id: pqm@pqm.ubuntu.com-20060606161247-44efd292c9de0005
# properties:
#   branch-nick: bzr.jrydberg.readv
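
The readv() rewrite in the bundle relies on _range_string to collapse a list of (offset, size) requests into a single HTTP Range header value, merging regions that touch each other. The following standalone sketch is illustrative only and not part of the bundle; the function name range_header_value is made up, but the merging logic mirrors _range_string above:

def range_header_value(offsets):
    """Collapse (offset, size) tuples into an HTTP Range header value.

    Touching regions are merged, so requesting (0, 100) and (100, 50)
    produces a single "0-149" range, mirroring _range_string above.
    """
    if not offsets:
        return ''
    start, size = offsets[0]
    combined = [[start, start + size]]            # half-open [start, end)
    for start, size in offsets[1:]:
        if start == combined[-1][1]:              # contiguous with previous
            combined[-1][1] = start + size
        else:
            combined.append([start, start + size])
    # HTTP byte ranges are inclusive, hence the end - 1
    return ','.join('%d-%d' % (lo, hi - 1) for lo, hi in combined)


if __name__ == '__main__':
    print(range_header_value([(0, 100), (100, 50), (400, 10)]))
    # -> 0-149,400-409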
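
On the pycurl side, the patched _get() issues one GET with that Range value and then checks HTTP_CODE and CONTENT_TYPE to decide whether it received a 206 multipart response, a 206 that is really the whole file, or a plain 200. A minimal sketch of such a request follows; it assumes pycurl is installed, the URL is a placeholder, and the helper name fetch_ranges is made up for illustration:

import pycurl
from io import BytesIO


def fetch_ranges(url, range_value):
    """Fetch the given byte ranges of url with pycurl.

    Mirrors the options the patched _get() sets: a Range header, a
    write callback that collects the body, and redirect following.
    """
    body = BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.RANGE, range_value)        # e.g. '0-149,400-409'
    curl.setopt(pycurl.WRITEFUNCTION, body.write)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)         # follow redirects
    curl.perform()
    code = curl.getinfo(pycurl.HTTP_CODE)
    content_type = curl.getinfo(pycurl.CONTENT_TYPE)
    curl.close()
    return code, content_type, body.getvalue()


if __name__ == '__main__':
    # 'http://example.com/some-file' is a placeholder URL. A server that
    # honours the Range header answers 206 (multipart/byteranges when
    # more than one range is requested); one that ignores it answers 200
    # with the whole file, which is why _get() handles both cases.
    code, ctype, data = fetch_ranges('http://example.com/some-file',
                                     '0-149,400-409')
    print(code, ctype, len(data))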
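
When the server does answer with multipart/byteranges, ByteRangeFile walks the body line by line: each Content-Range header gives the offset and length of the next part, and that many bytes are then read as one chunk. Below is a self-contained Python 3 sketch of that parsing loop, not taken from the bundle: it uses a regular expression where the patch splits the header by hand, and the sample body is invented for the demonstration.

import io
import re


def parse_byteranges(body, boundary):
    """Yield (offset, data) chunks from a multipart/byteranges body.

    Follows the same strategy as ByteRangeFile.parse(): find each
    part's Content-Range header, skip the blank line after it, then
    read exactly (last - first + 1) bytes of payload.
    """
    stream = io.StringIO(body)
    line = stream.readline()
    while line != '':
        if line.startswith('--' + boundary):
            pass                              # part separator, nothing to do
        elif line.lower().startswith('content-range:'):
            # Header format: "Content-Range: bytes first-last/total"
            match = re.search(r'bytes\s+(\d+)-(\d+)/', line)
            if match is None:
                raise ValueError("couldn't parse %r" % line)
            first, last = int(match.group(1)), int(match.group(2))
            stream.readline()                 # blank line before the payload
            yield first, stream.read(last - first + 1)
        line = stream.readline()


if __name__ == '__main__':
    # A hand-written two-part response body, purely for demonstration.
    boundary = 'THIS_STRING_SEPARATES'
    body = (
        '--' + boundary + '\r\n'
        'Content-Type: text/plain\r\n'
        'Content-Range: bytes 0-4/26\r\n'
        '\r\n'
        'abcde\r\n'
        '--' + boundary + '\r\n'
        'Content-Type: text/plain\r\n'
        'Content-Range: bytes 10-12/26\r\n'
        '\r\n'
        'klm\r\n'
        '--' + boundary + '--\r\n'
    )
    for offset, data in parse_byteranges(body, boundary):
        print(offset, repr(data))             # 0 'abcde', then 10 'klm'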