# Bazaar revision bundle v0.7
#
# message:
#   Let the PyCurl transport read multiple ranges
# committer: Johan Rydberg
# date: Thu 2006-06-08 16:30:12.353727102 +0200

=== modified file bzrlib/transport/http/__init__.py
--- bzrlib/transport/http/__init__.py
+++ bzrlib/transport/http/__init__.py
@@ -192,58 +192,28 @@
         :param offsets: A list of (offset, size) tuples.
         :param return: A list or generator of (offset, data) tuples
         """
-        # Ideally we would pass one big request asking for all the ranges in
-        # one go; however then the server will give a multipart mime response
-        # back, and we can't parse them yet. So instead we just get one range
-        # per region, and try to coallesce the regions as much as possible.
-        #
-        # The read-coallescing code is not quite regular enough to have a
-        # single driver routine and
-        # helper method in Transport.
-        def do_combined_read(combined_offsets):
-            # read one coalesced block
-            total_size = 0
-            for offset, size in combined_offsets:
-                total_size += size
-            mutter('readv coalesced %d reads.', len(combined_offsets))
-            offset = combined_offsets[0][0]
-            byte_range = (offset, offset + total_size - 1)
-            code, result_file = self._get(relpath, [byte_range])
-            if code == 206:
-                for off, size in combined_offsets:
+        if not len(offsets):
+            return
+
+        def do_read(offsets):
+            code, result_file = self._get(relpath, offsets)
+            if code == 206: # we were given a partial content response
+                for offset, size in offsets:
                     result_bytes = result_file.read(size)
-                    assert len(result_bytes) == size
-                    yield off, result_bytes
-            elif code == 200:
-                data = result_file.read(offset + total_size)[offset:offset + total_size]
+                    yield offset, result_bytes
+            elif code == 200: # we got the whole thing, split it up
                 pos = 0
-                for offset, size in combined_offsets:
-                    yield offset, data[pos:pos + size]
+                for offset, size in offsets:
+                    if offset != pos:
+                        result_file.read(offset - pos)
+                        pos = offset
+                    yield offset, result_file.read(size)
                     pos += size
-                del data
-        if not len(offsets):
-            return
-        pending_offsets = deque(offsets)
-        combined_offsets = []
-        while len(pending_offsets):
-            offset, size = pending_offsets.popleft()
-            if not combined_offsets:
-                combined_offsets = [[offset, size]]
-            else:
-                if (len (combined_offsets) < 500 and
-                    combined_offsets[-1][0] + combined_offsets[-1][1] == offset):
-                    # combatible offset:
-                    combined_offsets.append([offset, size])
-                else:
-                    # incompatible, or over the threshold issue a read and yield
-                    pending_offsets.appendleft((offset, size))
-                    for result in do_combined_read(combined_offsets):
-                        yield result
-                    combined_offsets = []
-        # whatever is left is a single coalesced request
-        if len(combined_offsets):
-            for result in do_combined_read(combined_offsets):
-                yield result
+
+        while len(offsets) > 0:
+            for offset, bytes in do_read(offsets[:500]):
+                yield offset, bytes
+            offsets = offsets[500:]

     def put(self, relpath, f, mode=None):
         """Copy the file-like or string object into the location.
@@ -340,6 +310,25 @@
         else:
             return self.__class__(self.abspath(offset))

+    def _range_string(self, offsets):
+        """Turn a list of (offset, size) tuples into an HTTP range value."""
+        str = ""
+
+        start, size = offsets[0]
+        combined = [[start, start + size]]
+
+        for start, size in offsets[1:]:
+            if start == combined[-1][1]:
+                combined[-1][1] = start + size
+            else:
+                combined.append([start, start + size])
+
+        for range in combined:
+            str += "%d-%d," % (range[0], range[1] - 1)
+        return str[:-1]
+
+
+

 #---------------- test server facilities ----------------
 # TODO: load these only when running tests

=== modified file bzrlib/transport/http/_pycurl.py
--- bzrlib/transport/http/_pycurl.py
+++ bzrlib/transport/http/_pycurl.py
@@ -24,6 +24,7 @@
 # It's probably safer to just always revalidate. mbp 20060321

 import os
+import sys
 from StringIO import StringIO

 import bzrlib
@@ -59,6 +60,128 @@

 register_urlparse_netloc_protocol('http+pycurl')

+class ParseError(Exception):
+    pass
+
+
+class ByteRangeFile(object):
+    """A file-like object that parses MIME multipart responses from a
+    web server."""
+
+    def __init__(self, headers, input):
+        self.headers = headers
+        self.input = input
+        self.chunks = []
+
+    def parse(self):
+        encoding = self.headers.get('transfer-encoding', None)
+        assert encoding is None or encoding != 'chunked'
+
+        boundary = self._get_boundary(self.headers['content-type'])
+        data = None
+        range = None
+
+        line = self.input.readline()
+        while line != "":
+            #print >>sys.stderr, line
+            if line.startswith('--%s' % boundary):
+                pass
+            elif line.lower().startswith('content-range:'):
+                offset, size = self._parse_range(line)
+                # XXX hack, we seem to have a blank line here
+                line = self.input.readline()
+                data = self.input.read(size)
+                #mutter('read %d bytes at offset %d', size, offset)
+                self.chunks.append(data)
+
+            line = self.input.readline()
+
+    def read(self, size):
+        assert self.chunks, "no chunks left"
+        chunk = self.chunks[0]
+        if size == len(chunk):
+            del self.chunks[0]
+        else:
+            self.chunks[0] = chunk[size:]
+        data = chunk[:size]
+        assert len(data) == size, "size mismatch, %d != %d" % (len(data), size)
+        return data
+
+    def _parse_range(self, line):
+        # Format is "bytes x-y/z" ie. x to y of total z
+        try:
+            range = line.split(':')[1].strip()
+            words = range.split()
+            if words[0] != 'bytes':
+                raise ParseError("Content range not in bytes!")
+            words = words[1].split('/')
+            range = words[0].split('-')
+            #mutter('range %s', range)
+            offset = int(range[0])
+            size = int(range[1]) - offset + 1
+        except (IndexError, ValueError):
+            raise ParseError("Couldn't parse '%s'" % line)
+
+        return (offset, size)
+
+    def _get_boundary(self, header):
+        try:
+            words = header.split(';')
+            content_type = words.pop(0).strip()
+
+            if content_type == 'multipart/byteranges':
+                param = words.pop(0)
+                name, value = [s.strip() for s in param.split('=')]
+                if name == 'boundary':
+                    boundary = value
+                else:
+                    raise ParseError('Missing boundary definition')
+            else:
+                raise ParseError('Incorrect mimetype "%s"' % content_type)
+            # XXX we get 206 responses that are text/plain and contain the
+            # whole file, yuck yuck yuck
+        except (IndexError, ValueError):
+            raise ParseError("Couldn't parse '%s'" % header)
+
+        return boundary
+
+
+class CurlByteRangeFile(ByteRangeFile):
+    def __init__(self, curl, input):
+        headers = {}
+        headers['content-type'] = curl.getinfo(pycurl.CONTENT_TYPE)
+        #headers['transfer-encoding'] = curl.getinfo(pycurl.???)
+        ByteRangeFile.__init__(self, headers, input)
+
+
+class CurlWrap(object):
+    """Wrapper object for pycurl.Curl objects, that has a few extra
+    options set.
+
+    It is not possible to inherit pycurl.Curl for some reason, so have
+    it as an attribute instead.
+    """
+
+    def __init__(self):
+        self.curl = pycurl.Curl()
+        self.curl.setopt(pycurl.VERBOSE, 0)
+        self._set_options()
+
+    def _set_options(self):
+        """Set options for all requests"""
+        # There's no way in http/1.0 to say "must revalidate"; we
+        # don't want to force it to always retrieve. so just turn off
+        # the default Pragma provided by Curl.
+        headers = ['Cache-control: max-age=0',
+                   'Pragma: no-cache']
+        # TODO: maybe include a summary of the pycurl version
+        ua_str = 'bzr/%s (pycurl)' % (bzrlib.__version__)
+        self.curl.setopt(pycurl.USERAGENT, ua_str)
+        self.curl.setopt(pycurl.HTTPHEADER, headers)
+        # follow redirect responses
+        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+
+
 class PyCurlTransport(HttpTransportBase):
     """http client transport using pycurl

@@ -69,9 +192,10 @@
     set headers to allow caching.
     """

-    def __init__(self, base):
+    def __init__(self, base, curl=None):
         super(PyCurlTransport, self).__init__(base)
         mutter('using pycurl %s' % pycurl.version)
+        self._curl = curl or CurlWrap()

     def should_cache(self):
         """Return True if the data pulled across should be cached locally.
@@ -79,13 +203,13 @@
         return True

     def has(self, relpath):
-        curl = pycurl.Curl()
+        curl = self._curl.curl
         abspath = self._real_abspath(relpath)
         curl.setopt(pycurl.URL, abspath)
         curl.setopt(pycurl.FOLLOWLOCATION, 1) # follow redirect responses
-        self._set_curl_options(curl)
         # don't want the body - ie just do a HEAD request
         curl.setopt(pycurl.NOBODY, 1)
+        curl.setopt(pycurl.HTTPGET, 0)
         self._curl_perform(curl)
         code = curl.getinfo(pycurl.HTTP_CODE)
         if code == 404: # not found
@@ -94,22 +218,28 @@
             return True
         elif code == 0:
             self._raise_curl_connection_error(curl)
+        elif code == 206:
+            url = curl.getinfo(pycurl.EFFECTIVE_URL)
+            raise TransportError('Got 206 doing HEAD?', (code, url))
         else:
             self._raise_curl_http_error(curl)
-
+
     def _get(self, relpath, ranges):
-        curl = pycurl.Curl()
+        curl = self._curl.curl
+        # Right now it is not possible to reset the RANGE option,
+        # setting an empty string results in "Range: bytes=". So set a
+        # range for the whole file.
+        curl.setopt(pycurl.RANGE, '/*-')
+        if ranges is not None:
+            s = self._range_string(ranges)
+            curl.setopt(pycurl.RANGE, self._range_string(ranges))
         abspath = self._real_abspath(relpath)
         sio = StringIO()
         curl.setopt(pycurl.URL, abspath)
-        self._set_curl_options(curl)
         curl.setopt(pycurl.WRITEFUNCTION, sio.write)
+        # XXX jrydberg: Are both these needed?
         curl.setopt(pycurl.NOBODY, 0)
-        if ranges is not None:
-            assert len(ranges) == 1
-            # multiple ranges not supported yet because we can't decode the
-            # response
-            curl.setopt(pycurl.RANGE, '%d-%d' % ranges[0])
+        curl.setopt(pycurl.HTTPGET, 1)
         self._curl_perform(curl)
         code = curl.getinfo(pycurl.HTTP_CODE)
         if code == 404:
@@ -119,7 +249,18 @@
             return code, sio
         elif code == 206 and (ranges is not None):
             sio.seek(0)
-            return code, sio
+            if 'text/plain' in curl.getinfo(pycurl.CONTENT_TYPE):
+                # We got 206's that actually contain the whole file, wtf?
+                file = sio
+            else:
+                file = CurlByteRangeFile(curl, sio)
+                file.parse()
+            return code, file
+        elif code == 206:
+            # Partial contents but without any specified ranges; a
+            # result from the workaround mentioned above.
+            sio.seek(0)
+            return 200, sio
         elif code == 0:
             self._raise_curl_connection_error(curl)
         else:
             self._raise_curl_http_error(curl)
@@ -137,20 +278,6 @@
             raise TransportError('http error %d probing for %s' % (code, url))

-    def _set_curl_options(self, curl):
-        """Set options for all requests"""
-        # There's no way in http/1.0 to say "must revalidate"; we don't want
-        # to force it to always retrieve. so just turn off the default Pragma
-        # provided by Curl.
-        headers = ['Cache-control: max-age=0',
-                   'Pragma: no-cache']
-        ## curl.setopt(pycurl.VERBOSE, 1)
-        # TODO: maybe include a summary of the pycurl version
-        ua_str = 'bzr/%s (pycurl)' % (bzrlib.__version__)
-        curl.setopt(pycurl.USERAGENT, ua_str)
-        curl.setopt(pycurl.HTTPHEADER, headers)
-        curl.setopt(pycurl.FOLLOWLOCATION, 1) # follow redirect responses
-
     def _curl_perform(self, curl):
         """Perform curl operation and translate exceptions."""
         try:
@@ -161,6 +288,13 @@
             # couldn't resolve host
             raise NoSuchFile(curl.getinfo(pycurl.EFFECTIVE_URL), e)

+    def clone(self, offset=None):
+        """See HttpTransportBase.clone."""
+        if offset is None:
+            return self.__class__(self.base, curl=self._curl)
+        else:
+            return self.__class__(self.abspath(offset), curl=self._curl)
+

 class HttpServer_PyCurl(HttpServer):
     """Subclass of HttpServer that gives http+pycurl urls.

# revision id: jrydberg@gnu.org-20060608143012-fe3534eee20ac780
# sha1: 5921069a2d1d0b31f2375cd9ed5d0cf5fb6f9144
# inventory sha1: fbbb850a28c3c50658a1fc2cfb11b8a774f7db21
# parent ids:
#   pqm@pqm.ubuntu.com-20060606161247-44efd292c9de0005
# base id: pqm@pqm.ubuntu.com-20060606161247-44efd292c9de0005
# properties:
#   branch-nick: bzr.jrydberg.readv
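
The readv() rewrite in the bundle relies on _range_string to collapse a list of (offset, size) requests into a single HTTP Range header value, merging regions that touch each other. The following standalone sketch is illustrative only and not part of the bundle; the function name range_header_value is made up, but the merging logic mirrors _range_string above:

def range_header_value(offsets):
    """Collapse (offset, size) tuples into an HTTP Range header value.

    Touching regions are merged, so requesting (0, 100) and (100, 50)
    produces a single "0-149" range, mirroring _range_string above.
    """
    if not offsets:
        return ''
    start, size = offsets[0]
    combined = [[start, start + size]]            # half-open [start, end)
    for start, size in offsets[1:]:
        if start == combined[-1][1]:              # contiguous with previous
            combined[-1][1] = start + size
        else:
            combined.append([start, start + size])
    # HTTP byte ranges are inclusive, hence the end - 1
    return ','.join('%d-%d' % (lo, hi - 1) for lo, hi in combined)


if __name__ == '__main__':
    print(range_header_value([(0, 100), (100, 50), (400, 10)]))
    # -> 0-149,400-409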
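
On the pycurl side, the patched _get() issues one GET with that Range value and then checks HTTP_CODE and CONTENT_TYPE to decide whether it received a 206 multipart response, a 206 that is really the whole file, or a plain 200. A minimal sketch of such a request follows; it assumes pycurl is installed, the URL is a placeholder, and the helper name fetch_ranges is made up for illustration:

import pycurl
from io import BytesIO


def fetch_ranges(url, range_value):
    """Fetch the given byte ranges of url with pycurl.

    Mirrors the options the patched _get() sets: a Range header, a
    write callback that collects the body, and redirect following.
    """
    body = BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.RANGE, range_value)        # e.g. '0-149,400-409'
    curl.setopt(pycurl.WRITEFUNCTION, body.write)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)         # follow redirects
    curl.perform()
    code = curl.getinfo(pycurl.HTTP_CODE)
    content_type = curl.getinfo(pycurl.CONTENT_TYPE)
    curl.close()
    return code, content_type, body.getvalue()


if __name__ == '__main__':
    # 'http://example.com/some-file' is a placeholder URL. A server that
    # honours the Range header answers 206 (multipart/byteranges when
    # more than one range is requested); one that ignores it answers 200
    # with the whole file, which is why _get() handles both cases.
    code, ctype, data = fetch_ranges('http://example.com/some-file',
                                     '0-149,400-409')
    print(code, ctype, len(data))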
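
When the server does answer with multipart/byteranges, ByteRangeFile walks the body line by line: each Content-Range header gives the offset and length of the next part, and that many bytes are then read as one chunk. Below is a self-contained Python 3 sketch of that parsing loop, not taken from the bundle: it uses a regular expression where the patch splits the header by hand, and the sample body is invented for the demonstration.

import io
import re


def parse_byteranges(body, boundary):
    """Yield (offset, data) chunks from a multipart/byteranges body.

    Follows the same strategy as ByteRangeFile.parse(): find each
    part's Content-Range header, skip the blank line after it, then
    read exactly (last - first + 1) bytes of payload.
    """
    stream = io.StringIO(body)
    line = stream.readline()
    while line != '':
        if line.startswith('--' + boundary):
            pass                              # part separator, nothing to do
        elif line.lower().startswith('content-range:'):
            # Header format: "Content-Range: bytes first-last/total"
            match = re.search(r'bytes\s+(\d+)-(\d+)/', line)
            if match is None:
                raise ValueError("couldn't parse %r" % line)
            first, last = int(match.group(1)), int(match.group(2))
            stream.readline()                 # blank line before the payload
            yield first, stream.read(last - first + 1)
        line = stream.readline()


if __name__ == '__main__':
    # A hand-written two-part response body, purely for demonstration.
    boundary = 'THIS_STRING_SEPARATES'
    body = (
        '--' + boundary + '\r\n'
        'Content-Type: text/plain\r\n'
        'Content-Range: bytes 0-4/26\r\n'
        '\r\n'
        'abcde\r\n'
        '--' + boundary + '\r\n'
        'Content-Type: text/plain\r\n'
        'Content-Range: bytes 10-12/26\r\n'
        '\r\n'
        'klm\r\n'
        '--' + boundary + '--\r\n'
    )
    for offset, data in parse_byteranges(body, boundary):
        print(offset, repr(data))             # 0 'abcde', then 10 'klm'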