Rev 2675: Finish the extraction code. in http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract

Fri Aug 3 00:25:05 BST 2007

At http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract

------------------------------------------------------------
revno: 2675
revision-id: john at arbash-meinel.com-20070802232426-mzvgu2kx022plzwh
parent: john at arbash-meinel.com-20070802221201-fq0ze6hinmbs5j9q
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: pyrex_knit_extract
timestamp: Thu 2007-08-02 18:24:26 -0500
message:
  Finish the extraction code.
  We now build up a PyString until we reach the true end of line (EOL).
  The _get_next_line() function now extracts more data as necessary, and
  raises an error when no newline is found and the caller cannot handle
  a partial (intermediate) line.
  This passes all of the important tests.
  We can now get rid of _extract_lines_from_gzip_c, or possibly refactor
  it as a wrapper around a gzip extractor object.
modified:
  bzrlib/_knit_helpers_c.pyx     knit_c.pyx-20070509143944-u42gy8w387a10m0j-1
-------------- next part --------------
=== modified file 'bzrlib/_knit_helpers_c.pyx'

--- a/bzrlib/_knit_helpers_c.pyx	2007-08-02 22:12:01 +0000
+++ b/bzrlib/_knit_helpers_c.pyx	2007-08-02 23:24:26 +0000
@@ -519,8 +519,8 @@
 
         self.strm.next_in = <Bytef *>c_data
         self.strm.avail_in = data_size
-        self.strm.next_out = <Bytef *>self.decompress_buffer
-        self.strm.avail_out = self.decompress_buf_size
+        self.strm.avail_out = 0 # Avail out will be set when we start
+                                # decompressing
         self.processing = 1
         self.stream_finished = 0
         self.bytes_available = 0
@@ -540,18 +540,19 @@
         # basically, we just need to memmove() the current unprocessed data
         # to the start of the buffer, and do another decompression from the
         # tail.
-        avail_out = self.strm.avail_out
-        if avail_out == 0: # Reset the buffer
+        if self.strm.avail_out == 0:
+            # Reset the buffer
             # Move the current data to the front of the decompress buffer
             # and open up more data at the tail.
-            memmove(self.decompress_buffer, self.cur_available,
-                    self.bytes_available)
+            if self.bytes_available:
+                memmove(self.decompress_buffer, self.cur_available,
+                        self.bytes_available)
             self.cur_available = self.decompress_buffer
             self.strm.next_out = <Bytef*>(self.cur_available
                                           + self.bytes_available)
             self.strm.avail_out = (self.decompress_buf_size
                                    - self.bytes_available)
-            avail_out = self.strm.avail_out
+        avail_out = self.strm.avail_out
         retval = inflate(&self.strm, Z_NO_FLUSH)
         if retval == Z_STREAM_END:
             self.stream_finished = 1 # True
@@ -570,13 +571,19 @@
             self.cur_available = self.decompress_buffer
         return 0
 
-    cdef int _get_next_line(self) except -1:
+    cdef int _get_next_line(self, int allow_intermediate) except -1:
         """Extract the next line.
 
         :postcondition: self.cur_line and self.cur_line_size will be updated
             to point to the next line to be processed.
+        :param allow_intermediate: If 0, then raise an exception if we are
+            unable to find the end of a line.
+            If 1, then return 2 to indicate that the line is not complete.
         :return: 1 to indicate we have a valid line
                  0 to indicate we are at EOF
+                 2 we have valid info, but we haven't reached a
+                   newline yet. This is only available if you pass
+                   allow_intermediate=True
         """
         # This currently uses a buffering scheme for extracting text
         # When we come into this function, if we have some decompressed data,
@@ -593,8 +600,10 @@
 
         cdef char *end_of_line
         cdef int bytes_processed
+        cdef int found_eol
 
         bytes_processed = 0
+        found_eol = 1
         # TODO: jam 2007-07-24 Figure out how to make this code support lines
         #       of "unlimited" length. Such as a versioned ISO where a single
         #       line is unlikely to fit in the decompression buffer
@@ -607,27 +616,36 @@
             end_of_line = <char *>memchr(self.cur_available, c'\n',
                                          self.bytes_available)
             if end_of_line == NULL:
-                # We reached the end of the buffer without finding a newline
-                # we need to extract more bytes, and try again
-                if self.stream_finished:
-                    raise errors.KnitCorrupt(self.knit_data_name,
-                        "Missing a trailing newline")
-                # Extract a bit more data out of the stream
-                bytes_processed = self.bytes_available
-                self._extract_from_stream()
-                assert self.bytes_available > bytes_processed
-                end_of_line = <char *>memchr(
-                        self.cur_available+bytes_processed,
-                        c'\n', self.bytes_available-bytes_processed)
+                if self.bytes_available < self.decompress_buf_size:
+                    # We reached the end of the buffer without finding a
+                    # newline.
+                    # Try to extract more bytes, and search again
+                    if self.stream_finished:
+                        raise errors.KnitCorrupt(self.knit_data_name,
+                            "Missing a trailing newline")
+                    # Remember that we already searched through some of it
+                    bytes_processed = self.bytes_available
+                    self._extract_from_stream()
+                    assert self.bytes_available > bytes_processed
+                    end_of_line = <char *>memchr(
+                            self.cur_available+bytes_processed,
+                            c'\n', self.bytes_available-bytes_processed)
                 if end_of_line == NULL:
-                    # This seems like we have a single line which
-                    # is longer than our buffer :(
-                    raise AssertionError('Not implemented')
+                    if not allow_intermediate:
+                        # This seems like we have a single line which
+                        # is longer than our buffer :(
+                        raise errors.KnitCorrupt(self.knit_data_name,
+                            "We did not find the end of a line we"
+                            " expected to be available")
+                    # We need to point at the last character for this line
+                    end_of_line = (self.cur_available
+                                   + self.bytes_available - 1)
+                    found_eol = 2
             self.cur_line = self.cur_available
             self.cur_available = end_of_line + 1
             self.cur_line_size = self.cur_available - self.cur_line
             self.bytes_available = self.bytes_available - self.cur_line_size
-            return 1
+            return found_eol
         elif self.stream_finished:
             # No bytes available, and we are at the end of the stream
             # return 0 to indicate we have nothing more to process
@@ -650,7 +668,7 @@
         cdef size_t chars_left
         cdef ssize_t piece_len
 
-        if not self._get_next_line():
+        if not self._get_next_line(0):
             raise errors.KnitCorrupt(self.knit_data_name,
                 "Could not find a record header.")
         pos = self.cur_line
@@ -711,7 +729,7 @@
 
         Also check that there is no more data after the tail.
         """
-        if not self._get_next_line():
+        if not self._get_next_line(0):
             raise errors.KnitCorrupt(self.knit_data_name,
                 "Could not find the 'end' line.")
         # Now we should have the version_id
@@ -726,7 +744,7 @@
                 "expected: 'end %r\\n'"
                 "found:    %r"
                 % (self.py_version_id, self._get_cur_string()))
-        if self._get_next_line():
+        if self._get_next_line(0):
             raise errors.KnitCorrupt(self.knit_data_name,
                 "Trailing data after the final 'end' line")
 
@@ -734,8 +752,11 @@
         """Return the next line as unannotated text."""
         cdef char *text
         cdef ssize_t text_size
+        cdef int next_retval
+        cdef object line
 
-        if not self._get_next_line():
+        next_retval = self._get_next_line(1)
+        if next_retval == 0:
             raise errors.KnitCorrupt(self.knit_data_name,
                 "Not enough lines in knit data."
                 " Expected %d, only got %d"
@@ -751,7 +772,23 @@
         else:
             text = self.cur_line
             text_size = self.cur_line_size
-        return PyString_FromStringAndSize(text, text_size)
+
+        line = PyString_FromStringAndSize(text, text_size)
+
+        # We need to keep parsing until we get an eol
+        while next_retval == 2:
+            next_retval = self._get_next_line(1)
+            if next_retval == 0:
+                raise errors.KnitCorrupt(self.knit_data_name,
+                    "Not enough lines in knit data."
+                    " Expected %d, only got %d"
+                    % (self.num_lines, i))
+            # TODO: We might think about resizing the string in place, or
+            # something like that, but for now, we just create a new string by
+            # combining the two.
+            line = line + PyString_FromStringAndSize(self.cur_line,
+                                                     self.cur_line_size)
+        return line
 
     cdef object _get_cur_string(self):
         if self.cur_line == NULL:
@@ -791,7 +828,7 @@
         cdef char *cur
         cdef char *next
 
-        if not self._get_next_line():
+        if not self._get_next_line(0):
             raise errors.KnitCorrupt(self.knit_data_name,
                 "Could not find a knit delta header line")