Rev 2678: Lots of cleanup, including implementing get_build_chain for the new index format, in http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract

Fri Aug 3 01:01:09 BST 2007

At http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract

------------------------------------------------------------
revno: 2678
revision-id: john at arbash-meinel.com-20070803000035-8lrg5av9vw3zw89j
parent: john at arbash-meinel.com-20070802233253-jksnt66mv2ti5mhi
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: pyrex_knit_extract
timestamp: Thu 2007-08-02 19:00:35 -0500
message:
  Lots of cleanup, including implementing get_build_chain for the new index format.
  Remove a bunch of unnecessary functions.
modified:
  bzrlib/_knit_helpers_c.pyx     knit_c.pyx-20070509143944-u42gy8w387a10m0j-1
  bzrlib/_knit_helpers_py.py     _knit_load_data_py.p-20070629000948-9a0nh4s118bi5y8n-1
  bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
  bzrlib/tests/test_knit.py      test_knit.py-20051212171302-95d4c00dd5f11f2b
-------------- next part --------------
=== modified file 'bzrlib/_knit_helpers_c.pyx'

--- a/bzrlib/_knit_helpers_c.pyx	2007-08-02 23:26:06 +0000
+++ b/bzrlib/_knit_helpers_c.pyx	2007-08-03 00:00:35 +0000
@@ -386,52 +386,6 @@
     return sio.readlines()
 
 
-def _extract_lines_from_gzip_c(data):
-    """Convert a gzip stream into a set of lines.
-
-    :param data: A string of gzip data
-    :return: A list of lines extracted from the decompressed data.
-    """
-    cdef Bytef *c_buf
-    cdef uLong buf_size
-    cdef Bytef *c_data
-    cdef uLong data_size
-    cdef int retcode
-    cdef PyObject *pyobj_buf
-    cdef z_stream strm
-
-    c_data = <Bytef *>PyString_AsString(data)
-    data_size = PyString_Size(data)
-
-    buf_size = 1000000
-    # A PyString is used to allow us to avoid malloc
-    # We could use a global as long as we didn't care about thread safety
-    buf = PyString_FromStringAndSize(NULL, buf_size)
-
-    # Is this super ugly?
-    c_buf = <Bytef *>PyString_AsString(buf)
-
-    memset(&strm, 0, sizeof(z_stream))
-    strm.next_in = c_data
-    strm.avail_in = data_size
-    strm.total_in = 0
-    strm.next_out = c_buf
-    strm.avail_out = buf_size
-    strm.total_out = 0
-
-    # windowBits can be +16 to indicate we are decompressing a gzip stream, or
-    # +32 for it to auto determine whether this is a gzip or zlib stream.
-    inflateInit2(&strm, 15+16)
-
-    retcode = inflate(&strm, Z_FINISH)
-    assert retcode == Z_STREAM_END, (
-        "Expected Z_STREAM_END (%d) got %d while decompressing"
-        % (Z_STREAM_END, retcode))
-
-    # Now that the data is decompressed, resize our buffer
-    return _convert_bytes_to_lines(<char*>c_buf, buf_size-strm.avail_out)
-
-
 # 1MB for the decompression buffer
 cdef int _decompress_buffer_size
 
@@ -916,39 +870,6 @@
     return rec
 
 
-def _extract_knit_lines_from_gzip(version_id, data, kd_name):
-    """Extract the lines from a gzip chunk.
-
-    :param version_id: The chunk should correspond to this version id, verify
-        this.
-    :param data: The gzip data
-    :param kd_name: The KnitData filename. This is used when raising
-        KnitCorrupt errors
-    :return: sha1digest, [lines]
-    """
-    try:
-        record_contents = _extract_lines_from_gzip_c(data)
-    except Exception, e:
-        raise errors.KnitCorrupt(kd_name,
-            "While reading {%s} got %s(%s)"
-            % (version_id, e.__class__.__name__, str(e)))
-    header = record_contents.pop(0)
-    rec = _check_header(version_id, header, kd_name)
-
-    last_line = record_contents.pop()
-    if len(record_contents) != int(rec[2]):
-        raise errors.KnitCorrupt(kd_name,
-            'incorrect number of lines %s != %s'
-            ' for version {%s}'
-            % (len(record_contents), int(rec[2]),
-               version_id))
-    if last_line != 'end %s\n' % rec[1]:
-        raise errors.KnitCorrupt(kd_name,
-            'unexpected version end line %r, wanted %r'
-            % (last_line, version_id))
-    return rec[3], record_contents
-
-
 def _extract_knit_fulltext_from_gzip_c(version_id, data, kd_name,
                                        is_annotated):
     """Extract the unannotated fulltext lines from a gzip hunk.

=== modified file 'bzrlib/_knit_helpers_py.py'
--- a/bzrlib/_knit_helpers_py.py	2007-07-25 00:28:51 +0000
+++ b/bzrlib/_knit_helpers_py.py	2007-08-03 00:00:35 +0000
@@ -104,7 +104,10 @@
     :return: A list of lines extracted from the decompressed data.
     """
     df = GzipFile(mode='rb', fileobj=StringIO(data))
-    return df.readlines()
+    try:
+        return df.readlines()
+    finally:
+        df.close()
 
 
 def _check_header(version_id, line, kd_name):

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2007-08-02 23:32:53 +0000
+++ b/bzrlib/knit.py	2007-08-03 00:00:35 +0000
@@ -1626,6 +1626,24 @@
             options.append('no-eol')
         return ','.join(options)
 
+    def get_build_chain(self, version_id):
+        """Get the chain that we need to get a fulltext for this version.
+
+        :return: [(version_id, start, size)] needed to extract this text.
+        """
+        chain = []
+        node = self._get_node(version_id)
+        while node is not None:
+            start, size = self.get_position(node[0])
+            chain.append((node[0], start, size))
+            if self._parent_compression(node[2][1]) == 'line-delta':
+                node = self._get_node(node[2][1][0]) # parent
+            else:
+                break
+        # Put the fulltext first
+        chain.reverse()
+        return chain
+
     def get_parents(self, version_id):
         """Return parents of specified version ignoring ghosts."""
         parents = list(self.iter_parents([version_id]))

=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py	2007-07-27 20:11:47 +0000
+++ b/bzrlib/tests/test_knit.py	2007-08-03 00:00:35 +0000
@@ -148,12 +148,20 @@
 class LowLevelKnitDataTests(TestCase):
 
     def get_knit_data(self, *args, **kwargs):
-        orig = knit._extract_lines_from_gzip
+        orig_fulltext = knit._extract_knit_fulltext_from_gzip
+        orig_linedelta = knit._extract_knit_linedelta_from_gzip
         def reset():
-            knit._extract_lines_from_gzip = orig
+            knit._extract_knit_fulltext_from_gzip = orig_fulltext
+            knit._extract_knit_linedelta_from_gzip = orig_linedelta
         self.addCleanup(reset)
-        from bzrlib._knit_helpers_py import _extract_lines_from_gzip_py
-        knit._extract_lines_from_gzip = _extract_lines_from_gzip_py
+        from bzrlib._knit_helpers_py import (
+            _extract_knit_fulltext_from_gzip_py,
+            _extract_knit_linedelta_from_gzip_py,
+            )
+        knit._extract_knit_fulltext_from_gzip = \
+                _extract_knit_fulltext_from_gzip_py
+        knit._extract_knit_linedelta_from_gzip = \
+                _extract_knit_linedelta_from_gzip_py
         return _KnitData(*args, **kwargs)
 
     def create_gz_content(self, text):
@@ -273,12 +281,20 @@
     _test_needs_features = [CompiledKnitFeature]
 
     def get_knit_data(self, *args, **kwargs):
-        orig = knit._extract_lines_from_gzip
+        orig_fulltext = knit._extract_knit_fulltext_from_gzip
+        orig_linedelta = knit._extract_knit_linedelta_from_gzip
         def reset():
-            knit._extract_lines_from_gzip = orig
+            knit._extract_knit_fulltext_from_gzip = orig_fulltext
+            knit._extract_knit_linedelta_from_gzip = orig_linedelta
         self.addCleanup(reset)
-        from bzrlib._knit_helpers_c import _extract_lines_from_gzip_c
-        knit._extract_lines_from_gzip = _extract_lines_from_gzip_c
+        from bzrlib._knit_helpers_c import (
+            _extract_knit_fulltext_from_gzip_c,
+            _extract_knit_linedelta_from_gzip_c,
+            )
+        knit._extract_knit_fulltext_from_gzip = \
+                _extract_knit_fulltext_from_gzip_c
+        knit._extract_knit_linedelta_from_gzip = \
+                _extract_knit_linedelta_from_gzip_c
         return _KnitData(*args, **kwargs)