Rev 39: Have the GroupCompressBlock decide how to compress the header and content. in http://bzr.arbash-meinel.com/plugins/groupcompress

John Arbash Meinel john at arbash-meinel.com
Wed Mar 4 21:22:56 GMT 2009


At http://bzr.arbash-meinel.com/plugins/groupcompress

------------------------------------------------------------
revno: 39
revision-id: john at arbash-meinel.com-20090304212250-xcvwt1yx4zt76pev
parent: john at arbash-meinel.com-20090304210622-ur7wz2dz0w4lhzn3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress
timestamp: Wed 2009-03-04 15:22:50 -0600
message:
  Have the GroupCompressBlock decide how to compress the header and content.
  It can now decide whether they should be compressed together or not.
  As long as we make the to_bytes() function match the from_bytes() one, we should be fine.
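  
  For reference, the new to_bytes() layout shares one zlib compressor between
  the header and the content, and uses Z_SYNC_FLUSH so the compressed header
  ends on a byte boundary that can be decompressed on its own. A minimal
  sketch of the write side, assuming a simplified layout (the 'gcb' marker
  and the pack_block name are placeholders, not the plugin's GCB_HEADER or
  API):

  import zlib

  def pack_block(header_bytes, content_bytes):
      # One compressor shared by header and content: both end up in a single
      # zlib stream, which compresses better than two independent streams.
      c = zlib.compressobj()
      z_header = c.compress(header_bytes) + c.flush(zlib.Z_SYNC_FLUSH)
      # The sync flush aligns the output on a byte boundary, so the length of
      # the compressed header can be recorded and the header decompressed
      # later without touching the rest of the stream.
      z_content = c.compress(content_bytes) + c.flush()
      return ('gcb\n%d\n%d\n' % (len(z_header), len(header_bytes))
              + z_header + z_content)

  As long as the reader parses the same marker and length lines, the writer
  stays free to decide whether the content shares the stream or not, which is
  the point of the change.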
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-03-04 21:06:22 +0000
+++ b/groupcompress.py	2009-03-04 21:22:50 +0000
@@ -169,7 +169,8 @@
         pos2 = pos + z_header_length
         z_header_bytes = bytes[pos:pos2]
         assert len(z_header_bytes) == z_header_length
-        header_bytes = zlib.decompress(z_header_bytes)
+        d = zlib.decompressobj()
+        header_bytes = d.decompress(z_header_bytes)
         assert len(header_bytes) == header_length
         del z_header_bytes
         lines = header_bytes.split('\n')
@@ -193,7 +194,8 @@
             info_dict[key] = value
         zcontent = bytes[pos2:]
         if zcontent:
-            out._content = zlib.decompress(zcontent)
+            out._content = d.decompress(zcontent)
+            assert d.flush() == ''
             out._size = header_len + len(out._content)
         return out
 
@@ -228,7 +230,7 @@
         self._entries[key] = entry
         return entry
 
-    def to_bytes(self):
+    def to_bytes(self, content=''):
         """Encode the information into a byte stream."""
         chunks = []
         for key in sorted(self._entries):
@@ -248,11 +250,21 @@
             chunks.append(chunk)
         bytes = ''.join(chunks)
         info_len = len(bytes)
-        z_bytes = zlib.compress(bytes)
+        c = zlib.compressobj()
+        z_bytes = []
+        z_bytes.append(c.compress(bytes))
         del bytes
-        z_len = len(z_bytes)
-        chunks = [self.GCB_HEADER, '%d\n' % (z_len,), '%d\n' % (info_len,),
-                  z_bytes]
+        z_bytes.append(c.flush(zlib.Z_SYNC_FLUSH))
+        z_len = sum(map(len, z_bytes))
+        c_len = len(content)
+        z_bytes.append(c.compress(content))
+        z_bytes.append(c.flush())
+        chunks = [self.GCB_HEADER,
+                  '%d\n' % (z_len,),
+                  '%d\n' % (info_len,),
+                  #'%d\n' % (c_len,),
+                 ]
+        chunks.extend(z_bytes)
         return ''.join(chunks)
 
 
@@ -725,11 +737,10 @@
             #       label in the header is duplicated in the text.
             #       For chk pages and real bytes, I would guess this is not
             #       true.
-            header = self._compressor._block.to_bytes()
-            compressed = zlib.compress(''.join(self._compressor.lines))
-            out = header + compressed
+            bytes = self._compressor._block.to_bytes(
+                ''.join(self._compressor.lines))
             index, start, length = self._access.add_raw_records(
-                [(None, len(out))], out)[0]
+                [(None, len(bytes))], bytes)[0]
             nodes = []
             for key, reads, refs in keys_to_add:
                 nodes.append((key, "%d %d %s" % (start, length, reads), refs))

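On the read side, from_bytes() depends on the writer having sync-flushed after
the header: feeding only the first z_header_length compressed bytes into a
decompressobj yields the complete header, and feeding the remaining bytes into
the same object yields the content (the assert d.flush() == '' in the diff
checks that nothing was left over). A sketch of that round trip against the
simplified layout above (unpack_block is an illustrative name, not part of the
plugin):

  import zlib

  def unpack_block(data):
      # Mirror the writer: marker line, compressed-header length,
      # uncompressed-header length, then the zlib stream itself.
      marker, z_len, info_len, rest = data.split('\n', 3)
      z_len = int(z_len)
      info_len = int(info_len)
      d = zlib.decompressobj()
      # Because of the sync flush, the first z_len bytes decompress to the
      # whole header with nothing held back in the decompressor.
      header_bytes = d.decompress(rest[:z_len])
      assert len(header_bytes) == info_len
      content = d.decompress(rest[z_len:]) + d.flush()
      return header_bytes, content
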

