Rev 8: Try packing more into each index. Seems to cost a bit in CPU, but saves a lot in disk space. (Branch: http://bzr.arbash-meinel.com/plugins/index2)

Tue Jul 1 18:59:21 BST 2008

At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 8
revision-id: john at arbash-meinel.com-20080701175844-z5jnbgf6c1qvyuux
parent: robertc at robertcollins.net-20080701113743-n0pc6x42arqd4zhd
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 12:58:44 -0500
message:
  Try packing more into each index. Seems to cost a bit in CPU, but saves a lot in disk space.
-------------- next part --------------
=== modified file 'chunk_writer.py'

--- a/chunk_writer.py	2008-06-30 23:45:37 +0000
+++ b/chunk_writer.py	2008-07-01 17:58:44 +0000
@@ -36,11 +36,12 @@
     def __init__(self, chunk_size):
         """Create a ChunkWriter to write chunk_size chunks."""
         self.chunk_size = chunk_size
-        self.compressor = zlib.compressobj()
-        self.bytes_list = []
+        # self.compressor = zlib.compressobj()
         self.position = 0
+        self.in_bytes_list = []
         self.seen_bytes = 0
         self.unused_bytes = None
+        self.compressed = None
 
     def finish(self):
         """Finish the chunk.
@@ -48,12 +49,13 @@
         This returns the final compressed chunk, and either None, or the
         bytes that did not fit in the chunk.
         """
-        self.bytes_list.append(self.compressor.flush(Z_FINISH))
-        self.position += len(self.bytes_list[-1])
-        nulls_needed = self.chunk_size - self.position % self.chunk_size
-        if nulls_needed:
-            self.bytes_list.append("\x00" * nulls_needed)
-        return self.bytes_list, self.unused_bytes
+        if self.compressed is None:
+            self.compressed = zlib.compress(''.join(self.in_bytes_list))
+        self.in_bytes_list = None
+
+        nulls_needed = self.chunk_size - len(self.compressed)
+        nulls = '\x00' * nulls_needed
+        return [self.compressed, nulls], self.unused_bytes
 
     def write(self, bytes):
         """Write some bytes to the chunk.
@@ -61,21 +63,38 @@
         If the bytes fit, False is returned. Otherwise True is returned
         and the bytes have not been added to the chunk.
         """
+        if (self.seen_bytes < 1.8 * self.chunk_size):
+            # Just track the data
+            self.in_bytes_list.append(bytes)
+            self.seen_bytes += len(bytes)
+        else:
+            # Try to compress all seen chunks
+            next = self.in_bytes_list + [bytes]
+            compressed = zlib.compress(''.join(next))
+            if len(compressed) > self.chunk_size:
+                self.unused_bytes = bytes
+                return True
+            # The compression succeeded, so stick with it for now
+            self.in_bytes_list = next
+            self.compressed = compressed
+            self.seen_bytes += len(bytes)
+        return False
+
         # Reject content if its likely to fail to fit. The 10 constant is to
         # allow room for the zlib END_STREAM record in the Z_FINISH flush call.
-        if (self.seen_bytes > self.chunk_size and
-            self.position + 10 + len(bytes) > self.chunk_size):
-            self.unused_bytes = bytes
-            return True
-        self.bytes_list.append(self.compressor.compress(bytes))
-        self.position += len(self.bytes_list[-1])
-        self.seen_bytes += len(bytes)
-        # If we are at the end of what we know will fit, flush.
-        if self.seen_bytes > self.chunk_size:
-            # Note: we could strip the \x00\x00\xff\xff and reinsert it in the
-            # reader - see rfc1979. syncing on every call imposes a increase in
-            # compressed size. e.g. 3661 vs 4050 bytes for 40 200 byte rows.
-            self.bytes_list.append(self.compressor.flush(Z_SYNC_FLUSH))
-            self.position += len(self.bytes_list[-1])
-        return False
+        # if (self.seen_bytes > self.chunk_size and
+        #     self.position + 10 + len(bytes) > self.chunk_size):
+        #     self.unused_bytes = bytes
+        #     return True
+        # self.bytes_list.append(self.compressor.compress(bytes))
+        # self.position += len(self.bytes_list[-1])
+        # self.seen_bytes += len(bytes)
+        # # If we are at the end of what we know will fit, flush.
+        # if self.seen_bytes > self.chunk_size:
+        #     # Note: we could strip the \x00\x00\xff\xff and reinsert it in the
+        #     # reader - see rfc1979. syncing on every call imposes a increase in
+        #     # compressed size. e.g. 3661 vs 4050 bytes for 40 200 byte rows.
+        #     self.bytes_list.append(self.compressor.flush(Z_SYNC_FLUSH))
+        #     self.position += len(self.bytes_list[-1])
+        # return False