Rev 8: Try packing more into each index. Seems to cost a bit in CPU, but saves a lot in disk space. in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Tue Jul 1 18:59:21 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 8
revision-id: john at arbash-meinel.com-20080701175844-z5jnbgf6c1qvyuux
parent: robertc at robertcollins.net-20080701113743-n0pc6x42arqd4zhd
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 12:58:44 -0500
message:
Try packing more into each index. Seems to cost a bit in CPU, but saves a lot in disk space.
-------------- next part --------------
=== modified file 'chunk_writer.py'
--- a/chunk_writer.py 2008-06-30 23:45:37 +0000
+++ b/chunk_writer.py 2008-07-01 17:58:44 +0000
@@ -36,11 +36,12 @@
def __init__(self, chunk_size):
"""Create a ChunkWriter to write chunk_size chunks."""
self.chunk_size = chunk_size
- self.compressor = zlib.compressobj()
- self.bytes_list = []
+ # self.compressor = zlib.compressobj()
self.position = 0
+ self.in_bytes_list = []
self.seen_bytes = 0
self.unused_bytes = None
+ self.compressed = None
def finish(self):
"""Finish the chunk.
@@ -48,12 +49,13 @@
This returns the final compressed chunk, and either None, or the
bytes that did not fit in the chunk.
"""
- self.bytes_list.append(self.compressor.flush(Z_FINISH))
- self.position += len(self.bytes_list[-1])
- nulls_needed = self.chunk_size - self.position % self.chunk_size
- if nulls_needed:
- self.bytes_list.append("\x00" * nulls_needed)
- return self.bytes_list, self.unused_bytes
+ if self.compressed is None:
+ self.compressed = zlib.compress(''.join(self.in_bytes_list))
+ self.in_bytes_list = None
+
+ nulls_needed = self.chunk_size - len(self.compressed)
+ nulls = '\x00' * nulls_needed
+ return [self.compressed, nulls], self.unused_bytes
def write(self, bytes):
"""Write some bytes to the chunk.
@@ -61,21 +63,38 @@
If the bytes fit, False is returned. Otherwise True is returned
and the bytes have not been added to the chunk.
"""
+ if (self.seen_bytes < 1.8 * self.chunk_size):
+ # Just track the data
+ self.in_bytes_list.append(bytes)
+ self.seen_bytes += len(bytes)
+ else:
+ # Try to compress all seen chunks
+ next = self.in_bytes_list + [bytes]
+ compressed = zlib.compress(''.join(next))
+ if len(compressed) > self.chunk_size:
+ self.unused_bytes = bytes
+ return True
+ # The compression succeeded, so stick with it for now
+ self.in_bytes_list = next
+ self.compressed = compressed
+ self.seen_bytes += len(bytes)
+ return False
+
# Reject content if it's likely to fail to fit. The 10 constant is to
# allow room for the zlib END_STREAM record in the Z_FINISH flush call.
- if (self.seen_bytes > self.chunk_size and
- self.position + 10 + len(bytes) > self.chunk_size):
- self.unused_bytes = bytes
- return True
- self.bytes_list.append(self.compressor.compress(bytes))
- self.position += len(self.bytes_list[-1])
- self.seen_bytes += len(bytes)
- # If we are at the end of what we know will fit, flush.
- if self.seen_bytes > self.chunk_size:
- # Note: we could strip the \x00\x00\xff\xff and reinsert it in the
- # reader - see rfc1979. syncing on every call imposes a increase in
- # compressed size. e.g. 3661 vs 4050 bytes for 40 200 byte rows.
- self.bytes_list.append(self.compressor.flush(Z_SYNC_FLUSH))
- self.position += len(self.bytes_list[-1])
- return False
+ # if (self.seen_bytes > self.chunk_size and
+ # self.position + 10 + len(bytes) > self.chunk_size):
+ # self.unused_bytes = bytes
+ # return True
+ # self.bytes_list.append(self.compressor.compress(bytes))
+ # self.position += len(self.bytes_list[-1])
+ # self.seen_bytes += len(bytes)
+ # # If we are at the end of what we know will fit, flush.
+ # if self.seen_bytes > self.chunk_size:
+ # # Note: we could strip the \x00\x00\xff\xff and reinsert it in the
+ # # reader - see rfc1979. syncing on every call imposes a increase in
+ # # compressed size. e.g. 3661 vs 4050 bytes for 40 200 byte rows.
+ # self.bytes_list.append(self.compressor.flush(Z_SYNC_FLUSH))
+ # self.position += len(self.bytes_list[-1])
+ # return False
More information about the bazaar-commits
mailing list