Rev 3657: Somewhat surprisingly, tracking bytes_out_len makes a large difference in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree
John Arbash Meinel
john at arbash-meinel.com
Thu Aug 21 20:53:55 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree
------------------------------------------------------------
revno: 3657
revision-id: john at arbash-meinel.com-20080821195353-1q7mrhcjqerr5rhh
parent: john at arbash-meinel.com-20080821193558-0a4qni76jso98gxn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: btree
timestamp: Thu 2008-08-21 14:53:53 -0500
message:
Somewhat surprisingly, tracking bytes_out_len makes a
large difference in performance. It drops the three_level test time
from 9.2s => 8.2s. My best guess is that when adding with Z_SYNC_FLUSH
we end up with a *lot* of small strings in bytes_list, and we were
summing the length of the whole list each time we added another string.
Real-world tests show an improvement, too.
For mysql, repack=2,nocopy time drops 59.3s => 57.4s.
For bzr.dev, repack=2,nocopy time drops 9.6s => 9.3s.
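The core of the bookkeeping change, as a minimal illustrative sketch
(a toy class with made-up names, not the actual bzrlib code):

class LengthTracker(object):
    """Toy stand-in for ChunkWriter's bytes_list bookkeeping."""

    def __init__(self, chunk_size):
        self.chunk_size = chunk_size
        self.bytes_list = []     # compressed fragments emitted so far
        self.bytes_out_len = 0   # running total of their lengths

    def append(self, out):
        if out:
            self.bytes_list.append(out)
            self.bytes_out_len += len(out)   # O(1) update per fragment

    def over_budget(self, slack=10):
        # The old approach recomputed the total on every check, walking
        # the whole fragment list each time:
        #     total_len = sum(map(len, self.bytes_list))
        # With Z_SYNC_FLUSH producing many tiny fragments, that sum is
        # O(fragments) work per write, i.e. quadratic overall.
        return self.bytes_out_len + slack > self.chunk_size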
modified:
bzrlib/btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
bzrlib/chunk_writer.py chunk_writer.py-20080630234519-6ggn4id17nipovny-1
bzrlib/tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2008-08-20 22:12:00 +0000
+++ b/bzrlib/btree_index.py 2008-08-21 19:53:53 +0000
@@ -372,7 +372,6 @@
copied_len = osutils.pumpfile(row.spool, result)
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
if type(row) != _LeafBuilderRow:
- import pdb;pdb.set_trace()
raise AssertionError("Not enough data copied")
result.flush()
size = result.tell()
=== modified file 'bzrlib/chunk_writer.py'
--- a/bzrlib/chunk_writer.py 2008-08-21 19:35:58 +0000
+++ b/bzrlib/chunk_writer.py 2008-08-21 19:53:53 +0000
@@ -79,6 +79,7 @@
self.compressor = zlib.compressobj()
self.bytes_in = []
self.bytes_list = []
+ self.bytes_out_len = 0
self.compressed = None
self.seen_bytes = 0
self.num_repack = 0
@@ -93,13 +94,14 @@
bytes that did not fit in the chunk.
"""
self.bytes_in = None # Free the data cached so far, we don't need it
- self.bytes_list.append(self.compressor.flush(Z_FINISH))
- total_len = sum(map(len, self.bytes_list))
- if total_len > self.chunk_size:
+ out = self.compressor.flush(Z_FINISH)
+ self.bytes_list.append(out)
+ self.bytes_out_len += len(out)
+ if self.bytes_out_len > self.chunk_size:
raise AssertionError('Somehow we ended up with too much'
' compressed data, %d > %d'
- % (total_len, self.chunk_size))
- nulls_needed = self.chunk_size - total_len % self.chunk_size
+ % (self.bytes_out_len, self.chunk_size))
+ nulls_needed = self.chunk_size - self.bytes_out_len % self.chunk_size
if nulls_needed:
self.bytes_list.append("\x00" * nulls_needed)
return self.bytes_list, self.unused_bytes, nulls_needed
@@ -123,20 +125,21 @@
"""
compressor = zlib.compressobj()
bytes_out = []
+ bytes_out_len = 0
append = bytes_out.append
compress = compressor.compress
for accepted_bytes in self.bytes_in:
out = compress(accepted_bytes)
if out:
append(out)
+ bytes_out_len += len(out)
if extra_bytes:
out = compress(extra_bytes)
- if out:
- append(out)
- out = compressor.flush(Z_SYNC_FLUSH)
- if out:
- append(out)
- return bytes_out, compressor
+ out += compressor.flush(Z_SYNC_FLUSH)
+ if out:
+ append(out)
+ bytes_out_len += len(out)
+ return bytes_out, bytes_out_len, compressor
def write(self, bytes):
"""Write some bytes to the chunk.
@@ -170,6 +173,7 @@
out = self.compressor.compress(bytes)
if out:
self.bytes_list.append(out)
+ self.bytes_out_len += len(out)
self.bytes_in.append(bytes)
self.seen_bytes = next_seen_size
else:
@@ -181,22 +185,23 @@
out += self.compressor.flush(Z_SYNC_FLUSH)
if out:
self.bytes_list.append(out)
- total_len = sum(map(len, self.bytes_list))
- # total_len + 10 is to give some room for Z_FINISH
- if total_len + 10 > capacity:
+ self.bytes_out_len += len(out)
+ if self.bytes_out_len + 10 > capacity:
# We are over budget, try to squeeze this in without any
# Z_SYNC_FLUSH calls
self.num_repack += 1
- bytes_out, compressor = self._recompress_all_bytes_in(bytes)
- this_len = sum(map(len, bytes_out))
- if this_len is None or this_len + 10 > capacity:
+ (bytes_out, this_len,
+ compressor) = self._recompress_all_bytes_in(bytes)
+ if this_len + 10 > capacity:
# No way we can add anymore, we need to re-pack because our
# compressor is now out of sync.
# This seems to be rarely triggered over
# num_repack > _max_repack
- bytes_out, compressor = self._recompress_all_bytes_in()
+ (bytes_out, this_len,
+ compressor) = self._recompress_all_bytes_in()
self.compressor = compressor
self.bytes_list = bytes_out
+ self.bytes_out_len = this_len
self.unused_bytes = bytes
return True
else:
@@ -206,6 +211,7 @@
self.compressor = compressor
self.bytes_in.append(bytes)
self.bytes_list = bytes_out
+ self.bytes_out_len = this_len
else:
# It fit, so mark it added
self.bytes_in.append(bytes)
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2008-08-20 23:11:59 +0000
+++ b/bzrlib/tests/test_btree_index.py 2008-08-21 19:53:53 +0000
@@ -293,7 +293,6 @@
index = btree_index.BTreeGraphIndex(transport, 'index', size)
# Seed the metadata, we're using internal calls now.
index.key_count()
- print '\n',index._row_lengths
self.assertEqual(3, len(index._row_lengths),
"Not enough rows: %r" % index._row_lengths)
self.assertEqual(4, len(index._row_offsets))
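For context, a rough usage sketch of ChunkWriter as the patch implies it
behaves; the constructor argument and the write()/finish() return values
here are assumptions read off the diff, not verified against this
revision:

from bzrlib.chunk_writer import ChunkWriter

writer = ChunkWriter(4096)   # assumed: target page size in bytes
for line in ['row-%d some value\n' % i for i in range(500)]:
    if writer.write(line):
        # As the diff suggests, write() returns True when the string did
        # not fit; the string is kept in unused_bytes for the next chunk.
        break
byte_chunks, unused, padding = writer.finish()
# finish() pads with NUL bytes so the joined output fills whole pages.
assert sum(map(len, byte_chunks)) % 4096 == 0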