Rev 3652: Start working on an alternate way to track compressed_chunk state. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree
John Arbash Meinel
john at arbash-meinel.com
Wed Aug 20 23:12:09 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree
------------------------------------------------------------
revno: 3652
revision-id: john at arbash-meinel.com-20080820221200-1pnk090rjwbydavn
parent: john at arbash-meinel.com-20080820205738-0tb90jyr27jhv51x
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: btree
timestamp: Wed 2008-08-20 17:12:00 -0500
message:
Start working on an alternate way to track compressed_chunk state.
I tried doing a copy() before flush(), but that is quite expensive —
both because it now performs the copy() and because it then does a bigger flush.
Next I'll try tracking 2 objects.
modified:
bzrlib/btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
bzrlib/chunk_writer.py chunk_writer.py-20080630234519-6ggn4id17nipovny-1
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2008-08-20 20:57:38 +0000
+++ b/bzrlib/btree_index.py 2008-08-20 22:12:00 +0000
@@ -239,23 +239,22 @@
except StopIteration:
current_values[pos] = None
- def _add_key(self, string_key, key_line, rows):
+ def _add_key(self, string_key, line, rows):
"""Add a key to the current chunk.
:param string_key: The key to add.
- :param key_line: The fully serialised key and value.
+ :param line: The fully serialised key and value.
"""
if rows[-1].writer is None:
# opening a new leaf chunk;
for pos, internal_row in enumerate(rows[:-1]):
# flesh out any internal nodes that are needed to
- # preserve the high of the tree
+ # preserve the height of the tree
if internal_row.writer is None:
length = _PAGE_SIZE
if internal_row.nodes == 0:
length -= _RESERVED_HEADER_BYTES # padded
- internal_row.writer = chunk_writer.ChunkWriter(
- length, 0)
+ internal_row.writer = chunk_writer.ChunkWriter(length, 0)
internal_row.writer.write(_INTERNAL_FLAG)
internal_row.writer.write(_INTERNAL_OFFSET +
str(rows[pos + 1].nodes) + "\n")
@@ -265,16 +264,16 @@
length -= _RESERVED_HEADER_BYTES # padded
rows[-1].writer = chunk_writer.ChunkWriter(length)
rows[-1].writer.write(_LEAF_FLAG)
- if rows[-1].writer.write(key_line):
+ if rows[-1].writer.write(line):
# this key did not fit in the node:
rows[-1].finish_node()
- next_key_line = string_key + "\n"
+ key_line = string_key + "\n"
new_row = True
for row in reversed(rows[:-1]):
# Mark the start of the next node in the node above. If it
# doesn't fit then propogate upwards until we find one that
# it does fit into.
- if row.writer.write(next_key_line):
+ if row.writer.write(key_line):
row.finish_node()
else:
# We've found a node that can handle the pointer.
@@ -296,8 +295,8 @@
new_row.writer.write(_INTERNAL_FLAG)
new_row.writer.write(_INTERNAL_OFFSET +
str(rows[1].nodes - 1) + "\n")
- new_row.writer.write(next_key_line)
- self._add_key(string_key, key_line, rows)
+ new_row.writer.write(key_line)
+ self._add_key(string_key, line, rows)
def _write_nodes(self, node_iterator):
"""Write node_iterator out as a B+Tree.
@@ -326,6 +325,12 @@
# First key triggers the first row
rows.append(_LeafBuilderRow())
key_count += 1
+ # TODO: Flattening the node into a string key and a line should
+ # probably be put into a pyrex function. We can do a quick
+ # iter over all the entries to determine the final length,
+ # and then do a single malloc() rather than lots of
+ # intermediate mallocs as we build everything up.
+ # ATM 3 / 13s are spent flattening nodes (10s is compressing)
if self.reference_lists:
flattened_references = ['\r'.join(['\x00'.join(reference)
for reference in ref_list])
=== modified file 'bzrlib/chunk_writer.py'
--- a/bzrlib/chunk_writer.py 2008-08-20 19:34:29 +0000
+++ b/bzrlib/chunk_writer.py 2008-08-20 22:12:00 +0000
@@ -64,6 +64,7 @@
self.unused_bytes = None
self.reserved_size = reserved
self.min_compress_size = self._default_min_compression_size
+ self.num_zsync = 0
def finish(self):
"""Finish the chunk.
@@ -107,6 +108,9 @@
If the bytes fit, False is returned. Otherwise True is returned
and the bytes have not been added to the chunk.
"""
+ # TODO: lsprof claims that we spend 0.4/10s in calls to write just to
+ # thunk over to _write. We don't seem to need write_reserved
+ # unless we have blooms, so this *might* be worth removing
return self._write(bytes, False)
def write_reserved(self, bytes):
@@ -128,10 +132,10 @@
if (next_seen_size < self.min_compress_size * capacity):
# No need, we assume this will "just fit"
out = self.compressor.compress(bytes)
+ if out:
+ self.bytes_list.append(out)
self.bytes_in.append(bytes)
self.seen_bytes = next_seen_size
- if out:
- self.bytes_list.append(out)
else:
if not reserved and self.num_repack >= self._max_repack:
# We have packed too many times already.
@@ -143,6 +147,7 @@
out = self.compressor.flush(Z_SYNC_FLUSH)
if out:
self.bytes_list.append(out)
+ self.num_zsync += 1
# TODO: We may want to cache total_len, as the 'sum' call seems to
# be showing up a bit on lsprof output
total_len = sum(map(len, self.bytes_list))
@@ -160,12 +165,16 @@
self.compressor = compressor
self.bytes_list = bytes_out
self.unused_bytes = bytes
+ self.num_zsync = 0
return True
else:
# This fits when we pack it tighter, so use the new packing
self.compressor = compressor
self.bytes_in.append(bytes)
self.bytes_list = bytes_out
+ # There is one Z_SYNC_FLUSH call in
+ # _recompress_all_bytes_in
+ self.num_zsync = 1
else:
# It fit, so mark it added
self.bytes_in.append(bytes)
More information about the bazaar-commits
mailing list