Rev 3652: Start working on an alternate way to track compressed_chunk state. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree

John Arbash Meinel john at arbash-meinel.com
Wed Aug 20 23:12:09 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree

------------------------------------------------------------
revno: 3652
revision-id: john at arbash-meinel.com-20080820221200-1pnk090rjwbydavn
parent: john at arbash-meinel.com-20080820205738-0tb90jyr27jhv51x
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: btree
timestamp: Wed 2008-08-20 17:12:00 -0500
message:
  Start working on an alternate way to track compressed_chunk state.
  I tried doing a copy() before flush(), but that is quite expensive:
  it now does a copy(), and it also does a bigger flush.
  Next I'll try tracking 2 objects.
modified:
  bzrlib/btree_index.py          index.py-20080624222253-p0x5f92uyh5hw734-7
  bzrlib/chunk_writer.py         chunk_writer.py-20080630234519-6ggn4id17nipovny-1
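
For context on the copy()-before-flush approach mentioned in the message above, here is a
minimal sketch (not bzrlib code; the helper name and the 4096-byte page size are only
illustrative) of snapshotting a zlib compressor so that a Z_SYNC_FLUSH used to measure the
current compressed size can be thrown away afterwards:

    import zlib

    compressor = zlib.compressobj()
    chunks = []          # compressed output already emitted for this chunk

    def speculative_size(data):
        # Hypothetical helper: what would the chunk size be if `data` were
        # added?  copy() duplicates the whole zlib state, and the trailing
        # Z_SYNC_FLUSH has to emit everything still buffered -- the two
        # costs the commit message calls out.
        trial = compressor.copy()
        out = trial.compress(data) + trial.flush(zlib.Z_SYNC_FLUSH)
        return sum(len(c) for c in chunks) + len(out)

    data = b"row-key\x00value\n"
    if speculative_size(data) <= 4096:      # illustrative page size
        chunks.append(compressor.compress(data))
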
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2008-08-20 20:57:38 +0000
+++ b/bzrlib/btree_index.py	2008-08-20 22:12:00 +0000
@@ -239,23 +239,22 @@
             except StopIteration:
                 current_values[pos] = None
 
-    def _add_key(self, string_key, key_line, rows):
+    def _add_key(self, string_key, line, rows):
         """Add a key to the current chunk.
 
         :param string_key: The key to add.
-        :param key_line: The fully serialised key and value.
+        :param line: The fully serialised key and value.
         """
         if rows[-1].writer is None:
             # opening a new leaf chunk;
             for pos, internal_row in enumerate(rows[:-1]):
                 # flesh out any internal nodes that are needed to
-                # preserve the high of the tree
+                # preserve the height of the tree
                 if internal_row.writer is None:
                     length = _PAGE_SIZE
                     if internal_row.nodes == 0:
                         length -= _RESERVED_HEADER_BYTES # padded
-                    internal_row.writer = chunk_writer.ChunkWriter(
-                        length, 0)
+                    internal_row.writer = chunk_writer.ChunkWriter(length, 0)
                     internal_row.writer.write(_INTERNAL_FLAG)
                     internal_row.writer.write(_INTERNAL_OFFSET +
                         str(rows[pos + 1].nodes) + "\n")
@@ -265,16 +264,16 @@
                 length -= _RESERVED_HEADER_BYTES # padded
             rows[-1].writer = chunk_writer.ChunkWriter(length)
             rows[-1].writer.write(_LEAF_FLAG)
-        if rows[-1].writer.write(key_line):
+        if rows[-1].writer.write(line):
             # this key did not fit in the node:
             rows[-1].finish_node()
-            next_key_line = string_key + "\n"
+            key_line = string_key + "\n"
             new_row = True
             for row in reversed(rows[:-1]):
                 # Mark the start of the next node in the node above. If it
                 # doesn't fit then propogate upwards until we find one that
                 # it does fit into.
-                if row.writer.write(next_key_line):
+                if row.writer.write(key_line):
                     row.finish_node()
                 else:
                     # We've found a node that can handle the pointer.
@@ -296,8 +295,8 @@
                 new_row.writer.write(_INTERNAL_FLAG)
                 new_row.writer.write(_INTERNAL_OFFSET +
                     str(rows[1].nodes - 1) + "\n")
-                new_row.writer.write(next_key_line)
-            self._add_key(string_key, key_line, rows)
+                new_row.writer.write(key_line)
+            self._add_key(string_key, line, rows)
 
     def _write_nodes(self, node_iterator):
         """Write node_iterator out as a B+Tree.
@@ -326,6 +325,12 @@
                 # First key triggers the first row
                 rows.append(_LeafBuilderRow())
             key_count += 1
+            # TODO: Flattening the node into a string key and a line should
+            #       probably be put into a pyrex function. We can do a quick
+            #       iter over all the entries to determine the final length,
+            #       and then do a single malloc() rather than lots of
+            #       intermediate mallocs as we build everything up.
+            #       ATM 3 / 13s are spent flattening nodes (10s is compressing)
             if self.reference_lists:
                 flattened_references = ['\r'.join(['\x00'.join(reference)
                                                    for reference in ref_list])

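As a rough illustration of the flattening the new TODO above refers to, a hypothetical
pure-Python helper might look like the following (the '\x00' and '\r' separators come from
the surrounding context; the '\t' between reference lists and the final line layout are
assumptions, not the actual bzrlib serialiser):

    def flatten_node(key, value, ref_lists):
        # Hypothetical sketch only.  A pyrex version could first sum the
        # lengths of all the pieces and allocate the result once, instead
        # of building intermediate strings with repeated join() calls.
        string_key = '\x00'.join(key)
        flattened_references = '\t'.join(            # assumed list separator
            '\r'.join('\x00'.join(reference) for reference in ref_list)
            for ref_list in ref_lists)
        line = "%s\x00%s\x00%s\n" % (string_key, flattened_references, value)
        return string_key, line
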
=== modified file 'bzrlib/chunk_writer.py'
--- a/bzrlib/chunk_writer.py	2008-08-20 19:34:29 +0000
+++ b/bzrlib/chunk_writer.py	2008-08-20 22:12:00 +0000
@@ -64,6 +64,7 @@
         self.unused_bytes = None
         self.reserved_size = reserved
         self.min_compress_size = self._default_min_compression_size
+        self.num_zsync = 0
 
     def finish(self):
         """Finish the chunk.
@@ -107,6 +108,9 @@
         If the bytes fit, False is returned. Otherwise True is returned
         and the bytes have not been added to the chunk.
         """
+        # TODO: lsprof claims that we spend 0.4/10s in calls to write just to
+        #       thunk over to _write. We don't seem to need write_reserved
+        #       unless we have blooms, so this *might* be worth removing
         return self._write(bytes, False)
 
     def write_reserved(self, bytes):
@@ -128,10 +132,10 @@
         if (next_seen_size < self.min_compress_size * capacity):
             # No need, we assume this will "just fit"
             out = self.compressor.compress(bytes)
+            if out:
+                self.bytes_list.append(out)
             self.bytes_in.append(bytes)
             self.seen_bytes = next_seen_size
-            if out:
-                self.bytes_list.append(out)
         else:
             if not reserved and self.num_repack >= self._max_repack:
                 # We have packed too many times already.
@@ -143,6 +147,7 @@
             out = self.compressor.flush(Z_SYNC_FLUSH)
             if out:
                 self.bytes_list.append(out)
+            self.num_zsync += 1
             # TODO: We may want to cache total_len, as the 'sum' call seems to
             #       be showing up a bit on lsprof output
             total_len = sum(map(len, self.bytes_list))
@@ -160,12 +165,16 @@
                     self.compressor = compressor
                     self.bytes_list = bytes_out
                     self.unused_bytes = bytes
+                    self.num_zsync = 0
                     return True
                 else:
                     # This fits when we pack it tighter, so use the new packing
                     self.compressor = compressor
                     self.bytes_in.append(bytes)
                     self.bytes_list = bytes_out
+                    # There is one Z_SYNC_FLUSH call in
+                    # _recompress_all_bytes_in
+                    self.num_zsync = 1
             else:
                 # It fit, so mark it added
                 self.bytes_in.append(bytes)

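Putting the chunk_writer.py pieces together: instead of copying the compressor, the writer
now just counts Z_SYNC_FLUSH calls. A condensed, standalone sketch of that bookkeeping
(heavily simplified from the diff above -- no reserved bytes, no min_compress_size
short-cut, and the repack path is inlined rather than calling _recompress_all_bytes_in):

    import zlib

    class SketchChunkWriter(object):
        """Simplified sketch, not the real bzrlib ChunkWriter."""

        _page_size = 4096

        def __init__(self):
            self.compressor = zlib.compressobj()
            self.bytes_in = []     # raw data accepted so far
            self.bytes_list = []   # compressed output emitted so far
            self.num_repack = 0
            self.num_zsync = 0     # Z_SYNC_FLUSH calls on the current compressor

        def write(self, data):
            """Return True if `data` did not fit and was rejected."""
            out = self.compressor.compress(data)
            if out:
                self.bytes_list.append(out)
            # Sync-flush so the compressed size so far is exactly measurable.
            out = self.compressor.flush(zlib.Z_SYNC_FLUSH)
            if out:
                self.bytes_list.append(out)
            self.num_zsync += 1
            if sum(map(len, self.bytes_list)) > self._page_size:
                # Overflow: repack everything accepted so far (without the
                # new data) into a fresh compressor and reject the write.
                self.num_repack += 1
                self.compressor = zlib.compressobj()
                self.bytes_list = [self.compressor.compress(b)
                                   for b in self.bytes_in]
                self.bytes_list.append(self.compressor.flush(zlib.Z_SYNC_FLUSH))
                self.num_zsync = 1   # the repack itself did one sync flush
                return True
            self.bytes_in.append(data)
            return False

    writer = SketchChunkWriter()
    assert writer.write(b"key\x00value\n") is False   # fits; False means "added"
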

