Rev 3673: Update the stats for the current code layout. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree

John Arbash Meinel john at arbash-meinel.com
Fri Aug 22 04:58:21 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree

------------------------------------------------------------
revno: 3673
revision-id: john at arbash-meinel.com-20080822035819-yx19e7qxdvjgaeql
parent: john at arbash-meinel.com-20080822022908-420tr0519tdz6pxy
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: btree
timestamp: Thu 2008-08-21 22:58:19 -0500
message:
  Update the stats for the current code layout.
  This shows why I like _max_repack=2 so much: it is
  the highest value with no waste (its buffer_full count
  stays at 0 in the tables below).
  At _max_repack=2 you can always sneak in 1 more
  line, which avoids triggering an extra repack.
  Also, update the timings to reflect the current tuning.
modified:
  bzrlib/chunk_writer.py         chunk_writer.py-20080630234519-6ggn4id17nipovny-1
  bzrlib/tests/test_chunk_writer.py test_chunk_writer.py-20080630234519-6ggn4id17nipovny-2
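
For anyone skimming the diff below, the 'sneak in 1 more line' trick is
just a counter bump. A minimal standalone sketch of the idea (illustrative
only -- SketchWriter, the 10-byte slop, and the capacity handling are all
simplified stand-ins, not the real ChunkWriter):

    import zlib
    from zlib import Z_SYNC_FLUSH

    class SketchWriter(object):
        """Toy fixed-size chunk packer, not bzrlib's ChunkWriter."""

        def __init__(self, capacity, max_repack=2):
            self.capacity = capacity
            self._max_repack = max_repack
            self.num_repack = 0
            self.lines_in = []
            self.out_len = 0
            self.compressor = zlib.compressobj()

        def _recompress(self, extra=None):
            # One stream, one Z_SYNC_FLUSH: packs tighter than the
            # per-line sync flushes accumulated in write().
            lines = list(self.lines_in)
            if extra is not None:
                lines.append(extra)
            comp = zlib.compressobj()
            out = comp.compress(''.join(lines)) + comp.flush(Z_SYNC_FLUSH)
            return comp, len(out)

        def write(self, line):
            """Return True when the chunk is full and line was rejected."""
            if self.num_repack > self._max_repack:
                return True        # past the repack budget, reject at once
            out = (self.compressor.compress(line)
                   + self.compressor.flush(Z_SYNC_FLUSH))
            self.out_len += len(out)
            if self.out_len + 10 <= self.capacity:  # 10 = room for Z_FINISH
                self.lines_in.append(line)
                return False
            # Over budget: spend one repack trying to squeeze the line in.
            self.num_repack += 1
            comp, new_len = self._recompress(extra=line)
            if self.num_repack >= self._max_repack:
                # Bump *past* the limit: the early-out above fires on the
                # next call, but this line may still fit right below. With
                # _max_repack=2 that final line rides in for free, so no
                # space is wasted.
                self.num_repack += 1
            if new_len + 10 <= self.capacity:
                self.compressor, self.out_len = comp, new_len
                self.lines_in.append(line)
                return False
            # Even repacked it will not fit; restore a consistent stream.
            self.compressor, self.out_len = self._recompress()
            return True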
-------------- next part --------------
=== modified file 'bzrlib/chunk_writer.py'
--- a/bzrlib/chunk_writer.py	2008-08-22 02:09:36 +0000
+++ b/bzrlib/chunk_writer.py	2008-08-22 03:58:19 +0000
@@ -20,6 +20,7 @@
 import zlib
 from zlib import Z_FINISH, Z_SYNC_FLUSH
 
+_stats = [0, 0, 0]
 
 class ChunkWriter(object):
     """ChunkWriter allows writing of compressed data with a fixed size.
@@ -38,23 +39,21 @@
         number of times we will try.
         In testing, some values for bzr.dev::
 
-                    w/o copy    w/ copy     w/ copy ins w/ copy & save
-            repack  time  MB    time  MB    time  MB    time  MB
-             1       8.8  5.1    8.9  5.1    9.6  4.4   12.5  4.1
-             2       9.6  4.4   10.1  4.3   10.4  4.2   11.1  4.1
-             3      10.6  4.2   11.1  4.1   11.2  4.1   11.3  4.1
-             4      12.0  4.1
-             5      12.6  4.1
-            20      12.9  4.1   12.2  4.1   12.3  4.1
+            repack  time  MB    hit_max_repack  buffer_full
+             1       7.9  5.1   1268            0
+             2       8.8  4.4   1069            0
+             3       9.7  4.2   1022            46
+             4      11.1  4.1   974             619
+            20      11.9  4.1   0               1012
 
         In testing, some values for mysql-unpacked::
 
-                    w/o copy    w/ copy     w/ copy ins w/ copy & save
-            repack  time  MB    time  MB    time  MB    time  MB
-             1      56.6  16.9              60.7  14.2
-             2      59.3  14.1              62.6  13.5  64.3  13.4
-             3      64.4  13.5
-            20      73.4  13.4
+            repack  time  MB    hit_max_repack  buffer_full
+             1      52.4  16.9  4295            0
+             2      55.8  14.1  3561            0
+             3      60.3  13.5  3407            197
+             4      66.7  13.4  3203            2154
+            20      69.3  13.4  0               3380
 
     :cvar _default_min_compression_size: The expected minimum compression.
         While packing nodes into the page, we won't Z_SYNC_FLUSH until we have
@@ -162,8 +161,8 @@
             self.bytes_in.append(bytes)
             self.seen_bytes = next_seen_size
         else:
-            if self.num_repack >= self._max_repack and not reserved:
-                # We already know we don't want to try to fit more
+            if self.num_repack > self._max_repack and not reserved:
+                self.unused_bytes = bytes
                 return True
             # This may or may not fit, try to add it with Z_SYNC_FLUSH
             out = comp.compress(bytes)
@@ -171,17 +170,27 @@
             if out:
                 self.bytes_list.append(out)
                 self.bytes_out_len += len(out)
-            if self.bytes_out_len + 10 > capacity:
+            if self.bytes_out_len + 10 <= capacity:
+                # It fit, so mark it added
+                self.bytes_in.append(bytes)
+                self.seen_bytes = next_seen_size
+            else:
                 # We are over budget, try to squeeze this in without any
                 # Z_SYNC_FLUSH calls
                 self.num_repack += 1
-                bytes_out, this_len, compressor = self._recompress_all_bytes_in(bytes)
+                (bytes_out, this_len,
+                 compressor) = self._recompress_all_bytes_in(bytes)
+                if self.num_repack >= self._max_repack:
+                    # When we reach _max_repack, bump past it so that the
+                    # earlier 'num_repack > _max_repack' check fires next call.
+                    self.num_repack += 1
+                    _stats[0] += 1
                 if this_len + 10 > capacity:
-                    # No way we can add anymore, we need to re-pack because our
-                    # compressor is now out of sync.
-                    # This seems to be rarely triggered over
-                    #   num_repack > _max_repack
-                    bytes_out, this_len, compressor = self._recompress_all_bytes_in()
+                    # In real-world testing, this only happens when _max_repack
+                    # is set >2, and even then rarely (46 out of 1022)
+                    (bytes_out, this_len,
+                     compressor) = self._recompress_all_bytes_in()
+                    _stats[1] += 1
                     self.compressor = compressor
                     self.bytes_list = bytes_out
                     self.bytes_out_len = this_len
@@ -191,13 +200,10 @@
                     # This fits when we pack it tighter, so use the new packing
                     # There is one Z_SYNC_FLUSH call in
                     # _recompress_all_bytes_in
+                    _stats[2] += 1
                     self.compressor = compressor
                     self.bytes_in.append(bytes)
                     self.bytes_list = bytes_out
                     self.bytes_out_len = this_len
-            else:
-                # It fit, so mark it added
-                self.bytes_in.append(bytes)
-                self.seen_bytes = next_seen_size
         return False
 

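The new module-level _stats list lines up with the table columns in the
docstring above: _stats[0] is hit_max_repack, _stats[1] is buffer_full,
and _stats[2] counts repacks that did squeeze the line in. Assuming the
ChunkWriter/finish() signatures used in the test below, the counts were
presumably gathered with a loop along these lines (the sample data here
is made up):

    from bzrlib import chunk_writer

    lines = ['%08d some moderately compressible text\n' % i
             for i in xrange(50000)]
    writer = chunk_writer.ChunkWriter(4096, 256)
    for line in lines:
        if writer.write(line):
            break                      # this 4096-byte page is full
    bytes_list, unused, _ = writer.finish()
    # After packing a whole index worth of pages:
    #   [hit_max_repack, buffer_full, repack-fit]
    print chunk_writer._stats
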
=== modified file 'bzrlib/tests/test_chunk_writer.py'
--- a/bzrlib/tests/test_chunk_writer.py	2008-08-22 02:09:36 +0000
+++ b/bzrlib/tests/test_chunk_writer.py	2008-08-22 03:58:19 +0000
@@ -78,9 +78,12 @@
             # Create a line with this group
             lines.append(''.join(map(str, numbers)) + '\n')
         writer = chunk_writer.ChunkWriter(4096, 256)
-        for line in lines:
+        for idx, line in enumerate(lines):
             if writer.write(line):
+                self.assertEqual(44, idx)
                 break
+        else:
+            self.fail('We were able to write all lines')
         self.assertFalse(writer.write("A"*256, reserved=True))
         bytes_list, unused, _ = writer.finish()
         node_bytes = self.check_chunk(bytes_list, 4096)
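
One small Python subtlety in the test change above: the new else clause
belongs to the for loop, not to the if. A for-loop's else suite runs only
when the loop finishes without hitting break, so it doubles as a guard
that fails the test if the writer never fills up. In isolation:

    for line in ['a', 'b', 'c']:
        if line == 'z':       # never true, so we never break
            break
    else:
        print 'loop completed without break'   # this runs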


