Rev 3690: Clean out the global state, good for prototyping and tuning, bad for production code. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree

John Arbash Meinel john at arbash-meinel.com
Thu Aug 28 02:59:59 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/btree

------------------------------------------------------------
revno: 3690
revision-id: john at arbash-meinel.com-20080828015958-bvdt8spf2ls57s39
parent: john at arbash-meinel.com-20080826005610-275jq9uqje3prqry
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: btree
timestamp: Wed 2008-08-27 20:59:58 -0500
message:
  Clean out the global state, good for prototyping and tuning, bad for production code.
  (as recommended by Robert)
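
For reference, the state being cleaned out is a set of module-level counters
(leaf_value_hits, internal_node_hits, miss_attempts, the chunk_writer _stats
list, and so on) that instance methods mutated while profiling. A minimal
sketch of equivalent per-instance bookkeeping, using a hypothetical
_HitCounter helper that is not part of bzrlib::

    class _HitCounter(object):
        """Optional hit/miss counter for tuning runs."""

        def __init__(self):
            self.hits = 0
            self.misses = 0

        def record(self, hit):
            if hit:
                self.hits += 1
            else:
                self.misses += 1

During a tuning run an index or writer could be handed one of these; in
production the attribute stays None and the lookup code skips the
bookkeeping entirely.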
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2008-08-22 02:18:27 +0000
+++ b/bzrlib/btree_index.py	2008-08-28 01:59:58 +0000
@@ -53,13 +53,6 @@
 # 4K per page: 4MB - 1000 entries
 _NODE_CACHE_SIZE = 1000
 
-leaf_value_hits = [0, 0]
-internal_node_hits = [0, 0]
-leaf_node_hits = [0, 0]
-miss_attempts = 0  # Missed this entry while looking up
-bisect_shortcut = [0, 0]
-dupes = [0]
-
 
 class _BuilderRow(object):
     """The stored state accumulated while writing out a row in the index.
@@ -622,7 +615,7 @@
             found[node_pos] = node
         return found
 
-    def _get_nodes(self, cache, node_indexes, counter):
+    def _get_nodes(self, cache, node_indexes):
         found = {}
         needed = []
         for idx in node_indexes:
@@ -631,10 +624,8 @@
                 continue
             try:
                 found[idx] = cache[idx]
-                counter[0] += 1
             except KeyError:
                 needed.append(idx)
-                counter[1] += 1
         found.update(self._cache_nodes(needed, cache))
         return found
 
@@ -643,13 +634,11 @@
 
         After getting it, the node will be cached.
         """
-        return self._get_nodes(self._internal_node_cache, node_indexes,
-                               internal_node_hits)
+        return self._get_nodes(self._internal_node_cache, node_indexes)
 
     def _get_leaf_nodes(self, node_indexes):
         """Get a bunch of nodes, from cache or disk."""
-        found = self._get_nodes(self._leaf_node_cache, node_indexes,
-                                leaf_node_hits)
+        found = self._get_nodes(self._leaf_node_cache, node_indexes)
         if self._leaf_value_cache is not None:
             for node in found.itervalues():
                 for key, value in node.keys.iteritems():
@@ -715,17 +704,13 @@
         # iter_steps = len(in_keys) + len(fixed_keys)
         # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
         if len(in_keys) == 1: # Bisect will always be faster for M = 1
-            bisect_shortcut[0] += 1
             return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
         # elif bisect_steps < iter_steps:
-        #     bisect_shortcut[0] += len(in_keys)
         #     offsets = {}
         #     for key in in_keys:
         #         offsets.setdefault(bisect_right(fixed_keys, key),
         #                            []).append(key)
         #     return [(o, offsets[o]) for o in sorted(offsets)]
-        else:
-            bisect_shortcut[1] += len(in_keys)
         in_keys_iter = iter(in_keys)
         fixed_keys_iter = enumerate(fixed_keys)
         cur_in_key = in_keys_iter.next()
@@ -794,7 +779,6 @@
         if not keys:
             return
 
-        global leaf_value_hits, miss_attempts, dupes
         if not self.key_count():
             return
 
@@ -805,7 +789,6 @@
             for key in keys:
                 value = self._leaf_value_cache.get(key, None)
                 if value is not None:
-                    leaf_value_hits[0] += 1
                     # Cached value: this key is known to be here, yield it
                     # directly without touching disk
                     value, refs = value
                     if self.node_ref_lists:
@@ -813,7 +796,6 @@
                     else:
                         yield (self, key, value)
                 else:
-                    leaf_value_hits[1] += 1
                     needed_keys.append(key)
 
         last_key = None
@@ -857,8 +839,6 @@
                         yield (self, next_sub_key, value, refs)
                     else:
                         yield (self, next_sub_key, value)
-                else:
-                    miss_attempts += 1
 
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.

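The commented-out cost model in the bisect-shortcut hunk above (iter_steps
versus bisect_steps) is easy to sanity-check. A rough sketch using the same
two formulas quoted in the comments; the function names are illustrative
only::

    import math

    def iter_steps(m, n):
        # one linear merge pass over both sorted key sequences
        return m + n

    def bisect_steps(m, n):
        # one bisect_right per lookup key
        return m * math.log(n, 2)

    # For a single key against a 100-entry node, bisect wins easily:
    #   iter_steps(1, 100) == 101     bisect_steps(1, 100) ~= 6.6
    # With many lookup keys the linear merge catches up and passes it:
    #   iter_steps(50, 100) == 150    bisect_steps(50, 100) ~= 332
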
=== modified file 'bzrlib/chunk_writer.py'
--- a/bzrlib/chunk_writer.py	2008-08-26 00:42:07 +0000
+++ b/bzrlib/chunk_writer.py	2008-08-28 01:59:58 +0000
@@ -20,10 +20,6 @@
 import zlib
 from zlib import Z_FINISH, Z_SYNC_FLUSH
 
-# [max_repack, buffer_full, repacks_with_space, min_compression,
-#  total_bytes_in, total_bytes_out, avg_comp,
-#  bytes_autopack, bytes_sync_packed, num_full_by_zsync]
-_stats = [0, 0, 0, 999, 0, 0, 0, 0, 0, 0]
 
 class ChunkWriter(object):
     """ChunkWriter allows writing of compressed data with a fixed size.
@@ -40,54 +36,57 @@
         will sometimes start over and compress the whole list to get tighter
         packing. We get diminishing returns after a while, so this limits the
         number of times we will try.
-        In testing, some values for bzr.dev::
-
-            repack  time  MB   max   full
-             1       7.5  4.6  1140  0
-             2       8.4  4.2  1036  1          6.8
-             3       9.8  4.1  1012  278
-             4      10.8  4.1  728   945
-            20      11.1  4.1  0     1012
-
-            repack = 0
-            zsync   time  MB    repack  max_z   time w/ add_node
-             0       6.7  24.7  0       6270    5.0
-             1       6.5  13.2  0       3342    4.3
-             2       6.6   9.6  0       2414    4.9
-             5       6.5   6.2  0       1549    4.8
-             6       6.5   5.8  1       1435    4.8
-             7       6.6   5.5  19      1337    4.8
-             8       6.7   5.3  81      1220    4.4
-            10       6.8   5.0  260     967     5.3
-            11       6.8   4.9  366     839     5.3
-            12       6.9   4.8  454     731     5.1
-            15       7.2   4.7  704     450     5.8
-            20       7.7   4.6  1133    7       5.8
-
-        In testing, some values for mysql-unpacked::
-
-                    next_bytes estim
-            repack  time  MB    hit_max full
-             1      51.7  15.4  3913  0
-             2      54.4  13.7  3467  0         35.4
-            20      67.0  13.4  0     3380      46.7
-
-            repack=0
-            zsync                               time w/ add_node
-             0      47.7 116.5  0       29782   29.5
-             1      48.5  60.2  0       15356   27.8
-             2      48.1  42.4  0       10822   27.8
-             5      48.3  25.5  0       6491    26.8
-             6      48.0  23.2  13      5896    27.3
-             7      48.1  21.6  29      5451    27.5
-             8      48.1  20.3  52      5108    27.1
-            10      46.9  18.6  195     4526    29.4
-            11      48.8  18.0  421     4143    29.2
-            12      47.4  17.5  702     3738    28.0
-            15      49.6  16.5  1223    2969    28.9
-            20      48.9  15.7  2182    1810    29.6
-            30            15.4  3891    23      31.4
+        The default is to try to avoid recompressing entirely, but setting this
+        to something like 20 will give maximum compression.
+
+    :cvar _max_zsync: Another tunable knob. If _max_repack is set to 0, this
+        limits the number of times we will try to pack more data into a
+        node. It allows a single compression pass, rather than trying until
+        we overflow and then recompressing again.
     """
+    #    In testing, some values for bzr.dev::
+    #        repack  time  MB   max   full
+    #         1       7.5  4.6  1140  0
+    #         2       8.4  4.2  1036  1          6.8
+    #         3       9.8  4.1  1012  278
+    #         4      10.8  4.1  728   945
+    #        20      11.1  4.1  0     1012
+    #        repack = 0
+    #        zsync   time  MB    repack  max_z   time w/ add_node
+    #         0       6.7  24.7  0       6270    5.0
+    #         1       6.5  13.2  0       3342    4.3
+    #         2       6.6   9.6  0       2414    4.9
+    #         5       6.5   6.2  0       1549    4.8
+    #         6       6.5   5.8  1       1435    4.8
+    #         7       6.6   5.5  19      1337    4.8
+    #         8       6.7   5.3  81      1220    4.4
+    #        10       6.8   5.0  260     967     5.3
+    #        11       6.8   4.9  366     839     5.3
+    #        12       6.9   4.8  454     731     5.1
+    #        15       7.2   4.7  704     450     5.8
+    #        20       7.7   4.6  1133    7       5.8
+
+    #    In testing, some values for mysql-unpacked::
+    #                next_bytes estim
+    #        repack  time  MB    hit_max full
+    #         1      51.7  15.4  3913  0
+    #         2      54.4  13.7  3467  0         35.4
+    #        20      67.0  13.4  0     3380      46.7
+    #        repack=0
+    #        zsync                               time w/ add_node
+    #         0      47.7 116.5  0       29782   29.5
+    #         1      48.5  60.2  0       15356   27.8
+    #         2      48.1  42.4  0       10822   27.8
+    #         5      48.3  25.5  0       6491    26.8
+    #         6      48.0  23.2  13      5896    27.3
+    #         7      48.1  21.6  29      5451    27.5
+    #         8      48.1  20.3  52      5108    27.1
+    #        10      46.9  18.6  195     4526    29.4
+    #        11      48.8  18.0  421     4143    29.2
+    #        12      47.4  17.5  702     3738    28.0
+    #        15      49.6  16.5  1223    2969    28.9
+    #        20      48.9  15.7  2182    1810    29.6
+    #        30            15.4  3891    23      31.4
 
     _max_repack = 0
     _max_zsync = 8
@@ -125,16 +124,6 @@
         out = self.compressor.flush(Z_FINISH)
         self.bytes_list.append(out)
         self.bytes_out_len += len(out)
-        if self.num_repack > 0 and self.bytes_out_len > 0:
-            comp = float(self.seen_bytes) / self.bytes_out_len
-            if comp < _stats[3]:
-                _stats[3] = comp
-        _stats[4] += self.seen_bytes
-        _stats[5] += self.bytes_out_len
-        _stats[6] = float(_stats[4]) / _stats[5]
-
-        if self._max_repack == 0 and self.num_repack == 1:
-            _stats[9] += 1
 
         if self.bytes_out_len > self.chunk_size:
             raise AssertionError('Somehow we ended up with too much'
@@ -204,10 +193,8 @@
             self.bytes_in.append(bytes)
             self.seen_bytes += len(bytes)
             self.unflushed_in_bytes += len(bytes)
-            _stats[7] += 1 # len(bytes)
         else:
             # This may or may not fit, try to add it with Z_SYNC_FLUSH
-            _stats[8] += 1 # len(bytes)
             # Note: It is tempting to do this as a look-ahead pass, and to
             # 'copy()' the compressor before flushing. However, it seems that
             # 'flush()' is when the compressor actually does most work
@@ -248,11 +235,9 @@
                     # When we get *to* _max_repack, bump over so that the
                     # earlier > _max_repack will be triggered.
                     self.num_repack += 1
-                    _stats[0] += 1
                 if this_len + 10 > capacity:
                     (bytes_out, this_len,
                      compressor) = self._recompress_all_bytes_in()
-                    _stats[1] += 1
                     self.compressor = compressor
                     # Force us to not allow more data
                     self.num_repack = self._max_repack + 1
@@ -264,7 +249,6 @@
                     # This fits when we pack it tighter, so use the new packing
                     # There is one Z_SYNC_FLUSH call in
                     # _recompress_all_bytes_in
-                    _stats[2] += 1
                     self.compressor = compressor
                     self.bytes_in.append(bytes)
                     self.bytes_list = bytes_out

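The _max_repack and _max_zsync class attributes in the chunk_writer diff
bound how often the writer re-feeds zlib before giving up on a chunk:
_max_repack limits full recompressions of everything buffered, while
_max_zsync (with _max_repack = 0) roughly limits how many Z_SYNC_FLUSH
attempts are made to squeeze in more data. A simplified sketch of that
packing idea using only standard zlib calls; the function name, the flat
10-byte slack, and the single rebuild-on-overflow are illustrative, not the
actual ChunkWriter implementation::

    import zlib
    from zlib import Z_FINISH, Z_SYNC_FLUSH

    def pack_into_chunk(blocks, chunk_size, max_zsync=8):
        """Greedily pack byte blocks into one fixed-size compressed chunk."""
        compressor = zlib.compressobj()
        out = []
        out_len = 0
        accepted = []
        num_zsync = 0
        for block in blocks:
            if num_zsync >= max_zsync:
                # Single-pass mode: stop trying to squeeze more data in.
                break
            data = compressor.compress(block) + compressor.flush(Z_SYNC_FLUSH)
            num_zsync += 1
            if out_len + len(data) + 10 > chunk_size:
                # Overflow: drop this block and rebuild a valid stream from
                # the blocks that did fit (one recompression pass).
                compressor = zlib.compressobj()
                data = compressor.compress(b''.join(accepted))
                data += compressor.flush(Z_SYNC_FLUSH)
                out = [data]
                out_len = len(data)
                break
            out.append(data)
            out_len += len(data)
            accepted.append(block)
        out.append(compressor.flush(Z_FINISH))
        return b''.join(out), accepted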

