Rev 27: More instrumenting in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 2 21:17:08 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 27
revision-id: john at arbash-meinel.com-20080702201701-f40mgcxxf229qhei
parent: john at arbash-meinel.com-20080702195000-hgd1exet98nfxt7e
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 15:17:01 -0500
message:
  More instrumenting
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-02 19:50:00 +0000
+++ b/btree_index.py	2008-07-02 20:17:01 +0000
@@ -41,9 +41,11 @@
 _PAGE_SIZE = 4096
 
 bloom_hits = 0
-leaf_value_hits = 0
-missing_hits = 0 # Found this key cached as missing 
+leaf_value_hits = [0, 0]
+internal_node_hits = [0, 0]
+leaf_node_hits = [0, 0]
 miss_attempts = 0  # Missed this entry while looking up
+bisect_shortcut = [0, 0]
 
 
 class NodeBloom(BloomSHA1):
@@ -359,9 +361,6 @@
         return nodes, bloom
 
 
-_key_not_present = object()
-
-
 class BTreeGraphIndex(object):
     """Access to nodes via the standard GraphIndex interface for B+Tree's.
     
@@ -369,7 +368,7 @@
     memory except when very large walks are done.
     """
 
-    _default_use_blooms = True
+    _default_use_blooms = False
 
     def __init__(self, transport, name, size=None):
         """Create a B+Tree index object on the index name.
@@ -388,7 +387,7 @@
         self._page_size = transport.recommended_page_size()
         self._root_node = None
         # Default max size is 100,000 leaf values
-        self._leaf_value_cache = lru_cache.LRUCache(1000*1000)
+        self._leaf_value_cache = lru_cache.LRUCache(100*1000)
         self._leaf_node_cache = lru_cache.LRUCache()
         self._internal_node_cache = lru_cache.LRUCache()
         self._key_count = None
@@ -438,7 +437,7 @@
             found[node_pos] = node
         return found
 
-    def _get_nodes(self, cache, node_indexes):
+    def _get_nodes(self, cache, node_indexes, counter):
         found = {}
         needed = []
         for idx in node_indexes:
@@ -447,8 +446,10 @@
                 continue
             try:
                 found[idx] = cache[idx]
+                counter[0] += 1
             except KeyError:
                 needed.append(idx)
+                counter[1] += 1
         found.update(self._cache_nodes(needed, cache))
         return found
 
@@ -457,17 +458,20 @@
 
         After getting it, the node will be cached.
         """
-        return self._get_nodes(self._internal_node_cache, node_indexes)
+        return self._get_nodes(self._internal_node_cache, node_indexes,
+                               internal_node_hits)
 
     def _get_leaf_nodes(self, node_indexes):
         """Get a bunch of nodes, from cache or disk."""
-        # nodes = self._get_nodes(self._leaf_node_cache, node_indexes)
-        found = {}
-        for node_pos, node in self._read_nodes(sorted(node_indexes)):
-            found[node_pos] = node
+        found = self._get_nodes(self._leaf_node_cache, node_indexes,
+                                leaf_node_hits)
+        for node in found.itervalues():
             for key, value in node.keys.iteritems():
-                if key not in self._leaf_value_cache:
-                    self._leaf_value_cache[key] = value
+                if key in self._leaf_value_cache:
+                    # We've seen this node before; don't re-add the rest
+                    # of its keys.
+                    break
+                self._leaf_value_cache[key] = value
         return found
 
     def iter_all_entries(self):
@@ -522,7 +526,10 @@
         # if (len(in_keys) + len(fixed_keys)
         #     > len(in_keys) * math.log(len(fixed_keys), 2)):
         if len(in_keys) == 1: # Bisect will always be faster for M = 1
+            bisect_shortcut[0] += 1
             return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
+        else:
+            bisect_shortcut[1] += len(in_keys)
         in_keys_iter = iter(in_keys)
         fixed_keys_iter = enumerate(fixed_keys)
         cur_in_key = in_keys_iter.next()
@@ -587,22 +594,20 @@
         if not self.key_count():
             return
         # Find all the keys that are already in our value cache
-        global leaf_value_hits, missing_hits, miss_attempts
+        global leaf_value_hits, miss_attempts
         needed_keys = []
         for key in keys:
             value = self._leaf_value_cache.get(key, None)
             if value is not None:
-                if value is not _key_not_present:
-                    leaf_value_hits += 1
-                    # This key is known not to be here, skip it
-                    value, refs = value
-                    if self.node_ref_lists:
-                        yield (self, key, value, refs)
-                    else:
-                        yield (self, key, value)
+                leaf_value_hits[0] += 1
+                # The value cache already has this key; yield it directly
+                value, refs = value
+                if self.node_ref_lists:
+                    yield (self, key, value, refs)
                 else:
-                    missing_hits += 1
+                    yield (self, key, value)
             else:
+                leaf_value_hits[1] += 1
                 needed_keys.append(key)
         needed_keys = sorted(needed_keys)
 
@@ -653,7 +658,6 @@
                         yield (self, key, value)
                 else:
                     miss_attempts += 1
-                    self._leaf_value_cache[key] = _key_not_present
 
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.

=== modified file 'indexbench.py'
--- a/indexbench.py	2008-07-02 19:50:00 +0000
+++ b/indexbench.py	2008-07-02 20:17:01 +0000
@@ -54,16 +54,26 @@
 
 def reset_hit_counts():
     btree_index.bloom_hits = 0
-    btree_index.missing_hits = 0
-    btree_index.leaf_value_hits = 0
+    btree_index.leaf_value_hits[:] = [0, 0]
     btree_index.miss_attempts = 0
+    btree_index.internal_node_hits[:] = [0, 0]
+    btree_index.leaf_node_hits[:] = [0, 0]
+    btree_index.bisect_shortcut[:] = [0, 0]
 
 def hits():
-    return "bloom(%d) missing(%d) leaf_value(%d) miss_attempts(%d)" % (
+    return ("\n    bloom(%d) leaf_value(%d,%d) miss(%d), internal(%d,%d),"
+            "\n    leaf_node(%d,%d) bisect(%d,%d)" % (
         btree_index.bloom_hits,
-        btree_index.missing_hits,
-        btree_index.leaf_value_hits,
-        btree_index.miss_attempts)
+        btree_index.leaf_value_hits[0],
+        btree_index.leaf_value_hits[1],
+        btree_index.miss_attempts,
+        btree_index.internal_node_hits[0],
+        btree_index.internal_node_hits[1],
+        btree_index.leaf_node_hits[0],
+        btree_index.leaf_node_hits[1],
+        btree_index.bisect_shortcut[0],
+        btree_index.bisect_shortcut[1],
+        ))
 
 def time_index(names, source, factory, builder, target, label, name_keys,
     text_keys, use_drop_cache, fixtures, lsprof, tip_revision_id, ancestry):
@@ -104,6 +114,7 @@
 # random shuffle, all keys (name_keys comes preshuffled)
     if 'shuffle' in fixtures:
         drop_cache()
+        reset_hit_counts()
         now = time.time()
         for name, index in iter_indices(names, target, factory):
             order = name_keys[name]
@@ -111,7 +122,7 @@
             for key in order:
                 index.iter_entries([key]).next()
         finish = time.time()
-        print "%s: iter_random_one in %0.3f" % (label, finish - now)
+        print "%s: iter_random_one in %0.3f,%s" % (label, finish - now, hits())
 # text extraction simulation (follow a compression graph) for text_keys
     if 'text' in fixtures:
         text_names = [name for name in names if name.endswith('.tix')]
@@ -190,7 +201,7 @@
             for node in index.iter_entries(other_keys):
                 pass
         finish = time.time()
-        print "%s: miss torture in %0.3f, %s" % (label, finish - now, hits)
+        print "%s: miss torture in %0.3f, %s" % (label, finish - now, hits())
 
 
 class cmd_indexbench(Command):
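
A note on why the counters became two-element lists rather than plain ints:
a list can be handed to a shared helper such as _get_nodes and bumped in
place through that reference, and reset_hit_counts can zero it with slice
assignment without rebinding the module-level name that indexbench already
imported. A minimal sketch of the pattern, with illustrative names rather
than the plugin's own:

# Module-level counter: [cache_hits, cache_misses].  A list (unlike an int)
# can be mutated in place through any reference to it.
node_hits = [0, 0]

def lookup(cache, keys, counter):
    """Split keys into cached and still-needed, updating counter in place."""
    found, needed = {}, []
    for key in keys:
        try:
            found[key] = cache[key]
            counter[0] += 1  # served from cache
        except KeyError:
            needed.append(key)
            counter[1] += 1  # will need a read
    return found, needed

def reset_counters():
    # Slice assignment refills the same list object, so code that imported
    # node_hits earlier still sees the zeroed values.
    node_hits[:] = [0, 0]

For example, lookup({'a': 1}, ['a', 'b'], node_hits) leaves node_hits at
[1, 1], and reset_counters() puts it back to [0, 0] before the next fixture.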


