Rev 30: temporary work to revert back to key-at-a-time to test performance in http://bzr.arbash-meinel.com/plugins/index2

Wed Jul 2 22:00:52 BST 2008

At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 30
revision-id: john at arbash-meinel.com-20080702210046-j00t51pj8kod09hz
parent: john at arbash-meinel.com-20080702204409-s512bff7tus0cb8n
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 16:00:46 -0500
message:
  temporary work to revert back to key-at-a-time to test performance
-------------- next part --------------
=== modified file 'btree_index.py'

--- a/btree_index.py	2008-07-02 20:44:09 +0000
+++ b/btree_index.py	2008-07-02 21:00:46 +0000
@@ -607,66 +607,33 @@
             return
         if not self.key_count():
             return
-        # Find all the keys that are already in our value cache
-        global leaf_value_hits, miss_attempts
-        needed_keys = []
-        if self._leaf_value_cache is None:
-            needed_keys = keys
-        else:
-            for key in keys:
-                value = self._leaf_value_cache.get(key, None)
-                if value is not None:
-                    leaf_value_hits[0] += 1
-                    # This key is known not to be here, skip it
-                    value, refs = value
-                    if self.node_ref_lists:
-                        yield (self, key, value, refs)
-                    else:
-                        yield (self, key, value)
-                else:
-                    leaf_value_hits[1] += 1
-                    needed_keys.append(key)
-        needed_keys = sorted(needed_keys)
-
-        nodes_and_keys = [(0, needed_keys)]
-        row_offset = 0
-
-        if self._use_blooms:
-            global bloom_hits
-            if len(self._row_lengths) > 1:
-                # no internal nodes, and thus no blooms in a length-1 b+tree.
-                bloom_raw_offsets = NodeBloom.get_raw_offsets(needed_keys)
-        for row_length in self._row_lengths[:-1]:
-            node_indexes = [idx for idx, s_keys in nodes_and_keys]
-            nodes = self._get_internal_nodes(node_indexes)
-
-            row_offset += row_length
-
-            next_nodes_and_keys = []
-            for node_index, sub_keys in nodes_and_keys:
-                node = nodes[node_index]
+        global bloom_hits, miss_attempts
+        for key in keys:
+            # what row are we looking at
+            node_index = 0
+            row = 0
+            # how many nodes have the previous rows eaten up
+            offset = self._row_lengths[row]
+            string_key = '\x00'.join(key)
+            for row_length in self._row_lengths[:-1]:
+                node = self._get_internal_nodes([node_index])[node_index]
                 if self._use_blooms and node.bloom is not None:
-                    # If using blooms, filter out any nodes that don't hit.
-                    remaining_sub_keys = [sub_key for sub_key in sub_keys
-                        if node.bloom.contains_raw_offset(bloom_raw_offsets[sub_key])]
-                    bloom_hits += len(sub_keys) - len(remaining_sub_keys)
-                    sub_keys = remaining_sub_keys
-                    # For misses at this level, do we want to cache that fact?
-                positions = self._multi_bisect_right(sub_keys, node.keys)
-                node_offset = row_offset + node.offset
-                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
-                                           for pos, s_keys in positions])
-            nodes_and_keys = next_nodes_and_keys
-        # We should now be at the _LeafNodes
-        node_indexes = [idx for idx, s_keys in nodes_and_keys]
-        # This bypasses the cache, but only for the leaf nodes
-        # We do it right now, because otherwise we swap the cache,
-        # alternatively, we may *not* want to always read all the nodes in one
-        # big go. Consider setting a max size on this.
-        nodes = self._get_leaf_nodes(node_indexes)
-        for node_index, sub_keys in nodes_and_keys:
-            node = nodes[node_index]
-            for key in sub_keys:
+                    if string_key not in node.bloom:
+                        bloom_hits += 1
+                        node_index = None
+                        break
+                pos = bisect_right(node.keys, key)
+                # There are len(node.keys) pointers dividing len(node.keys) + 1
+                # child nodes. The first pointer is the first key in the second
+                # child node, and so on.
+                # pos indexes into the list of child nodes, any key equal to a
+                # key in node.keys indicates that the key is present to the right
+                # of that divider.
+                node_index = offset + node.offset + pos
+                row += 1
+                offset = offset + self._row_lengths[row]
+            if node_index is not None:
+                node = self._get_leaf_nodes([node_index])[node_index]
                 if key in node.keys:
                     value, refs = node.keys[key]
                     if self.node_ref_lists:

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-02 18:26:56 +0000
+++ b/tests/test_btree_index.py	2008-07-02 21:00:46 +0000
@@ -17,6 +17,7 @@
 
 """Tests for btree indices."""
 
+import pprint
 import random
 import zlib