Rev 31: Restore the multi-way bisecting in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 2 22:04:25 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 31
revision-id: john at arbash-meinel.com-20080702210419-4xfq1jb9k4cuksk1
parent: john at arbash-meinel.com-20080702210046-j00t51pj8kod09hz
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 16:04:19 -0500
message:
  Restore the multi-way bisecting
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-02 21:00:46 +0000
+++ b/btree_index.py	2008-07-02 21:04:19 +0000
@@ -607,33 +607,66 @@
             return
         if not self.key_count():
             return
-        global bloom_hits, miss_attempts
-        for key in keys:
-            # what row are we looking at
-            node_index = 0
-            row = 0
-            # how many nodes have the previous rows eaten up
-            offset = self._row_lengths[row]
-            string_key = '\x00'.join(key)
-            for row_length in self._row_lengths[:-1]:
-                node = self._get_internal_nodes([node_index])[node_index]
+        # Find all the keys that are already in our value cache
+        global leaf_value_hits, miss_attempts
+        needed_keys = []
+        if self._leaf_value_cache is None:
+            needed_keys = keys
+        else:
+            for key in keys:
+                value = self._leaf_value_cache.get(key, None)
+                if value is not None:
+                    leaf_value_hits[0] += 1
+                    # This key is known not to be here, skip it
+                    value, refs = value
+                    if self.node_ref_lists:
+                        yield (self, key, value, refs)
+                    else:
+                        yield (self, key, value)
+                else:
+                    leaf_value_hits[1] += 1
+                    needed_keys.append(key)
+        needed_keys = sorted(needed_keys)
+
+        nodes_and_keys = [(0, needed_keys)]
+        row_offset = 0
+
+        if self._use_blooms:
+            global bloom_hits
+            if len(self._row_lengths) > 1:
+                # no internal nodes, and thus no blooms in a length-1 b+tree.
+                bloom_raw_offsets = NodeBloom.get_raw_offsets(needed_keys)
+        for row_length in self._row_lengths[:-1]:
+            node_indexes = [idx for idx, s_keys in nodes_and_keys]
+            nodes = self._get_internal_nodes(node_indexes)
+
+            row_offset += row_length
+
+            next_nodes_and_keys = []
+            for node_index, sub_keys in nodes_and_keys:
+                node = nodes[node_index]
                 if self._use_blooms and node.bloom is not None:
-                    if string_key not in node.bloom:
-                        bloom_hits += 1
-                        node_index = None
-                        break
-                pos = bisect_right(node.keys, key)
-                # There are len(node.keys) pointers dividing len(node.keys) + 1
-                # child nodes. The first pointer is the first key in the second
-                # child node, and so on.
-                # pos indexes into the list of child nodes, any key equal to a
-                # key in node.keys indicates that the key is present to the right
-                # of that divider.
-                node_index = offset + node.offset + pos
-                row += 1
-                offset = offset + self._row_lengths[row]
-            if node_index is not None:
-                node = self._get_leaf_nodes([node_index])[node_index]
+                    # If using blooms, filter out any nodes that don't hit.
+                    remaining_sub_keys = [sub_key for sub_key in sub_keys
+                        if node.bloom.contains_raw_offset(bloom_raw_offsets[sub_key])]
+                    bloom_hits += len(sub_keys) - len(remaining_sub_keys)
+                    sub_keys = remaining_sub_keys
+                    # For misses at this level, do we want to cache that fact?
+                positions = self._multi_bisect_right(sub_keys, node.keys)
+                node_offset = row_offset + node.offset
+                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
+                                           for pos, s_keys in positions])
+            nodes_and_keys = next_nodes_and_keys
+        # We should now be at the _LeafNodes
+        node_indexes = [idx for idx, s_keys in nodes_and_keys]
+        # This bypasses the cache, but only for the leaf nodes
+        # We do it right now, because otherwise we swap the cache,
+        # alternatively, we may *not* want to always read all the nodes in one
+        # big go. Consider setting a max size on this.
+        nodes = self._get_leaf_nodes(node_indexes)
+        for node_index, sub_keys in nodes_and_keys:
+            node = nodes[node_index]
+            for key in sub_keys:
                 if key in node.keys:
                     value, refs = node.keys[key]
                     if self.node_ref_lists:



More information about the bazaar-commits mailing list