Rev 31: Restore the multi-way bisecting in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 2 22:04:25 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 31
revision-id: john at arbash-meinel.com-20080702210419-4xfq1jb9k4cuksk1
parent: john at arbash-meinel.com-20080702210046-j00t51pj8kod09hz
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 16:04:19 -0500
message:
Restore the multi-way bisecting
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-02 21:00:46 +0000
+++ b/btree_index.py 2008-07-02 21:04:19 +0000
@@ -607,33 +607,66 @@
return
if not self.key_count():
return
- global bloom_hits, miss_attempts
- for key in keys:
- # what row are we looking at
- node_index = 0
- row = 0
- # how many nodes have the previous rows eaten up
- offset = self._row_lengths[row]
- string_key = '\x00'.join(key)
- for row_length in self._row_lengths[:-1]:
- node = self._get_internal_nodes([node_index])[node_index]
+ # Find all the keys that are already in our value cache
+ global leaf_value_hits, miss_attempts
+ needed_keys = []
+ if self._leaf_value_cache is None:
+ needed_keys = keys
+ else:
+ for key in keys:
+ value = self._leaf_value_cache.get(key, None)
+ if value is not None:
+ leaf_value_hits[0] += 1
+ # This key is known not to be here, skip it
+ value, refs = value
+ if self.node_ref_lists:
+ yield (self, key, value, refs)
+ else:
+ yield (self, key, value)
+ else:
+ leaf_value_hits[1] += 1
+ needed_keys.append(key)
+ needed_keys = sorted(needed_keys)
+
+ nodes_and_keys = [(0, needed_keys)]
+ row_offset = 0
+
+ if self._use_blooms:
+ global bloom_hits
+ if len(self._row_lengths) > 1:
+ # no internal nodes, and thus no blooms in a length-1 b+tree.
+ bloom_raw_offsets = NodeBloom.get_raw_offsets(needed_keys)
+ for row_length in self._row_lengths[:-1]:
+ node_indexes = [idx for idx, s_keys in nodes_and_keys]
+ nodes = self._get_internal_nodes(node_indexes)
+
+ row_offset += row_length
+
+ next_nodes_and_keys = []
+ for node_index, sub_keys in nodes_and_keys:
+ node = nodes[node_index]
if self._use_blooms and node.bloom is not None:
- if string_key not in node.bloom:
- bloom_hits += 1
- node_index = None
- break
- pos = bisect_right(node.keys, key)
- # There are len(node.keys) pointers dividing len(node.keys) + 1
- # child nodes. The first pointer is the first key in the second
- # child node, and so on.
- # pos indexes into the list of child nodes, any key equal to a
- # key in node.keys indicates that the key is present to the right
- # of that divider.
- node_index = offset + node.offset + pos
- row += 1
- offset = offset + self._row_lengths[row]
- if node_index is not None:
- node = self._get_leaf_nodes([node_index])[node_index]
+ # If using blooms, filter out any nodes that don't hit.
+ remaining_sub_keys = [sub_key for sub_key in sub_keys
+ if node.bloom.contains_raw_offset(bloom_raw_offsets[sub_key])]
+ bloom_hits += len(sub_keys) - len(remaining_sub_keys)
+ sub_keys = remaining_sub_keys
+ # For misses at this level, do we want to cache that fact?
+ positions = self._multi_bisect_right(sub_keys, node.keys)
+ node_offset = row_offset + node.offset
+ next_nodes_and_keys.extend([(node_offset + pos, s_keys)
+ for pos, s_keys in positions])
+ nodes_and_keys = next_nodes_and_keys
+ # We should now be at the _LeafNodes
+ node_indexes = [idx for idx, s_keys in nodes_and_keys]
+ # This bypasses the cache, but only for the leaf nodes
+ # We do it right now, because otherwise we swap the cache,
+ # alternatively, we may *not* want to always read all the nodes in one
+ # big go. Consider setting a max size on this.
+ nodes = self._get_leaf_nodes(node_indexes)
+ for node_index, sub_keys in nodes_and_keys:
+ node = nodes[node_index]
+ for key in sub_keys:
if key in node.keys:
value, refs = node.keys[key]
if self.node_ref_lists:
More information about the bazaar-commits
mailing list