Rev 41: Switch back to iter_entries rather than _iter_sorted_entries. in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 16 14:20:32 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 41
revision-id: john at arbash-meinel.com-20080716132023-zrufbn0leu228waw
parent: john at arbash-meinel.com-20080715231347-brpla88u0zdr0v9g
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-16 08:20:23 -0500
message:
Switch back to iter_entries rather than _iter_sorted_entries.
It means we don't have to merge-sort the results, with all the step() overhead that entails.
It means we can filter using the bloom *before* we have to sort the nodes.
It paves the way for not sorting at all.
It paves the way for optimizing the __contains__ check so that it only computes
the key hashes when they are actually needed.
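
Roughly, the difference between the two styles looks like this (a minimal
illustrative sketch, not the plugin's actual code; 'lookup' stands in for a
leaf node's keys dict and 'bloom' for a BloomMurmur filter):

    def old_style(keys, lookup):
        # _iter_sorted_entries style: sort first, then emit a (found, info)
        # pair for *every* key, hits and misses interleaved in sorted order.
        # In the real code, keys the bloom had filtered out also had to be
        # merged back into this stream via a step() helper, one call per key.
        for key in sorted(keys):
            if key in lookup:
                yield True, (key, lookup[key])
            else:
                yield False, key

    def old_consumer(keys, lookup):
        # iter_entries then stepped through the stream, discarding misses.
        for found, info in old_style(keys, lookup):
            if not found:
                continue
            yield info

    def new_style(keys, lookup, bloom=None):
        # New iter_entries: filter with the bloom *before* sorting, then
        # yield only the hits -- no miss stream to merge or discard.
        if bloom is not None:
            keys = [k for k in keys if '\x00'.join(k) in bloom]
        for key in sorted(keys):
            if key in lookup:
                yield key, lookup[key]

    # e.g. with lookup = {('rev-1',): 'value-1'}:
    # list(new_style([('rev-1',), ('rev-2',)], lookup))
    # -> [(('rev-1',), 'value-1')]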
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-15 23:13:47 +0000
+++ b/btree_index.py 2008-07-16 13:20:23 +0000
@@ -32,7 +32,7 @@
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from bzrlib.osutils import basename, dirname
from bzrlib.plugins.index2 import errors, chunk_writer
-from bzrlib.plugins.pybloom.pybloom import BloomMurmur, _murmur_hash
+from bzrlib.plugins.pybloom import pybloom
from bzrlib.transport import get_transport
@@ -63,7 +63,7 @@
dupes = [0]
-class NodeBloom(BloomMurmur):
+class NodeBloom(pybloom.BloomMurmur):
"""Subclass of BloomMurmur that can be read from a file."""
def __init__(self, bytes):
@@ -80,8 +80,8 @@
def get_raw_offsets(keys):
"""Compute the raw internal offsets for these keys"""
offsets = {}
- hash = _murmur_hash
- seeds = BloomMurmur._predefined_seeds[:BloomMurmur.num_offsets]
+ hash = pybloom._murmur_hash
+ seeds = pybloom.BloomMurmur._predefined_seeds[:pybloom.BloomMurmur.num_offsets]
for key in keys:
text = '\x00'.join(key)
offsets[key] = tuple([int(hash(text, seed)) for seed in seeds])
@@ -312,7 +312,7 @@
# and must always be 1 (if there are any nodes in the tree).
self.row_lengths = []
if BTreeGraphIndex._default_use_blooms:
- global_bloom = BloomMurmur(_GLOBAL_BLOOM_START_SIZE)
+ global_bloom = pybloom.BloomMurmur(_GLOBAL_BLOOM_START_SIZE)
else:
global_bloom = None
def add_key(string_key, key_line):
@@ -345,7 +345,7 @@
str(rows[pos + 1].nodes) + "\n")
if create_bloom:
# new bloom for the new node
- internal_row.bloom = BloomMurmur(_BLOOM_BUILD_SIZE)
+ internal_row.bloom = pybloom.BloomMurmur(_BLOOM_BUILD_SIZE)
else:
internal_row.bloom = None
# add a new leaf
@@ -375,7 +375,7 @@
# We need a new row
if 'index' in debug.debug_flags:
trace.mutter('Inserting new global row.')
- if len(rows) == 1:
+ if len(rows) == 1 and BTreeGraphIndex._use_node_blooms:
# We have only leaf nodes, so we are creating a row
# where we will hold bloom filters
# reserve 256 for the bloom + 10 for ':bloom:\n'
@@ -679,6 +679,7 @@
"""
_default_use_blooms = True
+ _use_node_blooms = False
def __init__(self, transport, name, size):
"""Create a B+Tree index object on the index name.
@@ -940,32 +941,6 @@
if not self.key_count():
return
- # 6 seconds spent in miss_torture using the sorted() line.
- # Even with out of order disk IO it seems faster not to sort it when
- # large queries are being made.
- needed_keys = sorted(keys)
- bloom_raw_offsets = {}
- if self._use_blooms and self._get_global_bloom() is not None:
- bloom_raw_offsets = NodeBloom.get_raw_offsets(needed_keys)
- for found, info in self._iter_sorted_entries(needed_keys,
- bloom_raw_offsets,
- {}):
- if not found:
- continue
- yield info
-
- def _iter_sorted_entries(self, keys, bloom_raw_offsets, object_dedup_cache):
- """Iterate over keys within the index.
-
- :param keys: A list of sorted keys to search for
- :param bloom_raw_offsets: A dict mapping key => bloom raw offsets
- :param object_dedup_cache: A dict to use to save objects like strings
- and tuples, so that we don't consume a lot of redundant memory.
- :return: An iterable which either returns (False, key) if the key is
- not present, or (True, (key, value,[refs])) if it is present (same
- info as iter_all_entries)
- The keys should be returned in sorted order
- """
global leaf_value_hits, miss_attempts, bloom_hits, dupes
if not self.key_count():
return
@@ -987,8 +962,8 @@
else:
leaf_value_hits[1] += 1
needed_keys.append(key)
+
last_key = None
- missing_keys = []
if self._use_blooms:
global_bloom = self._get_global_bloom()
if global_bloom is not None:
@@ -998,14 +973,20 @@
dupes[0] += 1
continue
last_key = key
- if global_bloom.contains_raw_offset(bloom_raw_offsets[key]):
+ if '\x00'.join(key) in global_bloom:
remaining_keys.append(key)
else:
- missing_keys.append(key)
- bloom_hits[0] += len(missing_keys)
+ bloom_hits[0] += 1
needed_keys = remaining_keys
else:
needed_keys = keys
+ # 6 seconds spent in miss_torture using the sorted() line.
+ # Even with out of order disk IO it seems faster not to sort it when
+ # large queries are being made.
+ if not needed_keys:
+ return
+ needed_keys = sorted(needed_keys)
+
nodes_and_keys = [(0, needed_keys)]
for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
@@ -1033,45 +1014,23 @@
# We should now be at the _LeafNodes
node_indexes = [idx for idx, s_keys in nodes_and_keys]
- def step(iterator):
- try:
- return iterator.next()
- except StopIteration:
- return None
# TODO: We may *not* want to always read all the nodes in one
# big go. Consider setting a max size on this.
- missing_key_iter = iter(missing_keys)
- next_missing_key = step(missing_key_iter)
nodes = self._get_leaf_nodes(node_indexes)
for node_index, sub_keys in nodes_and_keys:
if not sub_keys:
continue
node = nodes[node_index]
- sub_key_iter = iter(sub_keys)
- next_sub_key = step(sub_key_iter)
- # Output all of the nodes in sorted order, including the missing
- # ones
- while next_sub_key is not None:
- if next_missing_key is not None:
- if next_missing_key < next_sub_key:
- yield False, next_missing_key
- next_missing_key = step(missing_key_iter)
- continue
- # either next_missing_key is None, or it comes after sub_key
+ for next_sub_key in sub_keys:
if next_sub_key in node.keys:
value, refs = node.keys[next_sub_key]
if self.node_ref_lists:
- yield True, (self, next_sub_key, value, refs)
+ yield (self, next_sub_key, value, refs)
else:
- yield True, (self, next_sub_key, value)
+ yield (self, next_sub_key, value)
else:
- yield False, next_sub_key
miss_attempts += 1
- next_sub_key = step(sub_key_iter)
- while next_missing_key is not None:
- yield False, next_missing_key
- next_missing_key = step(missing_key_iter)
def iter_entries_prefix(self, keys):
"""Iterate over keys within the index using prefix matching.
=== modified file 'indexbench.py'
--- a/indexbench.py 2008-07-14 13:08:56 +0000
+++ b/indexbench.py 2008-07-16 13:20:23 +0000
@@ -132,8 +132,9 @@
# each time. This tests the minimum time to get to an average key from 'start'.
# On GraphIndex this should be much slower than on BTree index.
if 'random_reload' in fixtures:
- run(label, 'iter_random_reload', iter_random_one, 'iter_random_reload',
- label, drop_cache, names, target, factory, name_keys, True)
+ run(label, 'iter_random_reload', iter_random_one, label,
+ 'iter_random_reload', drop_cache, names, target, factory,
+ name_keys, True)
# text extraction simulation (follow a compression graph) for text_keys
if 'text' in fixtures:
text_names = [name for name in names if name.endswith('.tix')]
@@ -308,6 +309,7 @@
test_transport = get_transport(test_area)
if btree:
+ print 'Using blooms: %s' % (BTreeGraphIndex._default_use_blooms,)
self.test_class(names, source, BTreeGraphIndex, BTreeBuilder,
'BTreeIndex', name_keys, text_keys, drop_cache, fixture, lspro,
tip_revision_id, ancestry, test_transport)
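
For reference, the pre-filter flow the btree_index.py hunks introduce boils
down to something like this (a minimal sketch with hypothetical names; it
assumes a bloom object supporting 'in' on '\x00'-joined keys, which is the
check the patch uses, and it omits the hit/miss/dupe counters):

    def prefilter_keys(keys, global_bloom):
        remaining = []
        last_key = None
        for key in keys:
            if key == last_key:
                # cheap adjacent-duplicate skip, as in the dupes check above
                continue
            last_key = key
            if global_bloom is None or '\x00'.join(key) in global_bloom:
                remaining.append(key)
        # Sort only the keys that survived the bloom filter; when many of
        # the requested keys are absent this is far fewer than the original
        # request, and the misses never trigger any leaf reads at all.
        return sorted(remaining)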