Rev 41: Switch back to iter_entries rather than _iter_sorted_entries. in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 16 14:20:32 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 41
revision-id: john at arbash-meinel.com-20080716132023-zrufbn0leu228waw
parent: john at arbash-meinel.com-20080715231347-brpla88u0zdr0v9g
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-16 08:20:23 -0500
message:
  Switch back to iter_entries rather than _iter_sorted_entries.
  
  It means we don't have to merge-sort the results, with all of the step() overhead.
  It means we can filter using the bloom *before* we have to sort the nodes.
  It paves the way for not sorting at all.
  It paves the way for optimizing the __contains__ check so that it only
  computes the hashes that are actually needed.
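
In rough terms, the new iter_entries flow is: dedupe, bloom-test, *then*
sort. Keys the bloom rejects never pay the sorting cost, and there is no
longer a stream of missing keys to merge back in. A minimal sketch of that
shape (not the plugin's actual code: index._leaf_for is a hypothetical
stand-in for the row-walking that maps a key to its leaf node, and
global_bloom is assumed to support "in" on '\x00'-joined keys, as the diff
below does):

    def iter_entries_sketch(index, keys, global_bloom):
        """Yield (index, key, value, refs) for each key present."""
        needed_keys = []
        seen = set()
        for key in keys:
            if key in seen:
                # duplicate request; the real code dedupes via sorted
                # adjacency plus a leaf value cache
                continue
            seen.add(key)
            # bloom test first: a negative answer is definitive, so the
            # key is dropped before we ever sort or read a leaf
            if (global_bloom is not None
                    and '\x00'.join(key) not in global_bloom):
                continue
            needed_keys.append(key)
        if not needed_keys:
            return
        # only the survivors are sorted, keeping leaf reads roughly in
        # on-disk order
        for key in sorted(needed_keys):
            node = index._leaf_for(key)  # hypothetical leaf lookup
            if key in node.keys:
                value, refs = node.keys[key]
                yield (index, key, value, refs)
            # misses are simply not yielded; no (False, key) merging

The last point, about __contains__, presumably means a membership test
that computes the hash offsets lazily and bails out on the first absent
bit, rather than precomputing every offset up front the way
get_raw_offsets() must. Again only a sketch, with bits standing for any
indexable bit sequence and hash_fn for a seeded hash such as _murmur_hash:

    class EarlyExitBloom(object):

        def __init__(self, bits, seeds, hash_fn):
            self._bits = bits    # e.g. a list of 0/1 flags
            self._seeds = seeds
            self._hash = hash_fn

        def __contains__(self, text):
            for seed in self._seeds:
                if not self._bits[self._hash(text, seed) % len(self._bits)]:
                    return False  # first miss decides; later hashes never run
            return True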
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-15 23:13:47 +0000
+++ b/btree_index.py	2008-07-16 13:20:23 +0000
@@ -32,7 +32,7 @@
 from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
 from bzrlib.osutils import basename, dirname
 from bzrlib.plugins.index2 import errors, chunk_writer
-from bzrlib.plugins.pybloom.pybloom import BloomMurmur, _murmur_hash
+from bzrlib.plugins.pybloom import pybloom
 from bzrlib.transport import get_transport
 
 
@@ -63,7 +63,7 @@
 dupes = [0]
 
 
-class NodeBloom(BloomMurmur):
+class NodeBloom(pybloom.BloomMurmur):
     """Subclass of BloomMurmur that can be read from a file."""
 
     def __init__(self, bytes):
@@ -80,8 +80,8 @@
     def get_raw_offsets(keys):
         """Compute the raw internal offsets for these keys"""
         offsets = {}
-        hash = _murmur_hash
-        seeds = BloomMurmur._predefined_seeds[:BloomMurmur.num_offsets]
+        hash = pybloom._murmur_hash
+        seeds = pybloom.BloomMurmur._predefined_seeds[:pybloom.BloomMurmur.num_offsets]
         for key in keys:
             text = '\x00'.join(key)
             offsets[key] = tuple([int(hash(text, seed)) for seed in seeds])
@@ -312,7 +312,7 @@
         # and must always be 1 (if there are any nodes in the tree).
         self.row_lengths = []
         if BTreeGraphIndex._default_use_blooms:
-            global_bloom = BloomMurmur(_GLOBAL_BLOOM_START_SIZE)
+            global_bloom = pybloom.BloomMurmur(_GLOBAL_BLOOM_START_SIZE)
         else:
             global_bloom = None
         def add_key(string_key, key_line):
@@ -345,7 +345,7 @@
                             str(rows[pos + 1].nodes) + "\n")
                         if create_bloom:
                             # new bloom for the new node
-                            internal_row.bloom = BloomMurmur(_BLOOM_BUILD_SIZE)
+                            internal_row.bloom = pybloom.BloomMurmur(_BLOOM_BUILD_SIZE)
                         else:
                             internal_row.bloom = None
                 # add a new leaf
@@ -375,7 +375,7 @@
                     # We need a new row
                     if 'index' in debug.debug_flags:
                         trace.mutter('Inserting new global row.')
-                    if len(rows) == 1:
+                    if len(rows) == 1 and BTreeGraphIndex._use_node_blooms:
                         # We have only leaf nodes, so we are creating a row
                         # where we will hold bloom filters
                         # reserve 256 for the bloom + 10 for ':bloom:\n'
@@ -679,6 +679,7 @@
     """
 
     _default_use_blooms = True
+    _use_node_blooms = False
 
     def __init__(self, transport, name, size):
         """Create a B+Tree index object on the index name.
@@ -940,32 +941,6 @@
         if not self.key_count():
             return
 
-        # 6 seconds spent in miss_torture using the sorted() line.
-        # Even with out of order disk IO it seems faster not to sort it when
-        # large queries are being made.
-        needed_keys = sorted(keys)
-        bloom_raw_offsets = {}
-        if self._use_blooms and self._get_global_bloom() is not None:
-            bloom_raw_offsets = NodeBloom.get_raw_offsets(needed_keys)
-        for found, info in self._iter_sorted_entries(needed_keys,
-                                                     bloom_raw_offsets,
-                                                     {}):
-            if not found:
-                continue
-            yield info
-
-    def _iter_sorted_entries(self, keys, bloom_raw_offsets, object_dedup_cache):
-        """Iterate over keys within the index.
-
-        :param keys: A list of sorted keys to search for
-        :param bloom_raw_offsets: A dict mapping key => bloom raw offsets
-        :param object_dedup_cache: A dict to use to save objects like strings
-            and tuples, so that we don't consume a lot of redundant memory.
-        :return: An iterable which either returns (False, key) if the key is
-            not present, or (True, (key, value,[refs])) if it is present (same
-            info as iter_all_entries)
-            The keys should be returned in sorted order
-        """
         global leaf_value_hits, miss_attempts, bloom_hits, dupes
         if not self.key_count():
             return
@@ -987,8 +962,8 @@
                 else:
                     leaf_value_hits[1] += 1
                     needed_keys.append(key)
+
         last_key = None
-        missing_keys = []
         if self._use_blooms:
             global_bloom = self._get_global_bloom()
             if global_bloom is not None:
@@ -998,14 +973,20 @@
                         dupes[0] += 1
                         continue
                     last_key = key
-                    if global_bloom.contains_raw_offset(bloom_raw_offsets[key]):
+                    if '\x00'.join(key) in global_bloom:
                         remaining_keys.append(key)
                     else:
-                        missing_keys.append(key)
-                bloom_hits[0] += len(missing_keys)
+                        bloom_hits[0] += 1
                 needed_keys = remaining_keys
         else:
             needed_keys = keys
+        # 6 seconds spent in miss_torture using the sorted() line.
+        # Even with out of order disk IO it seems faster not to sort it when
+        # large queries are being made.
+        if not needed_keys:
+            return
+        needed_keys = sorted(needed_keys)
+
         nodes_and_keys = [(0, needed_keys)]
 
         for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
@@ -1033,45 +1014,23 @@
         # We should now be at the _LeafNodes
         node_indexes = [idx for idx, s_keys in nodes_and_keys]
 
-        def step(iterator):
-            try:
-                return iterator.next()
-            except StopIteration:
-                return None
         # TODO: We may *not* want to always read all the nodes in one
         #       big go. Consider setting a max size on this.
-        missing_key_iter = iter(missing_keys)
-        next_missing_key = step(missing_key_iter)
 
         nodes = self._get_leaf_nodes(node_indexes)
         for node_index, sub_keys in nodes_and_keys:
             if not sub_keys:
                 continue
             node = nodes[node_index]
-            sub_key_iter = iter(sub_keys)
-            next_sub_key = step(sub_key_iter)
-            # Output all of the nodes in sorted order, including the missing
-            # ones
-            while next_sub_key is not None:
-                if next_missing_key is not None:
-                    if next_missing_key < next_sub_key:
-                        yield False, next_missing_key
-                        next_missing_key = step(missing_key_iter)
-                        continue
-                # either next_missing_key is None, or it comes after sub_key
+            for next_sub_key in sub_keys:
                 if next_sub_key in node.keys:
                     value, refs = node.keys[next_sub_key]
                     if self.node_ref_lists:
-                        yield True, (self, next_sub_key, value, refs)
+                        yield (self, next_sub_key, value, refs)
                     else:
-                        yield True, (self, next_sub_key, value)
+                        yield (self, next_sub_key, value)
                 else:
-                    yield False, next_sub_key
                     miss_attempts += 1
-                next_sub_key = step(sub_key_iter)
-        while next_missing_key is not None:
-            yield False, next_missing_key
-            next_missing_key = step(missing_key_iter)
 
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.

=== modified file 'indexbench.py'
--- a/indexbench.py	2008-07-14 13:08:56 +0000
+++ b/indexbench.py	2008-07-16 13:20:23 +0000
@@ -132,8 +132,9 @@
 # each time. This tests the minimum time to get to an average key from 'start'.
 # On GraphIndex this should be much slower than on BTree index.
     if 'random_reload' in fixtures:
-        run(label, 'iter_random_reload', iter_random_one, 'iter_random_reload',
-            label, drop_cache, names, target, factory, name_keys, True)
+        run(label, 'iter_random_reload', iter_random_one, label,
+            'iter_random_reload', drop_cache, names, target, factory,
+            name_keys, True)
 # text extraction simulation (follow a compression graph) for text_keys
     if 'text' in fixtures:
         text_names = [name for name in names if name.endswith('.tix')]
@@ -308,6 +309,7 @@
             test_transport = get_transport(test_area)
 
         if btree:
+            print 'Using blooms: %s' % (BTreeGraphIndex._default_use_blooms,)
             self.test_class(names, source, BTreeGraphIndex, BTreeBuilder,
                 'BTreeIndex', name_keys, text_keys, drop_cache, fixture, lspro,
                 tip_revision_id, ancestry, test_transport)
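
For anyone re-running indexbench, the flag printed above is the
class-level default on BTreeGraphIndex, so a comparison run without blooms
only needs it flipped before the benchmark starts (assuming the plugin is
installed as bzrlib.plugins.index2, matching the imports in the diff):

    from bzrlib.plugins.index2.btree_index import BTreeGraphIndex
    BTreeGraphIndex._default_use_blooms = False  # benchmark with blooms off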


