Rev 24: Updates for when bloom filters are active. in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 2 19:17:51 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 24
revision-id: john at arbash-meinel.com-20080702181746-xlca6kf0iusfmxsy
parent: john at arbash-meinel.com-20080702174605-j9oaswfan8zvg2k7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 13:17:46 -0500
message:
  Updates for when bloom filters are active.
  Set NodeBloom._num_entries because it is required for Bloom.__repr__
  (see the first sketch after this message).
  Allow requesting more nodes than can be cached; just mutter that not all
  of them will actually be stored. Since the function returns the objects,
  they remain valid for a little while (see the cache sketch below).
  When using bloom filters, we need to sort our keys if they weren't already
  sorted.
  Update the indexbench tests to use graph.iter_ancestry() and to assert
  that the result is actually correct.
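
The _num_entries item above is about keeping Bloom.__repr__ usable: formatting
None with %d raises a TypeError, so a -1 sentinel is stored instead. A minimal
sketch of the idea (BloomSketch is a made-up class, not the plugin's NodeBloom):

    class BloomSketch(object):
        """Illustrative only -- not the plugin's NodeBloom."""

        def __init__(self, num_bytes, num_entries=-1):
            self._num_bytes = num_bytes
            # -1 is a sentinel for "unknown"; leaving this as None would
            # make the %d formatting in __repr__ raise a TypeError.
            self._num_entries = num_entries

        def __repr__(self):
            return '%s(%d bytes, %d entries)' % (
                self.__class__.__name__, self._num_bytes, self._num_entries)

    print(repr(BloomSketch(4096)))  # works even while the count is unknown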
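
The cache item can be pictured with a small stand-in cache (hypothetical; only
the _max_cache attribute name is borrowed from the real code): requests beyond
the limit are simply not retained, but the caller's own references keep the
returned objects alive while they are used.

    class CacheSketch(object):
        """Illustrative stand-in; not the plugin's node cache."""

        def __init__(self, max_cache):
            self._max_cache = max_cache
            self._nodes = {}

        def cache_nodes(self, nodes):
            if len(nodes) > self._max_cache:
                # Previously an AssertionError; now the overflow is only noted.
                print('Requesting %s > %s nodes, not all will be cached'
                      % (len(nodes), self._max_cache))
            found = {}
            for pos, node in sorted(nodes.items()):
                if len(self._nodes) < self._max_cache:
                    self._nodes[pos] = node
                found[pos] = node
            # Everything asked for is returned, cached or not.
            return found

    cache = CacheSketch(max_cache=2)
    found = cache.cache_nodes({1: 'a', 2: 'b', 3: 'c'})
    assert sorted(found) == [1, 2, 3]
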
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-02 16:50:51 +0000
+++ b/btree_index.py	2008-07-02 18:17:46 +0000
@@ -53,9 +53,8 @@
         self._num_bits = self._num_bytes * 8
         self._bitmask = self._num_bits - 1
         self._array = array.array('B', bytes)
-        # we don't know, so make things die if 
-        # methods needing the entry count are called.
-        self._num_entries = None
+        # __repr__ wants the num entries, so force it off
+        self._num_entries = -1 # Unknown
 
     @staticmethod
     def get_raw_offsets(keys):
@@ -420,7 +419,8 @@
         :return: A dict of {node_pos: node}
         """
         if len(nodes) > cache._max_cache:
-            raise AssertionError("Asked for more nodes than can cache.")
+            trace.mutter('Requesting %s > %s nodes, not all will be cached',
+                         len(nodes), cache._max_cache)
         found = {}
         for node_pos, node in self._read_nodes(sorted(nodes)):
             if node_pos == 0: # Special case
@@ -594,7 +594,10 @@
                     remaining_sub_keys = [sub_key for sub_key in sub_keys
                         if node.bloom.contains_raw_offset(bloom_raw_offsets[sub_key])]
                     bloom_hits += len(sub_keys) - len(remaining_sub_keys)
-                    sub_keys = remaining_sub_keys
+                    if isinstance(sub_keys, frozenset):
+                        sub_keys = sorted(remaining_sub_keys)
+                    else:
+                        sub_keys = remaining_sub_keys
                 if isinstance(sub_keys, frozenset):
                     sub_keys = sorted(sub_keys)
                 positions = self._multi_bisect_right(sub_keys, node.keys)
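
The hunk above is the "sort our keys" item from the commit message: the bloom
filter is applied to a frozenset of candidate keys, and the survivors come back
with no defined order, so they must be sorted before being bisected against a
node's sorted key list. A toy sketch follows; multi_bisect_right_sketch is a
made-up name, not the plugin's _multi_bisect_right:

    import bisect

    def multi_bisect_right_sketch(search_keys, node_keys):
        """Return (insertion_point, key) pairs; search_keys must be sorted."""
        return [(bisect.bisect_right(node_keys, key), key)
                for key in search_keys]

    node_keys = [('alpha',), ('gamma',), ('omega',)]
    candidates = frozenset([('omega',), ('alpha',), ('zeta',)])
    survivors = sorted(candidates)  # a frozenset has no order, so sort first
    print(multi_bisect_right_sketch(survivors, node_keys))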

=== modified file 'indexbench.py'
--- a/indexbench.py	2008-07-02 17:46:05 +0000
+++ b/indexbench.py	2008-07-02 18:17:46 +0000
@@ -53,7 +53,7 @@
 
 
 def time_index(names, source, factory, builder, target, label, name_keys,
-    text_keys, use_drop_cache, fixtures, lsprof, tip_revision_id):
+    text_keys, use_drop_cache, fixtures, lsprof, tip_revision_id, ancestry):
     if use_drop_cache:
         drop_cache = drop_caches
     else:
@@ -128,14 +128,11 @@
         index = _KnitGraphIndex(rev_index, lambda:True, deltas=True, parents=True)
         vf = KnitVersionedFiles(index, None)
         graph = Graph(vf)
-        search = graph._make_breadth_first_searcher([(tip_revision_id,)])
-        while True:
-            try:
-                search.next()
-            except StopIteration:
-                break
+        test_ancestry = dict(graph.iter_ancestry([tip_revision_id]))
         finish = time.time()
         print "%s: search_from_revid in %0.3f, bloom(%d)" % (label, finish - now, btree_index.bloom_hits)
+        if test_ancestry != ancestry:
+            print "**** Warning ancestry incorrect."
 # pathologic miss-lookups: ask each index of each type for the keys held by the
 # others of that type
     if 'miss' in fixtures:
@@ -182,7 +179,13 @@
         source_branch = Branch.open(sample_branch)
         source = source_branch.repository._transport
         source = source.clone('indices')
-        tip_revision_id = source_branch.last_revision()
+        source_branch.lock_read()
+        try:
+            tip_revision_id = source_branch.last_revision()
+            g = source_branch.repository.get_graph()
+            ancestry = dict(g.iter_ancestry([tip_revision_id]))
+        finally:
+            source_branch.unlock()
         names = source.list_dir('.')
         name_keys = {}
         text_keys = set()
@@ -203,18 +206,19 @@
         if btree:
             self.test_class(names, source, BTreeGraphIndex, BTreeBuilder,
                 'BTreeIndex', name_keys, text_keys, drop_cache, fixture, lspro,
-                tip_revision_id)
+                tip_revision_id, ancestry)
         if graph:
             self.test_class(names, source, GraphIndex, GraphIndexBuilder,
                 'GraphIndex', name_keys, text_keys, drop_cache, fixture, lspro,
-                tip_revision_id)
+                tip_revision_id, ancestry)
 
     def test_class(self, names, source, factory, builder, label, name_keys,
-        text_keys, drop_cache, fixtures, lsprof, tip_revision_id):
+        text_keys, drop_cache, fixtures, lsprof, tip_revision_id, ancestry):
         workingdir = tempfile.mkdtemp()
         t = get_transport(workingdir)
         try:
             time_index(names, source, factory, builder, t, label, name_keys,
-                text_keys, drop_cache, fixtures, lsprof, tip_revision_id)
+                text_keys, drop_cache, fixtures, lsprof, tip_revision_id,
+                ancestry)
         finally:
             t.delete_tree('.')
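
The benchmark now walks the full ancestry once up front and checks that the
search over the rebuilt index reproduces it. Assuming Graph.iter_ancestry()
yields (revision_id, parent_ids) pairs, the check is just a dict comparison;
a self-contained toy version:

    def iter_ancestry_sketch(parent_map, tips):
        """Yield (revision_id, parents) for everything reachable from tips.

        parent_map is a plain {revision_id: parent_ids} dict, standing in
        for a real repository graph.
        """
        pending = list(tips)
        seen = set()
        while pending:
            rev_id = pending.pop()
            if rev_id in seen:
                continue
            seen.add(rev_id)
            parents = parent_map.get(rev_id, ())
            yield rev_id, parents
            pending.extend(parents)

    toy_graph = {'rev-3': ('rev-2',), 'rev-2': ('rev-1',), 'rev-1': ()}
    expected = dict(iter_ancestry_sketch(toy_graph, ['rev-3']))
    # A second walk must reproduce the same dict, or something is wrong:
    assert dict(iter_ancestry_sketch(toy_graph, ['rev-3'])) == expected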

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-02 16:50:51 +0000
+++ b/tests/test_btree_index.py	2008-07-02 18:17:46 +0000
@@ -493,6 +493,29 @@
         self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
             transport._activity)
 
+    def test_iter_entries_w_blooms_finds_correct_nodes(self):
+        builder = btree_index.BTreeBuilder(key_elements=1)
+        nodes = self.make_nodes(1200, 1, 0)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        del builder
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        index._use_blooms = True
+        # Search for every 100th key, to make sure the blooms are correct
+        search_nodes = nodes[::100]
+        search_keys = [n[0] for n in search_nodes]
+        # We expect (key, value) tuples from the search result
+        expected_result = sorted([n[:2] for n in search_nodes])
+        # Ignore the index object
+        found_nodes = [n[1:] for n in index.iter_entries(search_keys)]
+        sorted_result = sorted(found_nodes)
+        if expected_result != sorted_result:
+            self.assertEqualDiff(pprint.pformat(expected_result),
+                                 pprint.pformat(sorted_result))
+
 
 class TestBTreeNodes(BTreeTestCase):
 


