Rev 21: Split out the NodeBloom.get_raw_offsets into a static function. in http://bzr.arbash-meinel.com/plugins/index2

Wed Jul 2 17:50:55 BST 2008

At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 21
revision-id: john at arbash-meinel.com-20080702165051-rt6pl41h8tt0wo3u
parent: john at arbash-meinel.com-20080702162254-tgks103u1pyz2fl6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 11:50:51 -0500
message:
  Split out the NodeBloom.get_raw_offsets into a static function.
  Break the node caching into 2 caches, one for internal nodes, one for leaf nodes.
-------------- next part --------------
=== modified file 'btree_index.py'

--- a/btree_index.py	2008-07-02 16:22:54 +0000
+++ b/btree_index.py	2008-07-02 16:50:51 +0000
@@ -57,7 +57,8 @@
         # methods needing the entry count are called.
         self._num_entries = None
 
-    def get_raw_offsets(self, keys):
+    @staticmethod
+    def get_raw_offsets(keys):
         """Compute the raw internal offsets for these keys"""
         offsets = {}
         for key in keys:
@@ -380,7 +381,9 @@
         self._name = name
         self._size = size
         self._page_size = transport.recommended_page_size()
-        self._node_cache = lru_cache.LRUCache()
+        self._root_node = None
+        self._leaf_node_cache = lru_cache.LRUCache()
+        self._internal_node_cache = lru_cache.LRUCache()
         self._key_count = None
         self._use_blooms = BTreeGraphIndex._default_use_blooms
 
@@ -395,7 +398,15 @@
     def __ne__(self, other):
         return not self.__eq__(other)
 
-    def _cache_nodes(self, nodes):
+    def _get_root_node(self):
+        if self._root_node is None:
+            # We may not have a root node yet
+            nodes = list(self._read_nodes([0]))
+            if len(nodes):
+                self._root_node = nodes[0][1]
+        return self._root_node
+
+    def _cache_nodes(self, nodes, cache):
         """Read nodes and cache them in the lru.
 
         The nodes list supplied is sorted and then read from disk, each node
@@ -405,37 +416,45 @@
         result in some of the results being immediately discarded, to prevent
         this an assertion is raised if more nodes are asked for than are
         cachable.
+
+        :return: A dict of {node_pos: node}
         """
-        if len(nodes) > self._node_cache._max_cache:
+        if len(nodes) > cache._max_cache:
             raise AssertionError("Asked for more nodes than can cache.")
+        found = {}
         for node_pos, node in self._read_nodes(sorted(nodes)):
-            self._node_cache.add(node_pos, node)
-
-    def _get_node(self, node_index):
-        """Get a node, from cache or disk.
-
-        After getting it, the node will be cached.
-        """
-        try:
-            return self._node_cache[node_index]
-        except KeyError:
-            self._cache_nodes([node_index])
-            return self._node_cache[node_index]
-
-    def _get_nodes(self, node_indexes):
-        """Get a bunch of nodes, from cache or disk."""
+            if node_pos == 0: # Special case
+                self._root_node = node
+            else:
+                cache.add(node_pos, node)
+            found[node_pos] = node
+        return found
+
+    def _get_nodes(self, cache, node_indexes):
         found = {}
         needed = []
         for idx in node_indexes:
+            if idx == 0 and self._root_node is not None:
+                found[0] = self._root_node
+                continue
             try:
-                found[idx] = self._node_cache[idx]
+                found[idx] = cache[idx]
             except KeyError:
                 needed.append(idx)
-        self._cache_nodes(needed)
-        for idx in needed:
-            found[idx] = self._node_cache[idx]
+        found.update(self._cache_nodes(needed, cache))
         return found
 
+    def _get_internal_nodes(self, node_indexes):
+        """Get a node, from cache or disk.
+
+        After getting it, the node will be cached.
+        """
+        return self._get_nodes(self._internal_node_cache, node_indexes)
+
+    def _get_leaf_nodes(self, node_indexes):
+        """Get a bunch of nodes, from cache or disk."""
+        return self._get_nodes(self._leaf_node_cache, node_indexes)
+
     def iter_all_entries(self):
         """Iterate over all keys within the index.
 
@@ -448,9 +467,7 @@
         if 'evil' in debug.debug_flags:
             trace.mutter_callsite(3,
                 "iter_all_entries scales with size of history.")
-        if self._key_count is None:
-            self._cache_nodes([0])
-        if not self._key_count:
+        if not self.key_count():
             return
         internal_nodes = sum(self._row_lengths[:-1])
         start_node = internal_nodes
@@ -542,20 +559,18 @@
             keys supplied. No additional keys will be returned, and every
             key supplied that is in the index will be returned.
         """
-        if self._key_count is None:
-            self._cache_nodes([0])
-        if not self._key_count:
-            return
         # 6 seconds spent in miss_torture using the sorted() line.
         # Even with out of order disk IO it seems faster not to sort it when
         # large queries are being made.
+        # However, now that we are doing multi-way bisecting, we need the keys
+        # in sorted order anyway. Though we could filter at the bloom level
+        # without sorting. *But* is the top level bloom going to restrict
+        # enough keys that we care?
         # keys = sorted(keys)
         keys = frozenset(keys)
         if not keys:
             return
-        if self._key_count is None:
-            self._cache_nodes([0])
-        if not self._key_count:
+        if not self.key_count():
             return
         nodes_and_keys = [(0, keys)]
         row_offset = 0
@@ -564,11 +579,10 @@
             global bloom_hits
             if len(self._row_lengths) > 1:
                 # no internal nodes, and thus no blooms in a length-1 b+tree.
-                node = self._get_node(0)
-                bloom_raw_offsets = node.bloom.get_raw_offsets(keys)
+                bloom_raw_offsets = NodeBloom.get_raw_offsets(keys)
         for row_length in self._row_lengths[:-1]:
             node_indexes = [idx for idx, s_keys in nodes_and_keys]
-            nodes = self._get_nodes(node_indexes)
+            nodes = self._get_internal_nodes(node_indexes)
 
             row_offset += row_length
 
@@ -594,7 +608,7 @@
         # We do it right now, because otherwise we swap the cache,
         # alternatively, we may *not* want to always read all the nodes in one
         # big go. Consider setting a max size on this.
-        nodes = dict(self._read_nodes(node_indexes))
+        nodes = self._get_leaf_nodes(node_indexes)
         for node_index, sub_keys in nodes_and_keys:
             node = nodes[node_index]
             for key in sub_keys:
@@ -636,7 +650,7 @@
         header.
         """
         if self._key_count is None:
-            self._cache_nodes([0])
+            self._get_root_node()
         return self._key_count
 
     def _parse_header_from_bytes(self, bytes):

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-02 16:12:04 +0000
+++ b/tests/test_btree_index.py	2008-07-02 16:50:51 +0000
@@ -193,10 +193,11 @@
         index.key_count()
         self.assertEqual(3, len(index._row_lengths),
             "Not enough rows: %r" % index._row_lengths)
-        internal_node1 = index._get_node(1)
+        internal_nodes = index._get_internal_nodes([1, 2])
+        internal_node1 = internal_nodes[1]
+        internal_node2 = internal_nodes[2]
         # Must have a bloom on the first node.
         self.assertNotEqual(None, internal_node1.bloom)
-        internal_node2 = index._get_node(2)
         # The left most node node2 points at should be one after the right most node pointed at by
         # node1.
         self.assertEqual(internal_node2.offset, 1 + len(internal_node1.keys))
@@ -204,7 +205,7 @@
         # should be its first key. We can check this by looking for its first key
         # in the second node it points at
         pos = index._row_lengths[0] + index._row_lengths[1] + internal_node2.offset + 1
-        leaf = index._get_node(pos)
+        leaf = index._get_leaf_nodes([pos])[pos]
         self.assertTrue(internal_node2.keys[0] in leaf.keys)
         # Check the bloom filter for internal_node2: all the keys in the leaf
         # should appear to be present
@@ -214,7 +215,7 @@
         # the same fashion.
         for offset in [0, 1]:
             pos = index._row_lengths[0] + index._row_lengths[1] + offset
-            leaf = index._get_node(pos)
+            leaf = index._get_leaf_nodes([pos])[pos]
             for key in leaf.keys:
                 self.assertTrue('\x00'.join(key) in internal_node1.bloom)
 
@@ -659,26 +660,15 @@
 
     def test_raw_offsets(self):
         # Raw offsets are fixed for all keys
-        bytes = '\x00'*256
-        node_bloom = btree_index.NodeBloom(bytes)
         self.assertEqual({('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
-                         node_bloom.get_raw_offsets([('foo',)]))
+                         btree_index.NodeBloom.get_raw_offsets([('foo',)]))
         self.assertEqual({('foo', 'bar'): (-490536797, -1827560737, -889226859, 675372215, 1276719487),
                           ('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
-                         node_bloom.get_raw_offsets([('foo', 'bar'), ('foo',)]))
-
-    def test_contains_raw_offsets(self):
-        basic_bloom = BloomSHA1(256 * 8)
-        basic_bloom.insert('foo')
-        basic_bloom.insert('bar')
-
-        node_bloom = btree_index.NodeBloom(basic_bloom._array.tostring())
-        raw_offsets = node_bloom.get_raw_offsets([('foo',), ('bar',), ('baz',)])
-        self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('foo',)]))
-        self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('bar',)]))
-        self.assertFalse(node_bloom.contains_raw_offset(raw_offsets[('baz',)]))
+                         btree_index.NodeBloom.get_raw_offsets([('foo', 'bar'), ('foo',)]))
 
     def test_contains_raw_offsets_mixed_sizes(self):
+        raw_offsets = btree_index.NodeBloom.get_raw_offsets([('foo',), ('bar',), ('baz',)])
+
         small_bloom = BloomSHA1(16 * 8)
         small_bloom.insert('foo')
         small_bloom.insert('bar')
@@ -688,9 +678,7 @@
 
         small_node_bloom = btree_index.NodeBloom(small_bloom._array.tostring())
         keys = [('foo',), ('bar',), ('baz',)]
-        raw_offsets = small_node_bloom.get_raw_offsets(keys)
         big_node_bloom = btree_index.NodeBloom(big_bloom._array.tostring())
-        self.assertEqual(raw_offsets, big_node_bloom.get_raw_offsets(keys))
 
         self.assertTrue(small_node_bloom.contains_raw_offset(raw_offsets[('foo',)]))
         self.assertTrue(small_node_bloom.contains_raw_offset(raw_offsets[('bar',)]))