Rev 21: Split out the NodeBloom.get_raw_offsets into a static function (in http://bzr.arbash-meinel.com/plugins/index2)
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 2 17:50:55 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 21
revision-id: john at arbash-meinel.com-20080702165051-rt6pl41h8tt0wo3u
parent: john at arbash-meinel.com-20080702162254-tgks103u1pyz2fl6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 11:50:51 -0500
message:
Split out the NodeBloom.get_raw_offsets into a static function.
Break the node caching into 2 caches, one for internal nodes, one for leaf nodes.
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-02 16:22:54 +0000
+++ b/btree_index.py 2008-07-02 16:50:51 +0000
@@ -57,7 +57,8 @@
# methods needing the entry count are called.
self._num_entries = None
- def get_raw_offsets(self, keys):
+ @staticmethod
+ def get_raw_offsets(keys):
"""Compute the raw internal offsets for these keys"""
offsets = {}
for key in keys:
@@ -380,7 +381,9 @@
self._name = name
self._size = size
self._page_size = transport.recommended_page_size()
- self._node_cache = lru_cache.LRUCache()
+ self._root_node = None
+ self._leaf_node_cache = lru_cache.LRUCache()
+ self._internal_node_cache = lru_cache.LRUCache()
self._key_count = None
self._use_blooms = BTreeGraphIndex._default_use_blooms
@@ -395,7 +398,15 @@
def __ne__(self, other):
return not self.__eq__(other)
- def _cache_nodes(self, nodes):
+ def _get_root_node(self):
+ if self._root_node is None:
+ # We may not have a root node yet
+ nodes = list(self._read_nodes([0]))
+ if len(nodes):
+ self._root_node = nodes[0][1]
+ return self._root_node
+
+ def _cache_nodes(self, nodes, cache):
"""Read nodes and cache them in the lru.
The nodes list supplied is sorted and then read from disk, each node
@@ -405,37 +416,45 @@
result in some of the results being immediately discarded, to prevent
this an assertion is raised if more nodes are asked for than are
cachable.
+
+ :return: A dict of {node_pos: node}
"""
- if len(nodes) > self._node_cache._max_cache:
+ if len(nodes) > cache._max_cache:
raise AssertionError("Asked for more nodes than can cache.")
+ found = {}
for node_pos, node in self._read_nodes(sorted(nodes)):
- self._node_cache.add(node_pos, node)
-
- def _get_node(self, node_index):
- """Get a node, from cache or disk.
-
- After getting it, the node will be cached.
- """
- try:
- return self._node_cache[node_index]
- except KeyError:
- self._cache_nodes([node_index])
- return self._node_cache[node_index]
-
- def _get_nodes(self, node_indexes):
- """Get a bunch of nodes, from cache or disk."""
+ if node_pos == 0: # Special case
+ self._root_node = node
+ else:
+ cache.add(node_pos, node)
+ found[node_pos] = node
+ return found
+
+ def _get_nodes(self, cache, node_indexes):
found = {}
needed = []
for idx in node_indexes:
+ if idx == 0 and self._root_node is not None:
+ found[0] = self._root_node
+ continue
try:
- found[idx] = self._node_cache[idx]
+ found[idx] = cache[idx]
except KeyError:
needed.append(idx)
- self._cache_nodes(needed)
- for idx in needed:
- found[idx] = self._node_cache[idx]
+ found.update(self._cache_nodes(needed, cache))
return found
+ def _get_internal_nodes(self, node_indexes):
+ """Get a node, from cache or disk.
+
+ After getting it, the node will be cached.
+ """
+ return self._get_nodes(self._internal_node_cache, node_indexes)
+
+ def _get_leaf_nodes(self, node_indexes):
+ """Get a bunch of nodes, from cache or disk."""
+ return self._get_nodes(self._leaf_node_cache, node_indexes)
+
def iter_all_entries(self):
"""Iterate over all keys within the index.
@@ -448,9 +467,7 @@
if 'evil' in debug.debug_flags:
trace.mutter_callsite(3,
"iter_all_entries scales with size of history.")
- if self._key_count is None:
- self._cache_nodes([0])
- if not self._key_count:
+ if not self.key_count():
return
internal_nodes = sum(self._row_lengths[:-1])
start_node = internal_nodes
@@ -542,20 +559,18 @@
keys supplied. No additional keys will be returned, and every
key supplied that is in the index will be returned.
"""
- if self._key_count is None:
- self._cache_nodes([0])
- if not self._key_count:
- return
# 6 seconds spent in miss_torture using the sorted() line.
# Even with out of order disk IO it seems faster not to sort it when
# large queries are being made.
+ # However, now that we are doing multi-way bisecting, we need the keys
+ # in sorted order anyway. Though we could filter at the bloom level
+ # without sorting. *But* is the top level bloom going to restrict
+ # enough keys that we care?
# keys = sorted(keys)
keys = frozenset(keys)
if not keys:
return
- if self._key_count is None:
- self._cache_nodes([0])
- if not self._key_count:
+ if not self.key_count():
return
nodes_and_keys = [(0, keys)]
row_offset = 0
@@ -564,11 +579,10 @@
global bloom_hits
if len(self._row_lengths) > 1:
# no internal nodes, and thus no blooms in a length-1 b+tree.
- node = self._get_node(0)
- bloom_raw_offsets = node.bloom.get_raw_offsets(keys)
+ bloom_raw_offsets = NodeBloom.get_raw_offsets(keys)
for row_length in self._row_lengths[:-1]:
node_indexes = [idx for idx, s_keys in nodes_and_keys]
- nodes = self._get_nodes(node_indexes)
+ nodes = self._get_internal_nodes(node_indexes)
row_offset += row_length
@@ -594,7 +608,7 @@
# We do it right now, because otherwise we swap the cache,
# alternatively, we may *not* want to always read all the nodes in one
# big go. Consider setting a max size on this.
- nodes = dict(self._read_nodes(node_indexes))
+ nodes = self._get_leaf_nodes(node_indexes)
for node_index, sub_keys in nodes_and_keys:
node = nodes[node_index]
for key in sub_keys:
@@ -636,7 +650,7 @@
header.
"""
if self._key_count is None:
- self._cache_nodes([0])
+ self._get_root_node()
return self._key_count
def _parse_header_from_bytes(self, bytes):
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-02 16:12:04 +0000
+++ b/tests/test_btree_index.py 2008-07-02 16:50:51 +0000
@@ -193,10 +193,11 @@
index.key_count()
self.assertEqual(3, len(index._row_lengths),
"Not enough rows: %r" % index._row_lengths)
- internal_node1 = index._get_node(1)
+ internal_nodes = index._get_internal_nodes([1, 2])
+ internal_node1 = internal_nodes[1]
+ internal_node2 = internal_nodes[2]
# Must have a bloom on the first node.
self.assertNotEqual(None, internal_node1.bloom)
- internal_node2 = index._get_node(2)
# The left most node node2 points at should be one after the right most node pointed at by
# node1.
self.assertEqual(internal_node2.offset, 1 + len(internal_node1.keys))
@@ -204,7 +205,7 @@
# should be its first key. We can check this by looking for its first key
# in the second node it points at
pos = index._row_lengths[0] + index._row_lengths[1] + internal_node2.offset + 1
- leaf = index._get_node(pos)
+ leaf = index._get_leaf_nodes([pos])[pos]
self.assertTrue(internal_node2.keys[0] in leaf.keys)
# Check the bloom filter for internal_node2: all the keys in the leaf
# should appear to be present
@@ -214,7 +215,7 @@
# the same fashion.
for offset in [0, 1]:
pos = index._row_lengths[0] + index._row_lengths[1] + offset
- leaf = index._get_node(pos)
+ leaf = index._get_leaf_nodes([pos])[pos]
for key in leaf.keys:
self.assertTrue('\x00'.join(key) in internal_node1.bloom)
@@ -659,26 +660,15 @@
def test_raw_offsets(self):
# Raw offsets are fixed for all keys
- bytes = '\x00'*256
- node_bloom = btree_index.NodeBloom(bytes)
self.assertEqual({('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
- node_bloom.get_raw_offsets([('foo',)]))
+ btree_index.NodeBloom.get_raw_offsets([('foo',)]))
self.assertEqual({('foo', 'bar'): (-490536797, -1827560737, -889226859, 675372215, 1276719487),
('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
- node_bloom.get_raw_offsets([('foo', 'bar'), ('foo',)]))
-
- def test_contains_raw_offsets(self):
- basic_bloom = BloomSHA1(256 * 8)
- basic_bloom.insert('foo')
- basic_bloom.insert('bar')
-
- node_bloom = btree_index.NodeBloom(basic_bloom._array.tostring())
- raw_offsets = node_bloom.get_raw_offsets([('foo',), ('bar',), ('baz',)])
- self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('foo',)]))
- self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('bar',)]))
- self.assertFalse(node_bloom.contains_raw_offset(raw_offsets[('baz',)]))
+ btree_index.NodeBloom.get_raw_offsets([('foo', 'bar'), ('foo',)]))
def test_contains_raw_offsets_mixed_sizes(self):
+ raw_offsets = btree_index.NodeBloom.get_raw_offsets([('foo',), ('bar',), ('baz',)])
+
small_bloom = BloomSHA1(16 * 8)
small_bloom.insert('foo')
small_bloom.insert('bar')
@@ -688,9 +678,7 @@
small_node_bloom = btree_index.NodeBloom(small_bloom._array.tostring())
keys = [('foo',), ('bar',), ('baz',)]
- raw_offsets = small_node_bloom.get_raw_offsets(keys)
big_node_bloom = btree_index.NodeBloom(big_bloom._array.tostring())
- self.assertEqual(raw_offsets, big_node_bloom.get_raw_offsets(keys))
self.assertTrue(small_node_bloom.contains_raw_offset(raw_offsets[('foo',)]))
self.assertTrue(small_node_bloom.contains_raw_offset(raw_offsets[('bar',)]))
More information about the bazaar-commits mailing list