Rev 34: Pre-compute the row offsets, paving the way for sneaking in extra pages in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Thu Jul 3 22:11:25 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 34
revision-id: john at arbash-meinel.com-20080703211121-hkin5osvlz0soy6w
parent: john at arbash-meinel.com-20080703171239-p0yr682smdn1jb3v
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Thu 2008-07-03 16:11:21 -0500
message:
Pre-compute the row offsets, paving the way for sneaking in extra pages
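The offsets being added are just cumulative sums over the per-row node counts, with one extra entry holding the total. A minimal sketch of the idea, using illustrative row lengths rather than values from a real index:

    row_lengths = [1, 4, 120]      # root row, internal row, leaf row (made up)
    row_offsets = []
    total = 0
    for length in row_lengths:
        row_offsets.append(total)  # absolute index of the first node in this row
        total += length
    row_offsets.append(total)      # one past the last node, i.e. the node count
    # row_offsets == [0, 1, 5, 125]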
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-03 07:31:37 +0000
+++ b/btree_index.py 2008-07-03 21:11:21 +0000
@@ -372,6 +372,8 @@
self._internal_node_cache = lru_cache.LRUCache()
self._key_count = None
self._use_blooms = BTreeGraphIndex._default_use_blooms
+ self._row_lengths = None
+ self._row_offsets = None # Start of each row, [-1] is the end
def __eq__(self, other):
"""Equal when self and other were created with the same parameters."""
@@ -469,10 +471,11 @@
"iter_all_entries scales with size of history.")
if not self.key_count():
return
- internal_nodes = sum(self._row_lengths[:-1])
- start_node = internal_nodes
- end_node = start_node + self._row_lengths[-1]
- needed_nodes = range(start_node, end_node)
+ start_of_leaves = self._row_offsets[-2]
+ end_of_leaves = self._row_offsets[-1]
+ needed_nodes = range(start_of_leaves, end_of_leaves)
+ # TODO: we *might* want to check in the cache for nodes which we
+ # already have parsed
if self.node_ref_lists:
for _, node in self._read_nodes(needed_nodes):
for key, (value, refs) in node.keys.items():
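With the offsets available, the leaf row is simply the range between the last two entries, so the old arithmetic over _row_lengths and the new lookups agree. A quick check with the same illustrative layout (assumed numbers, not from a real index):

    row_lengths = [1, 4, 120]
    row_offsets = [0, 1, 5, 125]
    assert sum(row_lengths[:-1]) == row_offsets[-2]              # leaves start at 5
    assert row_offsets[-2] + row_lengths[-1] == row_offsets[-1]  # leaves end at 125
    needed_nodes = range(row_offsets[-2], row_offsets[-1])       # nodes 5..124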
@@ -610,19 +613,16 @@
needed_keys = sorted(needed_keys)
nodes_and_keys = [(0, needed_keys)]
- row_offset = 0
if self._use_blooms:
global bloom_hits
if len(self._row_lengths) > 1:
# no internal nodes, and thus no blooms in a length-1 b+tree.
bloom_raw_offsets = NodeBloom.get_raw_offsets(needed_keys)
- for row_pos, row_length in enumerate(self._row_lengths[:-1]):
+ for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
node_indexes = [idx for idx, s_keys in nodes_and_keys]
nodes = self._get_internal_nodes(node_indexes)
- row_offset += row_length
-
next_nodes_and_keys = []
for node_index, sub_keys in nodes_and_keys:
node = nodes[node_index]
@@ -635,7 +635,7 @@
sub_keys = remaining_sub_keys
# For misses at this level, do we want to cache that fact?
positions = self._multi_bisect_right(sub_keys, node.keys)
- node_offset = row_offset + node.offset
+ node_offset = next_row_start + node.offset
next_nodes_and_keys.extend([(node_offset + pos, s_keys)
for pos, s_keys in positions])
nodes_and_keys = next_nodes_and_keys
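In the descent loop, next_row_start already points at the first node of the row below, so the absolute index of a child no longer needs a running row_offset accumulator. A small sketch of that arithmetic (the helper name is made up, not part of the plugin):

    def child_node_index(next_row_start, node_offset, bisect_pos):
        # next_row_start: absolute index of the first node in the row below
        # node_offset:    the internal node's stored offset (assumed to be the
        #                 position of its first child within that row)
        # bisect_pos:     which child the searched key bisected into
        return next_row_start + node_offset + bisect_pos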
@@ -733,6 +733,14 @@
if len(length)])
except ValueError:
raise bzrerrors.BadIndexOptions(self)
+ row_offset = 0
+ offsets = []
+ for row in self._row_lengths:
+ offsets.append(row_offset)
+ row_offset += row
+ offsets.append(row_offset)
+ self._row_offsets = offsets
+
# calculate the bytes we have processed
header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
return header_end, bytes[header_end:]
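On a newer Python the same cumulative sums could be expressed with itertools.accumulate; this is only an equivalent sketch for comparison, not what the plugin (which targets Python 2 era bzrlib) ships:

    from itertools import accumulate  # Python 3.2+

    def compute_row_offsets(row_lengths):
        # [0, l0, l0 + l1, ...]: start of each row, last entry is the node count
        return [0] + list(accumulate(row_lengths))

    # compute_row_offsets([1, 4, 120]) -> [0, 1, 5, 125]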
@@ -787,10 +795,9 @@
def validate(self):
"""Validate that everything in the index can be accessed."""
# just read and parse every node.
- if self._key_count is None:
- self._read_nodes([0]).next()
- node_count = sum(self._row_lengths) - 1
- for node in self._read_nodes(range(1, node_count + 1)):
+ self._get_root_node()
+ node_end = self._row_offsets[-1]
+ for node in self._read_nodes(range(1, node_end)):
pass
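The new validate() walks exactly the same nodes as before: the root via _get_root_node(), then everything after it up to the total node count. An illustrative equivalence check with the assumed layout from above:

    row_lengths = [1, 4, 120]
    row_offsets = [0, 1, 5, 125]
    old_range = range(1, (sum(row_lengths) - 1) + 1)  # old: node_count = sum - 1
    new_range = range(1, row_offsets[-1])             # new: read up to the total
    assert list(old_range) == list(new_range)         # nodes 1..124 either way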
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-03 03:45:49 +0000
+++ b/tests/test_btree_index.py 2008-07-03 21:11:21 +0000
@@ -217,6 +217,8 @@
index.key_count()
self.assertEqual(3, len(index._row_lengths),
"Not enough rows: %r" % index._row_lengths)
+ self.assertEqual(4, len(index._row_offsets))
+ self.assertEqual(sum(index._row_lengths), index._row_offsets[-1])
internal_nodes = index._get_internal_nodes([1, 2])
internal_node1 = internal_nodes[1]
internal_node2 = internal_nodes[2]
@@ -228,7 +230,7 @@
# The left most key of the second node pointed at by internal_node2
# should be its first key. We can check this by looking for its first key
# in the second node it points at
- pos = index._row_lengths[0] + index._row_lengths[1] + internal_node2.offset + 1
+ pos = index._row_offsets[2] + internal_node2.offset + 1
leaf = index._get_leaf_nodes([pos])[pos]
self.assertTrue(internal_node2.keys[0] in leaf.keys)
# Check the bloom filter for internal_node2: all the keys in the leaf
@@ -238,7 +240,7 @@
# Check the bloom filter for internal_node1 with its first two nodes in
# the same fashion.
for offset in [0, 1]:
- pos = index._row_lengths[0] + index._row_lengths[1] + offset
+ pos = index._row_offsets[2] + offset
leaf = index._get_leaf_nodes([pos])[pos]
for key in leaf.keys:
self.assertTrue('\x00'.join(key) in internal_node1.bloom)