Rev 35: Allow dynamically growing the blooms when they start to fill up. in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Fri Jul 4 00:30:15 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 35
revision-id: john at arbash-meinel.com-20080703233011-l2sxt8dt8pyvs0bx
parent: john at arbash-meinel.com-20080703211121-hkin5osvlz0soy6w
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Thu 2008-07-03 18:30:11 -0500
message:
Allow dynamically growing the blooms when they start to fill up.
There is a bit of tuning to do here. How many b/e before we jump, how far do we jump, etc.
at 16 b/e and jump 2x the blooms were getting unexpectedly full.
32 b/e and 4x jump creates bigger blooms, but keeps them fairly empty.
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-03 21:11:21 +0000
+++ b/btree_index.py 2008-07-03 23:30:11 +0000
@@ -40,6 +40,10 @@
_INTERNAL_OFFSET = "offset="
_PAGE_SIZE = 4096
+_BLOOM_INTERNAL_SIZE = 256*8 # Internal nodes get a 256 byte bloom
+_BLOOM_BUILD_SIZE = 4096*8 # Seems to work for lots of nodes
+_BLOOM_GROW_FACTOR = 4 # power of 2 growth rate
+_BLOOM_RESIZE_THRESHOLD = 32 # Try to keep at least 16 bits / entry
# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
@@ -121,12 +125,14 @@
self.bloom = deepcopy(current_global_bloom)
def finish_node(self):
+ if 'index' in debug.debug_flags:
+ trace.mutter("Finishing node with bloom filter %r.", self.bloom)
+ self.bloom.resize(_BLOOM_INTERNAL_SIZE)
bloom_bytes = ":bloom:\n" + self.bloom._array.tostring()
if self.writer.write_reserved(bloom_bytes):
raise AssertionError("Not enough space for bloom filter.")
if 'index' in debug.debug_flags:
- trace.mutter_callsite(3,
- "Finished node with bloom filter %r.", self.bloom)
+ trace.mutter("Finished node with bloom filter %r.", self.bloom)
_BuilderRow.finish_node(self)
@@ -181,7 +187,7 @@
# (self.rows[-1]). When we finish a chunk in a row,
# propogate the key that didn't fit (comes after the chunk) to the
# row above, transitively.
- global_bloom = BloomSHA1(256 * 8)
+ global_bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
self.rows.append(_LeafBuilderRow())
def add_key(string_key, key_line):
"""Add a key to the current chunk.
@@ -205,12 +211,12 @@
length -= 100 # padded
# reserve 256 for the bloom + 10 for ':bloom:\n'
internal_row.writer = chunk_writer.ChunkWriter(
- length, 266)
+ length, _BLOOM_INTERNAL_SIZE + 10)
internal_row.writer.write(_INTERNAL_FLAG)
internal_row.writer.write(_INTERNAL_OFFSET +
str(self.rows[pos + 1].nodes) + "\n")
# new bloom for the new node
- internal_row.bloom = BloomSHA1(256 * 8)
+ internal_row.bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
# add a new leaf
length = _PAGE_SIZE
if self.rows[-1].nodes == 0:
@@ -243,16 +249,34 @@
# This will be padded, hence the -100
# reserve 256 for the bloom + 10 for ':bloom:\n'
new_row.writer = chunk_writer.ChunkWriter(
- _PAGE_SIZE - 100, 266)
+ _PAGE_SIZE - 100, _BLOOM_INTERNAL_SIZE + 10)
new_row.writer.write(_INTERNAL_FLAG)
new_row.writer.write(_INTERNAL_OFFSET +
str(self.rows[1].nodes - 1) + "\n")
new_row.writer.write(key_line)
add_key(string_key, key_line)
else:
- for row in self.rows[:-1]:
+ for pos, row in enumerate(self.rows[:-1]):
row.bloom.insert(string_key)
+ # Keep the blooms fairly empty. At 16 bits/entry we
+ # should be significantly under 1% FPR. This lets us
+ # resize when we need to, without introducing too much
+ # loss at the low level
+ if (row.bloom._num_entries * _BLOOM_RESIZE_THRESHOLD >
+ row.bloom._num_bits):
+ row.bloom.resize(row.bloom._num_bits *
+ _BLOOM_GROW_FACTOR)
+ if 'index' in debug.debug_flags:
+ trace.mutter('resizing bloom at level %d: %s',
+ len(self.rows) - pos, row.bloom)
global_bloom.insert(string_key)
+ if (global_bloom._num_entries * _BLOOM_RESIZE_THRESHOLD
+ > global_bloom._num_bits):
+ global_bloom.resize(global_bloom._num_bits *
+ _BLOOM_GROW_FACTOR)
+ if 'index' in debug.debug_flags:
+ trace.mutter('resizing global bloom: %s',
+ row.bloom)
for key, (absent, references, value) in nodes:
if absent:
More information about the bazaar-commits
mailing list