Rev 35: Allow dynamically growing the blooms when they start to fill up. in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Fri Jul 4 00:30:15 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 35
revision-id: john at arbash-meinel.com-20080703233011-l2sxt8dt8pyvs0bx
parent: john at arbash-meinel.com-20080703211121-hkin5osvlz0soy6w
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Thu 2008-07-03 18:30:11 -0500
message:
  Allow dynamically growing the blooms when they start to fill up.
  
  There is a bit of tuning to do here: how many bits/entry before we grow, how far
  we grow each time, etc. At 16 bits/entry with a 2x jump the blooms were getting
  unexpectedly full; 32 bits/entry with a 4x jump creates bigger blooms, but keeps
  them fairly empty.
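
For context, the heuristic in the patch is simply: whenever a bloom drops below
_BLOOM_RESIZE_THRESHOLD bits per entry, grow it by _BLOOM_GROW_FACTOR. The sketch
below is a minimal standalone illustration of that idea; TinyBloom, its five
SHA-1-derived offsets, and rebuilding from stored keys on growth are assumptions
made for the example, not the plugin's BloomSHA1 implementation.

    from hashlib import sha1

    _GROW_FACTOR = 4        # grow 4x at a time, so sizes stay powers of two
    _RESIZE_THRESHOLD = 32  # grow once we drop below 32 bits per entry

    class TinyBloom(object):
        """A toy bloom filter that grows itself when it starts to fill up."""

        def __init__(self, num_bits):
            self._num_bits = num_bits
            self._bits = bytearray(num_bits // 8)
            self._keys = []  # kept only so this sketch can rebuild on growth

        def _offsets(self, key):
            # Split the 160-bit SHA-1 digest into five 32-bit offsets.
            digest = sha1(key).digest()
            for i in range(0, 20, 4):
                yield int.from_bytes(digest[i:i + 4], 'big') % self._num_bits

        def _set(self, key):
            for offset in self._offsets(key):
                self._bits[offset // 8] |= 1 << (offset % 8)

        def insert(self, key):
            self._keys.append(key)
            self._set(key)
            # The check from the patch: fewer than _RESIZE_THRESHOLD bits per
            # entry means the filter is filling up, so grow it.
            if len(self._keys) * _RESIZE_THRESHOLD > self._num_bits:
                self._grow(self._num_bits * _GROW_FACTOR)

        def _grow(self, new_num_bits):
            # Rebuild from the remembered keys; a real filter would grow its
            # bit array without keeping the keys around.
            self._num_bits = new_num_bits
            self._bits = bytearray(new_num_bits // 8)
            for key in self._keys:
                self._set(key)

        def __contains__(self, key):
            return all(self._bits[o // 8] & (1 << (o % 8))
                       for o in self._offsets(key))

    bloom = TinyBloom(256 * 8)
    for i in range(1000):
        bloom.insert(b'key-%d' % i)
    # The filter grew twice (2048 -> 8192 -> 32768 bits), so it still holds
    # at least _RESIZE_THRESHOLD bits per entry for 1000 keys.
    assert bloom._num_bits >= 1000 * _RESIZE_THRESHOLD
    assert b'key-10' in bloom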
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-03 21:11:21 +0000
+++ b/btree_index.py	2008-07-03 23:30:11 +0000
@@ -40,6 +40,10 @@
 _INTERNAL_OFFSET = "offset="
 
 _PAGE_SIZE = 4096
+_BLOOM_INTERNAL_SIZE = 256*8 # Internal nodes get a 256 byte bloom
+_BLOOM_BUILD_SIZE = 4096*8 # Seems to work for lots of nodes
+_BLOOM_GROW_FACTOR = 4 # power of 2 growth rate
+_BLOOM_RESIZE_THRESHOLD = 32 # Try to keep at least 32 bits / entry
 
 # 4K per page: 4MB - 1000 entries
 _NODE_CACHE_SIZE = 1000
@@ -121,12 +125,14 @@
         self.bloom = deepcopy(current_global_bloom)
 
     def finish_node(self):
+        if 'index' in debug.debug_flags:
+            trace.mutter("Finishing node with bloom filter %r.", self.bloom)
+        self.bloom.resize(_BLOOM_INTERNAL_SIZE)
         bloom_bytes = ":bloom:\n" + self.bloom._array.tostring()
         if self.writer.write_reserved(bloom_bytes):
             raise AssertionError("Not enough space for bloom filter.")
         if 'index' in debug.debug_flags:
-            trace.mutter_callsite(3,
-                "Finished node with bloom filter %r.", self.bloom)
+            trace.mutter("Finished node with bloom filter %r.", self.bloom)
         _BuilderRow.finish_node(self)
 
 
@@ -181,7 +187,7 @@
             # (self.rows[-1]). When we finish a chunk in a row,
             # propagate the key that didn't fit (comes after the chunk) to the
             # row above, transitively.
-            global_bloom = BloomSHA1(256 * 8)
+            global_bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
             self.rows.append(_LeafBuilderRow())
             def add_key(string_key, key_line):
                 """Add a key to the current chunk.
@@ -205,12 +211,12 @@
                                 length -= 100 # padded
                             # reserve 256 for the bloom + 10 for ':bloom:\n'
                             internal_row.writer = chunk_writer.ChunkWriter(
-                                length, 266)
+                                length, _BLOOM_INTERNAL_SIZE // 8 + 10)
                             internal_row.writer.write(_INTERNAL_FLAG)
                             internal_row.writer.write(_INTERNAL_OFFSET +
                                 str(self.rows[pos + 1].nodes) + "\n")
                             # new bloom for the new node
-                            internal_row.bloom = BloomSHA1(256 * 8)
+                            internal_row.bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
                     # add a new leaf
                     length = _PAGE_SIZE
                     if self.rows[-1].nodes == 0:
@@ -243,16 +249,34 @@
                         # This will be padded, hence the -100
                         # reserve 256 for the bloom + 10 for ':bloom:\n'
                         new_row.writer = chunk_writer.ChunkWriter(
-                            _PAGE_SIZE - 100, 266)
+                            _PAGE_SIZE - 100, _BLOOM_INTERNAL_SIZE // 8 + 10)
                         new_row.writer.write(_INTERNAL_FLAG)
                         new_row.writer.write(_INTERNAL_OFFSET +
                             str(self.rows[1].nodes - 1) + "\n")
                         new_row.writer.write(key_line)
                     add_key(string_key, key_line)
                 else:
-                    for row in self.rows[:-1]:
+                    for pos, row in enumerate(self.rows[:-1]):
                         row.bloom.insert(string_key)
+                        # Keep the blooms fairly empty. At 16 bits/entry we
+                        # should be significantly under 1% FPR. This lets us
+                        # resize when we need to, without introducing too much
+                        # loss at the low level
+                        if (row.bloom._num_entries * _BLOOM_RESIZE_THRESHOLD >
+                            row.bloom._num_bits):
+                            row.bloom.resize(row.bloom._num_bits *
+                                             _BLOOM_GROW_FACTOR)
+                            if 'index' in debug.debug_flags:
+                                trace.mutter('resizing bloom at level %d: %s',
+                                             len(self.rows) - pos, row.bloom)
                     global_bloom.insert(string_key)
+                    if (global_bloom._num_entries * _BLOOM_RESIZE_THRESHOLD 
+                        > global_bloom._num_bits):
+                        global_bloom.resize(global_bloom._num_bits *
+                                            _BLOOM_GROW_FACTOR)
+                        if 'index' in debug.debug_flags:
+                            trace.mutter('resizing global bloom: %s',
+                                         global_bloom)
 
             for key, (absent, references, value) in nodes:
                 if absent:


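As a sanity check on the "significantly under 1% FPR" comment in the patch above:
the standard estimate for a bloom filter's false-positive rate is
p ~= (1 - e^(-k / (m/n)))^k, where k is the number of hash functions and m/n the
bits per entry. Assuming k = 5 (one offset per 32-bit word of a SHA-1 digest,
which is an assumption about BloomSHA1, not something stated in this patch):

    import math

    def bloom_fpr(bits_per_entry, num_hashes=5):
        # Probability that a key never inserted still hits all num_hashes bits.
        return (1.0 - math.exp(-float(num_hashes) / bits_per_entry)) ** num_hashes

    for bpe in (8, 16, 32):
        print('%2d bits/entry: ~%.3f%% false positives' % (bpe, bloom_fpr(bpe) * 100))
    # roughly 2.2% at 8, 0.14% at 16, and 0.006% at 32 bits/entry

Under that assumption both the old 16 bits/entry target and the new 32 bits/entry
threshold stay well below a 1% false-positive rate.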
