Rev 17: Add the ability to deal in 'raw' offsets, which are independent of bloom length in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 2 17:08:27 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 17
revision-id: john at arbash-meinel.com-20080702160822-4q8qdi10120l1aen
parent: john at arbash-meinel.com-20080702155655-z77lyi16cvfkpwn6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 11:08:22 -0500
message:
Add the ability to deal in 'raw' offsets, which are independent of bloom length
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-02 15:56:55 +0000
+++ b/btree_index.py 2008-07-02 16:08:22 +0000
@@ -20,6 +20,8 @@
import array
from bisect import bisect_right
from copy import deepcopy
+import sha
+import struct
import tempfile
import zlib
@@ -55,6 +57,25 @@
# methods needing the entry count are called.
self._num_entries = None
+ def get_raw_offsets(self, keys):
+ """Compute the raw internal offsets for these keys"""
+ offsets = {}
+ for key in keys:
+ # This generates a 128-bit hash
+ hash_val = sha.new('\x00'.join(key)).digest()
+ # break it up into 5 32-bit offsets
+ # and strip off the unused bits
+ offsets[key] = struct.unpack('>5i', hash_val)
+ return offsets
+
+ def contains_raw_offset(self, raw_offsets):
+ """Given the offsets, is this key present?"""
+ for raw_offset in raw_offsets:
+ offset = raw_offset & self._bitmask
+ if not self._array[offset >> 3] & self.bitmask[offset & 0x7]:
+ return False
+ return True
+
class _BuilderRow(object):
"""The stored state accumulated while writing out a row in the index.
@@ -538,6 +559,12 @@
return
nodes_and_keys = [(0, keys)]
row_offset = 0
+
+ if self._use_blooms:
+ node = self._get_node(0)
+ bloom_raw_offsets = {}
+ for key in keys:
+ bloom_raw_offsets[key] = node.bloom.get_raw_offsets_for_key(key)
for row_length in self._row_lengths[:-1]:
node_indexes = [idx for idx, s_keys in nodes_and_keys]
nodes = self._get_nodes(node_indexes)
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-02 14:54:12 +0000
+++ b/tests/test_btree_index.py 2008-07-02 16:08:22 +0000
@@ -639,3 +639,41 @@
(4, ['g', 'h'])],
['a', 'b', 'd', 'e', 'g', 'h'],
['c', 'd', 'f', 'g'])
+
+
+
+class TestNodeBloom(tests.TestCase):
+
+ def test_init(self):
+ basic_bloom = BloomSHA1(256 * 8)
+ basic_bloom.insert('foo')
+ basic_bloom.insert('bar')
+
+ bytes = basic_bloom._array.tostring()
+ node_bloom = btree_index.NodeBloom(bytes)
+ self.assertEqual(basic_bloom._array, node_bloom._array)
+ self.assertTrue('foo' in node_bloom)
+ self.assertTrue('bar' in node_bloom)
+ # With this many bits per entry, there is no way 'baz' would collide
+ self.assertFalse('baz' in node_bloom)
+
+ def test_raw_offsets(self):
+ # Raw offsets are fixed for all keys
+ bytes = '\x00'*256
+ node_bloom = btree_index.NodeBloom(bytes)
+ self.assertEqual({('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
+ node_bloom.get_raw_offsets([('foo',)]))
+ self.assertEqual({('foo', 'bar'): (-490536797, -1827560737, -889226859, 675372215, 1276719487),
+ ('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
+ node_bloom.get_raw_offsets([('foo', 'bar'), ('foo',)]))
+
+ def test_contains_raw_offsets(self):
+ basic_bloom = BloomSHA1(256 * 8)
+ basic_bloom.insert('foo')
+ basic_bloom.insert('bar')
+
+ node_bloom = btree_index.NodeBloom(basic_bloom._array.tostring())
+ raw_offsets = node_bloom.get_raw_offsets([('foo',), ('bar',), ('baz',)])
+ self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('foo',)]))
+ self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('bar',)]))
+ self.assertFalse(node_bloom.contains_raw_offset(raw_offsets[('baz',)]))
More information about the bazaar-commits
mailing list