Rev 17: Add the ability to deal in 'raw' offsets, which are independent of bloom length in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 2 17:08:27 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 17
revision-id: john at arbash-meinel.com-20080702160822-4q8qdi10120l1aen
parent: john at arbash-meinel.com-20080702155655-z77lyi16cvfkpwn6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 11:08:22 -0500
message:
  Add the ability to deal in 'raw' offsets, which are independent of bloom length
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-02 15:56:55 +0000
+++ b/btree_index.py	2008-07-02 16:08:22 +0000
@@ -20,6 +20,8 @@
 import array
 from bisect import bisect_right
 from copy import deepcopy
+import sha
+import struct
 import tempfile
 import zlib
 
@@ -55,6 +57,25 @@
         # methods needing the entry count are called.
         self._num_entries = None
 
+    def get_raw_offsets(self, keys):
+        """Compute the raw internal offsets for these keys"""
+        offsets = {}
+        for key in keys:
+            # This generates a 128-bit hash
+            hash_val = sha.new('\x00'.join(key)).digest()
+            # break it up into 5 32-bit offsets
+            # and strip off the unused bits
+            offsets[key] = struct.unpack('>5i', hash_val)
+        return offsets
+
+    def contains_raw_offset(self, raw_offsets):
+        """Given the offsets, is this key present?"""
+        for raw_offset in raw_offsets:
+            offset = raw_offset & self._bitmask
+            if not self._array[offset >> 3] & self.bitmask[offset & 0x7]:
+                return False
+        return True
+
 
 class _BuilderRow(object):
     """The stored state accumulated while writing out a row in the index.
@@ -538,6 +559,12 @@
             return
         nodes_and_keys = [(0, keys)]
         row_offset = 0
+
+        if self._use_blooms:
+            node = self._get_node(0)
+            bloom_raw_offsets = {}
+            for key in keys:
+                bloom_raw_offsets[key] = node.bloom.get_raw_offsets_for_key(key)
         for row_length in self._row_lengths[:-1]:
             node_indexes = [idx for idx, s_keys in nodes_and_keys]
             nodes = self._get_nodes(node_indexes)

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-02 14:54:12 +0000
+++ b/tests/test_btree_index.py	2008-07-02 16:08:22 +0000
@@ -639,3 +639,41 @@
                                      (4, ['g', 'h'])],
                                     ['a', 'b', 'd', 'e', 'g', 'h'],
                                     ['c', 'd', 'f', 'g'])
+
+
+
+class TestNodeBloom(tests.TestCase):
+
+    def test_init(self):
+        basic_bloom = BloomSHA1(256 * 8)
+        basic_bloom.insert('foo')
+        basic_bloom.insert('bar')
+
+        bytes = basic_bloom._array.tostring()
+        node_bloom = btree_index.NodeBloom(bytes)
+        self.assertEqual(basic_bloom._array, node_bloom._array)
+        self.assertTrue('foo' in node_bloom)
+        self.assertTrue('bar' in node_bloom)
+        # With this many bits per entry, there is no way 'baz' would collide
+        self.assertFalse('baz' in node_bloom)
+
+    def test_raw_offsets(self):
+        # Raw offsets are fixed for all keys
+        bytes = '\x00'*256
+        node_bloom = btree_index.NodeBloom(bytes)
+        self.assertEqual({('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
+                         node_bloom.get_raw_offsets([('foo',)]))
+        self.assertEqual({('foo', 'bar'): (-490536797, -1827560737, -889226859, 675372215, 1276719487),
+                          ('foo',): (200198069, -364965925, -916648492, 2134662082, 1977256499)},
+                         node_bloom.get_raw_offsets([('foo', 'bar'), ('foo',)]))
+
+    def test_contains_raw_offsets(self):
+        basic_bloom = BloomSHA1(256 * 8)
+        basic_bloom.insert('foo')
+        basic_bloom.insert('bar')
+
+        node_bloom = btree_index.NodeBloom(basic_bloom._array.tostring())
+        raw_offsets = node_bloom.get_raw_offsets([('foo',), ('bar',), ('baz',)])
+        self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('foo',)]))
+        self.assertTrue(node_bloom.contains_raw_offset(raw_offsets[('bar',)]))
+        self.assertFalse(node_bloom.contains_raw_offset(raw_offsets[('baz',)]))



More information about the bazaar-commits mailing list