Rev 9: Add bloom filter parsing to internal nodes. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

Robert Collins robertc at robertcollins.net
Wed Jul 2 05:27:30 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

------------------------------------------------------------
revno: 9
revision-id: robertc at robertcollins.net-20080702042729-36uy2gxmztjhrzbw
parent: robertc at robertcollins.net-20080701224410-2lbqoqc2dc5v3iey
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-02 14:27:29 +1000
message:
  Add bloom filter parsing to internal nodes.
modified:
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
  tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-01 22:44:10 +0000
+++ b/btree_index.py	2008-07-02 04:27:29 +0000
@@ -17,6 +17,7 @@
 
 """B+Tree indices"""
 
+import array
 from bisect import bisect_right
 import tempfile
 import zlib
@@ -25,6 +26,7 @@
 from bzrlib import errors as bzrerrors
 from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
 from bzrlib.plugins.index2 import errors, chunk_writer
+from bzrlib.plugins.pybloom.pybloom import BloomSHA1
 
 
 _BTSIGNATURE = "B+Tree Graph Index 1\n"
@@ -36,6 +38,21 @@
 _PAGE_SIZE = 4096
 
 
+class NodeBloom(BloomSHA1):
+    """Subclass of BloomSHA1 that can be read from a file."""
+
+    def __init__(self, bytes):
+        # See BloomSHA1._init_array - which we don't use as we have an array
+        # ready
+        self._num_bytes = len(bytes)
+        self._num_bits = self._num_bytes * 8
+        self._bitmask = self._num_bits - 1
+        self._array = array.array('B', bytes)
+        # we don't know, so make things die if 
+        # methods needing the entry count are called.
+        self._num_entries = None
+
+
 class _BuilderRow(object):
     """The stored state accumulated while writing out a row in the index.
     
@@ -250,16 +267,30 @@
     def __init__(self, bytes):
         """Parse bytes to create an internal node object."""
         # splitlines mangles the \r delimiters.. don't use it.
-        self.keys = self._parse_lines(bytes.split('\n'))
+        self.keys, self.bloom = self._parse_lines(bytes.split('\n'))
 
     def _parse_lines(self, lines):
         nodes = []
         self.offset = int(lines[1][7:])
+        bloom_lines = []
+        in_bloom = False
         for line in lines[2:]:
-            if line == '':
-                return nodes
-            nodes.append(tuple(line.split('\0')))
-        return nodes
+            if not in_bloom:
+                if line == '':
+                    break
+                if line == ':bloom:':
+                    in_bloom = True
+                    continue
+                nodes.append(tuple(line.split('\0')))
+            else:
+                bloom_lines.append(line)
+        if bloom_lines:
+            # put the bloom back together again
+            bloom_bytes = '\n'.join(bloom_lines)
+            bloom = NodeBloom(bloom_bytes)
+        else:
+            bloom = None
+        return nodes, bloom
 
 
 class BTreeGraphIndex(object):

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-01 22:44:10 +0000
+++ b/tests/test_btree_index.py	2008-07-02 04:27:29 +0000
@@ -21,6 +21,7 @@
 
 from bzrlib.plugins import index2
 from bzrlib.plugins.index2 import errors, btree_index
+from bzrlib.plugins.pybloom.pybloom import BloomSHA1
 from bzrlib.tests import TestCaseWithTransport
 from bzrlib.transport import get_transport
 
@@ -485,18 +486,49 @@
             "1111111111111111111111111111111111111111\n"
             "2222222222222222222222222222222222222222\n"
             "3333333333333333333333333333333333333333\n"
-            "4444444444444444444444444444444444444444\n")
-        node = btree_index._InternalNode(node_bytes)
-        # We want to bisect to find the right children from this node, so a
-        # vector is most useful.
-        self.assertEqual([
-            ("0000000000000000000000000000000000000000",),
-            ("1111111111111111111111111111111111111111",),
-            ("2222222222222222222222222222222222222222",),
-            ("3333333333333333333333333333333333333333",),
-            ("4444444444444444444444444444444444444444",),
-            ], node.keys)
-        self.assertEqual(1, node.offset)
+            "4444444444444444444444444444444444444444\n"
+            )
+        node = btree_index._InternalNode(node_bytes)
+        # We want to bisect to find the right children from this node, so a
+        # vector is most useful.
+        self.assertEqual([
+            ("0000000000000000000000000000000000000000",),
+            ("1111111111111111111111111111111111111111",),
+            ("2222222222222222222222222222222222222222",),
+            ("3333333333333333333333333333333333333333",),
+            ("4444444444444444444444444444444444444444",),
+            ], node.keys)
+        self.assertEqual(1, node.offset)
+
+    def test_InternalNode_bloom(self):
+        # Create a little bloom so we can check it is parsed right
+        bloom = BloomSHA1(256 * 8)
+        # set a a bit to test
+        bloom.insert("foo-bar")
+        # get bytes
+        bloom_bytes = bloom._array.tostring()
+        node_bytes = ("type=internal\n"
+            "offset=1\n"
+            "0000000000000000000000000000000000000000\n"
+            "1111111111111111111111111111111111111111\n"
+            "2222222222222222222222222222222222222222\n"
+            "3333333333333333333333333333333333333333\n"
+            "4444444444444444444444444444444444444444\n"
+            ":bloom:\n"
+            + bloom_bytes)
+        node = btree_index._InternalNode(node_bytes)
+        # We want to bisect to find the right children from this node, so a
+        # vector is most useful.
+        self.assertEqual([
+            ("0000000000000000000000000000000000000000",),
+            ("1111111111111111111111111111111111111111",),
+            ("2222222222222222222222222222222222222222",),
+            ("3333333333333333333333333333333333333333",),
+            ("4444444444444444444444444444444444444444",),
+            ], node.keys)
+        self.assertEqual(1, node.offset)
+        self.assertTrue("foo-bar" in node.bloom)
+        self.assertFalse("other" in node.bloom)
 
     def test_LeafNode_2_2(self):
         node_bytes = ("type=leaf\n"




More information about the bazaar-commits mailing list