Rev 9: Add bloom filter parsing to internal nodes. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Wed Jul 2 05:27:30 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 9
revision-id: robertc at robertcollins.net-20080702042729-36uy2gxmztjhrzbw
parent: robertc at robertcollins.net-20080701224410-2lbqoqc2dc5v3iey
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-02 14:27:29 +1000
message:
Add bloom filter parsing to internal nodes.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-01 22:44:10 +0000
+++ b/btree_index.py 2008-07-02 04:27:29 +0000
@@ -17,6 +17,7 @@
"""B+Tree indices"""
+import array
from bisect import bisect_right
import tempfile
import zlib
@@ -25,6 +26,7 @@
from bzrlib import errors as bzrerrors
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from bzrlib.plugins.index2 import errors, chunk_writer
+from bzrlib.plugins.pybloom.pybloom import BloomSHA1
_BTSIGNATURE = "B+Tree Graph Index 1\n"
@@ -36,6 +38,21 @@
_PAGE_SIZE = 4096
+class NodeBloom(BloomSHA1):
+ """Subclass of BloomSHA1 that can be read from a file."""
+
+ def __init__(self, bytes):
+ # See BloomSHA1._init_array - which we don't use as we have an array
+ # ready
+ self._num_bytes = len(bytes)
+ self._num_bits = self._num_bytes * 8
+ self._bitmask = self._num_bits - 1
+ self._array = array.array('B', bytes)
+ # we don't know, so make things die if
+ # methods needing the entry count are called.
+ self._num_entries = None
+
+
class _BuilderRow(object):
"""The stored state accumulated while writing out a row in the index.
@@ -250,16 +267,30 @@
def __init__(self, bytes):
"""Parse bytes to create an internal node object."""
# splitlines mangles the \r delimiters.. don't use it.
- self.keys = self._parse_lines(bytes.split('\n'))
+ self.keys, self.bloom = self._parse_lines(bytes.split('\n'))
def _parse_lines(self, lines):
nodes = []
self.offset = int(lines[1][7:])
+ bloom_lines = []
+ in_bloom = False
for line in lines[2:]:
- if line == '':
- return nodes
- nodes.append(tuple(line.split('\0')))
- return nodes
+ if not in_bloom:
+ if line == '':
+ break
+ if line == ':bloom:':
+ in_bloom = True
+ continue
+ nodes.append(tuple(line.split('\0')))
+ else:
+ bloom_lines.append(line)
+ if bloom_lines:
+ # put the bloom back together again
+ bloom_bytes = '\n'.join(bloom_lines)
+ bloom = NodeBloom(bloom_bytes)
+ else:
+ bloom = None
+ return nodes, bloom
class BTreeGraphIndex(object):
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-01 22:44:10 +0000
+++ b/tests/test_btree_index.py 2008-07-02 04:27:29 +0000
@@ -21,6 +21,7 @@
from bzrlib.plugins import index2
from bzrlib.plugins.index2 import errors, btree_index
+from bzrlib.plugins.pybloom.pybloom import BloomSHA1
from bzrlib.tests import TestCaseWithTransport
from bzrlib.transport import get_transport
@@ -485,18 +486,49 @@
"1111111111111111111111111111111111111111\n"
"2222222222222222222222222222222222222222\n"
"3333333333333333333333333333333333333333\n"
- "4444444444444444444444444444444444444444\n")
- node = btree_index._InternalNode(node_bytes)
- # We want to bisect to find the right children from this node, so a
- # vector is most useful.
- self.assertEqual([
- ("0000000000000000000000000000000000000000",),
- ("1111111111111111111111111111111111111111",),
- ("2222222222222222222222222222222222222222",),
- ("3333333333333333333333333333333333333333",),
- ("4444444444444444444444444444444444444444",),
- ], node.keys)
- self.assertEqual(1, node.offset)
+ "4444444444444444444444444444444444444444\n"
+ )
+ node = btree_index._InternalNode(node_bytes)
+ # We want to bisect to find the right children from this node, so a
+ # vector is most useful.
+ self.assertEqual([
+ ("0000000000000000000000000000000000000000",),
+ ("1111111111111111111111111111111111111111",),
+ ("2222222222222222222222222222222222222222",),
+ ("3333333333333333333333333333333333333333",),
+ ("4444444444444444444444444444444444444444",),
+ ], node.keys)
+ self.assertEqual(1, node.offset)
+
+ def test_InternalNode_bloom(self):
+ # Create a little bloom so we can check it is parsed right
+ bloom = BloomSHA1(256 * 8)
+ # set a a bit to test
+ bloom.insert("foo-bar")
+ # get bytes
+ bloom_bytes = bloom._array.tostring()
+ node_bytes = ("type=internal\n"
+ "offset=1\n"
+ "0000000000000000000000000000000000000000\n"
+ "1111111111111111111111111111111111111111\n"
+ "2222222222222222222222222222222222222222\n"
+ "3333333333333333333333333333333333333333\n"
+ "4444444444444444444444444444444444444444\n"
+ ":bloom:\n"
+ + bloom_bytes)
+ node = btree_index._InternalNode(node_bytes)
+ # We want to bisect to find the right children from this node, so a
+ # vector is most useful.
+ self.assertEqual([
+ ("0000000000000000000000000000000000000000",),
+ ("1111111111111111111111111111111111111111",),
+ ("2222222222222222222222222222222222222222",),
+ ("3333333333333333333333333333333333333333",),
+ ("4444444444444444444444444444444444444444",),
+ ], node.keys)
+ self.assertEqual(1, node.offset)
+ self.assertTrue("foo-bar" in node.bloom)
+ self.assertFalse("other" in node.bloom)
def test_LeafNode_2_2(self):
node_bytes = ("type=leaf\n"
More information about the bazaar-commits
mailing list