Rev 24: Updates for when bloom filters are active. in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 2 19:17:51 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 24
revision-id: john at arbash-meinel.com-20080702181746-xlca6kf0iusfmxsy
parent: john at arbash-meinel.com-20080702174605-j9oaswfan8zvg2k7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 13:17:46 -0500
message:
Updates for when bloom filters are active.
Set NodeBloom._num_entries because it is required for Bloom.__repr__
Allow requesting more nodes than can be cached, just mutter that it won't
actually be stored. But since the function returns the objects, they will
be valid for a little while.
When using bloom filters, we need to sort our keys if they weren't already
sorted.
Update the indexbench tests, to use graph.iter_ancestry() and to assert
that the value is actually correct.
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-02 16:50:51 +0000
+++ b/btree_index.py 2008-07-02 18:17:46 +0000
@@ -53,9 +53,8 @@
self._num_bits = self._num_bytes * 8
self._bitmask = self._num_bits - 1
self._array = array.array('B', bytes)
- # we don't know, so make things die if
- # methods needing the entry count are called.
- self._num_entries = None
+ # __repr__ wants the num entries, so force it off
+ self._num_entries = -1 # Unknown
@staticmethod
def get_raw_offsets(keys):
@@ -420,7 +419,8 @@
:return: A dict of {node_pos: node}
"""
if len(nodes) > cache._max_cache:
- raise AssertionError("Asked for more nodes than can cache.")
+ trace.mutter('Requesting %s > %s nodes, not all will be cached',
+ len(nodes), cache._max_cache)
found = {}
for node_pos, node in self._read_nodes(sorted(nodes)):
if node_pos == 0: # Special case
@@ -594,7 +594,10 @@
remaining_sub_keys = [sub_key for sub_key in sub_keys
if node.bloom.contains_raw_offset(bloom_raw_offsets[sub_key])]
bloom_hits += len(sub_keys) - len(remaining_sub_keys)
- sub_keys = remaining_sub_keys
+ if isinstance(sub_keys, frozenset):
+ sub_keys = sorted(remaining_sub_keys)
+ else:
+ sub_keys = remaining_sub_keys
if isinstance(sub_keys, frozenset):
sub_keys = sorted(sub_keys)
positions = self._multi_bisect_right(sub_keys, node.keys)
=== modified file 'indexbench.py'
--- a/indexbench.py 2008-07-02 17:46:05 +0000
+++ b/indexbench.py 2008-07-02 18:17:46 +0000
@@ -53,7 +53,7 @@
def time_index(names, source, factory, builder, target, label, name_keys,
- text_keys, use_drop_cache, fixtures, lsprof, tip_revision_id):
+ text_keys, use_drop_cache, fixtures, lsprof, tip_revision_id, ancestry):
if use_drop_cache:
drop_cache = drop_caches
else:
@@ -128,14 +128,11 @@
index = _KnitGraphIndex(rev_index, lambda:True, deltas=True, parents=True)
vf = KnitVersionedFiles(index, None)
graph = Graph(vf)
- search = graph._make_breadth_first_searcher([(tip_revision_id,)])
- while True:
- try:
- search.next()
- except StopIteration:
- break
+ test_ancestry = dict(graph.iter_ancestry([tip_revision_id]))
finish = time.time()
print "%s: search_from_revid in %0.3f, bloom(%d)" % (label, finish - now, btree_index.bloom_hits)
+ if test_ancestry != ancestry:
+ print "**** Warning ancestry incorrect."
# pathologic miss-lookups: ask each index of each type for the keys held by the
# others of that type
if 'miss' in fixtures:
@@ -182,7 +179,13 @@
source_branch = Branch.open(sample_branch)
source = source_branch.repository._transport
source = source.clone('indices')
- tip_revision_id = source_branch.last_revision()
+ source_branch.lock_read()
+ try:
+ tip_revision_id = source_branch.last_revision()
+ g = source_branch.repository.get_graph()
+ ancestry = dict(g.iter_ancestry([tip_revision_id]))
+ finally:
+ source_branch.unlock()
names = source.list_dir('.')
name_keys = {}
text_keys = set()
@@ -203,18 +206,19 @@
if btree:
self.test_class(names, source, BTreeGraphIndex, BTreeBuilder,
'BTreeIndex', name_keys, text_keys, drop_cache, fixture, lspro,
- tip_revision_id)
+ tip_revision_id, ancestry)
if graph:
self.test_class(names, source, GraphIndex, GraphIndexBuilder,
'GraphIndex', name_keys, text_keys, drop_cache, fixture, lspro,
- tip_revision_id)
+ tip_revision_id, ancestry)
def test_class(self, names, source, factory, builder, label, name_keys,
- text_keys, drop_cache, fixtures, lsprof, tip_revision_id):
+ text_keys, drop_cache, fixtures, lsprof, tip_revision_id, ancestry):
workingdir = tempfile.mkdtemp()
t = get_transport(workingdir)
try:
time_index(names, source, factory, builder, t, label, name_keys,
- text_keys, drop_cache, fixtures, lsprof, tip_revision_id)
+ text_keys, drop_cache, fixtures, lsprof, tip_revision_id,
+ ancestry)
finally:
t.delete_tree('.')
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-02 16:50:51 +0000
+++ b/tests/test_btree_index.py 2008-07-02 18:17:46 +0000
@@ -493,6 +493,29 @@
self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
transport._activity)
+ def test_iter_entries_w_blooms_finds_correct_nodes(self):
+ builder = btree_index.BTreeBuilder(key_elements=1)
+ nodes = self.make_nodes(1200, 1, 0)
+ for node in nodes:
+ builder.add_node(*node)
+ transport = get_transport('trace+' + self.get_url(''))
+ size = transport.put_file('index', builder.finish())
+ del builder
+ index = btree_index.BTreeGraphIndex(transport, 'index', size)
+ del transport._activity[:]
+ index._use_blooms = True
+ # Search for every 100th key, to make sure the blooms are correct
+ search_nodes = nodes[::100]
+ search_keys = [n[0] for n in search_nodes]
+ # We expect (key, value) tuples from the search result
+ expected_result = sorted([n[:2] for n in search_nodes])
+ # Ignore the index object
+ found_nodes = [n[1:] for n in index.iter_entries(search_keys)]
+ sorted_result = sorted(found_nodes)
+ if expected_result != sorted_result:
+ self.assertEqualDiff(pprint.pformat(expected_result),
+ pprint.pformat(sorted_result))
+
class TestBTreeNodes(BTreeTestCase):
More information about the bazaar-commits
mailing list