Rev 13: Crufty but works benchmark-as-plugin in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Wed Jul 2 09:43:44 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 13
revision-id: robertc at robertcollins.net-20080702084338-45774breb3cit718
parent: robertc at robertcollins.net-20080702061310-nsy8pilywn78oewj
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-02 18:43:38 +1000
message:
Crufty but works benchmark-as-plugin
added:
indexbench.py indexbench.py-20080702083855-5tju02y79rw7kkzh-1
modified:
__init__.py __init__.py-20080624222253-p0x5f92uyh5hw734-5
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file '__init__.py'
--- a/__init__.py 2008-07-01 11:37:43 +0000
+++ b/__init__.py 2008-07-02 08:43:38 +0000
@@ -47,6 +47,10 @@
'RepositoryFormatPackBTreePlain',
)
+import indexbench
+from bzrlib import commands
+commands.register_command(indexbench.cmd_indexbench)
+
def test_suite():
# Thunk across to load_tests for niceness with older bzr versions
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-02 06:13:10 +0000
+++ b/btree_index.py 2008-07-02 08:43:38 +0000
@@ -23,7 +23,7 @@
import tempfile
import zlib
-from bzrlib import debug, index, lru_cache, osutils
+from bzrlib import debug, index, lru_cache, osutils, trace
from bzrlib import errors as bzrerrors
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from bzrlib.plugins.index2 import errors, chunk_writer
@@ -38,6 +38,8 @@
_PAGE_SIZE = 4096
+bloom_hits = 0
+use_blooms = False
class NodeBloom(BloomSHA1):
"""Subclass of BloomSHA1 that can be read from a file."""
@@ -442,7 +444,13 @@
row = 0
# how many nodes have the previous rows eaten up
offset = self._row_lengths[row]
+ string_key = '\x00'.join(key)
while type(node) != _LeafNode:
+ if use_blooms and node.bloom is not None:
+ if string_key not in node.bloom:
+ global bloom_hits
+ bloom_hits += 1
+ break
pos = bisect_right(node.keys, key)
# There are len(node.keys) pointers dividing len(node.keys) + 1
# child nodes. The first pointer is the first key in the second
@@ -454,7 +462,7 @@
row += 1
offset = offset + self._row_lengths[row]
node = self._get_node(node_index)
- if key in node.keys:
+ if type(node) == _LeafNode and key in node.keys:
value, refs = node.keys[key]
if self.node_ref_lists:
yield (self, key, value, refs)
=== added file 'indexbench.py'
--- a/indexbench.py 1970-01-01 00:00:00 +0000
+++ b/indexbench.py 2008-07-02 08:43:38 +0000
@@ -0,0 +1,172 @@
+from bzrlib.bzrdir import BzrDir
+import tempfile
+from bzrlib.transport import get_transport
+import sys
+import time
+import os
+import gc
+from random import shuffle, sample
+from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder
+from bzrlib.plugins.index2 import btree_index
+from bzrlib.index import GraphIndex, GraphIndexBuilder, CombinedGraphIndex
+from bzrlib.knit import _KnitGraphIndex, KnitVersionedFiles
+from bzrlib.graph import Graph
+from bzrlib.commands import Command
+
+
+key_count = 0
+
+def drop_cache():
+ os.system('drop-caches')
+
+def iter_indices(names, t, factory):
+ for name in sorted(names):
+ size = t.stat(name).st_size
+ yield name, factory(t, name, size)
+
+def copy_indices(indices, target, factory):
+ size = 0
+ for name, index in indices:
+ index.key_count()
+ key_length = index._key_length
+ ref_lists = index.node_ref_lists
+ builder = factory(reference_lists=ref_lists, key_elements=key_length)
+ for node in index.iter_all_entries():
+ builder.add_node(*node[1:])
+ size += target.put_file(name, builder.finish())
+ return size
+
+
+def time_index(names, source, factory, builder, target, label, name_keys,
+ text_keys):
+ drop_cache()
+# Baseline time
+ now = time.time()
+ for name, index in iter_indices(names, source, GraphIndex):
+ index.key_count()
+ for node in index.iter_all_entries():
+ pass
+ finish = time.time()
+ print "Baseline overhead: Read %d keys in %0.3f" % (key_count, finish - now)
+# write time
+ read_time = finish - now
+ drop_cache()
+ now = time.time()
+ length = copy_indices(iter_indices(names, source, GraphIndex),
+ target, builder)
+ finish = time.time()
+ print "%s: Wrote %d bytes in %0.3f" % (label, length, finish - now - read_time)
+# iter_all
+ drop_cache()
+ now = time.time()
+ for name, index in iter_indices(names, target, factory):
+ for item in index.iter_all_entries():
+ pass
+ finish = time.time()
+ print "%s: iter_all_entries in %0.3f" % (label, finish - now)
+# random shuffle, all keys (name_keys comes preshuffled)
+ drop_cache()
+ now = time.time()
+ for name, index in iter_indices(names, target, factory):
+ order = name_keys[name]
+ shuffle(order)
+ for key in order:
+ index.iter_entries([key]).next()
+ finish = time.time()
+ print "%s: iter_random_one in %0.3f" % (label, finish - now)
+# text extraction simulation (follow a compression graph) for text_keys
+ text_names = [name for name in names if name.endswith('.tix')]
+ text_indices = [index for name, index in iter_indices(text_names, target, factory)]
+ text_index = CombinedGraphIndex(text_indices)
+ btree_index.bloom_hits = 0
+ drop_cache()
+ now = time.time()
+ # VersionedFiles can do multi-key extraction, so we can use that.
+
+ index = _KnitGraphIndex(text_index, lambda:True, deltas=True, parents=True)
+ vf = KnitVersionedFiles(index, None)
+ vf._get_components_positions(text_keys)
+ finish = time.time()
+ print "%s: get_components_positions(%d) in %0.3f, bloom(%d)" % (label, len(text_keys),
+ finish - now, btree_index.bloom_hits)
+# follow a revision graph
+ rev_names = [name for name in names if name.endswith('.rix')]
+ rev_indices = [index for name, index in iter_indices(rev_names, target, factory)]
+ rev_index = CombinedGraphIndex(rev_indices)
+ # pick the lexicographic middle revision, just because.
+ all_revs = sorted(node[1] for node in rev_index.iter_all_entries())
+ revid = all_revs[len(all_revs)/2]
+ # reopen indices
+ rev_indices = [index for name, index in iter_indices(rev_names, target, factory)]
+ rev_index = CombinedGraphIndex(rev_indices)
+ btree_index.bloom_hits = 0
+ drop_cache()
+ now = time.time()
+ # VersionedFiles can do multi-key extraction, so we can use that.
+ index = _KnitGraphIndex(rev_index, lambda:True, deltas=True, parents=True)
+ vf = KnitVersionedFiles(index, None)
+ graph = Graph(vf)
+ search = graph._make_breadth_first_searcher([revid])
+ while True:
+ try:
+ search.next()
+ except StopIteration:
+ break
+ finish = time.time()
+ print "%s: search_from_revid in %0.3f, bloom(%d)" % (label, finish - now, btree_index.bloom_hits)
+# pathological miss-lookups: ask each index of each type for the keys held by the
+# others of that type
+ btree_index.bloom_hits = 0
+ drop_cache()
+ now = time.time()
+ for name, index in iter_indices(names, target, factory):
+ suffix = name[-4:]
+ other_keys = set()
+ for othername, keys in name_keys.items():
+ if othername != name and othername.endswith(suffix):
+ other_keys.update(keys)
+ for node in index.iter_entries(other_keys):
+ pass
+ finish = time.time()
+ print "%s: miss torture in %0.3f, bloom(%d)" % (label, finish - now, btree_index.bloom_hits)
+ print "%s: -------Done---------" % (label,)
+
+
+class cmd_indexbench(Command):
+ """Benchmark index performance."""
+
+ takes_args = ['sample_repository']
+
+ def run(self, sample_repository):
+ workingdir = tempfile.mkdtemp()
+ t = get_transport(workingdir)
+ try:
+ source = get_transport(sample_repository)
+ source = source.clone('.bzr/repository/indices')
+# step 1:
+# load indices (gets keys for sizing)
+ names = source.list_dir('.')
+ name_keys = {}
+ text_keys = set()
+ for name, index in iter_indices(names, source, GraphIndex):
+ name_keys[name] = []
+ for item in index.iter_all_entries():
+ name_keys[name].append(item[1])
+ # Randomise for later tests
+ shuffle(name_keys[name])
+ if name.endswith('.tix'):
+ text_keys.update(name_keys[name])
+ text_keys = list(text_keys)
+ # pick 2000 text keys to pretend to reconstruct
+ text_keys = sample(text_keys, 2000)
+ global key_count
+ key_count = sum(map(len, name_keys.itervalues()))
+ t.mkdir('1')
+ t.mkdir('2')
+ time_index(names, source, GraphIndex, GraphIndexBuilder, t.clone('1'),
+ 'GraphIndex', name_keys, text_keys)
+ time_index(names, source, BTreeGraphIndex, BTreeBuilder, t.clone('2'),
+ 'BTreeIndex', name_keys, text_keys)
+ sys.exit(0)
+ finally:
+ t.delete_tree('.')
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-02 06:08:10 +0000
+++ b/tests/test_btree_index.py 2008-07-02 08:43:38 +0000
@@ -469,6 +469,26 @@
('readv', 'index', [(4096, 4096), ], False, None)],
transport._activity)
+ def test_iter_entries_skips_bloom_miss(self):
+ # On a miss, if the bloom says no, no IO is performed.
+ builder = btree_index.BTreeBuilder(key_elements=1)
+ # We need a two level index.
+ nodes = self.make_nodes(1200, 1, 0)
+ for node in nodes:
+ builder.add_node(*node)
+ transport = get_transport('trace+' + self.get_url(''))
+ size = transport.put_file('index', builder.finish())
+ del builder
+ index = btree_index.BTreeGraphIndex(transport, 'index', size)
+ del transport._activity[:]
+ self.assertEqual([], transport._activity)
+ # search for one key
+ found_nodes = list(index.iter_entries([("foo",)]))
+ self.assertEqual([], found_nodes)
+ # Should have read the root node only
+ self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
+ transport._activity)
+
class TestBTreeNodes(BTreeTestCase):
More information about the bazaar-commits
mailing list