Rev 13: Crufty but works benchmark-as-plugin in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

Robert Collins robertc at robertcollins.net
Wed Jul 2 09:43:44 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

------------------------------------------------------------
revno: 13
revision-id: robertc at robertcollins.net-20080702084338-45774breb3cit718
parent: robertc at robertcollins.net-20080702061310-nsy8pilywn78oewj
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-02 18:43:38 +1000
message:
  Crufty but works benchmark-as-plugin
added:
  indexbench.py                  indexbench.py-20080702083855-5tju02y79rw7kkzh-1
modified:
  __init__.py                    __init__.py-20080624222253-p0x5f92uyh5hw734-5
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
  tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file '__init__.py'
--- a/__init__.py	2008-07-01 11:37:43 +0000
+++ b/__init__.py	2008-07-02 08:43:38 +0000
@@ -47,6 +47,10 @@
     'RepositoryFormatPackBTreePlain',
     )
 
+import indexbench
+from bzrlib import commands
+commands.register_command(indexbench.cmd_indexbench)
+
 
 def test_suite():
     # Thunk across to load_tests for niceness with older bzr versions

=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-02 06:13:10 +0000
+++ b/btree_index.py	2008-07-02 08:43:38 +0000
@@ -23,7 +23,7 @@
 import tempfile
 import zlib
 
-from bzrlib import debug, index, lru_cache, osutils
+from bzrlib import debug, index, lru_cache, osutils, trace
 from bzrlib import errors as bzrerrors
 from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
 from bzrlib.plugins.index2 import errors, chunk_writer
@@ -38,6 +38,8 @@
 
 _PAGE_SIZE = 4096
 
+bloom_hits = 0
+use_blooms = False
 
 class NodeBloom(BloomSHA1):
     """Subclass of BloomSHA1 that can be read from a file."""
@@ -442,7 +444,13 @@
             row = 0
             # how many nodes have the previous rows eaten up
             offset = self._row_lengths[row]
+            string_key = '\x00'.join(key)
             while type(node) != _LeafNode:
+                if use_blooms and node.bloom is not None:
+                    if string_key not in node.bloom:
+                        global bloom_hits
+                        bloom_hits += 1
+                        break
                 pos = bisect_right(node.keys, key)
                 # There are len(node.keys) pointers dividing len(node.keys) + 1
                 # child nodes. The first pointer is the first key in the second
@@ -454,7 +462,7 @@
                 row += 1
                 offset = offset + self._row_lengths[row]
                 node = self._get_node(node_index)
-            if key in node.keys:
+            if type(node) == _LeafNode and key in node.keys:
                 value, refs = node.keys[key]
                 if self.node_ref_lists:
                     yield (self, key, value, refs)

=== added file 'indexbench.py'
--- a/indexbench.py	1970-01-01 00:00:00 +0000
+++ b/indexbench.py	2008-07-02 08:43:38 +0000
@@ -0,0 +1,172 @@
+from bzrlib.bzrdir import BzrDir
+import tempfile
+from bzrlib.transport import get_transport
+import sys
+import time
+import os
+import gc
+from random import shuffle, sample
+from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder
+from bzrlib.plugins.index2 import btree_index
+from bzrlib.index import GraphIndex, GraphIndexBuilder, CombinedGraphIndex
+from bzrlib.knit import _KnitGraphIndex, KnitVersionedFiles
+from bzrlib.graph import Graph
+from bzrlib.commands import Command
+
+
+key_count = 0
+
+def drop_cache():
+    os.system('drop-caches')
+
+def iter_indices(names, t, factory):
+    for name in sorted(names):
+        size = t.stat(name).st_size
+        yield name, factory(t, name, size)
+
+def copy_indices(indices, target, factory):
+    size = 0
+    for name, index in indices:
+        index.key_count()
+        key_length = index._key_length
+        ref_lists = index.node_ref_lists
+        builder = factory(reference_lists=ref_lists, key_elements=key_length)
+        for node in index.iter_all_entries():
+            builder.add_node(*node[1:])
+        size += target.put_file(name, builder.finish())
+    return size
+
+
+def time_index(names, source, factory, builder, target, label, name_keys,
+    text_keys):
+    drop_cache()
+# Baseline time
+    now = time.time()
+    for name, index in iter_indices(names, source, GraphIndex):
+        index.key_count()
+        for node in index.iter_all_entries():
+            pass
+    finish = time.time()
+    print "Baseline overhead: Read %d keys in %0.3f" % (key_count, finish - now)
+# write time
+    read_time = finish - now
+    drop_cache()
+    now = time.time()
+    length = copy_indices(iter_indices(names, source, GraphIndex),
+        target, builder)
+    finish = time.time()
+    print "%s: Wrote %d bytes in %0.3f" % (label, length, finish - now - read_time)
+# iter_all
+    drop_cache()
+    now = time.time()
+    for name, index in iter_indices(names, target, factory):
+        for item in index.iter_all_entries():
+            pass
+    finish = time.time()
+    print "%s: iter_all_entries in %0.3f" % (label, finish - now)
+# random shuffle, all keys (name_keys comes preshuffled)
+    drop_cache()
+    now = time.time()
+    for name, index in iter_indices(names, target, factory):
+        order = name_keys[name]
+        shuffle(order)
+        for key in order:
+            index.iter_entries([key]).next()
+    finish = time.time()
+    print "%s: iter_random_one in %0.3f" % (label, finish - now)
+# text extraction simulation (follow a compression graph) for text_keys
+    text_names = [name for name in names if name.endswith('.tix')]
+    text_indices = [index for name, index in iter_indices(text_names, target, factory)]
+    text_index = CombinedGraphIndex(text_indices)
+    btree_index.bloom_hits = 0
+    drop_cache()
+    now = time.time()
+    # VersionedFiles can do multi-key extraction, so we can use that.
+    
+    index = _KnitGraphIndex(text_index, lambda:True, deltas=True, parents=True)
+    vf = KnitVersionedFiles(index, None)
+    vf._get_components_positions(text_keys)
+    finish = time.time()
+    print "%s: get_components_positions(%d) in %0.3f, bloom(%d)" % (label, len(text_keys),
+        finish - now, btree_index.bloom_hits)
+# follow a revision graph
+    rev_names = [name for name in names if name.endswith('.rix')]
+    rev_indices = [index for name, index in iter_indices(rev_names, target, factory)]
+    rev_index = CombinedGraphIndex(rev_indices)
+    # pick the lexicographic middle revision, just because.
+    all_revs = sorted(node[1] for node in rev_index.iter_all_entries())
+    revid = all_revs[len(all_revs)/2]
+    # reopen indices
+    rev_indices = [index for name, index in iter_indices(rev_names, target, factory)]
+    rev_index = CombinedGraphIndex(rev_indices)
+    btree_index.bloom_hits = 0
+    drop_cache()
+    now = time.time()
+    # VersionedFiles can do multi-key extraction, so we can use that.
+    index = _KnitGraphIndex(rev_index, lambda:True, deltas=True, parents=True)
+    vf = KnitVersionedFiles(index, None)
+    graph = Graph(vf)
+    search = graph._make_breadth_first_searcher([revid])
+    while True:
+        try:
+            search.next()
+        except StopIteration:
+            break
+    finish = time.time()
+    print "%s: search_from_revid in %0.3f, bloom(%d)" % (label, finish - now, btree_index.bloom_hits)
+# pathological miss-lookups: ask each index of each type for the keys held by the
+# others of that type
+    btree_index.bloom_hits = 0
+    drop_cache()
+    now = time.time()
+    for name, index in iter_indices(names, target, factory):
+        suffix = name[-4:]
+        other_keys = set()
+        for othername, keys in name_keys.items():
+            if othername != name and othername.endswith(suffix):
+                other_keys.update(keys)
+        for node in index.iter_entries(other_keys):
+            pass
+    finish = time.time()
+    print "%s: miss torture in %0.3f, bloom(%d)" % (label, finish - now, btree_index.bloom_hits)
+    print "%s: -------Done---------" % (label,)
+
+
+class cmd_indexbench(Command):
+    """Benchmark index performance."""
+
+    takes_args = ['sample_repository']
+
+    def run(self, sample_repository):
+        workingdir = tempfile.mkdtemp()
+        t = get_transport(workingdir)
+        try:
+            source = get_transport(sample_repository)
+            source = source.clone('.bzr/repository/indices')
+# step 1:
+# load indices (gets keys for sizing)
+            names = source.list_dir('.')
+            name_keys = {}
+            text_keys = set()
+            for name, index in iter_indices(names, source, GraphIndex):
+                name_keys[name] = []
+                for item in index.iter_all_entries():
+                    name_keys[name].append(item[1])
+                # Randomise for later tests
+                shuffle(name_keys[name])
+                if name.endswith('.tix'):
+                    text_keys.update(name_keys[name])
+            text_keys = list(text_keys)
+            # pick 2000 text keys to pretend to reconstruct
+            text_keys = sample(text_keys, 2000)
+            global key_count
+            key_count = sum(map(len, name_keys.itervalues()))
+            t.mkdir('1')
+            t.mkdir('2')
+            time_index(names, source, GraphIndex, GraphIndexBuilder, t.clone('1'),
+                'GraphIndex', name_keys, text_keys)
+            time_index(names, source, BTreeGraphIndex, BTreeBuilder, t.clone('2'),
+                'BTreeIndex', name_keys, text_keys)
+            sys.exit(0)
+        finally:
+            t.delete_tree('.')

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-02 06:08:10 +0000
+++ b/tests/test_btree_index.py	2008-07-02 08:43:38 +0000
@@ -469,6 +469,26 @@
              ('readv',  'index', [(4096, 4096), ], False, None)],
             transport._activity)
 
+    def test_iter_entries_skips_bloom_miss(self):
+        # On a miss, if the bloom says no, no IO is performed.
+        builder = btree_index.BTreeBuilder(key_elements=1)
+        # We need a two level index.
+        nodes = self.make_nodes(1200, 1, 0)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        del builder
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        # search for one key
+        found_nodes = list(index.iter_entries([("foo",)]))
+        self.assertEqual([], found_nodes)
+        # Should have read the root node only
+        self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
+            transport._activity)
+
 
 class TestBTreeNodes(BTreeTestCase):
 




More information about the bazaar-commits mailing list