Rev 2478: Add benchmarks for dirstate.bisect_dirblocks, and implement bisect_dirblocks in pyrex. in http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex
John Arbash Meinel
john at arbash-meinel.com
Fri May 4 00:37:10 BST 2007
At http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex
------------------------------------------------------------
revno: 2478
revision-id: john at arbash-meinel.com-20070503233549-445n015iomhc8ppm
parent: john at arbash-meinel.com-20070503233314-btj1vbd2qtod34kq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate_pyrex
timestamp: Thu 2007-05-03 18:35:49 -0500
message:
Add benchmarks for dirstate.bisect_dirblocks, and implement bisect_dirblocks in pyrex.
Shows about a 2x performance improvement being in compiled C.
Also, at least on my Mac, it is faster without extra caching.
modified:
bzrlib/benchmarks/bench_dirstate.py bench_dirstate.py-20070503203500-gs0pz6zkvjpq9l2x-1
bzrlib/compiled/dirstate_helpers.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
-------------- next part --------------
=== modified file 'bzrlib/benchmarks/bench_dirstate.py'
--- a/bzrlib/benchmarks/bench_dirstate.py 2007-05-03 21:17:41 +0000
+++ b/bzrlib/benchmarks/bench_dirstate.py 2007-05-03 23:35:49 +0000
@@ -23,11 +23,69 @@
dirstate,
generate_ids,
osutils,
+ tests,
)
+class _CompiledDirstateHelpersFeature(tests.Feature):
+ def _probe(self):
+ try:
+ import bzrlib.compiled.dirstate_helpers
+ except ImportError:
+ return False
+ return True
+
+ def feature_name(self):
+ return 'compiled dirstate helpers'
+
+CompiledDirstateHelpersFeature =_CompiledDirstateHelpersFeature()
+
+
class BenchmarkDirState(benchmarks.Benchmark):
+ def build_helper(self, layout):
+ """This is a helper with the common build_??_dirstate funcs.
+
+ :param layout: [(num_dirs, files_per_dir)]
+ The number of directories per level, and the number of files to put
+ in it.
+ :return: A DirState object with the given layout.
+ """
+ self.build_tree(['dir/'])
+ contents = 'x'*10000
+ self.build_tree_contents([('file', contents)])
+ file_stat = os.lstat('file')
+ dir_stat = os.lstat('dir')
+ file_sha1 = osutils.sha_string(contents)
+
+ state = dirstate.DirState.initialize('state')
+ try:
+ def create_entries(base, layout):
+ if not layout:
+ return
+ num_dirs, num_files = layout[0]
+ for dnum in xrange(num_dirs):
+ if base:
+ path = '%s/%02d_directory' % (base, dnum)
+ else:
+ path = '%02d_directory' % (dnum,)
+ dir_id = generate_ids.gen_file_id(path)
+ state.add(path, dir_id, 'directory', dir_stat, '')
+ for fnum in xrange(num_files):
+ fname = '%s/%02d_filename' % (path, fnum)
+ file_id = generate_ids.gen_file_id(fname)
+ state.add(fname, file_id, 'file', file_stat, file_sha1)
+ create_entries(path, layout[1:])
+ create_entries(None, layout)
+ state.save()
+ finally:
+ state.unlock()
+ return state
+
+ def build_1k_dirstate_dirs(self):
+ """Build a DirState file with 1k directories"""
+ return self.build_helper([(10, 0), (10, 0), (10, 1)])
+
def build_20k_dirstate(self):
"""Build a DirState file with 20k records.
@@ -38,42 +96,7 @@
We try to have reasonable filename lengths, as well as a reasonable
stat value, etc.
"""
- self.build_tree(['dir/'])
- self.build_tree_contents([('file', 'x'*10000)])
- file_stat = os.lstat('file')
- dir_stat = os.lstat('dir')
- file_sha1 = osutils.sha_string('testing')
-
- # the average filename length is 11 characters
- # find . | sed -e 's/.*\///' | wc -l
- # 22545 22545 237869
- # 237869 / 22545 = 10.6
- # average depth is 30 characters
- # find . | wc -l
- # 22545 22545 679884
- # 679884 / 22545 = 30.1
- state = dirstate.DirState.initialize('state')
- try:
- for lvl1 in xrange(10):
- dir_1 = '%2d_directory' % (lvl1,)
- dir_1_id = generate_ids.gen_file_id(dir_1)
- state.add(dir_1, dir_1_id, 'directory', dir_stat, '')
- for lvl2 in xrange(10):
- dir_2 = '%s/%2d_directory' % (dir_1, lvl2)
- dir_2_id = generate_ids.gen_file_id(dir_2)
- state.add(dir_2, dir_2_id, 'directory', dir_stat, '')
- for lvl3 in xrange(10):
- dir_3 = '%s/%2d_directory' % (dir_2, lvl3)
- dir_3_id = generate_ids.gen_file_id(dir_3)
- state.add(dir_3, dir_3_id, 'directory', dir_stat, '')
- for filenum in xrange(20):
- fname = '%s/%2d_filename' % (dir_3, filenum)
- file_id = generate_ids.gen_file_id(fname)
- state.add(fname, file_id, 'directory', dir_stat, '')
- state.save()
- finally:
- state.unlock()
- return state
+ return self.build_helper([(10, 0), (10, 0), (10, 20)])
def test_build_20k_dirblocks(self):
state = self.time(self.build_20k_dirstate)
@@ -93,3 +116,91 @@
self.time(state._read_dirblocks_if_needed)
finally:
state.unlock()
+
+ def do_bisect_list(self, bisect_func):
+ """Call bisect_dirblock for each path."""
+ # We use self._paths and self._blocks because we expect it to be a very
+ # long list. And the interface for 'self.time()' causes the parameters
+ # to be printed when run with --lsprof-timed. Which is *really* ugly
+ # when the list is thousands of entries.
+ blocks = self._blocks
+ return [bisect_func(blocks, path) for path in self._paths]
+
+ def do_bisect_list_cached(self, bisect_func):
+ """Same as do_bisect_list, but cache the split paths"""
+ cache = {}
+ blocks = self._blocks
+ return [bisect_func(blocks, path, cache=cache) for path in self._paths]
+
+ def setup_paths_and_offsets(self, state):
+ """Get a list of paths and expected offsets.
+
+ This will be used to check do_bisect_list*
+ """
+ state._read_dirblocks_if_needed()
+ paths = ['']
+ expected_offsets = [0]
+ for offset, info in enumerate(state._dirblocks):
+ dirname = info[0]
+ # We already handled the empty path
+ if dirname == '':
+ continue
+ # all paths are of the form ##_directory
+ # so search for ##_director, ##_directory
+ paths.extend([dirname[:-1], dirname])
+ expected_offsets.extend([offset, offset])
+ self._paths = paths
+ self._expected_offsets = expected_offsets
+ self._blocks = state._dirblocks
+
+ def checkOffsets(self, offsets):
+ """Make sure offsets matches self._expected_offsets"""
+ # These are really long lists, so it is easier to compare them with
+ # assertEqualDiff. So turn them into strings.
+ expected_str = '\n'.join(str(x) for x in self._expected_offsets)
+ offset_str = '\n'.join(str(x) for x in offsets)
+ self.assertEqualDiff(expected_str, offset_str)
+
+ def test_bisect_dirblock(self):
+ state = self.build_1k_dirstate_dirs()
+ state.lock_read()
+ try:
+ self.setup_paths_and_offsets(state)
+ offsets = self.time(self.do_bisect_list, dirstate.bisect_dirblock)
+ self.checkOffsets(offsets)
+ finally:
+ state.unlock()
+
+ def test_bisect_dirblock_cached(self):
+ state = self.build_1k_dirstate_dirs()
+ state.lock_read()
+ try:
+ self.setup_paths_and_offsets(state)
+ offsets = self.time(self.do_bisect_list_cached,
+ dirstate.bisect_dirblock)
+ self.checkOffsets(offsets)
+ finally:
+ state.unlock()
+
+ def test_bisect_dirblock_compiled(self):
+ self.requireFeature(CompiledDirstateHelpersFeature)
+ state = self.build_1k_dirstate_dirs()
+ state.lock_read()
+ try:
+ self.setup_paths_and_offsets(state)
+ offsets = self.time(self.do_bisect_list, dirstate.bisect_dirblock)
+ self.checkOffsets(offsets)
+ finally:
+ state.unlock()
+
+ def test_bisect_dirblock_compiled_cached(self):
+ self.requireFeature(CompiledDirstateHelpersFeature)
+ state = self.build_1k_dirstate_dirs()
+ state.lock_read()
+ try:
+ self.setup_paths_and_offsets(state)
+ offsets = self.time(self.do_bisect_list_cached,
+ dirstate.bisect_dirblock)
+ self.checkOffsets(offsets)
+ finally:
+ state.unlock()
=== modified file 'bzrlib/compiled/dirstate_helpers.pyx'
--- a/bzrlib/compiled/dirstate_helpers.pyx 2007-05-03 20:11:37 +0000
+++ b/bzrlib/compiled/dirstate_helpers.pyx 2007-05-03 23:35:49 +0000
@@ -22,6 +22,69 @@
from bzrlib.dirstate import DirState
+cdef extern from "Python.h":
+ # GetItem returns a borrowed reference
+ void *PyDict_GetItem(object p, object key)
+ int PyDict_SetItem(object p, object key, object val) except -1
+ object PyList_GetItem(object lst, int index)
+ object PyTuple_GetItem(object tpl, int index)
+
+
+cdef object _split_from_path(object cache, object path):
+ """get the dirblock tuple for a given path.
+
+ :param cache: A Dictionary mapping string paths to tuples
+ :param path: The path we care about.
+ :return: A borrowed reference to a tuple stored in cache.
+ You do not need to Py_DECREF() when you are done, unless you plan on
+ using it for a while.
+ """
+ cdef void* value_ptr
+ cdef object value
+
+ value_ptr = PyDict_GetItem(cache, path)
+ if value_ptr == NULL:
+ value = path.split('/')
+ PyDict_SetItem(cache, path, value)
+ else:
+ value = <object>value_ptr
+
+ return value
+
+
+def bisect_dirblock(dirblocks, dirname, lo=0, hi=None, cache={}):
+ """Return the index where to insert dirname into the dirblocks.
+
+ The return value idx is such that all directories blocks in dirblock[:idx]
+ have names < dirname, and all blocks in dirblock[idx:] have names >=
+ dirname.
+
+ Optional args lo (default 0) and hi (default len(dirblocks)) bound the
+ slice of a to be searched.
+ """
+ cdef int _lo
+ cdef int _hi
+ cdef int _mid
+ cdef object dirname_split
+ cdef object cur_split
+
+ if hi is None:
+ _hi = len(dirblocks)
+ else:
+ _hi = hi
+
+ _lo = lo
+ dirname_split = _split_from_path(cache, dirname)
+ while _lo < _hi:
+ _mid = (_lo+_hi)/2
+ # Grab the dirname for the current dirblock
+ cur = PyTuple_GetItem(PyList_GetItem(dirblocks, _mid), 0)
+ cur_split = _split_from_path(cache, cur)
+ if cur_split < dirname_split: _lo = _mid+1
+ else: _hi = _mid
+ return _lo
+
+
def _read_dirblocks(state):
"""Read in the dirblocks for the given DirState object.
More information about the bazaar-commits
mailing list