Rev 2515: Change the name of cmp_dirblock_strings to cmp_by_dirs in http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex
John Arbash Meinel
john at arbash-meinel.com
Mon May 7 18:57:40 BST 2007
At http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex
------------------------------------------------------------
revno: 2515
revision-id: john at arbash-meinel.com-20070507175701-b8c87exjybq31evq
parent: john at arbash-meinel.com-20070505132458-0fe0g2jfdoyg95mn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate_pyrex
timestamp: Mon 2007-05-07 12:57:01 -0500
message:
Change the name of cmp_dirblock_strings to cmp_by_dirs
And refactor the test cases so that we test both the python version and the
C version. Also, add benchmarks for both.
It shows that the C version is approx 10x faster.
modified:
bzrlib/benchmarks/bench_dirstate.py bench_dirstate.py-20070503203500-gs0pz6zkvjpq9l2x-1
bzrlib/compiled/dirstate_helpers.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/compiled/test_dirstate_helpers.py test_dirstate_helper-20070504035751-jsbn00xodv0y1eve-2
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
-------------- next part --------------
=== modified file 'bzrlib/benchmarks/bench_dirstate.py'
--- a/bzrlib/benchmarks/bench_dirstate.py 2007-05-04 18:05:57 +0000
+++ b/bzrlib/benchmarks/bench_dirstate.py 2007-05-07 17:57:01 +0000
@@ -182,3 +182,77 @@
self.checkOffsets(offsets)
finally:
state.unlock()
+
+ def create_path_names(self, layout, base=''):
+ """Create a list of paths with auto-generated names.
+
+ :param layout: A list of [(num_dirs, num_files)] tuples. For each
+ level, the given number of directories will be created, each
+ containing that many files.
+ So [(2, 5), (3, 4)] will create 2 top level directories, containing
+ 5 files, and each top level directory will contain 3 subdirs with 4
+ files.
+ :param base: The base path to prepend to all entries, most callers will
+ pass ''
+ :return: A list of path names.
+ """
+ if not layout:
+ return []
+
+ paths = []
+ num_dirs, num_files = layout[0]
+ for dnum in xrange(num_dirs):
+ if base:
+ path = '%s/%02d_directory' % (base, dnum)
+ else:
+ path = '%02d_directory' % (dnum,)
+ paths.append(path)
+ for fnum in xrange(num_files):
+ fname = '%s/%02d_filename' % (path, fnum)
+ paths.append(fname)
+ paths.extend(self.create_path_names(layout[1:], base=path))
+ return paths
+
+ def test_create_path_names(self):
+ names = self.create_path_names([(2, 3), (1, 2)])
+ self.assertEqual(['00_directory',
+ '00_directory/00_filename',
+ '00_directory/01_filename',
+ '00_directory/02_filename',
+ '00_directory/00_directory',
+ '00_directory/00_directory/00_filename',
+ '00_directory/00_directory/01_filename',
+ '01_directory',
+ '01_directory/00_filename',
+ '01_directory/01_filename',
+ '01_directory/02_filename',
+ '01_directory/00_directory',
+ '01_directory/00_directory/00_filename',
+ '01_directory/00_directory/01_filename',
+ ], names)
+ names = self.time(self.create_path_names, [(10, 2), (10, 2), (10, 20)])
+ # 20 files + 1 directory name, 10 times, plus 2 filenames and 1 dir, 10
+ # times, and another 2 files + 1 dir, 10 times
+ self.assertEqual(21330, 10*(3 + 10*(3 + 10*(1 + 20))))
+ self.assertEqual(21330, len(names))
+
+ def compareAllPaths(self, cmp_func, layout):
+ """Compare N^2 paths.
+
+ Basically, compare every path in the list against every other path.
+ """
+ paths = self.create_path_names(layout)
+ for path1 in paths:
+ for path2 in paths:
+ cmp_func(path1, path2)
+
+ def test_py_cmp_dirblock_strings(self):
+ """Benchmark 103041 comparisons."""
+ self.time(self.compareAllPaths, dirstate.py_cmp_by_dirs,
+ [(3, 1), (3, 1), (3, 1), (3, 2)])
+
+ def test_c_cmp_dirblock_strings(self):
+ self.requireFeature(CompiledDirstateHelpersFeature)
+ from bzrlib.compiled.dirstate_helpers import c_cmp_by_dirs
+ self.time(self.compareAllPaths, c_cmp_by_dirs,
+ [(3, 1), (3, 1), (3, 1), (3, 2)])
=== modified file 'bzrlib/compiled/dirstate_helpers.pyx'
--- a/bzrlib/compiled/dirstate_helpers.pyx 2007-05-05 05:02:02 +0000
+++ b/bzrlib/compiled/dirstate_helpers.pyx 2007-05-07 17:57:01 +0000
@@ -81,7 +81,7 @@
int strcmp(char *s1, char *s2)
-cdef int _cmp_dirblock_strings(char *path1, int size1, char *path2, int size2):
+cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
cdef char *cur1
cdef char *cur2
cdef char *end1
@@ -135,12 +135,27 @@
return 0
-def cmp_dirblock_strings(path1, path2):
- """Compare to python strings in dirblock fashion."""
- return _cmp_dirblock_strings(PyString_AsString(path1),
- PyString_Size(path1),
- PyString_AsString(path2),
- PyString_Size(path2))
+def c_cmp_by_dirs(path1, path2):
+ """Compare two paths directory by directory.
+
+ This is equivalent to doing::
+
+ cmp(path1.split('/'), path2.split('/'))
+
+ The idea is that you should compare path components separately. This
+ differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
+ ``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
+
+ :param path1: first path
+ :param path2: second path
+ :return: positive number if ``path1`` comes first,
+ 0 if paths are equal,
+ and negative number if ``path2`` sorts first
+ """
+ return _cmp_by_dirs(PyString_AsString(path1),
+ PyString_Size(path1),
+ PyString_AsString(path2),
+ PyString_Size(path2))
def c_bisect_dirblock(dirblocks, dirname, lo=0, hi=None, cache=None):
@@ -183,8 +198,7 @@
PyList_GetItem_object_void(dirblocks, _mid), 0)
cur_str = PyString_AS_STRING_void(cur)
cur_size = PyString_GET_SIZE_void(cur)
- if _cmp_dirblock_strings(cur_str, cur_size,
- dirname_str, dirname_size) < 0:
+ if _cmp_by_dirs(cur_str, cur_size, dirname_str, dirname_size) < 0:
_lo = _mid+1
else:
_hi = _mid
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2007-05-05 13:24:58 +0000
+++ b/bzrlib/dirstate.py 2007-05-07 17:57:01 +0000
@@ -338,15 +338,11 @@
# add it.
#------- copied from inventory.make_entry
# --- normalized_filename wants a unicode basename only, so get one.
- dirname, basename = osutils.split(path)
- # we dont import normalized_filename directly because we want to be
- # able to change the implementation at runtime for tests.
- norm_name, can_access = osutils.normalized_filename(basename)
- if norm_name != basename:
- if can_access:
- basename = norm_name
- else:
- raise errors.InvalidNormalization(path)
+ if path.__class__ == unicode:
+ utf8path = path.encode('utf8')
+ else:
+ utf8path = path
+ dirname, basename = osutils.split(utf8path)
# you should never have files called . or ..; just add the directory
# in the parent, or according to the special treatment for the root
if basename == '.' or basename == '..':
@@ -354,8 +350,6 @@
# now that we've normalised, we need the correct utf8 path and
# dirname and basename elements. This single encode and split should be
# faster than three separate encodes.
- utf8path = (dirname + '/' + basename).strip('/').encode('utf8')
- dirname, basename = osutils.split(utf8path)
assert file_id.__class__ == str, \
"must be a utf8 file_id not %s" % (type(file_id))
# Make sure the file_id does not exist in this tree
@@ -2409,6 +2403,29 @@
_read_dirblocks = _py_read_dirblocks
+
+def py_cmp_by_dirs(path1, path2):
+ """Compare two paths directory by directory.
+
+ This is equivalent to doing::
+
+ cmp(path1.split('/'), path2.split('/'))
+
+ The idea is that you should compare path components separately. This
+ differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
+ ``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
+
+ :param path1: first path
+ :param path2: second path
+ :return: positive number if ``path1`` comes first,
+ 0 if paths are equal,
+ and negative number if ``path2`` sorts first
+ """
+ return cmp(path1.split('/'), path2.split('/'))
+
+cmp_by_dirs = py_cmp_by_dirs
+
+
# Try to load the compiled form if possible
# TODO: jam 20070503 We should have a way to run tests with and without the
# compiled extensions.
@@ -2416,9 +2433,11 @@
from bzrlib.compiled.dirstate_helpers import (
_c_read_dirblocks,
c_bisect_dirblock,
+ c_cmp_by_dirs,
)
except ImportError:
pass
else:
_read_dirblocks = _c_read_dirblocks
bisect_dirblock = c_bisect_dirblock
+ cmp_by_dirs = c_cmp_by_dirs
=== modified file 'bzrlib/tests/compiled/test_dirstate_helpers.py'
--- a/bzrlib/tests/compiled/test_dirstate_helpers.py 2007-05-04 18:05:57 +0000
+++ b/bzrlib/tests/compiled/test_dirstate_helpers.py 2007-05-07 17:57:01 +0000
@@ -38,102 +38,13 @@
CompiledDirstateHelpersFeature = _CompiledDirstateHelpersFeature()
-class TestCMPDirblockStrings(tests.TestCase):
+class TestCCmpByDirs(test_dirstate.TestCmpByDirs):
+ """Test the C implementation of cmp_by_dirs"""
_test_needs_features = [CompiledDirstateHelpersFeature]
- def assertPositive(self, val):
- """Assert that val is greater than 0."""
- self.assertTrue(val > 0, 'expected a positive value, but got %s' % val)
-
- def assertNegative(self, val):
- """Assert that val is less than 0."""
- self.assertTrue(val < 0, 'expected a negative value, but got %s' % val)
-
- def assertStrCmp(self, expected, str1, str2):
- """Compare the two strings, in both directions.
-
- :param expected: The expected comparison value. -1 means str1 comes
- first, 0 means they are equal, 1 means str2 comes first
- :param str1: string to compare
- :param str2: string to compare
- """
- cmp_dirblock_strings = dirstate_helpers.cmp_dirblock_strings
- if expected == 0:
- self.assertEqual(0, cmp(str1.split('/'), str2.split('/')))
- self.assertEqual(0, cmp_dirblock_strings(str1, str2))
- self.assertEqual(0, cmp_dirblock_strings(str2, str1))
- elif expected > 0:
- self.assertPositive(cmp(str1.split('/'), str2.split('/')))
- self.assertPositive(cmp_dirblock_strings(str1, str2))
- self.assertNegative(cmp_dirblock_strings(str2, str1))
- else:
- self.assertNegative(cmp(str1.split('/'), str2.split('/')))
- self.assertNegative(cmp_dirblock_strings(str1, str2))
- self.assertPositive(cmp_dirblock_strings(str2, str1))
-
- def test_cmp_empty(self):
- """Compare against the empty string."""
- self.assertStrCmp(0, '', '')
- self.assertStrCmp(1, 'a', '')
- self.assertStrCmp(1, 'ab', '')
- self.assertStrCmp(1, 'abc', '')
- self.assertStrCmp(1, 'abcd', '')
- self.assertStrCmp(1, 'abcde', '')
- self.assertStrCmp(1, 'abcdef', '')
- self.assertStrCmp(1, 'abcdefg', '')
- self.assertStrCmp(1, 'abcdefgh', '')
- self.assertStrCmp(1, 'abcdefghi', '')
- self.assertStrCmp(1, 'test/ing/a/path/', '')
-
- def test_cmp_same_str(self):
- """Compare the same string"""
- self.assertStrCmp(0, 'a', 'a')
- self.assertStrCmp(0, 'ab', 'ab')
- self.assertStrCmp(0, 'abc', 'abc')
- self.assertStrCmp(0, 'abcd', 'abcd')
- self.assertStrCmp(0, 'abcde', 'abcde')
- self.assertStrCmp(0, 'abcdef', 'abcdef')
- self.assertStrCmp(0, 'abcdefg', 'abcdefg')
- self.assertStrCmp(0, 'abcdefgh', 'abcdefgh')
- self.assertStrCmp(0, 'abcdefghi', 'abcdefghi')
- self.assertStrCmp(0, 'testing a long string', 'testing a long string')
- self.assertStrCmp(0, 'x'*10000, 'x'*10000)
- self.assertStrCmp(0, 'a/b', 'a/b')
- self.assertStrCmp(0, 'a/b/c', 'a/b/c')
- self.assertStrCmp(0, 'a/b/c/d', 'a/b/c/d')
- self.assertStrCmp(0, 'a/b/c/d/e', 'a/b/c/d/e')
-
- def test_simple_paths(self):
- """Compare strings that act like normal string comparison"""
- self.assertStrCmp(-1, 'a', 'b')
- self.assertStrCmp(-1, 'aa', 'ab')
- self.assertStrCmp(-1, 'ab', 'bb')
- self.assertStrCmp(-1, 'aaa', 'aab')
- self.assertStrCmp(-1, 'aab', 'abb')
- self.assertStrCmp(-1, 'abb', 'bbb')
- self.assertStrCmp(-1, 'aaaa', 'aaab')
- self.assertStrCmp(-1, 'aaab', 'aabb')
- self.assertStrCmp(-1, 'aabb', 'abbb')
- self.assertStrCmp(-1, 'abbb', 'bbbb')
- self.assertStrCmp(-1, 'aaaaa', 'aaaab')
- self.assertStrCmp(-1, 'a/a', 'a/b')
- self.assertStrCmp(-1, 'a/b', 'b/b')
- self.assertStrCmp(-1, 'a/a/a', 'a/a/b')
- self.assertStrCmp(-1, 'a/a/b', 'a/b/b')
- self.assertStrCmp(-1, 'a/b/b', 'b/b/b')
- self.assertStrCmp(-1, 'a/a/a/a', 'a/a/a/b')
- self.assertStrCmp(-1, 'a/a/a/b', 'a/a/b/b')
- self.assertStrCmp(-1, 'a/a/b/b', 'a/b/b/b')
- self.assertStrCmp(-1, 'a/b/b/b', 'b/b/b/b')
- self.assertStrCmp(-1, 'a/a/a/a/a', 'a/a/a/a/b')
-
- def test_tricky_paths(self):
- self.assertStrCmp(1, 'ab/cd/ef', 'ab/cc/ef')
- self.assertStrCmp(1, 'ab/cd/ef', 'ab/c/ef')
- self.assertStrCmp(-1, 'ab/cd/ef', 'ab/cd-ef')
- self.assertStrCmp(-1, 'ab/cd', 'ab/cd-')
- self.assertStrCmp(-1, 'ab/cd', 'ab-cd')
+ def get_cmp_by_dirs(self):
+ return dirstate_helpers.c_cmp_by_dirs
class TestCompiledBisectDirblock(test_dirstate.TestBisectDirblock):
@@ -149,11 +60,5 @@
_test_needs_features = [CompiledDirstateHelpersFeature]
- def setUp(self):
- super(TestCompiledBisectDirblock, self).setUp()
- if have_dirstate_helpers:
- self.bisect_dirblock_func = dirstate_helpers.c_bisect_dirblock
- else:
- # We shouldn't be running these tests because of the
- # _test_needs_features, so make sure they would fail otherwise
- self.bisect_dirblock_func = None
+ def get_bisect_dirblock(self):
+ return dirstate_helpers.c_bisect_dirblock
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2007-05-04 18:59:36 +0000
+++ b/bzrlib/tests/test_dirstate.py 2007-05-07 17:57:01 +0000
@@ -1976,12 +1976,9 @@
'to', 'foo') chunks rather than by raw 'path/to/foo'.
"""
- def setUp(self):
- super(TestBisectDirblock, self).setUp()
- # We have to set this here, because if we set it at the class variable
- # level, Python interprets it as a member function, and passes 'self'
- # as the first argument.
- self.bisect_dirblock_func = dirstate.py_bisect_dirblock
+ def get_bisect_dirblock(self):
+ """Return an implementation of bisect_dirblock"""
+ return dirstate.py_bisect_dirblock
def assertBisect(self, dirblocks, split_dirblocks, path, *args, **kwargs):
"""Assert that bisect_split works like bisect_left on the split paths.
@@ -1992,9 +1989,9 @@
All other arguments will be passed along.
"""
+ bisect_dirblock = self.get_bisect_dirblock()
self.assertIsInstance(dirblocks, list)
- bisect_split_idx = self.bisect_dirblock_func(dirblocks, path,
- *args, **kwargs)
+ bisect_split_idx = bisect_dirblock(dirblocks, path, *args, **kwargs)
split_dirblock = (path.split('/'), [])
bisect_left_idx = bisect.bisect_left(split_dirblocks, split_dirblock,
*args)
@@ -2115,3 +2112,103 @@
state._validate)
self.assertContainsRe(str(e),
'file a-id is absent in row')
+
+
+class TestCmpByDirs(TestCase):
+
+ def get_cmp_by_dirs(self):
+ """Get a specific implementation of cmp_by_dirs."""
+ return dirstate.py_cmp_by_dirs
+
+ def assertPositive(self, val):
+ """Assert that val is greater than 0."""
+ self.assertTrue(val > 0, 'expected a positive value, but got %s' % val)
+
+ def assertNegative(self, val):
+ """Assert that val is less than 0."""
+ self.assertTrue(val < 0, 'expected a negative value, but got %s' % val)
+
+ def assertCmpByDirs(self, expected, str1, str2):
+ """Compare the two strings, in both directions.
+
+ :param expected: The expected comparison value. -1 means str1 comes
+ first, 0 means they are equal, 1 means str2 comes first
+ :param str1: string to compare
+ :param str2: string to compare
+ """
+ cmp_by_dirs = self.get_cmp_by_dirs()
+ if expected == 0:
+ self.assertEqual(str1, str2)
+ self.assertEqual(0, cmp_by_dirs(str1, str2))
+ self.assertEqual(0, cmp_by_dirs(str2, str1))
+ elif expected > 0:
+ self.assertPositive(cmp_by_dirs(str1, str2))
+ self.assertNegative(cmp_by_dirs(str2, str1))
+ else:
+ self.assertNegative(cmp_by_dirs(str1, str2))
+ self.assertPositive(cmp_by_dirs(str2, str1))
+
+ def test_cmp_empty(self):
+ """Compare against the empty string."""
+ self.assertCmpByDirs(0, '', '')
+ self.assertCmpByDirs(1, 'a', '')
+ self.assertCmpByDirs(1, 'ab', '')
+ self.assertCmpByDirs(1, 'abc', '')
+ self.assertCmpByDirs(1, 'abcd', '')
+ self.assertCmpByDirs(1, 'abcde', '')
+ self.assertCmpByDirs(1, 'abcdef', '')
+ self.assertCmpByDirs(1, 'abcdefg', '')
+ self.assertCmpByDirs(1, 'abcdefgh', '')
+ self.assertCmpByDirs(1, 'abcdefghi', '')
+ self.assertCmpByDirs(1, 'test/ing/a/path/', '')
+
+ def test_cmp_same_str(self):
+ """Compare the same string"""
+ self.assertCmpByDirs(0, 'a', 'a')
+ self.assertCmpByDirs(0, 'ab', 'ab')
+ self.assertCmpByDirs(0, 'abc', 'abc')
+ self.assertCmpByDirs(0, 'abcd', 'abcd')
+ self.assertCmpByDirs(0, 'abcde', 'abcde')
+ self.assertCmpByDirs(0, 'abcdef', 'abcdef')
+ self.assertCmpByDirs(0, 'abcdefg', 'abcdefg')
+ self.assertCmpByDirs(0, 'abcdefgh', 'abcdefgh')
+ self.assertCmpByDirs(0, 'abcdefghi', 'abcdefghi')
+ self.assertCmpByDirs(0, 'testing a long string', 'testing a long string')
+ self.assertCmpByDirs(0, 'x'*10000, 'x'*10000)
+ self.assertCmpByDirs(0, 'a/b', 'a/b')
+ self.assertCmpByDirs(0, 'a/b/c', 'a/b/c')
+ self.assertCmpByDirs(0, 'a/b/c/d', 'a/b/c/d')
+ self.assertCmpByDirs(0, 'a/b/c/d/e', 'a/b/c/d/e')
+
+ def test_simple_paths(self):
+ """Compare strings that act like normal string comparison"""
+ self.assertCmpByDirs(-1, 'a', 'b')
+ self.assertCmpByDirs(-1, 'aa', 'ab')
+ self.assertCmpByDirs(-1, 'ab', 'bb')
+ self.assertCmpByDirs(-1, 'aaa', 'aab')
+ self.assertCmpByDirs(-1, 'aab', 'abb')
+ self.assertCmpByDirs(-1, 'abb', 'bbb')
+ self.assertCmpByDirs(-1, 'aaaa', 'aaab')
+ self.assertCmpByDirs(-1, 'aaab', 'aabb')
+ self.assertCmpByDirs(-1, 'aabb', 'abbb')
+ self.assertCmpByDirs(-1, 'abbb', 'bbbb')
+ self.assertCmpByDirs(-1, 'aaaaa', 'aaaab')
+ self.assertCmpByDirs(-1, 'a/a', 'a/b')
+ self.assertCmpByDirs(-1, 'a/b', 'b/b')
+ self.assertCmpByDirs(-1, 'a/a/a', 'a/a/b')
+ self.assertCmpByDirs(-1, 'a/a/b', 'a/b/b')
+ self.assertCmpByDirs(-1, 'a/b/b', 'b/b/b')
+ self.assertCmpByDirs(-1, 'a/a/a/a', 'a/a/a/b')
+ self.assertCmpByDirs(-1, 'a/a/a/b', 'a/a/b/b')
+ self.assertCmpByDirs(-1, 'a/a/b/b', 'a/b/b/b')
+ self.assertCmpByDirs(-1, 'a/b/b/b', 'b/b/b/b')
+ self.assertCmpByDirs(-1, 'a/a/a/a/a', 'a/a/a/a/b')
+
+ def test_tricky_paths(self):
+ self.assertCmpByDirs(1, 'ab/cd/ef', 'ab/cc/ef')
+ self.assertCmpByDirs(1, 'ab/cd/ef', 'ab/c/ef')
+ self.assertCmpByDirs(-1, 'ab/cd/ef', 'ab/cd-ef')
+ self.assertCmpByDirs(-1, 'ab/cd', 'ab/cd-')
+ self.assertCmpByDirs(-1, 'ab/cd', 'ab-cd')
+
+
More information about the bazaar-commits
mailing list