Rev 2515: Change the name of cmp_dirblock_strings to cmp_by_dirs in http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex

Mon May 7 18:57:40 BST 2007

At http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex

------------------------------------------------------------
revno: 2515
revision-id: john at arbash-meinel.com-20070507175701-b8c87exjybq31evq
parent: john at arbash-meinel.com-20070505132458-0fe0g2jfdoyg95mn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate_pyrex
timestamp: Mon 2007-05-07 12:57:01 -0500
message:
  Change the name of cmp_dirblock_strings to cmp_by_dirs
  And refactor the test cases so that we test both the python version and the
  C version. Also, add benchmarks for both.
  It shows that the C version is approx 10x faster.
modified:
  bzrlib/benchmarks/bench_dirstate.py bench_dirstate.py-20070503203500-gs0pz6zkvjpq9l2x-1
  bzrlib/compiled/dirstate_helpers.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/compiled/test_dirstate_helpers.py test_dirstate_helper-20070504035751-jsbn00xodv0y1eve-2
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
-------------- next part --------------
=== modified file 'bzrlib/benchmarks/bench_dirstate.py'

--- a/bzrlib/benchmarks/bench_dirstate.py	2007-05-04 18:05:57 +0000
+++ b/bzrlib/benchmarks/bench_dirstate.py	2007-05-07 17:57:01 +0000
@@ -182,3 +182,77 @@
             self.checkOffsets(offsets)
         finally:
             state.unlock()
+
+    def create_path_names(self, layout, base=''):
+        """Create a list of paths with auto-generated names.
+
+        :param layout: A list of [(num_dirs, num_files)] tuples. For each
+            level, the given number of directories will be created, each
+            containing that many files.
+            So [(2, 5), (3, 4)] will create 2 top level directories, containing
+            5 files, and each top level directory will contain 3 subdirs with 4
+            files.
+        :param base: The base path to prepend to all entries, most callers will
+            pass ''
+        :return: A list of path names.
+        """
+        if not layout:
+            return []
+
+        paths = []
+        num_dirs, num_files = layout[0]
+        for dnum in xrange(num_dirs):
+            if base:
+                path = '%s/%02d_directory' % (base, dnum)
+            else:
+                path = '%02d_directory' % (dnum,)
+            paths.append(path)
+            for fnum in xrange(num_files):
+                fname = '%s/%02d_filename' % (path, fnum)
+                paths.append(fname)
+            paths.extend(self.create_path_names(layout[1:], base=path))
+        return paths
+
+    def test_create_path_names(self):
+        names = self.create_path_names([(2, 3), (1, 2)])
+        self.assertEqual(['00_directory',
+                          '00_directory/00_filename',
+                          '00_directory/01_filename',
+                          '00_directory/02_filename',
+                          '00_directory/00_directory',
+                          '00_directory/00_directory/00_filename',
+                          '00_directory/00_directory/01_filename',
+                          '01_directory',
+                          '01_directory/00_filename',
+                          '01_directory/01_filename',
+                          '01_directory/02_filename',
+                          '01_directory/00_directory',
+                          '01_directory/00_directory/00_filename',
+                          '01_directory/00_directory/01_filename',
+                         ], names)
+        names = self.time(self.create_path_names, [(10, 2), (10, 2), (10, 20)])
+        # 20 files + 1 directory name, 10 times, plus 2 filenames and 1 dir, 10
+        # times, and another 2 files + 1 dir, 10 times
+        self.assertEqual(21330, 10*(3 + 10*(3 + 10*(1 + 20))))
+        self.assertEqual(21330, len(names))
+
+    def compareAllPaths(self, cmp_func, layout):
+        """Compare N^2 paths.
+
+        Basically, compare every path in the list against every other path.
+        """
+        paths = self.create_path_names(layout)
+        for path1 in paths:
+            for path2 in paths:
+                cmp_func(path1, path2)
+
+    def test_py_cmp_dirblock_strings(self):
+        """Benchmark 103041 comparisons."""
+        self.time(self.compareAllPaths, dirstate.py_cmp_by_dirs,
+                                        [(3, 1), (3, 1), (3, 1), (3, 2)])
+
+    def test_c_cmp_dirblock_strings(self):
+        self.requireFeature(CompiledDirstateHelpersFeature)
+        from bzrlib.compiled.dirstate_helpers import c_cmp_by_dirs
+        self.time(self.compareAllPaths, c_cmp_by_dirs,
+                                        [(3, 1), (3, 1), (3, 1), (3, 2)])

=== modified file 'bzrlib/compiled/dirstate_helpers.pyx'
--- a/bzrlib/compiled/dirstate_helpers.pyx	2007-05-05 05:02:02 +0000
+++ b/bzrlib/compiled/dirstate_helpers.pyx	2007-05-07 17:57:01 +0000
@@ -81,7 +81,7 @@
     int strcmp(char *s1, char *s2)
 
 
-cdef int _cmp_dirblock_strings(char *path1, int size1, char *path2, int size2):
+cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
     cdef char *cur1
     cdef char *cur2
     cdef char *end1
@@ -135,12 +135,27 @@
     return 0
 
 
-def cmp_dirblock_strings(path1, path2):
-    """Compare to python strings in dirblock fashion."""
-    return _cmp_dirblock_strings(PyString_AsString(path1),
-                                 PyString_Size(path1),
-                                 PyString_AsString(path2),
-                                 PyString_Size(path2))
+def c_cmp_by_dirs(path1, path2):
+    """Compare two paths directory by directory.
+
+    This is equivalent to doing::
+
+       cmp(path1.split('/'), path2.split('/'))
+
+    The idea is that you should compare path components separately. This
+    differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
+    ``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
+
+    :param path1: first path
+    :param path2: second path
+    :return: positive number if ``path1`` comes first,
+        0 if paths are equal,
+        and negative number if ``path2`` sorts first
+    """
+    return _cmp_by_dirs(PyString_AsString(path1),
+                        PyString_Size(path1),
+                        PyString_AsString(path2),
+                        PyString_Size(path2))
 
 
 def c_bisect_dirblock(dirblocks, dirname, lo=0, hi=None, cache=None):
@@ -183,8 +198,7 @@
                 PyList_GetItem_object_void(dirblocks, _mid), 0)
         cur_str = PyString_AS_STRING_void(cur)
         cur_size = PyString_GET_SIZE_void(cur)
-        if _cmp_dirblock_strings(cur_str, cur_size,
-                                 dirname_str, dirname_size) < 0:
+        if _cmp_by_dirs(cur_str, cur_size, dirname_str, dirname_size) < 0:
             _lo = _mid+1
         else:
             _hi = _mid

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-05-05 13:24:58 +0000
+++ b/bzrlib/dirstate.py	2007-05-07 17:57:01 +0000
@@ -338,15 +338,11 @@
         # add it.
         #------- copied from inventory.make_entry
         # --- normalized_filename wants a unicode basename only, so get one.
-        dirname, basename = osutils.split(path)
-        # we dont import normalized_filename directly because we want to be
-        # able to change the implementation at runtime for tests.
-        norm_name, can_access = osutils.normalized_filename(basename)
-        if norm_name != basename:
-            if can_access:
-                basename = norm_name
-            else:
-                raise errors.InvalidNormalization(path)
+        if path.__class__ == unicode:
+            utf8path = path.encode('utf8')
+        else:
+            utf8path = path
+        dirname, basename = osutils.split(utf8path)
         # you should never have files called . or ..; just add the directory
         # in the parent, or according to the special treatment for the root
         if basename == '.' or basename == '..':
@@ -354,8 +350,6 @@
         # now that we've normalised, we need the correct utf8 path and 
         # dirname and basename elements. This single encode and split should be
         # faster than three separate encodes.
-        utf8path = (dirname + '/' + basename).strip('/').encode('utf8')
-        dirname, basename = osutils.split(utf8path)
         assert file_id.__class__ == str, \
             "must be a utf8 file_id not %s" % (type(file_id))
         # Make sure the file_id does not exist in this tree
@@ -2409,6 +2403,29 @@
 
 _read_dirblocks = _py_read_dirblocks
 
+
+def py_cmp_by_dirs(path1, path2):
+    """Compare two paths directory by directory.
+
+    This is equivalent to doing::
+
+       cmp(path1.split('/'), path2.split('/'))
+
+    The idea is that you should compare path components separately. This
+    differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
+    ``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
+
+    :param path1: first path
+    :param path2: second path
+    :return: positive number if ``path1`` comes first,
+        0 if paths are equal,
+        and negative number if ``path2`` sorts first
+    """
+    return cmp(path1.split('/'), path2.split('/'))
+
+cmp_by_dirs = py_cmp_by_dirs
+
+
 # Try to load the compiled form if possible
 # TODO: jam 20070503 We should have a way to run tests with and without the
 #       compiled extensions.
@@ -2416,9 +2433,11 @@
     from bzrlib.compiled.dirstate_helpers import (
         _c_read_dirblocks,
         c_bisect_dirblock,
+        c_cmp_by_dirs,
         )
 except ImportError:
     pass
 else:
     _read_dirblocks = _c_read_dirblocks
     bisect_dirblock = c_bisect_dirblock
+    cmp_by_dirs = c_cmp_by_dirs

=== modified file 'bzrlib/tests/compiled/test_dirstate_helpers.py'
--- a/bzrlib/tests/compiled/test_dirstate_helpers.py	2007-05-04 18:05:57 +0000
+++ b/bzrlib/tests/compiled/test_dirstate_helpers.py	2007-05-07 17:57:01 +0000
@@ -38,102 +38,13 @@
 CompiledDirstateHelpersFeature = _CompiledDirstateHelpersFeature()
 
 
-class TestCMPDirblockStrings(tests.TestCase):
+class TestCCmpByDirs(test_dirstate.TestCmpByDirs):
+    """Test the C implementation of cmp_by_dirs"""
 
     _test_needs_features = [CompiledDirstateHelpersFeature]
 
-    def assertPositive(self, val):
-        """Assert that val is greater than 0."""
-        self.assertTrue(val > 0, 'expected a positive value, but got %s' % val)
-
-    def assertNegative(self, val):
-        """Assert that val is less than 0."""
-        self.assertTrue(val < 0, 'expected a negative value, but got %s' % val)
-
-    def assertStrCmp(self, expected, str1, str2):
-        """Compare the two strings, in both directions.
-
-        :param expected: The expected comparison value. -1 means str1 comes
-            first, 0 means they are equal, 1 means str2 comes first
-        :param str1: string to compare
-        :param str2: string to compare
-        """
-        cmp_dirblock_strings = dirstate_helpers.cmp_dirblock_strings
-        if expected == 0:
-            self.assertEqual(0, cmp(str1.split('/'), str2.split('/')))
-            self.assertEqual(0, cmp_dirblock_strings(str1, str2))
-            self.assertEqual(0, cmp_dirblock_strings(str2, str1))
-        elif expected > 0:
-            self.assertPositive(cmp(str1.split('/'), str2.split('/')))
-            self.assertPositive(cmp_dirblock_strings(str1, str2))
-            self.assertNegative(cmp_dirblock_strings(str2, str1))
-        else:
-            self.assertNegative(cmp(str1.split('/'), str2.split('/')))
-            self.assertNegative(cmp_dirblock_strings(str1, str2))
-            self.assertPositive(cmp_dirblock_strings(str2, str1))
-
-    def test_cmp_empty(self):
-        """Compare against the empty string."""
-        self.assertStrCmp(0, '', '')
-        self.assertStrCmp(1, 'a', '')
-        self.assertStrCmp(1, 'ab', '')
-        self.assertStrCmp(1, 'abc', '')
-        self.assertStrCmp(1, 'abcd', '')
-        self.assertStrCmp(1, 'abcde', '')
-        self.assertStrCmp(1, 'abcdef', '')
-        self.assertStrCmp(1, 'abcdefg', '')
-        self.assertStrCmp(1, 'abcdefgh', '')
-        self.assertStrCmp(1, 'abcdefghi', '')
-        self.assertStrCmp(1, 'test/ing/a/path/', '')
-
-    def test_cmp_same_str(self):
-        """Compare the same string"""
-        self.assertStrCmp(0, 'a', 'a')
-        self.assertStrCmp(0, 'ab', 'ab')
-        self.assertStrCmp(0, 'abc', 'abc')
-        self.assertStrCmp(0, 'abcd', 'abcd')
-        self.assertStrCmp(0, 'abcde', 'abcde')
-        self.assertStrCmp(0, 'abcdef', 'abcdef')
-        self.assertStrCmp(0, 'abcdefg', 'abcdefg')
-        self.assertStrCmp(0, 'abcdefgh', 'abcdefgh')
-        self.assertStrCmp(0, 'abcdefghi', 'abcdefghi')
-        self.assertStrCmp(0, 'testing a long string', 'testing a long string')
-        self.assertStrCmp(0, 'x'*10000, 'x'*10000)
-        self.assertStrCmp(0, 'a/b', 'a/b')
-        self.assertStrCmp(0, 'a/b/c', 'a/b/c')
-        self.assertStrCmp(0, 'a/b/c/d', 'a/b/c/d')
-        self.assertStrCmp(0, 'a/b/c/d/e', 'a/b/c/d/e')
-
-    def test_simple_paths(self):
-        """Compare strings that act like normal string comparison"""
-        self.assertStrCmp(-1, 'a', 'b')
-        self.assertStrCmp(-1, 'aa', 'ab')
-        self.assertStrCmp(-1, 'ab', 'bb')
-        self.assertStrCmp(-1, 'aaa', 'aab')
-        self.assertStrCmp(-1, 'aab', 'abb')
-        self.assertStrCmp(-1, 'abb', 'bbb')
-        self.assertStrCmp(-1, 'aaaa', 'aaab')
-        self.assertStrCmp(-1, 'aaab', 'aabb')
-        self.assertStrCmp(-1, 'aabb', 'abbb')
-        self.assertStrCmp(-1, 'abbb', 'bbbb')
-        self.assertStrCmp(-1, 'aaaaa', 'aaaab')
-        self.assertStrCmp(-1, 'a/a', 'a/b')
-        self.assertStrCmp(-1, 'a/b', 'b/b')
-        self.assertStrCmp(-1, 'a/a/a', 'a/a/b')
-        self.assertStrCmp(-1, 'a/a/b', 'a/b/b')
-        self.assertStrCmp(-1, 'a/b/b', 'b/b/b')
-        self.assertStrCmp(-1, 'a/a/a/a', 'a/a/a/b')
-        self.assertStrCmp(-1, 'a/a/a/b', 'a/a/b/b')
-        self.assertStrCmp(-1, 'a/a/b/b', 'a/b/b/b')
-        self.assertStrCmp(-1, 'a/b/b/b', 'b/b/b/b')
-        self.assertStrCmp(-1, 'a/a/a/a/a', 'a/a/a/a/b')
-
-    def test_tricky_paths(self):
-        self.assertStrCmp(1, 'ab/cd/ef', 'ab/cc/ef')
-        self.assertStrCmp(1, 'ab/cd/ef', 'ab/c/ef')
-        self.assertStrCmp(-1, 'ab/cd/ef', 'ab/cd-ef')
-        self.assertStrCmp(-1, 'ab/cd', 'ab/cd-')
-        self.assertStrCmp(-1, 'ab/cd', 'ab-cd')
+    def get_cmp_by_dirs(self):
+        return dirstate_helpers.c_cmp_by_dirs
 
 
 class TestCompiledBisectDirblock(test_dirstate.TestBisectDirblock):
@@ -149,11 +60,5 @@
 
     _test_needs_features = [CompiledDirstateHelpersFeature]
 
-    def setUp(self):
-        super(TestCompiledBisectDirblock, self).setUp()
-        if have_dirstate_helpers:
-            self.bisect_dirblock_func = dirstate_helpers.c_bisect_dirblock
-        else:
-            # We shouldn't be running these tests because of the
-            # _test_needs_features, so make sure they would fail otherwise
-            self.bisect_dirblock_func = None
+    def get_bisect_dirblock(self):
+        return dirstate_helpers.c_bisect_dirblock

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-05-04 18:59:36 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-05-07 17:57:01 +0000
@@ -1976,12 +1976,9 @@
     'to', 'foo') chunks rather than by raw 'path/to/foo'.
     """
 
-    def setUp(self):
-        super(TestBisectDirblock, self).setUp()
-        # We have to set this here, because if we set it at the class variable
-        # level, Python interprets it as a member function, and passes 'self'
-        # as the first argument.
-        self.bisect_dirblock_func = dirstate.py_bisect_dirblock
+    def get_bisect_dirblock(self):
+        """Return an implementation of bisect_dirblock"""
+        return dirstate.py_bisect_dirblock
 
     def assertBisect(self, dirblocks, split_dirblocks, path, *args, **kwargs):
         """Assert that bisect_split works like bisect_left on the split paths.
@@ -1992,9 +1989,9 @@
 
         All other arguments will be passed along.
         """
+        bisect_dirblock = self.get_bisect_dirblock()
         self.assertIsInstance(dirblocks, list)
-        bisect_split_idx = self.bisect_dirblock_func(dirblocks, path,
-                                                     *args, **kwargs)
+        bisect_split_idx = bisect_dirblock(dirblocks, path, *args, **kwargs)
         split_dirblock = (path.split('/'), [])
         bisect_left_idx = bisect.bisect_left(split_dirblocks, split_dirblock,
                                              *args)
@@ -2115,3 +2112,103 @@
             state._validate)
         self.assertContainsRe(str(e),
             'file a-id is absent in row')
+
+
+class TestCmpByDirs(TestCase):
+
+    def get_cmp_by_dirs(self):
+        """Get a specific implementation of cmp_by_dirs."""
+        return dirstate.py_cmp_by_dirs
+
+    def assertPositive(self, val):
+        """Assert that val is greater than 0."""
+        self.assertTrue(val > 0, 'expected a positive value, but got %s' % val)
+
+    def assertNegative(self, val):
+        """Assert that val is less than 0."""
+        self.assertTrue(val < 0, 'expected a negative value, but got %s' % val)
+
+    def assertCmpByDirs(self, expected, str1, str2):
+        """Compare the two strings, in both directions.
+
+        :param expected: The expected comparison value. -1 means str1 comes
+            first, 0 means they are equal, 1 means str2 comes first
+        :param str1: string to compare
+        :param str2: string to compare
+        """
+        cmp_by_dirs = self.get_cmp_by_dirs()
+        if expected == 0:
+            self.assertEqual(str1, str2)
+            self.assertEqual(0, cmp_by_dirs(str1, str2))
+            self.assertEqual(0, cmp_by_dirs(str2, str1))
+        elif expected > 0:
+            self.assertPositive(cmp_by_dirs(str1, str2))
+            self.assertNegative(cmp_by_dirs(str2, str1))
+        else:
+            self.assertNegative(cmp_by_dirs(str1, str2))
+            self.assertPositive(cmp_by_dirs(str2, str1))
+
+    def test_cmp_empty(self):
+        """Compare against the empty string."""
+        self.assertCmpByDirs(0, '', '')
+        self.assertCmpByDirs(1, 'a', '')
+        self.assertCmpByDirs(1, 'ab', '')
+        self.assertCmpByDirs(1, 'abc', '')
+        self.assertCmpByDirs(1, 'abcd', '')
+        self.assertCmpByDirs(1, 'abcde', '')
+        self.assertCmpByDirs(1, 'abcdef', '')
+        self.assertCmpByDirs(1, 'abcdefg', '')
+        self.assertCmpByDirs(1, 'abcdefgh', '')
+        self.assertCmpByDirs(1, 'abcdefghi', '')
+        self.assertCmpByDirs(1, 'test/ing/a/path/', '')
+
+    def test_cmp_same_str(self):
+        """Compare the same string"""
+        self.assertCmpByDirs(0, 'a', 'a')
+        self.assertCmpByDirs(0, 'ab', 'ab')
+        self.assertCmpByDirs(0, 'abc', 'abc')
+        self.assertCmpByDirs(0, 'abcd', 'abcd')
+        self.assertCmpByDirs(0, 'abcde', 'abcde')
+        self.assertCmpByDirs(0, 'abcdef', 'abcdef')
+        self.assertCmpByDirs(0, 'abcdefg', 'abcdefg')
+        self.assertCmpByDirs(0, 'abcdefgh', 'abcdefgh')
+        self.assertCmpByDirs(0, 'abcdefghi', 'abcdefghi')
+        self.assertCmpByDirs(0, 'testing a long string', 'testing a long string')
+        self.assertCmpByDirs(0, 'x'*10000, 'x'*10000)
+        self.assertCmpByDirs(0, 'a/b', 'a/b')
+        self.assertCmpByDirs(0, 'a/b/c', 'a/b/c')
+        self.assertCmpByDirs(0, 'a/b/c/d', 'a/b/c/d')
+        self.assertCmpByDirs(0, 'a/b/c/d/e', 'a/b/c/d/e')
+
+    def test_simple_paths(self):
+        """Compare strings that act like normal string comparison"""
+        self.assertCmpByDirs(-1, 'a', 'b')
+        self.assertCmpByDirs(-1, 'aa', 'ab')
+        self.assertCmpByDirs(-1, 'ab', 'bb')
+        self.assertCmpByDirs(-1, 'aaa', 'aab')
+        self.assertCmpByDirs(-1, 'aab', 'abb')
+        self.assertCmpByDirs(-1, 'abb', 'bbb')
+        self.assertCmpByDirs(-1, 'aaaa', 'aaab')
+        self.assertCmpByDirs(-1, 'aaab', 'aabb')
+        self.assertCmpByDirs(-1, 'aabb', 'abbb')
+        self.assertCmpByDirs(-1, 'abbb', 'bbbb')
+        self.assertCmpByDirs(-1, 'aaaaa', 'aaaab')
+        self.assertCmpByDirs(-1, 'a/a', 'a/b')
+        self.assertCmpByDirs(-1, 'a/b', 'b/b')
+        self.assertCmpByDirs(-1, 'a/a/a', 'a/a/b')
+        self.assertCmpByDirs(-1, 'a/a/b', 'a/b/b')
+        self.assertCmpByDirs(-1, 'a/b/b', 'b/b/b')
+        self.assertCmpByDirs(-1, 'a/a/a/a', 'a/a/a/b')
+        self.assertCmpByDirs(-1, 'a/a/a/b', 'a/a/b/b')
+        self.assertCmpByDirs(-1, 'a/a/b/b', 'a/b/b/b')
+        self.assertCmpByDirs(-1, 'a/b/b/b', 'b/b/b/b')
+        self.assertCmpByDirs(-1, 'a/a/a/a/a', 'a/a/a/a/b')
+
+    def test_tricky_paths(self):
+        self.assertCmpByDirs(1, 'ab/cd/ef', 'ab/cc/ef')
+        self.assertCmpByDirs(1, 'ab/cd/ef', 'ab/c/ef')
+        self.assertCmpByDirs(-1, 'ab/cd/ef', 'ab/cd-ef')
+        self.assertCmpByDirs(-1, 'ab/cd', 'ab/cd-')
+        self.assertCmpByDirs(-1, 'ab/cd', 'ab-cd')
+
+