Rev 2438: Add tests that the walkdirs variants work on unicode paths. in

John Arbash Meinel john at
Wed Feb 28 21:52:20 GMT 2007


revno: 2438
revision-id: john at
parent: john at
committer: John Arbash Meinel <john at>
branch nick: dirstate
timestamp: Wed 2007-02-28 15:50:11 -0600
  Add tests that the walkdirs variants work on unicode paths.
-------------- next part --------------
=== modified file 'bzrlib/'
--- a/bzrlib/	2007-02-28 18:13:00 +0000
+++ b/bzrlib/	2007-02-28 21:50:11 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005, 2006 Canonical Ltd
+# Copyright (C) 2005, 2006, 2007 Canonical Ltd
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -1083,27 +1083,27 @@
     pending = []
     _directory = _directory_kind
     _listdir = os.listdir
-    pending = [(prefix, "", _directory, None, top)]
+    pending = [(safe_unicode(prefix), "", _directory, None, safe_unicode(top))]
     while pending:
         dirblock = []
         currentdir = pending.pop()
         # 0 - relpath, 1- basename, 2- kind, 3- stat, 4-toppath
+        relroot = currentdir[0]
         top = currentdir[4]
-        if currentdir[0]:
-            relroot = currentdir[0] + '/'
+        if relroot:
+            relprefix = relroot + u'/'
-            relroot = ""
+            relprefix = ""
+        top_slash = top + u'/'
         for name in sorted(_listdir(top)):
-            abspath = top + '/' + name
+            abspath = top_slash + name
             statvalue = lstat(abspath)
-            dirblock.append((relroot + name, name,
+            dirblock.append((relprefix + name, name,
                 statvalue, abspath))
-        yield (currentdir[0], top), dirblock
+        yield (relroot, top), dirblock
         # push the user specified dirs from dirblock
-        for dir in reversed(dirblock):
-            if dir[2] == _directory:
-                pending.append(dir)
+        pending.extend(d for d in reversed(dirblock) if d[2] == _directory)
 def _walkdirs_utf8(top, prefix=""):
@@ -1112,27 +1112,39 @@
     This yields the same information as walkdirs() only each entry is yielded
     in utf-8. On platforms which have a filesystem encoding of utf8 the paths
     are returned as exact byte-strings.
+    :return: yields a tuple of (dir_info, [file_info])
+        dir_info is (utf8_relpath, path-from-top)
+        file_info is (utf8_relpath, utf8_name, kind, lstat, path-from-top)
+        if top is an absolute path, path-from-top is also an absolute path.
+        path-from-top might be unicode or utf8, but it is the correct path to
+        pass to os functions to affect the file in question. (such as os.lstat)
+    """
+    fs_encoding = sys.getfilesystemencoding()
+    if (sys.platform == 'win32' or
+        fs_encoding not in ('UTF-8', 'US-ASCII', 'ANSI_X3.4-1968')): # ascii
+        return _walkdirs_unicode_to_utf8(top, prefix=prefix)
+    else:
+        return _walkdirs_fs_utf8(top, prefix=prefix)
+def _walkdirs_fs_utf8(top, prefix=""):
+    """See _walkdirs_utf8.
+    This sub-function is called when we know the filesystem is already in utf8
+    encoding. So we don't need to transcode filenames.
     _lstat = os.lstat
     pending = []
     _directory = _directory_kind
     _listdir = os.listdir
     _kind_from_mode = file_kind_from_stat_mode
-    if sys.platform == 'win32':
-        # We need to do the listdir using unicode paths, and then encode them
-        # into utf8.
-        assert False, 'not supported yet'
-    if sys.getfilesystemencoding() not in ('UTF-8', 'US-ASCII',
-                                           'ANSI_X3.4-1968'): # ascii
-        assert False, 'not supported yet'
     # TODO: make these assert instead
     if isinstance(top, unicode):
         top = top.encode('utf8')
     if isinstance(prefix, unicode):
         prefix = prefix.encode('utf8')
-    # The in-memory dirblocks should always have a prefix ending in '/'
-    # unless the prefix is '' then it should not have a trailing slash
     pending = [(prefix, top)]
     while pending:
         relroot, top = pending.pop()
@@ -1147,27 +1159,50 @@
             abspath = top_slash + name
             statvalue = _lstat(abspath)
             kind = _kind_from_mode(statvalue.st_mode)
-            dirblock.append((rel_prefix + name, name, kind, statvalue, abspath))
-        # 0 - relpath, 1- basename, 2- kind, 3- stat, 4-toppath
-        ## In list/generator comprehension form. On a 55k entry tree, this form
-        ## takes 1.75s versus 1.8s. So it is saving approx 50ms. Not a huge
-        ## savings, and may not be worth the complexity. And on smaller trees,
-        ## I've seen 115ms here versus 102ms in the for loop. So it isn't
-        ## always a win. This is just left for posterity.
-        # dirblock = [(rel_prefix + name, # relpath
-        #              name,           # basename
-        #              _kind_from_mode(statvalue.st_mode), # kind
-        #              statvalue,      # stat
-        #              abspath)        # path on disk
-        #                for name, abspath, statvalue in
-        #                    ((name, abspath, _lstat(abspath))
-        #                     for name, abspath in
-        #                     ((name, top_slash + name)
-        #                      for name in sorted(_listdir(top))
-        #                     )
-        #                    )
-        #            ]
+            dirblock.append((rel_prefix + name, name,
+                             kind, statvalue, abspath))
+        yield (relroot, top), dirblock
+        # push the user specified dirs from dirblock
+        pending.extend((d[0], d[4])
+                       for d in reversed(dirblock)
+                       if d[2] == _directory)
+def _walkdirs_unicode_to_utf8(top, prefix=""):
+    """See _walkdirs_utf8
+    Because Win32 has a Unicode api, all of the 'path-from-top' entries will be
+    Unicode paths.
+    This is currently the fallback code path when the filesystem encoding is
+    not UTF-8. It may be better to implement an alternative so that we can
+    safely handle paths that are not properly decodable in the current
+    encoding.
+    """
+    _utf8_encode = codecs.getencoder('utf8')
+    _lstat = os.lstat
+    pending = []
+    _directory = _directory_kind
+    _listdir = os.listdir
+    _kind_from_mode = file_kind_from_stat_mode
+    pending = [(safe_utf8(prefix), safe_unicode(top))]
+    while pending:
+        relroot, top = pending.pop()
+        if relroot == '':
+            rel_prefix = ''
+        else:
+            rel_prefix = relroot + '/'
+        top_slash = top + u'/'
+        # In plain for loop form
+        dirblock = []
+        for name in sorted(_listdir(top)):
+            name_utf8 = _utf8_encode(name)[0]
+            abspath = top_slash + name
+            statvalue = _lstat(abspath)
+            kind = _kind_from_mode(statvalue.st_mode)
+            dirblock.append((rel_prefix + name_utf8, name_utf8,
+                             kind, statvalue, abspath))
         yield (relroot, top), dirblock
         # push the user specified dirs from dirblock
         pending.extend((d[0], d[4])

=== modified file 'bzrlib/tests/'
--- a/bzrlib/tests/	2007-02-28 18:13:00 +0000
+++ b/bzrlib/tests/	2007-02-28 21:50:11 +0000
@@ -605,6 +605,170 @@
             [(dirinfo, [line[0:3] for line in block]) for dirinfo, block in result])
+    def _filter_out_stat(self, result):
+        """Drop the stat value from every entry of a walkdirs result.
+
+        Each dirblock list in ``result`` is rewritten in place: entries of
+        the form (relpath, basename, kind, stat, abspath) become
+        (relpath, basename, kind, abspath).
+        """
+        for _dirdetail, entries in result:
+            # Slice-assign so the list objects held by the caller are updated;
+            # index 3 (the stat) is the only field left out.
+            entries[:] = [(entry[0], entry[1], entry[2], entry[4])
+                          for entry in entries]
+    def test_unicode_walkdirs(self):
+        """walkdirs() reports unicode paths for a tree of non-ascii names.
+
+        The tree is walked once from '.' and once from the subdirectory
+        (passing the subdirectory name as prefix); the second walk must
+        yield the same blocks as the first, minus the top-level block.
+        """
+        file_a = u'0file-\xb6'
+        subdir = u'1dir-\u062c\u0648'
+        file_b = u'2file-\u0633'
+        nested_file = subdir + '/' + file_a
+        nested_dir = subdir + '/' + subdir
+        try:
+            self.build_tree([
+                file_a,
+                subdir + '/',
+                nested_file,
+                nested_dir + '/',
+                file_b,
+                ])
+        except UnicodeError:
+            raise TestSkipped('Could not represent Unicode chars'
+                              ' in current encoding.')
+        # One expected (dir_info, entries) pair per directory, in walk order.
+        top_block = (
+            (u'', u'.'),
+            [(file_a, file_a, 'file', './' + file_a),
+             (subdir, subdir, 'directory', './' + subdir),
+             (file_b, file_b, 'file', './' + file_b),
+            ])
+        subdir_block = (
+            (subdir, './' + subdir),
+            [(nested_file, file_a, 'file', './' + nested_file),
+             (nested_dir, subdir, 'directory', './' + nested_dir),
+            ])
+        nested_block = ((nested_dir, './' + nested_dir), [])
+        expected_dirblocks = [top_block, subdir_block, nested_block]
+        # Walk from the root of the tree.
+        result = list(osutils.walkdirs('.'))
+        self._filter_out_stat(result)
+        self.assertEqual(expected_dirblocks, result)
+        # Walk again starting inside the subdirectory.
+        result = list(osutils.walkdirs(u'./'+subdir, subdir))
+        self._filter_out_stat(result)
+        self.assertEqual(expected_dirblocks[1:], result)
+    def test_unicode__walkdirs_utf8(self):
+        """Walkdirs_utf8 should always return utf8 paths.
+
+        The relative paths and basenames must be utf8 byte strings. The
+        abspath portion might be in unicode or utf-8; when it is unicode it
+        is encoded to utf8 here so a single expected value can be used.
+        """
+        name0 = u'0file-\xb6'
+        name1 = u'1dir-\u062c\u0648'
+        name2 = u'2file-\u0633'
+        tree = [
+            name0,
+            name1 + '/',
+            name1 + '/' + name0,
+            name1 + '/' + name1 + '/',
+            name2,
+            ]
+        try:
+            self.build_tree(tree)
+        except UnicodeError:
+            raise TestSkipped('Could not represent Unicode chars'
+                              ' in current encoding.')
+        # From here on the names are the utf8-encoded byte strings.
+        name0 = name0.encode('utf8')
+        name1 = name1.encode('utf8')
+        name2 = name2.encode('utf8')
+        expected_dirblocks = [
+                (('', '.'),
+                 [(name0, name0, 'file', './' + name0),
+                  (name1, name1, 'directory', './' + name1),
+                  (name2, name2, 'file', './' + name2),
+                 ]
+                ),
+                ((name1, './' + name1),
+                 [(name1 + '/' + name0, name0, 'file', './' + name1
+                                                        + '/' + name0),
+                  (name1 + '/' + name1, name1, 'directory', './' + name1
+                                                            + '/' + name1),
+                 ]
+                ),
+                ((name1 + '/' + name1, './' + name1 + '/' + name1),
+                 [
+                 ]
+                ),
+            ]
+        result = []
+        # For ease in testing, if walkdirs_utf8 returns Unicode, assert that
+        # all abspaths are Unicode, and encode them back into utf8.
+        for dirdetail, dirblock in osutils._walkdirs_utf8('.'):
+            self.assertIsInstance(dirdetail[0], str)
+            if isinstance(dirdetail[1], unicode):
+                # dirdetail and the dirblock entries are yielded as tuples,
+                # so item assignment would raise TypeError. Rebuild
+                # dirdetail and work on list copies of the entries instead.
+                dirdetail = (dirdetail[0], dirdetail[1].encode('utf8'))
+                dirblock = [list(info) for info in dirblock]
+                for info in dirblock:
+                    self.assertIsInstance(info[4], unicode)
+                    info[4] = info[4].encode('utf8')
+            new_dirblock = []
+            for info in dirblock:
+                self.assertIsInstance(info[0], str)
+                self.assertIsInstance(info[1], str)
+                self.assertIsInstance(info[4], str)
+                # Remove the stat information
+                new_dirblock.append((info[0], info[1], info[2], info[4]))
+            result.append((dirdetail, new_dirblock))
+        self.assertEqual(expected_dirblocks, result)
+    def test_unicode__walkdirs_unicode_to_utf8(self):
+        """_walkdirs_unicode_to_utf8 should be a safe fallback everywhere.
+
+        Relative paths and basenames are expected as utf8 byte strings,
+        while the abspath portion is expected in unicode.
+        """
+        file_a_u = u'0file-\xb6'
+        subdir_u = u'1dir-\u062c\u0648'
+        file_b_u = u'2file-\u0633'
+        nested_file_u = subdir_u + '/' + file_a_u
+        nested_dir_u = subdir_u + '/' + subdir_u
+        try:
+            self.build_tree([
+                file_a_u,
+                subdir_u + '/',
+                nested_file_u,
+                nested_dir_u + '/',
+                file_b_u,
+                ])
+        except UnicodeError:
+            raise TestSkipped('Could not represent Unicode chars'
+                              ' in current encoding.')
+        # utf8 forms, used for the relative paths and basenames.
+        file_a = file_a_u.encode('utf8')
+        subdir = subdir_u.encode('utf8')
+        file_b = file_b_u.encode('utf8')
+        nested_file = subdir + '/' + file_a
+        nested_dir = subdir + '/' + subdir
+        # One expected (dir_info, entries) pair per directory, in walk order;
+        # the last field of every entry is the unicode abspath.
+        top_block = (
+            ('', '.'),
+            [(file_a, file_a, 'file', './' + file_a_u),
+             (subdir, subdir, 'directory', './' + subdir_u),
+             (file_b, file_b, 'file', './' + file_b_u),
+            ])
+        subdir_block = (
+            (subdir, './' + subdir_u),
+            [(nested_file, file_a, 'file', './' + nested_file_u),
+             (nested_dir, subdir, 'directory', './' + nested_dir_u),
+            ])
+        nested_block = ((nested_dir, './' + nested_dir_u), [])
+        expected_dirblocks = [top_block, subdir_block, nested_block]
+        result = list(osutils._walkdirs_unicode_to_utf8('.'))
+        self._filter_out_stat(result)
+        self.assertEqual(expected_dirblocks, result)
     def assertPathCompare(self, path_less, path_greater):
         """check that path_less and path_greater compare correctly."""
         self.assertEqual(0, osutils.compare_paths_prefix_order(

More information about the bazaar-commits mailing list