Rev 2433: Add a _walkdirs_utf8 which returns utf8 paths instead of Unicode. Approx 20% faster in walking utf8 filesystems in http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate

John Arbash Meinel john at arbash-meinel.com
Wed Feb 28 18:14:09 GMT 2007


At http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate

------------------------------------------------------------
revno: 2433
revision-id: john at arbash-meinel.com-20070228181300-3hpf53zdxpz16bjg
parent: john at arbash-meinel.com-20070228165719-3q5bxnad9klg5tzz
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Wed 2007-02-28 12:13:00 -0600
message:
  Add a _walkdirs_utf8 which returns utf8 paths instead of Unicode. Approx 20% faster in walking utf8 filesystems
modified:
  bzrlib/osutils.py              osutils.py-20050309040759-eeaff12fbf77ac86
  bzrlib/tests/test_osutils.py   test_osutils.py-20051201224856-e48ee24c12182989
-------------- next part --------------
=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py	2007-02-28 16:57:19 +0000
+++ b/bzrlib/osutils.py	2007-02-28 18:13:00 +0000
@@ -1106,6 +1106,75 @@
                 pending.append(dir)
 
 
+def _walkdirs_utf8(top, prefix=""):
+    """Yield data about all the directories in a tree, as utf-8 paths.
+
+    This yields the same information as walkdirs() only each entry is yielded
+    in utf-8. On platforms which have a filesystem encoding of utf8 the paths
+    are returned as exact byte-strings.
+
+    :param top: Directory on disk to walk; unicode is encoded to utf-8.
+    :param prefix: Relative path reported for top; unicode is encoded too.
+    :return: A generator of ((relroot, top), dirblock) pairs. Each dirblock
+        is a list, sorted by name, of
+        (relpath, basename, kind, lstat value, path on disk) tuples.
+        Directories the caller removes from a dirblock are not walked.
+    :raises AssertionError: On win32, or when the filesystem encoding is
+        neither utf-8 nor ascii; neither case is supported yet.
+    """
+    _lstat = os.lstat
+    _directory = _directory_kind
+    _listdir = os.listdir
+    _kind_from_mode = file_kind_from_stat_mode
+    # Raise rather than 'assert False': asserts are stripped under python -O.
+    if sys.platform == 'win32':
+        # We need to do the listdir using unicode paths, and then encode them
+        # into utf8.
+        raise AssertionError('not supported yet')
+    if sys.getfilesystemencoding() not in ('UTF-8', 'US-ASCII',
+                                           'ANSI_X3.4-1968'): # ascii
+        raise AssertionError('not supported yet')
+    # TODO: make these assert instead
+    if isinstance(top, unicode):
+        top = top.encode('utf8')
+    if isinstance(prefix, unicode):
+        prefix = prefix.encode('utf8')
+
+    # The in-memory dirblocks should always have a prefix ending in '/'
+    # unless the prefix is '' then it should not have a trailing slash
+    pending = [(prefix, top)]
+    while pending:
+        relroot, top = pending.pop()
+        if relroot == '':
+            rel_prefix = ''
+        else:
+            rel_prefix = relroot + '/'
+        top_slash = top + '/'
+        dirblock = []
+        for name in sorted(_listdir(top)):
+            abspath = top_slash + name
+            statvalue = _lstat(abspath)
+            kind = _kind_from_mode(statvalue.st_mode)
+            dirblock.append((rel_prefix + name, name, kind, statvalue, abspath))
+
+        ## A list/generator comprehension form was also measured. On a 55k
+        ## entry tree it takes 1.75s versus 1.8s, saving approx 50ms; on
+        ## smaller trees it took 115ms versus 102ms for the for loop, so it
+        ## isn't always a win. The form is left here just for posterity:
+        # dirblock = [(rel_prefix + name, name,
+        #              _kind_from_mode(statvalue.st_mode), statvalue, abspath)
+        #             for name, abspath, statvalue in
+        #             ((name, abspath, _lstat(abspath)) for name, abspath in
+        #              ((name, top_slash + name)
+        #               for name in sorted(_listdir(top))))]
+        yield (relroot, top), dirblock
+        # push the directories still left in dirblock (0 - relpath, 2 - kind,
+        # 4 - path on disk), reversed so that pop() walks them in sorted order
+        pending.extend((d[0], d[4])
+                       for d in reversed(dirblock)
+                       if d[2] == _directory)
+
+
 def copy_tree(from_path, to_path, handlers={}):
     """Copy all of the entries in from_path into to_path.
 

=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py	2007-02-17 21:17:22 +0000
+++ b/bzrlib/tests/test_osutils.py	2007-02-28 18:13:00 +0000
@@ -559,6 +559,52 @@
         self.assertEqual(expected_dirblocks[1:],
             [(dirinfo, [line[0:3] for line in block]) for dirinfo, block in result])
 
+    def test__walkdirs_utf8(self):
+        # Walk a small tree with _walkdirs_utf8, checking the yielded
+        # dirblocks and that entries deleted from a dirblock stay filtered.
+        tree = [
+            '.bzr',
+            '0file',
+            '1dir/',
+            '1dir/0file',
+            '1dir/1dir/',
+            '2file'
+            ]
+        self.build_tree(tree)
+        expected_dirblocks = [
+                (('', '.'),
+                 [('0file', '0file', 'file'),
+                  ('1dir', '1dir', 'directory'),
+                  ('2file', '2file', 'file'),
+                 ]
+                ),
+                (('1dir', './1dir'),
+                 [('1dir/0file', '0file', 'file'),
+                  ('1dir/1dir', '1dir', 'directory'),
+                 ]
+                ),
+                (('1dir/1dir', './1dir/1dir'), []),
+            ]
+        result = []
+        found_bzrdir = False
+        for dirdetail, dirblock in osutils._walkdirs_utf8('.'):
+            if len(dirblock) and dirblock[0][1] == '.bzr':
+                # this tests the filtering of selected paths
+                found_bzrdir = True
+                del dirblock[0]
+            result.append((dirdetail, dirblock))
+
+        self.assertTrue(found_bzrdir)
+        self.assertEqual(expected_dirblocks,
+            [(dirinfo, [line[0:3] for line in block]) for dirinfo, block in result])
+        # you can search a subdir only, with a supplied prefix.
+        # Use _walkdirs_utf8 here as well so the function under test is covered.
+        result = []
+        for dirblock in osutils._walkdirs_utf8('./1dir', '1dir'):
+            result.append(dirblock)
+        self.assertEqual(expected_dirblocks[1:],
+            [(dirinfo, [line[0:3] for line in block]) for dirinfo, block in result])
+
     def assertPathCompare(self, path_less, path_greater):
         """check that path_less and path_greater compare correctly."""
         self.assertEqual(0, osutils.compare_paths_prefix_order(



More information about the bazaar-commits mailing list