Rev 2433: Add a _walkdirs_utf8 which returns utf8 paths instead of Unicode. Approx 20% faster in walking utf8 filesystems in http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate
John Arbash Meinel
john at arbash-meinel.com
Wed Feb 28 18:14:09 GMT 2007
At http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate
------------------------------------------------------------
revno: 2433
revision-id: john at arbash-meinel.com-20070228181300-3hpf53zdxpz16bjg
parent: john at arbash-meinel.com-20070228165719-3q5bxnad9klg5tzz
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Wed 2007-02-28 12:13:00 -0600
message:
Add a _walkdirs_utf8 which returns utf8 paths instead of Unicode. Approx 20% faster in walking utf8 filesystems
modified:
bzrlib/osutils.py osutils.py-20050309040759-eeaff12fbf77ac86
bzrlib/tests/test_osutils.py test_osutils.py-20051201224856-e48ee24c12182989
-------------- next part --------------
=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py 2007-02-28 16:57:19 +0000
+++ b/bzrlib/osutils.py 2007-02-28 18:13:00 +0000
@@ -1106,6 +1106,75 @@
pending.append(dir)
+def _walkdirs_utf8(top, prefix=""):
+    """Yield data about all the directories in a tree.
+
+    This yields the same information as walkdirs() only each entry is yielded
+    in utf-8. On platforms which have a filesystem encoding of utf8 the paths
+    are returned as exact byte-strings.
+
+    :param top: The directory on disk to start walking from. A unicode
+        value is encoded to utf8 before use.
+    :param prefix: The relative path to report for ``top``. A unicode value
+        is encoded to utf8 before use. Use '' (the default) for no prefix.
+    :return: A generator of ``((relroot, top), dirblock)`` pairs, one per
+        directory. ``dirblock`` is a list, sorted by basename, of
+        ``(relpath, basename, kind, lstat_value, path_on_disk)`` tuples.
+        Sub-directories are selected from ``dirblock`` only after the
+        caller resumes the generator, so entries the caller removes from
+        ``dirblock`` in place are not descended into.
+    :raises AssertionError: when iteration starts on win32, or when the
+        filesystem encoding is not one of the supported utf8/ascii names.
+    """
+    # Bind the helpers used in the inner loop to local names.
+    _lstat = os.lstat
+    _directory = _directory_kind
+    _listdir = os.listdir
+    _kind_from_mode = file_kind_from_stat_mode
+    # These guards are raised explicitly rather than written as
+    # 'assert False', because 'python -O' strips assert statements and the
+    # walk would then silently run on an unsupported platform. The exception
+    # type is unchanged from the assert form.
+    if sys.platform == 'win32':
+        # We need to do the listdir using unicode paths, and then encode them
+        # into utf8.
+        raise AssertionError('not supported yet')
+    if sys.getfilesystemencoding() not in ('UTF-8', 'US-ASCII',
+                                           'ANSI_X3.4-1968'): # ascii
+        raise AssertionError('not supported yet')
+    # Everything below works on utf8 byte-strings, so encode unicode input
+    # once here instead of mixing string types while building paths.
+    if isinstance(top, unicode):
+        top = top.encode('utf8')
+    if isinstance(prefix, unicode):
+        prefix = prefix.encode('utf8')
+
+    # The in-memory dirblocks should always have a prefix ending in '/'
+    # unless the prefix is '' then it should not have a trailing slash
+    pending = [(prefix, top)]
+    while pending:
+        relroot, top = pending.pop()
+        if relroot == '':
+            rel_prefix = ''
+        else:
+            rel_prefix = relroot + '/'
+        top_slash = top + '/'
+        # In plain for loop form
+        dirblock = []
+        for name in sorted(_listdir(top)):
+            abspath = top_slash + name
+            statvalue = _lstat(abspath)
+            kind = _kind_from_mode(statvalue.st_mode)
+            dirblock.append((rel_prefix + name, name, kind, statvalue, abspath))
+
+        # 0 - relpath, 1- basename, 2- kind, 3- stat, 4-toppath
+        ## In list/generator comprehension form. On a 55k entry tree, this form
+        ## takes 1.75s versus 1.8s. So it is saving approx 50ms. Not a huge
+        ## savings, and may not be worth the complexity. And on smaller trees,
+        ## I've seen 115ms here versus 102ms in the for loop. So it isn't
+        ## always a win. This is just left for posterity.
+        # dirblock = [(rel_prefix + name, # relpath
+        #              name, # basename
+        #              _kind_from_mode(statvalue.st_mode), # kind
+        #              statvalue, # stat
+        #              abspath) # path on disk
+        #                 for name, abspath, statvalue in
+        #                     ((name, abspath, _lstat(abspath))
+        #                      for name, abspath in
+        #                      ((name, top_slash + name)
+        #                       for name in sorted(_listdir(top))
+        #                      )
+        #                     )
+        #            ]
+        yield (relroot, top), dirblock
+        # push the user specified dirs from dirblock
+        # They are pushed in reverse so that pop() visits them in sorted
+        # order.
+        pending.extend((d[0], d[4])
+                       for d in reversed(dirblock)
+                       if d[2] == _directory)
+
+
def copy_tree(from_path, to_path, handlers={}):
"""Copy all of the entries in from_path into to_path.
=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py 2007-02-17 21:17:22 +0000
+++ b/bzrlib/tests/test_osutils.py 2007-02-28 18:13:00 +0000
@@ -559,6 +559,52 @@
self.assertEqual(expected_dirblocks[1:],
[(dirinfo, [line[0:3] for line in block]) for dirinfo, block in result])
+    def test__walkdirs_utf8(self):
+        """_walkdirs_utf8 yields sorted dirblocks and honours a prefix.
+
+        Also checks that an entry removed from a yielded dirblock (here
+        '.bzr') is not descended into.
+        """
+        tree = [
+            '.bzr',
+            '0file',
+            '1dir/',
+            '1dir/0file',
+            '1dir/1dir/',
+            '2file'
+            ]
+        self.build_tree(tree)
+        expected_dirblocks = [
+                (('', '.'),
+                 [('0file', '0file', 'file'),
+                  ('1dir', '1dir', 'directory'),
+                  ('2file', '2file', 'file'),
+                 ]
+                ),
+                (('1dir', './1dir'),
+                 [('1dir/0file', '0file', 'file'),
+                  ('1dir/1dir', '1dir', 'directory'),
+                 ]
+                ),
+                (('1dir/1dir', './1dir/1dir'),
+                 [
+                 ]
+                ),
+            ]
+        result = []
+        found_bzrdir = False
+        for dirdetail, dirblock in osutils._walkdirs_utf8('.'):
+            if len(dirblock) and dirblock[0][1] == '.bzr':
+                # this tests the filtering of selected paths
+                found_bzrdir = True
+                del dirblock[0]
+            result.append((dirdetail, dirblock))
+
+        self.assertTrue(found_bzrdir)
+        # Only relpath, basename and kind are compared; the stat value and
+        # the on-disk path are left out of the comparison.
+        self.assertEqual(expected_dirblocks,
+            [(dirinfo, [line[0:3] for line in block])
+             for dirinfo, block in result])
+        # you can search a subdir only, with a supplied prefix.
+        # This must call _walkdirs_utf8, not walkdirs, otherwise the prefix
+        # handling of the function under test is never exercised.
+        result = []
+        for dirblock in osutils._walkdirs_utf8('./1dir', '1dir'):
+            result.append(dirblock)
+        self.assertEqual(expected_dirblocks[1:],
+            [(dirinfo, [line[0:3] for line in block])
+             for dirinfo, block in result])
+
def assertPathCompare(self, path_less, path_greater):
"""check that path_less and path_greater compare correctly."""
self.assertEqual(0, osutils.compare_paths_prefix_order(
More information about the bazaar-commits
mailing list