Rev 3505: Write an alternative 'walkdirs' implementation that uses win32 apis. in http://bzr.arbash-meinel.com/branches/bzr/1.6-dev/win32_find_files

John Arbash Meinel john at arbash-meinel.com
Thu Jun 26 17:46:33 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.6-dev/win32_find_files

------------------------------------------------------------
revno: 3505
revision-id: john at arbash-meinel.com-20080626164622-s0dpqlxzdybnmcb8
parent: pqm at pqm.ubuntu.com-20080619070027-3xv1vy81m3ix2oup
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: win32_find_files
timestamp: Thu 2008-06-26 11:46:22 -0500
message:
  Write an alternative 'walkdirs' implementation that uses win32 apis.
  
  Basically, calling nt.lstat() lots of times is really slow, when we can get the
  results right away from the FindFiles api.
  In my tests with ~9000 entries, it changes 'bzr status' from 4+s => 1.2s
-------------- next part --------------
=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py	2008-06-11 03:56:46 +0000
+++ b/bzrlib/osutils.py	2008-06-26 16:46:22 +0000
@@ -53,6 +53,13 @@
     )
 """)
 
+try:
+    import win32file
+except:
+    have_win32file = False
+else:
+    have_win32file = True
+
 import bzrlib
 from bzrlib import symbol_versioning
 from bzrlib.symbol_versioning import (
@@ -1190,7 +1197,9 @@
         pass to os functions to affect the file in question. (such as os.lstat)
     """
     fs_encoding = _fs_enc.upper()
-    if (sys.platform == 'win32' or
+    if (have_win32file):
+        return _walkdirs_utf8_win32_find_file(top, prefix=prefix)
+    elif (sys.platform == 'win32' or
         fs_encoding not in ('UTF-8', 'US-ASCII', 'ANSI_X3.4-1968')): # ascii
         return _walkdirs_unicode_to_utf8(top, prefix=prefix)
     else:
@@ -1271,6 +1280,108 @@
         pending.extend(d for d in reversed(dirblock) if d[2] == _directory)
 
 
+class _Win32Stat(object):
+    """Represent a 'stat' result generated from WIN32_FIND_DATA"""
+
+    __slots__ = ['st_mode', 'st_ctime', 'st_mtime', 'st_atime',
+                 'st_size']
+
+    # os.stat always reports 0 for these two fields, so we hard code them here
+    st_dev = 0
+    st_ino = 0
+
+    def __init__(self, win32_find_data_record):
+        """Create a new Stat object, based on the WIN32_FIND_DATA tuple"""
+        (attrib, ctime, atime, wtime, size_high, size_low,
+         res0, res1, name, alt_name) = win32_find_data_record
+        self.st_ctime = int(ctime)
+        self.st_mtime = int(wtime)
+        self.st_atime = int(atime)
+        self.st_size = (size_high * 1<<32) + size_low
+
+        mode_bits = 0100666 # writeable file, the most common
+        if (win32file.FILE_ATTRIBUTE_READONLY & attrib ==
+            win32file.FILE_ATTRIBUTE_READONLY):
+            mode_bits ^= 0222 # remove writable bits
+        if (win32file.FILE_ATTRIBUTE_DIRECTORY & attrib ==
+            win32file.FILE_ATTRIBUTE_DIRECTORY):
+            # Remove the FILE bit, set the DIR bit, and set the EXEC bits
+            mode_bits ^= 0140111
+        self.st_mode = mode_bits
+
+    def __repr__(self):
+        """Repr is the same as a Stat object.
+
+        (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime)
+        """
+        return repr((self.st_mode, 0, 0, 0, 0, 0, self.st_size, self.st_atime,
+                     self.st_mtime, self.st_ctime))
+
+
+def _walkdirs_utf8_win32_find_file(top, prefix=""):
+    """
+    Walk 'top' with win32file.FindFilesIterator; relative paths are utf8.
+    Because Win32 has a Unicode api, all of the 'path-from-top' entries will be
+    Unicode paths.
+    The dispatch code earlier in this file selects this code path whenever
+    win32file can be imported. Each entry's stat value is built from its
+    WIN32_FIND_DATA record, so no separate lstat() call is made per entry.
+    """
+    import operator
+    _utf8_encode = codecs.getencoder('utf8')
+
+    # WIN32_FIND_DATA object looks like:
+    # (FILE_ATTRIBUTES, createTime, accessTime, writeTime, nFileSizeHigh,
+    #  nFileSizeLow, reserved0, reserved1, name, alternateFilename)
+    _directory = _directory_kind
+    _file = _formats[stat.S_IFREG]
+
+    # Possible attributes:
+    # 
+    # FILE_ATTRIBUTE_ARCHIVE 
+    # FILE_ATTRIBUTE_COMPRESSED 
+    # FILE_ATTRIBUTE_DIRECTORY 
+    # FILE_ATTRIBUTE_HIDDEN 
+    # FILE_ATTRIBUTE_NORMAL 
+    # FILE_ATTRIBUTE_OFFLINE 
+    # FILE_ATTRIBUTE_READONLY 
+    # FILE_ATTRIBUTE_SYSTEM 
+    # FILE_ATTRIBUTE_TEMPORARY 
+    DIRECTORY = win32file.FILE_ATTRIBUTE_DIRECTORY
+    NORMAL = win32file.FILE_ATTRIBUTE_NORMAL
+
+    pending = [(safe_utf8(prefix), None, None, None, safe_unicode(top))]
+    while pending:
+        relroot, _, _, _, top = pending.pop()
+        if relroot:
+            relprefix = relroot + '/'
+        else:
+            relprefix = ''
+        top_slash = top + u'/'
+        top_star = top_slash + u'*'
+
+        dirblock = []
+        append = dirblock.append
+        for record in win32file.FindFilesIterator(top_star):
+            name = record[-2]
+            if name in (u'.', u'..'):
+                continue
+            attrib = record[0]
+            statvalue = _Win32Stat(record)
+            name_utf8 = _utf8_encode(name)[0]
+            abspath = top_slash + name
+            if DIRECTORY & attrib == DIRECTORY:
+                kind = _directory
+            else:
+                kind = _file
+            append((relprefix + name_utf8, name_utf8, kind, statvalue, abspath))
+        dirblock.sort(key=operator.itemgetter(1))
+        yield (relroot, top), dirblock
+
+        # push the subdirs reversed, so that pop() visits them in sorted order
+        pending.extend(d for d in reversed(dirblock) if d[2] == _directory)
+
+
 def copy_tree(from_path, to_path, handlers={}):
     """Copy all of the entries in from_path into to_path.
 

=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py	2008-06-11 03:56:46 +0000
+++ b/bzrlib/tests/test_osutils.py	2008-06-26 16:46:22 +0000
@@ -26,6 +26,7 @@
 from bzrlib import (
     errors,
     osutils,
+    tests,
     win32utils,
     )
 from bzrlib.errors import BzrBadParameterNotUnicode, InvalidURL
@@ -48,6 +49,38 @@
     )
 from cStringIO import StringIO
 
+
+class _Win32FileFeature(tests.Feature):
+    """Test if win32file is available."""
+
+    def _probe(self):
+        try:
+            import win32file
+        except ImportError:
+            return False
+        else:
+            return True
+
+    def feature_name(self):
+        return 'win32file'
+
+Win32FileFeature = _Win32FileFeature()
+
+
+class TestWin32FileFeature(tests.TestCase):
+
+    def test_is_correct(self):
+        try:
+            import win32file
+        except ImportError:
+            self.assertFalse(Win32FileFeature.available())
+        else:
+            self.assertTrue(Win32FileFeature.available())
+
+    def test_name(self):
+        self.assertTrue('win32file' in str(Win32FileFeature))
+
+
 class TestOSUtils(TestCaseInTempDir):
 
     def test_contains_whitespace(self):
@@ -974,6 +1007,49 @@
         self._filter_out_stat(result)
         self.assertEqual(expected_dirblocks, result)
 
+    def test__walkdirs_utf_win32_find_file(self):
+        self.requireFeature(Win32FileFeature)
+        self.requireFeature(tests.UnicodeFilenameFeature)
+        name0u = u'0file-\xb6'
+        name1u = u'1dir-\u062c\u0648'
+        name2u = u'2file-\u0633'
+        tree = [
+            name0u,
+            name1u + '/',
+            name1u + '/' + name0u,
+            name1u + '/' + name1u + '/',
+            name2u,
+            ]
+        self.build_tree(tree)
+        name0 = name0u.encode('utf8')
+        name1 = name1u.encode('utf8')
+        name2 = name2u.encode('utf8')
+
+        # All of the abspaths should be in unicode, all of the relative paths
+        # should be in utf8
+        expected_dirblocks = [
+                (('', '.'),
+                 [(name0, name0, 'file', './' + name0u),
+                  (name1, name1, 'directory', './' + name1u),
+                  (name2, name2, 'file', './' + name2u),
+                 ]
+                ),
+                ((name1, './' + name1u),
+                 [(name1 + '/' + name0, name0, 'file', './' + name1u
+                                                        + '/' + name0u),
+                  (name1 + '/' + name1, name1, 'directory', './' + name1u
+                                                            + '/' + name1u),
+                 ]
+                ),
+                ((name1 + '/' + name1, './' + name1u + '/' + name1u),
+                 [
+                 ]
+                ),
+            ]
+        result = list(osutils._walkdirs_utf8_win32_find_file(u'.'))
+        self._filter_out_stat(result)
+        self.assertEqual(expected_dirblocks, result)
+
     def assertPathCompare(self, path_less, path_greater):
         """check that path_less and path_greater compare correctly."""
         self.assertEqual(0, osutils.compare_paths_prefix_order(



More information about the bazaar-commits mailing list