Rev 3701: Streamline _walkdirs_utf8 for utf8 file systems, reducing time to traverse a mozilla tree from 1s to .6 seconds. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/readdir
Robert Collins
robertc at robertcollins.net
Wed Sep 10 08:43:09 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/readdir
------------------------------------------------------------
revno: 3701
revision-id: robertc at robertcollins.net-20080910074259-penkn0htr03wquai
parent: john at arbash-meinel.com-20080909161809-z0so92b8wlgy2tx4
committer: Robert Collins <robertc at robertcollins.net>
branch nick: readdir
timestamp: Wed 2008-09-10 17:42:59 +1000
message:
Streamline _walkdirs_utf8 for utf8 file systems, reducing time to traverse a mozilla tree from 1s to .6 seconds. (Robert Collins)
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/_readdir_py.py readdir.py-20060609152855-rm6v321vuaqyh9tu-3
bzrlib/_readdir_pyx.pyx readdir.pyx-20060609152855-rm6v321vuaqyh9tu-1
bzrlib/lsprof.py lsprof.py-20051208071030-833790916798ceed
bzrlib/osutils.py osutils.py-20050309040759-eeaff12fbf77ac86
bzrlib/tests/test_osutils.py test_osutils.py-20051201224856-e48ee24c12182989
=== modified file 'NEWS'
--- a/NEWS 2008-09-09 01:53:03 +0000
+++ b/NEWS 2008-09-10 07:42:59 +0000
@@ -45,6 +45,15 @@
SFTP transport a little bit faster (~20%) and use a bit less memory.
(John Arbash Meinel)
+ * ``status`` on large trees is now faster, due to optimisations in the
+ walkdirs code. Of particular note, the walkdirs code now performs
+ a temporary ``chdir()`` while reading a single directory; if your
+ platform has non thread-local current working directories (and is
+ not windows which has its own implementation), this may introduce a
+ race condition during concurrent uses of bzrlib. The bzrlib CLI
+ will not encounter this as it is single threaded for working tree
+ operations. (Robert Collins)
+
* When reading index files, if we happen to read the whole file in a
single request treat it as a ``_buffer_all`` request. This happens
most often on small indexes over remote transports, where we default
=== modified file 'bzrlib/_readdir_py.py'
--- a/bzrlib/_readdir_py.py 2008-09-02 04:45:47 +0000
+++ b/bzrlib/_readdir_py.py 2008-09-10 07:42:59 +0000
@@ -18,16 +18,35 @@
import os
-
-
-def read_dir(path):
- """Like os.listdir, this reads the contents of a directory.
-
- There is a C module which is recommended which will return
- a sort key in the first element of the tuple to allow slightly
- more efficient behaviour on the operating systems part.
-
- :param path: the directory to list.
- :return: a list of (None, basename) tuples.
+import stat
+
+
+_directory = 'directory'
+_chardev = 'chardev'
+_block = 'block'
+_file = 'file'
+_fifo = 'fifo'
+_symlink = 'symlink'
+_socket = 'socket'
+_unknown = 'unknown'
+
+_formats = {
+ stat.S_IFDIR:'directory',
+ stat.S_IFCHR:'chardev',
+ stat.S_IFBLK:'block',
+ stat.S_IFREG:'file',
+ stat.S_IFIFO:'fifo',
+ stat.S_IFLNK:'symlink',
+ stat.S_IFSOCK:'socket',
+}
+
+
+def _kind_from_mode(stat_mode, _formats=_formats, _unknown='unknown'):
+ """Generate a file kind from a stat mode. This is used in walkdirs.
+
+ Its performance is critical: Do not mutate without careful benchmarking.
"""
- return [(None, name) for name in os.listdir(path)]
+ try:
+ return _formats[stat_mode & 0170000]
+ except KeyError:
+ return _unknown
=== modified file 'bzrlib/_readdir_pyx.pyx'
--- a/bzrlib/_readdir_pyx.pyx 2008-09-02 04:45:47 +0000
+++ b/bzrlib/_readdir_pyx.pyx 2008-09-10 07:42:59 +0000
@@ -29,9 +29,51 @@
int errno
char *strerror(int errno)
+cdef extern from 'unistd.h':
+ int chdir(char *path)
+ char *getcwd(char *, int size)
+
+cdef extern from 'stdlib.h':
+ void *malloc(int)
+ void free(void *)
+
+cdef extern from 'sys/stat.h':
+ cdef struct stat:
+ int st_mode
+ int st_size
+ int st_dev
+ int st_ino
+ int st_mtime
+ int st_ctime
+ int lstat(char *path, stat *buf)
+ int S_ISDIR(int mode)
+ int S_ISCHR(int mode)
+ int S_ISBLK(int mode)
+ int S_ISREG(int mode)
+ int S_ISFIFO(int mode)
+ int S_ISLNK(int mode)
+ int S_ISSOCK(int mode)
+
+
cdef extern from 'sys/types.h':
ctypedef long ssize_t
ctypedef unsigned long size_t
+ ctypedef long time_t
+
+
+cdef extern from 'Python.h':
+ char * PyString_AS_STRING(object)
+ int PyOS_snprintf(char *str, size_t size, char *format, ...)
+ ctypedef int Py_ssize_t # Required for older pyrex versions
+ Py_ssize_t PyString_Size(object s)
+ object PyList_GetItem(object lst, Py_ssize_t index)
+ void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
+ int PyList_Append(object lst, object item) except -1
+ void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
+ int PyTuple_SetItem(void *, Py_ssize_t pos, object item) except -1
+ void Py_INCREF(object o)
+ void Py_DECREF(object o)
+
cdef extern from 'dirent.h':
int DT_UNKNOWN
@@ -61,6 +103,7 @@
_symlink = 'symlink'
_socket = 'socket'
_unknown = 'unknown'
+_missing = 'missing'
dot = ord('.')
@@ -68,7 +111,140 @@
cdef extern from 'readdir.h':
pass
-def read_dir(path):
+
+cdef class _Stat:
+ """Represent a 'stat' result."""
+
+ cdef readonly int st_mode
+ # nanosecond time definitions use MACROS, fucking up te ability to have a
+ # variable called st_mtime. Yay Ulrich, thanks a lot.
+ cdef readonly time_t _ctime
+ cdef readonly time_t _mtime
+ # cdef readonly double st_atime
+ cdef readonly int st_size
+
+ cdef readonly int st_dev
+ cdef readonly int st_ino
+
+ property st_mtime:
+ def __get__(self):
+ return self._mtime
+
+ property st_ctime:
+ def __get__(self):
+ return self._ctime
+
+ def __repr__(self):
+ """Repr is the same as a Stat object.
+
+ (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime)
+ """
+ #return repr((self.st_mode, 0, 0, 0, 0, 0, self.st_size, self.st_atime,
+ # self.st_mtime, self.st_ctime))
+ return repr((self.st_mode, 0, 0, 0, 0, 0, self.st_size, None,
+ self._mtime, self._ctime))
+
+
+from bzrlib import osutils
+
+
+cdef class UTF8DirReader:
+ """A dir reader for utf8 file systems."""
+
+ cdef readonly object _safe_utf8
+ cdef _directory, _chardev, _block, _file, _fifo, _symlink
+ cdef _socket, _unknown
+
+ def __init__(self):
+ self._safe_utf8 = osutils.safe_utf8
+ self._directory = _directory
+ self._chardev = _chardev
+ self._block = _block
+ self._file = _file
+ self._fifo = _fifo
+ self._symlink = _symlink
+ self._socket = _socket
+ self._unknown = _unknown
+
+ def kind_from_mode(self, int mode):
+ """Get the kind of a path from a mode status."""
+ return self._kind_from_mode(mode)
+
+ cdef _kind_from_mode(self, int mode):
+ # in order of frequency:
+ if S_ISREG(mode):
+ return self._file
+ if S_ISDIR(mode):
+ return self._directory
+ if S_ISCHR(mode):
+ return self._chardev
+ if S_ISBLK(mode):
+ return self._block
+ if S_ISLNK(mode):
+ return self._symlink
+ if S_ISFIFO(mode):
+ return self._fifo
+ if S_ISSOCK(mode):
+ return self._socket
+ return self._unknown
+
+ def top_prefix_to_starting_dir(self, top, prefix=""):
+ """See DirReader.top_prefix_to_starting_dir."""
+ return (self._safe_utf8(prefix), None, None, None,
+ self._safe_utf8(top))
+
+ def read_dir(self, prefix, top):
+ """Read a single directory from a utf8 file system.
+
+ All paths in and out are utf8.
+
+ This sub-function is called when we know the filesystem is already in utf8
+ encoding. So we don't need to transcode filenames.
+
+ See DirReader.read_dir for details.
+ """
+ #cdef char *_prefix = prefix
+ #cdef char *_top = top
+ # Use C accelerated directory listing.
+ cdef object newval
+ cdef int index
+ cdef int length
+ cdef void * atuple
+ cdef object name
+
+
+ if PyString_Size(prefix):
+ relprefix = prefix + '/'
+ else:
+ relprefix = ''
+ top_slash = top + '/'
+
+ # read_dir supplies in should-stat order.
+ # for _, name in sorted(_listdir(top)):
+ result = _read_dir(top)
+ length = len(result)
+ # result.sort()
+ for index from 0 <= index < length:
+ atuple = PyList_GetItem_object_void(result, index)
+ name = <object>PyTuple_GetItem_void_void(atuple, 1)
+ # We have inode, name, None, statvalue, None
+ # inode -> path_from_top
+ newval = relprefix + name
+ Py_INCREF(newval)
+ PyTuple_SetItem(atuple, 0, newval)
+ # None -> kind
+ newval = self._kind_from_mode(
+ (<_Stat>PyTuple_GetItem_void_void(atuple, 3)).st_mode)
+ Py_INCREF(newval)
+ PyTuple_SetItem(atuple, 2, newval)
+ # none -> abspath # perhaps only do if its a dir?
+ newval = top_slash + name
+ Py_INCREF(newval)
+ PyTuple_SetItem(atuple, 4, newval)
+ return result
+
+
+cdef _read_dir(path):
"""Like os.listdir, this reads the contents of a directory.
:param path: the directory to list.
@@ -80,9 +256,28 @@
cdef dirent * entry
cdef dirent sentinel
cdef char *name
- the_dir = opendir(path)
+ cdef int stat_result
+ cdef stat st
+ cdef _Stat statvalue
+ cdef char * abspath
+ cdef char * pathstart
+ cdef int pathlen
+ cdef int snprintf_result
+ cdef char *cwd
+
+ cwd = getcwd(NULL, 0)
+ if -1 == chdir(path):
+ raise OSError(errno, strerror(errno))
+ the_dir = opendir(".")
if NULL == the_dir:
raise OSError(errno, strerror(errno))
+ pathlen = len(path)
+ # allow 1024 bytes of name. (+ '/' + '\x00' to terminate).
+ abspath = <char *>malloc(pathlen + 1026)
+ snprintf_result = PyOS_snprintf(abspath, pathlen + 2, "%s/", PyString_AS_STRING(path))
+ if snprintf_result < 0:
+ raise OSError(errno, strerror(errno))
+ pathstart = abspath + pathlen + 1
result = []
try:
entry = &sentinel
@@ -107,8 +302,41 @@
(name[1] == 0) or
(name[1] == dot and name[2] == 0))
):
- result.append((entry.d_ino, entry.d_name))
+ snprintf_result = PyOS_snprintf(pathstart, 1025, "%s", name)
+ if snprintf_result < 0:
+ raise OSError(errno, strerror(errno))
+ stat_result = lstat(entry.d_name, &st)
+ # stat_result = lstat(abspath, &st)
+ if stat_result != 0:
+ if errno != ENOENT:
+ raise OSError(errno, strerror(errno))
+ else:
+ kind = _missing
+ statvalue = None
+ else:
+ statvalue = _Stat()
+ statvalue.st_mode = st.st_mode
+ statvalue._ctime = st.st_ctime
+ statvalue._mtime = st.st_mtime
+ # statvalue.st_atime = st.st_atime
+ statvalue.st_size = st.st_size
+ statvalue.st_ino = st.st_ino
+ statvalue.st_dev = st.st_dev
+ # We append a 5-tuple that can be modified in-place by the C
+ # api:
+ # inode to sort on and replace with top_path
+ # name to keep
+ # kind
+ # statvalue
+ # abspath
+ PyList_Append(result, (entry.d_ino, entry.d_name, None,
+ statvalue, None))
finally:
+ free(abspath)
+ if -1 == chdir(cwd):
+ free(cwd)
+ raise OSError(errno, strerror(errno))
+ free(cwd)
if -1 == closedir(the_dir):
raise OSError(errno, strerror(errno))
return result
=== modified file 'bzrlib/lsprof.py'
--- a/bzrlib/lsprof.py 2008-07-09 19:57:36 +0000
+++ b/bzrlib/lsprof.py 2008-09-10 07:42:59 +0000
@@ -27,7 +27,10 @@
def profile(f, *args, **kwds):
- """XXX docstring"""
+ """Run a function profile.
+
+ :return: The functions return value and a stats object.
+ """
global _g_threadmap
p = Profiler()
p.enable(subcalls=True)
=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py 2008-09-09 01:53:03 +0000
+++ b/bzrlib/osutils.py 2008-09-10 07:42:59 +0000
@@ -123,37 +123,6 @@
_directory_kind = 'directory'
-_formats = {
- stat.S_IFDIR:_directory_kind,
- stat.S_IFCHR:'chardev',
- stat.S_IFBLK:'block',
- stat.S_IFREG:'file',
- stat.S_IFIFO:'fifo',
- stat.S_IFLNK:'symlink',
- stat.S_IFSOCK:'socket',
-}
-
-
-def file_kind_from_stat_mode(stat_mode, _formats=_formats, _unknown='unknown'):
- """Generate a file kind from a stat mode. This is used in walkdirs.
-
- Its performance is critical: Do not mutate without careful benchmarking.
- """
- try:
- return _formats[stat_mode & 0170000]
- except KeyError:
- return _unknown
-
-
-def file_kind(f, _lstat=os.lstat, _mapper=file_kind_from_stat_mode):
- try:
- return _mapper(_lstat(f).st_mode)
- except OSError, e:
- if getattr(e, 'errno', None) in (errno.ENOENT, errno.ENOTDIR):
- raise errors.NoSuchFile(f)
- raise
-
-
def get_umask():
"""Return the current umask"""
# Assume that people aren't messing with the umask while running
@@ -1201,7 +1170,7 @@
_lstat = os.lstat
_directory = _directory_kind
_listdir = os.listdir
- _kind_from_mode = _formats.get
+ _kind_from_mode = file_kind_from_stat_mode
pending = [(safe_unicode(prefix), "", _directory, None, safe_unicode(top))]
while pending:
# 0 - relpath, 1- basename, 2- kind, 3- stat, 4-toppath
@@ -1223,7 +1192,7 @@
for name in names:
abspath = top_slash + name
statvalue = _lstat(abspath)
- kind = _kind_from_mode(statvalue.st_mode & 0170000, 'unknown')
+ kind = _kind_from_mode(statvalue.st_mode)
append((relprefix + name, name, kind, statvalue, abspath))
yield (relroot, top), dirblock
@@ -1292,58 +1261,28 @@
# ANSI_X3.4-1968 is a form of ASCII
_selected_dir_reader = UnicodeDirReader()
else:
- _selected_dir_reader = UTF8DirReader()
+ try:
+ from bzrlib._readdir_pyx import UTF8DirReader
+ except ImportError:
+ # No optimised code path
+ _selected_dir_reader = UnicodeDirReader()
+ else:
+ _selected_dir_reader = UTF8DirReader()
# 0 - relpath, 1- basename, 2- kind, 3- stat, 4-toppath
# But we don't actually uses 1-3 in pending, so set them to None
- pending = [_selected_dir_reader.top_prefix_to_starting_dir(top, prefix)]
+ pending = [[_selected_dir_reader.top_prefix_to_starting_dir(top, prefix)]]
read_dir = _selected_dir_reader.read_dir
_directory = _directory_kind
while pending:
- relroot, _, _, _, top = pending.pop()
- dirblock = read_dir(relroot, top)
+ relroot, _, _, _, top = pending[-1].pop()
+ if not pending[-1]:
+ pending.pop()
+ dirblock = sorted(read_dir(relroot, top))
yield (relroot, top), dirblock
# push the user specified dirs from dirblock
- pending.extend(d for d in reversed(dirblock) if d[2] == _directory)
-
-
-class UTF8DirReader(DirReader):
- """A dir reader for utf8 file systems."""
-
- def top_prefix_to_starting_dir(self, top, prefix=""):
- """See DirReader.top_prefix_to_starting_dir."""
- return (safe_utf8(prefix), None, None, None, safe_utf8(top))
-
- def read_dir(self, prefix, top):
- """Read a single directory from a utf8 file system.
-
- All paths in and out are utf8.
-
- This sub-function is called when we know the filesystem is already in utf8
- encoding. So we don't need to transcode filenames.
-
- See DirReader.read_dir for details.
- """
- _lstat = os.lstat
- # Use C accelerated directory listing.
- _listdir = _read_dir
- _kind_from_mode = _formats.get
-
- if prefix:
- relprefix = prefix + '/'
- else:
- relprefix = ''
- top_slash = top + '/'
-
- dirblock = []
- append = dirblock.append
- # read_dir supplies in should-stat order.
- for _, name in sorted(_listdir(top)):
- abspath = top_slash + name
- statvalue = _lstat(abspath)
- kind = _kind_from_mode(statvalue.st_mode & 0170000, 'unknown')
- append((relprefix + name, name, kind, statvalue, abspath))
- dirblock.sort()
- return dirblock
+ next = [d for d in reversed(dirblock) if d[2] == _directory]
+ if next:
+ pending.append(next)
class UnicodeDirReader(DirReader):
@@ -1374,7 +1313,7 @@
_utf8_encode = self._utf8_encode
_lstat = os.lstat
_listdir = os.listdir
- _kind_from_mode = _formats.get
+ _kind_from_mode = file_kind_from_stat_mode
if prefix:
relprefix = prefix + '/'
@@ -1388,7 +1327,7 @@
name_utf8 = _utf8_encode(name)[0]
abspath = top_slash + name
statvalue = _lstat(abspath)
- kind = _kind_from_mode(statvalue.st_mode & 0170000, 'unknown')
+ kind = _kind_from_mode(statvalue.st_mode)
append((relprefix + name_utf8, name_utf8, kind, statvalue, abspath))
return dirblock
@@ -1608,7 +1547,26 @@
return open(filename, 'rU').read()
-try:
- from bzrlib._readdir_pyx import read_dir as _read_dir
-except ImportError:
- from bzrlib._readdir_py import read_dir as _read_dir
+def file_kind_from_stat_mode_thunk(mode):
+ global file_kind_from_stat_mode
+ if file_kind_from_stat_mode is file_kind_from_stat_mode_thunk:
+ try:
+ from bzrlib._readdir_pyx import UTF8DirReader
+ file_kind_from_stat_mode = UTF8DirReader().kind_from_mode
+ except ImportError:
+ from bzrlib._readdir_py import (
+ _kind_from_mode as _file_kind_from_stat_mode
+ )
+ return file_kind_from_stat_mode(mode)
+file_kind_from_stat_mode = file_kind_from_stat_mode_thunk
+
+
+def file_kind(f, _lstat=os.lstat):
+ try:
+ return file_kind_from_stat_mode(_lstat(f).st_mode)
+ except OSError, e:
+ if getattr(e, 'errno', None) in (errno.ENOENT, errno.ENOTDIR):
+ raise errors.NoSuchFile(f)
+ raise
+
+
=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py 2008-09-09 16:18:09 +0000
+++ b/bzrlib/tests/test_osutils.py 2008-09-10 07:42:59 +0000
@@ -57,25 +57,12 @@
from bzrlib.tests.test__walkdirs_win32 import Win32ReadDirFeature
-def load_tests(standard_tests, module, loader):
- """Parameterize readdir tests."""
- to_adapt, result = split_suite_by_re(standard_tests, "readdir")
- adapter = TestScenarioApplier()
- from bzrlib import _readdir_py
- adapter.scenarios = [('python', {'read_dir': _readdir_py.read_dir})]
- if ReadDirFeature.available():
- adapter.scenarios.append(('pyrex',
- {'read_dir': ReadDirFeature.read_dir}))
- adapt_tests(to_adapt, adapter, result)
- return result
-
-
-class _ReadDirFeature(Feature):
+class _UTF8DirReaderFeature(Feature):
def _probe(self):
try:
from bzrlib import _readdir_pyx
- self.read_dir = _readdir_pyx.read_dir
+ self.reader = _readdir_pyx.UTF8DirReader
return True
except ImportError:
return False
@@ -83,7 +70,7 @@
def feature_name(self):
return 'bzrlib._readdir_pyx'
-ReadDirFeature = _ReadDirFeature()
+UTF8DirReaderFeature = _UTF8DirReaderFeature()
class TestOSUtils(TestCaseInTempDir):
@@ -779,38 +766,6 @@
class TestWalkDirs(TestCaseInTempDir):
- def test_readdir(self):
- tree = [
- '.bzr/',
- '0file',
- '1dir/',
- '1dir/0file',
- '1dir/1dir/',
- '2file'
- ]
- self.build_tree(tree)
- expected_names = ['.bzr', '0file', '1dir', '2file']
- # read_dir returns pairs, which form a table with either None in all
- # the first columns, or a sort key to get best on-disk-read order,
- # and the disk path name in utf-8 encoding in the second column.
- read_result = self.read_dir('.')
- # The second column is always the names, and every name except "." and
- # ".." should be present.
- names = sorted([row[1] for row in read_result])
- self.assertEqual(expected_names, names)
- expected_sort_key = None
- if read_result[0][0] is None:
- # No sort key returned - all keys must None
- operator = self.assertEqual
- else:
- # A sort key in the first row implies sort keys in the other rows.
- operator = self.assertNotEqual
- for row in read_result:
- operator(None, row[0])
-
- def test_compiled_extension_exists(self):
- self.requireFeature(ReadDirFeature)
-
def test_walkdirs(self):
tree = [
'.bzr',
@@ -931,22 +886,25 @@
self.assertIsInstance(osutils._selected_dir_reader, expected)
def test_force_walkdirs_utf8_fs_utf8(self):
+ self.requireFeature(UTF8DirReaderFeature)
self._save_platform_info()
win32utils.winver = None # Avoid the win32 detection code
osutils._fs_enc = 'UTF-8'
- self.assertReadFSDirIs(osutils.UTF8DirReader)
+ self.assertReadFSDirIs(UTF8DirReaderFeature.reader)
def test_force_walkdirs_utf8_fs_ascii(self):
+ self.requireFeature(UTF8DirReaderFeature)
self._save_platform_info()
win32utils.winver = None # Avoid the win32 detection code
osutils._fs_enc = 'US-ASCII'
- self.assertReadFSDirIs(osutils.UTF8DirReader)
+ self.assertReadFSDirIs(UTF8DirReaderFeature.reader)
def test_force_walkdirs_utf8_fs_ANSI(self):
+ self.requireFeature(UTF8DirReaderFeature)
self._save_platform_info()
win32utils.winver = None # Avoid the win32 detection code
osutils._fs_enc = 'ANSI_X3.4-1968'
- self.assertReadFSDirIs(osutils.UTF8DirReader)
+ self.assertReadFSDirIs(UTF8DirReaderFeature.reader)
def test_force_walkdirs_utf8_fs_latin1(self):
self._save_platform_info()
More information about the bazaar-commits
mailing list