Rev 3708: Relocate iter_changes' process_entry method to allow C optimisation. in http://people.ubuntu.com/~robertc/baz2.0/readdir
Robert Collins
robertc at robertcollins.net
Sun Sep 14 12:59:36 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/readdir
------------------------------------------------------------
revno: 3708
revision-id: robertc at robertcollins.net-20080914115929-vtk18rnhg76a38ox
parent: robertc at robertcollins.net-20080914085107-yfw1r5ph5g7eefmi
committer: Robert Collins <robertc at robertcollins.net>
branch nick: process-entry
timestamp: Sun 2008-09-14 21:59:29 +1000
message:
Relocate iter_changes' process_entry method to allow C optimisation.
modified:
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/workingtree_4.py workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2008-09-13 22:10:50 +0000
+++ b/bzrlib/dirstate.py 2008-09-14 11:59:29 +0000
@@ -220,6 +220,7 @@
osutils,
trace,
)
+from bzrlib.osutils import pathjoin
# compile the struct compiler we need, so as to only do it once
@@ -2779,6 +2780,282 @@
update_entry = py_update_entry
+class ProcessEntryPython(object):
+
+ __slots__ = ["old_dirname_to_file_id", "new_dirname_to_file_id", "uninteresting",
+ "last_source_parent", "last_target_parent", "include_unchanged",
+ "use_filesystem_for_exec"]
+
+ def __init__(self, include_unchanged, use_filesystem_for_exec):
+ self.old_dirname_to_file_id = {}
+ self.new_dirname_to_file_id = {}
+ # Just a sentinel, so that _process_entry can say that this
+ # record is handled, but isn't interesting to process (unchanged)
+ self.uninteresting = object()
+ # Using a list so that we can access the values and change them in
+ # place. Each one is [dirname, parent file_id]
+ self.last_source_parent = [None, None]
+ self.last_target_parent = [None, None]
+ self.include_unchanged = include_unchanged
+ self.use_filesystem_for_exec = use_filesystem_for_exec
+
+ def _process_entry(self, entry, path_info, source_index, target_index, state):
+ """Compare an entry and real disk to generate delta information.
+
+ :param path_info: top_relpath, basename, kind, lstat, abspath for
+ the path of entry. If None, then the path is considered absent.
+ (Perhaps we should pass in a concrete entry for this ?)
+ Basename is returned as a utf8 string because we expect this
+ tuple will be ignored, and don't want to take the time to
+ decode.
+ :return: None if these don't match
+ A tuple of information about the change, or
+ the object 'uninteresting' if these match, but are
+ basically identical.
+ """
+ if source_index is None:
+ source_details = NULL_PARENT_DETAILS
+ else:
+ source_details = entry[1][source_index]
+ target_details = entry[1][target_index]
+ target_minikind = target_details[0]
+ if path_info is not None and target_minikind in 'fdlt':
+ if not (target_index == 0):
+ raise AssertionError()
+ link_or_sha1 = update_entry(state, entry,
+ abspath=path_info[4], stat_value=path_info[3])
+ # The entry may have been modified by update_entry
+ target_details = entry[1][target_index]
+ target_minikind = target_details[0]
+ else:
+ link_or_sha1 = None
+ file_id = entry[0][2]
+ source_minikind = source_details[0]
+ if source_minikind in 'fdltr' and target_minikind in 'fdlt':
+ # claimed content in both: diff
+ # r | fdlt | | add source to search, add id path move and perform
+ # | | | diff check on source-target
+ # r | fdlt | a | dangling file that was present in the basis.
+ # | | | ???
+ if source_minikind in 'r':
+ # add the source to the search path to find any children it
+ # has. TODO ? : only add if it is a container ?
+ if not osutils.is_inside_any(searched_specific_files,
+ source_details[1]):
+ search_specific_files.add(source_details[1])
+ # generate the old path; this is needed for stating later
+ # as well.
+ old_path = source_details[1]
+ old_dirname, old_basename = os.path.split(old_path)
+ path = pathjoin(entry[0][0], entry[0][1])
+ old_entry = state._get_entry(source_index,
+ path_utf8=old_path)
+ # update the source details variable to be the real
+ # location.
+ if old_entry == (None, None):
+ raise errors.CorruptDirstate(state._filename,
+ "entry '%s/%s' is considered renamed from %r"
+ " but source does not exist\n"
+ "entry: %s" % (entry[0][0], entry[0][1], old_path, entry))
+ source_details = old_entry[1][source_index]
+ source_minikind = source_details[0]
+ else:
+ old_dirname = entry[0][0]
+ old_basename = entry[0][1]
+ old_path = path = None
+ if path_info is None:
+ # the file is missing on disk, show as removed.
+ content_change = True
+ target_kind = None
+ target_exec = False
+ else:
+ # source and target are both versioned and disk file is present.
+ target_kind = path_info[2]
+ if target_kind == 'directory':
+ if path is None:
+ old_path = path = pathjoin(old_dirname, old_basename)
+ self.new_dirname_to_file_id[path] = file_id
+ if source_minikind != 'd':
+ content_change = True
+ else:
+ # directories have no fingerprint
+ content_change = False
+ target_exec = False
+ elif target_kind == 'file':
+ if source_minikind != 'f':
+ content_change = True
+ else:
+ # We could check the size, but we already have the
+ # sha1 hash.
+ content_change = (link_or_sha1 != source_details[1])
+ # Target details is updated at update_entry time
+ if self.use_filesystem_for_exec:
+ # We don't need S_ISREG here, because we are sure
+ # we are dealing with a file.
+ target_exec = bool(stat.S_IEXEC & path_info[3].st_mode)
+ else:
+ target_exec = target_details[3]
+ elif target_kind == 'symlink':
+ if source_minikind != 'l':
+ content_change = True
+ else:
+ content_change = (link_or_sha1 != source_details[1])
+ target_exec = False
+ elif target_kind == 'tree-reference':
+ if source_minikind != 't':
+ content_change = True
+ else:
+ content_change = False
+ target_exec = False
+ else:
+ raise Exception, "unknown kind %s" % path_info[2]
+ if source_minikind == 'd':
+ if path is None:
+ old_path = path = pathjoin(old_dirname, old_basename)
+ self.old_dirname_to_file_id[old_path] = file_id
+ # source parent id is the file id of old_dirname's entry in the source tree
+ if old_dirname == self.last_source_parent[0]:
+ source_parent_id = self.last_source_parent[1]
+ else:
+ try:
+ source_parent_id = self.old_dirname_to_file_id[old_dirname]
+ except KeyError:
+ source_parent_entry = state._get_entry(source_index,
+ path_utf8=old_dirname)
+ source_parent_id = source_parent_entry[0][2]
+ if source_parent_id == entry[0][2]:
+ # This is the root, so the parent is None
+ source_parent_id = None
+ else:
+ self.last_source_parent[0] = old_dirname
+ self.last_source_parent[1] = source_parent_id
+ new_dirname = entry[0][0]
+ if new_dirname == self.last_target_parent[0]:
+ target_parent_id = self.last_target_parent[1]
+ else:
+ try:
+ target_parent_id = self.new_dirname_to_file_id[new_dirname]
+ except KeyError:
+ # TODO: We don't always need to do the lookup, because the
+ # parent entry will be the same as the source entry.
+ target_parent_entry = state._get_entry(target_index,
+ path_utf8=new_dirname)
+ if target_parent_entry == (None, None):
+ raise AssertionError(
+ "Could not find target parent in wt: %s\nparent of: %s"
+ % (new_dirname, entry))
+ target_parent_id = target_parent_entry[0][2]
+ if target_parent_id == entry[0][2]:
+ # This is the root, so the parent is None
+ target_parent_id = None
+ else:
+ self.last_target_parent[0] = new_dirname
+ self.last_target_parent[1] = target_parent_id
+
+ source_exec = source_details[3]
+ if (self.include_unchanged
+ or content_change
+ or source_parent_id != target_parent_id
+ or old_basename != entry[0][1]
+ or source_exec != target_exec
+ ):
+ if old_path is None:
+ old_path = path = pathjoin(old_dirname, old_basename)
+ old_path_u = utf8_decode(old_path)[0]
+ path_u = old_path_u
+ else:
+ old_path_u = utf8_decode(old_path)[0]
+ if old_path == path:
+ path_u = old_path_u
+ else:
+ path_u = utf8_decode(path)[0]
+ source_kind = _minikind_to_kind[source_minikind]
+ return (entry[0][2],
+ (old_path_u, path_u),
+ content_change,
+ (True, True),
+ (source_parent_id, target_parent_id),
+ (utf8_decode(old_basename)[0], utf8_decode(entry[0][1])[0]),
+ (source_kind, target_kind),
+ (source_exec, target_exec))
+ else:
+ return self.uninteresting
+ elif source_minikind in 'a' and target_minikind in 'fdlt':
+ # looks like a new file
+ path = pathjoin(entry[0][0], entry[0][1])
+ # parent id is the entry for the path in the target tree
+ # TODO: these are the same for an entire directory: cache em.
+ parent_id = state._get_entry(target_index,
+ path_utf8=entry[0][0])[0][2]
+ if parent_id == entry[0][2]:
+ parent_id = None
+ if path_info is not None:
+ # Present on disk:
+ if self.use_filesystem_for_exec:
+ # We need S_ISREG here, because we aren't sure if this
+ # is a file or not.
+ target_exec = bool(
+ stat.S_ISREG(path_info[3].st_mode)
+ and stat.S_IEXEC & path_info[3].st_mode)
+ else:
+ target_exec = target_details[3]
+ return (entry[0][2],
+ (None, utf8_decode(path)[0]),
+ True,
+ (False, True),
+ (None, parent_id),
+ (None, utf8_decode(entry[0][1])[0]),
+ (None, path_info[2]),
+ (None, target_exec))
+ else:
+ # It's a missing file, report it as such.
+ return (entry[0][2],
+ (None, utf8_decode(path)[0]),
+ False,
+ (False, True),
+ (None, parent_id),
+ (None, utf8_decode(entry[0][1])[0]),
+ (None, None),
+ (None, False))
+ elif source_minikind in 'fdlt' and target_minikind in 'a':
+ # unversioned, possibly, or possibly not deleted: we don't care.
+ # if it's still on disk, *and* there's no other entry at this
+ # path [we don't know this in this routine at the moment -
+ # perhaps we should change this] - then it would be an unknown.
+ old_path = pathjoin(entry[0][0], entry[0][1])
+ # parent id is the file id of the parent directory's entry in the source tree
+ parent_id = state._get_entry(source_index, path_utf8=entry[0][0])[0][2]
+ if parent_id == entry[0][2]:
+ parent_id = None
+ return (entry[0][2],
+ (utf8_decode(old_path)[0], None),
+ True,
+ (True, False),
+ (parent_id, None),
+ (utf8_decode(entry[0][1])[0], None),
+ (_minikind_to_kind[source_minikind], None),
+ (source_details[3], None))
+ elif source_minikind in 'fdlt' and target_minikind in 'r':
+ # a rename; could be a true rename, or a rename inherited from
+ # a renamed parent. TODO: handle this efficiently. It's not a
+ # common case to rename dirs though, so a correct but slow
+ # implementation will do.
+ if not osutils.is_inside_any(searched_specific_files, target_details[1]):
+ search_specific_files.add(target_details[1])
+ elif source_minikind in 'ra' and target_minikind in 'ra':
+ # neither of the selected trees contain this file,
+ # so skip over it. This is not currently directly tested, but
+ # is indirectly via test_too_much.TestCommands.test_conflicts.
+ pass
+ else:
+ raise AssertionError("don't know how to compare "
+ "source_minikind=%r, target_minikind=%r"
+ % (source_minikind, target_minikind))
+ ## import pdb;pdb.set_trace()
+ return None
+_process_entry = ProcessEntryPython
+
+
# Try to load the compiled form if possible
try:
from bzrlib._dirstate_helpers_c import (
=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py 2008-09-13 07:59:05 +0000
+++ b/bzrlib/workingtree_4.py 2008-09-14 11:59:29 +0000
@@ -1857,6 +1857,10 @@
"can only be used for trees stored in the dirstate"
% (self.source._revision_id, self.target, self.iter_changes))
update_entry = dirstate.update_entry
+ use_filesystem_for_exec = (sys.platform != 'win32')
+ process_entry = dirstate._process_entry(include_unchanged, use_filesystem_for_exec)
+ _process_entry = process_entry._process_entry
+ uninteresting = process_entry.uninteresting
target_index = 0
if self.source._revision_id == NULL_REVISION:
source_index = None
@@ -1971,277 +1975,10 @@
# detail is not relocated, add the id.
searched_specific_files = set()
NULL_PARENT_DETAILS = dirstate.DirState.NULL_PARENT_DETAILS
- # Using a list so that we can access the values and change them in
- # nested scope. Each one is [path, file_id, entry]
- last_source_parent = [None, None]
- last_target_parent = [None, None]
-
- use_filesystem_for_exec = (sys.platform != 'win32')
-
- # Just a sentry, so that _process_entry can say that this
- # record is handled, but isn't interesting to process (unchanged)
- uninteresting = object()
-
- old_dirname_to_file_id = {}
- new_dirname_to_file_id = {}
+
# TODO: jam 20070516 - Avoid the _get_entry lookup overhead by
# keeping a cache of directories that we have seen.
- def _process_entry(entry, path_info):
- """Compare an entry and real disk to generate delta information.
-
- :param path_info: top_relpath, basename, kind, lstat, abspath for
- the path of entry. If None, then the path is considered absent.
- (Perhaps we should pass in a concrete entry for this ?)
- Basename is returned as a utf8 string because we expect this
- tuple will be ignored, and don't want to take the time to
- decode.
- :return: None if these don't match
- A tuple of information about the change, or
- the object 'uninteresting' if these match, but are
- basically identical.
- """
- if source_index is None:
- source_details = NULL_PARENT_DETAILS
- else:
- source_details = entry[1][source_index]
- target_details = entry[1][target_index]
- target_minikind = target_details[0]
- if path_info is not None and target_minikind in 'fdlt':
- if not (target_index == 0):
- raise AssertionError()
- link_or_sha1 = update_entry(state, entry,
- abspath=path_info[4], stat_value=path_info[3])
- # The entry may have been modified by update_entry
- target_details = entry[1][target_index]
- target_minikind = target_details[0]
- else:
- link_or_sha1 = None
- file_id = entry[0][2]
- source_minikind = source_details[0]
- if source_minikind in 'fdltr' and target_minikind in 'fdlt':
- # claimed content in both: diff
- # r | fdlt | | add source to search, add id path move and perform
- # | | | diff check on source-target
- # r | fdlt | a | dangling file that was present in the basis.
- # | | | ???
- if source_minikind in 'r':
- # add the source to the search path to find any children it
- # has. TODO ? : only add if it is a container ?
- if not osutils.is_inside_any(searched_specific_files,
- source_details[1]):
- search_specific_files.add(source_details[1])
- # generate the old path; this is needed for stating later
- # as well.
- old_path = source_details[1]
- old_dirname, old_basename = os.path.split(old_path)
- path = pathjoin(entry[0][0], entry[0][1])
- old_entry = state._get_entry(source_index,
- path_utf8=old_path)
- # update the source details variable to be the real
- # location.
- if old_entry == (None, None):
- raise errors.CorruptDirstate(state._filename,
- "entry '%s/%s' is considered renamed from %r"
- " but source does not exist\n"
- "entry: %s" % (entry[0][0], entry[0][1], old_path, entry))
- source_details = old_entry[1][source_index]
- source_minikind = source_details[0]
- else:
- old_dirname = entry[0][0]
- old_basename = entry[0][1]
- old_path = path = None
- if path_info is None:
- # the file is missing on disk, show as removed.
- content_change = True
- target_kind = None
- target_exec = False
- else:
- # source and target are both versioned and disk file is present.
- target_kind = path_info[2]
- if target_kind == 'directory':
- if path is None:
- old_path = path = pathjoin(old_dirname, old_basename)
- new_dirname_to_file_id[path] = file_id
- if source_minikind != 'd':
- content_change = True
- else:
- # directories have no fingerprint
- content_change = False
- target_exec = False
- elif target_kind == 'file':
- if source_minikind != 'f':
- content_change = True
- else:
- # We could check the size, but we already have the
- # sha1 hash.
- content_change = (link_or_sha1 != source_details[1])
- # Target details is updated at update_entry time
- if use_filesystem_for_exec:
- # We don't need S_ISREG here, because we are sure
- # we are dealing with a file.
- target_exec = bool(stat.S_IEXEC & path_info[3].st_mode)
- else:
- target_exec = target_details[3]
- elif target_kind == 'symlink':
- if source_minikind != 'l':
- content_change = True
- else:
- content_change = (link_or_sha1 != source_details[1])
- target_exec = False
- elif target_kind == 'tree-reference':
- if source_minikind != 't':
- content_change = True
- else:
- content_change = False
- target_exec = False
- else:
- raise Exception, "unknown kind %s" % path_info[2]
- if source_minikind == 'd':
- if path is None:
- old_path = path = pathjoin(old_dirname, old_basename)
- old_dirname_to_file_id[old_path] = file_id
- # parent id is the entry for the path in the target tree
- if old_dirname == last_source_parent[0]:
- source_parent_id = last_source_parent[1]
- else:
- try:
- source_parent_id = old_dirname_to_file_id[old_dirname]
- except KeyError:
- source_parent_entry = state._get_entry(source_index,
- path_utf8=old_dirname)
- source_parent_id = source_parent_entry[0][2]
- if source_parent_id == entry[0][2]:
- # This is the root, so the parent is None
- source_parent_id = None
- else:
- last_source_parent[0] = old_dirname
- last_source_parent[1] = source_parent_id
- new_dirname = entry[0][0]
- if new_dirname == last_target_parent[0]:
- target_parent_id = last_target_parent[1]
- else:
- try:
- target_parent_id = new_dirname_to_file_id[new_dirname]
- except KeyError:
- # TODO: We don't always need to do the lookup, because the
- # parent entry will be the same as the source entry.
- target_parent_entry = state._get_entry(target_index,
- path_utf8=new_dirname)
- if target_parent_entry == (None, None):
- raise AssertionError(
- "Could not find target parent in wt: %s\nparent of: %s"
- % (new_dirname, entry))
- target_parent_id = target_parent_entry[0][2]
- if target_parent_id == entry[0][2]:
- # This is the root, so the parent is None
- target_parent_id = None
- else:
- last_target_parent[0] = new_dirname
- last_target_parent[1] = target_parent_id
-
- source_exec = source_details[3]
- if (include_unchanged
- or content_change
- or source_parent_id != target_parent_id
- or old_basename != entry[0][1]
- or source_exec != target_exec
- ):
- if old_path is None:
- old_path = path = pathjoin(old_dirname, old_basename)
- old_path_u = utf8_decode(old_path)[0]
- path_u = old_path_u
- else:
- old_path_u = utf8_decode(old_path)[0]
- if old_path == path:
- path_u = old_path_u
- else:
- path_u = utf8_decode(path)[0]
- source_kind = _minikind_to_kind[source_minikind]
- return (entry[0][2],
- (old_path_u, path_u),
- content_change,
- (True, True),
- (source_parent_id, target_parent_id),
- (utf8_decode(old_basename)[0], utf8_decode(entry[0][1])[0]),
- (source_kind, target_kind),
- (source_exec, target_exec))
- else:
- return uninteresting
- elif source_minikind in 'a' and target_minikind in 'fdlt':
- # looks like a new file
- path = pathjoin(entry[0][0], entry[0][1])
- # parent id is the entry for the path in the target tree
- # TODO: these are the same for an entire directory: cache em.
- parent_id = state._get_entry(target_index,
- path_utf8=entry[0][0])[0][2]
- if parent_id == entry[0][2]:
- parent_id = None
- if path_info is not None:
- # Present on disk:
- if use_filesystem_for_exec:
- # We need S_ISREG here, because we aren't sure if this
- # is a file or not.
- target_exec = bool(
- stat.S_ISREG(path_info[3].st_mode)
- and stat.S_IEXEC & path_info[3].st_mode)
- else:
- target_exec = target_details[3]
- return (entry[0][2],
- (None, utf8_decode(path)[0]),
- True,
- (False, True),
- (None, parent_id),
- (None, utf8_decode(entry[0][1])[0]),
- (None, path_info[2]),
- (None, target_exec))
- else:
- # Its a missing file, report it as such.
- return (entry[0][2],
- (None, utf8_decode(path)[0]),
- False,
- (False, True),
- (None, parent_id),
- (None, utf8_decode(entry[0][1])[0]),
- (None, None),
- (None, False))
- elif source_minikind in 'fdlt' and target_minikind in 'a':
- # unversioned, possibly, or possibly not deleted: we dont care.
- # if its still on disk, *and* theres no other entry at this
- # path [we dont know this in this routine at the moment -
- # perhaps we should change this - then it would be an unknown.
- old_path = pathjoin(entry[0][0], entry[0][1])
- # parent id is the entry for the path in the target tree
- parent_id = state._get_entry(source_index, path_utf8=entry[0][0])[0][2]
- if parent_id == entry[0][2]:
- parent_id = None
- return (entry[0][2],
- (utf8_decode(old_path)[0], None),
- True,
- (True, False),
- (parent_id, None),
- (utf8_decode(entry[0][1])[0], None),
- (_minikind_to_kind[source_minikind], None),
- (source_details[3], None))
- elif source_minikind in 'fdlt' and target_minikind in 'r':
- # a rename; could be a true rename, or a rename inherited from
- # a renamed parent. TODO: handle this efficiently. Its not
- # common case to rename dirs though, so a correct but slow
- # implementation will do.
- if not osutils.is_inside_any(searched_specific_files, target_details[1]):
- search_specific_files.add(target_details[1])
- elif source_minikind in 'ra' and target_minikind in 'ra':
- # neither of the selected trees contain this file,
- # so skip over it. This is not currently directly tested, but
- # is indirectly via test_too_much.TestCommands.test_conflicts.
- pass
- else:
- raise AssertionError("don't know how to compare "
- "source_minikind=%r, target_minikind=%r"
- % (source_minikind, target_minikind))
- ## import pdb;pdb.set_trace()
- return None
-
while search_specific_files:
# TODO: the pending list should be lexically sorted? the
# interface doesn't require it.
@@ -2276,7 +2013,7 @@
continue
path_handled = False
for entry in root_entries:
- result = _process_entry(entry, root_dir_info)
+ result = _process_entry(entry, root_dir_info, source_index, target_index, state)
if result is not None:
path_handled = True
if result is not uninteresting:
@@ -2395,7 +2132,7 @@
for current_entry in current_block[1]:
# entry referring to file not present on disk.
# advance the entry only, after processing.
- result = _process_entry(current_entry, None)
+ result = _process_entry(current_entry, None, source_index, target_index, state)
if result is not None:
if result is not uninteresting:
yield result
@@ -2433,7 +2170,7 @@
pass
elif current_path_info is None:
# no path is fine: the per entry code will handle it.
- result = _process_entry(current_entry, current_path_info)
+ result = _process_entry(current_entry, current_path_info, source_index, target_index, state)
if result is not None:
if result is not uninteresting:
yield result
@@ -2454,13 +2191,13 @@
else:
# entry referring to file not present on disk.
# advance the entry only, after processing.
- result = _process_entry(current_entry, None)
+ result = _process_entry(current_entry, None, source_index, target_index, state)
if result is not None:
if result is not uninteresting:
yield result
advance_path = False
else:
- result = _process_entry(current_entry, current_path_info)
+ result = _process_entry(current_entry, current_path_info, source_index, target_index, state)
if result is not None:
path_handled = True
if result is not uninteresting:
More information about the bazaar-commits
mailing list