Rev 3709: Simple 'compiled with pyrex' ProcessEntry class. faster. in http://people.ubuntu.com/~robertc/baz2.0/readdir

Robert Collins robertc at robertcollins.net
Sun Sep 14 13:40:48 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/readdir

------------------------------------------------------------
revno: 3709
revision-id: robertc at robertcollins.net-20080914124043-lt97fxv205326rf1
parent: robertc at robertcollins.net-20080914115929-vtk18rnhg76a38ox
committer: Robert Collins <robertc at robertcollins.net>
branch nick: process-entry-optimised
timestamp: Sun 2008-09-14 22:40:43 +1000
message:
  Simple 'compiled with pyrex' ProcessEntry class. faster.
modified:
  bzrlib/_dirstate_helpers_c.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test__dirstate_helpers.py test_dirstate_helper-20070504035751-jsbn00xodv0y1eve-2
=== modified file 'bzrlib/_dirstate_helpers_c.pyx'
--- a/bzrlib/_dirstate_helpers_c.pyx	2008-09-14 08:51:07 +0000
+++ b/bzrlib/_dirstate_helpers_c.pyx	2008-09-14 12:40:43 +0000
@@ -23,6 +23,7 @@
 
 from bzrlib import errors, osutils
 from bzrlib.dirstate import DirState, pack_stat
+from bzrlib.osutils import pathjoin
 
 
 # Give Pyrex some function definitions for it to understand.
@@ -53,6 +54,7 @@
     int S_ISDIR(int mode)
     int S_ISREG(int mode)
     int S_ISLNK(int mode)
+    int S_IXUSR
 
 # These functions allow us access to a bit of the 'bare metal' of python
 # objects, rather than going through the object abstraction. (For example,
@@ -819,8 +821,6 @@
         # size should also be in packed_stat
         if saved_file_size == stat_value.st_size:
             return saved_link_or_sha1
-    else:
-        print "gararar", packed_stat, saved_packed_stat
 
     # If we have gotten this far, that means that we need to actually
     # process this entry.
@@ -862,3 +862,282 @@
                            False, DirState.NULLSTAT)
     self._dirblock_state = DirState.IN_MEMORY_MODIFIED
     return link_or_sha1
+
+
+cdef class ProcessEntryC:
+
+    cdef object old_dirname_to_file_id # dict
+    cdef object new_dirname_to_file_id # dict
+    cdef readonly object uninteresting
+    cdef object last_source_parent
+    cdef object last_target_parent
+    cdef object include_unchanged
+    cdef object use_filesystem_for_exec
+
+    def __init__(self, include_unchanged, use_filesystem_for_exec):
+        self.old_dirname_to_file_id = {}
+        self.new_dirname_to_file_id = {}
+        # Just a sentry, so that _process_entry can say that this
+        # record is handled, but isn't interesting to process (unchanged)
+        self.uninteresting = object()
+        # Using a list so that we can access the values and change them in
+        # nested scope. Each one is [path, file_id, entry]
+        self.last_source_parent = [None, None]
+        self.last_target_parent = [None, None]
+        self.include_unchanged = include_unchanged
+        self.use_filesystem_for_exec = use_filesystem_for_exec
+
+    def _process_entry(self, entry, path_info, source_index, target_index, state):
+        """Compare an entry and real disk to generate delta information.
+
+        :param path_info: top_relpath, basename, kind, lstat, abspath for
+            the path of entry. If None, then the path is considered absent.
+            (Perhaps we should pass in a concrete entry for this ?)
+            Basename is returned as a utf8 string because we expect this
+            tuple will be ignored, and don't want to take the time to
+            decode.
+        :return: None if these don't match
+                 A tuple of information about the change, or
+                 the object 'uninteresting' if these match, but are
+                 basically identical.
+        """
+        if source_index is None:
+            source_details = NULL_PARENT_DETAILS
+        else:
+            source_details = entry[1][source_index]
+        target_details = entry[1][target_index]
+        target_minikind = target_details[0]
+        if path_info is not None and target_minikind in 'fdlt':
+            if not (target_index == 0):
+                raise AssertionError()
+            link_or_sha1 = update_entry(state, entry,
+                abspath=path_info[4], stat_value=path_info[3])
+            # The entry may have been modified by update_entry
+            target_details = entry[1][target_index]
+            target_minikind = target_details[0]
+        else:
+            link_or_sha1 = None
+        file_id = entry[0][2]
+        source_minikind = source_details[0]
+        if source_minikind in 'fdltr' and target_minikind in 'fdlt':
+            # claimed content in both: diff
+            #   r    | fdlt   |      | add source to search, add id path move and perform
+            #        |        |      | diff check on source-target
+            #   r    | fdlt   |  a   | dangling file that was present in the basis.
+            #        |        |      | ???
+            if source_minikind in 'r':
+                # add the source to the search path to find any children it
+                # has.  TODO ? : only add if it is a container ?
+                if not osutils.is_inside_any(searched_specific_files,
+                                             source_details[1]):
+                    search_specific_files.add(source_details[1])
+                # generate the old path; this is needed for stating later
+                # as well.
+                old_path = source_details[1]
+                old_dirname, old_basename = os.path.split(old_path)
+                path = pathjoin(entry[0][0], entry[0][1])
+                old_entry = state._get_entry(source_index,
+                                             path_utf8=old_path)
+                # update the source details variable to be the real
+                # location.
+                if old_entry == (None, None):
+                    raise errors.CorruptDirstate(state._filename,
+                        "entry '%s/%s' is considered renamed from %r"
+                        " but source does not exist\n"
+                        "entry: %s" % (entry[0][0], entry[0][1], old_path, entry))
+                source_details = old_entry[1][source_index]
+                source_minikind = source_details[0]
+            else:
+                old_dirname = entry[0][0]
+                old_basename = entry[0][1]
+                old_path = path = None
+            if path_info is None:
+                # the file is missing on disk, show as removed.
+                content_change = True
+                target_kind = None
+                target_exec = False
+            else:
+                # source and target are both versioned and disk file is present.
+                target_kind = path_info[2]
+                if target_kind == 'directory':
+                    if path is None:
+                        old_path = path = pathjoin(old_dirname, old_basename)
+                    self.new_dirname_to_file_id[path] = file_id
+                    if source_minikind != 'd':
+                        content_change = True
+                    else:
+                        # directories have no fingerprint
+                        content_change = False
+                    target_exec = False
+                elif target_kind == 'file':
+                    if source_minikind != 'f':
+                        content_change = True
+                    else:
+                        # We could check the size, but we already have the
+                        # sha1 hash.
+                        content_change = (link_or_sha1 != source_details[1])
+                    # Target details is updated at update_entry time
+                    if self.use_filesystem_for_exec:
+                        # We don't need S_ISREG here, because we are sure
+                        # we are dealing with a file.
+                        target_exec = bool(S_IXUSR & path_info[3].st_mode)
+                    else:
+                        target_exec = target_details[3]
+                elif target_kind == 'symlink':
+                    if source_minikind != 'l':
+                        content_change = True
+                    else:
+                        content_change = (link_or_sha1 != source_details[1])
+                    target_exec = False
+                elif target_kind == 'tree-reference':
+                    if source_minikind != 't':
+                        content_change = True
+                    else:
+                        content_change = False
+                    target_exec = False
+                else:
+                    raise Exception, "unknown kind %s" % path_info[2]
+            if source_minikind == 'd':
+                if path is None:
+                    old_path = path = pathjoin(old_dirname, old_basename)
+                self.old_dirname_to_file_id[old_path] = file_id
+            # parent id is the entry for the path in the target tree
+            if old_dirname == self.last_source_parent[0]:
+                source_parent_id = self.last_source_parent[1]
+            else:
+                try:
+                    source_parent_id = self.old_dirname_to_file_id[old_dirname]
+                except KeyError:
+                    source_parent_entry = state._get_entry(source_index,
+                                                           path_utf8=old_dirname)
+                    source_parent_id = source_parent_entry[0][2]
+                if source_parent_id == entry[0][2]:
+                    # This is the root, so the parent is None
+                    source_parent_id = None
+                else:
+                    self.last_source_parent[0] = old_dirname
+                    self.last_source_parent[1] = source_parent_id
+            new_dirname = entry[0][0]
+            if new_dirname == self.last_target_parent[0]:
+                target_parent_id = self.last_target_parent[1]
+            else:
+                try:
+                    target_parent_id = self.new_dirname_to_file_id[new_dirname]
+                except KeyError:
+                    # TODO: We don't always need to do the lookup, because the
+                    #       parent entry will be the same as the source entry.
+                    target_parent_entry = state._get_entry(target_index,
+                                                           path_utf8=new_dirname)
+                    if target_parent_entry == (None, None):
+                        raise AssertionError(
+                            "Could not find target parent in wt: %s\nparent of: %s"
+                            % (new_dirname, entry))
+                    target_parent_id = target_parent_entry[0][2]
+                if target_parent_id == entry[0][2]:
+                    # This is the root, so the parent is None
+                    target_parent_id = None
+                else:
+                    self.last_target_parent[0] = new_dirname
+                    self.last_target_parent[1] = target_parent_id
+
+            source_exec = source_details[3]
+            if (self.include_unchanged
+                or content_change
+                or source_parent_id != target_parent_id
+                or old_basename != entry[0][1]
+                or source_exec != target_exec
+                ):
+                if old_path is None:
+                    old_path = path = pathjoin(old_dirname, old_basename)
+                    old_path_u = utf8_decode(old_path)[0]
+                    path_u = old_path_u
+                else:
+                    old_path_u = utf8_decode(old_path)[0]
+                    if old_path == path:
+                        path_u = old_path_u
+                    else:
+                        path_u = utf8_decode(path)[0]
+                source_kind = _minikind_to_kind[source_minikind]
+                return (entry[0][2],
+                       (old_path_u, path_u),
+                       content_change,
+                       (True, True),
+                       (source_parent_id, target_parent_id),
+                       (utf8_decode(old_basename)[0], utf8_decode(entry[0][1])[0]),
+                       (source_kind, target_kind),
+                       (source_exec, target_exec))
+            else:
+                return self.uninteresting
+        elif source_minikind in 'a' and target_minikind in 'fdlt':
+            # looks like a new file
+            path = pathjoin(entry[0][0], entry[0][1])
+            # parent id is the entry for the path in the target tree
+            # TODO: these are the same for an entire directory: cache em.
+            parent_id = state._get_entry(target_index,
+                                         path_utf8=entry[0][0])[0][2]
+            if parent_id == entry[0][2]:
+                parent_id = None
+            if path_info is not None:
+                # Present on disk:
+                if self.use_filesystem_for_exec:
+                    # We need S_ISREG here, because we aren't sure if this
+                    # is a file or not.
+                    target_exec = bool(
+                        S_ISREG(path_info[3].st_mode)
+                        and S_IXUSR & path_info[3].st_mode)
+                else:
+                    target_exec = target_details[3]
+                return (entry[0][2],
+                       (None, utf8_decode(path)[0]),
+                       True,
+                       (False, True),
+                       (None, parent_id),
+                       (None, utf8_decode(entry[0][1])[0]),
+                       (None, path_info[2]),
+                       (None, target_exec))
+            else:
+                # Its a missing file, report it as such.
+                return (entry[0][2],
+                       (None, utf8_decode(path)[0]),
+                       False,
+                       (False, True),
+                       (None, parent_id),
+                       (None, utf8_decode(entry[0][1])[0]),
+                       (None, None),
+                       (None, False))
+        elif source_minikind in 'fdlt' and target_minikind in 'a':
+            # unversioned, possibly, or possibly not deleted: we dont care.
+            # if its still on disk, *and* theres no other entry at this
+            # path [we dont know this in this routine at the moment -
+            # perhaps we should change this - then it would be an unknown.
+            old_path = pathjoin(entry[0][0], entry[0][1])
+            # parent id is the entry for the path in the target tree
+            parent_id = state._get_entry(source_index, path_utf8=entry[0][0])[0][2]
+            if parent_id == entry[0][2]:
+                parent_id = None
+            return (entry[0][2],
+                   (utf8_decode(old_path)[0], None),
+                   True,
+                   (True, False),
+                   (parent_id, None),
+                   (utf8_decode(entry[0][1])[0], None),
+                   (_minikind_to_kind[source_minikind], None),
+                   (source_details[3], None))
+        elif source_minikind in 'fdlt' and target_minikind in 'r':
+            # a rename; could be a true rename, or a rename inherited from
+            # a renamed parent. TODO: handle this efficiently. Its not
+            # common case to rename dirs though, so a correct but slow
+            # implementation will do.
+            if not osutils.is_inside_any(searched_specific_files, target_details[1]):
+                search_specific_files.add(target_details[1])
+        elif source_minikind in 'ra' and target_minikind in 'ra':
+            # neither of the selected trees contain this file,
+            # so skip over it. This is not currently directly tested, but
+            # is indirectly via test_too_much.TestCommands.test_conflicts.
+            pass
+        else:
+            raise AssertionError("don't know how to compare "
+                "source_minikind=%r, target_minikind=%r"
+                % (source_minikind, target_minikind))
+            ## import pdb;pdb.set_trace()
+        return None

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2008-09-14 11:59:29 +0000
+++ b/bzrlib/dirstate.py	2008-09-14 12:40:43 +0000
@@ -3065,6 +3065,7 @@
         _bisect_path_right_c as _bisect_path_right,
         cmp_by_dirs_c as cmp_by_dirs,
         update_entry as update_entry,
+        ProcessEntryC as _process_entry,
         )
 except ImportError:
     from bzrlib._dirstate_helpers_py import (

=== modified file 'bzrlib/tests/test__dirstate_helpers.py'
--- a/bzrlib/tests/test__dirstate_helpers.py	2008-09-13 07:59:05 +0000
+++ b/bzrlib/tests/test__dirstate_helpers.py	2008-09-14 12:40:43 +0000
@@ -776,6 +776,14 @@
             from bzrlib.dirstate import py_update_entry
             self.assertIs(py_update_entry, dirstate.py_update_entry)
 
+    def test_process_entry(self):
+        if CompiledDirstateHelpersFeature.available():
+            from bzrlib._dirstate_helpers_c import ProcessEntryC
+            self.assertIs(ProcessEntryC, dirstate._process_entry)
+        else:
+            from bzrlib.dirstate import ProcessEntryPython
+            self.assertIs(ProcessEntryPython, dirstate._process_entry)
+
 
 class TestUpdateEntry(test_dirstate.TestCaseWithDirState):
     """Test the DirState.update_entry functions"""




More information about the bazaar-commits mailing list