Rev 3034: (robertc) Fix one cause of poor commit performance with many deleted paths in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Tue Nov 27 02:37:52 GMT 2007


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 3034
revision-id:pqm at pqm.ubuntu.com-20071127023739-a1ajr28wi7so2up6
parent: pqm at pqm.ubuntu.com-20071127020038-4i8r0718qpokyky9
parent: robertc at robertcollins.net-20071127015529-4inebdo4aa2bfatv
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2007-11-27 02:37:39 +0000
message:
  (robertc) Fix one cause of poor commit performance with many deleted
  	paths. (Robert Collins, #156491)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/commit.py               commit.py-20050511101309-79ec1a0168e0e825
    ------------------------------------------------------------
    revno: 2938.3.2
    revision-id:robertc at robertcollins.net-20071127015529-4inebdo4aa2bfatv
    parent: robertc at robertcollins.net-20071024223742-fhjlj7l6lu77s9zq
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: commit.deleted-paths-speed
    timestamp: Tue 2007-11-27 12:55:29 +1100
    message:
      Review feedback.
    modified:
      bzrlib/commit.py               commit.py-20050511101309-79ec1a0168e0e825
    ------------------------------------------------------------
    revno: 2938.3.1
    revision-id:robertc at robertcollins.net-20071024223742-fhjlj7l6lu77s9zq
    parent: pqm at pqm.ubuntu.com-20071024181951-qqo4r5mqrhr032pf
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: commit
    timestamp: Thu 2007-10-25 08:37:42 +1000
    message:
      * Commit with many automatically found deleted paths no longer performs
        linear scanning for the children of those paths during inventory
        iteration. This should fix commit performance blowing out when many such
        paths occur during commit. (Robert Collins, #156491)
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      bzrlib/commit.py               commit.py-20050511101309-79ec1a0168e0e825
=== modified file 'NEWS'
--- a/NEWS	2007-11-27 01:20:01 +0000
+++ b/NEWS	2007-11-27 02:37:39 +0000
@@ -245,6 +245,11 @@
    * Commit no longer checks for new text keys during insertion when the
      revision id was deterministically unique. (Robert Collins)
 
+   * Commit with many automatically found deleted paths no longer performs
+     linear scanning for the children of those paths during inventory
+     iteration. This should fix commit performance blowing out when many such
+     paths occur during commit. (Robert Collins, #156491)
+
    * Committing a change which is not a merge and does not change the number of
      files in the tree is faster by utilising the data about whether files are
      changed to determine if the tree is unchanged rather than recalculating

=== modified file 'bzrlib/commit.py'
--- a/bzrlib/commit.py	2007-11-05 19:40:28 +0000
+++ b/bzrlib/commit.py	2007-11-27 02:37:39 +0000
@@ -71,7 +71,9 @@
 from bzrlib.osutils import (kind_marker, isdir,isfile, is_inside_any,
                             is_inside_or_parent_of_any,
                             minimum_path_selection,
-                            quotefn, sha_file, split_lines)
+                            quotefn, sha_file, split_lines,
+                            splitpath,
+                            )
 from bzrlib.testament import Testament
 from bzrlib.trace import mutter, note, warning, is_quiet
 from bzrlib.xml5 import serializer_v5
@@ -694,7 +696,9 @@
                
         report_changes = self.reporter.is_verbose()
         deleted_ids = []
-        deleted_paths = set()
+        # A tree of paths that have been deleted. E.g. if foo/bar has been
+        # deleted, then we have {'foo':{'bar':{}}}
+        deleted_paths = {}
         # XXX: Note that entries may have the wrong kind because the entry does
         # not reflect the status on disk.
         work_inv = self.work_tree.inventory
@@ -708,16 +712,34 @@
             if kind == 'directory':
                 self._next_progress_entry()
             # Skip files that have been deleted from the working tree.
-            # The deleted files/directories are also recorded so they
-            # can be explicitly unversioned later. Note that when a
-            # filter of specific files is given, we must only skip/record
-            # deleted files matching that filter.
-            if is_inside_any(deleted_paths, path):
-                continue
+            # The deleted path ids are also recorded so they can be explicitly
+            # unversioned later.
+            if deleted_paths:
+                path_segments = splitpath(path)
+                deleted_dict = deleted_paths
+                for segment in path_segments:
+                    deleted_dict = deleted_dict.get(segment, None)
+                    if not deleted_dict:
+                        # We either took a path not present in the dict
+                        # (deleted_dict was None), or we've reached an empty
+                        # child dir in the dict, so are now a sub-path.
+                        break
+                else:
+                    deleted_dict = None
+                if deleted_dict is not None:
+                    # the path has a deleted parent, do not add it.
+                    continue
             content_summary = self.work_tree.path_content_summary(path)
+            # Note that when a filter of specific files is given, we must only
+            # skip/record deleted files matching that filter.
             if not specific_files or is_inside_any(specific_files, path):
                 if content_summary[0] == 'missing':
-                    deleted_paths.add(path)
+                    if not deleted_paths:
+                        # path won't have been split yet.
+                        path_segments = splitpath(path)
+                    deleted_dict = deleted_paths
+                    for segment in path_segments:
+                        deleted_dict = deleted_dict.setdefault(segment, {})
                     self.reporter.missing(path)
                     deleted_ids.append(file_id)
                     continue




More information about the bazaar-commits mailing list