Rev 2826: * Partial commits are now approximately 40% faster by walking over the in http://people.ubuntu.com/~robertc/baz2.0/commit-specific-file-handling

Robert Collins robertc at robertcollins.net
Thu Sep 20 08:05:00 BST 2007


At http://people.ubuntu.com/~robertc/baz2.0/commit-specific-file-handling

------------------------------------------------------------
revno: 2826
revision-id: robertc at robertcollins.net-20070920070333-eedxrxidkignx4i1
parent: pqm at pqm.ubuntu.com-20070917005035-cshdkpzbj63id1uw
committer: Robert Collins <robertc at robertcollins.net>
branch nick: commit-specific-file-handling
timestamp: Thu 2007-09-20 17:03:33 +1000
message:
  * Partial commits are now approximately 40% faster by walking over the
    unselected current tree more efficiently. (Robert Collins)
  
  * New method ``bzrlib.osutils.minimum_path_selection`` useful for removing
    duplication from user input, when a user mentions both a path and an item
    contained within that path. (Robert Collins)
  
  * New parameter yield_parents on ``Inventory.iter_entries_by_dir`` which
    causes the parents of a selected id to be returned recursively, so all the
    paths from the root down to each element of selected_file_ids are
    returned. (Robert Collins)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/commit.py               commit.py-20050511101309-79ec1a0168e0e825
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/inventory.py            inventory.py-20050309040759-6648b84ca2005b37
  bzrlib/osutils.py              osutils.py-20050309040759-eeaff12fbf77ac86
  bzrlib/tests/blackbox/test_commit.py test_commit.py-20060212094538-ae88fc861d969db0
  bzrlib/tests/inventory_implementations/basics.py basics.py-20070903044446-kdjwbiu1p1zi9phs-1
  bzrlib/tests/test_commit.py    test_commit.py-20050914060732-279f057f8c295434
  bzrlib/tests/test_osutils.py   test_osutils.py-20051201224856-e48ee24c12182989
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'NEWS'
--- a/NEWS	2007-09-16 19:29:00 +0000
+++ b/NEWS	2007-09-20 07:03:33 +0000
@@ -19,6 +19,9 @@
    * Inventory serialisation no longer double-sha's the content.
      (Robert Collins)
 
+   * Partial commits are now approximately 40% faster by walking over the
+     unselected current tree more efficiently. (Robert Collins)
+
    * XML inventory serialisation takes 20% less time while being stricter about
      the contents. (Robert Collins)
 
@@ -52,9 +55,18 @@
 
   INTERNALS:
 
+   * New method ``bzrlib.osutils.minimum_path_selection`` useful for removing
+     duplication from user input, when a user mentions both a path and an item
+     contained within that path. (Robert Collins)
+
    * New method on xml serialisers, write_inventory_to_lines, which matches the
      API used by knits for adding content. (Robert Collins)
 
+   * New parameter yield_parents on ``Inventory.iter_entries_by_dir`` which
+     causes the parents of a selected id to be returned recursively, so all the
+     paths from the root down to each element of selected_file_ids are
+     returned. (Robert Collins)
+
   TESTING:
 
 

=== modified file 'bzrlib/commit.py'
--- a/bzrlib/commit.py	2007-09-14 00:13:04 +0000
+++ b/bzrlib/commit.py	2007-09-20 07:03:33 +0000
@@ -68,8 +68,9 @@
                            ConflictsInTree,
                            StrictCommitFailed
                            )
-from bzrlib.osutils import (kind_marker, isdir,isfile, is_inside_any, 
+from bzrlib.osutils import (kind_marker, isdir,isfile, is_inside_any,
                             is_inside_or_parent_of_any,
+                            minimum_path_selection,
                             quotefn, sha_file, split_lines)
 from bzrlib.testament import Testament
 from bzrlib.trace import mutter, note, warning, is_quiet
@@ -245,7 +246,9 @@
         self.master_branch = None
         self.master_locked = False
         self.rev_id = None
-        self.specific_files = specific_files
+        self.specific_files = sorted(
+            minimum_path_selection(specific_files or ()))
+        self.specific_file_ids = None
         self.allow_pointless = allow_pointless
         self.recursive = recursive
         self.revprops = revprops
@@ -280,14 +283,17 @@
                 self.config = self.branch.get_config()
 
             # If provided, ensure the specified files are versioned
-            if specific_files is not None:
-                # Note: We don't actually need the IDs here. This routine
+            if self.specific_files:
+                # Note: This routine
                 # is being called because it raises PathNotVerisonedError
-                # as a side effect of finding the IDs.
+                # as a side effect of finding the IDs. We later use the ids we
+                # found as input to the workng tree inventory iterator, so we
+                # only consider those ids rather than examining the whole tree
+                # again.
                 # XXX: Dont we have filter_unversioned to do this more
                 # cheaply?
-                tree.find_ids_across_trees(specific_files,
-                                           [self.basis_tree, self.work_tree])
+                self.specific_file_ids = tree.find_ids_across_trees(
+                    specific_files, [self.basis_tree, self.work_tree])
 
             # Setup the progress bar. As the number of files that need to be
             # committed in unknown, progress is reported as stages.
@@ -671,13 +677,14 @@
         # recorded in their previous state. For more details, see
         # https://lists.ubuntu.com/archives/bazaar/2007q3/028476.html.
         if specific_files:
-            for path, new_ie in self.basis_inv.iter_entries():
-                if new_ie.file_id in self.builder.new_inventory:
+            for path, old_ie in self.basis_inv.iter_entries():
+                if old_ie.file_id in self.builder.new_inventory:
                     continue
                 if is_inside_any(specific_files, path):
                     continue
-                ie = new_ie.copy()
-                ie.revision = None
+                if old_ie.kind == 'directory':
+                    self._next_progress_entry()
+                ie = old_ie.copy()
                 self.builder.record_entry_contents(ie, self.parent_invs, path,
                                                    self.basis_tree)
 
@@ -699,7 +706,8 @@
         deleted_paths = set()
         work_inv = self.work_tree.inventory
         assert work_inv.root is not None
-        entries = work_inv.iter_entries()
+        entries = work_inv.iter_entries_by_dir(
+            specific_file_ids=self.specific_file_ids, yield_parents=True)
         if not self.builder.record_root_entry:
             entries.next()
         for path, existing_ie in entries:
@@ -709,7 +717,6 @@
             kind = existing_ie.kind
             if kind == 'directory':
                 self._next_progress_entry()
-
             # Skip files that have been deleted from the working tree.
             # The deleted files/directories are also recorded so they
             # can be explicitly unversioned later. Note that when a
@@ -717,12 +724,11 @@
             # deleted files matching that filter.
             if is_inside_any(deleted_paths, path):
                 continue
-            if not specific_files or is_inside_any(specific_files, path):
-                if not self.work_tree.has_filename(path):
-                    deleted_paths.add(path)
-                    self.reporter.missing(path)
-                    deleted_ids.append(file_id)
-                    continue
+            if not self.work_tree.has_filename(path):
+                deleted_paths.add(path)
+                self.reporter.missing(path)
+                deleted_ids.append(file_id)
+                continue
             try:
                 kind = self.work_tree.kind(file_id)
                 # TODO: specific_files filtering before nested tree processing
@@ -772,26 +778,16 @@
             report_changes=True):
         "Record the new inventory entry for a path if any."
         # mutter('check %s {%s}', path, file_id)
-        if (not specific_files or 
-            is_inside_or_parent_of_any(specific_files, path)):
-                # mutter('%s selected for commit', path)
-                if definitely_changed or existing_ie is None:
-                    ie = inventory.make_entry(kind, name, parent_id, file_id)
-                else:
-                    ie = existing_ie.copy()
-                    ie.revision = None
+        # mutter('%s selected for commit', path)
+        if definitely_changed or existing_ie is None:
+            ie = inventory.make_entry(kind, name, parent_id, file_id)
         else:
-            # mutter('%s not selected for commit', path)
-            if self.basis_inv.has_id(file_id):
-                ie = self.basis_inv[file_id].copy()
-            else:
-                # this entry is new and not being committed
-                ie = None
-        if ie is not None:
-            self.builder.record_entry_contents(ie, self.parent_invs, 
-                path, self.work_tree)
-            if report_changes:
-                self._report_change(ie, path)
+            ie = existing_ie.copy()
+            ie.revision = None
+        self.builder.record_entry_contents(ie, self.parent_invs, 
+            path, self.work_tree)
+        if report_changes:
+            self._report_change(ie, path)
         return ie
 
     def _report_change(self, ie, path):

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-09-14 02:25:32 +0000
+++ b/bzrlib/dirstate.py	2007-09-20 07:03:33 +0000
@@ -1470,7 +1470,8 @@
         kind = inv_entry.kind
         minikind = DirState._kind_to_minikind[kind]
         tree_data = inv_entry.revision
-        assert len(tree_data) > 0, 'empty revision for the inv_entry.'
+        assert tree_data, 'empty revision for the inv_entry %s.' % \
+            inv_entry.file_id
         if kind == 'directory':
             fingerprint = ''
             size = 0

=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py	2007-09-04 03:53:07 +0000
+++ b/bzrlib/inventory.py	2007-09-20 07:03:33 +0000
@@ -1004,7 +1004,8 @@
                 # if we finished all children, pop it off the stack
                 stack.pop()
 
-    def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None):
+    def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None,
+        yield_parents=False):
         """Iterate over the entries in a directory first order.
 
         This returns all entries for a directory before returning
@@ -1012,6 +1013,9 @@
         lexicographically sorted order, and is a hybrid between
         depth-first and breadth-first.
 
+        :param yield_parents: If True, yield the parents from the root leading
+            down to specific_file_ids that have been requested. This has no
+            impact if specific_file_ids is None.
         :return: This yields (path, entry) pairs
         """
         if specific_file_ids:
@@ -1023,13 +1027,14 @@
             if self.root is None:
                 return
             # Optimize a common case
-            if specific_file_ids is not None and len(specific_file_ids) == 1:
+            if (not yield_parents and specific_file_ids is not None and
+                len(specific_file_ids) == 1):
                 file_id = list(specific_file_ids)[0]
                 if file_id in self:
                     yield self.id2path(file_id), self[file_id]
                 return 
             from_dir = self.root
-            if (specific_file_ids is None or 
+            if (specific_file_ids is None or yield_parents or
                 self.root.file_id in specific_file_ids):
                 yield u'', self.root
         elif isinstance(from_dir, basestring):
@@ -1064,7 +1069,8 @@
                 child_relpath = cur_relpath + child_name
 
                 if (specific_file_ids is None or 
-                    child_ie.file_id in specific_file_ids):
+                    child_ie.file_id in specific_file_ids or
+                    (yield_parents and child_ie.file_id in parents)):
                     yield child_relpath, child_ie
 
                 if child_ie.kind == 'directory':

=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py	2007-08-21 01:32:29 +0000
+++ b/bzrlib/osutils.py	2007-09-20 07:03:33 +0000
@@ -87,6 +87,23 @@
         os.chmod(filename, mod)
 
 
+def minimum_path_selection(paths):
+    """Return the smallset subset of paths which are outside paths.
+
+    :param: A container of paths.
+    :return: A set of paths sufficient to include everything in paths via
+        is_inside_any, drawn from the paths parameter.
+    """
+    search_paths = set()
+    paths = set(paths)
+    for path in paths:
+        other_paths = paths.difference(set([path]))
+        if not is_inside_any(other_paths, path):
+            # this is a top level path, we must check it.
+            search_paths.add(path)
+    return search_paths
+
+
 _QUOTE_RE = None
 
 

=== modified file 'bzrlib/tests/blackbox/test_commit.py'
--- a/bzrlib/tests/blackbox/test_commit.py	2007-09-13 05:36:27 +0000
+++ b/bzrlib/tests/blackbox/test_commit.py	2007-09-20 07:03:33 +0000
@@ -219,9 +219,9 @@
             'added newdir\n'
             'added newfile\n'
             'renamed dirtorename => renameddir\n'
+            'renamed filetorename => renamedfile\n'
             'renamed dirtoreparent => renameddir/reparenteddir\n'
             'renamed filetoreparent => renameddir/reparentedfile\n'
-            'renamed filetorename => renamedfile\n'
             'deleted dirtoremove\n'
             'deleted filetoremove\n'
             'Committed revision 2.\n' % (expected, ),

=== modified file 'bzrlib/tests/inventory_implementations/basics.py'
--- a/bzrlib/tests/inventory_implementations/basics.py	2007-09-03 04:45:38 +0000
+++ b/bzrlib/tests/inventory_implementations/basics.py	2007-09-20 07:03:33 +0000
@@ -202,6 +202,13 @@
             ], [(path, ie.file_id) for path, ie in inv.iter_entries_by_dir(
                 specific_file_ids=('bye-id',))])
 
+        self.assertEqual([
+            ('', 'tree-root'),
+            ('src', 'src-id'),
+            ('src/bye.c', 'bye-id'),
+            ], [(path, ie.file_id) for path, ie in inv.iter_entries_by_dir(
+                specific_file_ids=('bye-id',), yield_parents=True)])
+
     def test_add_recursive(self):
         parent = InventoryDirectory('src-id', 'src', 'tree-root')
         child = InventoryFile('hello-id', 'hello.c', 'src-id')

=== modified file 'bzrlib/tests/test_commit.py'
--- a/bzrlib/tests/test_commit.py	2007-09-03 13:17:52 +0000
+++ b/bzrlib/tests/test_commit.py	2007-09-20 07:03:33 +0000
@@ -528,9 +528,9 @@
             ('change', 'added', 'newdir'),
             ('change', 'added', 'newfile'),
             ('renamed', 'renamed', 'dirtorename', 'renameddir'),
+            ('renamed', 'renamed', 'filetorename', 'renamedfile'),
             ('renamed', 'renamed', 'dirtoreparent', 'renameddir/reparenteddir'),
             ('renamed', 'renamed', 'filetoreparent', 'renameddir/reparentedfile'),
-            ('renamed', 'renamed', 'filetorename', 'renamedfile'),
             ('deleted', 'dirtoremove'),
             ('deleted', 'filetoremove'),
             ],

=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py	2007-09-04 11:00:30 +0000
+++ b/bzrlib/tests/test_osutils.py	2007-09-20 07:03:33 +0000
@@ -477,6 +477,16 @@
         #       osutils.getcwd() renormalize the path.
         self.assertEndsWith(osutils._win32_getcwd(), u'mu-\xb5')
 
+    def test_minimum_path_selection(self):
+        self.assertEqual(set(),
+            osutils.minimum_path_selection([]))
+        self.assertEqual(set(['a', 'b']),
+            osutils.minimum_path_selection(['a', 'b']))
+        self.assertEqual(set(['a/', 'b']),
+            osutils.minimum_path_selection(['a/', 'b']))
+        self.assertEqual(set(['a/', 'b']),
+            osutils.minimum_path_selection(['a/c', 'a/', 'b']))
+
     def test_mkdtemp(self):
         tmpdir = osutils._win32_mkdtemp(dir='.')
         self.assertFalse('\\' in tmpdir)

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2007-09-14 02:25:32 +0000
+++ b/bzrlib/workingtree_4.py	2007-09-20 07:03:33 +0000
@@ -946,12 +946,7 @@
             if not all_versioned:
                 raise errors.PathsNotVersionedError(paths)
         # -- remove redundancy in supplied paths to prevent over-scanning --
-        search_paths = set()
-        for path in paths:
-            other_paths = paths.difference(set([path]))
-            if not osutils.is_inside_any(other_paths, path):
-                # this is a top level path, we must check it.
-                search_paths.add(path)
+        search_paths = osutils.minimum_path_selection(paths)
         # sketch: 
         # for all search_indexs in each path at or under each element of
         # search_paths, if the detail is relocated: add the id, and add the



More information about the bazaar-commits mailing list