Rev 2844: Faster partial commits by walking less data (Robert Collins) in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Fri Sep 21 06:13:18 BST 2007


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 2844
revision-id: pqm at pqm.ubuntu.com-20070921051316-85muv96iv0duh31j
parent: pqm at pqm.ubuntu.com-20070921031326-72zmv08871klgb61
parent: ian.clatworthy at internode.on.net-20070921042253-eguu8ycfjbnb96yj
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2007-09-21 06:13:16 +0100
message:
  Faster partial commits by walking less data (Robert Collins)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/commit.py               commit.py-20050511101309-79ec1a0168e0e825
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/inventory.py            inventory.py-20050309040759-6648b84ca2005b37
  bzrlib/osutils.py              osutils.py-20050309040759-eeaff12fbf77ac86
  bzrlib/tests/inventory_implementations/basics.py basics.py-20070903044446-kdjwbiu1p1zi9phs-1
  bzrlib/tests/test_osutils.py   test_osutils.py-20051201224856-e48ee24c12182989
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
    ------------------------------------------------------------
    revno: 2843.1.1
    merged: ian.clatworthy at internode.on.net-20070921042253-eguu8ycfjbnb96yj
    parent: pqm at pqm.ubuntu.com-20070921031326-72zmv08871klgb61
    parent: robertc at robertcollins.net-20070920070333-eedxrxidkignx4i1
    committer: Ian Clatworthy <ian.clatworthy at internode.on.net>
    branch nick: ianc-integration2
    timestamp: Fri 2007-09-21 14:22:53 +1000
    message:
      Faster partial commits by walking less data (Robert Collins)
    ------------------------------------------------------------
    revno: 2825.7.1
    merged: robertc at robertcollins.net-20070920070333-eedxrxidkignx4i1
    parent: pqm at pqm.ubuntu.com-20070917005035-cshdkpzbj63id1uw
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: commit-specific-file-handling
    timestamp: Thu 2007-09-20 17:03:33 +1000
    message:
      * Partial commits are now approximately 40% faster by walking over the
        unselected current tree more efficiently. (Robert Collins)
      
      * New method ``bzrlib.osutils.minimum_path_selection`` useful for removing
        duplication from user input, when a user mentions both a path and an item
        contained within that path. (Robert Collins)
      
      * New parameter yield_parents on ``Inventory.iter_entries_by_dir`` which
        causes the parents of a selected id to be returned recursively, so all the
        paths from the root down to each element of selected_file_ids are
        returned. (Robert Collins)
=== modified file 'NEWS'
--- a/NEWS	2007-09-21 03:13:26 +0000
+++ b/NEWS	2007-09-21 04:22:53 +0000
@@ -40,6 +40,9 @@
    * Inventory serialisation no longer double-sha's the content.
      (Robert Collins)
 
+   * Partial commits are now approximately 40% faster by walking over the
+     unselected current tree more efficiently. (Robert Collins)
+
    * XML inventory serialisation takes 20% less time while being stricter about
      the contents. (Robert Collins)
 
@@ -96,6 +99,10 @@
 
   INTERNALS:
 
+   * New method ``bzrlib.osutils.minimum_path_selection`` useful for removing
+     duplication from user input, when a user mentions both a path and an item
+     contained within that path. (Robert Collins)
+
    * New method on xml serialisers, write_inventory_to_lines, which matches the
      API used by knits for adding content. (Robert Collins)
 
@@ -103,6 +110,11 @@
      and returns a gzipped version of the same. This is used to avoid a bunch
      of api friction during adding of knit hunks. (Robert Collins)
 
+   * New parameter yield_parents on ``Inventory.iter_entries_by_dir`` which
+     causes the parents of a selected id to be returned recursively, so all the
+     paths from the root down to each element of selected_file_ids are
+     returned. (Robert Collins)
+
   TESTING:
 
 

=== modified file 'bzrlib/commit.py'
--- a/bzrlib/commit.py	2007-09-21 00:22:35 +0000
+++ b/bzrlib/commit.py	2007-09-21 04:22:53 +0000
@@ -68,8 +68,9 @@
                            ConflictsInTree,
                            StrictCommitFailed
                            )
-from bzrlib.osutils import (kind_marker, isdir,isfile, is_inside_any, 
+from bzrlib.osutils import (kind_marker, isdir,isfile, is_inside_any,
                             is_inside_or_parent_of_any,
+                            minimum_path_selection,
                             quotefn, sha_file, split_lines)
 from bzrlib.testament import Testament
 from bzrlib.trace import mutter, note, warning, is_quiet
@@ -247,7 +248,12 @@
         self.master_branch = None
         self.master_locked = False
         self.rev_id = None
-        self.specific_files = specific_files
+        if specific_files is not None:
+            self.specific_files = sorted(
+                minimum_path_selection(specific_files))
+        else:
+            self.specific_files = None
+        self.specific_file_ids = None
         self.allow_pointless = allow_pointless
         self.recursive = recursive
         self.revprops = revprops
@@ -282,14 +288,17 @@
                 self.config = self.branch.get_config()
 
             # If provided, ensure the specified files are versioned
-            if specific_files is not None:
-                # Note: We don't actually need the IDs here. This routine
+            if self.specific_files is not None:
+                # Note: This routine
                 # is being called because it raises PathNotVerisonedError
-                # as a side effect of finding the IDs.
+                # as a side effect of finding the IDs. We later use the ids we
+                # found as input to the working tree inventory iterator, so we
+                # only consider those ids rather than examining the whole tree
+                # again.
                 # XXX: Dont we have filter_unversioned to do this more
                 # cheaply?
-                tree.find_ids_across_trees(specific_files,
-                                           [self.basis_tree, self.work_tree])
+                self.specific_file_ids = tree.find_ids_across_trees(
+                    specific_files, [self.basis_tree, self.work_tree])
 
             # Setup the progress bar. As the number of files that need to be
             # committed in unknown, progress is reported as stages.
@@ -639,13 +648,19 @@
         # recorded in their previous state. For more details, see
         # https://lists.ubuntu.com/archives/bazaar/2007q3/028476.html.
         if specific_files:
-            for path, new_ie in self.basis_inv.iter_entries():
-                if new_ie.file_id in self.builder.new_inventory:
+            for path, old_ie in self.basis_inv.iter_entries():
+                if old_ie.file_id in self.builder.new_inventory:
                     continue
                 if is_inside_any(specific_files, path):
                     continue
-                ie = new_ie.copy()
-                ie.revision = None
+                if old_ie.kind == 'directory':
+                    self._next_progress_entry()
+                ie = old_ie.copy()
+                # Note: specific file commits after a merge are currently
+                # prohibited. This test is for sanity/safety in case it's
+                # required after that changes.
+                if len(self.parents) > 1:
+                    ie.revision = None
                 if self.builder.record_entry_contents(ie, self.parent_invs, path,
                     self.basis_tree):
                     self.any_entries_changed = True
@@ -671,7 +686,8 @@
         deleted_paths = set()
         work_inv = self.work_tree.inventory
         assert work_inv.root is not None
-        entries = work_inv.iter_entries_by_dir()
+        entries = work_inv.iter_entries_by_dir(
+            specific_file_ids=self.specific_file_ids, yield_parents=True)
         if not self.builder.record_root_entry:
             entries.next()
         for path, existing_ie in entries:
@@ -681,7 +697,6 @@
             kind = existing_ie.kind
             if kind == 'directory':
                 self._next_progress_entry()
-
             # Skip files that have been deleted from the working tree.
             # The deleted files/directories are also recorded so they
             # can be explicitly unversioned later. Note that when a
@@ -689,12 +704,11 @@
             # deleted files matching that filter.
             if is_inside_any(deleted_paths, path):
                 continue
-            if not specific_files or is_inside_any(specific_files, path):
-                if not self.work_tree.has_filename(path):
-                    deleted_paths.add(path)
-                    self.reporter.missing(path)
-                    deleted_ids.append(file_id)
-                    continue
+            if not self.work_tree.has_filename(path):
+                deleted_paths.add(path)
+                self.reporter.missing(path)
+                deleted_ids.append(file_id)
+                continue
             try:
                 kind = self.work_tree.kind(file_id)
                 # TODO: specific_files filtering before nested tree processing
@@ -744,27 +758,17 @@
             report_changes=True):
         "Record the new inventory entry for a path if any."
         # mutter('check %s {%s}', path, file_id)
-        if (not specific_files or 
-            is_inside_or_parent_of_any(specific_files, path)):
-                # mutter('%s selected for commit', path)
-                if definitely_changed or existing_ie is None:
-                    ie = inventory.make_entry(kind, name, parent_id, file_id)
-                else:
-                    ie = existing_ie.copy()
-                    ie.revision = None
+        # mutter('%s selected for commit', path)
+        if definitely_changed or existing_ie is None:
+            ie = inventory.make_entry(kind, name, parent_id, file_id)
         else:
-            # mutter('%s not selected for commit', path)
-            if self.basis_inv.has_id(file_id):
-                ie = self.basis_inv[file_id].copy()
-            else:
-                # this entry is new and not being committed
-                ie = None
-        if ie is not None:
-            if self.builder.record_entry_contents(ie, self.parent_invs,
-                path, self.work_tree):
-                self.any_entries_changed = True
-            if report_changes:
-                self._report_change(ie, path)
+            ie = existing_ie.copy()
+            ie.revision = None
+        if self.builder.record_entry_contents(ie, self.parent_invs, 
+            path, self.work_tree):
+            self.any_entries_changed = True
+        if report_changes:
+            self._report_change(ie, path)
         return ie
 
     def _report_change(self, ie, path):

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-09-17 05:33:56 +0000
+++ b/bzrlib/dirstate.py	2007-09-21 04:22:53 +0000
@@ -1470,7 +1470,8 @@
         kind = inv_entry.kind
         minikind = DirState._kind_to_minikind[kind]
         tree_data = inv_entry.revision
-        assert len(tree_data) > 0, 'empty revision for the inv_entry.'
+        assert tree_data, 'empty revision for the inv_entry %s.' % \
+            inv_entry.file_id
         if kind == 'directory':
             fingerprint = ''
             size = 0

=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py	2007-09-21 03:13:26 +0000
+++ b/bzrlib/inventory.py	2007-09-21 04:22:53 +0000
@@ -1011,7 +1011,8 @@
                 # if we finished all children, pop it off the stack
                 stack.pop()
 
-    def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None):
+    def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None,
+        yield_parents=False):
         """Iterate over the entries in a directory first order.
 
         This returns all entries for a directory before returning
@@ -1019,6 +1020,9 @@
         lexicographically sorted order, and is a hybrid between
         depth-first and breadth-first.
 
+        :param yield_parents: If True, yield the parents from the root leading
+            down to specific_file_ids that have been requested. This has no
+            impact if specific_file_ids is None.
         :return: This yields (path, entry) pairs
         """
         if specific_file_ids:
@@ -1030,13 +1034,14 @@
             if self.root is None:
                 return
             # Optimize a common case
-            if specific_file_ids is not None and len(specific_file_ids) == 1:
+            if (not yield_parents and specific_file_ids is not None and
+                len(specific_file_ids) == 1):
                 file_id = list(specific_file_ids)[0]
                 if file_id in self:
                     yield self.id2path(file_id), self[file_id]
                 return 
             from_dir = self.root
-            if (specific_file_ids is None or 
+            if (specific_file_ids is None or yield_parents or
                 self.root.file_id in specific_file_ids):
                 yield u'', self.root
         elif isinstance(from_dir, basestring):
@@ -1071,7 +1076,8 @@
                 child_relpath = cur_relpath + child_name
 
                 if (specific_file_ids is None or 
-                    child_ie.file_id in specific_file_ids):
+                    child_ie.file_id in specific_file_ids or
+                    (yield_parents and child_ie.file_id in parents)):
                     yield child_relpath, child_ie
 
                 if child_ie.kind == 'directory':

=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py	2007-09-20 09:30:39 +0000
+++ b/bzrlib/osutils.py	2007-09-21 04:22:53 +0000
@@ -87,6 +87,23 @@
         os.chmod(filename, mod)
 
 
+def minimum_path_selection(paths):
+    """Return the smallset subset of paths which are outside paths.
+
+    :param paths: A container (and hence not None) of paths.
+    :return: A set of paths sufficient to include everything in paths via
+        is_inside_any, drawn from the paths parameter.
+    """
+    search_paths = set()
+    paths = set(paths)
+    for path in paths:
+        other_paths = paths.difference([path])
+        if not is_inside_any(other_paths, path):
+            # this is a top level path, we must check it.
+            search_paths.add(path)
+    return search_paths
+
+
 _QUOTE_RE = None
 
 

=== modified file 'bzrlib/tests/inventory_implementations/basics.py'
--- a/bzrlib/tests/inventory_implementations/basics.py	2007-09-03 04:45:38 +0000
+++ b/bzrlib/tests/inventory_implementations/basics.py	2007-09-20 07:03:33 +0000
@@ -202,6 +202,13 @@
             ], [(path, ie.file_id) for path, ie in inv.iter_entries_by_dir(
                 specific_file_ids=('bye-id',))])
 
+        self.assertEqual([
+            ('', 'tree-root'),
+            ('src', 'src-id'),
+            ('src/bye.c', 'bye-id'),
+            ], [(path, ie.file_id) for path, ie in inv.iter_entries_by_dir(
+                specific_file_ids=('bye-id',), yield_parents=True)])
+
     def test_add_recursive(self):
         parent = InventoryDirectory('src-id', 'src', 'tree-root')
         child = InventoryFile('hello-id', 'hello.c', 'src-id')

=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py	2007-09-04 11:00:30 +0000
+++ b/bzrlib/tests/test_osutils.py	2007-09-20 07:03:33 +0000
@@ -477,6 +477,16 @@
         #       osutils.getcwd() renormalize the path.
         self.assertEndsWith(osutils._win32_getcwd(), u'mu-\xb5')
 
+    def test_minimum_path_selection(self):
+        self.assertEqual(set(),
+            osutils.minimum_path_selection([]))
+        self.assertEqual(set(['a', 'b']),
+            osutils.minimum_path_selection(['a', 'b']))
+        self.assertEqual(set(['a/', 'b']),
+            osutils.minimum_path_selection(['a/', 'b']))
+        self.assertEqual(set(['a/', 'b']),
+            osutils.minimum_path_selection(['a/c', 'a/', 'b']))
+
     def test_mkdtemp(self):
         tmpdir = osutils._win32_mkdtemp(dir='.')
         self.assertFalse('\\' in tmpdir)

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2007-09-14 02:25:32 +0000
+++ b/bzrlib/workingtree_4.py	2007-09-20 07:03:33 +0000
@@ -946,12 +946,7 @@
             if not all_versioned:
                 raise errors.PathsNotVersionedError(paths)
         # -- remove redundancy in supplied paths to prevent over-scanning --
-        search_paths = set()
-        for path in paths:
-            other_paths = paths.difference(set([path]))
-            if not osutils.is_inside_any(other_paths, path):
-                # this is a top level path, we must check it.
-                search_paths.add(path)
+        search_paths = osutils.minimum_path_selection(paths)
         # sketch: 
         # for all search_indexs in each path at or under each element of
         # search_paths, if the detail is relocated: add the id, and add the




More information about the bazaar-commits mailing list