Rev 5637: Fix bug #740932. Transform should update the sha cache. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-build-cache-sha-740932

John Arbash Meinel john at arbash-meinel.com
Mon Apr 4 11:24:59 UTC 2011


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-build-cache-sha-740932

------------------------------------------------------------
revno: 5637
revision-id: john at arbash-meinel.com-20110404112450-7b1sh1f0cbe7sndk
parent: pqm at pqm.ubuntu.com-20110401042049-br5zw2egh3j0ryc2
fixes bug(s): https://launchpad.net/bugs/740932
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-build-cache-sha-740932
timestamp: Mon 2011-04-04 13:24:50 +0200
message:
  Fix bug #740932. Transform should update the sha cache.
  
  This updates TreeTransform so that for files that it creates, it calls tree._observed_sha1
  This should mean that 'build_tree' and even 'merge' will now pre-compute and
  save the sha hash for all files that we touch. Anything that takes a long
  time should end up able to save the sha hash for the newly created files.
  
  It ended up more invasive than originally expected, because of a pre-existing bug
  that stat values stored in the Dirstate did not always match both os.fstat and
  os.lstat on Windows, because in Windows the two are not equal (dev and ino are
  sometimes but not always 0, depending on python version, etc.)
-------------- next part --------------
=== modified file 'bzrlib/_walkdirs_win32.pyx'
--- a/bzrlib/_walkdirs_win32.pyx	2010-02-17 17:11:16 +0000
+++ b/bzrlib/_walkdirs_win32.pyx	2011-04-04 11:24:50 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2008, 2009, 2010 Canonical Ltd
+# Copyright (C) 2008-2011 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -69,9 +69,13 @@
 
 
 import operator
+import os
 import stat
 
-from bzrlib import osutils, _readdir_py
+from bzrlib import _readdir_py
+
+cdef object osutils
+osutils = None
 
 
 cdef class _Win32Stat:
@@ -170,6 +174,8 @@
 
     def top_prefix_to_starting_dir(self, top, prefix=""):
         """See DirReader.top_prefix_to_starting_dir."""
+        if osutils is None:
+            from bzrlib import osutils
         return (osutils.safe_utf8(prefix), None, None, None,
                 osutils.safe_unicode(top))
 
@@ -250,3 +256,37 @@
                 #       earlier Exception, so for now, I'm ignoring this
         dirblock.sort(key=operator.itemgetter(1))
         return dirblock
+
+
+def lstat(path):
+    """Equivalent to os.lstat, except match Win32ReadDir._get_stat_value.
+    """
+    return wrap_stat(os.lstat(path))
+
+
+def fstat(fd):
+    """Like os.fstat, except match Win32ReadDir._get_stat_value
+
+    :seealso: wrap_stat
+    """
+    return wrap_stat(os.fstat(fd))
+
+
+def wrap_stat(st):
+    """Return a _Win32Stat object, based on the given stat result.
+
+    On Windows, os.fstat(open(fname).fileno()) != os.lstat(fname). This is
+    generally because os.lstat and os.fstat differ in what they put into st_ino
+    and st_dev. What gets set where seems to also be dependent on the python
+    version. So we always set it to 0 to avoid worrying about it.
+    """
+    cdef _Win32Stat statvalue
+    statvalue = _Win32Stat()
+    statvalue.st_mode = st.st_mode
+    statvalue.st_ctime = st.st_ctime
+    statvalue.st_mtime = st.st_mtime
+    statvalue.st_atime = st.st_atime
+    statvalue._st_size = st.st_size
+    statvalue.st_ino = 0
+    statvalue.st_dev = 0
+    return statvalue

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2010-09-13 06:36:59 +0000
+++ b/bzrlib/dirstate.py	2011-04-04 11:24:50 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006-2010 Canonical Ltd
+# Copyright (C) 2006-2011 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -265,6 +265,17 @@
         # return '%X.%X' % (int(st.st_mtime), st.st_mode)
 
 
+def _unpack_stat(packed_stat):
+    """Turn a packed_stat back into the stat fields.
+
+    This is meant as a debugging tool, should not be used in real code.
+    """
+    (st_size, st_mtime, st_ctime, st_dev, st_ino,
+     st_mode) = struct.unpack('>LLLLLL', binascii.a2b_base64(packed_stat))
+    return dict(st_size=st_size, st_mtime=st_mtime, st_ctime=st_ctime,
+                st_dev=st_dev, st_ino=st_ino, st_mode=st_mode)
+
+
 class SHA1Provider(object):
     """An interface for getting sha1s of a file."""
 

=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py	2011-01-12 21:06:32 +0000
+++ b/bzrlib/osutils.py	2011-04-04 11:24:50 +0000
@@ -392,6 +392,12 @@
 # These were already lazily imported into local scope
 # mkdtemp = tempfile.mkdtemp
 # rmtree = shutil.rmtree
+lstat = os.lstat
+fstat = os.fstat
+
+def wrap_stat(st):
+    return st
+
 
 MIN_ABS_PATHLENGTH = 1
 
@@ -407,6 +413,14 @@
     getcwd = _win32_getcwd
     mkdtemp = _win32_mkdtemp
     rename = _win32_rename
+    try:
+        from bzrlib import _walkdirs_win32
+    except ImportError:
+        pass
+    else:
+        lstat = _walkdirs_win32.lstat
+        fstat = _walkdirs_win32.fstat
+        wrap_stat = _walkdirs_win32.wrap_stat
 
     MIN_ABS_PATHLENGTH = 3
 

=== modified file 'bzrlib/tests/test_transform.py'
--- a/bzrlib/tests/test_transform.py	2011-01-13 01:29:41 +0000
+++ b/bzrlib/tests/test_transform.py	2011-04-04 11:24:50 +0000
@@ -2062,6 +2062,39 @@
         self.assertEqual('file.moved', target.id2path('lower-id'))
         self.assertEqual('FILE', target.id2path('upper-id'))
 
+    def test_build_tree_observes_sha(self):
+        source = self.make_branch_and_tree('source')
+        self.build_tree(['source/file1', 'source/dir/', 'source/dir/file2'])
+        source.add(['file1', 'dir', 'dir/file2'],
+                   ['file1-id', 'dir-id', 'file2-id'])
+        source.commit('new files')
+        target = self.make_branch_and_tree('target')
+        target.lock_write()
+        self.addCleanup(target.unlock)
+        # We make use of the fact that DirState caches its cutoff time. So we
+        # set the 'safe' time to one minute in the future.
+        state = target.current_dirstate()
+        state._cutoff_time = time.time() + 60
+        build_tree(source.basis_tree(), target)
+        entry1_sha = osutils.sha_file_by_name('source/file1')
+        entry2_sha = osutils.sha_file_by_name('source/dir/file2')
+        # entry[1] is the state information, entry[1][0] is the state of the
+        # working tree, entry[1][0][1] is the sha value for the current working
+        # tree
+        entry1 = state._get_entry(0, path_utf8='file1')
+        self.assertEqual(entry1_sha, entry1[1][0][1])
+        entry1_packed_stat = entry1[1][0][-1]
+        entry2 = state._get_entry(0, path_utf8='dir/file2')
+        self.assertEqual(entry2_sha, entry2[1][0][1])
+        entry2_packed_stat = entry2[1][0][-1]
+        # Now, make sure that we don't have to re-read the content. The
+        # packed_stat should match exactly.
+        self.assertEqual(entry1_sha, target.get_file_sha1('file1-id', 'file1'))
+        self.assertEqual(entry2_sha,
+                         target.get_file_sha1('file2-id', 'dir/file2'))
+        self.assertEqual(entry1_packed_stat, entry1[1][0][-1])
+        self.assertEqual(entry2_packed_stat, entry2[1][0][-1])
+
 
 class TestCommitTransform(tests.TestCaseWithTransport):
 

=== modified file 'bzrlib/transform.py'
--- a/bzrlib/transform.py	2011-01-13 02:43:21 +0000
+++ b/bzrlib/transform.py	2011-04-04 11:24:50 +0000
@@ -105,6 +105,8 @@
         self._new_parent = {}
         # mapping of trans_id with new contents -> new file_kind
         self._new_contents = {}
+        # mapping of trans_id => (sha1 of content, stat_value)
+        self._observed_sha1s = {}
         # Set of trans_ids whose contents will be removed
         self._removed_contents = set()
         # Mapping of trans_id -> new execute-bit value
@@ -1271,12 +1273,26 @@
                 f.close()
                 os.unlink(name)
                 raise
-
-            f.writelines(contents)
+            if contents.__class__ is list:
+                sha_digest = osutils.sha_strings(contents)
+                f.writelines(contents)
+            else:
+                sha_value = osutils.sha()
+                def observe_sha1(contents):
+                    sha_value_update = sha_value.update
+                    for content in contents:
+                        sha_value_update(content)
+                        yield content
+                f.writelines(observe_sha1(contents))
+                sha_digest = sha_value.hexdigest()
         finally:
             f.close()
         self._set_mtime(name)
         self._set_mode(trans_id, mode_id, S_ISREG)
+        # It is unfortunate we have to use lstat instead of fstat, but we just
+        # used utime and chmod on the file, so we need the accurate final
+        # details.
+        self._observed_sha1s[trans_id] = (sha_digest, osutils.lstat(name))
 
     def _read_file_chunks(self, trans_id):
         cur_file = open(self._limbo_name(trans_id), 'rb')
@@ -1341,6 +1357,8 @@
     def cancel_creation(self, trans_id):
         """Cancel the creation of new file contents."""
         del self._new_contents[trans_id]
+        if trans_id in self._observed_sha1s:
+            del self._observed_sha1s[trans_id]
         children = self._limbo_children.get(trans_id)
         # if this is a limbo directory with children, move them before removing
         # the directory
@@ -1702,6 +1720,7 @@
         finally:
             child_pb.finished()
         self._tree.apply_inventory_delta(inventory_delta)
+        self._apply_observed_sha1s()
         self._done = True
         self.finalize()
         return _TransformResults(modified_paths, self.rename_count)
@@ -1827,6 +1846,9 @@
                             raise
                     else:
                         self.rename_count += 1
+                    # TODO: if trans_id in self._observed_sha1s, we should
+                    #       re-stat the final target, since ctime will be
+                    #       updated by the change.
                 if (trans_id in self._new_contents or
                     self.path_changed(trans_id)):
                     if trans_id in self._new_contents:
@@ -1838,6 +1860,21 @@
         self._new_contents.clear()
         return modified_paths
 
+    def _apply_observed_sha1s(self):
+        """After we have finished renaming everything, update observed sha1s
+
+        This has to be done after self._tree.apply_inventory_delta, otherwise
+        it doesn't know anything about the files we are updating. Also, we want
+        to do this as late as possible, so that most entries end up cached.
+        """
+        paths = FinalPaths(self)
+        for trans_id, observed in self._observed_sha1s.iteritems():
+            path = paths.get_path(trans_id)
+            # We could get the file_id, but dirstate prefers to use the path
+            # anyway, and it is 'cheaper' to determine.
+            # file_id = self._new_id[trans_id]
+            self._tree._observed_sha1(None, path, observed)
+
 
 class TransformPreview(DiskTreeTransform):
     """A TreeTransform for generating preview trees.

=== modified file 'bzrlib/workingtree.py'
--- a/bzrlib/workingtree.py	2011-01-13 02:05:17 +0000
+++ b/bzrlib/workingtree.py	2011-04-04 11:24:50 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005-2010 Canonical Ltd
+# Copyright (C) 2005-2011 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -513,7 +513,7 @@
         return self.get_file_with_stat(file_id, path, filtered=filtered)[0]
 
     def get_file_with_stat(self, file_id, path=None, filtered=True,
-        _fstat=os.fstat):
+                           _fstat=osutils.fstat):
         """See Tree.get_file_with_stat."""
         if path is None:
             path = self.id2path(file_id)

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2010-12-02 10:41:05 +0000
+++ b/bzrlib/workingtree_4.py	2011-04-04 11:24:50 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007-2010 Canonical Ltd
+# Copyright (C) 2007-2011 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -369,7 +369,7 @@
         state = self.current_dirstate()
         if stat_value is None:
             try:
-                stat_value = os.lstat(file_abspath)
+                stat_value = osutils.lstat(file_abspath)
             except OSError, e:
                 if e.errno == errno.ENOENT:
                     return None
@@ -478,7 +478,7 @@
             self._must_be_locked()
             if not path:
                 path = self.id2path(file_id)
-            mode = os.lstat(self.abspath(path)).st_mode
+            mode = osutils.lstat(self.abspath(path)).st_mode
             return bool(stat.S_ISREG(mode) and stat.S_IEXEC & mode)
 
     def all_file_ids(self):

=== modified file 'doc/en/release-notes/bzr-2.3.txt'
--- a/doc/en/release-notes/bzr-2.3.txt	2011-04-01 03:11:57 +0000
+++ b/doc/en/release-notes/bzr-2.3.txt	2011-04-04 11:24:50 +0000
@@ -45,10 +45,6 @@
   normally not require running ``bzr whoami`` first.
   (Martin Pool, #616878)
 
-* When reporting a crash without apport, don't print the full list of
-   plugins because it's often too long.
-   (Martin Pool, #716389)
-
 * ``bzr push`` into a repository (that doesn't have a branch), will no
   longer copy all revisions in the repository. Only the ones in the
   ancestry of the source branch, like it does in all other cases.
@@ -62,6 +58,18 @@
   had changed.  Bazaar was attempting to open and lock the master branch
   twice in this case.  (Andrew Bennetts, #733350)
 
+* ``TreeTransform`` now calls ``_observed_sha1`` for content that it
+  creates. This means that ``merge`` and ``checkout`` should be able to
+  cache the sha values, and avoid re-reading the files on the next ``bzr
+  status`` that gets run. Further, on Windows we now properly suppress
+  ``st_dev`` and ``st_ino`` so that commit also avoids re-doing the work.
+  (John Arbash Meinel, #740932)
+
+* When reporting a crash without apport, don't print the full list of
+  plugins because it's often too long.
+  (Martin Pool, #716389)
+
+
 Documentation
 *************
 



More information about the bazaar-commits mailing list