Rev 3712: Sha files for the stat cache more lazily rather than on first-examination, allowing less overall sha calculations to occur. in http://people.ubuntu.com/~robertc/baz2.0/iter-changes.less-sha1

Robert Collins robertc at robertcollins.net
Tue Sep 23 07:21:52 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/iter-changes.less-sha1

------------------------------------------------------------
revno: 3712
revision-id: robertc at robertcollins.net-20080923062145-bi6lcbxwgfhcm8j9
parent: robertc at robertcollins.net-20080922051520-uhr3pn61w141kagv
committer: Robert Collins <robertc at robertcollins.net>
branch nick: iter-changes.less-sha1
timestamp: Tue 2008-09-23 16:21:45 +1000
message:
  Sha files for the stat cache more lazily rather than on first-examination, allowing less overall sha calculations to occur.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'NEWS'
--- a/NEWS	2008-09-22 05:15:20 +0000
+++ b/NEWS	2008-09-23 06:21:45 +0000
@@ -36,6 +36,10 @@
 
   API CHANGES:
 
+    * ``dirstate.DirState.update_entry`` will now only calculate the sha1
+      of a file if it is likely to be needed in determining the output
+      of iter_changes. (Robert Collins)
+
   TESTING:
 
     * ``bzrlib.tests.repository_implementations`` has been renamed to

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2008-09-19 06:53:41 +0000
+++ b/bzrlib/dirstate.py	2008-09-23 06:21:45 +0000
@@ -1500,12 +1500,16 @@
                      _pack_stat=pack_stat):
         """Update the entry based on what is actually on disk.
 
+        This function only calculates the sha if it needs to - if the entry is
+        uncachable, or clearly different to the first parent's entry, no sha
+        is calculated, and None is returned.
+
         :param entry: This is the dirblock entry for the file in question.
         :param abspath: The path on disk for this file.
         :param stat_value: (optional) if we already have done a stat on the
             file, re-use it.
-        :return: The sha1 hexdigest of the file (40 bytes) or link target of a
-                symlink.
+        :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+            target of a symlink.
         """
         try:
             minikind = _stat_to_minikind[stat_value.st_mode & 0170000]
@@ -1531,13 +1535,18 @@
         # process this entry.
         link_or_sha1 = None
         if minikind == 'f':
-            link_or_sha1 = self._sha1_file(abspath)
             executable = self._is_executable(stat_value.st_mode,
                                              saved_executable)
             if self._cutoff_time is None:
                 self._sha_cutoff_time()
             if (stat_value.st_mtime < self._cutoff_time
-                and stat_value.st_ctime < self._cutoff_time):
+                and stat_value.st_ctime < self._cutoff_time
+                and len(entry[1]) > 1
+                and entry[1][1][0] != 'a'):
+                    # Could check for size changes for further optimised
+                    # avoidance of sha1's. However the most prominent case of
+                    # over-shaing is during initial add, which this catches.
+                link_or_sha1 = self._sha1_file(abspath)
                 entry[1][0] = ('f', link_or_sha1, stat_value.st_size,
                                executable, packed_stat)
             else:

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2008-09-19 06:53:41 +0000
+++ b/bzrlib/tests/test_dirstate.py	2008-09-23 06:21:45 +0000
@@ -564,20 +564,21 @@
         state.lock_read()
         try:
             entry = state._get_entry(0, path_utf8='a-file')
-            # The current sha1 sum should be empty
-            self.assertEqual('', entry[1][0][1])
+            # The current size should be 0 (default)
+            self.assertEqual(0, entry[1][0][2])
             # We should have a real entry.
             self.assertNotEqual((None, None), entry)
             # Make sure everything is old enough
             state._sha_cutoff_time()
             state._cutoff_time += 10
+            # Change the file length
+            self.build_tree_contents([('a-file', 'shorter')])
             sha1sum = state.update_entry(entry, 'a-file', os.lstat('a-file'))
-            # We should have gotten a real sha1
-            self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
-                             sha1sum)
+            # new file, no cached sha:
+            self.assertEqual(None, sha1sum)
 
             # The dirblock has been updated
-            self.assertEqual(sha1sum, entry[1][0][1])
+            self.assertEqual(7, entry[1][0][2])
             self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                              state._dirblock_state)
 
@@ -593,7 +594,7 @@
         state.lock_read()
         try:
             entry = state._get_entry(0, path_utf8='a-file')
-            self.assertEqual(sha1sum, entry[1][0][1])
+            self.assertEqual(7, entry[1][0][2])
         finally:
             state.unlock()
 
@@ -613,9 +614,8 @@
         try:
             entry = state._get_entry(0, path_utf8='a-file')
             sha1sum = state.update_entry(entry, 'a-file', os.lstat('a-file'))
-            # We should have gotten a real sha1
-            self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
-                             sha1sum)
+            # No sha - too new
+            self.assertEqual(None, sha1sum)
             self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                              state._dirblock_state)
 
@@ -1700,12 +1700,23 @@
         self.assertEqual(oldstat, entry[1][0][4])
 
     def test_update_entry(self):
-        state, entry = self.get_state_with_a()
+        state, _ = self.get_state_with_a()
+        tree = self.make_branch_and_tree('tree')
+        tree.lock_write()
+        empty_revid = tree.commit('empty')
+        self.build_tree(['tree/a'])
+        tree.add(['a'], ['a-id'])
+        with_a_id = tree.commit('with_a')
+        self.addCleanup(tree.unlock)
+        state.set_parent_trees(
+            [(empty_revid, tree.branch.repository.revision_tree(empty_revid))],
+            [])
+        entry = state._get_entry(0, path_utf8='a')
         self.build_tree(['a'])
         # Add one where we don't provide the stat or sha already
         self.assertEqual(('', 'a', 'a-id'), entry[0])
-        self.assertEqual([('f', '', 0, False, dirstate.DirState.NULLSTAT)],
-                         entry[1])
+        self.assertEqual(('f', '', 0, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
         # Flush the buffers to disk
         state.save()
         self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -1715,16 +1726,15 @@
         packed_stat = dirstate.pack_stat(stat_value)
         link_or_sha1 = state.update_entry(entry, abspath='a',
                                           stat_value=stat_value)
-        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
-                         link_or_sha1)
+        self.assertEqual(None, link_or_sha1)
 
-        # The dirblock entry should not cache the file's sha1
-        self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
-                         entry[1])
+        # The dirblock entry should not have cached the file's sha1 (too new)
+        self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
         self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                          state._dirblock_state)
         mode = stat_value.st_mode
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False)], state._log)
+        self.assertEqual([('is_exec', mode, False)], state._log)
 
         state.save()
         self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -1734,45 +1744,55 @@
         # so we will re-read the file. Roll the clock back so the file is
         # guaranteed to look too new.
         state.adjust_time(-10)
+        del state._log[:]
 
         link_or_sha1 = state.update_entry(entry, abspath='a',
                                           stat_value=stat_value)
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                         ], state._log)
-        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
-                         link_or_sha1)
+        self.assertEqual([('is_exec', mode, False)], state._log)
+        self.assertEqual(None, link_or_sha1)
         self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                          state._dirblock_state)
-        self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
-                         entry[1])
+        self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
         state.save()
 
-        # However, if we move the clock forward so the file is considered
-        # "stable", it should just cache the value.
+        # If it is cachable (the clock has moved forward) but new it still
+        # won't calculate the sha or cache it.
         state.adjust_time(+20)
+        del state._log[:]
+        link_or_sha1 = state.update_entry(entry, abspath='a',
+                                          stat_value=stat_value)
+        self.assertEqual(None, link_or_sha1)
+        self.assertEqual([('is_exec', mode, False)], state._log)
+        self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
+
+        # If the file is no longer new, and the clock has been moved forward
+        # sufficiently, it will cache the sha.
+        del state._log[:]
+        state.set_parent_trees(
+            [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+            [])
+        entry = state._get_entry(0, path_utf8='a')
+
         link_or_sha1 = state.update_entry(entry, abspath='a',
                                           stat_value=stat_value)
         self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
                          link_or_sha1)
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                         ], state._log)
-        self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
-                         entry[1])
+        self.assertEqual([('is_exec', mode, False), ('sha1', 'a')],
+                          state._log)
+        self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+                         entry[1][0])
 
         # Subsequent calls will just return the cached value
+        del state._log[:]
         link_or_sha1 = state.update_entry(entry, abspath='a',
                                           stat_value=stat_value)
         self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
                          link_or_sha1)
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                         ], state._log)
-        self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
-                         entry[1])
+        self.assertEqual([], state._log)
+        self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+                         entry[1][0])
 
     def test_update_entry_symlink(self):
         """Update entry should read symlinks."""
@@ -1852,7 +1872,17 @@
                          state._dirblock_state)
 
     def test_update_entry_file_unchanged(self):
-        state, entry = self.get_state_with_a()
+        state, _ = self.get_state_with_a()
+        tree = self.make_branch_and_tree('tree')
+        tree.lock_write()
+        self.build_tree(['tree/a'])
+        tree.add(['a'], ['a-id'])
+        with_a_id = tree.commit('witha')
+        self.addCleanup(tree.unlock)
+        state.set_parent_trees(
+            [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+            [])
+        entry = state._get_entry(0, path_utf8='a')
         self.build_tree(['a'])
         sha1sum = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
         state.adjust_time(+20)
@@ -1867,7 +1897,7 @@
                          state._dirblock_state)
 
     def create_and_test_file(self, state, entry):
-        """Create a file at 'a' and verify the state finds it.
+        """Create a file at 'a' and verify the state finds it during update.
 
         The state should already be versioning *something* at 'a'. This makes
         sure that state.update_entry recognizes it as a file.
@@ -1877,9 +1907,8 @@
         packed_stat = dirstate.pack_stat(stat_value)
 
         link_or_sha1 = self.do_update_entry(state, entry, abspath='a')
-        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
-                         link_or_sha1)
-        self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
+        self.assertEqual(None, link_or_sha1)
+        self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
                          entry[1])
         return packed_stat
 
@@ -2001,11 +2030,12 @@
         self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
                          entry[1])
 
-        # Make the disk object look old enough to cache
+        # Make the disk object look old enough to cache (but it won't cache the sha
+        # as it is a new file).
         state.adjust_time(+20)
-        digest = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
         state.update_entry(entry, abspath='a', stat_value=stat_value)
-        self.assertEqual([('f', digest, 14, True, packed_stat)], entry[1])
+        self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
+            entry[1])
 
 
 class TestPackStat(TestCaseWithTransport):

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2008-09-22 05:15:20 +0000
+++ b/bzrlib/workingtree_4.py	2008-09-23 06:21:45 +0000
@@ -412,7 +412,16 @@
         link_or_sha1 = state.update_entry(entry, file_abspath,
                                           stat_value=stat_value)
         if entry[1][0][0] == 'f':
-            return link_or_sha1
+            if link_or_sha1 is None:
+                file_obj, statvalue = self.get_file_with_stat(file_id, path)
+                try:
+                    sha1 = osutils.sha_file(file_obj)
+                finally:
+                    file_obj.close()
+                self._observed_sha1(file_id, path, (sha1, statvalue))
+                return sha1
+            else:
+                return link_or_sha1
         return None
 
     def _get_inventory(self):
@@ -1845,6 +1854,8 @@
         utf8_decode = cache_utf8._utf8_decode
         _minikind_to_kind = dirstate.DirState._minikind_to_kind
         cmp_by_dirs = dirstate.cmp_by_dirs
+        fstat = os.fstat
+        sha_file = osutils.sha_file
         # NB: show_status depends on being able to pass in non-versioned files
         # and report them as unknown
         # TODO: handle extra trees in the dirstate.
@@ -2078,9 +2089,22 @@
                         if source_minikind != 'f':
                             content_change = True
                         else:
-                            # We could check the size, but we already have the
-                            # sha1 hash.
-                            content_change = (link_or_sha1 != source_details[1])
+                            # If the size is the same, check the sha:
+                            if target_details[2] == source_details[2]:
+                                if link_or_sha1 is None:
+                                    # Stat cache miss:
+                                    file_obj = file(path_info[4], 'rb')
+                                    try:
+                                        statvalue = fstat(file_obj.fileno())
+                                        link_or_sha1 = sha_file(file_obj)
+                                    finally:
+                                        file_obj.close()
+                                    state._observed_sha1(entry, link_or_sha1,
+                                        statvalue)
+                                content_change = (link_or_sha1 != source_details[1])
+                            else:
+                                # Size changed, so must be different
+                                content_change = True
                         # Target details is updated at update_entry time
                         if use_filesystem_for_exec:
                             # We don't need S_ISREG here, because we are sure




More information about the bazaar-commits mailing list