Rev 3723: Integrate less aggressive sha logic with C iter-changes. in http://people.ubuntu.com/~robertc/baz2.0/readdir

Robert Collins robertc at robertcollins.net
Thu Sep 25 02:54:50 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/readdir

------------------------------------------------------------
revno: 3723
revision-id: robertc at robertcollins.net-20080925015442-p9mtzse65w5gy2uv
parent: robertc at robertcollins.net-20080925011130-2ct9kz8v1cvptavi
parent: robertc at robertcollins.net-20080923062145-bi6lcbxwgfhcm8j9
committer: Robert Collins <robertc at robertcollins.net>
branch nick: iter-changes.less-sha1
timestamp: Thu 2008-09-25 11:54:42 +1000
message:
  Integrate less aggressive sha logic with C iter-changes.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/_dirstate_helpers_c.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test__dirstate_helpers.py test_dirstate_helper-20070504035751-jsbn00xodv0y1eve-2
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
    ------------------------------------------------------------
    revno: 3696.36.1
    revision-id: robertc at robertcollins.net-20080923062145-bi6lcbxwgfhcm8j9
    parent: robertc at robertcollins.net-20080922051520-uhr3pn61w141kagv
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: iter-changes.less-sha1
    timestamp: Tue 2008-09-23 16:21:45 +1000
    message:
      Sha files for the stat cache more lazily rather than on first-examination, allowing less overall sha calculations to occur.
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
      bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
      bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'NEWS'
--- a/NEWS	2008-09-25 01:11:30 +0000
+++ b/NEWS	2008-09-25 01:54:42 +0000
@@ -79,6 +79,10 @@
       its result tuple - an optional file system hash for the hash cache
       to use. (Robert Collins)
 
+    * ``dirstate.DirState.update_entry`` will now only calculate the sha1
+      of a file if it is likely to be needed in determining the output
+      of iter_changes. (Robert Collins)
+
   TESTING:
 
     * ``bzrlib.tests.repository_implementations`` has been renamed to

=== modified file 'bzrlib/_dirstate_helpers_c.pyx'
--- a/bzrlib/_dirstate_helpers_c.pyx	2008-09-24 03:08:14 +0000
+++ b/bzrlib/_dirstate_helpers_c.pyx	2008-09-25 01:54:42 +0000
@@ -800,12 +800,16 @@
 def update_entry(self, entry, abspath, stat_value):
     """Update the entry based on what is actually on disk.
 
+    This function only calculates the sha if it needs to - if the entry is
+    uncachable, or clearly different to the first parent's entry, no sha
+    is calculated, and None is returned.
+
     :param entry: This is the dirblock entry for the file in question.
     :param abspath: The path on disk for this file.
     :param stat_value: (optional) if we already have done a stat on the
         file, re-use it.
-    :return: The sha1 hexdigest of the file (40 bytes) or link target of a
-            symlink.
+    :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+        target of a symlink.
     """
     return _update_entry(self, entry, abspath, stat_value)
 
@@ -813,12 +817,16 @@
 cdef _update_entry(self, entry, abspath, stat_value):
     """Update the entry based on what is actually on disk.
 
+    This function only calculates the sha if it needs to - if the entry is
+    uncachable, or clearly different to the first parent's entry, no sha
+    is calculated, and None is returned.
+
     :param self: The dirstate object this is operating on.
     :param entry: This is the dirblock entry for the file in question.
     :param abspath: The path on disk for this file.
     :param stat_value: The stat value done on the path.
-    :return: The sha1 hexdigest of the file (40 bytes) or link target of a
-            symlink.
+    :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+        target of a symlink.
     """
     # TODO - require pyrex 0.9.8, then use a pyd file to define access to the
     # _st mode of the compiled stat objects.
@@ -857,13 +865,18 @@
     # process this entry.
     link_or_sha1 = None
     if minikind == c'f':
-        link_or_sha1 = self._sha1_file(abspath)
         executable = self._is_executable(stat_value.st_mode,
                                          saved_executable)
         if self._cutoff_time is None:
             self._sha_cutoff_time()
         if (stat_value.st_mtime < self._cutoff_time
-            and stat_value.st_ctime < self._cutoff_time):
+            and stat_value.st_ctime < self._cutoff_time
+            and len(entry[1]) > 1
+            and entry[1][1][0] != 'a'):
+                # Could check for size changes for further optimised
+                # avoidance of sha1's. However the most prominent case of
+                # over-shaing is during initial add, which this catches.
+            link_or_sha1 = self._sha1_file(abspath)
             entry[1][0] = ('f', link_or_sha1, stat_value.st_size,
                            executable, packed_stat)
         else:
@@ -974,6 +987,8 @@
     cdef object root_dir_info
     cdef object bisect_left
     cdef object pathjoin
+    cdef object fstat
+    cdef object sha_file
 
     def __init__(self, include_unchanged, use_filesystem_for_exec,
         search_specific_files, state, source_index, target_index,
@@ -1021,6 +1036,8 @@
         self.root_dir_info = None
         self.bisect_left = bisect.bisect_left
         self.pathjoin = osutils.pathjoin
+        self.fstat = os.fstat
+        self.sha_file = osutils.sha_file
 
     cdef _process_entry(self, entry, path_info):
         """Compare an entry and real disk to generate delta information.
@@ -1117,9 +1134,24 @@
                     if source_minikind != c'f':
                         content_change = 1
                     else:
-                        # We could check the size, but we already have the
-                        # sha1 hash.
-                        content_change = (link_or_sha1 != source_details[1])
+                        # If the size is the same, check the sha:
+                        if target_details[2] == source_details[2]:
+                            if link_or_sha1 is None:
+                                # Stat cache miss:
+                                file_obj = file(path_info[4], 'rb')
+                                try:
+                                    # XXX: TODO: Use lower level file IO rather
+                                    # than python objects for sha-misses.
+                                    statvalue = self.fstat(file_obj.fileno())
+                                    link_or_sha1 = self.sha_file(file_obj)
+                                finally:
+                                    file_obj.close()
+                                self.state._observed_sha1(entry, link_or_sha1,
+                                    statvalue)
+                            content_change = (link_or_sha1 != source_details[1])
+                        else:
+                            # Size changed, so must be different
+                            content_change = 1
                     # Target details is updated at update_entry time
                     if self.use_filesystem_for_exec:
                         # We don't need S_ISREG here, because we are sure

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2008-09-25 01:11:30 +0000
+++ b/bzrlib/dirstate.py	2008-09-25 01:54:42 +0000
@@ -2764,12 +2764,16 @@
                  _pack_stat=pack_stat):
     """Update the entry based on what is actually on disk.
 
+    This function only calculates the sha if it needs to - if the entry is
+    uncachable, or clearly different to the first parent's entry, no sha
+    is calculated, and None is returned.
+
     :param state: The dirstate this entry is in.
     :param entry: This is the dirblock entry for the file in question.
     :param abspath: The path on disk for this file.
     :param stat_value: The stat value done on the path.
-    :return: The sha1 hexdigest of the file (40 bytes) or link target of a
-            symlink.
+    :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+        target of a symlink.
     """
     try:
         minikind = _stat_to_minikind[stat_value.st_mode & 0170000]
@@ -2795,13 +2799,18 @@
     # process this entry.
     link_or_sha1 = None
     if minikind == 'f':
-        link_or_sha1 = state._sha1_file(abspath)
         executable = state._is_executable(stat_value.st_mode,
                                          saved_executable)
         if state._cutoff_time is None:
             state._sha_cutoff_time()
         if (stat_value.st_mtime < state._cutoff_time
-            and stat_value.st_ctime < state._cutoff_time):
+            and stat_value.st_ctime < state._cutoff_time
+            and len(entry[1]) > 1
+            and entry[1][1][0] != 'a'):
+                # Could check for size changes for further optimised
+                # avoidance of sha1's. However the most prominent case of
+                # over-shaing is during initial add, which this catches.
+            link_or_sha1 = state._sha1_file(abspath)
             entry[1][0] = ('f', link_or_sha1, stat_value.st_size,
                            executable, packed_stat)
         else:
@@ -2955,9 +2964,22 @@
                     if source_minikind != 'f':
                         content_change = True
                     else:
-                        # We could check the size, but we already have the
-                        # sha1 hash.
-                        content_change = (link_or_sha1 != source_details[1])
+                        # If the size is the same, check the sha:
+                        if target_details[2] == source_details[2]:
+                            if link_or_sha1 is None:
+                                # Stat cache miss:
+                                file_obj = file(path_info[4], 'rb')
+                                try:
+                                    statvalue = fstat(file_obj.fileno())
+                                    link_or_sha1 = sha_file(file_obj)
+                                finally:
+                                    file_obj.close()
+                                self.state._observed_sha1(entry, link_or_sha1,
+                                    statvalue)
+                            content_change = (link_or_sha1 != source_details[1])
+                        else:
+                            # Size changed, so must be different
+                            content_change = True
                     # Target details is updated at update_entry time
                     if self.use_filesystem_for_exec:
                         # We don't need S_ISREG here, because we are sure
@@ -3131,6 +3153,8 @@
         """Iterate over the changes."""
         utf8_decode = cache_utf8._utf8_decode
         _cmp_by_dirs = cmp_by_dirs
+        fstat = os.fstat
+        sha_file = osutils.sha_file
         _process_entry = self._process_entry
         uninteresting = self.uninteresting
         search_specific_files = self.search_specific_files

=== modified file 'bzrlib/tests/test__dirstate_helpers.py'
--- a/bzrlib/tests/test__dirstate_helpers.py	2008-09-25 01:11:30 +0000
+++ b/bzrlib/tests/test__dirstate_helpers.py	2008-09-25 01:54:42 +0000
@@ -824,12 +824,23 @@
         self.assertEqual(oldstat, entry[1][0][4])
 
     def test_update_entry(self):
-        state, entry = self.get_state_with_a()
+        state, _ = self.get_state_with_a()
+        tree = self.make_branch_and_tree('tree')
+        tree.lock_write()
+        empty_revid = tree.commit('empty')
+        self.build_tree(['tree/a'])
+        tree.add(['a'], ['a-id'])
+        with_a_id = tree.commit('with_a')
+        self.addCleanup(tree.unlock)
+        state.set_parent_trees(
+            [(empty_revid, tree.branch.repository.revision_tree(empty_revid))],
+            [])
+        entry = state._get_entry(0, path_utf8='a')
         self.build_tree(['a'])
         # Add one where we don't provide the stat or sha already
         self.assertEqual(('', 'a', 'a-id'), entry[0])
-        self.assertEqual([('f', '', 0, False, dirstate.DirState.NULLSTAT)],
-                         entry[1])
+        self.assertEqual(('f', '', 0, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
         # Flush the buffers to disk
         state.save()
         self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -839,16 +850,15 @@
         packed_stat = dirstate.pack_stat(stat_value)
         link_or_sha1 = self.update_entry(state, entry, abspath='a',
                                           stat_value=stat_value)
-        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
-                         link_or_sha1)
+        self.assertEqual(None, link_or_sha1)
 
-        # The dirblock entry should not cache the file's sha1
-        self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
-                         entry[1])
+        # The dirblock entry should not have cached the file's sha1 (too new)
+        self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
         self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                          state._dirblock_state)
         mode = stat_value.st_mode
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False)], state._log)
+        self.assertEqual([('is_exec', mode, False)], state._log)
 
         state.save()
         self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -858,45 +868,55 @@
         # so we will re-read the file. Roll the clock back so the file is
         # guaranteed to look too new.
         state.adjust_time(-10)
+        del state._log[:]
 
         link_or_sha1 = self.update_entry(state, entry, abspath='a',
                                           stat_value=stat_value)
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                         ], state._log)
-        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
-                         link_or_sha1)
+        self.assertEqual([('is_exec', mode, False)], state._log)
+        self.assertEqual(None, link_or_sha1)
         self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                          state._dirblock_state)
-        self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
-                         entry[1])
+        self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
         state.save()
 
-        # However, if we move the clock forward so the file is considered
-        # "stable", it should just cache the value.
+        # If it is cachable (the clock has moved forward) but new it still
+        # won't calculate the sha or cache it.
         state.adjust_time(+20)
+        del state._log[:]
+        link_or_sha1 = dirstate.update_entry(state, entry, abspath='a',
+                                          stat_value=stat_value)
+        self.assertEqual(None, link_or_sha1)
+        self.assertEqual([('is_exec', mode, False)], state._log)
+        self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+                         entry[1][0])
+
+        # If the file is no longer new, and the clock has been moved forward
+        # sufficiently, it will cache the sha.
+        del state._log[:]
+        state.set_parent_trees(
+            [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+            [])
+        entry = state._get_entry(0, path_utf8='a')
+
         link_or_sha1 = self.update_entry(state, entry, abspath='a',
                                           stat_value=stat_value)
         self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
                          link_or_sha1)
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                         ], state._log)
-        self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
-                         entry[1])
+        self.assertEqual([('is_exec', mode, False), ('sha1', 'a')],
+                          state._log)
+        self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+                         entry[1][0])
 
         # Subsequent calls will just return the cached value
+        del state._log[:]
         link_or_sha1 = self.update_entry(state, entry, abspath='a',
                                           stat_value=stat_value)
         self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
                          link_or_sha1)
-        self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                          ('sha1', 'a'), ('is_exec', mode, False),
-                         ], state._log)
-        self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
-                         entry[1])
+        self.assertEqual([], state._log)
+        self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+                         entry[1][0])
 
     def test_update_entry_symlink(self):
         """Update entry should read symlinks."""
@@ -976,7 +996,17 @@
                          state._dirblock_state)
 
     def test_update_entry_file_unchanged(self):
-        state, entry = self.get_state_with_a()
+        state, _ = self.get_state_with_a()
+        tree = self.make_branch_and_tree('tree')
+        tree.lock_write()
+        self.build_tree(['tree/a'])
+        tree.add(['a'], ['a-id'])
+        with_a_id = tree.commit('witha')
+        self.addCleanup(tree.unlock)
+        state.set_parent_trees(
+            [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+            [])
+        entry = state._get_entry(0, path_utf8='a')
         self.build_tree(['a'])
         sha1sum = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
         state.adjust_time(+20)
@@ -991,7 +1021,7 @@
                          state._dirblock_state)
 
     def create_and_test_file(self, state, entry):
-        """Create a file at 'a' and verify the state finds it.
+        """Create a file at 'a' and verify the state finds it during update.
 
         The state should already be versioning *something* at 'a'. This makes
         sure that state.update_entry recognizes it as a file.
@@ -1001,9 +1031,8 @@
         packed_stat = dirstate.pack_stat(stat_value)
 
         link_or_sha1 = self.do_update_entry(state, entry, abspath='a')
-        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
-                         link_or_sha1)
-        self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
+        self.assertEqual(None, link_or_sha1)
+        self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
                          entry[1])
         return packed_stat
 
@@ -1125,11 +1154,13 @@
         self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
                          entry[1])
 
-        # Make the disk object look old enough to cache
+        # Make the disk object look old enough to cache (but it won't cache the sha
+        # as it is a new file).
         state.adjust_time(+20)
         digest = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
         self.update_entry(state, entry, abspath='a', stat_value=stat_value)
-        self.assertEqual([('f', digest, 14, True, packed_stat)], entry[1])
+        self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
+            entry[1])
 
 
 class TestCompiledUpdateEntry(TestUpdateEntry):

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2008-09-25 01:11:30 +0000
+++ b/bzrlib/tests/test_dirstate.py	2008-09-25 01:54:42 +0000
@@ -563,20 +563,22 @@
         state.lock_read()
         try:
             entry = state._get_entry(0, path_utf8='a-file')
-            # The current sha1 sum should be empty
-            self.assertEqual('', entry[1][0][1])
+            # The current size should be 0 (default)
+            self.assertEqual(0, entry[1][0][2])
             # We should have a real entry.
             self.assertNotEqual((None, None), entry)
             # Make sure everything is old enough
             state._sha_cutoff_time()
             state._cutoff_time += 10
-            sha1sum = dirstate.update_entry(state, entry, 'a-file', os.lstat('a-file'))
-            # We should have gotten a real sha1
-            self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
-                             sha1sum)
+            # Change the file length
+            self.build_tree_contents([('a-file', 'shorter')])
+            sha1sum = dirstate.update_entry(state, entry, 'a-file',
+                os.lstat('a-file'))
+            # new file, no cached sha:
+            self.assertEqual(None, sha1sum)
 
             # The dirblock has been updated
-            self.assertEqual(sha1sum, entry[1][0][1])
+            self.assertEqual(7, entry[1][0][2])
             self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                              state._dirblock_state)
 
@@ -592,7 +594,7 @@
         state.lock_read()
         try:
             entry = state._get_entry(0, path_utf8='a-file')
-            self.assertEqual(sha1sum, entry[1][0][1])
+            self.assertEqual(7, entry[1][0][2])
         finally:
             state.unlock()
 
@@ -611,10 +613,10 @@
         state.lock_read()
         try:
             entry = state._get_entry(0, path_utf8='a-file')
-            sha1sum = dirstate.update_entry(state, entry, 'a-file', os.lstat('a-file'))
-            # We should have gotten a real sha1
-            self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
-                             sha1sum)
+            sha1sum = dirstate.update_entry(state, entry, 'a-file',
+                os.lstat('a-file'))
+            # No sha - too new
+            self.assertEqual(None, sha1sum)
             self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
                              state._dirblock_state)
 

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2008-09-25 01:11:30 +0000
+++ b/bzrlib/workingtree_4.py	2008-09-25 01:54:42 +0000
@@ -407,7 +407,16 @@
         link_or_sha1 = dirstate.update_entry(state, entry, file_abspath,
             stat_value=stat_value)
         if entry[1][0][0] == 'f':
-            return link_or_sha1
+            if link_or_sha1 is None:
+                file_obj, statvalue = self.get_file_with_stat(file_id, path)
+                try:
+                    sha1 = osutils.sha_file(file_obj)
+                finally:
+                    file_obj.close()
+                self._observed_sha1(file_id, path, (sha1, statvalue))
+                return sha1
+            else:
+                return link_or_sha1
         return None
 
     def _get_inventory(self):




More information about the bazaar-commits mailing list