Rev 3712: Sha files for the stat cache more lazily rather than on first-examination, allowing less overall sha calculations to occur. in http://people.ubuntu.com/~robertc/baz2.0/iter-changes.less-sha1
Robert Collins
robertc at robertcollins.net
Tue Sep 23 07:21:52 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/iter-changes.less-sha1
------------------------------------------------------------
revno: 3712
revision-id: robertc at robertcollins.net-20080923062145-bi6lcbxwgfhcm8j9
parent: robertc at robertcollins.net-20080922051520-uhr3pn61w141kagv
committer: Robert Collins <robertc at robertcollins.net>
branch nick: iter-changes.less-sha1
timestamp: Tue 2008-09-23 16:21:45 +1000
message:
Sha files for the stat cache more lazily rather than on first-examination, allowing less overall sha calculations to occur.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
bzrlib/workingtree_4.py workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'NEWS'
--- a/NEWS 2008-09-22 05:15:20 +0000
+++ b/NEWS 2008-09-23 06:21:45 +0000
@@ -36,6 +36,10 @@
API CHANGES:
+ * ``dirstate.DirState.update_entry`` will now only calculate the sha1
+ of a file if it is likely to be needed in determining the output
+ of iter_changes. (Robert Collins)
+
TESTING:
* ``bzrlib.tests.repository_implementations`` has been renamed to
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2008-09-19 06:53:41 +0000
+++ b/bzrlib/dirstate.py 2008-09-23 06:21:45 +0000
@@ -1500,12 +1500,16 @@
_pack_stat=pack_stat):
"""Update the entry based on what is actually on disk.
+ This function only calculates the sha if it needs to - if the entry is
+ uncachable, or clearly different to the first parent's entry, no sha
+ is calculated, and None is returned.
+
:param entry: This is the dirblock entry for the file in question.
:param abspath: The path on disk for this file.
:param stat_value: (optional) if we already have done a stat on the
file, re-use it.
- :return: The sha1 hexdigest of the file (40 bytes) or link target of a
- symlink.
+ :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+ target of a symlink.
"""
try:
minikind = _stat_to_minikind[stat_value.st_mode & 0170000]
@@ -1531,13 +1535,18 @@
# process this entry.
link_or_sha1 = None
if minikind == 'f':
- link_or_sha1 = self._sha1_file(abspath)
executable = self._is_executable(stat_value.st_mode,
saved_executable)
if self._cutoff_time is None:
self._sha_cutoff_time()
if (stat_value.st_mtime < self._cutoff_time
- and stat_value.st_ctime < self._cutoff_time):
+ and stat_value.st_ctime < self._cutoff_time
+ and len(entry[1]) > 1
+ and entry[1][1][0] != 'a'):
+ # Could check for size changes for further optimised
+ # avoidance of sha1's. However the most prominent case of
+ # over-shaing is during initial add, which this catches.
+ link_or_sha1 = self._sha1_file(abspath)
entry[1][0] = ('f', link_or_sha1, stat_value.st_size,
executable, packed_stat)
else:
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2008-09-19 06:53:41 +0000
+++ b/bzrlib/tests/test_dirstate.py 2008-09-23 06:21:45 +0000
@@ -564,20 +564,21 @@
state.lock_read()
try:
entry = state._get_entry(0, path_utf8='a-file')
- # The current sha1 sum should be empty
- self.assertEqual('', entry[1][0][1])
+ # The current size should be 0 (default)
+ self.assertEqual(0, entry[1][0][2])
# We should have a real entry.
self.assertNotEqual((None, None), entry)
# Make sure everything is old enough
state._sha_cutoff_time()
state._cutoff_time += 10
+ # Change the file length
+ self.build_tree_contents([('a-file', 'shorter')])
sha1sum = state.update_entry(entry, 'a-file', os.lstat('a-file'))
- # We should have gotten a real sha1
- self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
- sha1sum)
+ # new file, no cached sha:
+ self.assertEqual(None, sha1sum)
# The dirblock has been updated
- self.assertEqual(sha1sum, entry[1][0][1])
+ self.assertEqual(7, entry[1][0][2])
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
@@ -593,7 +594,7 @@
state.lock_read()
try:
entry = state._get_entry(0, path_utf8='a-file')
- self.assertEqual(sha1sum, entry[1][0][1])
+ self.assertEqual(7, entry[1][0][2])
finally:
state.unlock()
@@ -613,9 +614,8 @@
try:
entry = state._get_entry(0, path_utf8='a-file')
sha1sum = state.update_entry(entry, 'a-file', os.lstat('a-file'))
- # We should have gotten a real sha1
- self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
- sha1sum)
+ # No sha - too new
+ self.assertEqual(None, sha1sum)
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
@@ -1700,12 +1700,23 @@
self.assertEqual(oldstat, entry[1][0][4])
def test_update_entry(self):
- state, entry = self.get_state_with_a()
+ state, _ = self.get_state_with_a()
+ tree = self.make_branch_and_tree('tree')
+ tree.lock_write()
+ empty_revid = tree.commit('empty')
+ self.build_tree(['tree/a'])
+ tree.add(['a'], ['a-id'])
+ with_a_id = tree.commit('with_a')
+ self.addCleanup(tree.unlock)
+ state.set_parent_trees(
+ [(empty_revid, tree.branch.repository.revision_tree(empty_revid))],
+ [])
+ entry = state._get_entry(0, path_utf8='a')
self.build_tree(['a'])
# Add one where we don't provide the stat or sha already
self.assertEqual(('', 'a', 'a-id'), entry[0])
- self.assertEqual([('f', '', 0, False, dirstate.DirState.NULLSTAT)],
- entry[1])
+ self.assertEqual(('f', '', 0, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
# Flush the buffers to disk
state.save()
self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -1715,16 +1726,15 @@
packed_stat = dirstate.pack_stat(stat_value)
link_or_sha1 = state.update_entry(entry, abspath='a',
stat_value=stat_value)
- self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
- link_or_sha1)
+ self.assertEqual(None, link_or_sha1)
- # The dirblock entry should not cache the file's sha1
- self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
- entry[1])
+ # The dirblock entry should not have cached the file's sha1 (too new)
+ self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
mode = stat_value.st_mode
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False)], state._log)
+ self.assertEqual([('is_exec', mode, False)], state._log)
state.save()
self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -1734,45 +1744,55 @@
# so we will re-read the file. Roll the clock back so the file is
# guaranteed to look too new.
state.adjust_time(-10)
+ del state._log[:]
link_or_sha1 = state.update_entry(entry, abspath='a',
stat_value=stat_value)
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ], state._log)
- self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
- link_or_sha1)
+ self.assertEqual([('is_exec', mode, False)], state._log)
+ self.assertEqual(None, link_or_sha1)
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
- self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
- entry[1])
+ self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
state.save()
- # However, if we move the clock forward so the file is considered
- # "stable", it should just cache the value.
+ # If it is cachable (the clock has moved forward) but new it still
+ # won't calculate the sha or cache it.
state.adjust_time(+20)
+ del state._log[:]
+ link_or_sha1 = state.update_entry(entry, abspath='a',
+ stat_value=stat_value)
+ self.assertEqual(None, link_or_sha1)
+ self.assertEqual([('is_exec', mode, False)], state._log)
+ self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
+
+ # If the file is no longer new, and the clock has been moved forward
+ # sufficiently, it will cache the sha.
+ del state._log[:]
+ state.set_parent_trees(
+ [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+ [])
+ entry = state._get_entry(0, path_utf8='a')
+
link_or_sha1 = state.update_entry(entry, abspath='a',
stat_value=stat_value)
self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
link_or_sha1)
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ], state._log)
- self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
- entry[1])
+ self.assertEqual([('is_exec', mode, False), ('sha1', 'a')],
+ state._log)
+ self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+ entry[1][0])
# Subsequent calls will just return the cached value
+ del state._log[:]
link_or_sha1 = state.update_entry(entry, abspath='a',
stat_value=stat_value)
self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
link_or_sha1)
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ], state._log)
- self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
- entry[1])
+ self.assertEqual([], state._log)
+ self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+ entry[1][0])
def test_update_entry_symlink(self):
"""Update entry should read symlinks."""
@@ -1852,7 +1872,17 @@
state._dirblock_state)
def test_update_entry_file_unchanged(self):
- state, entry = self.get_state_with_a()
+ state, _ = self.get_state_with_a()
+ tree = self.make_branch_and_tree('tree')
+ tree.lock_write()
+ self.build_tree(['tree/a'])
+ tree.add(['a'], ['a-id'])
+ with_a_id = tree.commit('witha')
+ self.addCleanup(tree.unlock)
+ state.set_parent_trees(
+ [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+ [])
+ entry = state._get_entry(0, path_utf8='a')
self.build_tree(['a'])
sha1sum = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
state.adjust_time(+20)
@@ -1867,7 +1897,7 @@
state._dirblock_state)
def create_and_test_file(self, state, entry):
- """Create a file at 'a' and verify the state finds it.
+ """Create a file at 'a' and verify the state finds it during update.
The state should already be versioning *something* at 'a'. This makes
sure that state.update_entry recognizes it as a file.
@@ -1877,9 +1907,8 @@
packed_stat = dirstate.pack_stat(stat_value)
link_or_sha1 = self.do_update_entry(state, entry, abspath='a')
- self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
- link_or_sha1)
- self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
+ self.assertEqual(None, link_or_sha1)
+ self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
entry[1])
return packed_stat
@@ -2001,11 +2030,12 @@
self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
entry[1])
- # Make the disk object look old enough to cache
+ # Make the disk object look old enough to cache (but it won't cache the sha
+ # as it is a new file).
state.adjust_time(+20)
- digest = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
state.update_entry(entry, abspath='a', stat_value=stat_value)
- self.assertEqual([('f', digest, 14, True, packed_stat)], entry[1])
+ self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
+ entry[1])
class TestPackStat(TestCaseWithTransport):
=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py 2008-09-22 05:15:20 +0000
+++ b/bzrlib/workingtree_4.py 2008-09-23 06:21:45 +0000
@@ -412,7 +412,16 @@
link_or_sha1 = state.update_entry(entry, file_abspath,
stat_value=stat_value)
if entry[1][0][0] == 'f':
- return link_or_sha1
+ if link_or_sha1 is None:
+ file_obj, statvalue = self.get_file_with_stat(file_id, path)
+ try:
+ sha1 = osutils.sha_file(file_obj)
+ finally:
+ file_obj.close()
+ self._observed_sha1(file_id, path, (sha1, statvalue))
+ return sha1
+ else:
+ return link_or_sha1
return None
def _get_inventory(self):
@@ -1845,6 +1854,8 @@
utf8_decode = cache_utf8._utf8_decode
_minikind_to_kind = dirstate.DirState._minikind_to_kind
cmp_by_dirs = dirstate.cmp_by_dirs
+ fstat = os.fstat
+ sha_file = osutils.sha_file
# NB: show_status depends on being able to pass in non-versioned files
# and report them as unknown
# TODO: handle extra trees in the dirstate.
@@ -2078,9 +2089,22 @@
if source_minikind != 'f':
content_change = True
else:
- # We could check the size, but we already have the
- # sha1 hash.
- content_change = (link_or_sha1 != source_details[1])
+ # If the size is the same, check the sha:
+ if target_details[2] == source_details[2]:
+ if link_or_sha1 is None:
+ # Stat cache miss:
+ file_obj = file(path_info[4], 'rb')
+ try:
+ statvalue = fstat(file_obj.fileno())
+ link_or_sha1 = sha_file(file_obj)
+ finally:
+ file_obj.close()
+ state._observed_sha1(entry, link_or_sha1,
+ statvalue)
+ content_change = (link_or_sha1 != source_details[1])
+ else:
+ # Size changed, so must be different
+ content_change = True
# Target details is updated at update_entry time
if use_filesystem_for_exec:
# We don't need S_ISREG here, because we are sure
More information about the bazaar-commits
mailing list