Rev 3723: Integrate less aggressive sha logic with C iter-changes. in http://people.ubuntu.com/~robertc/baz2.0/readdir
Robert Collins
robertc at robertcollins.net
Thu Sep 25 02:54:50 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/readdir
------------------------------------------------------------
revno: 3723
revision-id: robertc at robertcollins.net-20080925015442-p9mtzse65w5gy2uv
parent: robertc at robertcollins.net-20080925011130-2ct9kz8v1cvptavi
parent: robertc at robertcollins.net-20080923062145-bi6lcbxwgfhcm8j9
committer: Robert Collins <robertc at robertcollins.net>
branch nick: iter-changes.less-sha1
timestamp: Thu 2008-09-25 11:54:42 +1000
message:
Integrate less aggressive sha logic with C iter-changes.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/_dirstate_helpers_c.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/test__dirstate_helpers.py test_dirstate_helper-20070504035751-jsbn00xodv0y1eve-2
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
bzrlib/workingtree_4.py workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
------------------------------------------------------------
revno: 3696.36.1
revision-id: robertc at robertcollins.net-20080923062145-bi6lcbxwgfhcm8j9
parent: robertc at robertcollins.net-20080922051520-uhr3pn61w141kagv
committer: Robert Collins <robertc at robertcollins.net>
branch nick: iter-changes.less-sha1
timestamp: Tue 2008-09-23 16:21:45 +1000
message:
Sha files for the stat cache more lazily rather than on first-examination, allowing less overall sha calculations to occur.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
bzrlib/workingtree_4.py workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'NEWS'
--- a/NEWS 2008-09-25 01:11:30 +0000
+++ b/NEWS 2008-09-25 01:54:42 +0000
@@ -79,6 +79,10 @@
its result tuple - an optional file system hash for the hash cache
to use. (Robert Collins)
+ * ``dirstate.DirState.update_entry`` will now only calculate the sha1
+ of a file if it is likely to be needed in determining the output
+ of iter_changes. (Robert Collins)
+
TESTING:
* ``bzrlib.tests.repository_implementations`` has been renamed to
=== modified file 'bzrlib/_dirstate_helpers_c.pyx'
--- a/bzrlib/_dirstate_helpers_c.pyx 2008-09-24 03:08:14 +0000
+++ b/bzrlib/_dirstate_helpers_c.pyx 2008-09-25 01:54:42 +0000
@@ -800,12 +800,16 @@
def update_entry(self, entry, abspath, stat_value):
"""Update the entry based on what is actually on disk.
+ This function only calculates the sha if it needs to - if the entry is
+ uncachable, or clearly different to the first parent's entry, no sha
+ is calculated, and None is returned.
+
:param entry: This is the dirblock entry for the file in question.
:param abspath: The path on disk for this file.
:param stat_value: (optional) if we already have done a stat on the
file, re-use it.
- :return: The sha1 hexdigest of the file (40 bytes) or link target of a
- symlink.
+ :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+ target of a symlink.
"""
return _update_entry(self, entry, abspath, stat_value)
@@ -813,12 +817,16 @@
cdef _update_entry(self, entry, abspath, stat_value):
"""Update the entry based on what is actually on disk.
+ This function only calculates the sha if it needs to - if the entry is
+ uncachable, or clearly different to the first parent's entry, no sha
+ is calculated, and None is returned.
+
:param self: The dirstate object this is operating on.
:param entry: This is the dirblock entry for the file in question.
:param abspath: The path on disk for this file.
:param stat_value: The stat value done on the path.
- :return: The sha1 hexdigest of the file (40 bytes) or link target of a
- symlink.
+ :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+ target of a symlink.
"""
# TODO - require pyrex 0.9.8, then use a pyd file to define access to the
# _st mode of the compiled stat objects.
@@ -857,13 +865,18 @@
# process this entry.
link_or_sha1 = None
if minikind == c'f':
- link_or_sha1 = self._sha1_file(abspath)
executable = self._is_executable(stat_value.st_mode,
saved_executable)
if self._cutoff_time is None:
self._sha_cutoff_time()
if (stat_value.st_mtime < self._cutoff_time
- and stat_value.st_ctime < self._cutoff_time):
+ and stat_value.st_ctime < self._cutoff_time
+ and len(entry[1]) > 1
+ and entry[1][1][0] != 'a'):
+ # Could check for size changes for further optimised
+ # avoidance of sha1's. However the most prominent case of
+ # over-shaing is during initial add, which this catches.
+ link_or_sha1 = self._sha1_file(abspath)
entry[1][0] = ('f', link_or_sha1, stat_value.st_size,
executable, packed_stat)
else:
@@ -974,6 +987,8 @@
cdef object root_dir_info
cdef object bisect_left
cdef object pathjoin
+ cdef object fstat
+ cdef object sha_file
def __init__(self, include_unchanged, use_filesystem_for_exec,
search_specific_files, state, source_index, target_index,
@@ -1021,6 +1036,8 @@
self.root_dir_info = None
self.bisect_left = bisect.bisect_left
self.pathjoin = osutils.pathjoin
+ self.fstat = os.fstat
+ self.sha_file = osutils.sha_file
cdef _process_entry(self, entry, path_info):
"""Compare an entry and real disk to generate delta information.
@@ -1117,9 +1134,24 @@
if source_minikind != c'f':
content_change = 1
else:
- # We could check the size, but we already have the
- # sha1 hash.
- content_change = (link_or_sha1 != source_details[1])
+ # If the size is the same, check the sha:
+ if target_details[2] == source_details[2]:
+ if link_or_sha1 is None:
+ # Stat cache miss:
+ file_obj = file(path_info[4], 'rb')
+ try:
+ # XXX: TODO: Use lower level file IO rather
+ # than python objects for sha-misses.
+ statvalue = self.fstat(file_obj.fileno())
+ link_or_sha1 = self.sha_file(file_obj)
+ finally:
+ file_obj.close()
+ self.state._observed_sha1(entry, link_or_sha1,
+ statvalue)
+ content_change = (link_or_sha1 != source_details[1])
+ else:
+ # Size changed, so must be different
+ content_change = 1
# Target details is updated at update_entry time
if self.use_filesystem_for_exec:
# We don't need S_ISREG here, because we are sure
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2008-09-25 01:11:30 +0000
+++ b/bzrlib/dirstate.py 2008-09-25 01:54:42 +0000
@@ -2764,12 +2764,16 @@
_pack_stat=pack_stat):
"""Update the entry based on what is actually on disk.
+ This function only calculates the sha if it needs to - if the entry is
+ uncachable, or clearly different to the first parent's entry, no sha
+ is calculated, and None is returned.
+
:param state: The dirstate this entry is in.
:param entry: This is the dirblock entry for the file in question.
:param abspath: The path on disk for this file.
:param stat_value: The stat value done on the path.
- :return: The sha1 hexdigest of the file (40 bytes) or link target of a
- symlink.
+ :return: None, or The sha1 hexdigest of the file (40 bytes) or link
+ target of a symlink.
"""
try:
minikind = _stat_to_minikind[stat_value.st_mode & 0170000]
@@ -2795,13 +2799,18 @@
# process this entry.
link_or_sha1 = None
if minikind == 'f':
- link_or_sha1 = state._sha1_file(abspath)
executable = state._is_executable(stat_value.st_mode,
saved_executable)
if state._cutoff_time is None:
state._sha_cutoff_time()
if (stat_value.st_mtime < state._cutoff_time
- and stat_value.st_ctime < state._cutoff_time):
+ and stat_value.st_ctime < state._cutoff_time
+ and len(entry[1]) > 1
+ and entry[1][1][0] != 'a'):
+ # Could check for size changes for further optimised
+ # avoidance of sha1's. However the most prominent case of
+ # over-shaing is during initial add, which this catches.
+ link_or_sha1 = state._sha1_file(abspath)
entry[1][0] = ('f', link_or_sha1, stat_value.st_size,
executable, packed_stat)
else:
@@ -2955,9 +2964,22 @@
if source_minikind != 'f':
content_change = True
else:
- # We could check the size, but we already have the
- # sha1 hash.
- content_change = (link_or_sha1 != source_details[1])
+ # If the size is the same, check the sha:
+ if target_details[2] == source_details[2]:
+ if link_or_sha1 is None:
+ # Stat cache miss:
+ file_obj = file(path_info[4], 'rb')
+ try:
+ statvalue = fstat(file_obj.fileno())
+ link_or_sha1 = sha_file(file_obj)
+ finally:
+ file_obj.close()
+ self.state._observed_sha1(entry, link_or_sha1,
+ statvalue)
+ content_change = (link_or_sha1 != source_details[1])
+ else:
+ # Size changed, so must be different
+ content_change = True
# Target details is updated at update_entry time
if self.use_filesystem_for_exec:
# We don't need S_ISREG here, because we are sure
@@ -3131,6 +3153,8 @@
"""Iterate over the changes."""
utf8_decode = cache_utf8._utf8_decode
_cmp_by_dirs = cmp_by_dirs
+ fstat = os.fstat
+ sha_file = osutils.sha_file
_process_entry = self._process_entry
uninteresting = self.uninteresting
search_specific_files = self.search_specific_files
=== modified file 'bzrlib/tests/test__dirstate_helpers.py'
--- a/bzrlib/tests/test__dirstate_helpers.py 2008-09-25 01:11:30 +0000
+++ b/bzrlib/tests/test__dirstate_helpers.py 2008-09-25 01:54:42 +0000
@@ -824,12 +824,23 @@
self.assertEqual(oldstat, entry[1][0][4])
def test_update_entry(self):
- state, entry = self.get_state_with_a()
+ state, _ = self.get_state_with_a()
+ tree = self.make_branch_and_tree('tree')
+ tree.lock_write()
+ empty_revid = tree.commit('empty')
+ self.build_tree(['tree/a'])
+ tree.add(['a'], ['a-id'])
+ with_a_id = tree.commit('with_a')
+ self.addCleanup(tree.unlock)
+ state.set_parent_trees(
+ [(empty_revid, tree.branch.repository.revision_tree(empty_revid))],
+ [])
+ entry = state._get_entry(0, path_utf8='a')
self.build_tree(['a'])
# Add one where we don't provide the stat or sha already
self.assertEqual(('', 'a', 'a-id'), entry[0])
- self.assertEqual([('f', '', 0, False, dirstate.DirState.NULLSTAT)],
- entry[1])
+ self.assertEqual(('f', '', 0, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
# Flush the buffers to disk
state.save()
self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -839,16 +850,15 @@
packed_stat = dirstate.pack_stat(stat_value)
link_or_sha1 = self.update_entry(state, entry, abspath='a',
stat_value=stat_value)
- self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
- link_or_sha1)
+ self.assertEqual(None, link_or_sha1)
- # The dirblock entry should not cache the file's sha1
- self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
- entry[1])
+ # The dirblock entry should not have cached the file's sha1 (too new)
+ self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
mode = stat_value.st_mode
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False)], state._log)
+ self.assertEqual([('is_exec', mode, False)], state._log)
state.save()
self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
@@ -858,45 +868,55 @@
# so we will re-read the file. Roll the clock back so the file is
# guaranteed to look too new.
state.adjust_time(-10)
+ del state._log[:]
link_or_sha1 = self.update_entry(state, entry, abspath='a',
stat_value=stat_value)
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ], state._log)
- self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
- link_or_sha1)
+ self.assertEqual([('is_exec', mode, False)], state._log)
+ self.assertEqual(None, link_or_sha1)
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
- self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
- entry[1])
+ self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
state.save()
- # However, if we move the clock forward so the file is considered
- # "stable", it should just cache the value.
+ # If it is cachable (the clock has moved forward) but new it still
+ # won't calculate the sha or cache it.
state.adjust_time(+20)
+ del state._log[:]
+ link_or_sha1 = dirstate.update_entry(state, entry, abspath='a',
+ stat_value=stat_value)
+ self.assertEqual(None, link_or_sha1)
+ self.assertEqual([('is_exec', mode, False)], state._log)
+ self.assertEqual(('f', '', 14, False, dirstate.DirState.NULLSTAT),
+ entry[1][0])
+
+ # If the file is no longer new, and the clock has been moved forward
+ # sufficiently, it will cache the sha.
+ del state._log[:]
+ state.set_parent_trees(
+ [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+ [])
+ entry = state._get_entry(0, path_utf8='a')
+
link_or_sha1 = self.update_entry(state, entry, abspath='a',
stat_value=stat_value)
self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
link_or_sha1)
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ], state._log)
- self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
- entry[1])
+ self.assertEqual([('is_exec', mode, False), ('sha1', 'a')],
+ state._log)
+ self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+ entry[1][0])
# Subsequent calls will just return the cached value
+ del state._log[:]
link_or_sha1 = self.update_entry(state, entry, abspath='a',
stat_value=stat_value)
self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
link_or_sha1)
- self.assertEqual([('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ('sha1', 'a'), ('is_exec', mode, False),
- ], state._log)
- self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
- entry[1])
+ self.assertEqual([], state._log)
+ self.assertEqual(('f', link_or_sha1, 14, False, packed_stat),
+ entry[1][0])
def test_update_entry_symlink(self):
"""Update entry should read symlinks."""
@@ -976,7 +996,17 @@
state._dirblock_state)
def test_update_entry_file_unchanged(self):
- state, entry = self.get_state_with_a()
+ state, _ = self.get_state_with_a()
+ tree = self.make_branch_and_tree('tree')
+ tree.lock_write()
+ self.build_tree(['tree/a'])
+ tree.add(['a'], ['a-id'])
+ with_a_id = tree.commit('witha')
+ self.addCleanup(tree.unlock)
+ state.set_parent_trees(
+ [(with_a_id, tree.branch.repository.revision_tree(with_a_id))],
+ [])
+ entry = state._get_entry(0, path_utf8='a')
self.build_tree(['a'])
sha1sum = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
state.adjust_time(+20)
@@ -991,7 +1021,7 @@
state._dirblock_state)
def create_and_test_file(self, state, entry):
- """Create a file at 'a' and verify the state finds it.
+ """Create a file at 'a' and verify the state finds it during update.
The state should already be versioning *something* at 'a'. This makes
sure that state.update_entry recognizes it as a file.
@@ -1001,9 +1031,8 @@
packed_stat = dirstate.pack_stat(stat_value)
link_or_sha1 = self.do_update_entry(state, entry, abspath='a')
- self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6',
- link_or_sha1)
- self.assertEqual([('f', link_or_sha1, 14, False, packed_stat)],
+ self.assertEqual(None, link_or_sha1)
+ self.assertEqual([('f', '', 14, False, dirstate.DirState.NULLSTAT)],
entry[1])
return packed_stat
@@ -1125,11 +1154,13 @@
self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
entry[1])
- # Make the disk object look old enough to cache
+ # Make the disk object look old enough to cache (but it won't cache the sha
+ # as it is a new file).
state.adjust_time(+20)
digest = 'b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6'
self.update_entry(state, entry, abspath='a', stat_value=stat_value)
- self.assertEqual([('f', digest, 14, True, packed_stat)], entry[1])
+ self.assertEqual([('f', '', 14, True, dirstate.DirState.NULLSTAT)],
+ entry[1])
class TestCompiledUpdateEntry(TestUpdateEntry):
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2008-09-25 01:11:30 +0000
+++ b/bzrlib/tests/test_dirstate.py 2008-09-25 01:54:42 +0000
@@ -563,20 +563,22 @@
state.lock_read()
try:
entry = state._get_entry(0, path_utf8='a-file')
- # The current sha1 sum should be empty
- self.assertEqual('', entry[1][0][1])
+ # The current size should be 0 (default)
+ self.assertEqual(0, entry[1][0][2])
# We should have a real entry.
self.assertNotEqual((None, None), entry)
# Make sure everything is old enough
state._sha_cutoff_time()
state._cutoff_time += 10
- sha1sum = dirstate.update_entry(state, entry, 'a-file', os.lstat('a-file'))
- # We should have gotten a real sha1
- self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
- sha1sum)
+ # Change the file length
+ self.build_tree_contents([('a-file', 'shorter')])
+ sha1sum = dirstate.update_entry(state, entry, 'a-file',
+ os.lstat('a-file'))
+ # new file, no cached sha:
+ self.assertEqual(None, sha1sum)
# The dirblock has been updated
- self.assertEqual(sha1sum, entry[1][0][1])
+ self.assertEqual(7, entry[1][0][2])
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
@@ -592,7 +594,7 @@
state.lock_read()
try:
entry = state._get_entry(0, path_utf8='a-file')
- self.assertEqual(sha1sum, entry[1][0][1])
+ self.assertEqual(7, entry[1][0][2])
finally:
state.unlock()
@@ -611,10 +613,10 @@
state.lock_read()
try:
entry = state._get_entry(0, path_utf8='a-file')
- sha1sum = dirstate.update_entry(state, entry, 'a-file', os.lstat('a-file'))
- # We should have gotten a real sha1
- self.assertEqual('ecc5374e9ed82ad3ea3b4d452ea995a5fd3e70e3',
- sha1sum)
+ sha1sum = dirstate.update_entry(state, entry, 'a-file',
+ os.lstat('a-file'))
+ # No sha - too new
+ self.assertEqual(None, sha1sum)
self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
state._dirblock_state)
=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py 2008-09-25 01:11:30 +0000
+++ b/bzrlib/workingtree_4.py 2008-09-25 01:54:42 +0000
@@ -407,7 +407,16 @@
link_or_sha1 = dirstate.update_entry(state, entry, file_abspath,
stat_value=stat_value)
if entry[1][0][0] == 'f':
- return link_or_sha1
+ if link_or_sha1 is None:
+ file_obj, statvalue = self.get_file_with_stat(file_id, path)
+ try:
+ sha1 = osutils.sha_file(file_obj)
+ finally:
+ file_obj.close()
+ self._observed_sha1(file_id, path, (sha1, statvalue))
+ return sha1
+ else:
+ return link_or_sha1
return None
def _get_inventory(self):
More information about the bazaar-commits
mailing list