[patch] hashcache fixes
Martin Pool
mbp at canonical.com
Tue Jul 11 11:46:43 BST 2006
OK, here is an additional patch that
- moves the functions that depend on the OS state into HashCache
- removes dead 'FixThisError' (??) from test_hashcache
- adds new FakeHashCache, which gives the test suite control of
how the files and clock appear to the hashcache
- new tests using this which check three key behaviours:
- new files are not inserted in the cache
- old files are inserted in the cache and can hit
- if a file is modified within a short time window, we don't get
an incorrect cached result
Obviously we could make the fake cache much more sophisiticated but I
think this fixes the immediate problem and gives a foundation for future
work. What do you think?
There are now no skipped or stubbed-out tests here and none that may
pass or fail depending on environment variances.
--
Martin
-------------- next part --------------
=== modified file 'bzrlib/hashcache.py'
--- bzrlib/hashcache.py 2006-07-09 06:28:38 +0000
+++ bzrlib/hashcache.py 2006-07-11 10:25:08 +0000
@@ -42,20 +42,6 @@
FP_CTIME_COLUMN = 2
FP_MODE_COLUMN = 5
-def _fingerprint(abspath):
- try:
- fs = os.lstat(abspath)
- except OSError:
- # might be missing, etc
- return None
-
- if stat.S_ISDIR(fs.st_mode):
- return None
-
- # we discard any high precision because it's not reliable; perhaps we
- # could do better on some systems?
- return (fs.st_size, long(fs.st_mtime),
- long(fs.st_ctime), fs.st_ino, fs.st_dev, fs.st_mode)
class HashCache(object):
@@ -131,7 +117,7 @@
for inum, path, cache_entry in prep:
abspath = pathjoin(self.root, path)
- fp = _fingerprint(abspath)
+ fp = self._fingerprint(abspath)
self.stat_count += 1
cache_fp = cache_entry[1]
@@ -142,13 +128,12 @@
self.needs_write = True
del self._cache[path]
-
def get_sha1(self, path):
"""Return the sha1 of a file.
"""
abspath = pathjoin(self.root, path)
self.stat_count += 1
- file_fp = _fingerprint(abspath)
+ file_fp = self._fingerprint(abspath)
if not file_fp:
# not a regular file or not existing
@@ -173,7 +158,7 @@
mode = file_fp[FP_MODE_COLUMN]
if stat.S_ISREG(mode):
- digest = sha_file(file(abspath, 'rb', buffering=65000))
+ digest = self._really_sha1_file(abspath)
elif stat.S_ISLNK(mode):
digest = sha.new(os.readlink(abspath)).hexdigest()
else:
@@ -181,7 +166,7 @@
# window of 3 seconds to allow for 2s resolution on windows,
# unsynchronized file servers, etc.
- cutoff = int(time.time()) - 3
+ cutoff = self._cutoff_time()
if file_fp[FP_MTIME_COLUMN] >= cutoff \
or file_fp[FP_CTIME_COLUMN] >= cutoff:
# changed too recently; can't be cached. we can
@@ -209,6 +194,10 @@
self.needs_write = True
self._cache[path] = (digest, file_fp)
return digest
+
+ def _really_sha1_file(self, abspath):
+ """Calculate the SHA1 of a file by reading the full text"""
+ return sha_file(file(abspath, 'rb', buffering=65000))
def write(self):
"""Write contents of cache to file."""
@@ -252,7 +241,6 @@
self.needs_write = True
return
-
hdr = inf.readline()
if hdr != CACHE_HEADER:
mutter('cache header marker not found at top of %s;'
@@ -283,7 +271,24 @@
self._cache[path] = (sha1, fp)
self.needs_write = False
+
+ def _cutoff_time(self):
+ """Return cutoff time.
+
+ Files modified more recently than this time are at risk of being
+ undetectably modified and so can't be cached.
+ """
+ return int(time.time()) - 3
-
-
-
+ def _fingerprint(self, abspath):
+ try:
+ fs = os.lstat(abspath)
+ except OSError:
+ # might be missing, etc
+ return None
+ if stat.S_ISDIR(fs.st_mode):
+ return None
+ # we discard any high precision because it's not reliable; perhaps we
+ # could do better on some systems?
+ return (fs.st_size, long(fs.st_mtime),
+ long(fs.st_ctime), fs.st_ino, fs.st_dev, fs.st_mode)
=== modified file 'bzrlib/tests/test_hashcache.py'
--- bzrlib/tests/test_hashcache.py 2006-07-09 06:28:38 +0000
+++ bzrlib/tests/test_hashcache.py 2006-07-11 10:45:00 +0000
@@ -16,12 +16,13 @@
import os
import sha
+import stat
import sys
import time
from bzrlib.errors import BzrError
from bzrlib.hashcache import HashCache
-from bzrlib.tests import TestCaseInTempDir, TestSkipped
+from bzrlib.tests import TestCaseInTempDir, TestSkipped, TestCase
def sha1(t):
@@ -32,11 +33,8 @@
time.sleep(5.0)
-class FixThisError(Exception):
- pass
-
-
class TestHashCache(TestCaseInTempDir):
+ """Test the hashcache against a real directory"""
def make_hashcache(self):
# make a dummy bzr directory just to hold the cache
@@ -58,27 +56,6 @@
self.assertEquals(hc.miss_count, 1)
self.assertEquals(hc.hit_count, 0)
- def test_hashcache_hit_old_file(self):
- """An old file gives a cache hit"""
- return ### this takes too long to run properly; skipped
- hc = self.make_hashcache()
- self.build_tree_contents([('foo', 'hello')])
- pause() # make sure file's old enough to cache
- self.assertEquals(hc.get_sha1('foo'),
- 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
- self.assertEquals(hc.miss_count, 1)
- self.assertEquals(hc.hit_count, 0)
- # now should hit on second try
- self.assertEquals(hc.get_sha1('foo'),
- 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
- self.assertEquals(hc.miss_count, 1)
- self.assertEquals(hc.hit_count, 1)
- # and hit again on third try
- self.assertEquals(hc.get_sha1('foo'),
- 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
- self.assertEquals(hc.miss_count, 1)
- self.assertEquals(hc.hit_count, 2)
-
def test_hashcache_new_file(self):
hc = self.make_hashcache()
self.build_tree_contents([('foo', 'goodbye')])
@@ -143,3 +120,85 @@
# such combinations don't usually occur for the filesystem where
# people test bzr.
self.assertRaises(BzrError, hc.get_sha1, 'a')
+
+
+class FakeHashCache(HashCache):
+ """Hashcache that consults a fake clock rather than the real one.
+
+ This lets us examine how old or new files would be handled, without
+ actually having to wait for time to pass.
+ """
+ def __init__(self):
+ # set root and cache file name to none to make sure we won't touch the
+ # real filesystem
+ HashCache.__init__(self, '.', 'hashcache')
+ self._files = {}
+ # simulated clock running forward as operations happen
+ self._clock = 0
+
+ def put_file(self, filename, file_contents):
+ abspath = './' + filename
+ self._files[abspath] = (file_contents, self._clock)
+
+ def _fingerprint(self, abspath):
+ entry = self._files[abspath]
+ return (len(entry[0]),
+ entry[1], entry[1],
+ 10, 20,
+ stat.S_IFREG | 0600)
+
+ def _really_sha1_file(self, abspath):
+ if abspath in self._files:
+ return sha1(self._files[abspath][0])
+ else:
+ return None
+
+ def _cutoff_time(self):
+ return self._clock - 2
+
+ def pretend_to_sleep(self, secs):
+ self._clock += secs
+
+
+class TestHashCacheFakeFilesystem(TestCaseInTempDir):
+ """Tests the hashcache using a simulated OS.
+ """
+
+ def make_hashcache(self):
+ return FakeHashCache()
+
+ def test_hashcache_miss_new_file(self):
+ """A new file gives the right sha1 but misses"""
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 0)
+ # if we try again it's still too new;
+ self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+ self.assertEquals(hc.miss_count, 2)
+ self.assertEquals(hc.hit_count, 0)
+
+ def test_hashcache_old_file(self):
+ """An old file gives the right sha1 and hits"""
+ hc = self.make_hashcache()
+ hc.put_file('foo', 'hello')
+ hc.pretend_to_sleep(20)
+ # file is new; should get the correct hash but miss
+ self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 0)
+ # and can now be hit
+ self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 1)
+ hc.pretend_to_sleep(3)
+ # and again
+ self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 2)
+
+ def test_hashcache_invalidates(self):
+ hc = self.make_hashcache()
+ hc.put_file('foo', 'hello')
+ hc.pretend_to_sleep(20)
+ hc.get_sha1('foo')
+ hc.put_file('foo', 'h1llo')
+ self.assertEquals(hc.get_sha1('foo'), sha1('h1llo'))
=== modified file 'bzrlib/tests/workingtree_implementations/test_workingtree.py'
--- bzrlib/tests/workingtree_implementations/test_workingtree.py 2006-07-09 06:28:38 +0000
+++ bzrlib/tests/workingtree_implementations/test_workingtree.py 2006-07-11 09:19:06 +0000
@@ -181,15 +181,6 @@
self.assertEquals(list(tree.unknowns()),
['hello.txt'])
- def test_hashcache(self):
- tree = self.make_branch_and_tree('.')
- self.build_tree(['hello.txt',
- 'hello.txt.~1~'])
- tree.add('hello.txt')
- sha = tree.get_file_sha1(tree.path2id('hello.txt'))
- tree2 = WorkingTree.open('.')
- sha2 = tree2.get_file_sha1(tree2.path2id('hello.txt'))
-
def test_initialize(self):
# initialize should create a working tree and branch in an existing dir
t = self.make_branch_and_tree('.')
More information about the bazaar
mailing list