[patch] hashcache fixes

Martin Pool mbp at canonical.com
Tue Jul 11 11:46:43 BST 2006


OK, here is an additional patch that

 - moves the functions that depend on the OS state into HashCache
 - removes dead 'FixThisError' (??) from test_hashcache
 - adds new FakeHashCache, which gives the test suite control of 
   how the files and clock appear to the hashcache
 - new tests using this which check three key behaviours:
   - new files are not inserted in the cache
   - old files are inserted in the cache and can hit
   - if a file is modified within a short time window, we don't get
     an incorrect cached result

Obviously we could make the fake cache much more sophisiticated but I
think this fixes the immediate problem and gives a foundation for future
work.  What do you think?

There are now no skipped or stubbed-out tests here and none that may
pass or fail depending on environment variances.

-- 
Martin
-------------- next part --------------
=== modified file 'bzrlib/hashcache.py'
--- bzrlib/hashcache.py	2006-07-09 06:28:38 +0000
+++ bzrlib/hashcache.py	2006-07-11 10:25:08 +0000
@@ -42,20 +42,6 @@
 FP_CTIME_COLUMN = 2
 FP_MODE_COLUMN = 5
 
-def _fingerprint(abspath):
-    try:
-        fs = os.lstat(abspath)
-    except OSError:
-        # might be missing, etc
-        return None
-
-    if stat.S_ISDIR(fs.st_mode):
-        return None
-
-    # we discard any high precision because it's not reliable; perhaps we
-    # could do better on some systems?
-    return (fs.st_size, long(fs.st_mtime),
-            long(fs.st_ctime), fs.st_ino, fs.st_dev, fs.st_mode)
 
 
 class HashCache(object):
@@ -131,7 +117,7 @@
         
         for inum, path, cache_entry in prep:
             abspath = pathjoin(self.root, path)
-            fp = _fingerprint(abspath)
+            fp = self._fingerprint(abspath)
             self.stat_count += 1
             
             cache_fp = cache_entry[1]
@@ -142,13 +128,12 @@
                 self.needs_write = True
                 del self._cache[path]
 
-
     def get_sha1(self, path):
         """Return the sha1 of a file.
         """
         abspath = pathjoin(self.root, path)
         self.stat_count += 1
-        file_fp = _fingerprint(abspath)
+        file_fp = self._fingerprint(abspath)
         
         if not file_fp:
             # not a regular file or not existing
@@ -173,7 +158,7 @@
 
         mode = file_fp[FP_MODE_COLUMN]
         if stat.S_ISREG(mode):
-            digest = sha_file(file(abspath, 'rb', buffering=65000))
+            digest = self._really_sha1_file(abspath)
         elif stat.S_ISLNK(mode):
             digest = sha.new(os.readlink(abspath)).hexdigest()
         else:
@@ -181,7 +166,7 @@
 
         # window of 3 seconds to allow for 2s resolution on windows,
         # unsynchronized file servers, etc.
-        cutoff = int(time.time()) - 3
+        cutoff = self._cutoff_time()
         if file_fp[FP_MTIME_COLUMN] >= cutoff \
                 or file_fp[FP_CTIME_COLUMN] >= cutoff:
             # changed too recently; can't be cached.  we can
@@ -209,6 +194,10 @@
             self.needs_write = True
             self._cache[path] = (digest, file_fp)
         return digest
+
+    def _really_sha1_file(self, abspath):
+        """Calculate the SHA1 of a file by reading the full text"""
+        return sha_file(file(abspath, 'rb', buffering=65000))
         
     def write(self):
         """Write contents of cache to file."""
@@ -252,7 +241,6 @@
             self.needs_write = True
             return
 
-
         hdr = inf.readline()
         if hdr != CACHE_HEADER:
             mutter('cache header marker not found at top of %s;'
@@ -283,7 +271,24 @@
             self._cache[path] = (sha1, fp)
 
         self.needs_write = False
+
+    def _cutoff_time(self):
+        """Return cutoff time.
+
+        Files modified more recently than this time are at risk of being
+        undetectably modified and so can't be cached.
+        """
+        return int(time.time()) - 3
            
-
-
-        
+    def _fingerprint(self, abspath):
+        try:
+            fs = os.lstat(abspath)
+        except OSError:
+            # might be missing, etc
+            return None
+        if stat.S_ISDIR(fs.st_mode):
+            return None
+        # we discard any high precision because it's not reliable; perhaps we
+        # could do better on some systems?
+        return (fs.st_size, long(fs.st_mtime),
+                long(fs.st_ctime), fs.st_ino, fs.st_dev, fs.st_mode)

=== modified file 'bzrlib/tests/test_hashcache.py'
--- bzrlib/tests/test_hashcache.py	2006-07-09 06:28:38 +0000
+++ bzrlib/tests/test_hashcache.py	2006-07-11 10:45:00 +0000
@@ -16,12 +16,13 @@
 
 import os
 import sha
+import stat
 import sys
 import time
 
 from bzrlib.errors import BzrError
 from bzrlib.hashcache import HashCache
-from bzrlib.tests import TestCaseInTempDir, TestSkipped
+from bzrlib.tests import TestCaseInTempDir, TestSkipped, TestCase
 
 
 def sha1(t):
@@ -32,11 +33,8 @@
     time.sleep(5.0)
 
 
-class FixThisError(Exception):
-    pass
-    
-
 class TestHashCache(TestCaseInTempDir):
+    """Test the hashcache against a real directory"""
 
     def make_hashcache(self):
         # make a dummy bzr directory just to hold the cache
@@ -58,27 +56,6 @@
         self.assertEquals(hc.miss_count, 1)
         self.assertEquals(hc.hit_count, 0)
 
-    def test_hashcache_hit_old_file(self):
-        """An old file gives a cache hit"""
-        return ### this takes too long to run properly; skipped
-        hc = self.make_hashcache()
-        self.build_tree_contents([('foo', 'hello')])
-        pause() # make sure file's old enough to cache
-        self.assertEquals(hc.get_sha1('foo'),
-                          'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
-        self.assertEquals(hc.miss_count, 1)
-        self.assertEquals(hc.hit_count, 0)
-        # now should hit on second try
-        self.assertEquals(hc.get_sha1('foo'),
-                          'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
-        self.assertEquals(hc.miss_count, 1)
-        self.assertEquals(hc.hit_count, 1)
-        # and hit again on third try
-        self.assertEquals(hc.get_sha1('foo'),
-                          'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
-        self.assertEquals(hc.miss_count, 1)
-        self.assertEquals(hc.hit_count, 2)
-
     def test_hashcache_new_file(self):
         hc = self.make_hashcache()
         self.build_tree_contents([('foo', 'goodbye')])
@@ -143,3 +120,85 @@
         # such combinations don't usually occur for the filesystem where
         # people test bzr.
         self.assertRaises(BzrError, hc.get_sha1, 'a')
+
+
+class FakeHashCache(HashCache):
+    """Hashcache that consults a fake clock rather than the real one.
+
+    This lets us examine how old or new files would be handled, without
+    actually having to wait for time to pass.
+    """
+    def __init__(self):
+        # set root and cache file name to none to make sure we won't touch the
+        # real filesystem
+        HashCache.__init__(self, '.', 'hashcache')
+        self._files = {}
+        # simulated clock running forward as operations happen
+        self._clock = 0
+
+    def put_file(self, filename, file_contents):
+        abspath = './' + filename
+        self._files[abspath] = (file_contents, self._clock)
+
+    def _fingerprint(self, abspath):
+        entry = self._files[abspath]
+        return (len(entry[0]),
+                entry[1], entry[1],
+                10, 20,
+                stat.S_IFREG | 0600)
+
+    def _really_sha1_file(self, abspath):
+        if abspath in self._files:
+            return sha1(self._files[abspath][0])
+        else:
+            return None
+
+    def _cutoff_time(self):
+        return self._clock - 2
+
+    def pretend_to_sleep(self, secs):
+        self._clock += secs
+
+    
+class TestHashCacheFakeFilesystem(TestCaseInTempDir):
+    """Tests the hashcache using a simulated OS.
+    """
+
+    def make_hashcache(self):
+        return FakeHashCache()
+
+    def test_hashcache_miss_new_file(self):
+        """A new file gives the right sha1 but misses"""
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 0)
+        # if we try again it's still too new; 
+        self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+        self.assertEquals(hc.miss_count, 2)
+        self.assertEquals(hc.hit_count, 0)
+
+    def test_hashcache_old_file(self):
+        """An old file gives the right sha1 and hits"""
+        hc = self.make_hashcache()
+        hc.put_file('foo', 'hello')
+        hc.pretend_to_sleep(20)
+        # file is new; should get the correct hash but miss
+        self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 0)
+        # and can now be hit
+        self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 1)
+        hc.pretend_to_sleep(3)
+        # and again
+        self.assertEquals(hc.get_sha1('foo'), sha1('hello'))
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 2)
+
+    def test_hashcache_invalidates(self):
+        hc = self.make_hashcache()
+        hc.put_file('foo', 'hello')
+        hc.pretend_to_sleep(20)
+        hc.get_sha1('foo')
+        hc.put_file('foo', 'h1llo')
+        self.assertEquals(hc.get_sha1('foo'), sha1('h1llo'))

=== modified file 'bzrlib/tests/workingtree_implementations/test_workingtree.py'
--- bzrlib/tests/workingtree_implementations/test_workingtree.py	2006-07-09 06:28:38 +0000
+++ bzrlib/tests/workingtree_implementations/test_workingtree.py	2006-07-11 09:19:06 +0000
@@ -181,15 +181,6 @@
         self.assertEquals(list(tree.unknowns()),
                           ['hello.txt'])
 
-    def test_hashcache(self):
-        tree = self.make_branch_and_tree('.')
-        self.build_tree(['hello.txt',
-                         'hello.txt.~1~'])
-        tree.add('hello.txt')
-        sha = tree.get_file_sha1(tree.path2id('hello.txt'))
-        tree2 = WorkingTree.open('.')
-        sha2 = tree2.get_file_sha1(tree2.path2id('hello.txt'))
-
     def test_initialize(self):
         # initialize should create a working tree and branch in an existing dir
         t = self.make_branch_and_tree('.')



More information about the bazaar mailing list