Rev 2996: (robertc) Add a LRU Cache facility. (John Meinel) in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Wed Nov 14 23:33:03 GMT 2007

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 2996
revision-id: pqm at pqm.ubuntu.com-20071114233259-pdmdsqafctzx6bjs
parent: pqm at pqm.ubuntu.com-20071114191349-wkdpsuhhbogisob8
parent: robertc at robertcollins.net-20071114210754-fbts4wyrsxk34t5b
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Wed 2007-11-14 23:32:59 +0000
message:
  (robertc) Add a LRU Cache facility. (John Meinel)
added:
  bzrlib/lru_cache.py            lru_cache.py-20070119165515-tlw203kuwh0id5gv-1
  bzrlib/tests/test_lru_cache.py test_lru_cache.py-20070119165535-hph6rk4h9rzy4180-1
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
    ------------------------------------------------------------
    revno: 2993.1.1
    merged: robertc at robertcollins.net-20071114210754-fbts4wyrsxk34t5b
    parent: pqm at pqm.ubuntu.com-20071114173258-zikev37621jxgj15
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: lrucache
    timestamp: Thu 2007-11-15 08:07:54 +1100
    message:
      * New module ``lru_cache`` providing a cache for use by tasks that need
        semi-random access to large amounts of data. (John A Meinel)
=== added file 'bzrlib/lru_cache.py'

--- a/bzrlib/lru_cache.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/lru_cache.py	2007-11-14 21:07:54 +0000
@@ -0,0 +1,214 @@
+# Copyright (C) 2006 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""A simple least-recently-used (LRU) cache."""
+
+from collections import deque
+import gc
+
+
+class LRUCache(object):
+    """A cache bounded by entry count that evicts least-recently-used keys."""
+
+    def __init__(self, max_cache=100, after_cleanup_size=None):
+        self._max_cache = max_cache
+        if after_cleanup_size is None:
+            self._after_cleanup_size = self._max_cache
+        else:
+            self._after_cleanup_size = min(after_cleanup_size, self._max_cache)
+
+        self._compact_queue_length = 4*self._max_cache
+
+        self._cache = {} # key => cached value
+        self._cleanup = {} # key => cleanup callable, or None
+        self._queue = deque() # Track when things are accessed
+        self._refcount = {} # number of entries in self._queue for each key
+
+    def __contains__(self, key):
+        return key in self._cache
+
+    def __getitem__(self, key):
+        val = self._cache[key]
+        self._record_access(key)
+        return val
+
+    def __len__(self):
+        return len(self._cache)
+
+    def add(self, key, value, cleanup=None):
+        """Add a new value to the cache.
+
+        If the entry is later removed from the cache, cleanup is called with
+        the key and value being removed.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value) to indicate
+                        'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key) # runs cleanup for the value being replaced
+        self._cache[key] = value
+        self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if len(self._cache) > self._max_cache:
+            # Trigger the cleanup
+            self.cleanup()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure it holds no
+        more than after_cleanup_size entries.
+        """
+        # Make sure the cache is shrunk to the correct size
+        while len(self._cache) > self._after_cleanup_size:
+            self._remove_lru()
+
+    def __setitem__(self, key, value):
+        """Add a value to the cache, there will be no cleanup function."""
+        self.add(key, value, cleanup=None)
+
+    def _record_access(self, key):
+        """Record that key was accessed."""
+        self._queue.append(key)
+        # Can't use setdefault because you can't += 1 the result
+        self._refcount[key] = self._refcount.get(key, 0) + 1
+
+        # If our access queue is too large, clean it up too
+        if len(self._queue) > self._compact_queue_length:
+            self._compact_queue()
+
+    def _compact_queue(self):
+        """Compact the queue, keeping only the last access of each key."""
+        new_queue = deque()
+        for item in self._queue:
+            if self._refcount[item] == 1:
+                new_queue.append(item)
+            else:
+                self._refcount[item] -= 1
+        self._queue = new_queue
+        # All entries should be of the same size. There should be one entry in
+        # queue for each entry in cache, and all refcounts should == 1
+        assert (len(self._queue) == len(self._cache) ==
+                len(self._refcount) == sum(self._refcount.itervalues()))
+
+    def _remove(self, key):
+        """Remove key, call its cleanup (if any) and return the old value."""
+        cleanup = self._cleanup.pop(key)
+        val = self._cache.pop(key)
+        if cleanup is not None:
+            cleanup(key, val)
+        return val
+
+    def _remove_lru(self):
+        """Remove one entry from the lru, and handle consequences.
+
+        If there are no more references to the lru, then this entry should be
+        removed from the cache.
+        """
+        key = self._queue.popleft()
+        self._refcount[key] -= 1
+        if not self._refcount[key]:
+            del self._refcount[key]
+            self._remove(key)
+
+    def clear(self):
+        """Clear out all of the cache."""
+        # Clean up in LRU order
+        while self._cache:
+            self._remove_lru()
+
+
+class LRUSizeCache(LRUCache):
+    """An LRUCache that removes things based on the size of the values.
+
+    This differs in that it doesn't care how many actual items there are,
+    it just restricts the cache to be cleaned up after so much data is stored.
+
+    Unless compute_size is supplied, values must support len(value).
+    """
+
+    def __init__(self, max_size=1024*1024, after_cleanup_size=None,
+                 compute_size=None):
+        """Create a new LRUSizeCache.
+
+        :param max_size: The max number of bytes to store before we start
+            clearing out entries.
+        :param after_cleanup_size: After cleaning up, shrink everything to this
+            size.
+        :param compute_size: A function to compute the size of the values. We
+            use a function here, so that you can pass 'len' if you are just
+            using simple strings, or a more complex function if you are using
+            something like a list of strings, or even a custom object.
+            The function should take the form "compute_size(value) => integer".
+            If not supplied, it defaults to 'len()'
+        """
+        # This assumes texts are > 0.5k in size. It only really affects when we
+        # clean up the queue, so we don't want it to be too large.
+        LRUCache.__init__(self, max_cache=int(max_size/512))
+        self._max_size = max_size
+        if after_cleanup_size is None:
+            self._after_cleanup_size = self._max_size
+        else:
+            self._after_cleanup_size = min(after_cleanup_size, self._max_size)
+
+        self._value_size = 0
+        self._compute_size = compute_size
+        if compute_size is None:
+            self._compute_size = len
+
+    def add(self, key, value, cleanup=None):
+        """Add a value, unless its size is >= after_cleanup_size.
+
+        Any value already stored under key is removed (and cleaned up) first.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value), called when
+                        the entry is removed from the cache.
+        """
+        if key in self._cache:
+            self._remove(key)
+        value_len = self._compute_size(value)
+        if value_len >= self._after_cleanup_size:
+            if key in self._refcount:
+                # Forget the old accesses, or _remove_lru would raise KeyError
+                del self._refcount[key]
+                self._queue = deque(k for k in self._queue if k != key)
+            return
+        self._value_size += value_len
+        self._cache[key] = value
+        self._cleanup[key] = cleanup
+        self._record_access(key)
+        if self._value_size > self._max_size:
+            self.cleanup()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure the total
+        value size is no more than after_cleanup_size.
+        """
+        while self._value_size > self._after_cleanup_size:
+            self._remove_lru()
+
+    def _remove(self, key):
+        """Remove key, update the stored size, and return the old value."""
+        val = LRUCache._remove(self, key)
+        self._value_size -= self._compute_size(val)
+        return val

=== added file 'bzrlib/tests/test_lru_cache.py'
--- a/bzrlib/tests/test_lru_cache.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_lru_cache.py	2007-11-14 21:07:54 +0000
@@ -0,0 +1,304 @@
+# Copyright (C) 2006 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""Tests for the lru_cache module."""
+
+from bzrlib import (
+    lru_cache,
+    tests,
+    )
+
+
+class TestLRUCache(tests.TestCase):
+    """Test that LRU cache properly keeps track of entries."""
+
+    def test_missing(self):
+        cache = lru_cache.LRUCache(max_cache=10)
+
+        self.failIf('foo' in cache)
+        self.assertRaises(KeyError, cache.__getitem__, 'foo')
+
+        cache['foo'] = 'bar'
+        self.assertEqual('bar', cache['foo'])
+        self.failUnless('foo' in cache)
+        self.failIf('bar' in cache)
+
+    def test_overflow(self):
+        """Adding extra entries will pop out old ones."""
+        cache = lru_cache.LRUCache(max_cache=1)
+
+        cache['foo'] = 'bar'
+        # With a max cache of 1, adding 'baz' should pop out 'foo'
+        cache['baz'] = 'biz'
+
+        self.failIf('foo' in cache)
+        self.failUnless('baz' in cache)
+
+        self.assertEqual('biz', cache['baz'])
+
+    def test_by_usage(self):
+        """Accessing entries bumps them up in priority."""
+        cache = lru_cache.LRUCache(max_cache=2)
+
+        cache['baz'] = 'biz'
+        cache['foo'] = 'bar'
+
+        self.assertEqual('biz', cache['baz'])
+
+        # This must kick out 'foo' because it was the least recently accessed
+        cache['nub'] = 'in'
+
+        self.failIf('foo' in cache)
+
+    def test_queue_stays_bounded(self):
+        """Lots of accesses does not cause the queue to grow without bound."""
+        cache = lru_cache.LRUCache(max_cache=10)
+
+        cache['baz'] = 'biz'
+        cache['foo'] = 'bar'
+
+        for i in xrange(1000):
+            cache['baz']
+
+        self.failUnless(len(cache._queue) < 40)
+
+    def test_cleanup_function(self):
+        """Test that we can use a cleanup function."""
+        cleanup_called = []
+        def cleanup_func(key, val):
+            cleanup_called.append((key, val))
+
+        cache = lru_cache.LRUCache(max_cache=2)
+
+        cache.add('baz', '1', cleanup=cleanup_func)
+        cache.add('foo', '2', cleanup=cleanup_func)
+        cache.add('biz', '3', cleanup=cleanup_func)
+
+        self.assertEqual([('baz', '1')], cleanup_called)
+
+        # 'foo' is now most recent, so final cleanup will call it last
+        cache['foo']
+        cache.clear()
+        self.assertEqual([('baz', '1'), ('biz', '3'), ('foo', '2')], cleanup_called)
+
+    def test_cleanup_on_replace(self):
+        """Replacing an object should cleanup the old value."""
+        cleanup_called = []
+        def cleanup_func(key, val):
+            cleanup_called.append((key, val))
+
+        cache = lru_cache.LRUCache(max_cache=2)
+        cache.add(1, 10, cleanup=cleanup_func)
+        cache.add(2, 20, cleanup=cleanup_func)
+        cache.add(2, 25, cleanup=cleanup_func)
+
+        self.assertEqual([(2, 20)], cleanup_called)
+        self.assertEqual(25, cache[2])
+
+        # Even __setitem__ should make sure cleanup() is called
+        cache[2] = 26
+        self.assertEqual([(2, 20), (2, 25)], cleanup_called)
+
+    def test_len(self):
+        cache = lru_cache.LRUCache(max_cache=10)
+
+        cache[1] = 10
+        cache[2] = 20
+        cache[3] = 30
+        cache[4] = 40
+
+        self.assertEqual(4, len(cache))
+
+        cache[5] = 50
+        cache[6] = 60
+        cache[7] = 70
+        cache[8] = 80
+
+        self.assertEqual(8, len(cache))
+
+        cache[1] = 15 # replacement
+
+        self.assertEqual(8, len(cache))
+
+        cache[9] = 90
+        cache[10] = 100
+        cache[11] = 110
+
+        # We hit the max
+        self.assertEqual(10, len(cache))
+
+    def test_cleanup_shrinks_to_after_clean_size(self):
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=3)
+
+        cache.add(1, 10)
+        cache.add(2, 20)
+        cache.add(3, 25)
+        cache.add(4, 30)
+        cache.add(5, 35)
+
+        self.assertEqual(5, len(cache))
+        # This will bump us over the max, which causes us to shrink down to
+        # after_cleanup_size
+        cache.add(6, 40)
+        self.assertEqual(3, len(cache))
+
+    def test_after_cleanup_larger_than_max(self):
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=10)
+        self.assertEqual(5, cache._after_cleanup_size)
+
+    def test_after_cleanup_none(self):
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=None)
+        self.assertEqual(5, cache._after_cleanup_size)
+
+    def test_cleanup(self):
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=2)
+
+        # Add these in order
+        cache.add(1, 10)
+        cache.add(2, 20)
+        cache.add(3, 25)
+        cache.add(4, 30)
+        cache.add(5, 35)
+
+        self.assertEqual(5, len(cache))
+        # Force a cleanup down to after_cleanup_size
+        cache.cleanup()
+        self.assertEqual(2, len(cache))
+
+    def test_compact_preserves_last_access_order(self):
+        cache = lru_cache.LRUCache(max_cache=5)
+
+        # Add these in order
+        cache.add(1, 10)
+        cache.add(2, 20)
+        cache.add(3, 25)
+        cache.add(4, 30)
+        cache.add(5, 35)
+
+        self.assertEqual([1, 2, 3, 4, 5], list(cache._queue))
+
+        # Now access some randomly
+        cache[2]
+        cache[5]
+        cache[3]
+        cache[2]
+        self.assertEqual([1, 2, 3, 4, 5, 2, 5, 3, 2], list(cache._queue))
+        self.assertEqual({1:1, 2:3, 3:2, 4:1, 5:2}, cache._refcount)
+
+        # Compacting should save the last position
+        cache._compact_queue()
+        self.assertEqual([1, 4, 5, 3, 2], list(cache._queue))
+        self.assertEqual({1:1, 2:1, 3:1, 4:1, 5:1}, cache._refcount)
+
+
+class TestLRUSizeCache(tests.TestCase):
+
+    def test_basic_init(self):
+        cache = lru_cache.LRUSizeCache()
+        self.assertEqual(2048, cache._max_cache)
+        self.assertEqual(4*2048, cache._compact_queue_length)
+        self.assertEqual(cache._max_size, cache._after_cleanup_size)
+        self.assertEqual(0, cache._value_size)
+
+    def test_add_tracks_size(self):
+        cache = lru_cache.LRUSizeCache()
+        self.assertEqual(0, cache._value_size)
+        cache.add('my key', 'my value text')
+        self.assertEqual(13, cache._value_size)
+
+    def test_remove_tracks_size(self):
+        cache = lru_cache.LRUSizeCache()
+        self.assertEqual(0, cache._value_size)
+        cache.add('my key', 'my value text')
+        self.assertEqual(13, cache._value_size)
+        cache._remove('my key')
+        self.assertEqual(0, cache._value_size)
+
+    def test_no_add_over_size(self):
+        """A value of after_cleanup_size or more is not cached at all."""
+        cache = lru_cache.LRUSizeCache(max_size=10, after_cleanup_size=5)
+        self.assertEqual(0, cache._value_size)
+        self.assertEqual({}, cache._cache)
+        cache.add('test', 'key')
+        self.assertEqual(3, cache._value_size)
+        self.assertEqual({'test':'key'}, cache._cache)
+        cache.add('test2', 'key that is too big')
+        self.assertEqual(3, cache._value_size)
+        self.assertEqual({'test':'key'}, cache._cache)
+        # If we would add a key, only to cleanup and remove all cached entries,
+        # then obviously that value should not be stored
+        cache.add('test3', 'bigkey')
+        self.assertEqual(3, cache._value_size)
+        self.assertEqual({'test':'key'}, cache._cache)
+
+        cache.add('test4', 'bikey')
+        self.assertEqual(3, cache._value_size)
+        self.assertEqual({'test':'key'}, cache._cache)
+
+    def test_adding_clears_cache_based_on_size(self):
+        """The cache is cleared in LRU order until small enough"""
+        cache = lru_cache.LRUSizeCache(max_size=20)
+        cache.add('key1', 'value') # 5 chars
+        cache.add('key2', 'value2') # 6 chars
+        cache.add('key3', 'value23') # 7 chars
+        self.assertEqual(5+6+7, cache._value_size)
+        cache['key2'] # reference key2 so it gets a newer reference time
+        cache.add('key4', 'value234') # 8 chars, over limit
+        # We have to remove 2 keys to get back under limit
+        self.assertEqual(6+8, cache._value_size)
+        self.assertEqual({'key2':'value2', 'key4':'value234'},
+                         cache._cache)
+
+    def test_adding_clears_to_after_cleanup_size(self):
+        cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
+        cache.add('key1', 'value') # 5 chars
+        cache.add('key2', 'value2') # 6 chars
+        cache.add('key3', 'value23') # 7 chars
+        self.assertEqual(5+6+7, cache._value_size)
+        cache['key2'] # reference key2 so it gets a newer reference time
+        cache.add('key4', 'value234') # 8 chars, over limit
+        # We have to remove 3 keys to get back under after_cleanup_size
+        self.assertEqual(8, cache._value_size)
+        self.assertEqual({'key4':'value234'}, cache._cache)
+
+    def test_custom_sizes(self):
+        def size_of_list(lst):
+            return sum(len(x) for x in lst)
+        cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10,
+                                       compute_size=size_of_list)
+
+        cache.add('key1', ['val', 'ue']) # 5 chars
+        cache.add('key2', ['val', 'ue2']) # 6 chars
+        cache.add('key3', ['val', 'ue23']) # 7 chars
+        self.assertEqual(5+6+7, cache._value_size)
+        cache['key2'] # reference key2 so it gets a newer reference time
+        cache.add('key4', ['value', '234']) # 8 chars, over limit
+        # We have to remove 3 keys to get back under after_cleanup_size
+        self.assertEqual(8, cache._value_size)
+        self.assertEqual({'key4':['value', '234']}, cache._cache)
+
+    def test_cleanup(self):
+        cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
+
+        # Add these in order
+        cache.add('key1', 'value') # 5 chars
+        cache.add('key2', 'value2') # 6 chars
+        cache.add('key3', 'value23') # 7 chars
+        self.assertEqual(5+6+7, cache._value_size)
+
+        cache.cleanup()
+        # Only the most recent fits after cleaning up
+        self.assertEqual(7, cache._value_size)

=== modified file 'NEWS'
--- a/NEWS	2007-11-14 17:03:53 +0000
+++ b/NEWS	2007-11-14 23:32:59 +0000
@@ -111,6 +111,9 @@
      and parsing containers from streams rather than from files.  (Andrew
      Bennetts)
 
+   * New module ``lru_cache`` providing a cache for use by tasks that need
+     semi-random access to large amounts of data. (John A Meinel)
+
   TESTING:
 
 

=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py	2007-11-03 02:37:39 +0000
+++ b/bzrlib/tests/__init__.py	2007-11-14 21:07:54 +0000
@@ -2429,6 +2429,7 @@
                    'bzrlib.tests.test_lockable_files',
                    'bzrlib.tests.test_log',
                    'bzrlib.tests.test_lsprof',
+                   'bzrlib.tests.test_lru_cache',
                    'bzrlib.tests.test_mail_client',
                    'bzrlib.tests.test_memorytree',
                    'bzrlib.tests.test_merge',