Rev 2996: (robertc) Add a LRU Cache facility. (John Meinel) in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Wed Nov 14 23:33:03 GMT 2007
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 2996
revision-id: pqm at pqm.ubuntu.com-20071114233259-pdmdsqafctzx6bjs
parent: pqm at pqm.ubuntu.com-20071114191349-wkdpsuhhbogisob8
parent: robertc at robertcollins.net-20071114210754-fbts4wyrsxk34t5b
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Wed 2007-11-14 23:32:59 +0000
message:
(robertc) Add a LRU Cache facility. (John Meinel)
added:
bzrlib/lru_cache.py lru_cache.py-20070119165515-tlw203kuwh0id5gv-1
bzrlib/tests/test_lru_cache.py test_lru_cache.py-20070119165535-hph6rk4h9rzy4180-1
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/tests/__init__.py selftest.py-20050531073622-8d0e3c8845c97a64
------------------------------------------------------------
revno: 2993.1.1
merged: robertc at robertcollins.net-20071114210754-fbts4wyrsxk34t5b
parent: pqm at pqm.ubuntu.com-20071114173258-zikev37621jxgj15
committer: Robert Collins <robertc at robertcollins.net>
branch nick: lrucache
timestamp: Thu 2007-11-15 08:07:54 +1100
message:
* New module ``lru_cache`` providing a cache for use by tasks that need
semi-random access to large amounts of data. (John A Meinel)
=== added file 'bzrlib/lru_cache.py'
--- a/bzrlib/lru_cache.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/lru_cache.py 2007-11-14 21:07:54 +0000
@@ -0,0 +1,214 @@
+# Copyright (C) 2006 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""A simple least-recently-used (LRU) cache."""
+
+from collections import deque
+import gc
+
+
+class LRUCache(object):
+    """A bounded mapping that evicts its least-recently-used entries.
+
+    Every read or write of a key appends a record to an access queue. Once
+    the number of cached entries exceeds ``max_cache``, entries are evicted
+    oldest-first until only ``after_cleanup_size`` of them remain.
+    """
+
+    def __init__(self, max_cache=100, after_cleanup_size=None):
+        """Create an empty cache.
+
+        :param max_cache: Number of entries that may be held before a cleanup
+            is triggered.
+        :param after_cleanup_size: Number of entries left after a cleanup.
+            Defaults to max_cache and is capped at max_cache.
+        """
+        if after_cleanup_size is None:
+            after_cleanup_size = max_cache
+        self._max_cache = max_cache
+        self._after_cleanup_size = min(after_cleanup_size, max_cache)
+        # The access queue can hold several records per key; it is compacted
+        # once it grows beyond this length.
+        self._compact_queue_length = 4 * max_cache
+
+        self._cache = {}       # key => cached value
+        self._cleanup = {}     # key => cleanup callable, or None
+        self._queue = deque()  # access records, oldest on the left
+        self._refcount = {}    # key => number of records in self._queue
+
+    def __contains__(self, key):
+        """Return True if key is cached; this is not recorded as an access."""
+        return key in self._cache
+
+    def __len__(self):
+        """Return the number of entries currently cached."""
+        return len(self._cache)
+
+    def __getitem__(self, key):
+        """Return the value stored under key and record the access.
+
+        :raises KeyError: if key is not cached (no access is recorded).
+        """
+        value = self._cache[key]
+        self._record_access(key)
+        return value
+
+    def __setitem__(self, key, value):
+        """Store value under key, with no cleanup function."""
+        self.add(key, value, cleanup=None)
+
+    def add(self, key, value, cleanup=None):
+        """Store value under key, replacing any existing entry.
+
+        A replaced entry goes through the normal removal path, so its own
+        cleanup function is called.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None, or a function taking (key, value) that is
+            called when the entry is removed from the cache, to indicate
+            that 'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key)
+        self._cache[key] = value
+        self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if len(self._cache) <= self._max_cache:
+            return
+        # Over the limit: shrink back down to after_cleanup_size
+        self.cleanup()
+
+    def cleanup(self):
+        """Evict entries, oldest first, down to after_cleanup_size.
+
+        This never empties the cache completely unless after_cleanup_size
+        is zero; use clear() for that.
+        """
+        target = self._after_cleanup_size
+        while len(self._cache) > target:
+            self._remove_lru()
+
+    def clear(self):
+        """Remove every entry, in least-recently-used order."""
+        while self._cache:
+            self._remove_lru()
+
+    def _record_access(self, key):
+        """Append an access record for key, compacting the queue if needed."""
+        self._queue.append(key)
+        refcount = self._refcount
+        try:
+            refcount[key] += 1
+        except KeyError:
+            refcount[key] = 1
+
+        # Repeated accesses to the same keys must not grow the queue forever
+        if len(self._queue) > self._compact_queue_length:
+            self._compact_queue()
+
+    def _compact_queue(self):
+        """Keep only the newest access record of each key, in queue order."""
+        refcount = self._refcount
+        compacted = deque()
+        for key in self._queue:
+            if refcount[key] != 1:
+                # A later record for this key is still queued; drop this one
+                refcount[key] -= 1
+                continue
+            compacted.append(key)
+        self._queue = compacted
+        # There should now be exactly one record per cached entry, and every
+        # refcount should be 1.
+        assert (len(self._queue) == len(self._cache) ==
+                len(self._refcount) == sum(self._refcount.itervalues()))
+
+    def _remove(self, key):
+        """Drop key from the cache and run its cleanup function, if any.
+
+        The access queue and refcounts are left untouched.
+
+        :return: The value that was stored under key.
+        """
+        cleanup_func = self._cleanup.pop(key)
+        value = self._cache.pop(key)
+        if cleanup_func is not None:
+            cleanup_func(key, value)
+        return value
+
+    def _remove_lru(self):
+        """Consume the oldest access record.
+
+        The entry itself is only removed from the cache when this was the
+        last record that referred to it.
+        """
+        key = self._queue.popleft()
+        remaining = self._refcount[key] - 1
+        if remaining:
+            self._refcount[key] = remaining
+            return
+        del self._refcount[key]
+        self._remove(key)
+
+
+class LRUSizeCache(LRUCache):
+    """An LRUCache that removes things based on the size of the values.
+
+    This differs in that it doesn't care how many actual items there are,
+    it just restricts the cache to be cleaned up after so much data is stored.
+
+    Unless a compute_size function is supplied, the values that are added
+    must support len(value).
+    """
+
+    def __init__(self, max_size=1024*1024, after_cleanup_size=None,
+                 compute_size=None):
+        """Create a new LRUSizeCache.
+
+        :param max_size: The max number of bytes to store before we start
+            clearing out entries.
+        :param after_cleanup_size: After cleaning up, shrink everything to this
+            size. Defaults to max_size and is capped at max_size.
+        :param compute_size: A function to compute the size of the values. We
+            use a function here, so that you can pass 'len' if you are just
+            using simple strings, or a more complex function if you are using
+            something like a list of strings, or even a custom object.
+            The function should take the form "compute_size(value) => integer".
+            If not supplied, it defaults to 'len()'
+        """
+        # This approximates that texts are > 0.5k in size. It only really
+        # affects when we clean up the queue, so we don't want it to be too
+        # large.
+        LRUCache.__init__(self, max_cache=int(max_size/512))
+        self._max_size = max_size
+        # The base class stored an entry count here; replace it with a size.
+        if after_cleanup_size is None:
+            self._after_cleanup_size = self._max_size
+        else:
+            self._after_cleanup_size = min(after_cleanup_size, self._max_size)
+
+        self._value_size = 0  # total computed size of all cached values
+        self._compute_size = compute_size
+        if compute_size is None:
+            self._compute_size = len
+
+    def add(self, key, value, cleanup=None):
+        """Add a new value to the cache.
+
+        Also, if the entry is ever removed from the cache, call cleanup,
+        passing it the key and value being removed.
+
+        A value whose computed size is at least after_cleanup_size is not
+        stored at all. Any entry previously stored under the same key is
+        still removed (and cleaned up) in that case.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value) to indicate
+            'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key)
+        value_len = self._compute_size(value)
+        if value_len >= self._after_cleanup_size:
+            # The value is too large to keep. If it was replacing an existing
+            # entry, that entry is gone from the cache but its access records
+            # are still queued; purge them, otherwise evicting them later
+            # would call _remove() for a key that is no longer cached.
+            if key in self._refcount:
+                del self._refcount[key]
+                self._queue = deque(k for k in self._queue if k != key)
+            return
+        self._value_size += value_len
+        self._cache[key] = value
+        self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if self._value_size > self._max_size:
+            # Time to cleanup
+            self.cleanup()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure the total
+        size of the stored values is no more than after_cleanup_size.
+        """
+        # Make sure the cache is shrunk to the correct size
+        while self._value_size > self._after_cleanup_size:
+            self._remove_lru()
+
+    def _remove(self, key):
+        """Remove an entry, making sure to maintain the invariants.
+
+        :return: The value that was stored under key, as the base class does.
+        """
+        val = LRUCache._remove(self, key)
+        self._value_size -= self._compute_size(val)
+        return val
=== added file 'bzrlib/tests/test_lru_cache.py'
--- a/bzrlib/tests/test_lru_cache.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_lru_cache.py 2007-11-14 21:07:54 +0000
@@ -0,0 +1,304 @@
+# Copyright (C) 2006 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Tests for the lru_cache module."""
+
+from bzrlib import (
+ lru_cache,
+ tests,
+ )
+
+
+class TestLRUCache(tests.TestCase):
+    """Test that LRU cache properly keeps track of entries."""
+
+    def test_missing(self):
+        """Unknown keys are reported as absent and raise KeyError."""
+        cache = lru_cache.LRUCache(max_cache=10)
+
+        self.failIf('foo' in cache)
+        self.assertRaises(KeyError, cache.__getitem__, 'foo')
+
+        cache['foo'] = 'bar'
+        self.assertEqual('bar', cache['foo'])
+        self.failUnless('foo' in cache)
+        self.failIf('bar' in cache)
+
+    def test_overflow(self):
+        """Adding extra entries will pop out old ones."""
+        cache = lru_cache.LRUCache(max_cache=1)
+
+        cache['foo'] = 'bar'
+        # With a max cache of 1, adding 'baz' should pop out 'foo'
+        cache['baz'] = 'biz'
+
+        self.failIf('foo' in cache)
+        self.failUnless('baz' in cache)
+
+        self.assertEqual('biz', cache['baz'])
+
+    def test_by_usage(self):
+        """Accessing entries bumps them up in priority."""
+        cache = lru_cache.LRUCache(max_cache=2)
+
+        cache['baz'] = 'biz'
+        cache['foo'] = 'bar'
+
+        self.assertEqual('biz', cache['baz'])
+
+        # This must kick out 'foo' because it is now the least recently
+        # accessed entry
+        cache['nub'] = 'in'
+
+        self.failIf('foo' in cache)
+
+    def test_queue_stays_bounded(self):
+        """Lots of accesses does not cause the queue to grow without bound."""
+        cache = lru_cache.LRUCache(max_cache=10)
+
+        cache['baz'] = 'biz'
+        cache['foo'] = 'bar'
+
+        for i in xrange(1000):
+            cache['baz']
+
+        self.failUnless(len(cache._queue) < 40)
+
+    # This test used to be called test_cleanup, the same name as the test of
+    # the cleanup() method further down, which silently replaced it so that it
+    # never ran.
+    def test_cleanup_function(self):
+        """Test that we can use a cleanup function."""
+        cleanup_called = []
+        def cleanup_func(key, val):
+            cleanup_called.append((key, val))
+
+        cache = lru_cache.LRUCache(max_cache=2)
+
+        cache.add('baz', '1', cleanup=cleanup_func)
+        cache.add('foo', '2', cleanup=cleanup_func)
+        cache.add('biz', '3', cleanup=cleanup_func)
+
+        self.assertEqual([('baz', '1')], cleanup_called)
+
+        # 'foo' is now most recent, so final cleanup will call it last
+        cache['foo']
+        cache.clear()
+        self.assertEqual([('baz', '1'), ('biz', '3'), ('foo', '2')],
+                         cleanup_called)
+
+    def test_cleanup_on_replace(self):
+        """Replacing an object should cleanup the old value."""
+        cleanup_called = []
+        def cleanup_func(key, val):
+            cleanup_called.append((key, val))
+
+        cache = lru_cache.LRUCache(max_cache=2)
+        cache.add(1, 10, cleanup=cleanup_func)
+        cache.add(2, 20, cleanup=cleanup_func)
+        cache.add(2, 25, cleanup=cleanup_func)
+
+        self.assertEqual([(2, 20)], cleanup_called)
+        self.assertEqual(25, cache[2])
+
+        # Even __setitem__ should make sure the old value's cleanup function
+        # is called
+        cache[2] = 26
+        self.assertEqual([(2, 20), (2, 25)], cleanup_called)
+
+    def test_len(self):
+        """len() counts entries; replacements and evictions are reflected."""
+        cache = lru_cache.LRUCache(max_cache=10)
+
+        cache[1] = 10
+        cache[2] = 20
+        cache[3] = 30
+        cache[4] = 40
+
+        self.assertEqual(4, len(cache))
+
+        cache[5] = 50
+        cache[6] = 60
+        cache[7] = 70
+        cache[8] = 80
+
+        self.assertEqual(8, len(cache))
+
+        cache[1] = 15 # replacement
+
+        self.assertEqual(8, len(cache))
+
+        cache[9] = 90
+        cache[10] = 100
+        cache[11] = 110
+
+        # We hit the max
+        self.assertEqual(10, len(cache))
+
+    def test_cleanup_shrinks_to_after_clean_size(self):
+        """Going over max_cache shrinks the cache to after_cleanup_size."""
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=3)
+
+        cache.add(1, 10)
+        cache.add(2, 20)
+        cache.add(3, 25)
+        cache.add(4, 30)
+        cache.add(5, 35)
+
+        self.assertEqual(5, len(cache))
+        # This will bump us over the max, which causes us to shrink down to
+        # after_cleanup_size
+        cache.add(6, 40)
+        self.assertEqual(3, len(cache))
+
+    def test_after_cleanup_larger_than_max(self):
+        """after_cleanup_size is capped at max_cache."""
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=10)
+        self.assertEqual(5, cache._after_cleanup_size)
+
+    def test_after_cleanup_none(self):
+        """after_cleanup_size defaults to max_cache."""
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=None)
+        self.assertEqual(5, cache._after_cleanup_size)
+
+    def test_cleanup(self):
+        """Calling cleanup() directly shrinks to after_cleanup_size."""
+        cache = lru_cache.LRUCache(max_cache=5, after_cleanup_size=2)
+
+        # Add these in order
+        cache.add(1, 10)
+        cache.add(2, 20)
+        cache.add(3, 25)
+        cache.add(4, 30)
+        cache.add(5, 35)
+
+        self.assertEqual(5, len(cache))
+        # Force a cleanup, even though we are not over max_cache
+        cache.cleanup()
+        self.assertEqual(2, len(cache))
+
+    def test_compact_preserves_last_access_order(self):
+        """Compacting keeps one record per key, at its last position."""
+        cache = lru_cache.LRUCache(max_cache=5)
+
+        # Add these in order
+        cache.add(1, 10)
+        cache.add(2, 20)
+        cache.add(3, 25)
+        cache.add(4, 30)
+        cache.add(5, 35)
+
+        self.assertEqual([1, 2, 3, 4, 5], list(cache._queue))
+
+        # Now access some randomly
+        cache[2]
+        cache[5]
+        cache[3]
+        cache[2]
+        self.assertEqual([1, 2, 3, 4, 5, 2, 5, 3, 2], list(cache._queue))
+        self.assertEqual({1:1, 2:3, 3:2, 4:1, 5:2}, cache._refcount)
+
+        # Compacting should save the last position
+        cache._compact_queue()
+        self.assertEqual([1, 4, 5, 3, 2], list(cache._queue))
+        self.assertEqual({1:1, 2:1, 3:1, 4:1, 5:1}, cache._refcount)
+
+
+class TestLRUSizeCache(tests.TestCase):
+    """Test that LRUSizeCache limits itself by the size of its values."""
+
+    def test_basic_init(self):
+        """The defaults give a 1MB cache with a 2048 entry queue estimate."""
+        size_cache = lru_cache.LRUSizeCache()
+        self.assertEqual(2048, size_cache._max_cache)
+        self.assertEqual(4*2048, size_cache._compact_queue_length)
+        self.assertEqual(size_cache._max_size, size_cache._after_cleanup_size)
+        self.assertEqual(0, size_cache._value_size)
+
+    def test_add_tracks_size(self):
+        """Adding a value increases the tracked size by len(value)."""
+        size_cache = lru_cache.LRUSizeCache()
+        self.assertEqual(0, size_cache._value_size)
+        size_cache.add('my key', 'my value text')
+        self.assertEqual(13, size_cache._value_size)
+
+    def test_remove_tracks_size(self):
+        """Removing a value gives its size back."""
+        size_cache = lru_cache.LRUSizeCache()
+        self.assertEqual(0, size_cache._value_size)
+        size_cache.add('my key', 'my value text')
+        self.assertEqual(13, size_cache._value_size)
+        size_cache._remove('my key')
+        self.assertEqual(0, size_cache._value_size)
+
+    def test_no_add_over_size(self):
+        """Adding a large value may not be cached at all."""
+        size_cache = lru_cache.LRUSizeCache(max_size=10, after_cleanup_size=5)
+        self.assertEqual(0, size_cache._value_size)
+        self.assertEqual({}, size_cache._cache)
+        size_cache.add('test', 'key')
+        self.assertEqual(3, size_cache._value_size)
+        self.assertEqual({'test':'key'}, size_cache._cache)
+        # None of these may be stored: the first is larger than max_size, the
+        # second is larger than after_cleanup_size and the third is exactly
+        # after_cleanup_size. If we would add a value, only to cleanup and
+        # remove all cached entries, then obviously that value should not be
+        # stored.
+        rejected = [
+            ('test2', 'key that is too big'),
+            ('test3', 'bigkey'),
+            ('test4', 'bikey'),
+            ]
+        for key, value in rejected:
+            size_cache.add(key, value)
+            self.assertEqual(3, size_cache._value_size)
+            self.assertEqual({'test':'key'}, size_cache._cache)
+
+    def test_adding_clears_cache_based_on_size(self):
+        """The cache is cleared in LRU order until small enough"""
+        size_cache = lru_cache.LRUSizeCache(max_size=20)
+        # 5, 6 and 7 chars respectively
+        for key, value in [('key1', 'value'),
+                           ('key2', 'value2'),
+                           ('key3', 'value23')]:
+            size_cache.add(key, value)
+        self.assertEqual(5+6+7, size_cache._value_size)
+        size_cache['key2'] # reference key2 so it gets a newer reference time
+        size_cache.add('key4', 'value234') # 8 chars, over limit
+        # We have to remove 2 keys to get back under limit
+        self.assertEqual(6+8, size_cache._value_size)
+        expected = {'key2':'value2', 'key4':'value234'}
+        self.assertEqual(expected, size_cache._cache)
+
+    def test_adding_clears_to_after_cleanup_size(self):
+        """Going over max_size shrinks the cache to after_cleanup_size."""
+        size_cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
+        # 5, 6 and 7 chars respectively
+        for key, value in [('key1', 'value'),
+                           ('key2', 'value2'),
+                           ('key3', 'value23')]:
+            size_cache.add(key, value)
+        self.assertEqual(5+6+7, size_cache._value_size)
+        size_cache['key2'] # reference key2 so it gets a newer reference time
+        size_cache.add('key4', 'value234') # 8 chars, over limit
+        # We have to remove 3 keys to get back under limit
+        self.assertEqual(8, size_cache._value_size)
+        self.assertEqual({'key4':'value234'}, size_cache._cache)
+
+    def test_custom_sizes(self):
+        """A compute_size function replaces len() for measuring values."""
+        def size_of_list(lst):
+            return sum(map(len, lst))
+        size_cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10,
+                                            compute_size=size_of_list)
+
+        # 5, 6 and 7 chars respectively
+        for key, value in [('key1', ['val', 'ue']),
+                           ('key2', ['val', 'ue2']),
+                           ('key3', ['val', 'ue23'])]:
+            size_cache.add(key, value)
+        self.assertEqual(5+6+7, size_cache._value_size)
+        size_cache['key2'] # reference key2 so it gets a newer reference time
+        size_cache.add('key4', ['value', '234']) # 8 chars, over limit
+        # We have to remove 3 keys to get back under limit
+        self.assertEqual(8, size_cache._value_size)
+        self.assertEqual({'key4':['value', '234']}, size_cache._cache)
+
+    def test_cleanup(self):
+        """Calling cleanup() directly shrinks to after_cleanup_size."""
+        size_cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
+
+        # Add these in order: 5, 6 and 7 chars respectively
+        for key, value in [('key1', 'value'),
+                           ('key2', 'value2'),
+                           ('key3', 'value23')]:
+            size_cache.add(key, value)
+        self.assertEqual(5+6+7, size_cache._value_size)
+
+        size_cache.cleanup()
+        # Only the most recent fits after cleaning up
+        self.assertEqual(7, size_cache._value_size)
=== modified file 'NEWS'
--- a/NEWS 2007-11-14 17:03:53 +0000
+++ b/NEWS 2007-11-14 23:32:59 +0000
@@ -111,6 +111,9 @@
and parsing containers from streams rather than from files. (Andrew
Bennetts)
+ * New module ``lru_cache`` providing a cache for use by tasks that need
+ semi-random access to large amounts of data. (John A Meinel)
+
TESTING:
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2007-11-03 02:37:39 +0000
+++ b/bzrlib/tests/__init__.py 2007-11-14 21:07:54 +0000
@@ -2429,6 +2429,7 @@
'bzrlib.tests.test_lockable_files',
'bzrlib.tests.test_log',
'bzrlib.tests.test_lsprof',
+ 'bzrlib.tests.test_lru_cache',
'bzrlib.tests.test_mail_client',
'bzrlib.tests.test_memorytree',
'bzrlib.tests.test_merge',
More information about the bazaar-commits
mailing list