Rev 4737: Adding a StaticTupleInterner class. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple
John Arbash Meinel
john at arbash-meinel.com
Fri Oct 2 06:41:32 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple
------------------------------------------------------------
revno: 4737
revision-id: john at arbash-meinel.com-20091002054124-pbt81bw3utkr5ufu
parent: john at arbash-meinel.com-20091002053835-q4ztao5hwvzyr181
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-static-tuple
timestamp: Fri 2009-10-02 00:41:24 -0500
message:
Adding a StaticTupleInterner class.
Functions basically like a dict, but with less memory overhead.
(well, possibly more like a set that also implements __getitem__).
For now, it allows arbitrary objects, though I'm considering
only allowing StaticTuple objects for some ease-of-implementation.
Especially if I find I want to cache the hash() on StaticTuple for
performance, then I may want to expose that in StaticTupleInterner.
For now, this only compiles via Cython, but only because I'm using
'inline' in one place.
It wouldn't be too hard to factor that out, or make it conditional
with an IF and DEF bits.
-------------- next part --------------
=== modified file '.bzrignore'
--- a/.bzrignore 2009-07-31 19:56:19 +0000
+++ b/.bzrignore 2009-10-02 05:41:24 +0000
@@ -51,6 +51,7 @@
bzrlib/_known_graph_pyx.c
bzrlib/_readdir_pyx.c
bzrlib/_rio_pyx.c
+bzrlib/_static_tuple_interned_pyx.c
bzrlib/_walkdirs_win32.c
doc/en/release-notes/NEWS.txt
doc/en/developer-guide/HACKING.txt
=== added file 'bzrlib/_static_tuple_interned_pyx.pyx'
--- a/bzrlib/_static_tuple_interned_pyx.pyx 1970-01-01 00:00:00 +0000
+++ b/bzrlib/_static_tuple_interned_pyx.pyx 2009-10-02 05:41:24 +0000
@@ -0,0 +1,212 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Definition of a class that is used to intern StaticTuple objects."""
+
+cdef extern from "Python.h":
+ ctypedef unsigned long size_t
+ ctypedef struct PyTypeObject
+ ctypedef struct PyObject:
+ PyTypeObject *ob_type
+ ctypedef long (*hashfunc)(PyObject*)
+ ctypedef PyObject *(*richcmpfunc)(PyObject *, PyObject *, int)
+ int Py_EQ
+ PyObject *Py_True
+ PyObject *Py_NotImplemented
+ void Py_INCREF(PyObject *)
+ void Py_DECREF(PyObject *)
+ ctypedef struct PyTypeObject:
+ hashfunc tp_hash
+ richcmpfunc tp_richcompare
+
+ void *PyMem_Malloc(size_t nbytes)
+ void PyMem_Free(void *)
+ void memset(void *, int, size_t)
+
+
+cdef object _dummy_obj
+cdef PyObject *_dummy
+_dummy_obj = object()
+_dummy = <PyObject *>_dummy_obj
+
+
+cdef inline int _is_equal(PyObject *this, long this_hash, PyObject *other):
+ cdef long other_hash
+ cdef PyObject *res
+
+ if this == other:
+ return 1
+ other_hash = other.ob_type.tp_hash(other)
+ if other_hash != this_hash:
+ return 0
+ res = this.ob_type.tp_richcompare(this, other, Py_EQ)
+ if res == Py_True:
+ Py_DECREF(res)
+ return 1
+ if res == Py_NotImplemented:
+ Py_DECREF(res)
+ res = other.ob_type.tp_richcompare(other, this, Py_EQ)
+ if res == Py_True:
+ Py_DECREF(res)
+ return 1
+ Py_DECREF(res)
+ return 0
+
+
+cdef class StaticTupleInterner:
+ """This class tracks the canonical forms for StaticTuples.
+
+ It is similar in function to the interned dictionary that is used by
+ strings. However:
+
+ 1) It assumes that hash(obj) is cheap, so does not need to inline a copy
+ of it
+ 2) It only stores one reference to the object, rather than 2 (key vs
+ key:value)
+
+ As such, it uses 1/3rd the amount of memory to store a pointer to the
+ interned object.
+ """
+
+ cdef readonly Py_ssize_t used # active
+ cdef readonly Py_ssize_t fill # active + dummy
+ cdef readonly Py_ssize_t mask # Table contains (mask+1) slots, a power
+ # of 2
+ cdef PyObject **table # Pyrex/Cython doesn't support arrays to 'object'
+ # so we manage it manually
+
+ DEF DEFAULT_SIZE=1024
+ DEF PERTURB_SHIFT=5
+
+ def __init__(self):
+ cdef Py_ssize_t size, n_bytes
+
+ size = DEFAULT_SIZE
+ self.mask = size - 1
+ self.used = 0
+ self.fill = 0
+ n_bytes = sizeof(PyObject*) * size;
+ self.table = <PyObject **>PyMem_Malloc(n_bytes)
+ # TODO: Raise MemoryError if malloc fails
+ memset(self.table, 0, n_bytes)
+
+ def __dealloc__(self):
+ if self.table != NULL:
+ PyMem_Free(self.table)
+ self.table = NULL
+
+ def __len__(self):
+ return self.used
+
+ cdef PyObject **_lookup(self, key, long hash) except NULL:
+ """Find the slot where 'key' would fit.
+
+ This is the same as a dicts 'lookup' function
+
+ :param key: An object we are looking up
+ :param hash: The hash for key
+ :return: The location in self.table where key should be put
+ should never be NULL, but may reference a NULL (PyObject*)
+ """
+ cdef size_t i, perturb
+ cdef Py_ssize_t mask
+ cdef long this_hash
+ cdef PyObject **table, **cur, **free_slot, *py_key
+
+ mask = self.mask
+ table = self.table
+ i = hash & mask
+ cur = &table[i]
+ py_key = <PyObject *>key
+ if cur[0] == NULL:
+ # Found a blank spot, or found the exact key
+ return cur
+ if cur[0] == py_key:
+ return cur
+ if cur[0] == _dummy:
+ free_slot = cur
+ else:
+ if _is_equal(py_key, hash, cur[0]):
+ # Both py_key and cur[0] belong in this slot, return it
+ return cur
+ free_slot = NULL
+ # size_t is unsigned, hash is signed...
+ perturb = hash
+ while True:
+ i = (i << 2) + i + perturb + 1
+ cur = &table[i & mask]
+ if cur[0] == NULL: # Found an empty spot
+ if free_slot: # Did we find a _dummy earlier?
+ return free_slot
+ else:
+ return cur
+ if (cur[0] == py_key # exact match
+ or _is_equal(py_key, hash, cur[0])): # Equivalent match
+ return cur
+ if (cur[0] == _dummy and free_slot == NULL):
+ free_slot = cur
+ perturb >>= PERTURB_SHIFT
+ raise AssertionError('should never get here')
+
+ def _test_lookup(self, key):
+ cdef PyObject **slot
+
+ slot = self._lookup(key, hash(key))
+ if slot[0] == NULL:
+ res = '<null>'
+ elif slot[0] == _dummy:
+ res = '<dummy>'
+ else:
+ res = <object>slot[0]
+ return <int>(slot - self.table), res
+
+ def __contains__(self, key):
+ cdef PyObject **slot
+
+ slot = self._lookup(key, hash(key))
+ if slot[0] == NULL or slot[0] == _dummy:
+ return False
+ return True
+
+ def __getitem__(self, key):
+ cdef PyObject **slot
+ slot = self._lookup(key, hash(key))
+ if slot[0] == NULL or slot[0] == _dummy:
+ raise KeyError("Key %s is not present" % key)
+ val = <object>(slot[0])
+ return val
+
+ def __setitem__(self, key, value):
+ cdef PyObject **slot, *py_key
+ assert key == value
+
+ slot = self._lookup(key, hash(key))
+ if (slot[0] == NULL or slot[0] == _dummy):
+ py_key = <PyObject *>key
+ Py_INCREF(py_key)
+ slot[0] = py_key
+
+ def __delitem__(self, key):
+ cdef PyObject **slot, *py_key
+
+ slot = self._lookup(key, hash(key))
+ if (slot[0] == NULL or slot[0] == _dummy):
+ pass # Raise KeyError
+ return
+ # Found it
+ # TODO: Check refcounts
+ Py_DECREF(slot[0])
+ slot[0] = _dummy
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2009-09-30 17:43:33 +0000
+++ b/bzrlib/tests/__init__.py 2009-10-02 05:41:24 +0000
@@ -3514,6 +3514,7 @@
'bzrlib.tests.test__known_graph',
'bzrlib.tests.test__rio',
'bzrlib.tests.test__static_tuple',
+ 'bzrlib.tests.test__static_tuple_interned',
'bzrlib.tests.test__walkdirs_win32',
'bzrlib.tests.test_ancestry',
'bzrlib.tests.test_annotate',
=== added file 'bzrlib/tests/test__static_tuple_interned.py'
--- a/bzrlib/tests/test__static_tuple_interned.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test__static_tuple_interned.py 2009-10-02 05:41:24 +0000
@@ -0,0 +1,144 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for the StaticTupleInterned type."""
+
+from bzrlib import (
+ # _static_tuple_py,
+ errors,
+ osutils,
+ tests,
+ )
+
+from bzrlib.tests import (
+ test__static_tuple,
+ )
+try:
+ from bzrlib import _static_tuple_interned_pyx as _module
+except ImportError:
+ _module = None
+try:
+ from bzrlib._static_tuple_c import StaticTuple
+except ImportError:
+ pass
+
+
+# Even though this is an extension, we don't permute the tests for a python
+# version. As the plain python version is just a dict.
+
+class _CompiledStaticTupleInterned(tests.Feature):
+
+ def _probe(self):
+ if _module is None:
+ return False
+ return True
+
+ def feature_name(self):
+ return 'bzrlib._static_tuple_interned_pyx'
+
+CompiledStaticTupleInterned = _CompiledStaticTupleInterned()
+
+
+class TestStaticTupleInterned(tests.TestCase):
+
+ _test_needs_features = [CompiledStaticTupleInterned,
+ test__static_tuple.CompiledStaticTuple]
+
+ def assertIn(self, obj, container):
+ self.assertTrue(obj in container,
+ '%s not found in %s' % (obj, container))
+
+ def assertNotIn(self, obj, container):
+ self.assertTrue(obj not in container,
+ 'We found %s in %s' % (obj, container))
+
+ def test_initial(self):
+ obj = _module.StaticTupleInterner()
+ self.assertEqual(0, len(obj))
+ st = StaticTuple('foo', 'bar')
+ self.assertEqual(0, obj.used)
+ self.assertEqual(0, obj.fill)
+ self.assertEqual(0x3ff, obj.mask) # 1024 - 1
+
+ def test__lookup(self):
+ # The tuple hash function is rather good at entropy. For all integers
+ # 0=>1023, hash((i,)) & 1023 maps to a unique output, and hash((i,j))
+ # maps to all 1024 fields evenly.
+ # However, hash((c,d))& 1023 for characters has an uneven distribution
+ # of collisions, for example:
+ # ('a', 'a'), ('f', '4'), ('p', 'r'), ('q', '1'), ('F', 'T'),
+ # ('Q', 'Q'), ('V', 'd'), ('7', 'C')
+ # all collide @ 643
+ obj = _module.StaticTupleInterner()
+ offset, val = obj._test_lookup(StaticTuple('a', 'a'))
+ self.assertEqual(643, offset)
+ self.assertEqual('<null>', val)
+ offset, val = obj._test_lookup(StaticTuple('f', '4'))
+ self.assertEqual(643, offset)
+ self.assertEqual('<null>', val)
+ offset, val = obj._test_lookup(StaticTuple('p', 'r'))
+ self.assertEqual(643, offset)
+ self.assertEqual('<null>', val)
+
+ def test_get_set_del_with_collisions(self):
+ obj = _module.StaticTupleInterner()
+ k1 = StaticTuple('a', 'a')
+ k2 = StaticTuple('f', '4') # collides
+ k3 = StaticTuple('p', 'r')
+ k4 = StaticTuple('q', '1')
+ self.assertEqual((643, '<null>'), obj._test_lookup(k1))
+ self.assertEqual((643, '<null>'), obj._test_lookup(k2))
+ self.assertEqual((643, '<null>'), obj._test_lookup(k3))
+ self.assertEqual((643, '<null>'), obj._test_lookup(k4))
+ obj[k1] = k1
+ self.assertIn(k1, obj)
+ self.assertNotIn(k2, obj)
+ self.assertNotIn(k3, obj)
+ self.assertNotIn(k4, obj)
+ self.assertEqual((643, k1), obj._test_lookup(k1))
+ self.assertEqual((787, '<null>'), obj._test_lookup(k2))
+ self.assertEqual((787, '<null>'), obj._test_lookup(k3))
+ self.assertEqual((787, '<null>'), obj._test_lookup(k4))
+ self.assertIs(k1, obj[k1])
+ obj[k2] = k2
+ self.assertIs(k2, obj[k2])
+ self.assertEqual((643, k1), obj._test_lookup(k1))
+ self.assertEqual((787, k2), obj._test_lookup(k2))
+ self.assertEqual((660, '<null>'), obj._test_lookup(k3))
+ # Even though k4 collides for the first couple of iterations, the hash
+ # perturbation uses the full width hash (not just the masked value), so
+ # it now diverges
+ self.assertEqual((180, '<null>'), obj._test_lookup(k4))
+ self.assertEqual((643, k1), obj._test_lookup(('a', 'a')))
+ self.assertEqual((787, k2), obj._test_lookup(('f', '4')))
+ self.assertEqual((660, '<null>'), obj._test_lookup(('p', 'r')))
+ self.assertEqual((180, '<null>'), obj._test_lookup(('q', '1')))
+ obj[k3] = k3
+ self.assertIs(k3, obj[k3])
+ self.assertIn(k1, obj)
+ self.assertIn(k2, obj)
+ self.assertIn(k3, obj)
+ self.assertNotIn(k4, obj)
+
+ del obj[k1]
+ self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
+ self.assertEqual((787, k2), obj._test_lookup(k2))
+ self.assertEqual((660, k3), obj._test_lookup(k3))
+ self.assertEqual((643, '<dummy>'), obj._test_lookup(k4))
+ self.assertNotIn(k1, obj)
+ self.assertIn(k2, obj)
+ self.assertIn(k3, obj)
+ self.assertNotIn(k4, obj)
=== modified file 'setup.py'
--- a/setup.py 2009-09-30 17:43:33 +0000
+++ b/setup.py 2009-10-02 05:41:24 +0000
@@ -297,6 +297,7 @@
ext_modules.append(Extension('bzrlib._static_tuple_c',
['bzrlib/_static_tuple_c.c']))
add_pyrex_extension('bzrlib._btree_serializer_pyx')
+add_pyrex_extension('bzrlib._static_tuple_interned_pyx')
if unavailable_files:
More information about the bazaar-commits
mailing list