Rev 4737: Adding a StaticTupleInterner class. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple
John Arbash Meinel
john at arbash-meinel.com
Fri Oct 2 06:40:49 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple
------------------------------------------------------------
revno: 4737
revision-id: john at arbash-meinel.com-20091002054041-2qaug0o2ugr0ojr3
parent: john at arbash-meinel.com-20091002053835-q4ztao5hwvzyr181
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-static-tuple
timestamp: Fri 2009-10-02 00:40:41 -0500
message:
Adding a StaticTupleInterner class.
Functions basically like a dict, but with less memory overhead.
(well, possibly more like a set that also implements __getitem__).
For now, it allows arbitrary objects, though I'm considering
only allowing StaticTuple objects for some ease-of-implementation.
Especially if I find I want to cache the hash() on StaticTuple for
performance, then I may want to expose that in StaticTupleInterner.
-------------- next part --------------
=== modified file '.bzrignore'
--- a/.bzrignore 2009-07-31 19:56:19 +0000
+++ b/.bzrignore 2009-10-02 05:40:41 +0000
@@ -51,6 +51,7 @@
bzrlib/_known_graph_pyx.c
bzrlib/_readdir_pyx.c
bzrlib/_rio_pyx.c
+bzrlib/_static_tuple_interned_pyx.c
bzrlib/_walkdirs_win32.c
doc/en/release-notes/NEWS.txt
doc/en/developer-guide/HACKING.txt
=== added file 'bzrlib/_static_tuple_interned_pyx.pyx'
--- a/bzrlib/_static_tuple_interned_pyx.pyx 1970-01-01 00:00:00 +0000
+++ b/bzrlib/_static_tuple_interned_pyx.pyx 2009-10-02 05:40:41 +0000
@@ -0,0 +1,212 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Definition of a class that is used to intern StaticTuple objects."""
+
+cdef extern from "Python.h":
+ ctypedef unsigned long size_t
+ ctypedef struct PyTypeObject
+ ctypedef struct PyObject:
+ PyTypeObject *ob_type
+ ctypedef long (*hashfunc)(PyObject*)
+ ctypedef PyObject *(*richcmpfunc)(PyObject *, PyObject *, int)
+ int Py_EQ
+ PyObject *Py_True
+ PyObject *Py_NotImplemented
+ void Py_INCREF(PyObject *)
+ void Py_DECREF(PyObject *)
+ ctypedef struct PyTypeObject:
+ hashfunc tp_hash
+ richcmpfunc tp_richcompare
+
+ void *PyMem_Malloc(size_t nbytes)
+ void PyMem_Free(void *)
+ void memset(void *, int, size_t)
+
+
+cdef object _dummy_obj
+cdef PyObject *_dummy
+_dummy_obj = object()
+_dummy = <PyObject *>_dummy_obj
+
+
+cdef inline int _is_equal(PyObject *this, long this_hash, PyObject *other):
+ cdef long other_hash
+ cdef PyObject *res
+
+ if this == other:
+ return 1
+ other_hash = other.ob_type.tp_hash(other)
+ if other_hash != this_hash:
+ return 0
+ res = this.ob_type.tp_richcompare(this, other, Py_EQ)
+ if res == Py_True:
+ Py_DECREF(res)
+ return 1
+ if res == Py_NotImplemented:
+ Py_DECREF(res)
+ res = other.ob_type.tp_richcompare(other, this, Py_EQ)
+ if res == Py_True:
+ Py_DECREF(res)
+ return 1
+ Py_DECREF(res)
+ return 0
+
+
+cdef class StaticTupleInterner:
+ """This class tracks the canonical forms for StaticTuples.
+
+ It is similar in function to the interned dictionary that is used by
+ strings. However:
+
+ 1) It assumes that hash(obj) is cheap, so does not need to inline a copy
+ of it
+ 2) It only stores one reference to the object, rather than 2 (key vs
+ key:value)
+
+ As such, it uses 1/3rd the amount of memory to store a pointer to the
+ interned object.
+ """
+
+ cdef readonly Py_ssize_t used # active
+ cdef readonly Py_ssize_t fill # active + dummy
+ cdef readonly Py_ssize_t mask # Table contains (mask+1) slots, a power
+ # of 2
+ cdef PyObject **table # Pyrex/Cython doesn't support arrays to 'object'
+ # so we manage it manually
+
+ DEF DEFAULT_SIZE=1024
+ DEF PERTURB_SHIFT=5
+
+ def __init__(self):
+ cdef Py_ssize_t size, n_bytes
+
+ size = DEFAULT_SIZE
+ self.mask = size - 1
+ self.used = 0
+ self.fill = 0
+ n_bytes = sizeof(PyObject*) * size;
+ self.table = <PyObject **>PyMem_Malloc(n_bytes)
+ # TODO: Raise MemoryError if malloc fails
+ memset(self.table, 0, n_bytes)
+
+ def __dealloc__(self):
+ if self.table != NULL:
+ PyMem_Free(self.table)
+ self.table = NULL
+
+ def __len__(self):
+ return self.used
+
+ cdef PyObject **_lookup(self, key, long hash) except NULL:
+ """Find the slot where 'key' would fit.
+
+ This is the same as a dicts 'lookup' function
+
+ :param key: An object we are looking up
+ :param hash: The hash for key
+ :return: The location in self.table where key should be put
+ should never be NULL, but may reference a NULL (PyObject*)
+ """
+ cdef size_t i, perturb
+ cdef Py_ssize_t mask
+ cdef long this_hash
+ cdef PyObject **table, **cur, **free_slot, *py_key
+
+ mask = self.mask
+ table = self.table
+ i = hash & mask
+ cur = &table[i]
+ py_key = <PyObject *>key
+ if cur[0] == NULL:
+ # Found a blank spot, or found the exact key
+ return cur
+ if cur[0] == py_key:
+ return cur
+ if cur[0] == _dummy:
+ free_slot = cur
+ else:
+ if _is_equal(py_key, hash, cur[0]):
+ # Both py_key and cur[0] belong in this slot, return it
+ return cur
+ free_slot = NULL
+ # size_t is unsigned, hash is signed...
+ perturb = hash
+ while True:
+ i = (i << 2) + i + perturb + 1
+ cur = &table[i & mask]
+ if cur[0] == NULL: # Found an empty spot
+ if free_slot: # Did we find a _dummy earlier?
+ return free_slot
+ else:
+ return cur
+ if (cur[0] == py_key # exact match
+ or _is_equal(py_key, hash, cur[0])): # Equivalent match
+ return cur
+ if (cur[0] == _dummy and free_slot == NULL):
+ free_slot = cur
+ perturb >>= PERTURB_SHIFT
+ raise AssertionError('should never get here')
+
+ def _test_lookup(self, key):
+ cdef PyObject **slot
+
+ slot = self._lookup(key, hash(key))
+ if slot[0] == NULL:
+ res = '<null>'
+ elif slot[0] == _dummy:
+ res = '<dummy>'
+ else:
+ res = <object>slot[0]
+ return <int>(slot - self.table), res
+
+ def __contains__(self, key):
+ cdef PyObject **slot
+
+ slot = self._lookup(key, hash(key))
+ if slot[0] == NULL or slot[0] == _dummy:
+ return False
+ return True
+
+ def __getitem__(self, key):
+ cdef PyObject **slot
+ slot = self._lookup(key, hash(key))
+ if slot[0] == NULL or slot[0] == _dummy:
+ raise KeyError("Key %s is not present" % key)
+ val = <object>(slot[0])
+ return val
+
+ def __setitem__(self, key, value):
+ cdef PyObject **slot, *py_key
+ assert key == value
+
+ slot = self._lookup(key, hash(key))
+ if (slot[0] == NULL or slot[0] == _dummy):
+ py_key = <PyObject *>key
+ Py_INCREF(py_key)
+ slot[0] = py_key
+
+ def __delitem__(self, key):
+ cdef PyObject **slot, *py_key
+
+ slot = self._lookup(key, hash(key))
+ if (slot[0] == NULL or slot[0] == _dummy):
+ pass # Raise KeyError
+ return
+ # Found it
+ # TODO: Check refcounts
+ Py_DECREF(slot[0])
+ slot[0] = _dummy
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2009-09-30 17:43:33 +0000
+++ b/bzrlib/tests/__init__.py 2009-10-02 05:40:41 +0000
@@ -3514,6 +3514,7 @@
'bzrlib.tests.test__known_graph',
'bzrlib.tests.test__rio',
'bzrlib.tests.test__static_tuple',
+ 'bzrlib.tests.test__static_tuple_interned',
'bzrlib.tests.test__walkdirs_win32',
'bzrlib.tests.test_ancestry',
'bzrlib.tests.test_annotate',
=== added file 'bzrlib/tests/test__static_tuple_interned.py'
--- a/bzrlib/tests/test__static_tuple_interned.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test__static_tuple_interned.py 2009-10-02 05:40:41 +0000
@@ -0,0 +1,144 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for the StaticTupleInterned type."""
+
+from bzrlib import (
+ # _static_tuple_py,
+ errors,
+ osutils,
+ tests,
+ )
+
+from bzrlib.tests import (
+ test__static_tuple,
+ )
+try:
+ from bzrlib import _static_tuple_interned_pyx as _module
+except ImportError:
+ _module = None
+try:
+ from bzrlib._static_tuple_c import StaticTuple
+except ImportError:
+ pass
+
+
+# Even though this is an extension, we don't permute the tests for a python
+# version. As the plain python version is just a dict.
+
+class _CompiledStaticTupleInterned(tests.Feature):
+
+ def _probe(self):
+ if _module is None:
+ return False
+ return True
+
+ def feature_name(self):
+ return 'bzrlib._static_tuple_interned_pyx'
+
+CompiledStaticTupleInterned = _CompiledStaticTupleInterned()
+
+
+class TestStaticTupleInterned(tests.TestCase):
+
+ _test_needs_features = [CompiledStaticTupleInterned,
+ test__static_tuple.CompiledStaticTuple]
+
+ def assertIn(self, obj, container):
+ self.assertTrue(obj in container,
+ '%s not found in %s' % (obj, container))
+
+ def assertNotIn(self, obj, container):
+ self.assertTrue(obj not in container,
+ 'We found %s in %s' % (obj, container))
+
+ def test_initial(self):
+ obj = _module.StaticTupleInterner()
+ self.assertEqual(0, len(obj))
+ st = StaticTuple('foo', 'bar')
+ self.assertEqual(0, obj.used)
+ self.assertEqual(0, obj.fill)
+ self.assertEqual(0x3ff, obj.mask) # 1024 - 1
+
+ def test__lookup(self):
+ # The tuple hash function is rather good at entropy. For all integers
+ # 0=>1023, hash((i,)) & 1023 maps to a unique output, and hash((i,j))
+ # maps to all 1024 fields evenly.
+ # However, hash((c,d))& 1023 for characters has an uneven distribution
+ # of collisions, for example:
+ # ('a', 'a'), ('f', '4'), ('p', 'r'), ('q', '1'), ('F', 'T'),
+ # ('Q', 'Q'), ('V', 'd'), ('7', 'C')
+ # all collide @ 643
+ obj = _module.StaticTupleInterner()
+ offset, val = obj._test_lookup(StaticTuple('a', 'a'))
+ self.assertEqual(643, offset)
+ self.assertEqual('<null>', val)
+ offset, val = obj._test_lookup(StaticTuple('f', '4'))
+ self.assertEqual(643, offset)
+ self.assertEqual('<null>', val)
+ offset, val = obj._test_lookup(StaticTuple('p', 'r'))
+ self.assertEqual(643, offset)
+ self.assertEqual('<null>', val)
+
+ def test_get_set_del_with_collisions(self):
+ obj = _module.StaticTupleInterner()
+ k1 = StaticTuple('a', 'a')
+ k2 = StaticTuple('f', '4') # collides
+ k3 = StaticTuple('p', 'r')
+ k4 = StaticTuple('q', '1')
+ self.assertEqual((643, '<null>'), obj._test_lookup(k1))
+ self.assertEqual((643, '<null>'), obj._test_lookup(k2))
+ self.assertEqual((643, '<null>'), obj._test_lookup(k3))
+ self.assertEqual((643, '<null>'), obj._test_lookup(k4))
+ obj[k1] = k1
+ self.assertIn(k1, obj)
+ self.assertNotIn(k2, obj)
+ self.assertNotIn(k3, obj)
+ self.assertNotIn(k4, obj)
+ self.assertEqual((643, k1), obj._test_lookup(k1))
+ self.assertEqual((787, '<null>'), obj._test_lookup(k2))
+ self.assertEqual((787, '<null>'), obj._test_lookup(k3))
+ self.assertEqual((787, '<null>'), obj._test_lookup(k4))
+ self.assertIs(k1, obj[k1])
+ obj[k2] = k2
+ self.assertIs(k2, obj[k2])
+ self.assertEqual((643, k1), obj._test_lookup(k1))
+ self.assertEqual((787, k2), obj._test_lookup(k2))
+ self.assertEqual((660, '<null>'), obj._test_lookup(k3))
+ # Even though k4 collides for the first couple of iterations, the hash
+ # perturbation uses the full width hash (not just the masked value), so
+ # it now diverges
+ self.assertEqual((180, '<null>'), obj._test_lookup(k4))
+ self.assertEqual((643, k1), obj._test_lookup(('a', 'a')))
+ self.assertEqual((787, k2), obj._test_lookup(('f', '4')))
+ self.assertEqual((660, '<null>'), obj._test_lookup(('p', 'r')))
+ self.assertEqual((180, '<null>'), obj._test_lookup(('q', '1')))
+ obj[k3] = k3
+ self.assertIs(k3, obj[k3])
+ self.assertIn(k1, obj)
+ self.assertIn(k2, obj)
+ self.assertIn(k3, obj)
+ self.assertNotIn(k4, obj)
+
+ del obj[k1]
+ self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
+ self.assertEqual((787, k2), obj._test_lookup(k2))
+ self.assertEqual((660, k3), obj._test_lookup(k3))
+ self.assertEqual((643, '<dummy>'), obj._test_lookup(k4))
+ self.assertNotIn(k1, obj)
+ self.assertIn(k2, obj)
+ self.assertIn(k3, obj)
+ self.assertNotIn(k4, obj)
=== modified file 'setup.py'
--- a/setup.py 2009-09-30 17:43:33 +0000
+++ b/setup.py 2009-10-02 05:40:41 +0000
@@ -297,6 +297,7 @@
ext_modules.append(Extension('bzrlib._static_tuple_c',
['bzrlib/_static_tuple_c.c']))
add_pyrex_extension('bzrlib._btree_serializer_pyx')
+add_pyrex_extension('bzrlib._static_tuple_interned_pyx')
if unavailable_files:
More information about the bazaar-commits
mailing list