Rev 4767: Some review comments from Andrew. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set
John Arbash Meinel
john at arbash-meinel.com
Fri Oct 9 17:49:54 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set
------------------------------------------------------------
revno: 4767
revision-id: john at arbash-meinel.com-20091009164945-x2fx9hnxhebbtitu
parent: john at arbash-meinel.com-20091009152212-wfwi0ifw5hiaonoa
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-simple-set
timestamp: Fri 2009-10-09 11:49:45 -0500
message:
Some review comments from Andrew.
Clean up some loops, etc. Have more code-reuse.
Check for more exception conditions, etc. Make sure that items
added to the SimpleSet have tp_richcompare and tp_hash, so we don't
have to sprinkle checks for that throughout the rest of the code.
-------------- next part --------------
=== modified file 'bzrlib/_simple_set_pyx.pxd'
--- a/bzrlib/_simple_set_pyx.pxd 2009-10-09 15:22:12 +0000
+++ b/bzrlib/_simple_set_pyx.pxd 2009-10-09 16:49:45 +0000
@@ -40,11 +40,36 @@
(like a dict), but we also don't implement the complete list of 'set'
operations (difference, intersection, etc).
"""
+ # Data structure definition:
+ # This is a basic hash table using open addressing.
+ # http://en.wikipedia.org/wiki/Open_addressing
+ # Basically that means we keep an array of pointers to Python objects
+ # (called a table). Each location in the array is called a 'slot'.
+ #
+ # An empty slot holds a NULL pointer, a slot where there was an item
+ # which was then deleted will hold a pointer to _dummy, and a filled slot
+ # points at the actual object which fills that slot.
+ #
+ # The table is always a power of two, and the default location where an
+ # object is inserted is at hash(object) & (table_size - 1)
+ #
+ # If there is a collision, then we search for another location. The
+ # specific algorithm is in _lookup. We search until we:
+ # find the object
+ # find an equivalent object (by tp_richcompare(obj1, obj2, Py_EQ))
+ # find a NULL slot
+ #
+ # When an object is deleted, we set its slot to _dummy. this way we don't
+ # have to track whether there was a collision, and find the corresponding
+ # keys. (The collision resolution algorithm makes that nearly impossible
+ # anyway, because it depends on the upper bits of the hash.)
+ # The main effect of this, is that if we find _dummy, then we can insert
+ # an object there, but we have to keep searching until we find NULL to
+ # know that the object is not present elsewhere.
cdef Py_ssize_t _used # active
cdef Py_ssize_t _fill # active + dummy
- cdef Py_ssize_t _mask # Table contains (mask+1) slots, a power
- # of 2
+ cdef Py_ssize_t _mask # Table contains (mask+1) slots, a power of 2
cdef PyObject **_table # Pyrex/Cython doesn't support arrays to 'object'
# so we manage it manually
=== modified file 'bzrlib/_simple_set_pyx.pyx'
--- a/bzrlib/_simple_set_pyx.pyx 2009-10-08 04:40:16 +0000
+++ b/bzrlib/_simple_set_pyx.pyx 2009-10-09 16:49:45 +0000
@@ -33,19 +33,26 @@
traverseproc tp_traverse
PyTypeObject *Py_TYPE(PyObject *)
+ int PyObject_IsTrue(PyObject *)
void *PyMem_Malloc(size_t nbytes)
void PyMem_Free(void *)
void memset(void *, int, size_t)
+# Dummy is an object used to mark nodes that have been deleted. Since
+# collisions require us to move a node to an alternative location, if we just
+# set an entry to NULL on delete, we won't find any relocated nodes.
+# We have to use _dummy_obj because we need to keep a refcount to it, but we
+# also use _dummy as a pointer, because it avoids having to put <PyObject*> all
+# over the code base.
cdef object _dummy_obj
cdef PyObject *_dummy
_dummy_obj = object()
_dummy = <PyObject *>_dummy_obj
-cdef int _is_equal(PyObject *this, long this_hash, PyObject *other):
+cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1:
cdef long other_hash
cdef PyObject *res
@@ -54,14 +61,26 @@
other_hash = Py_TYPE(other).tp_hash(other)
if other_hash != this_hash:
return 0
+
+ # This implements a subset of the PyObject_RichCompareBool functionality.
+ # Namely it:
+ # 1) Doesn't try to do anything with old-style classes
+ # 2) Assumes that both objects have a tp_richcompare implementation, and
+ # that if that is not enough to compare equal, then they are not
+ # equal. (It doesn't try to cast them both to some intermediate form
+ # that would compare equal.)
res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)
- if res == Py_True:
+ if res == NULL: # Exception
+ return -1
+ if PyObject_IsTrue(res):
Py_DECREF(res)
return 1
if res == Py_NotImplemented:
Py_DECREF(res)
res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)
- if res == Py_True:
+ if res == NULL:
+ return -1
+ if PyObject_IsTrue(res):
Py_DECREF(res)
return 1
Py_DECREF(res)
@@ -172,23 +191,23 @@
"""
cdef size_t i, perturb, mask
cdef long the_hash
- cdef PyObject **table, **entry
+ cdef PyObject **table, **slot
mask = self._mask
table = self._table
the_hash = Py_TYPE(key).tp_hash(key)
i = the_hash & mask
- entry = &table[i]
+ slot = &table[i]
perturb = the_hash
# Because we know that we made all items unique before, we can just
# iterate as long as the target location is not empty, we don't have to
# do any comparison, etc.
- while entry[0] != NULL:
+ while slot[0] != NULL:
i = (i << 2) + i + perturb + 1
- entry = &table[i & mask]
+ slot = &table[i & mask]
perturb >>= PERTURB_SHIFT
- entry[0] = key
+ slot[0] = key
self._fill += 1
self._used += 1
@@ -206,7 +225,7 @@
:return: The new size of the internal table
"""
cdef Py_ssize_t new_size, n_bytes, remaining
- cdef PyObject **new_table, **old_table, **entry
+ cdef PyObject **new_table, **old_table, **slot
new_size = DEFAULT_SIZE
while new_size <= min_used and new_size > 0:
@@ -236,16 +255,16 @@
# Moving everything to the other table is refcount neutral, so we don't
# worry about it.
- entry = old_table
+ slot = old_table
while remaining > 0:
- if entry[0] == NULL: # unused slot
+ if slot[0] == NULL: # unused slot
pass
- elif entry[0] == _dummy: # dummy slot
+ elif slot[0] == _dummy: # dummy slot
remaining -= 1
else: # active slot
remaining -= 1
- self._insert_clean(entry[0])
- entry += 1
+ self._insert_clean(slot[0])
+ slot += 1
PyMem_Free(old_table)
return new_size
@@ -262,11 +281,15 @@
cdef PyObject **slot, *py_key
cdef int added
+ py_key = <PyObject *>key
+ if (Py_TYPE(py_key).tp_richcompare == NULL
+ or Py_TYPE(py_key).tp_hash == NULL):
+ raise TypeError('Types added to SimpleSet must implement'
+ ' both tp_richcompare and tp_hash')
added = 0
# We need at least one empty slot
assert self._used < self._mask
slot = _lookup(self, key)
- py_key = <PyObject *>key
if (slot[0] == NULL):
Py_INCREF(py_key)
self._fill += 1
@@ -291,11 +314,13 @@
return retval
def discard(self, key):
- """Remove key from the dict, whether it exists or not.
+ """Remove key from the set, whether it exists or not.
- :return: 0 if the item did not exist, 1 if it did
+ :return: False if the item did not exist, True if it did
"""
- return self._discard(key)
+ if self._discard(key):
+ return True
+ return False
cdef int _discard(self, key) except -1:
cdef PyObject **slot, *py_key
@@ -320,16 +345,6 @@
self._resize(self._used * 2)
return 1
- def __delitem__(self, key):
- """Remove the given item from the dict.
-
- Raise a KeyError if the key was not present.
- """
- cdef int exists
- exists = self._discard(key)
- if not exists:
- raise KeyError('Key %s not present' % (key,))
-
def __iter__(self):
return _SimpleSet_iterator(self)
@@ -353,7 +368,7 @@
def __next__(self):
cdef Py_ssize_t mask, i
- cdef PyObject **table
+ cdef PyObject *key
if self.set is None:
raise StopIteration
@@ -361,21 +376,13 @@
# Force this exception to continue to be raised
self._used = -1
raise RuntimeError("Set size changed during iteration")
- i = self.pos
- mask = self.set._mask
- table = self.set._table
- assert i >= 0
- while i <= mask and (table[i] == NULL or table[i] == _dummy):
- i += 1
- self.pos = i + 1
- if i > mask:
- # we walked to the end
+ if not SimpleSet_Next(self.set, &self.pos, &key):
self.set = None
raise StopIteration
- # We must have found one
- key = <object>(table[i])
+ # we found something
+ the_key = <object>key # INCREF
self.len -= 1
- return key
+ return the_key
def __length_hint__(self):
if self.set is not None and self._used == self.set._used:
@@ -420,42 +427,38 @@
cdef Py_ssize_t mask
cdef long key_hash
cdef long this_hash
- cdef PyObject **table, **cur, **free_slot, *py_key
+ cdef PyObject **table, **slot, *cur, **free_slot, *py_key
key_hash = hash(key)
mask = self._mask
table = self._table
+ free_slot = NULL
i = key_hash & mask
- cur = &table[i]
+ slot = &table[i]
+ cur = slot[0]
py_key = <PyObject *>key
- if cur[0] == NULL:
- # Found a blank spot, or found the exact key
- return cur
- if cur[0] == py_key:
- return cur
- if cur[0] == _dummy:
- free_slot = cur
- else:
- if _is_equal(py_key, key_hash, cur[0]):
- # Both py_key and cur[0] belong in this slot, return it
- return cur
- free_slot = NULL
- # size_t is unsigned, hash is signed...
perturb = key_hash
while True:
- i = (i << 2) + i + perturb + 1
- cur = &table[i & mask]
- if cur[0] == NULL: # Found an empty spot
- if free_slot: # Did we find a _dummy earlier?
+ if cur == NULL:
+ # Found a blank spot
+ if free_slot != NULL:
+ # Did we find an earlier _dummy entry?
return free_slot
else:
- return cur
- if (cur[0] == py_key # exact match
- or _is_equal(py_key, key_hash, cur[0])): # Equivalent match
- return cur
- if (cur[0] == _dummy and free_slot == NULL):
- free_slot = cur
+ return slot
+ if cur == py_key:
+ # Found an exact pointer to the key
+ return slot
+ if cur == _dummy:
+ if free_slot == NULL:
+ free_slot = slot
+ elif _is_equal(py_key, key_hash, cur):
+ # Both py_key and cur belong in this slot, return it
+ return slot
+ i = (i << 2) + i + perturb + 1
perturb >>= PERTURB_SHIFT
+ slot = &table[i & mask]
+ cur = slot[0]
raise AssertionError('should never get here')
@@ -521,7 +524,6 @@
return _check_self(self)._used
-# TODO: this should probably have direct tests, since it isn't used by __iter__
cdef api int SimpleSet_Next(object self, Py_ssize_t *pos, PyObject **key):
"""Walk over items in a SimpleSet.
@@ -565,8 +567,7 @@
ret = visit(next_key, arg)
if ret:
return ret
-
- return 0;
+ return 0
# It is a little bit ugly to do this, but it works, and means that Meliae can
# dump the total memory consumed by all child objects.
=== modified file 'bzrlib/tests/test__simple_set.py'
--- a/bzrlib/tests/test__simple_set.py 2009-10-08 04:35:01 +0000
+++ b/bzrlib/tests/test__simple_set.py 2009-10-09 16:49:45 +0000
@@ -140,7 +140,7 @@
self.assertIn(k3, obj)
self.assertNotIn(k4, obj)
- del obj[k1]
+ obj.discard(k1)
self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
self.assertEqual((787, k2), obj._test_lookup(k2))
self.assertEqual((660, k3), obj._test_lookup(k3))
@@ -179,7 +179,7 @@
self.assertRefcount(2, k1)
self.assertRefcount(1, k2)
# Deleting an entry should remove the fill, but not the used
- del obj[k1]
+ obj.discard(k1)
self.assertFillState(0, 1, 0x3ff, obj)
self.assertRefcount(1, k1)
k3 = tuple(['bar'])
@@ -210,23 +210,6 @@
self.assertEqual(1, obj.discard(k3))
self.assertRefcount(1, k3)
- def test__delitem__(self):
- obj = self.module.SimpleSet()
- k1 = tuple(['foo'])
- k2 = tuple(['foo'])
- k3 = tuple(['bar'])
- self.assertRefcount(1, k1)
- self.assertRefcount(1, k2)
- self.assertRefcount(1, k3)
- obj.add(k1)
- self.assertRefcount(2, k1)
- self.assertRaises(KeyError, obj.__delitem__, k3)
- self.assertRefcount(1, k3)
- obj.add(k3)
- self.assertRefcount(2, k3)
- del obj[k3]
- self.assertRefcount(1, k3)
-
def test__resize(self):
obj = self.module.SimpleSet()
k1 = ('foo',)
@@ -235,13 +218,13 @@
obj.add(k1)
obj.add(k2)
obj.add(k3)
- del obj[k2]
+ obj.discard(k2)
self.assertFillState(2, 3, 0x3ff, obj)
self.assertEqual(1024, obj._py_resize(500))
# Doesn't change the size, but does change the content
self.assertFillState(2, 2, 0x3ff, obj)
obj.add(k2)
- del obj[k3]
+ obj.discard(k3)
self.assertFillState(2, 3, 0x3ff, obj)
self.assertEqual(4096, obj._py_resize(4095))
self.assertFillState(2, 2, 0xfff, obj)
@@ -250,7 +233,7 @@
self.assertNotIn(k3, obj)
obj.add(k2)
self.assertIn(k2, obj)
- del obj[k2]
+ obj.discard(k2)
self.assertEqual((591, '<dummy>'), obj._test_lookup(k2))
self.assertFillState(1, 2, 0xfff, obj)
self.assertEqual(2048, obj._py_resize(1024))
@@ -295,5 +278,5 @@
# Set changed size
self.assertRaises(RuntimeError, iterator.next)
# And even removing an item still causes it to fail
- del obj[k2]
+ obj.discard(k2)
self.assertRaises(RuntimeError, iterator.next)
More information about the bazaar-commits
mailing list