Rev 4767: Some review comments from Andrew. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set

Fri Oct 9 17:49:54 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set

------------------------------------------------------------
revno: 4767
revision-id: john at arbash-meinel.com-20091009164945-x2fx9hnxhebbtitu
parent: john at arbash-meinel.com-20091009152212-wfwi0ifw5hiaonoa
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-simple-set
timestamp: Fri 2009-10-09 11:49:45 -0500
message:
  Some review comments from Andrew.
  
  Clean up some loops, etc. Have more code-reuse.
  Check for more exception conditions, etc. Make sure that items
  added to the SimpleSet have tp_richcompare and tp_hash, so we don't
  have to sprinkle checks for that throughout the rest of the code.
-------------- next part --------------
=== modified file 'bzrlib/_simple_set_pyx.pxd'

--- a/bzrlib/_simple_set_pyx.pxd	2009-10-09 15:22:12 +0000
+++ b/bzrlib/_simple_set_pyx.pxd	2009-10-09 16:49:45 +0000
@@ -40,11 +40,36 @@
     (like a dict), but we also don't implement the complete list of 'set'
     operations (difference, intersection, etc).
     """
+    # Data structure definition:
+    #   This is a basic hash table using open addressing.
+    #       http://en.wikipedia.org/wiki/Open_addressing
+    #   Basically that means we keep an array of pointers to Python objects
+    #   (called a table). Each location in the array is called a 'slot'.
+    #
+    #   An empty slot holds a NULL pointer, a slot where there was an item
+    #   which was then deleted will hold a pointer to _dummy, and a filled slot
+    #   points at the actual object which fills that slot.
+    #
+    #   The table is always a power of two, and the default location where an
+    #   object is inserted is at hash(object) & (table_size - 1)
+    #
+    #   If there is a collision, then we search for another location. The
+    #   specific algorithm is in _lookup. We search until we:
+    #       find the object
+    #       find an equivalent object (by tp_richcompare(obj1, obj2, Py_EQ))
+    #       find a NULL slot
+    #
+    #   When an object is deleted, we set its slot to _dummy. this way we don't
+    #   have to track whether there was a collision, and find the corresponding
+    #   keys. (The collision resolution algorithm makes that nearly impossible
+    #   anyway, because it depends on the upper bits of the hash.)
+    #   The main effect of this, is that if we find _dummy, then we can insert
+    #   an object there, but we have to keep searching until we find NULL to
+    #   know that the object is not present elsewhere.
 
     cdef Py_ssize_t _used   # active
     cdef Py_ssize_t _fill   # active + dummy
-    cdef Py_ssize_t _mask   # Table contains (mask+1) slots, a power
-                                     # of 2
+    cdef Py_ssize_t _mask   # Table contains (mask+1) slots, a power of 2
     cdef PyObject **_table  # Pyrex/Cython doesn't support arrays to 'object'
                             # so we manage it manually
 

=== modified file 'bzrlib/_simple_set_pyx.pyx'
--- a/bzrlib/_simple_set_pyx.pyx	2009-10-08 04:40:16 +0000
+++ b/bzrlib/_simple_set_pyx.pyx	2009-10-09 16:49:45 +0000
@@ -33,19 +33,26 @@
         traverseproc tp_traverse
 
     PyTypeObject *Py_TYPE(PyObject *)
+    int PyObject_IsTrue(PyObject *)
         
     void *PyMem_Malloc(size_t nbytes)
     void PyMem_Free(void *)
     void memset(void *, int, size_t)
 
 
+# Dummy is an object used to mark nodes that have been deleted. Since
+# collisions require us to move a node to an alternative location, if we just
+# set an entry to NULL on delete, we won't find any relocated nodes.
+# We have to use _dummy_obj because we need to keep a refcount to it, but we
+# also use _dummy as a pointer, because it avoids having to put <PyObject*> all
+# over the code base.
 cdef object _dummy_obj
 cdef PyObject *_dummy
 _dummy_obj = object()
 _dummy = <PyObject *>_dummy_obj
 
 
-cdef int _is_equal(PyObject *this, long this_hash, PyObject *other):
+cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1:
     cdef long other_hash
     cdef PyObject *res
 
@@ -54,14 +61,26 @@
     other_hash = Py_TYPE(other).tp_hash(other)
     if other_hash != this_hash:
         return 0
+
+    # This implements a subset of the PyObject_RichCompareBool functionality.
+    # Namely it:
+    #   1) Doesn't try to do anything with old-style classes
+    #   2) Assumes that both objects have a tp_richcompare implementation, and
+    #      that if that is not enough to compare equal, then they are not
+    #      equal. (It doesn't try to cast them both to some intermediate form
+    #      that would compare equal.)
     res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)
-    if res == Py_True:
+    if res == NULL: # Exception
+        return -1
+    if PyObject_IsTrue(res):
         Py_DECREF(res)
         return 1
     if res == Py_NotImplemented:
         Py_DECREF(res)
         res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)
-    if res == Py_True:
+    if res == NULL:
+        return -1
+    if PyObject_IsTrue(res):
         Py_DECREF(res)
         return 1
     Py_DECREF(res)
@@ -172,23 +191,23 @@
         """
         cdef size_t i, perturb, mask
         cdef long the_hash
-        cdef PyObject **table, **entry
+        cdef PyObject **table, **slot
 
         mask = self._mask
         table = self._table
 
         the_hash = Py_TYPE(key).tp_hash(key)
         i = the_hash & mask
-        entry = &table[i]
+        slot = &table[i]
         perturb = the_hash
         # Because we know that we made all items unique before, we can just
         # iterate as long as the target location is not empty, we don't have to
         # do any comparison, etc.
-        while entry[0] != NULL:
+        while slot[0] != NULL:
             i = (i << 2) + i + perturb + 1
-            entry = &table[i & mask]
+            slot = &table[i & mask]
             perturb >>= PERTURB_SHIFT
-        entry[0] = key
+        slot[0] = key
         self._fill += 1
         self._used += 1
 
@@ -206,7 +225,7 @@
         :return: The new size of the internal table
         """
         cdef Py_ssize_t new_size, n_bytes, remaining
-        cdef PyObject **new_table, **old_table, **entry
+        cdef PyObject **new_table, **old_table, **slot
 
         new_size = DEFAULT_SIZE
         while new_size <= min_used and new_size > 0:
@@ -236,16 +255,16 @@
 
         # Moving everything to the other table is refcount neutral, so we don't
         # worry about it.
-        entry = old_table
+        slot = old_table
         while remaining > 0:
-            if entry[0] == NULL: # unused slot
+            if slot[0] == NULL: # unused slot
                 pass 
-            elif entry[0] == _dummy: # dummy slot
+            elif slot[0] == _dummy: # dummy slot
                 remaining -= 1
             else: # active slot
                 remaining -= 1
-                self._insert_clean(entry[0])
-            entry += 1
+                self._insert_clean(slot[0])
+            slot += 1
         PyMem_Free(old_table)
         return new_size
 
@@ -262,11 +281,15 @@
         cdef PyObject **slot, *py_key
         cdef int added
 
+        py_key = <PyObject *>key
+        if (Py_TYPE(py_key).tp_richcompare == NULL
+            or Py_TYPE(py_key).tp_hash == NULL):
+            raise TypeError('Types added to SimpleSet must implement'
+                            ' both tp_richcompare and tp_hash')
         added = 0
         # We need at least one empty slot
         assert self._used < self._mask
         slot = _lookup(self, key)
-        py_key = <PyObject *>key
         if (slot[0] == NULL):
             Py_INCREF(py_key)
             self._fill += 1
@@ -291,11 +314,13 @@
         return retval
 
     def discard(self, key):
-        """Remove key from the dict, whether it exists or not.
+        """Remove key from the set, whether it exists or not.
 
-        :return: 0 if the item did not exist, 1 if it did
+        :return: False if the item did not exist, True if it did
         """
-        return self._discard(key)
+        if self._discard(key):
+            return True
+        return False
 
     cdef int _discard(self, key) except -1:
         cdef PyObject **slot, *py_key
@@ -320,16 +345,6 @@
             self._resize(self._used * 2)
         return 1
 
-    def __delitem__(self, key):
-        """Remove the given item from the dict.
-
-        Raise a KeyError if the key was not present.
-        """
-        cdef int exists
-        exists = self._discard(key)
-        if not exists:
-            raise KeyError('Key %s not present' % (key,))
-
     def __iter__(self):
         return _SimpleSet_iterator(self)
 
@@ -353,7 +368,7 @@
 
     def __next__(self):
         cdef Py_ssize_t mask, i
-        cdef PyObject **table
+        cdef PyObject *key
 
         if self.set is None:
             raise StopIteration
@@ -361,21 +376,13 @@
             # Force this exception to continue to be raised
             self._used = -1
             raise RuntimeError("Set size changed during iteration")
-        i = self.pos
-        mask = self.set._mask
-        table = self.set._table
-        assert i >= 0
-        while i <= mask and (table[i] == NULL or table[i] == _dummy):
-            i += 1
-        self.pos = i + 1
-        if i > mask:
-            # we walked to the end
+        if not SimpleSet_Next(self.set, &self.pos, &key):
             self.set = None
             raise StopIteration
-        # We must have found one
-        key = <object>(table[i])
+        # we found something
+        the_key = <object>key # INCREF
         self.len -= 1
-        return key
+        return the_key
 
     def __length_hint__(self):
         if self.set is not None and self._used == self.set._used:
@@ -420,42 +427,38 @@
     cdef Py_ssize_t mask
     cdef long key_hash
     cdef long this_hash
-    cdef PyObject **table, **cur, **free_slot, *py_key
+    cdef PyObject **table, **slot, *cur, **free_slot, *py_key
 
     key_hash = hash(key)
     mask = self._mask
     table = self._table
+    free_slot = NULL
     i = key_hash & mask
-    cur = &table[i]
+    slot = &table[i]
+    cur = slot[0]
     py_key = <PyObject *>key
-    if cur[0] == NULL:
-        # Found a blank spot, or found the exact key
-        return cur
-    if cur[0] == py_key:
-        return cur
-    if cur[0] == _dummy:
-        free_slot = cur
-    else:
-        if _is_equal(py_key, key_hash, cur[0]):
-            # Both py_key and cur[0] belong in this slot, return it
-            return cur
-        free_slot = NULL
-    # size_t is unsigned, hash is signed...
     perturb = key_hash
     while True:
-        i = (i << 2) + i + perturb + 1
-        cur = &table[i & mask]
-        if cur[0] == NULL: # Found an empty spot
-            if free_slot: # Did we find a _dummy earlier?
+        if cur == NULL:
+            # Found a blank spot
+            if free_slot != NULL:
+                # Did we find an earlier _dummy entry?
                 return free_slot
             else:
-                return cur
-        if (cur[0] == py_key # exact match
-            or _is_equal(py_key, key_hash, cur[0])): # Equivalent match
-            return cur
-        if (cur[0] == _dummy and free_slot == NULL):
-            free_slot = cur
+                return slot
+        if cur == py_key:
+            # Found an exact pointer to the key
+            return slot
+        if cur == _dummy:
+            if free_slot == NULL:
+                free_slot = slot
+        elif _is_equal(py_key, key_hash, cur):
+            # Both py_key and cur belong in this slot, return it
+            return slot
+        i = (i << 2) + i + perturb + 1
         perturb >>= PERTURB_SHIFT
+        slot = &table[i & mask]
+        cur = slot[0]
     raise AssertionError('should never get here')
 
 
@@ -521,7 +524,6 @@
     return _check_self(self)._used
 
 
-# TODO: this should probably have direct tests, since it isn't used by __iter__
 cdef api int SimpleSet_Next(object self, Py_ssize_t *pos, PyObject **key):
     """Walk over items in a SimpleSet.
 
@@ -565,8 +567,7 @@
         ret = visit(next_key, arg)
         if ret:
             return ret
-
-    return 0;
+    return 0
 
 # It is a little bit ugly to do this, but it works, and means that Meliae can
 # dump the total memory consumed by all child objects.

=== modified file 'bzrlib/tests/test__simple_set.py'
--- a/bzrlib/tests/test__simple_set.py	2009-10-08 04:35:01 +0000
+++ b/bzrlib/tests/test__simple_set.py	2009-10-09 16:49:45 +0000
@@ -140,7 +140,7 @@
         self.assertIn(k3, obj)
         self.assertNotIn(k4, obj)
 
-        del obj[k1]
+        obj.discard(k1)
         self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
         self.assertEqual((787, k2), obj._test_lookup(k2))
         self.assertEqual((660, k3), obj._test_lookup(k3))
@@ -179,7 +179,7 @@
         self.assertRefcount(2, k1)
         self.assertRefcount(1, k2)
         # Deleting an entry should remove the fill, but not the used
-        del obj[k1]
+        obj.discard(k1)
         self.assertFillState(0, 1, 0x3ff, obj)
         self.assertRefcount(1, k1)
         k3 = tuple(['bar'])
@@ -210,23 +210,6 @@
         self.assertEqual(1, obj.discard(k3))
         self.assertRefcount(1, k3)
 
-    def test__delitem__(self):
-        obj = self.module.SimpleSet()
-        k1 = tuple(['foo'])
-        k2 = tuple(['foo'])
-        k3 = tuple(['bar'])
-        self.assertRefcount(1, k1)
-        self.assertRefcount(1, k2)
-        self.assertRefcount(1, k3)
-        obj.add(k1)
-        self.assertRefcount(2, k1)
-        self.assertRaises(KeyError, obj.__delitem__, k3)
-        self.assertRefcount(1, k3)
-        obj.add(k3)
-        self.assertRefcount(2, k3)
-        del obj[k3]
-        self.assertRefcount(1, k3)
-
     def test__resize(self):
         obj = self.module.SimpleSet()
         k1 = ('foo',)
@@ -235,13 +218,13 @@
         obj.add(k1)
         obj.add(k2)
         obj.add(k3)
-        del obj[k2]
+        obj.discard(k2)
         self.assertFillState(2, 3, 0x3ff, obj)
         self.assertEqual(1024, obj._py_resize(500))
         # Doesn't change the size, but does change the content
         self.assertFillState(2, 2, 0x3ff, obj)
         obj.add(k2)
-        del obj[k3]
+        obj.discard(k3)
         self.assertFillState(2, 3, 0x3ff, obj)
         self.assertEqual(4096, obj._py_resize(4095))
         self.assertFillState(2, 2, 0xfff, obj)
@@ -250,7 +233,7 @@
         self.assertNotIn(k3, obj)
         obj.add(k2)
         self.assertIn(k2, obj)
-        del obj[k2]
+        obj.discard(k2)
         self.assertEqual((591, '<dummy>'), obj._test_lookup(k2))
         self.assertFillState(1, 2, 0xfff, obj)
         self.assertEqual(2048, obj._py_resize(1024))
@@ -295,5 +278,5 @@
         # Set changed size
         self.assertRaises(RuntimeError, iterator.next)
         # And even removing an item still causes it to fail
-        del obj[k2]
+        obj.discard(k2)
         self.assertRaises(RuntimeError, iterator.next)