Rev 4770: Change the _lookup function to use Quadratic Probing. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set

John Arbash Meinel john at arbash-meinel.com
Mon Oct 12 16:55:29 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set

------------------------------------------------------------
revno: 4770
revision-id: john at arbash-meinel.com-20091012155526-b46tuz5cibebbnct
parent: john at arbash-meinel.com-20091012145850-9363u41ovkaz7nka
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-simple-set
timestamp: Mon 2009-10-12 10:55:26 -0500
message:
  Change the _lookup function to use Quadratic Probing.
  
  The main strength is that we get a bit of locality (collision resolution
  looks quickly close to the location.)
  The main weakness is that we don't use the upper bits of the hash to
  induce divergence. So while we nicely handle collisions in neighboring
  buckets (Quadratic means their resolution chains will diverge), if
  two objects hash to the same bucket, their resolution chains are identical.
-------------- next part --------------
=== modified file 'bzrlib/_simple_set_pyx.pyx'
--- a/bzrlib/_simple_set_pyx.pyx	2009-10-12 14:58:50 +0000
+++ b/bzrlib/_simple_set_pyx.pyx	2009-10-12 15:55:26 +0000
@@ -107,7 +107,6 @@
     """
     # Attributes are defined in the .pxd file
     DEF DEFAULT_SIZE=1024
-    DEF PERTURB_SHIFT=5
 
     def __init__(self):
         cdef Py_ssize_t size, n_bytes
@@ -193,27 +192,27 @@
         as it makes a lot of assuptions about keys not already being present,
         and there being no dummy entries.
         """
-        cdef size_t i, perturb, mask
+        cdef size_t i, n_lookup
         cdef long the_hash
         cdef PyObject **table, **slot
+        cdef Py_ssize_t mask
 
         mask = self._mask
         table = self._table
 
         the_hash = Py_TYPE(key).tp_hash(key)
-        i = the_hash & mask
-        slot = &table[i]
-        perturb = the_hash
-        # Because we know that we made all items unique before, we can just
-        # iterate as long as the target location is not empty, we don't have to
-        # do any comparison, etc.
-        while slot[0] != NULL:
-            i = (i << 2) + i + perturb + 1
+        if the_hash == -1:
+            return -1
+        i = the_hash
+        for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
             slot = &table[i & mask]
-            perturb >>= PERTURB_SHIFT
-        slot[0] = key
-        self._fill += 1
-        self._used += 1
+            if slot[0] == NULL:
+                slot[0] = key
+                self._fill += 1
+                self._used += 1
+                return 1
+            i = i + 1 + n_lookup
+        raise RuntimeError('ran out of slots.')
 
     def _py_resize(self, min_used):
         """Do not use this directly, it is only exposed for testing."""
@@ -422,26 +421,50 @@
 
     :param key: An object we are looking up
     :param hash: The hash for key
-    :return: The location in self.table where key should be put
-        should never be NULL, but may reference a NULL (PyObject*)
+    :return: The location in self.table where key should be put.
+        location == NULL is an exception, but (*location) == NULL just
+        indicates the slot is empty and can be used.
     """
-    # This is the heart of most functions, which is why it is pulled out as an
-    # cdef inline function.
-    cdef size_t i, perturb
+    # This uses Quadratic Probing:
+    #  http://en.wikipedia.org/wiki/Quadratic_probing
+    # with c1 = c2 = 1/2
+    # This leads to probe locations at:
+    #  h0 = hash(k1)
+    #  h1 = h0 + 1
+    #  h2 = h0 + 3 = h1 + 1 + 1
+    #  h3 = h0 + 6 = h2 + 1 + 2
+    #  h4 = h0 + 10 = h2 + 1 + 3
+    # Note that all of these are '& mask', but that is computed *after* the
+    # offset.
+    # This differs from the algorithm used by Set and Dict. Which, effectively,
+    # use double-hashing, and a step size that starts large, but dwindles to
+    # stepping one-by-one.
+    # This gives more 'locality' in that if you have a collision at offset X,
+    # the first fallback is X+1, which is fast to check. However, that means
+    # that an object w/ hash X+1 will also check there, and then X+2 next.
+    # However, for objects with differing hashes, their chains are different.
+    # The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...
+    # So different hashes diverge quickly.
+    # A bigger problem is that we *only* ever use the lowest bits of the hash
+    # So all integers (x + SIZE*N) will resolve into the same bucket, and all
+    # use the same collision resolution. We may want to try to find a way to
+    # incorporate the upper bits of the hash with quadratic probing. (For
+    # example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)
+    cdef size_t i, n_lookup
     cdef Py_ssize_t mask
     cdef long key_hash
     cdef PyObject **table, **slot, *cur, **free_slot, *py_key
 
+    # hash is a signed long(), we are using an offset at unsigned size_t
     key_hash = hash(key)
+    i = <size_t>key_hash
     mask = self._mask
     table = self._table
     free_slot = NULL
-    i = key_hash & mask
-    slot = &table[i]
-    cur = slot[0]
     py_key = <PyObject *>key
-    perturb = key_hash
-    while True:
+    for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
+        slot = &table[i & mask]
+        cur = slot[0]
         if cur == NULL:
             # Found a blank spot
             if free_slot != NULL:
@@ -458,10 +481,7 @@
         elif _is_equal(py_key, key_hash, cur):
             # Both py_key and cur belong in this slot, return it
             return slot
-        i = (i << 2) + i + perturb + 1
-        perturb >>= PERTURB_SHIFT
-        slot = &table[i & mask]
-        cur = slot[0]
+        i = i + 1 + n_lookup
     raise AssertionError('should never get here')
 
 

=== modified file 'bzrlib/tests/test__simple_set.py'
--- a/bzrlib/tests/test__simple_set.py	2009-10-12 14:58:50 +0000
+++ b/bzrlib/tests/test__simple_set.py	2009-10-12 15:55:26 +0000
@@ -101,6 +101,9 @@
     def assertFillState(self, used, fill, mask, obj):
         self.assertEqual((used, fill, mask), (obj.used, obj.fill, obj.mask))
 
+    def assertLookup(self, offset, value, obj, key):
+        self.assertEqual((offset, value), obj._test_lookup(key))
+
     def assertRefcount(self, count, obj):
         """Assert that the refcount for obj is what we expect.
 
@@ -123,15 +126,36 @@
         # These are carefully chosen integers to force hash collisions in the
         # algorithm, based on the initial set size of 1024
         obj = self.module.SimpleSet()
-        offset, val = obj._test_lookup(_Hashable(643))
-        self.assertEqual(643, offset)
-        self.assertEqual('<null>', val)
-        offset, val = obj._test_lookup(_Hashable(643 + 1024))
-        self.assertEqual(643, offset)
-        self.assertEqual('<null>', val)
-        offset, val = obj._test_lookup(_Hashable(643 + 1024*50))
-        self.assertEqual(643, offset)
-        self.assertEqual('<null>', val)
+        self.assertLookup(643, '<null>', obj, _Hashable(643))
+        self.assertLookup(643, '<null>', obj, _Hashable(643 + 1024))
+        self.assertLookup(643, '<null>', obj, _Hashable(643 + 50*1024))
+
+    def test__lookup_collision(self):
+        obj = self.module.SimpleSet()
+        k1 = _Hashable(643)
+        k2 = _Hashable(643 + 1024)
+        self.assertLookup(643, '<null>', obj, k1)
+        self.assertLookup(643, '<null>', obj, k2)
+        obj.add(k1)
+        self.assertLookup(643, k1, obj, k1)
+        self.assertLookup(644, '<null>', obj, k2)
+
+    def test__lookup_after_resize(self):
+        obj = self.module.SimpleSet()
+        k1 = _Hashable(643)
+        k2 = _Hashable(643 + 1024)
+        obj.add(k1)
+        obj.add(k2)
+        self.assertLookup(643, k1, obj, k1)
+        self.assertLookup(644, k2, obj, k2)
+        obj._py_resize(2047) # resized to 2048
+        self.assertEqual(2048, obj.mask + 1)
+        self.assertLookup(643, k1, obj, k1)
+        self.assertLookup(643+1024, k2, obj, k2)
+        obj._py_resize(1023) # resized back to 1024
+        self.assertEqual(1024, obj.mask + 1)
+        self.assertLookup(643, k1, obj, k1)
+        self.assertLookup(644, k2, obj, k2)
 
     def test_get_set_del_with_collisions(self):
         obj = self.module.SimpleSet()
@@ -140,38 +164,47 @@
         h2 = 643 + 1024
         h3 = 643 + 1024*50
         h4 = 643 + 1024*25
+        h5 = 644
+        h6 = 644 + 1024
 
         k1 = _Hashable(h1)
         k2 = _Hashable(h2)
         k3 = _Hashable(h3)
         k4 = _Hashable(h4)
-        self.assertEqual((643, '<null>'), obj._test_lookup(k1))
-        self.assertEqual((643, '<null>'), obj._test_lookup(k2))
-        self.assertEqual((643, '<null>'), obj._test_lookup(k3))
-        self.assertEqual((643, '<null>'), obj._test_lookup(k4))
+        k5 = _Hashable(h5)
+        k6 = _Hashable(h6)
+        self.assertLookup(643, '<null>', obj, k1)
+        self.assertLookup(643, '<null>', obj, k2)
+        self.assertLookup(643, '<null>', obj, k3)
+        self.assertLookup(643, '<null>', obj, k4)
+        self.assertLookup(644, '<null>', obj, k5)
+        self.assertLookup(644, '<null>', obj, k6)
         obj.add(k1)
         self.assertIn(k1, obj)
         self.assertNotIn(k2, obj)
         self.assertNotIn(k3, obj)
         self.assertNotIn(k4, obj)
-        self.assertEqual((643, k1), obj._test_lookup(k1))
-        self.assertEqual((787, '<null>'), obj._test_lookup(k2))
-        self.assertEqual((787, '<null>'), obj._test_lookup(k3))
-        self.assertEqual((787, '<null>'), obj._test_lookup(k4))
+        self.assertLookup(643, k1, obj, k1)
+        self.assertLookup(644, '<null>', obj, k2)
+        self.assertLookup(644, '<null>', obj, k3)
+        self.assertLookup(644, '<null>', obj, k4)
+        self.assertLookup(644, '<null>', obj, k5)
+        self.assertLookup(644, '<null>', obj, k6)
         self.assertIs(k1, obj[k1])
-        obj.add(k2)
+        self.assertIs(k2, obj.add(k2))
         self.assertIs(k2, obj[k2])
-        self.assertEqual((643, k1), obj._test_lookup(k1))
-        self.assertEqual((787, k2), obj._test_lookup(k2))
-        self.assertEqual((436, '<null>'), obj._test_lookup(k3))
-        # Even though k4 collides for the first couple of iterations, the hash
-        # perturbation uses the full width hash (not just the masked value), so
-        # it now diverges
-        self.assertEqual((660, '<null>'), obj._test_lookup(k4))
-        self.assertEqual((643, k1), obj._test_lookup(_Hashable(h1)))
-        self.assertEqual((787, k2), obj._test_lookup(_Hashable(h2)))
-        self.assertEqual((436, '<null>'), obj._test_lookup(_Hashable(h3)))
-        self.assertEqual((660, '<null>'), obj._test_lookup(_Hashable(h4)))
+        self.assertLookup(643, k1, obj, k1)
+        self.assertLookup(644, k2, obj, k2)
+        self.assertLookup(646, '<null>', obj, k3)
+        self.assertLookup(646, '<null>', obj, k4)
+        self.assertLookup(645, '<null>', obj, k5)
+        self.assertLookup(645, '<null>', obj, k6)
+        self.assertLookup(643, k1, obj, _Hashable(h1))
+        self.assertLookup(644, k2, obj, _Hashable(h2))
+        self.assertLookup(646, '<null>', obj, _Hashable(h3))
+        self.assertLookup(646, '<null>', obj, _Hashable(h4))
+        self.assertLookup(645, '<null>', obj, _Hashable(h5))
+        self.assertLookup(645, '<null>', obj, _Hashable(h6))
         obj.add(k3)
         self.assertIs(k3, obj[k3])
         self.assertIn(k1, obj)
@@ -180,10 +213,10 @@
         self.assertNotIn(k4, obj)
 
         obj.discard(k1)
-        self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
-        self.assertEqual((787, k2), obj._test_lookup(k2))
-        self.assertEqual((436, k3), obj._test_lookup(k3))
-        self.assertEqual((643, '<dummy>'), obj._test_lookup(k4))
+        self.assertLookup(643, '<dummy>', obj, k1)
+        self.assertLookup(644, k2, obj, k2)
+        self.assertLookup(646, k3, obj, k3)
+        self.assertLookup(643, '<dummy>', obj, k4)
         self.assertNotIn(k1, obj)
         self.assertIn(k2, obj)
         self.assertIn(k3, obj)



More information about the bazaar-commits mailing list