Rev 4770: Change the _lookup function to use Quadratic Probing. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set
John Arbash Meinel
john at arbash-meinel.com
Mon Oct 12 16:55:29 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set
------------------------------------------------------------
revno: 4770
revision-id: john at arbash-meinel.com-20091012155526-b46tuz5cibebbnct
parent: john at arbash-meinel.com-20091012145850-9363u41ovkaz7nka
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-simple-set
timestamp: Mon 2009-10-12 10:55:26 -0500
message:
Change the _lookup function to use Quadratic Probing.
The main strength is that we get a bit of locality (collision resolution
quickly probes locations close to the original bucket).
The main weakness is that we don't use the upper bits of the hash to
induce divergence. So while we nicely handle collisions in neighboring
buckets (Quadratic means their resolution chains will diverge), if
two objects hash to the same bucket, their resolution chains are identical.
-------------- next part --------------
=== modified file 'bzrlib/_simple_set_pyx.pyx'
--- a/bzrlib/_simple_set_pyx.pyx 2009-10-12 14:58:50 +0000
+++ b/bzrlib/_simple_set_pyx.pyx 2009-10-12 15:55:26 +0000
@@ -107,7 +107,6 @@
"""
# Attributes are defined in the .pxd file
DEF DEFAULT_SIZE=1024
- DEF PERTURB_SHIFT=5
def __init__(self):
cdef Py_ssize_t size, n_bytes
@@ -193,27 +192,27 @@
as it makes a lot of assuptions about keys not already being present,
and there being no dummy entries.
"""
- cdef size_t i, perturb, mask
+ cdef size_t i, n_lookup
cdef long the_hash
cdef PyObject **table, **slot
+ cdef Py_ssize_t mask
mask = self._mask
table = self._table
the_hash = Py_TYPE(key).tp_hash(key)
- i = the_hash & mask
- slot = &table[i]
- perturb = the_hash
- # Because we know that we made all items unique before, we can just
- # iterate as long as the target location is not empty, we don't have to
- # do any comparison, etc.
- while slot[0] != NULL:
- i = (i << 2) + i + perturb + 1
+ if the_hash == -1:
+ return -1
+ i = the_hash
+ for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
slot = &table[i & mask]
- perturb >>= PERTURB_SHIFT
- slot[0] = key
- self._fill += 1
- self._used += 1
+ if slot[0] == NULL:
+ slot[0] = key
+ self._fill += 1
+ self._used += 1
+ return 1
+ i = i + 1 + n_lookup
+ raise RuntimeError('ran out of slots.')
def _py_resize(self, min_used):
"""Do not use this directly, it is only exposed for testing."""
@@ -422,26 +421,50 @@
:param key: An object we are looking up
:param hash: The hash for key
- :return: The location in self.table where key should be put
- should never be NULL, but may reference a NULL (PyObject*)
+ :return: The location in self.table where key should be put.
+ location == NULL is an exception, but (*location) == NULL just
+ indicates the slot is empty and can be used.
"""
- # This is the heart of most functions, which is why it is pulled out as an
- # cdef inline function.
- cdef size_t i, perturb
+ # This uses Quadratic Probing:
+ # http://en.wikipedia.org/wiki/Quadratic_probing
+ # with c1 = c2 = 1/2
+ # This leads to probe locations at:
+ # h0 = hash(k1)
+ # h1 = h0 + 1
+ # h2 = h0 + 3 = h1 + 1 + 1
+ # h3 = h0 + 6 = h2 + 1 + 2
+ # h4 = h0 + 10 = h3 + 1 + 3
+ # Note that all of these are '& mask', but that is computed *after* the
+ # offset.
+ # This differs from the algorithm used by Set and Dict. Which, effectively,
+ # use double-hashing, and a step size that starts large, but dwindles to
+ # stepping one-by-one.
+ # This gives more 'locality' in that if you have a collision at offset X,
+ # the first fallback is X+1, which is fast to check. However, that means
+ # that an object w/ hash X+1 will also check there, and then X+2 next.
+ # However, for objects with differing hashes, their chains are different.
+ # The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...
+ # So different hashes diverge quickly.
+ # A bigger problem is that we *only* ever use the lowest bits of the hash
+ # So all integers (x + SIZE*N) will resolve into the same bucket, and all
+ # use the same collision resolution. We may want to try to find a way to
+ # incorporate the upper bits of the hash with quadratic probing. (For
+ # example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)
+ cdef size_t i, n_lookup
cdef Py_ssize_t mask
cdef long key_hash
cdef PyObject **table, **slot, *cur, **free_slot, *py_key
+ # hash is a signed long(), we are using an offset at unsigned size_t
key_hash = hash(key)
+ i = <size_t>key_hash
mask = self._mask
table = self._table
free_slot = NULL
- i = key_hash & mask
- slot = &table[i]
- cur = slot[0]
py_key = <PyObject *>key
- perturb = key_hash
- while True:
+ for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
+ slot = &table[i & mask]
+ cur = slot[0]
if cur == NULL:
# Found a blank spot
if free_slot != NULL:
@@ -458,10 +481,7 @@
elif _is_equal(py_key, key_hash, cur):
# Both py_key and cur belong in this slot, return it
return slot
- i = (i << 2) + i + perturb + 1
- perturb >>= PERTURB_SHIFT
- slot = &table[i & mask]
- cur = slot[0]
+ i = i + 1 + n_lookup
raise AssertionError('should never get here')
=== modified file 'bzrlib/tests/test__simple_set.py'
--- a/bzrlib/tests/test__simple_set.py 2009-10-12 14:58:50 +0000
+++ b/bzrlib/tests/test__simple_set.py 2009-10-12 15:55:26 +0000
@@ -101,6 +101,9 @@
def assertFillState(self, used, fill, mask, obj):
self.assertEqual((used, fill, mask), (obj.used, obj.fill, obj.mask))
+ def assertLookup(self, offset, value, obj, key):
+ self.assertEqual((offset, value), obj._test_lookup(key))
+
def assertRefcount(self, count, obj):
"""Assert that the refcount for obj is what we expect.
@@ -123,15 +126,36 @@
# These are carefully chosen integers to force hash collisions in the
# algorithm, based on the initial set size of 1024
obj = self.module.SimpleSet()
- offset, val = obj._test_lookup(_Hashable(643))
- self.assertEqual(643, offset)
- self.assertEqual('<null>', val)
- offset, val = obj._test_lookup(_Hashable(643 + 1024))
- self.assertEqual(643, offset)
- self.assertEqual('<null>', val)
- offset, val = obj._test_lookup(_Hashable(643 + 1024*50))
- self.assertEqual(643, offset)
- self.assertEqual('<null>', val)
+ self.assertLookup(643, '<null>', obj, _Hashable(643))
+ self.assertLookup(643, '<null>', obj, _Hashable(643 + 1024))
+ self.assertLookup(643, '<null>', obj, _Hashable(643 + 50*1024))
+
+ def test__lookup_collision(self):
+ obj = self.module.SimpleSet()
+ k1 = _Hashable(643)
+ k2 = _Hashable(643 + 1024)
+ self.assertLookup(643, '<null>', obj, k1)
+ self.assertLookup(643, '<null>', obj, k2)
+ obj.add(k1)
+ self.assertLookup(643, k1, obj, k1)
+ self.assertLookup(644, '<null>', obj, k2)
+
+ def test__lookup_after_resize(self):
+ obj = self.module.SimpleSet()
+ k1 = _Hashable(643)
+ k2 = _Hashable(643 + 1024)
+ obj.add(k1)
+ obj.add(k2)
+ self.assertLookup(643, k1, obj, k1)
+ self.assertLookup(644, k2, obj, k2)
+ obj._py_resize(2047) # resized to 2048
+ self.assertEqual(2048, obj.mask + 1)
+ self.assertLookup(643, k1, obj, k1)
+ self.assertLookup(643+1024, k2, obj, k2)
+ obj._py_resize(1023) # resized back to 1024
+ self.assertEqual(1024, obj.mask + 1)
+ self.assertLookup(643, k1, obj, k1)
+ self.assertLookup(644, k2, obj, k2)
def test_get_set_del_with_collisions(self):
obj = self.module.SimpleSet()
@@ -140,38 +164,47 @@
h2 = 643 + 1024
h3 = 643 + 1024*50
h4 = 643 + 1024*25
+ h5 = 644
+ h6 = 644 + 1024
k1 = _Hashable(h1)
k2 = _Hashable(h2)
k3 = _Hashable(h3)
k4 = _Hashable(h4)
- self.assertEqual((643, '<null>'), obj._test_lookup(k1))
- self.assertEqual((643, '<null>'), obj._test_lookup(k2))
- self.assertEqual((643, '<null>'), obj._test_lookup(k3))
- self.assertEqual((643, '<null>'), obj._test_lookup(k4))
+ k5 = _Hashable(h5)
+ k6 = _Hashable(h6)
+ self.assertLookup(643, '<null>', obj, k1)
+ self.assertLookup(643, '<null>', obj, k2)
+ self.assertLookup(643, '<null>', obj, k3)
+ self.assertLookup(643, '<null>', obj, k4)
+ self.assertLookup(644, '<null>', obj, k5)
+ self.assertLookup(644, '<null>', obj, k6)
obj.add(k1)
self.assertIn(k1, obj)
self.assertNotIn(k2, obj)
self.assertNotIn(k3, obj)
self.assertNotIn(k4, obj)
- self.assertEqual((643, k1), obj._test_lookup(k1))
- self.assertEqual((787, '<null>'), obj._test_lookup(k2))
- self.assertEqual((787, '<null>'), obj._test_lookup(k3))
- self.assertEqual((787, '<null>'), obj._test_lookup(k4))
+ self.assertLookup(643, k1, obj, k1)
+ self.assertLookup(644, '<null>', obj, k2)
+ self.assertLookup(644, '<null>', obj, k3)
+ self.assertLookup(644, '<null>', obj, k4)
+ self.assertLookup(644, '<null>', obj, k5)
+ self.assertLookup(644, '<null>', obj, k6)
self.assertIs(k1, obj[k1])
- obj.add(k2)
+ self.assertIs(k2, obj.add(k2))
self.assertIs(k2, obj[k2])
- self.assertEqual((643, k1), obj._test_lookup(k1))
- self.assertEqual((787, k2), obj._test_lookup(k2))
- self.assertEqual((436, '<null>'), obj._test_lookup(k3))
- # Even though k4 collides for the first couple of iterations, the hash
- # perturbation uses the full width hash (not just the masked value), so
- # it now diverges
- self.assertEqual((660, '<null>'), obj._test_lookup(k4))
- self.assertEqual((643, k1), obj._test_lookup(_Hashable(h1)))
- self.assertEqual((787, k2), obj._test_lookup(_Hashable(h2)))
- self.assertEqual((436, '<null>'), obj._test_lookup(_Hashable(h3)))
- self.assertEqual((660, '<null>'), obj._test_lookup(_Hashable(h4)))
+ self.assertLookup(643, k1, obj, k1)
+ self.assertLookup(644, k2, obj, k2)
+ self.assertLookup(646, '<null>', obj, k3)
+ self.assertLookup(646, '<null>', obj, k4)
+ self.assertLookup(645, '<null>', obj, k5)
+ self.assertLookup(645, '<null>', obj, k6)
+ self.assertLookup(643, k1, obj, _Hashable(h1))
+ self.assertLookup(644, k2, obj, _Hashable(h2))
+ self.assertLookup(646, '<null>', obj, _Hashable(h3))
+ self.assertLookup(646, '<null>', obj, _Hashable(h4))
+ self.assertLookup(645, '<null>', obj, _Hashable(h5))
+ self.assertLookup(645, '<null>', obj, _Hashable(h6))
obj.add(k3)
self.assertIs(k3, obj[k3])
self.assertIn(k1, obj)
@@ -180,10 +213,10 @@
self.assertNotIn(k4, obj)
obj.discard(k1)
- self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
- self.assertEqual((787, k2), obj._test_lookup(k2))
- self.assertEqual((436, k3), obj._test_lookup(k3))
- self.assertEqual((643, '<dummy>'), obj._test_lookup(k4))
+ self.assertLookup(643, '<dummy>', obj, k1)
+ self.assertLookup(644, k2, obj, k2)
+ self.assertLookup(646, k3, obj, k3)
+ self.assertLookup(643, '<dummy>', obj, k4)
self.assertNotIn(k1, obj)
self.assertIn(k2, obj)
self.assertIn(k3, obj)
More information about the bazaar-commits
mailing list