Rev 4768: Switch to using a _Hashable class, rather than using tuples. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set

Fri Oct 9 18:03:36 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/2.1-simple-set

------------------------------------------------------------
revno: 4768
revision-id: john at arbash-meinel.com-20091009170327-ezw4vx4su1msq3ap
parent: john at arbash-meinel.com-20091009164945-x2fx9hnxhebbtitu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-simple-set
timestamp: Fri 2009-10-09 12:03:27 -0500
message:
  Switch to using a _Hashable class, rather than using tuples.
  
  This allows us to more-easily trigger collisions. I was going to just use ints,
  but they don't seem to implement tp_richcompare...
-------------- next part --------------
=== modified file 'bzrlib/tests/test__simple_set.py'

--- a/bzrlib/tests/test__simple_set.py	2009-10-09 16:49:45 +0000
+++ b/bzrlib/tests/test__simple_set.py	2009-10-09 17:03:27 +0000
@@ -30,6 +30,24 @@
     _simple_set_pyx = None
 
 
+class _Hashable(object):
+    """A simple object which has a fixed hash value.
+
+    We could have used an 'int', but it turns out that Int objects don't
+    implement tp_richcompare...
+    """
+
+    def __init__(self, the_hash):
+        self.hash = the_hash
+
+    def __hash__(self):
+        return self.hash
+
+    def __eq__(self, other):
+        if not isinstance(other, _Hashable):
+            return NotImplemented
+        return other.hash == self.hash
+
 # Even though this is an extension, we don't permute the tests for a python
 # version. As the plain python version is just a dict or set
 
@@ -81,31 +99,31 @@
         self.assertFillState(0, 0, 0x3ff, obj)
 
     def test__lookup(self):
-        # The tuple hash function is rather good at entropy. For all integers
-        # 0=>1023, hash((i,)) & 1023 maps to a unique output, and hash((i,j))
-        # maps to all 1024 fields evenly.
-        # However, hash((c,d))& 1023 for characters has an uneven distribution
-        # of collisions, for example:
-        #  ('a', 'a'), ('f', '4'), ('p', 'r'), ('q', '1'), ('F', 'T'),
-        #  ('Q', 'Q'), ('V', 'd'), ('7', 'C')
-        # all collide @ 643
+        # These are carefully chosen integers to force hash collisions in the
+        # algorithm, based on the initial set size of 1024
         obj = self.module.SimpleSet()
-        offset, val = obj._test_lookup(('a', 'a'))
-        self.assertEqual(643, offset)
-        self.assertEqual('<null>', val)
-        offset, val = obj._test_lookup(('f', '4'))
-        self.assertEqual(643, offset)
-        self.assertEqual('<null>', val)
-        offset, val = obj._test_lookup(('p', 'r'))
+        offset, val = obj._test_lookup(_Hashable(643))
+        self.assertEqual(643, offset)
+        self.assertEqual('<null>', val)
+        offset, val = obj._test_lookup(_Hashable(643 + 1024))
+        self.assertEqual(643, offset)
+        self.assertEqual('<null>', val)
+        offset, val = obj._test_lookup(_Hashable(643 + 1024*50))
         self.assertEqual(643, offset)
         self.assertEqual('<null>', val)
 
     def test_get_set_del_with_collisions(self):
         obj = self.module.SimpleSet()
-        k1 = ('a', 'a')
-        k2 = ('f', '4') # collides
-        k3 = ('p', 'r')
-        k4 = ('q', '1')
+
+        h1 = 643
+        h2 = 643 + 1024
+        h3 = 643 + 1024*50
+        h4 = 643 + 1024*25
+
+        k1 = _Hashable(h1)
+        k2 = _Hashable(h2)
+        k3 = _Hashable(h3)
+        k4 = _Hashable(h4)
         self.assertEqual((643, '<null>'), obj._test_lookup(k1))
         self.assertEqual((643, '<null>'), obj._test_lookup(k2))
         self.assertEqual((643, '<null>'), obj._test_lookup(k3))
@@ -124,15 +142,15 @@
         self.assertIs(k2, obj[k2])
         self.assertEqual((643, k1), obj._test_lookup(k1))
         self.assertEqual((787, k2), obj._test_lookup(k2))
-        self.assertEqual((660, '<null>'), obj._test_lookup(k3))
+        self.assertEqual((436, '<null>'), obj._test_lookup(k3))
         # Even though k4 collides for the first couple of iterations, the hash
         # perturbation uses the full width hash (not just the masked value), so
         # it now diverges
-        self.assertEqual((180, '<null>'), obj._test_lookup(k4))
-        self.assertEqual((643, k1), obj._test_lookup(('a', 'a')))
-        self.assertEqual((787, k2), obj._test_lookup(('f', '4')))
-        self.assertEqual((660, '<null>'), obj._test_lookup(('p', 'r')))
-        self.assertEqual((180, '<null>'), obj._test_lookup(('q', '1')))
+        self.assertEqual((660, '<null>'), obj._test_lookup(k4))
+        self.assertEqual((643, k1), obj._test_lookup(_Hashable(h1)))
+        self.assertEqual((787, k2), obj._test_lookup(_Hashable(h2)))
+        self.assertEqual((436, '<null>'), obj._test_lookup(_Hashable(h3)))
+        self.assertEqual((660, '<null>'), obj._test_lookup(_Hashable(h4)))
         obj.add(k3)
         self.assertIs(k3, obj[k3])
         self.assertIn(k1, obj)
@@ -143,7 +161,7 @@
         obj.discard(k1)
         self.assertEqual((643, '<dummy>'), obj._test_lookup(k1))
         self.assertEqual((787, k2), obj._test_lookup(k2))
-        self.assertEqual((660, k3), obj._test_lookup(k3))
+        self.assertEqual((436, k3), obj._test_lookup(k3))
         self.assertEqual((643, '<dummy>'), obj._test_lookup(k4))
         self.assertNotIn(k1, obj)
         self.assertIn(k2, obj)