Rev 3205: Get prefix scaling working. in http://people.ubuntu.com/~robertc/baz2.0/index.hashed

Robert Collins robertc at robertcollins.net
Sun Jan 27 18:15:36 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/index.hashed

------------------------------------------------------------
revno: 3205
revision-id:robertc at robertcollins.net-20080127181529-ryodcvcrajph2fn0
parent: robertc at robertcollins.net-20080127070656-gc93wlc9wtml829x
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.hashed
timestamp: Mon 2008-01-28 05:15:29 +1100
message:
  Get prefix scaling working.
modified:
  bzrlib/hashtable.py            hashtable.py-20080127054910-pf6gcotuda8xo42y-1
  bzrlib/tests/test_hashtable.py test_hashtable.py-20080127054910-pf6gcotuda8xo42y-2
=== modified file 'bzrlib/hashtable.py'
--- a/bzrlib/hashtable.py	2008-01-27 07:06:56 +0000
+++ b/bzrlib/hashtable.py	2008-01-27 18:15:29 +0000
@@ -23,10 +23,12 @@
     'CompactHashTable',
     ]
 
+import sha
 import struct
 
 from bzrlib.lazy_import import lazy_import
 lazy_import(globals(), """
+from bzrlib import debug
 from bzrlib import errors
 from bzrlib import trace
 from bzrlib.osutils import sha_string
@@ -73,7 +75,10 @@
         try:
             self._hash_values[self._key_hashes[key]] = value
         except KeyError:
-            key_hash = sha_string(key)
+            key_hash = sha.new(key).digest()
+            if key_hash in self._key_hashes:
+                raise errors.InternalBzrError(
+                    'Congrats, sha1 conflict on (%r->%r)' % (key, key_hash))
             self._key_hashes[key] = key_hash
             self._hash_values[key_hash] = value
 
@@ -87,11 +92,22 @@
         """
         prefix_bytes = 1
         # loop to determine how many bytes of hash
-        hashes = self._hash_values.keys()
-        end = len(hashes)
-        pos = 0
+        hashes = sorted(self._hash_values.keys())
         if not hashes:
             return 0, 0, ""
+        end = len(hashes)
+        pos = 1
+        current_hash = hashes[0]
+        while pos < end:
+            if hashes[pos][:prefix_bytes] == current_hash[:prefix_bytes]:
+                if prefix_bytes == 20:
+                    # infinte loop (and wtf ?!)
+                    raise errors.InternalBzrError('unreachable in theory')
+                # eat up more bytes
+                prefix_bytes += 1
+            else:
+                # unique enough
+                pos += 1
         # TODO: be more general. duh.
         hash_rule = ">" + "B" * prefix_bytes
         # now generate an array of the values.
@@ -104,14 +120,24 @@
                     "Not all values have the same length.")
         if not value_bytes:
             raise errors.InternalBzrError("zero-length values not supported.")
-        hash_array = ['\xff' * value_bytes] * (prefix_bytes << 8)
+        hash_array = ['\xff' * value_bytes] * 2 ** (prefix_bytes * 8)
         hash_powers = tuple(enumerate([2 ** (offset * 8) for offset in
             reversed(range(prefix_bytes))]))
+        if 'hashtable' in debug.debug_flags:
+            indices = {}
         for keyhash, value in self._hash_values.iteritems():
             hash_prefix = keyhash[:prefix_bytes]
             hash_index = 0
+            hash_bytes = struct.unpack(hash_rule, hash_prefix[:prefix_bytes])
             for offset, size in hash_powers:
-                hash_index += struct.unpack(hash_rule, hash_prefix[offset])[0] * size
+                hash_index += hash_bytes[offset] * size
             hash_array[hash_index] = value
+            if 'hashtable' in debug.debug_flags:
+                print value ,hash_index, repr(hash_prefix)
+                indices.setdefault(hash_index, []).append(value)
+        if 'hashtable' in debug.debug_flags:
+            for values in indices.values():
+                if len(values) > 1:
+                    print values
         hash_bytes = ''.join(hash_array)
         return prefix_bytes, value_bytes, hash_bytes

=== modified file 'bzrlib/tests/test_hashtable.py'
--- a/bzrlib/tests/test_hashtable.py	2008-01-27 07:06:56 +0000
+++ b/bzrlib/tests/test_hashtable.py	2008-01-27 18:15:29 +0000
@@ -57,6 +57,17 @@
         table = CompactHashTable()
         table['foo'] = 'b'
         hash_array = ['\xff'] * 256
-        hash_array[48] = 'b'
+        hash_array[11] = 'b'
         hash_bytes = ''.join(hash_array)
         self.assertEqual((1, 1, hash_bytes), table.serialize())
+
+    def test_serialize_16_bit_table(self):
+        table = CompactHashTable()
+        sample_data = [('p', 20843), ('u', 20966)]
+        hash_array = ['\xff'] * 65536
+        for key, offset in sample_data:
+            table[key] = key
+            hash_array[offset] = key
+        hash_bytes = ''.join(hash_array)
+        self.assertEqual((2, 1, hash_bytes), table.serialize())
+



More information about the bazaar-commits mailing list