Rev 3204: Beginnings of a serialisable hash table. in http://people.ubuntu.com/~robertc/baz2.0/index.hashed

Robert Collins robertc at robertcollins.net
Sun Jan 27 07:07:01 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/index.hashed

------------------------------------------------------------
revno: 3204
revision-id:robertc at robertcollins.net-20080127070656-gc93wlc9wtml829x
parent: robertc at robertcollins.net-20080127054739-2ryl6fjj1fuyfnjo
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.hashed
timestamp: Sun 2008-01-27 18:06:56 +1100
message:
  Beginnings of a serialisable hash table.
added:
  bzrlib/hashtable.py            hashtable.py-20080127054910-pf6gcotuda8xo42y-1
  bzrlib/tests/test_hashtable.py test_hashtable.py-20080127054910-pf6gcotuda8xo42y-2
modified:
  bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
=== added file 'bzrlib/hashtable.py'
--- a/bzrlib/hashtable.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/hashtable.py	2008-01-27 07:06:56 +0000
@@ -0,0 +1,117 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""Hashtables for bzrlib.
+
+CompactHashTable provides a hash table that is extremely compact on disk.
+"""
+
+__all__ = [
+    'CompactHashTable',
+    ]
+
+import struct
+
+from bzrlib.lazy_import import lazy_import
+lazy_import(globals(), """
+from bzrlib import errors
+from bzrlib import trace
+from bzrlib.osutils import sha_string
+from bzrlib.trace import mutter
+""")
+
+
+class CompactHashTable(object):
+    """A hash table that is compact when written to disk.
+
+    CompactHashTable makes the assumption that it's values are self-verifiable
+    and of fixed length. Accordingly no collision-detection is provided by the
+    hash table. In the event of a collision a client of the hash table must be
+    able to tell that the wrong value was obtained. Furthermore, the hash table
+    makes no effort to store colliding values (though the serialized form does
+    permit this, no call for it has been felt to date.
+    
+    These assumptions permit the serialised hash table to be generated by
+    truncating all the seen hashes such that unique prefixes are obtained, and
+    then generating a fixed size array with 2^prefix_bits entries. Looking up
+    entries in this table is then simply an index into the array:
+    hash(key)[:prefix_bytes]. Unused slots in the array are set to
+    \xff*value_bytes. Note that at large prefix lengths the array becomes very
+    sparse. A planned refinement to this hash table is to permit chaining so
+    that the density of the storage array can be kept close to 100%. Exact
+    plans for that are not yet finalised.
+
+    This design is primarily geared to avoid latency scaling effects.
+    """
+
+    def __init__(self):
+        """Create a CompactHashTable."""
+        # key:hash
+        self._key_hashes = {}
+        # hash:value
+        self._hash_values = {}
+
+    def __len__(self):
+        """How many keys are in the hash table."""
+        return len(self._key_hashes)
+
+    def __setitem__(self, key, value):
+        """Set key to value."""
+        try:
+            self._hash_values[self._key_hashes[key]] = value
+        except KeyError:
+            key_hash = sha_string(key)
+            self._key_hashes[key] = key_hash
+            self._hash_values[key_hash] = value
+
+    def serialize(self):
+        """Return a serialized CompactHashTable.
+
+        :return: A tuple (prefix_bytes, value_bytes, table_bytes). prefix_bytes
+            is the number of bytes chosen for use in the hash. value_bytes is
+            the number of bytes in each value. table_bytes is the serialized
+            hash table.
+        """
+        prefix_bytes = 1
+        # loop to determine how many bytes of hash
+        hashes = self._hash_values.keys()
+        end = len(hashes)
+        pos = 0
+        if not hashes:
+            return 0, 0, ""
+        # TODO: be more general. duh.
+        hash_rule = ">" + "B" * prefix_bytes
+        # now generate an array of the values.
+        value_bytes = None
+        for value in self._hash_values.itervalues():
+            if value_bytes is None:
+                value_bytes = len(value)
+            elif len(value) != value_bytes:
+                raise errors.InternalBzrError(
+                    "Not all values have the same length.")
+        if not value_bytes:
+            raise errors.InternalBzrError("zero-length values not supported.")
+        hash_array = ['\xff' * value_bytes] * (prefix_bytes << 8)
+        hash_powers = tuple(enumerate([2 ** (offset * 8) for offset in
+            reversed(range(prefix_bytes))]))
+        for keyhash, value in self._hash_values.iteritems():
+            hash_prefix = keyhash[:prefix_bytes]
+            hash_index = 0
+            for offset, size in hash_powers:
+                hash_index += struct.unpack(hash_rule, hash_prefix[offset])[0] * size
+            hash_array[hash_index] = value
+        hash_bytes = ''.join(hash_array)
+        return prefix_bytes, value_bytes, hash_bytes

=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py	2008-01-23 16:25:18 +0000
+++ b/bzrlib/tests/__init__.py	2008-01-27 07:06:56 +0000
@@ -2658,6 +2658,7 @@
                    'bzrlib.tests.test_gpg',
                    'bzrlib.tests.test_graph',
                    'bzrlib.tests.test_hashcache',
+                   'bzrlib.tests.test_hashtable',
                    'bzrlib.tests.test_help',
                    'bzrlib.tests.test_hooks',
                    'bzrlib.tests.test_http',

=== added file 'bzrlib/tests/test_hashtable.py'
--- a/bzrlib/tests/test_hashtable.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_hashtable.py	2008-01-27 07:06:56 +0000
@@ -0,0 +1,62 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""Tests for hashtables."""
+
+from bzrlib import errors
+from bzrlib.hashtable import *
+from bzrlib.tests import TestCaseWithMemoryTransport
+
+
+class TestCompactHashTable(TestCaseWithMemoryTransport):
+
+    def test_create(self):
+        table = CompactHashTable()
+        self.assertEqual(0, len(table))
+
+    def test___setitem__(self):
+        table = CompactHashTable()
+        table['foo'] = 'bar'
+        self.assertEqual(1, len(table))
+        table['foo'] = 'bar'
+        self.assertEqual(1, len(table))
+        table['foo'] = 'quux'
+        self.assertEqual(1, len(table))
+        table['bar'] = 'bar'
+        self.assertEqual(2, len(table))
+
+    def test_serialize_mismatched_value_lengths(self):
+        table = CompactHashTable()
+        table['foo'] = 'bar'
+        table['bar'] = 'quux'
+        self.assertRaises(errors.InternalBzrError, table.serialize)
+
+    def test_serialize_0_value_lengths(self):
+        table = CompactHashTable()
+        table['foo'] = ''
+        self.assertRaises(errors.InternalBzrError, table.serialize)
+
+    def test_serialize_empty_table(self):
+        table = CompactHashTable()
+        self.assertEqual((0, 0, ""), table.serialize())
+
+    def test_serialize_simple_table(self):
+        table = CompactHashTable()
+        table['foo'] = 'b'
+        hash_array = ['\xff'] * 256
+        hash_array[48] = 'b'
+        hash_bytes = ''.join(hash_array)
+        self.assertEqual((1, 1, hash_bytes), table.serialize())



More information about the bazaar-commits mailing list