Rev 3204: Beginnings of a serialisable hash table. in http://people.ubuntu.com/~robertc/baz2.0/index.hashed
Robert Collins
robertc at robertcollins.net
Sun Jan 27 07:07:01 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/index.hashed
------------------------------------------------------------
revno: 3204
revision-id:robertc at robertcollins.net-20080127070656-gc93wlc9wtml829x
parent: robertc at robertcollins.net-20080127054739-2ryl6fjj1fuyfnjo
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.hashed
timestamp: Sun 2008-01-27 18:06:56 +1100
message:
Beginnings of a serialisable hash table.
added:
bzrlib/hashtable.py hashtable.py-20080127054910-pf6gcotuda8xo42y-1
bzrlib/tests/test_hashtable.py test_hashtable.py-20080127054910-pf6gcotuda8xo42y-2
modified:
bzrlib/tests/__init__.py selftest.py-20050531073622-8d0e3c8845c97a64
=== added file 'bzrlib/hashtable.py'
--- a/bzrlib/hashtable.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/hashtable.py 2008-01-27 07:06:56 +0000
@@ -0,0 +1,117 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Hashtables for bzrlib.
+
+CompactHashTable provides a hash table that is extremely compact on disk.
+"""
+
+__all__ = [
+ 'CompactHashTable',
+ ]
+
+import struct
+
+from bzrlib.lazy_import import lazy_import
+lazy_import(globals(), """
+from bzrlib import errors
+from bzrlib import trace
+from bzrlib.osutils import sha_string
+from bzrlib.trace import mutter
+""")
+
+
+class CompactHashTable(object):
+ """A hash table that is compact when written to disk.
+
+ CompactHashTable makes the assumption that it's values are self-verifiable
+ and of fixed length. Accordingly no collision-detection is provided by the
+ hash table. In the event of a collision a client of the hash table must be
+ able to tell that the wrong value was obtained. Furthermore, the hash table
+ makes no effort to store colliding values (though the serialized form does
+ permit this, no call for it has been felt to date.
+
+ These assumptions permit the serialised hash table to be generated by
+ truncating all the seen hashes such that unique prefixes are obtained, and
+ then generating a fixed size array with 2^prefix_bits entries. Looking up
+ entries in this table is then simply an index into the array:
+ hash(key)[:prefix_bytes]. Unused slots in the array are set to
+ \xff*value_bytes. Note that at large prefix lengths the array becomes very
+ sparse. A planned refinement to this hash table is to permit chaining so
+ that the density of the storage array can be kept close to 100%. Exact
+ plans for that are not yet finalised.
+
+ This design is primarily geared to avoid latency scaling effects.
+ """
+
+ def __init__(self):
+ """Create a CompactHashTable."""
+ # key:hash
+ self._key_hashes = {}
+ # hash:value
+ self._hash_values = {}
+
+ def __len__(self):
+ """How many keys are in the hash table."""
+ return len(self._key_hashes)
+
+ def __setitem__(self, key, value):
+ """Set key to value."""
+ try:
+ self._hash_values[self._key_hashes[key]] = value
+ except KeyError:
+ key_hash = sha_string(key)
+ self._key_hashes[key] = key_hash
+ self._hash_values[key_hash] = value
+
+ def serialize(self):
+ """Return a serialized CompactHashTable.
+
+ :return: A tuple (prefix_bytes, value_bytes, table_bytes). prefix_bytes
+ is the number of bytes chosen for use in the hash. value_bytes is
+ the number of bytes in each value. table_bytes is the serialized
+ hash table.
+ """
+ prefix_bytes = 1
+ # loop to determine how many bytes of hash
+ hashes = self._hash_values.keys()
+ end = len(hashes)
+ pos = 0
+ if not hashes:
+ return 0, 0, ""
+ # TODO: be more general. duh.
+ hash_rule = ">" + "B" * prefix_bytes
+ # now generate an array of the values.
+ value_bytes = None
+ for value in self._hash_values.itervalues():
+ if value_bytes is None:
+ value_bytes = len(value)
+ elif len(value) != value_bytes:
+ raise errors.InternalBzrError(
+ "Not all values have the same length.")
+ if not value_bytes:
+ raise errors.InternalBzrError("zero-length values not supported.")
+ hash_array = ['\xff' * value_bytes] * (prefix_bytes << 8)
+ hash_powers = tuple(enumerate([2 ** (offset * 8) for offset in
+ reversed(range(prefix_bytes))]))
+ for keyhash, value in self._hash_values.iteritems():
+ hash_prefix = keyhash[:prefix_bytes]
+ hash_index = 0
+ for offset, size in hash_powers:
+ hash_index += struct.unpack(hash_rule, hash_prefix[offset])[0] * size
+ hash_array[hash_index] = value
+ hash_bytes = ''.join(hash_array)
+ return prefix_bytes, value_bytes, hash_bytes
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2008-01-23 16:25:18 +0000
+++ b/bzrlib/tests/__init__.py 2008-01-27 07:06:56 +0000
@@ -2658,6 +2658,7 @@
'bzrlib.tests.test_gpg',
'bzrlib.tests.test_graph',
'bzrlib.tests.test_hashcache',
+ 'bzrlib.tests.test_hashtable',
'bzrlib.tests.test_help',
'bzrlib.tests.test_hooks',
'bzrlib.tests.test_http',
=== added file 'bzrlib/tests/test_hashtable.py'
--- a/bzrlib/tests/test_hashtable.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_hashtable.py 2008-01-27 07:06:56 +0000
@@ -0,0 +1,62 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Tests for hashtables."""
+
+from bzrlib import errors
+from bzrlib.hashtable import *
+from bzrlib.tests import TestCaseWithMemoryTransport
+
+
+class TestCompactHashTable(TestCaseWithMemoryTransport):
+
+ def test_create(self):
+ table = CompactHashTable()
+ self.assertEqual(0, len(table))
+
+ def test___setitem__(self):
+ table = CompactHashTable()
+ table['foo'] = 'bar'
+ self.assertEqual(1, len(table))
+ table['foo'] = 'bar'
+ self.assertEqual(1, len(table))
+ table['foo'] = 'quux'
+ self.assertEqual(1, len(table))
+ table['bar'] = 'bar'
+ self.assertEqual(2, len(table))
+
+ def test_serialize_mismatched_value_lengths(self):
+ table = CompactHashTable()
+ table['foo'] = 'bar'
+ table['bar'] = 'quux'
+ self.assertRaises(errors.InternalBzrError, table.serialize)
+
+ def test_serialize_0_value_lengths(self):
+ table = CompactHashTable()
+ table['foo'] = ''
+ self.assertRaises(errors.InternalBzrError, table.serialize)
+
+ def test_serialize_empty_table(self):
+ table = CompactHashTable()
+ self.assertEqual((0, 0, ""), table.serialize())
+
+ def test_serialize_simple_table(self):
+ table = CompactHashTable()
+ table['foo'] = 'b'
+ hash_array = ['\xff'] * 256
+ hash_array[48] = 'b'
+ hash_bytes = ''.join(hash_array)
+ self.assertEqual((1, 1, hash_bytes), table.serialize())
More information about the bazaar-commits
mailing list