Rev 5378: Populate the offsets array. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
John Arbash Meinel
john at arbash-meinel.com
Wed Aug 4 08:14:58 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
------------------------------------------------------------
revno: 5378
revision-id: john at arbash-meinel.com-20100804071454-bfhbwrqes7sabvay
parent: john at arbash-meinel.com-20100804043626-pwawyjkb6r6wzs9a
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Wed 2010-08-04 02:14:54 -0500
message:
Populate the offsets array.
This cuts down the number of bisections dramatically, basically by pre-caching
the first step. On real-world data it drops the total steps from 587 to 156,
or from 4.9/key to 1.3/key, and the lookup time from 23.7us to 20.3us.
Note that (k in dict) is 12.2us. I do wish we were just a bit closer to that.
However, with _LeafNode inheriting from dict I get 26us, so maybe there is
something in the interpreter that does a PyDict_CheckExact call, and there
isn't much we can do about it.
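In rough pure-Python terms, the lookup change amounts to the sketch below. It is
illustrative only: interesting_byte_of stands in for the bit-twiddling the Cython
code does on the first four bytes of the sha1, and none of these names exist in
the patch.

import bisect

def build_offsets(sha1_keys, interesting_byte_of):
    # offsets[b] answers "where would a key whose interesting byte is b be
    # inserted?", so offsets[b]:offsets[b+1] bounds the later bisection.
    offsets = [0] * 257
    pos = 0
    for i, sha1 in enumerate(sha1_keys):  # keys are already sorted
        b = interesting_byte_of(sha1)
        while pos <= b:
            offsets[pos] = i
            pos += 1
    while pos < 257:
        offsets[pos] = len(sha1_keys)
        pos += 1
    return offsets

def lookup(sha1, sha1_keys, offsets, interesting_byte_of):
    # The offsets table pre-caches the first bisection step.
    b = interesting_byte_of(sha1)
    lo, hi = offsets[b], offsets[b + 1]
    idx = bisect.bisect_left(sha1_keys, sha1, lo, hi)
    if idx < hi and sha1_keys[idx] == sha1:
        return idx
    return None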
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx 2010-08-04 04:33:24 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx 2010-08-04 07:14:54 +0000
@@ -483,17 +483,31 @@
return _sha1_to_key(PyString_AS_STRING(sha1_bin))
+cdef unsigned int _sha1_to_uint(char *sha1):
+ cdef unsigned int val
+ # Must be MSB-first (big-endian), because that is how the content is sorted
+ val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
+ | ((<unsigned int>(sha1[1]) & 0xff) << 16)
+ | ((<unsigned int>(sha1[2]) & 0xff) << 8)
+ | ((<unsigned int>(sha1[3]) & 0xff) << 0))
+ return val
+
+
cdef class GCCHKSHA1LeafNode:
"""Track all the entries for a given leaf node."""
cdef public int num_entries
cdef gc_chk_sha1_record *entries
- # This is for the mini-index. We look at all the keys and use whatever byte
- # is first unique across all stored keys (this is often the first byte)
- # we then store the entries offset for the first record that matches that
- # byte. This does assume that we'll never have more than 32k entries, but
- # that doesn't seem to be a terrible assumption (we should have ~100)
- cdef public short interesting_byte
+ # This is for the mini-index.
+ # We look at all the keys and determine how many leading bits are identical.
+ # We then strip those bits and use the next 8 bits as the 'interesting bits'.
+ # offsets then holds the bisect() information for a key with those bits (i.e.
+ # where you would insert a key whose interesting bits have that value).
+ cdef public long bisect_steps
+ cdef public unsigned int common_mask
+ cdef public unsigned int common_bits
+ # The first N bits of all entries in this node have the same content
+ cdef public short common_shift
cdef short offsets[257]
def __sizeof__(self):
@@ -568,6 +582,7 @@
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1):
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
cdef int lo, hi, mid, the_cmp
+ cdef int offset
lo = 0
hi = self.num_entries
@@ -585,15 +600,22 @@
# TODO: Consider improving this, but for now it seems good enough. (We
# could create a small index so we would have less to bisect,
# etc.)
+ # Bisecting reduces the total number of memcmp calls from 7260 to 720
+ # for 120 keys.
+ offset = self._offset_for_sha1(sha1)
+ lo = self.offsets[offset]
+ hi = self.offsets[offset+1]
while lo < hi:
mid = (lo + hi) / 2
- the_cmp = memcmp(sha1, self.entries[mid].sha1, 20)
+ the_cmp = memcmp(self.entries[mid].sha1, sha1, 20)
+ self.bisect_steps += 1
if the_cmp == 0:
+ # print mid, offset, self.offsets[offset], self.offsets[offset+1]
return &self.entries[mid]
elif the_cmp < 0:
+ lo = mid + 1
+ else:
hi = mid
- else:
- lo = mid + 1
return NULL
def __contains__(self, key):
@@ -707,10 +729,63 @@
or cur_record != self.entries + self.num_entries):
raise ValueError('Something went wrong while parsing.')
# Pass 3: build the offset map
+ self._compute_common()
+
+ cdef int _offset_for_sha1(self, char *sha1) except -1:
+ """Find the first interesting 8-bits of this sha1."""
+ cdef int this_offset
+ this_offset = ((_sha1_to_uint(sha1) & ~self.common_mask)
+ >> (24 - self.common_shift))
+ # assert 0 <= this_offset < 256
+ return this_offset
+
+ cdef _compute_common(self):
+ cdef unsigned int first
+ cdef unsigned int this
+ cdef unsigned int common_mask
+ cdef short common_shift
+ cdef int i
+ cdef int offset, this_offset
# The idea with the offset map is that we should be able to quickly
# jump to the key that matches a given sha1. We know that the keys are
# in sorted order, and we know that a lot of the prefix is going to be
# the same across them.
+ # By XORing each entry against the first, we can determine which bits have
+ # the same value in all of them
+ if self.num_entries < 2:
+ self.common_mask = 0x0
+ self.common_shift = 0
+ else:
+ common_mask = 0xFFFFFFFF
+ first = _sha1_to_uint(self.entries[0].sha1)
+ for i from 0 < i < self.num_entries:
+ this = _sha1_to_uint(self.entries[i].sha1)
+ common_mask = (~(first ^ this)) & common_mask
+ common_shift = 0
+ self.common_mask = common_mask
+ while common_mask & 0x80000000:
+ common_mask = common_mask << 1
+ common_shift += 1
+ self.common_shift = common_shift
+ self.common_bits = first & self.common_mask
+ offset = 0
+ for i from 0 <= i < self.num_entries:
+ this_offset = self._offset_for_sha1(self.entries[i].sha1)
+ while offset <= this_offset:
+ self.offsets[offset] = i
+ offset += 1
+ while offset < 257:
+ self.offsets[offset] = self.num_entries
+ offset += 1
+
+ property _test_offsets:
+ def __get__(self):
+ cdef list result
+ cdef int i
+ result = []
+ for i from 0 <= i < 257:
+ result.append(self.offsets[i])
+ return result
def _parse_into_chk(bytes, key_length, ref_list_length):
=== modified file 'bzrlib/tests/test__btree_serializer.py'
--- a/bzrlib/tests/test__btree_serializer.py 2010-08-04 01:52:14 +0000
+++ b/bzrlib/tests/test__btree_serializer.py 2010-08-04 07:14:54 +0000
@@ -18,6 +18,7 @@
"""Direct tests of the btree serializer extension"""
import binascii
+import bisect
from bzrlib import tests
@@ -138,14 +139,14 @@
"""
_multi_key_content = """type=leaf
-sha1:70c881d4a26984ddce795f6f71817c9cf4480e79\x00\x000 0 0 0
-sha1:7e240de74fb1ed08fa08d38063f6a6a91462a815\x00\x001 1 1 1
-sha1:86f7e437faa5a7fce15d1ddcb9eaeaea377667b8\x00\x002 2 2 2
-sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709\x00\x003 3 3 3
-sha1:df51e37c269aa94d38f93e537bf6e2020b21406c\x00\x004 4 4 4
-sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37\x00\x005 5 5 5
-sha1:e93b4e3c464ffd51732fbd6ded717e9efda28aad\x00\x006 6 6 6
-sha1:f7a9e24777ec23212c54d7a350bc5bea5477fdbb\x00\x007 7 7 7
+sha1:c80c881d4a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
+sha1:c86f7e437faa5a7fce15d1ddcb9eaeaea377667b\x00\x001 1 1 1
+sha1:c8e240de74fb1ed08fa08d38063f6a6a91462a81\x00\x002 2 2 2
+sha1:cda39a3ee5e6b4b0d3255bfef95601890afd8070\x00\x003 3 3 3
+sha1:cdf51e37c269aa94d38f93e537bf6e2020b21406\x00\x004 4 4 4
+sha1:ce0c9035898dd52fc65c41454cec9c4d2611bfb3\x00\x005 5 5 5
+sha1:ce93b4e3c464ffd51732fbd6ded717e9efda28aa\x00\x006 6 6 6
+sha1:cf7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x007 7 7 7
"""
class TestGCCKHSHA1LeafNode(TestBtreeSerializer):
@@ -194,3 +195,22 @@
self.assertEqual(8, len(leaf.all_keys()))
for idx, key in enumerate(all_keys):
self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+ def test_common_mask(self):
+ # The keys were deliberately chosen so that the first 5 bits all
+ # overlap; it also happens that a later bit overlaps.
+ # Note that by 'overlap' we mean that a given bit is either on in all
+ # keys, or off in all keys.
+ leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
+ self.assertEqual(hex(0xF8000100), hex(leaf.common_mask))
+ self.assertEqual(5, leaf.common_shift)
+ self.assertEqual(0xc8000000, leaf.common_bits)
+ # The interesting byte for each key (defined as the 8 bits that come
+ # after the common prefix) is:
+ # [1, 13, 28, 180, 190, 193, 210, 239]
+ lst = [1, 13, 28, 180, 190, 193, 210, 239]
+ offsets = leaf._test_offsets
+ self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+ offsets)
+ for idx, val in enumerate(lst):
+ self.assertEqual(idx, offsets[val])
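For reference, the common-prefix computation in _compute_common and
_offset_for_sha1 boils down to something like this pure-Python sketch (again
illustrative, not the shipped Cython; the helper names here are made up):

import struct

def sha1_prefix_to_uint(sha1_bin):
    # First 4 bytes, big-endian, to match the sort order of the keys.
    return struct.unpack('>I', sha1_bin[:4])[0]

def compute_common(sha1_bins):
    # common_mask has a 1 wherever every key carries the same bit value;
    # common_shift counts how many leading bits are common to all keys.
    if len(sha1_bins) < 2:
        return 0, 0
    first = sha1_prefix_to_uint(sha1_bins[0])
    common_mask = 0xFFFFFFFF
    for sha1_bin in sha1_bins[1:]:
        common_mask &= ~(first ^ sha1_prefix_to_uint(sha1_bin)) & 0xFFFFFFFF
    common_shift = 0
    mask = common_mask
    while mask & 0x80000000:
        mask = (mask << 1) & 0xFFFFFFFF
        common_shift += 1
    return common_mask, common_shift

def offset_for_sha1(sha1_bin, common_mask, common_shift):
    # The 8 bits right after the common prefix select the slot in the
    # 257-entry offsets array.
    masked = sha1_prefix_to_uint(sha1_bin) & (~common_mask & 0xFFFFFFFF)
    return masked >> (24 - common_shift)

With the eight keys in _multi_key_content this gives common_mask 0xF8000100 and
common_shift 5, which is what test_common_mask asserts.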