Rev 5380: Handle the case where there are more than 255 entries. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
John Arbash Meinel
john at arbash-meinel.com
Wed Aug 4 16:48:43 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
------------------------------------------------------------
revno: 5380
revision-id: john at arbash-meinel.com-20100804154836-79sceckc4exj134y
parent: john at arbash-meinel.com-20100804142810-9ef2ytu2dh0pwk34
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Wed 2010-08-04 10:48:36 -0500
message:
Handle the case where there are more than 255 entries.
This is very unlikely in the real world, but we want to preserve correctness
rather than failing.
There doesn't seem to be a performance impact (still 20us), which is
fairly expected. It is a single unlikely if check in the lookup logic.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx 2010-08-04 07:14:54 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx 2010-08-04 15:48:36 +0000
@@ -507,8 +507,8 @@
cdef public unsigned int common_mask
cdef public unsigned int common_bits
# The first N bits of all entries in this node have the same content
- cdef public short common_shift
- cdef short offsets[257]
+ cdef public unsigned char common_shift
+ cdef unsigned char offsets[257]
def __sizeof__(self):
return (sizeof(GCCHKSHA1LeafNode)
@@ -584,27 +584,24 @@
cdef int lo, hi, mid, the_cmp
cdef int offset
- lo = 0
- hi = self.num_entries
-
- # Bisect until we find the right spot
- # Note the time to create Python objects dominates over comparison
- # times. With a simple iterate and compare (no bisect) we spend 45us
- # for 120 __contains__, but 201us for 120 __getitem__.
- # The _LeafNode.__getitem__ takes 54.6us, but (k in o._keys) is 20.7us
- # removing the getattr() w/ (k in _keys) 13.7ms. So we do still have
- # some room for improvement
- # _key_to_sha1 takes 24us, which puts a lower bound on _lookup_record
- # unless we skip doing the whole record (iterating is 8us,
- # iterating+no-op C function call is 12.5us)
- # TODO: Consider improving this, but for now it seems good enough. (We
- # could create a small index so we would have less to bisect,
- # etc.)
- # bisecting does reduce the total number of memcmp calls from 7260 for
- # 120 keys to 720.
+ # TODO: We can speed up misses by comparing this sha1 to the common
+ # bits, and seeing if the common prefix matches, if not, we don't
+ # need to search for anything because it cannot match
+ # Use the offset array to find the closest fit for this entry
+ # follow that up with bisecting, since multiple keys can be in one
+ # spot
+ # Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
+ # the offset array dropped us from 23us to 20us and 156 comparisons
+ # (1.3/key)
offset = self._offset_for_sha1(sha1)
lo = self.offsets[offset]
hi = self.offsets[offset+1]
+ if hi == 255:
+ # if hi == 255 that means we potentially ran off the end of the
+ # list, so push it up to num_entries
+ # note that if 'lo' == 255, that is ok, because we can start
+ # searching from that part of the list.
+ hi = self.num_entries
while lo < hi:
mid = (lo + hi) / 2
the_cmp = memcmp(self.entries[mid].sha1, sha1, 20)
@@ -734,18 +731,23 @@
cdef int _offset_for_sha1(self, char *sha1) except -1:
"""Find the first interesting 8-bits of this sha1."""
cdef int this_offset
- this_offset = ((_sha1_to_uint(sha1) & ~self.common_mask)
- >> (24 - self.common_shift))
+ cdef unsigned int as_uint
+ as_uint = _sha1_to_uint(sha1)
+ this_offset = (as_uint >> self.common_shift) & 0xFF
# assert 0 < this_offset < 256
return this_offset
+ def test_offset_for_sha1(self, sha1):
+ return self._offset_for_sha1(PyString_AS_STRING(sha1))
+
cdef _compute_common(self):
cdef unsigned int first
cdef unsigned int this
cdef unsigned int common_mask
- cdef short common_shift
+ cdef unsigned char common_shift
cdef int i
cdef int offset, this_offset
+ cdef int max_offset
# The idea with the offset map is that we should be able to quickly
# jump to the key that matches a given sha1. We know that the keys are
# in sorted order, and we know that a lot of the prefix is going to be
@@ -753,29 +755,39 @@
# By XORing the entries together, we can determine what bits are set in
# all of them
if self.num_entries < 2:
+ # Everything is in common if you have 0 or 1 leaves
+ # So we'll always just shift to the first byte
self.common_mask = 0x0
- self.common_shift = 0
+ self.common_shift = 24
else:
common_mask = 0xFFFFFFFF
first = _sha1_to_uint(self.entries[0].sha1)
for i from 0 < i < self.num_entries:
this = _sha1_to_uint(self.entries[i].sha1)
common_mask = (~(first ^ this)) & common_mask
- common_shift = 0
- self.common_mask = common_mask
- while common_mask & 0x80000000:
+ common_shift = 24
+ self.common_mask = 0 # common_mask
+ while common_mask & 0x80000000 and common_shift > 0:
common_mask = common_mask << 1
- common_shift += 1
+ common_shift -= 1
+ self.common_mask = (self.common_mask >> 1) | 0x80000000
self.common_shift = common_shift
self.common_bits = first & self.common_mask
offset = 0
- for i from 0 <= i < self.num_entries:
+ max_offset = self.num_entries
+ # We cap this loop at 254 entries. All the other offsets just get
+ # filled with 0xff as the singleton saying 'too many'.
+ # It means that if we have >255 entries we have to bisect the second
+ # half of the list, but this is going to be very rare in practice.
+ if max_offset > 255:
+ max_offset = 255
+ for i from 0 <= i < max_offset:
this_offset = self._offset_for_sha1(self.entries[i].sha1)
while offset <= this_offset:
self.offsets[offset] = i
offset += 1
while offset < 257:
- self.offsets[offset] = self.num_entries
+ self.offsets[offset] = max_offset
offset += 1
property _test_offsets:
=== modified file 'bzrlib/tests/test__btree_serializer.py'
--- a/bzrlib/tests/test__btree_serializer.py 2010-08-04 07:14:54 +0000
+++ b/bzrlib/tests/test__btree_serializer.py 2010-08-04 15:48:36 +0000
@@ -149,6 +149,29 @@
sha1:cf7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x007 7 7 7
"""
+_multi_key_same_offset = """type=leaf
+sha1:080c881d4a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
+sha1:c86f7e437faa5a7fce15d1ddcb9eaeaea377667b\x00\x001 1 1 1
+sha1:cd0c9035898dd52fc65c41454cec9c4d2611bfb3\x00\x002 2 2 2
+sha1:cda39a3ee5e6b4b0d3255bfef95601890afd8070\x00\x003 3 3 3
+sha1:cde240de74fb1ed08fa08d38063f6a6a91462a81\x00\x004 4 4 4
+sha1:cdf51e37c269aa94d38f93e537bf6e2020b21406\x00\x005 5 5 5
+sha1:ce7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x006 6 6 6
+sha1:ce93b4e3c464ffd51732fbd6ded717e9efda28aa\x00\x007 7 7 7
+"""
+
+_common_32_bits = """type=leaf
+sha1:123456784a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
+sha1:1234567874fb1ed08fa08d38063f6a6a91462a81\x00\x001 1 1 1
+sha1:12345678777ec23212c54d7a350bc5bea5477fdb\x00\x002 2 2 2
+sha1:123456787faa5a7fce15d1ddcb9eaeaea377667b\x00\x003 3 3 3
+sha1:12345678898dd52fc65c41454cec9c4d2611bfb3\x00\x004 4 4 4
+sha1:12345678c269aa94d38f93e537bf6e2020b21406\x00\x005 5 5 5
+sha1:12345678c464ffd51732fbd6ded717e9efda28aa\x00\x006 6 6 6
+sha1:12345678e5e6b4b0d3255bfef95601890afd8070\x00\x007 7 7 7
+"""
+
+
class TestGCCKHSHA1LeafNode(TestBtreeSerializer):
def assertInvalid(self, bytes):
@@ -202,15 +225,77 @@
# Note that by 'overlap' we mean that given bit is either on in all
# keys, or off in all keys
leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
- self.assertEqual(hex(0xF8000100), hex(leaf.common_mask))
- self.assertEqual(5, leaf.common_shift)
+ self.assertEqual(hex(0xF8000000), hex(leaf.common_mask))
+ self.assertEqual(19, leaf.common_shift)
self.assertEqual(0xc8000000, leaf.common_bits)
# The interesting byte for each key is
# (defined as the 8-bits that come after the common prefix)
- # [1, 13, 28, 180, 190, 193, 210, 239]
lst = [1, 13, 28, 180, 190, 193, 210, 239]
offsets = leaf._test_offsets
self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
offsets)
for idx, val in enumerate(lst):
self.assertEqual(idx, offsets[val])
+ for idx, key in enumerate(leaf.all_keys()):
+ self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+ def test_multi_key_same_offset(self):
+ # there is no common prefix, though there are some common bits
+ leaf = self.module._parse_into_chk(_multi_key_same_offset, 1, 0)
+ self.assertEqual(0x00000000, leaf.common_mask)
+ self.assertEqual(24, leaf.common_shift)
+ self.assertEqual(0x00000000, leaf.common_bits)
+ offsets = leaf._test_offsets
+ # The interesting byte is just the first 8-bits of the key
+ lst = [8, 200, 205, 205, 205, 205, 206, 206]
+ self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+ offsets)
+ for val in lst:
+ self.assertEqual(lst.index(val), offsets[val])
+ for idx, key in enumerate(leaf.all_keys()):
+ self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+ def test_all_common_prefix(self):
+ # The first 32 bits of all hashes are the same. This is going to be
+ # pretty much impossible, but I don't want to fail because of this
+ leaf = self.module._parse_into_chk(_common_32_bits, 1, 0)
+ self.assertEqual(0xFFFFFF00, leaf.common_mask)
+ self.assertEqual(0, leaf.common_shift)
+ self.assertEqual(0x12345600, leaf.common_bits)
+ lst = [0x78] * 8
+ offsets = leaf._test_offsets
+ self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+ offsets)
+ for val in lst:
+ self.assertEqual(lst.index(val), offsets[val])
+ for idx, key in enumerate(leaf.all_keys()):
+ self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+ def test_many_entries(self):
+ # Again, this is almost impossible, but we should still work
+ # It would be hard to fit more than 120 entries in a 4k page, much less
+ # more than 256 of them. but hey, weird stuff happens sometimes
+ lines = ['type=leaf\n']
+ for i in range(500):
+ key_str = 'sha1:%04x%s' % (i, _hex_form[:36])
+ key = (key_str,)
+ lines.append('%s\0\0%d %d %d %d\n' % (key_str, i, i, i, i))
+ bytes = ''.join(lines)
+ leaf = self.module._parse_into_chk(bytes, 1, 0)
+ self.assertEqual(0xFE000000, leaf.common_mask)
+ self.assertEqual(24-7, leaf.common_shift)
+ self.assertEqual(0x00000000, leaf.common_bits)
+ offsets = leaf._test_offsets
+ # This is the interesting bits for each entry
+ lst = [x // 2 for x in range(500)]
+ expected_offsets = [x * 2 for x in range(128)] + [255]*129
+ self.assertEqual(expected_offsets, offsets)
+ # We truncate because offsets is an unsigned char. So the bisection
+ # will just say 'greater than the last one' for all the rest
+ lst = lst[:255]
+ self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+ offsets)
+ for val in lst:
+ self.assertEqual(lst.index(val), offsets[val])
+ for idx, key in enumerate(leaf.all_keys()):
+ self.assertEqual(str(idx), leaf[key][0].split()[0])
More information about the bazaar-commits
mailing list