Rev 5380: Handle the case where there are more than 255 entries. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf

John Arbash Meinel john at arbash-meinel.com
Wed Aug 4 16:48:43 BST 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf

------------------------------------------------------------
revno: 5380
revision-id: john at arbash-meinel.com-20100804154836-79sceckc4exj134y
parent: john at arbash-meinel.com-20100804142810-9ef2ytu2dh0pwk34
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Wed 2010-08-04 10:48:36 -0500
message:
  Handle the case where there are more than 255 entries.
  
  This is very unlikely in the real world, but we want to preserve correctness
  rather than failing.
  There doesn't seem to be a performance impact (still 20us), which is
  fairly expected. It is a single unlikely if check in the lookup logic.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx	2010-08-04 07:14:54 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx	2010-08-04 15:48:36 +0000
@@ -507,8 +507,8 @@
     cdef public unsigned int common_mask
     cdef public unsigned int common_bits
     # The first N bits of all entries in this node have the same content
-    cdef public short common_shift
-    cdef short offsets[257]
+    cdef public unsigned char common_shift
+    cdef unsigned char offsets[257]
 
     def __sizeof__(self):
         return (sizeof(GCCHKSHA1LeafNode)
@@ -584,27 +584,24 @@
         cdef int lo, hi, mid, the_cmp
         cdef int offset
 
-        lo = 0
-        hi = self.num_entries
-
-        # Bisect until we find the right spot
-        # Note the time to create Python objects dominates over comparison
-        # times. With a simple iterate and compare (no bisect) we spend 45us
-        # for 120 __contains__, but 201us for 120 __getitem__.
-        # The _LeafNode.__getitem__ takes 54.6us, but (k in o._keys) is 20.7us
-        # removing the getattr() w/ (k in _keys) 13.7ms. So we do still have
-        # some room for improvement
-        # _key_to_sha1 takes 24us, which puts a lower bound on _lookup_record
-        # unless we skip doing the whole record (iterating is 8us,
-        # iterating+no-op C function call is 12.5us)
-        # TODO: Consider improving this, but for now it seems good enough. (We
-        #       could create a small index so we would have less to bisect,
-        #       etc.)
-        # bisecting does reduce the total number of memcmp calls from 7260 for
-        # 120 keys to 720.
+        # TODO: We can speed up misses by comparing this sha1 to the common
+        #       bits, and seeing if the common prefix matches, if not, we don't
+        #       need to search for anything because it cannot match
+        # Use the offset array to find the closest fit for this entry
+        # follow that up with bisecting, since multiple keys can be in one
+        # spot
+        # Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
+        #       the offset array dropped us from 23us to 20us and 156 comparisons
+        # (1.3/key)
         offset = self._offset_for_sha1(sha1)
         lo = self.offsets[offset]
         hi = self.offsets[offset+1]
+        if hi == 255:
+            # if hi == 255 that means we potentially ran off the end of the
+            # list, so push it up to num_entries
+            # note that if 'lo' == 255, that is ok, because we can start
+            # searching from that part of the list.
+            hi = self.num_entries
         while lo < hi:
             mid = (lo + hi) / 2
             the_cmp = memcmp(self.entries[mid].sha1, sha1, 20)
@@ -734,18 +731,23 @@
     cdef int _offset_for_sha1(self, char *sha1) except -1:
         """Find the first interesting 8-bits of this sha1."""
         cdef int this_offset
-        this_offset = ((_sha1_to_uint(sha1) & ~self.common_mask)
-                       >> (24 - self.common_shift))
+        cdef unsigned int as_uint
+        as_uint = _sha1_to_uint(sha1)
+        this_offset = (as_uint >> self.common_shift) & 0xFF
         # assert 0 < this_offset < 256
         return this_offset
 
+    def test_offset_for_sha1(self, sha1):
+        return self._offset_for_sha1(PyString_AS_STRING(sha1))
+
     cdef _compute_common(self):
         cdef unsigned int first
         cdef unsigned int this
         cdef unsigned int common_mask
-        cdef short common_shift
+        cdef unsigned char common_shift
         cdef int i
         cdef int offset, this_offset
+        cdef int max_offset
         # The idea with the offset map is that we should be able to quickly
         # jump to the key that matches a given sha1. We know that the keys are
         # in sorted order, and we know that a lot of the prefix is going to be
@@ -753,29 +755,39 @@
         # By XORing the entries together, we can determine what bits are set in
         # all of them
         if self.num_entries < 2:
+            # Everything is in common if you have 0 or 1 leaves
+            # So we'll always just shift to the first byte
             self.common_mask = 0x0
-            self.common_shift = 0
+            self.common_shift = 24
         else:
             common_mask = 0xFFFFFFFF
             first = _sha1_to_uint(self.entries[0].sha1)
             for i from 0 < i < self.num_entries:
                 this = _sha1_to_uint(self.entries[i].sha1)
                 common_mask = (~(first ^ this)) & common_mask
-            common_shift = 0
-            self.common_mask = common_mask
-            while common_mask & 0x80000000:
+            common_shift = 24
+            self.common_mask = 0 # common_mask
+            while common_mask & 0x80000000 and common_shift > 0:
                 common_mask = common_mask << 1
-                common_shift += 1
+                common_shift -= 1
+                self.common_mask = (self.common_mask >> 1) | 0x80000000
             self.common_shift = common_shift
             self.common_bits = first & self.common_mask
         offset = 0
-        for i from 0 <= i < self.num_entries:
+        max_offset = self.num_entries
+        # We cap this loop at 254 entries. All the other offsets just get
+        # filled with 0xff as the sentinel saying 'too many'.
+        # It means that if we have >255 entries we have to bisect the second
+        # half of the list, but this is going to be very rare in practice.
+        if max_offset > 255:
+            max_offset = 255
+        for i from 0 <= i < max_offset:
             this_offset = self._offset_for_sha1(self.entries[i].sha1)
             while offset <= this_offset:
                 self.offsets[offset] = i
                 offset += 1
         while offset < 257:
-            self.offsets[offset] = self.num_entries
+            self.offsets[offset] = max_offset
             offset += 1
 
     property _test_offsets:

=== modified file 'bzrlib/tests/test__btree_serializer.py'
--- a/bzrlib/tests/test__btree_serializer.py	2010-08-04 07:14:54 +0000
+++ b/bzrlib/tests/test__btree_serializer.py	2010-08-04 15:48:36 +0000
@@ -149,6 +149,29 @@
 sha1:cf7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x007 7 7 7
 """
 
+_multi_key_same_offset = """type=leaf
+sha1:080c881d4a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
+sha1:c86f7e437faa5a7fce15d1ddcb9eaeaea377667b\x00\x001 1 1 1
+sha1:cd0c9035898dd52fc65c41454cec9c4d2611bfb3\x00\x002 2 2 2
+sha1:cda39a3ee5e6b4b0d3255bfef95601890afd8070\x00\x003 3 3 3
+sha1:cde240de74fb1ed08fa08d38063f6a6a91462a81\x00\x004 4 4 4
+sha1:cdf51e37c269aa94d38f93e537bf6e2020b21406\x00\x005 5 5 5
+sha1:ce7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x006 6 6 6
+sha1:ce93b4e3c464ffd51732fbd6ded717e9efda28aa\x00\x007 7 7 7
+"""
+
+_common_32_bits = """type=leaf
+sha1:123456784a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
+sha1:1234567874fb1ed08fa08d38063f6a6a91462a81\x00\x001 1 1 1
+sha1:12345678777ec23212c54d7a350bc5bea5477fdb\x00\x002 2 2 2
+sha1:123456787faa5a7fce15d1ddcb9eaeaea377667b\x00\x003 3 3 3
+sha1:12345678898dd52fc65c41454cec9c4d2611bfb3\x00\x004 4 4 4
+sha1:12345678c269aa94d38f93e537bf6e2020b21406\x00\x005 5 5 5
+sha1:12345678c464ffd51732fbd6ded717e9efda28aa\x00\x006 6 6 6
+sha1:12345678e5e6b4b0d3255bfef95601890afd8070\x00\x007 7 7 7
+"""
+
+
 class TestGCCKHSHA1LeafNode(TestBtreeSerializer):
 
     def assertInvalid(self, bytes):
@@ -202,15 +225,77 @@
         # Note that by 'overlap' we mean that given bit is either on in all
         # keys, or off in all keys
         leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
-        self.assertEqual(hex(0xF8000100), hex(leaf.common_mask))
-        self.assertEqual(5, leaf.common_shift)
+        self.assertEqual(hex(0xF8000000), hex(leaf.common_mask))
+        self.assertEqual(19, leaf.common_shift)
         self.assertEqual(0xc8000000, leaf.common_bits)
         # The interesting byte for each key is
         # (defined as the 8-bits that come after the common prefix)
-        # [1, 13, 28, 180, 190, 193, 210, 239]
         lst = [1, 13, 28, 180, 190, 193, 210, 239]
         offsets = leaf._test_offsets
         self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
                          offsets)
         for idx, val in enumerate(lst):
             self.assertEqual(idx, offsets[val])
+        for idx, key in enumerate(leaf.all_keys()):
+            self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+    def test_multi_key_same_offset(self):
+        # there is no common prefix, though there are some common bits
+        leaf = self.module._parse_into_chk(_multi_key_same_offset, 1, 0)
+        self.assertEqual(0x00000000, leaf.common_mask)
+        self.assertEqual(24, leaf.common_shift)
+        self.assertEqual(0x00000000, leaf.common_bits)
+        offsets = leaf._test_offsets
+        # The interesting byte is just the first 8-bits of the key
+        lst = [8, 200, 205, 205, 205, 205, 206, 206]
+        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+                         offsets)
+        for val in lst:
+            self.assertEqual(lst.index(val), offsets[val])
+        for idx, key in enumerate(leaf.all_keys()):
+            self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+    def test_all_common_prefix(self):
+        # The first 32 bits of all hashes are the same. This is going to be
+        # pretty much impossible, but I don't want to fail because of this
+        leaf = self.module._parse_into_chk(_common_32_bits, 1, 0)
+        self.assertEqual(0xFFFFFF00, leaf.common_mask)
+        self.assertEqual(0, leaf.common_shift)
+        self.assertEqual(0x12345600, leaf.common_bits)
+        lst = [0x78] * 8
+        offsets = leaf._test_offsets
+        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+                         offsets)
+        for val in lst:
+            self.assertEqual(lst.index(val), offsets[val])
+        for idx, key in enumerate(leaf.all_keys()):
+            self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+    def test_many_entries(self):
+        # Again, this is almost impossible, but we should still work
+        # It would be hard to fit more than 120 entries in a 4k page, much less
+        # more than 256 of them. but hey, weird stuff happens sometimes
+        lines = ['type=leaf\n']
+        for i in range(500):
+            key_str = 'sha1:%04x%s' % (i, _hex_form[:36])
+            key = (key_str,)
+            lines.append('%s\0\0%d %d %d %d\n' % (key_str, i, i, i, i))
+        bytes = ''.join(lines)
+        leaf = self.module._parse_into_chk(bytes, 1, 0)
+        self.assertEqual(0xFE000000, leaf.common_mask)
+        self.assertEqual(24-7, leaf.common_shift)
+        self.assertEqual(0x00000000, leaf.common_bits)
+        offsets = leaf._test_offsets
+        # This is the interesting bits for each entry
+        lst = [x // 2 for x in range(500)]
+        expected_offsets = [x * 2 for x in range(128)] + [255]*129
+        self.assertEqual(expected_offsets, offsets)
+        # We truncate because offsets is an unsigned char. So the bisection
+        # will just say 'greater than the last one' for all the rest
+        lst = lst[:255]
+        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+                         offsets)
+        for val in lst:
+            self.assertEqual(lst.index(val), offsets[val])
+        for idx, key in enumerate(leaf.all_keys()):
+            self.assertEqual(str(idx), leaf[key][0].split()[0])



More information about the bazaar-commits mailing list