Rev 5378: Populate the offsets array. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
John Arbash Meinel
john at arbash-meinel.com
Wed Aug 4 08:14:58 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
------------------------------------------------------------
revno: 5378
revision-id: john at arbash-meinel.com-20100804071454-bfhbwrqes7sabvay
parent: john at arbash-meinel.com-20100804043626-pwawyjkb6r6wzs9a
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Wed 2010-08-04 02:14:54 -0500
message:
Populate the offsets array.
This cuts down the number of bisections dramatically, basically by pre-caching
the first step. On real-world data it drops the total steps from 587 to 156,
or from 4.9/key to 1.3/key, and the lookup time from 23.7us to 20.3us.
Note that (k in dict) is 12.2us. I do wish we were just a bit closer to that.
However, with _LeafNode inheriting from dict I get 26us, so maybe there is
something in the interpreter that does a PyDict_CheckExact call, and there
isn't much we can do about it.
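In rough pure-Python terms, the lookup change amounts to the sketch below. It is
illustrative only: interesting_byte_of stands in for the bit-twiddling the Cython
code does on the first four bytes of the sha1, and none of these names exist in
the patch.

import bisect

def build_offsets(sha1_keys, interesting_byte_of):
    # offsets[b] answers "where would a key whose interesting byte is b be
    # inserted?", so offsets[b]:offsets[b+1] bounds the later bisection.
    offsets = [0] * 257
    pos = 0
    for i, sha1 in enumerate(sha1_keys):  # keys are already sorted
        b = interesting_byte_of(sha1)
        while pos <= b:
            offsets[pos] = i
            pos += 1
    while pos < 257:
        offsets[pos] = len(sha1_keys)
        pos += 1
    return offsets

def lookup(sha1, sha1_keys, offsets, interesting_byte_of):
    # The offsets table pre-caches the first bisection step.
    b = interesting_byte_of(sha1)
    lo, hi = offsets[b], offsets[b + 1]
    idx = bisect.bisect_left(sha1_keys, sha1, lo, hi)
    if idx < hi and sha1_keys[idx] == sha1:
        return idx
    return None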
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx 2010-08-04 04:33:24 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx 2010-08-04 07:14:54 +0000
@@ -483,17 +483,31 @@
return _sha1_to_key(PyString_AS_STRING(sha1_bin))
+cdef unsigned int _sha1_to_uint(char *sha1):
+ cdef unsigned int val
+ # Must be MSB-first (big-endian), because that is how the content is sorted
+ val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
+ | ((<unsigned int>(sha1[1]) & 0xff) << 16)
+ | ((<unsigned int>(sha1[2]) & 0xff) << 8)
+ | ((<unsigned int>(sha1[3]) & 0xff) << 0))
+ return val
+
+
cdef class GCCHKSHA1LeafNode:
"""Track all the entries for a given leaf node."""
cdef public int num_entries
cdef gc_chk_sha1_record *entries
- # This is for the mini-index. We look at all the keys and use whatever byte
- # is first unique across all stored keys (this is often the first byte)
- # we then store the entries offset for the first record that matches that
- # byte. This does assume that we'll never have more than 32k entries, but
- # that doesn't seem to be a terrible assumption (we should have ~100)
- cdef public short interesting_byte
+ # This is for the mini-index.
+ # We look at all the keys and determine how many leading bits are identical.
+ # We then strip those bits and use the next 8 bits as the 'interesting bits'.
+ # offsets then holds the bisect() information for a key with those bits (i.e.
+ # where you would insert a key whose interesting bits have that value).
+ cdef public long bisect_steps
+ cdef public unsigned int common_mask
+ cdef public unsigned int common_bits
+ # The first N bits of all entries in this node have the same content
+ cdef public short common_shift
cdef short offsets[257]
def __sizeof__(self):
@@ -568,6 +582,7 @@
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1):
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
cdef int lo, hi, mid, the_cmp
+ cdef int offset
lo = 0
hi = self.num_entries
@@ -585,15 +600,22 @@
# TODO: Consider improving this, but for now it seems good enough. (We
# could create a small index so we would have less to bisect,
# etc.)
+ # Bisecting reduces the total number of memcmp calls from 7260 to 720
+ # for 120 keys.
+ offset = self._offset_for_sha1(sha1)
+ lo = self.offsets[offset]
+ hi = self.offsets[offset+1]
while lo < hi:
mid = (lo + hi) / 2
- the_cmp = memcmp(sha1, self.entries[mid].sha1, 20)
+ the_cmp = memcmp(self.entries[mid].sha1, sha1, 20)
+ self.bisect_steps += 1
if the_cmp == 0:
+ # print mid, offset, self.offsets[offset], self.offsets[offset+1]
return &self.entries[mid]
elif the_cmp < 0:
+ lo = mid + 1
+ else:
hi = mid
- else:
- lo = mid + 1
return NULL
def __contains__(self, key):
@@ -707,10 +729,63 @@
or cur_record != self.entries + self.num_entries):
raise ValueError('Something went wrong while parsing.')
# Pass 3: build the offset map
+ self._compute_common()
+
+ cdef int _offset_for_sha1(self, char *sha1) except -1:
+ """Find the first interesting 8-bits of this sha1."""
+ cdef int this_offset
+ this_offset = ((_sha1_to_uint(sha1) & ~self.common_mask)
+ >> (24 - self.common_shift))
+ # assert 0 <= this_offset < 256
+ return this_offset
+
+ cdef _compute_common(self):
+ cdef unsigned int first
+ cdef unsigned int this
+ cdef unsigned int common_mask
+ cdef short common_shift
+ cdef int i
+ cdef int offset, this_offset
# The idea with the offset map is that we should be able to quickly
# jump to the key that matches a given sha1. We know that the keys are
# in sorted order, and we know that a lot of the prefix is going to be
# the same across them.
+ # By XORing each entry against the first, we can determine which bits have
+ # the same value in all of them
+ if self.num_entries < 2:
+ self.common_mask = 0x0
+ self.common_shift = 0
+ else:
+ common_mask = 0xFFFFFFFF
+ first = _sha1_to_uint(self.entries[0].sha1)
+ for i from 0 < i < self.num_entries:
+ this = _sha1_to_uint(self.entries[i].sha1)
+ common_mask = (~(first ^ this)) & common_mask
+ common_shift = 0
+ self.common_mask = common_mask
+ while common_mask & 0x80000000:
+ common_mask = common_mask << 1
+ common_shift += 1
+ self.common_shift = common_shift
+ self.common_bits = first & self.common_mask
+ offset = 0
+ for i from 0 <= i < self.num_entries:
+ this_offset = self._offset_for_sha1(self.entries[i].sha1)
+ while offset <= this_offset:
+ self.offsets[offset] = i
+ offset += 1
+ while offset < 257:
+ self.offsets[offset] = self.num_entries
+ offset += 1
+
+ property _test_offsets:
+ def __get__(self):
+ cdef list result
+ cdef int i
+ result = []
+ for i from 0 <= i < 257:
+ result.append(self.offsets[i])
+ return result
def _parse_into_chk(bytes, key_length, ref_list_length):
=== modified file 'bzrlib/tests/test__btree_serializer.py'
--- a/bzrlib/tests/test__btree_serializer.py 2010-08-04 01:52:14 +0000
+++ b/bzrlib/tests/test__btree_serializer.py 2010-08-04 07:14:54 +0000
@@ -18,6 +18,7 @@
"""Direct tests of the btree serializer extension"""
import binascii
+import bisect
from bzrlib import tests
@@ -138,14 +139,14 @@
"""
_multi_key_content = """type=leaf
-sha1:70c881d4a26984ddce795f6f71817c9cf4480e79\x00\x000 0 0 0
-sha1:7e240de74fb1ed08fa08d38063f6a6a91462a815\x00\x001 1 1 1
-sha1:86f7e437faa5a7fce15d1ddcb9eaeaea377667b8\x00\x002 2 2 2
-sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709\x00\x003 3 3 3
-sha1:df51e37c269aa94d38f93e537bf6e2020b21406c\x00\x004 4 4 4
-sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37\x00\x005 5 5 5
-sha1:e93b4e3c464ffd51732fbd6ded717e9efda28aad\x00\x006 6 6 6
-sha1:f7a9e24777ec23212c54d7a350bc5bea5477fdbb\x00\x007 7 7 7
+sha1:c80c881d4a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
+sha1:c86f7e437faa5a7fce15d1ddcb9eaeaea377667b\x00\x001 1 1 1
+sha1:c8e240de74fb1ed08fa08d38063f6a6a91462a81\x00\x002 2 2 2
+sha1:cda39a3ee5e6b4b0d3255bfef95601890afd8070\x00\x003 3 3 3
+sha1:cdf51e37c269aa94d38f93e537bf6e2020b21406\x00\x004 4 4 4
+sha1:ce0c9035898dd52fc65c41454cec9c4d2611bfb3\x00\x005 5 5 5
+sha1:ce93b4e3c464ffd51732fbd6ded717e9efda28aa\x00\x006 6 6 6
+sha1:cf7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x007 7 7 7
"""
class TestGCCKHSHA1LeafNode(TestBtreeSerializer):
@@ -194,3 +195,22 @@
self.assertEqual(8, len(leaf.all_keys()))
for idx, key in enumerate(all_keys):
self.assertEqual(str(idx), leaf[key][0].split()[0])
+
+ def test_common_mask(self):
+ # The keys were deliberately chosen so that the first 5 bits all
+ # overlap; it also happens that a later bit overlaps.
+ # Note that by 'overlap' we mean that a given bit is either on in all
+ # keys, or off in all keys.
+ leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
+ self.assertEqual(hex(0xF8000100), hex(leaf.common_mask))
+ self.assertEqual(5, leaf.common_shift)
+ self.assertEqual(0xc8000000, leaf.common_bits)
+ # The interesting byte for each key (defined as the 8 bits that come
+ # after the common prefix) is:
+ # [1, 13, 28, 180, 190, 193, 210, 239]
+ lst = [1, 13, 28, 180, 190, 193, 210, 239]
+ offsets = leaf._test_offsets
+ self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
+ offsets)
+ for idx, val in enumerate(lst):
+ self.assertEqual(idx, offsets[val])
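For reference, the common-prefix computation in _compute_common and
_offset_for_sha1 boils down to something like this pure-Python sketch (again
illustrative, not the shipped Cython; the helper names here are made up):

import struct

def sha1_prefix_to_uint(sha1_bin):
    # First 4 bytes, big-endian, to match the sort order of the keys.
    return struct.unpack('>I', sha1_bin[:4])[0]

def compute_common(sha1_bins):
    # common_mask has a 1 wherever every key carries the same bit value;
    # common_shift counts how many leading bits are common to all keys.
    if len(sha1_bins) < 2:
        return 0, 0
    first = sha1_prefix_to_uint(sha1_bins[0])
    common_mask = 0xFFFFFFFF
    for sha1_bin in sha1_bins[1:]:
        common_mask &= ~(first ^ sha1_prefix_to_uint(sha1_bin)) & 0xFFFFFFFF
    common_shift = 0
    mask = common_mask
    while mask & 0x80000000:
        mask = (mask << 1) & 0xFFFFFFFF
        common_shift += 1
    return common_mask, common_shift

def offset_for_sha1(sha1_bin, common_mask, common_shift):
    # The 8 bits right after the common prefix select the slot in the
    # 257-entry offsets array.
    masked = sha1_prefix_to_uint(sha1_bin) & (~common_mask & 0xFFFFFFFF)
    return masked >> (24 - common_shift)

With the eight keys in _multi_key_content this gives common_mask 0xF8000100 and
common_shift 5, which is what test_common_mask asserts.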