Rev 5381: Trim the class a little bit. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf

Wed Aug 4 17:08:20 BST 2010

At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf

------------------------------------------------------------
revno: 5381
revision-id: john at arbash-meinel.com-20100804160812-zvqj8pn99i3d6ks3
parent: john at arbash-meinel.com-20100804154836-79sceckc4exj134y
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Wed 2010-08-04 11:08:12 -0500
message:
  Trim the class a little bit.
  
  All we need is the common_shift, common_bits and common_mask aren't used.
  This puts as at about 42.? bytes/record.
  We have 40 actual, and then <3 bytes/record of other overhead.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'

--- a/bzrlib/_btree_serializer_pyx.pyx	2010-08-04 15:48:36 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx	2010-08-04 16:08:12 +0000
@@ -64,7 +64,7 @@
     # void *memrchr(void *s, int c, size_t n)
     int strncmp(char *s1, char *s2, size_t n)
     unsigned long strtoul(char *s1, char **out, int base)
-    unsigned long long strtoull(char *s1, char **out, int base)
+    long long strtoll(char *s1, char **out, int base)
 
 # It seems we need to import the definitions so that the pyrex compiler has
 # local names to access them.
@@ -503,9 +503,6 @@
     # We then strip those bits, and use the next 8 bits as 'interesting bits'.
     # offsets is then the bisect() information for a key with those bits (where
     # would you insert a key starting with these bits)
-    cdef public long bisect_steps
-    cdef public unsigned int common_mask
-    cdef public unsigned int common_bits
     # The first N bits of all entries in this node have the same content
     cdef public unsigned char common_shift
     cdef unsigned char offsets[257]
@@ -605,7 +602,6 @@
         while lo < hi:
             mid = (lo + hi) / 2
             the_cmp = memcmp(self.entries[mid].sha1, sha1, 20)
-            self.bisect_steps += 1
             if the_cmp == 0:
                 # print mid, offset, self._offsets[offset], self._offsets[offset+1]
                 return &self.entries[mid]
@@ -656,7 +652,6 @@
         cdef char *c_content
         cdef char *c_cur
         cdef char *c_end
-        cdef char *c_next
         cdef Py_ssize_t n_bytes
         cdef int num_entries
         cdef int entry
@@ -690,35 +685,7 @@
         cur_record = self.entries
         entry = 0
         while c_cur != NULL and c_cur < c_end and entry < num_entries:
-            if strncmp(c_cur, 'sha1:', 5):
-                raise ValueError('At byte %d, line did not start with sha1: %r'
-                    % (c_cur - c_bytes, safe_string_from_size(c_cur, 10)))
-            c_cur += 5
-            c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
-            if c_next == NULL or (c_next - c_cur != 40):
-                raise ValueError('Line did not contain 40 hex bytes')
-            if not _unhexlify_sha1(c_cur, cur_record.sha1):
-                raise ValueError('We failed to unhexlify')
-            c_cur = c_next + 1
-            if c_cur[0] != c'\0':
-                raise ValueError('only 1 null, not 2 as expected')
-            c_cur += 1
-            cur_record.block_offset = strtoull(c_cur, &c_next, 10)
-            if c_cur == c_next or c_next[0] != c' ':
-                raise ValueError('Failed to parse block offset')
-            c_cur = c_next + 1
-            cur_record.block_length = strtoul(c_cur, &c_next, 10)
-            if c_cur == c_next or c_next[0] != c' ':
-                raise ValueError('Failed to parse block length')
-            c_cur = c_next + 1
-            cur_record.record_start = strtoul(c_cur, &c_next, 10)
-            if c_cur == c_next or c_next[0] != c' ':
-                raise ValueError('Failed to parse block length')
-            c_cur = c_next + 1
-            cur_record.record_end = strtoul(c_cur, &c_next, 10)
-            if c_cur == c_next or c_next[0] != c'\n':
-                raise ValueError('Failed to parse record end')
-            c_cur = c_next + 1
+            c_cur = self._parse_one_entry(c_cur, c_end, cur_record)
             cur_record += 1
             entry += 1
         if (entry != self.num_entries
@@ -728,13 +695,50 @@
         # Pass 3: build the offset map
         self._compute_common()
 
+    cdef char *_parse_one_entry(self, char *c_cur, char *c_end,
+                                gc_chk_sha1_record *cur_record) except NULL:
+        """Read a single sha record from the bytes.
+        :param c_cur: The pointer to the start of bytes
+        :param cur_record: 
+        """
+        cdef char *c_next
+        if strncmp(c_cur, 'sha1:', 5):
+            raise ValueError('line did not start with sha1: %r'
+                % (safe_string_from_size(c_cur, 10),))
+        c_cur += 5
+        c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
+        if c_next == NULL or (c_next - c_cur != 40):
+            raise ValueError('Line did not contain 40 hex bytes')
+        if not _unhexlify_sha1(c_cur, cur_record.sha1):
+            raise ValueError('We failed to unhexlify')
+        c_cur = c_next + 1
+        if c_cur[0] != c'\0':
+            raise ValueError('only 1 null, not 2 as expected')
+        c_cur += 1
+        cur_record.block_offset = strtoll(c_cur, &c_next, 10)
+        if c_cur == c_next or c_next[0] != c' ':
+            raise ValueError('Failed to parse block offset')
+        c_cur = c_next + 1
+        cur_record.block_length = strtoul(c_cur, &c_next, 10)
+        if c_cur == c_next or c_next[0] != c' ':
+            raise ValueError('Failed to parse block length')
+        c_cur = c_next + 1
+        cur_record.record_start = strtoul(c_cur, &c_next, 10)
+        if c_cur == c_next or c_next[0] != c' ':
+            raise ValueError('Failed to parse block length')
+        c_cur = c_next + 1
+        cur_record.record_end = strtoul(c_cur, &c_next, 10)
+        if c_cur == c_next or c_next[0] != c'\n':
+            raise ValueError('Failed to parse record end')
+        c_cur = c_next + 1
+        return c_cur
+
     cdef int _offset_for_sha1(self, char *sha1) except -1:
         """Find the first interesting 8-bits of this sha1."""
         cdef int this_offset
         cdef unsigned int as_uint
         as_uint = _sha1_to_uint(sha1)
         this_offset = (as_uint >> self.common_shift) & 0xFF
-        # assert 0 < this_offset < 256
         return this_offset
 
     def test_offset_for_sha1(self, sha1):
@@ -757,7 +761,6 @@
         if self.num_entries < 2:
             # Everything is in common if you have 0 or 1 leaves
             # So we'll always just shift to the first byte
-            self.common_mask = 0x0
             self.common_shift = 24
         else:
             common_mask = 0xFFFFFFFF
@@ -766,13 +769,10 @@
                 this = _sha1_to_uint(self.entries[i].sha1)
                 common_mask = (~(first ^ this)) & common_mask
             common_shift = 24
-            self.common_mask = 0 # common_mask
             while common_mask & 0x80000000 and common_shift > 0:
                 common_mask = common_mask << 1
                 common_shift -= 1
-                self.common_mask = (self.common_mask >> 1) | 0x80000000
             self.common_shift = common_shift
-            self.common_bits = first & self.common_mask
         offset = 0
         max_offset = self.num_entries
         # We cap this loop at 254 entries. All the other offsets just get

=== modified file 'bzrlib/python-compat.h'
--- a/bzrlib/python-compat.h	2010-08-03 20:56:39 +0000
+++ b/bzrlib/python-compat.h	2010-08-04 16:08:12 +0000
@@ -65,6 +65,7 @@
     #if !defined(S_ISLNK)
         #define S_ISLNK(mode) (0)
     #endif
+    #define strtoll _strtoi64
     #define strtoull _strtoui64
 #else /* Not win32 */
     /* For htonl */

=== modified file 'bzrlib/tests/test__btree_serializer.py'
--- a/bzrlib/tests/test__btree_serializer.py	2010-08-04 15:48:36 +0000
+++ b/bzrlib/tests/test__btree_serializer.py	2010-08-04 16:08:12 +0000
@@ -219,15 +219,13 @@
         for idx, key in enumerate(all_keys):
             self.assertEqual(str(idx), leaf[key][0].split()[0])
 
-    def test_common_mask(self):
+    def test_common_shift(self):
         # The keys were deliberately chosen so that the first 5 bits all
         # overlapped, it also happens that a later bit overlaps
         # Note that by 'overlap' we mean that given bit is either on in all
         # keys, or off in all keys
         leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
-        self.assertEqual(hex(0xF8000000), hex(leaf.common_mask))
         self.assertEqual(19, leaf.common_shift)
-        self.assertEqual(0xc8000000, leaf.common_bits)
         # The interesting byte for each key is
         # (defined as the 8-bits that come after the common prefix)
         lst = [1, 13, 28, 180, 190, 193, 210, 239]
@@ -242,9 +240,7 @@
     def test_multi_key_same_offset(self):
         # there is no common prefix, though there are some common bits
         leaf = self.module._parse_into_chk(_multi_key_same_offset, 1, 0)
-        self.assertEqual(0x00000000, leaf.common_mask)
         self.assertEqual(24, leaf.common_shift)
-        self.assertEqual(0x00000000, leaf.common_bits)
         offsets = leaf._test_offsets
         # The interesting byte is just the first 8-bits of the key
         lst = [8, 200, 205, 205, 205, 205, 206, 206]
@@ -259,9 +255,7 @@
         # The first 32 bits of all hashes are the same. This is going to be
         # pretty much impossible, but I don't want to fail because of this
         leaf = self.module._parse_into_chk(_common_32_bits, 1, 0)
-        self.assertEqual(0xFFFFFF00, leaf.common_mask)
         self.assertEqual(0, leaf.common_shift)
-        self.assertEqual(0x12345600, leaf.common_bits)
         lst = [0x78] * 8
         offsets = leaf._test_offsets
         self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
@@ -282,9 +276,7 @@
             lines.append('%s\0\0%d %d %d %d\n' % (key_str, i, i, i, i))
         bytes = ''.join(lines)
         leaf = self.module._parse_into_chk(bytes, 1, 0)
-        self.assertEqual(0xFE000000, leaf.common_mask)
         self.assertEqual(24-7, leaf.common_shift)
-        self.assertEqual(0x00000000, leaf.common_bits)
         offsets = leaf._test_offsets
         # This is the interesting bits for each entry
         lst = [x // 2 for x in range(500)]