Rev 5370: Found some more bugs. I wasn't incrementing one of the pointers, in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf

John Arbash Meinel john at arbash-meinel.com
Wed Aug 4 00:41:16 BST 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf

------------------------------------------------------------
revno: 5370
revision-id: john at arbash-meinel.com-20100803234101-742wpbkuyhqc9tpp
parent: john at arbash-meinel.com-20100803221022-dz367g1mvwqi7cq4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Tue 2010-08-03 18:41:01 -0500
message:
  Found some more bugs. I wasn't incrementing one of the pointers,
  
  so I was parsing all the records, but not saving them correctly.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx	2010-08-03 22:10:22 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx	2010-08-03 23:41:01 +0000
@@ -354,12 +354,17 @@
 
 
 cdef int _unhexlify_sha1(char *as_hex, char *as_bin):
-    """Take the hex sha1 in as_hex and make it binary in as_bin"""
+    """Take the hex sha1 in as_hex and make it binary in as_bin
+    
+    Same as binascii.unhexlify, but working on C strings, not Python objects.
+    """
     cdef int top
     cdef int bot
     cdef int i, j
     cdef char *cur
     
+    # binascii does this using isupper() and tolower() and ?: syntax. I'm
+    # guessing a simple lookup array should be faster.
     j = 0
     for i from 0 <= i < 20:
         top = _unhexbuf[<unsigned char>(as_hex[j])]
@@ -404,6 +409,63 @@
     return as_hex
 
 
+cdef int _key_to_sha1(key, char *sha1):
+    """Map a key into its sha1 content.
+
+    :param key: A tuple of style ('sha1:abcd...',)
+    :param sha1: A char buffer of 20 bytes
+    :return: 1 if this could be converted, 0 otherwise
+    """
+    cdef char *c_val
+    if not PyTuple_CheckExact(key) and not StaticTuple_CheckExact(key):
+        return 0
+    if len(key) != 1:
+        return 0
+    val = key[0]
+    if not PyString_CheckExact(val) or PyString_GET_SIZE(val) != 45:
+        return 0
+    c_val = PyString_AS_STRING(val)
+    if strncmp(c_val, 'sha1:', 5) != 0:
+        return 0
+    if not _unhexlify_sha1(c_val + 5, sha1):
+        return 0
+    return 1
+
+
+def _test_key_to_sha1(key):
+    """Map a key to a simple sha1 string.
+
+    This is a testing thunk to the C function.
+    """
+    as_bin_sha = PyString_FromStringAndSize(NULL, 20)
+    if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)):
+        return as_bin_sha
+    return None
+
+
+cdef StaticTuple _sha1_to_key(char *sha1):
+    """Compute a ('sha1:abcd',) key for a given sha1."""
+    cdef StaticTuple key
+    cdef object hexxed
+    cdef char *c_buf
+    hexxed = PyString_FromStringAndSize(NULL, 45)
+    c_buf = PyString_AS_STRING(hexxed)
+    memcpy(c_buf, 'sha1:', 5)
+    _hexlify_sha1(sha1, c_buf+5)
+    key = StaticTuple_New(1)
+    Py_INCREF(hexxed)
+    StaticTuple_SET_ITEM(key, 0, hexxed)
+    # key = StaticTuple_Intern(key)
+    return key
+
+
+def _test_sha1_to_key(sha1_bin):
+    """Test thunk to check the sha1 mapping."""
+    if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20:
+        raise ValueError('sha1_bin must be a str of exactly 20 bytes')
+    return _sha1_to_key(PyString_AS_STRING(sha1_bin))
+
+
 cdef class GCCHKSHA1LeafNode:
     """Track all the entries for a given leaf node."""
 
@@ -432,52 +494,15 @@
     property min_key:
         def __get__(self):
             if self.num_entries > 0:
-                return self._sha1_to_key(self.entries[0].sha1)
+                return _sha1_to_key(self.entries[0].sha1)
             return None
 
     property max_key:
         def __get__(self):
             if self.num_entries > 0:
-                return self._sha1_to_key(self.entries[self.num_entries-1].sha1)
+                return _sha1_to_key(self.entries[self.num_entries-1].sha1)
             return None
 
-    cdef int _key_to_sha1(self, key, char *sha1):
-        """Map a key into its sha1 content.
-
-        :param key: A tuple of style ('sha1:abcd...',)
-        :param sha1: A char buffer of 20 bytes
-        :return: 1 if this could be converted, 0 otherwise
-        """
-        cdef char *c_val
-        if not PyTuple_CheckExact(key) and not StaticTuple_CheckExact(key):
-            return 0
-        if len(key) != 1:
-            return 0
-        val = key[0]
-        if not PyString_CheckExact(val) or PyString_GET_SIZE(val) != 45:
-            return 0
-        c_val = PyString_AS_STRING(val)
-        if not strncmp(c_val, 'sha1:', 5):
-            return 0
-        if not _unhexlify_sha1(c_val + 5, sha1):
-            return 0
-        return 1
-
-    cdef StaticTuple _sha1_to_key(self, char *sha1):
-        """Compute a ('sha1:abcd',) key for a given sha1."""
-        cdef StaticTuple key
-        cdef object hexxed
-        cdef char *c_buf
-        hexxed = PyString_FromStringAndSize(NULL, 45)
-        c_buf = PyString_AS_STRING(hexxed)
-        memcpy(c_buf, 'sha1:', 5)
-        _hexlify_sha1(sha1, c_buf+5)
-        key = StaticTuple_New(1)
-        Py_INCREF(hexxed)
-        StaticTuple_SET_ITEM(key, 0, hexxed)
-        key = StaticTuple_Intern(key)
-        return key
-
     cdef StaticTuple _record_to_value_and_refs(self,
                                                gc_chk_sha1_record *record):
         """Extract the refs and value part of this record."""
@@ -486,8 +511,9 @@
         value_and_refs = StaticTuple_New(2)
         # This is really inefficient to go from a logical state back to a
         # string, but it makes things work a bit better internally for now.
-        value = PyString_FromFormat('%llu %lu %lu %lu',
-                                    record.block_offset, record.block_length,
+        value = PyString_FromFormat('%lu %lu %lu %lu',
+                                    <unsigned long>record.block_offset,
+                                    record.block_length,
                                     record.record_start, record.record_end)
         Py_INCREF(value)
         StaticTuple_SET_ITEM(value_and_refs, 0, value)
@@ -504,7 +530,7 @@
         cdef StaticTuple key
         cdef StaticTuple value_and_refs
         cdef object value
-        key = self._sha1_to_key(record.sha1)
+        key = _sha1_to_key(record.sha1)
         item = StaticTuple_New(2)
         Py_INCREF(key)
         StaticTuple_SET_ITEM(item, 0, key)
@@ -526,7 +552,7 @@
     def __contains__(self, key):
         cdef char sha1[20]
         cdef gc_chk_sha1_record *record
-        if not self._key_to_sha1(key, sha1):
+        if not _key_to_sha1(key, sha1):
             # If it isn't a sha1 key, then it won't be in this leaf node
             return False
         return self._lookup_record(sha1) != NULL
@@ -534,7 +560,7 @@
     def __getitem__(self, key):
         cdef char sha1[20]
         cdef gc_chk_sha1_record *record = NULL
-        if self._key_to_sha1(key, sha1):
+        if _key_to_sha1(key, sha1):
             record = self._lookup_record(sha1)
         if record == NULL:
             raise KeyError('key %r is not present' % (key,))
@@ -547,7 +573,7 @@
         cdef int i
         cdef list result = []
         for i from 0 <= i < self.num_entries:
-            result.append(self._sha1_to_key(self.entries[i].sha1))
+            result.append(_sha1_to_key(self.entries[i].sha1))
         return result
 
     def all_items(self):
@@ -627,6 +653,12 @@
             if c_cur == c_next or c_next[0] != c'\n':
                 raise ValueError('Failed to parse record end')
             c_cur = c_next + 1
+            cur_record += 1
+            entry += 1
+        if (entry != self.num_entries
+            or c_cur != c_end
+            or cur_record != self.entries + self.num_entries):
+            raise ValueError('Something went wrong while parsing.')
         # Pass 3: build the offset map
         # The idea with the offset map is that we should be able to quickly
         # jump to the key that matches a gives sha1. We know that the keys are

=== modified file 'bzrlib/tests/test__btree_serializer.py'
--- a/bzrlib/tests/test__btree_serializer.py	2010-08-03 22:10:22 +0000
+++ b/bzrlib/tests/test__btree_serializer.py	2010-08-03 23:41:01 +0000
@@ -32,6 +32,7 @@
         super(TestBtreeSerializer, self).setUp()
         self.module = compiled_btreeparser_feature.module
 
+
 class TestHexAndUnhex(TestBtreeSerializer):
 
     def assertHexlify(self, as_binary):
@@ -75,9 +76,68 @@
         self.assertFailUnhexlify('12345678901234567890123456789012345678X9')
 
 
+_hex_form = '123456789012345678901234567890abcdefabcd'
+
+class Test_KeyToSha1(TestBtreeSerializer):
+
+    def assertKeyToSha1(self, expected, key):
+        if expected is None:
+            expected_bin = None
+        else:
+            expected_bin = binascii.unhexlify(expected)
+        actual_sha1 = self.module._test_key_to_sha1(key)
+        if expected_bin != actual_sha1:
+            actual_hex_sha1 = None
+            if actual_sha1 is not None:
+                actual_hex_sha1 = binascii.hexlify(actual_sha1)
+            self.fail('_key_to_sha1 returned:\n    %s\n != %s'
+                      % (actual_sha1, expected))
+
+    def test_simple(self):
+        self.assertKeyToSha1(_hex_form, ('sha1:' + _hex_form,))
+
+    def test_invalid_not_tuple(self):
+        self.assertKeyToSha1(None, _hex_form)
+        self.assertKeyToSha1(None, 'sha1:' + _hex_form)
+
+    def test_invalid_empty(self):
+        self.assertKeyToSha1(None, ())
+
+    def test_invalid_not_string(self):
+        self.assertKeyToSha1(None, (None,))
+        self.assertKeyToSha1(None, (list(_hex_form),))
+
+    def test_invalid_not_sha1(self):
+        self.assertKeyToSha1(None, (_hex_form,))
+        self.assertKeyToSha1(None, ('sha2:' + _hex_form,))
+
+    def test_invalid_not_hex(self):
+        self.assertKeyToSha1(None,
+            ('sha1:abcdefghijklmnopqrstuvwxyz12345678901234',))
+
+
+class Test_Sha1ToKey(TestBtreeSerializer):
+
+    def assertSha1ToKey(self, hex_sha1):
+        bin_sha1 = binascii.unhexlify(hex_sha1)
+        key = self.module._test_sha1_to_key(bin_sha1)
+        self.assertEqual(('sha1:' + hex_sha1,), key)
+
+    def test_simple(self):
+        self.assertSha1ToKey(_hex_form)
+
+
+_one_key_content = """type=leaf
+sha1:123456789012345678901234567890abcdefabcd\x00\x001 2 3 4
+"""
+
+_multi_key_content = """type=leaf
+sha1:123456789012345678901234567890abcdefabcd\x00\x001 2 3 4
+sha1:abcd123456789012345678901234567890abcdef\x00\x005678 2345 3456 4567
+"""
+
 class TestGCCKHSHA1LeafNode(TestBtreeSerializer):
 
-
     def assertInvalid(self, bytes):
         """Ensure that we get a proper error when trying to parse invalid bytes.
 
@@ -97,3 +157,25 @@
         self.assertEqual(0, len(leaf))
         self.assertEqual([], leaf.all_items())
         self.assertEqual([], leaf.all_keys())
+        # It should allow any key to be queried
+        self.assertFalse(('key',) in leaf)
+
+    def test_one_key_leaf(self):
+        leaf = self.module._parse_into_chk(_one_key_content, 1, 0)
+        self.assertEqual(1, len(leaf))
+        sha_key = ('sha1:123456789012345678901234567890abcdefabcd',)
+        self.assertEqual([sha_key], leaf.all_keys())
+        self.assertEqual([(sha_key, ('1 2 3 4', ()))], leaf.all_items())
+        self.assertTrue(sha_key in leaf)
+
+    def test_many_key_leaf(self):
+        leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
+        self.assertEqual(2, len(leaf))
+        sha_key1 = ('sha1:123456789012345678901234567890abcdefabcd',)
+        sha_key2 = ('sha1:abcd123456789012345678901234567890abcdef',)
+        self.assertEqual([sha_key1, sha_key2], leaf.all_keys())
+        self.assertEqual([(sha_key1, ('1 2 3 4', ())),
+                          (sha_key2, ('5678 2345 3456 4567', ()))
+                         ], leaf.all_items())
+        self.assertTrue(sha_key1 in leaf)
+        self.assertTrue(sha_key2 in leaf)



More information about the bazaar-commits mailing list