Rev 4504: A little bit of refactoring _extract_key into a helper. in http://bazaar.launchpad.net/~jameinel/bzr/1.17-btree-faster

John Arbash Meinel john at arbash-meinel.com
Wed Jul 1 22:54:15 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/1.17-btree-faster

------------------------------------------------------------
revno: 4504
revision-id: john at arbash-meinel.com-20090701215405-vu7kdewhxj51lh75
parent: john at arbash-meinel.com-20090701205138-a0f340l8wib7mwb6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-btree-faster
timestamp: Wed 2009-07-01 16:54:05 -0500
message:
  A little bit of refactoring _extract_key into a helper.
  
  This way we are sure that the new implementation is doing it correctly.
  As an added benefit, since we know the width of the key, we can
  create the tuple directly, rather than building up a list and then
  turning that into a tuple.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx	2009-07-01 20:50:37 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx	2009-07-01 21:54:05 +0000
@@ -36,7 +36,9 @@
     int PyString_CheckExact(object s)
     int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)
     Py_ssize_t PyString_Size(object p)
+    Py_ssize_t PyString_GET_SIZE(object)
     Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *)
+    char * PyString_AS_STRING(object)
     char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *)
     int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len)
     void PyString_InternInPlace(PyObject **)
@@ -44,13 +46,18 @@
     Py_ssize_t PyTuple_GET_SIZE(object t)
     PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
     void Py_DECREF_ptr "Py_DECREF" (PyObject *)
+    void Py_INCREF(object)
+
+    PyObject *PyDict_GetItem(object, object)
+    object PyTuple_New(Py_ssize_t n)
+    void PyTuple_SET_ITEM(object, Py_ssize_t offset, object)
 
 cdef extern from "string.h":
     void *memcpy(void *dest, void *src, size_t n)
     void *memchr(void *s, int c, size_t n)
     # GNU extension
     # void *memrchr(void *s, int c, size_t n)
-    int strncmp(char *s1, char *s2, size_t n)
+    int memcmp(char *s1, char *s2, size_t n)
 
 
 # TODO: Find some way to import this from _dirstate_helpers
@@ -137,35 +144,7 @@
         :param last: points at the byte after the last byte permitted for the
             key.
         """
-        cdef char *temp_ptr
-        cdef int loop_counter
-        # keys are tuples
-        loop_counter = 0
-        key_segments = []
-        while loop_counter < self.key_length:
-            loop_counter = loop_counter + 1
-            # grab a key segment
-            temp_ptr = <char*>memchr(self._start, c'\0', last - self._start)
-            if temp_ptr == NULL:
-                if loop_counter == self.key_length:
-                    # capture to last
-                    temp_ptr = last
-                else:
-                    # Invalid line
-                    failure_string = ("invalid key, wanted segment from " +
-                        repr(safe_string_from_size(self._start,
-                                                   last - self._start)))
-                    raise AssertionError(failure_string)
-            # capture the key string
-            # TODO: Consider using PyIntern_FromString, the only caveat is that
-            # it assumes a NULL-terminated string, so we have to check if
-            # temp_ptr[0] == c'\0' or some other char.
-            key_element = safe_interned_string_from_size(self._start,
-                                                         temp_ptr - self._start)
-            # advance our pointer
-            self._start = temp_ptr + 1
-            PyList_Append(key_segments, key_element)
-        return tuple(key_segments)
+        return _extract_key(&self._start, last, self.key_length)
 
     cdef int process_line(self) except -1:
         """Process a line in the bytes."""
@@ -196,7 +175,7 @@
             return -1
         if 0 == self._header_found:
             # The first line in a leaf node is the header "type=leaf\n"
-            if strncmp("type=leaf", self._start, last - self._start) == 0:
+            if memcmp("type=leaf", self._start, last - self._start) == 0:
                 self._header_found = 1
                 return 0
             else:
@@ -280,22 +259,126 @@
     return parser.parse()
 
 
+cdef _extract_key(char **start, char *last, int key_length):
+    """Extract a key.
+
+    :param start: The first byte of the key, will be updated to the position
+        after the end of the key
+    :param last: points at the byte after the last byte permitted for the
+        key.
+    :param key_length: The number of entries in the key tuple
+    """
+    cdef char *orig_start, *temp_ptr
+    cdef int i
+    orig_start = start[0]
+    # keys are tuples
+    key = PyTuple_New(key_length)
+    for i from 0 <= i < key_length:
+        # grab a key segment
+        temp_ptr = <char*>memchr(start[0], c'\0', last - start[0])
+        if temp_ptr == NULL:
+            if i + 1 == key_length:
+                # This is the last part of the key, it is okay if it doesn't
+                # end in '\0'
+                temp_ptr = last
+            else:
+                # Invalid line
+                failure_string = ("invalid key, wanted %d segments from %r"
+                    % (key_length,
+                       safe_string_from_size(orig_start, last - orig_start)))
+                raise AssertionError(failure_string)
+        # capture the key string
+        key_element = safe_interned_string_from_size(start[0],
+                                                     temp_ptr - start[0])
+        # advance our pointer
+        start[0] = temp_ptr + 1
+        Py_INCREF(key_element) # PyTuple_SET_ITEM steals a reference
+        PyTuple_SET_ITEM(key, i, key_element)
+    return key
+
+
+cdef class _LeafLineEntry:
+    """Contains the information about a single line item."""
+
+    cdef object key
+    cdef object refs
+    cdef object value
+    cdef char *data_start
+    cdef char *data_end
+
+    def __init__(self, key):
+        self.key = key
+        self.data_start = NULL
+        self.data_end = NULL
+
+
+
 cdef class _LeafNode:
     """Contains the references to the leaf information."""
 
+    cdef object _bytes
+    cdef int _key_length
+    cdef int _ref_list_length
+    cdef object _lines
     cdef object _keys
+    cdef object _first_key
+    cdef object _last_key
 
     def __init__(self, bytes, key_length, ref_list_length):
-        self._keys = dict(_parse_leaf_lines(bytes, key_length, ref_list_length))
+        self._bytes = bytes
+        if not PyString_CheckExact(bytes):
+            raise TypeError('bytes must be exactly a PyString')
+        self._key_length = key_length
+        self._ref_list_length = ref_list_length
+        self._keys = None
+        self._parse()
+
+    def _parse(self):
+        cdef char *c_bytes, *start, *end
+        cdef Py_ssize_t str_len, pos
+        cdef _LeafLineEntry next_line
+
+        c_bytes = PyString_AS_STRING(self._bytes)
+        str_len = PyString_GET_SIZE(self._bytes)
+        if str_len < 10:
+            raise ValueError('Leaf node bytes too short.')
+        if memcmp(c_bytes, "type=leaf\n", 10) != 0:
+            raise ValueError('Leaf node bytes did not start with "type=leaf"')
+
+        # Turn the internal bytes into an array of _LeafLineEntry. This tracks
+        # what might be there without actually doing any parsing
+        self._lines = {}
+        start = c_bytes + 10
+        for pos from 10 <= pos < str_len:
+            if c_bytes[pos] == c'\n':
+                end = c_bytes + pos
+                key = _extract_key(&start, end, self._key_length)
+                next_line = _LeafLineEntry(key)
+                next_line.data_start = start
+                next_line.data_end = c_bytes + pos
+                self._lines[key] = next_line
+                start = c_bytes + pos + 1
+        # Do we check that the very last byte is '\n'?
+
+    cdef _populate_keys(self):
+        cdef _LeafLineEntry entry
+        keys = dict(_parse_leaf_lines(self._bytes, self._key_length,
+                                      self._ref_list_length))
+        self._keys = keys
 
     property keys:
         def __get__(self):
+            if self._keys is None:
+                self._populate_keys()
             return self._keys
 
     def __len__(self):
-        return len(self._keys)
+        return len(self._lines)
+        # return len(self._keys)
 
     def get(self, key):
+        if self._keys is None:
+            self._populate_keys()
         # PyDict_GetItem...
         return self._keys.get(key, None)
 



More information about the bazaar-commits mailing list