Rev 2512: Finally, faster than text.split() (156ms) in http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex

John Arbash Meinel john at arbash-meinel.com
Sat May 5 05:58:04 BST 2007


At http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/dirstate_pyrex

------------------------------------------------------------
revno: 2512
revision-id: john at arbash-meinel.com-20070505045753-1fwhap6q0jyb18vt
parent: john at arbash-meinel.com-20070505043606-lw7bjxwzcnjbls9v
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate_pyrex
timestamp: Fri 2007-05-04 23:57:53 -0500
message:
  Finally, faster than text.split() (156ms)
  By iterating over the fields directly, we don't have to create Python strings
  for the dirname field (except when it changes), or for the size and
  is_executable fields.
  A lot fewer Python objects means faster parsing.
modified:
  bzrlib/compiled/dirstate_helpers.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
-------------- next part --------------
=== modified file 'bzrlib/compiled/dirstate_helpers.pyx'
--- a/bzrlib/compiled/dirstate_helpers.pyx	2007-05-05 04:36:06 +0000
+++ b/bzrlib/compiled/dirstate_helpers.pyx	2007-05-05 04:57:53 +0000
@@ -272,53 +272,58 @@
             PyList_Append(fields, self.get_next_str())
         return fields
 
-    cdef object _fields_to_entry_0_parents(self,
-                                           PyListObject *fields, int offset,
-                                           void **p_current_dirname,
-                                           int *new_block):
+    cdef object _get_entry_0_parents(self, void **p_current_dirname,
+                                     int *new_block):
         cdef object path_name_file_id_key
-        cdef char *size_str
-        cdef unsigned long int size
+        cdef char *entry_size_str
+        cdef unsigned long int entry_size
         cdef char* executable_str
         cdef int is_executable
-        cdef PyObject **base
-        cdef void* dirname
         cdef char* dirname_str
-
-        # Is this too abusive?
-        base = fields.ob_item + offset
-
-        dirname = base[0]
-        dirname_str = PyString_AS_STRING_void(dirname)
-
-        if strcmp(dirname_str,
-                  PyString_AS_STRING_void(p_current_dirname[0])) != 0:
-            Py_INCREF_PyObject(<PyObject *>dirname)
-            p_current_dirname[0] = dirname
+        cdef char* trailing
+        cdef int cur_size
+        cdef object minikind
+        cdef object fingerprint
+        cdef object info
+
+        dirname_str = self.get_next(&cur_size)
+        if strncmp(dirname_str,
+                  PyString_AS_STRING_void(p_current_dirname[0]),
+                  cur_size+1) != 0:
+            dirname = PyString_FromStringAndSize(dirname_str, cur_size)
+            p_current_dirname[0] = <void*>dirname
             new_block[0] = 1
         else:
             new_block[0] = 0
         path_name_file_id_key = (<object>p_current_dirname[0],
-                                 <object>(base[1]),
-                                 <object>(base[2]),
+                                 self.get_next_str(),
+                                 self.get_next_str(),
                                 )
 
-        size_str = PyString_AS_STRING_void(<void*>(base[5]))
-        size = strtoul(size_str, NULL, 10)
-        executable_str = PyString_AS_STRING_void(<void*>(base[6]))
-        if executable_str[0] == c'y':
-            is_executable = 0
-        else:
-            is_executable = 0
-        return (path_name_file_id_key, [
+        minikind = self.get_next_str()
+        fingerprint = self.get_next_str()
+        entry_size_str = self.get_next(&cur_size)
+        entry_size = strtoul(entry_size_str, NULL, 10)
+        executable_str = self.get_next(&cur_size)
+        is_executable = (executable_str[0] == c'y')
+        info = self.get_next_str()
+
+        ret = (path_name_file_id_key, [
             ( # Current tree
-                <object>(base[3]),# minikind
-                <object>(base[4]),# fingerprint
-                size,             # size
-                is_executable,    # executable
-                <object>(base[7]),# packed_stat or revision_id
+                minikind,     # minikind
+                fingerprint,  # fingerprint
+                entry_size,   # size
+                is_executable,# executable
+                info,         # packed_stat or revision_id
             )])
 
+        # Ignore the trailing newline
+        trailing = self.get_next(&cur_size)
+        if cur_size != 1 or trailing[0] != c'\n':
+            raise AssertionError(
+                'Bad parse, we expected to end on \\n, not: %d %s: %s'
+                % (cur_size, PyString_FromString(trailing), ret))
+        return ret
 
     def _parse_dirblocks_0_parents(self, state, entry_size):
         cdef object current_block
@@ -336,12 +341,7 @@
         new_block = 0
 
         while not self.done():
-            fields = self.get_entry(entry_size)
-            # entry = self._get_entry_0_parents(&current_dirname, &new_block)
-            entry = self._fields_to_entry_0_parents(<PyListObject *>fields,
-                                                    0,
-                                                    &current_dirname,
-                                                    &new_block)
+            entry = self._get_entry_0_parents(&current_dirname, &new_block)
             if new_block:
                 # new block - different dirname
                 current_block = []



More information about the bazaar-commits mailing list