Rev 4444: Hack together a 'quick parser' for extracting the revision-id field in http://bazaar.launchpad.net/~jameinel/bzr/1.17-faster-branch

Mon Jun 15 21:21:43 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/1.17-faster-branch

------------------------------------------------------------
revno: 4444
revision-id: john at arbash-meinel.com-20090615202124-o9wf7lioc6kantzj
parent: pqm at pqm.ubuntu.com-20090615170647-5zu6h93br8c4ue6i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-faster-branch
timestamp: Mon 2009-06-15 15:21:24 -0500
message:
  Hack together a 'quick parser' for extracting the revision-id field
  from a given 'value' field for a chk entry.
  It speeds up 'time bzr branch qbzr-trunk' from 2.6s to 2.3s...
-------------- next part --------------
=== modified file 'bzrlib/_groupcompress_pyx.pyx'

--- a/bzrlib/_groupcompress_pyx.pyx	2009-06-10 03:56:49 +0000
+++ b/bzrlib/_groupcompress_pyx.pyx	2009-06-15 20:21:24 +0000
@@ -23,7 +23,12 @@
 
 cdef extern from "Python.h":
     ctypedef int Py_ssize_t # Required for older pyrex versions
+    ctypedef struct PyObject:
+        pass
     int PyString_CheckExact(object)
+    int PyTuple_CheckExact(object)
+    Py_ssize_t PyTuple_GET_SIZE(object)
+    PyObject * PyTuple_GET_ITEM(object, Py_ssize_t)
     char * PyString_AS_STRING(object)
     Py_ssize_t PyString_GET_SIZE(object)
     object PyString_FromStringAndSize(char *, Py_ssize_t)
@@ -35,6 +40,8 @@
     void * realloc(void *, size_t)
     void free(void *)
     void memcpy(void *, void *, size_t)
+    void *memchr(void *, int, size_t)
+    int memcmp(void *, void*, size_t)
 
 
 cdef extern from "delta.h":
@@ -461,3 +468,89 @@
     return val, offset
 
 
+def _file_key_bytes_to_revision(file_key, bytes):
+    """Extract the revision-id field from the value bytes of a chk entry.
+
+    :param file_key: A 1-tuple containing the file-id as an exact str.
+    :param bytes: The entry value as an exact str. See the format comment
+        in the body for the expected layout.
+    :return: A new str holding the revision-id field.
+    :raises TypeError: If bytes is not a str, file_key is not a tuple, or
+        the entry in file_key is not a str.
+    :raises ValueError: If file_key does not have exactly one entry, bytes
+        has no ": " separator, the file-id in bytes does not match
+        file_key, or the parent-id, name or revision-id field is missing.
+    """
+    cdef PyObject *temp
+    cdef char *bytes_chr
+    cdef char *bytes_end_chr
+    cdef Py_ssize_t bytes_len
+    cdef char *file_id_chr
+    cdef Py_ssize_t file_id_len
+    cdef Py_ssize_t remaining
+    cdef char* colon_chr
+    cdef char* newline_chr
+    cdef char* next_field_chr
+
+    # The format is:
+    #   <kind>: <file-id>\n<parent-id>\n<name>\n<revision-id>\nKIND_SPECIFIC
+    if not PyString_CheckExact(bytes):
+        raise TypeError('bytes must be a string.')
+    if not PyTuple_CheckExact(file_key):
+        raise TypeError('file_key must be a tuple')
+    if not PyTuple_GET_SIZE(file_key) == 1:
+        raise ValueError('file_key should only have 1 entry')
+    temp = PyTuple_GET_ITEM(file_key, 0)
+    file_id = <object>temp
+    if not PyString_CheckExact(file_id):
+        raise TypeError('file_key should contain a single string.')
+    file_id_chr = PyString_AS_STRING(file_id)
+    file_id_len = PyString_GET_SIZE(file_id)
+    bytes_chr = PyString_AS_STRING(bytes)
+    bytes_len = PyString_GET_SIZE(bytes)
+    bytes_end_chr = bytes_chr + bytes_len
+
+    colon_chr = <char*>memchr(bytes_chr, c':', bytes_len)
+    # colon_chr[1] is readable even when the colon is the last byte, because
+    # str buffers carry a trailing NUL.
+    if colon_chr == NULL or colon_chr[1] != c' ':
+        raise ValueError('bytes must contain a ": "')
+    # TODO: We could check that the kind field is valid
+    next_field_chr = colon_chr + 2
+    remaining = bytes_end_chr - next_field_chr
+    # The file-id plus its terminating newline must fit in what is left,
+    # otherwise memcmp would read past the end of bytes.
+    if (remaining <= file_id_len
+        or memcmp(file_id_chr, next_field_chr, file_id_len)
+        or next_field_chr[file_id_len] != c'\n'):
+        # Never report more than is actually present in bytes.
+        if remaining > file_id_len:
+            remaining = file_id_len
+        file_id_from_bytes = PyString_FromStringAndSize(next_field_chr,
+                                                        remaining)
+        raise ValueError('file_id_key did not match file_id in bytes string:'
+                         '\n   %s\n != %r' % (file_key, file_id_from_bytes))
+    next_field_chr = next_field_chr + file_id_len + 1
+    # Each search is bounded by the bytes left after next_field_chr, so that
+    # memchr never scans beyond bytes_end_chr.
+    newline_chr = <char*>memchr(next_field_chr, c'\n',
+                                bytes_end_chr - next_field_chr)
+    if newline_chr == NULL:
+        raise ValueError('Missing parent_id field')
+    # Go past the newline ending the parent_id, to reach the name
+    next_field_chr = newline_chr + 1
+    newline_chr = <char*>memchr(next_field_chr, c'\n',
+                                bytes_end_chr - next_field_chr)
+    if newline_chr == NULL:
+        raise ValueError('Missing name field: %r' % (bytes,))
+    # Go past the newline ending the name, to reach the revision_id
+    next_field_chr = newline_chr + 1
+    newline_chr = <char*>memchr(next_field_chr, c'\n',
+                                bytes_end_chr - next_field_chr)
+    if newline_chr == NULL:
+        # No kind-specific data follows; the revision_id runs to the end
+        if next_field_chr >= bytes_end_chr:
+            raise ValueError('Missing revision_id field: %r' % (bytes,))
+        newline_chr = bytes_end_chr
+    return PyString_FromStringAndSize(next_field_chr,
+                                      newline_chr - next_field_chr)

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-06-12 01:11:00 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-06-15 20:21:24 +0000
@@ -925,15 +925,17 @@
                 uninteresting_pid_root_keys.add(
                     inv.parent_id_basename_to_file_id.key())
         bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
+        from bzrlib._groupcompress_pyx import _file_key_bytes_to_revision
+        bytes_to_info = _file_key_bytes_to_revision
         chk_bytes = self.from_repository.chk_bytes
         def _filter_id_to_entry():
             for record, items in chk_map.iter_interesting_nodes(chk_bytes,
                         self._chk_id_roots, uninteresting_root_keys):
-                for name, bytes in items:
+                for file_key, bytes in items:
                     # Note: we don't care about name_utf8, because we are always
                     # rich-root = True
-                    _, file_id, revision_id = bytes_to_info(bytes)
-                    self._text_keys.add((file_id, revision_id))
+                    revision_id = bytes_to_info(file_key, bytes)
+                    self._text_keys.add((file_key[0], revision_id))
                 if record is not None:
                     yield record
             # Consumed