Rev 4444: Hack together a 'quick parser' for extracting the revision-id field in http://bazaar.launchpad.net/~jameinel/bzr/1.17-faster-branch
John Arbash Meinel
john at arbash-meinel.com
Mon Jun 15 21:21:43 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/1.17-faster-branch
------------------------------------------------------------
revno: 4444
revision-id: john at arbash-meinel.com-20090615202124-o9wf7lioc6kantzj
parent: pqm at pqm.ubuntu.com-20090615170647-5zu6h93br8c4ue6i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-faster-branch
timestamp: Mon 2009-06-15 15:21:24 -0500
message:
Hack together a 'quick parser' for extracting the revision-id field
from a given 'value' field for a chk entry.
It speeds up 'time bzr branch qbzr-trunk' from 2.6s to 2.3s...
-------------- next part --------------
=== modified file 'bzrlib/_groupcompress_pyx.pyx'
--- a/bzrlib/_groupcompress_pyx.pyx 2009-06-10 03:56:49 +0000
+++ b/bzrlib/_groupcompress_pyx.pyx 2009-06-15 20:21:24 +0000
@@ -23,7 +23,12 @@
cdef extern from "Python.h":
ctypedef int Py_ssize_t # Required for older pyrex versions
+ ctypedef struct PyObject:
+ pass
int PyString_CheckExact(object)
+ int PyTuple_CheckExact(object)
+ Py_ssize_t PyTuple_GET_SIZE(object)
+ PyObject * PyTuple_GET_ITEM(object, Py_ssize_t)
char * PyString_AS_STRING(object)
Py_ssize_t PyString_GET_SIZE(object)
object PyString_FromStringAndSize(char *, Py_ssize_t)
@@ -35,6 +40,8 @@
void * realloc(void *, size_t)
void free(void *)
void memcpy(void *, void *, size_t)
+ void *memchr(void *, int, size_t)
+ int memcmp(void *, void*, size_t)
cdef extern from "delta.h":
@@ -461,3 +468,69 @@
return val, offset
+def _file_key_bytes_to_revision(file_key, bytes):
+    """Extract the revision-id field from a chk inventory entry value.
+
+    :param file_key: A 1-tuple (file_id,) whose only item is an exact str.
+    :param bytes: The serialised entry value as an exact str; its layout is
+        shown in the comment at the top of the body.
+    :return: A new str holding the revision-id field.
+    :raises TypeError: If file_key is not a tuple containing a str, or if
+        bytes is not a str.
+    :raises ValueError: If file_key does not have exactly one item, if bytes
+        is truncated or malformed, or if the file-id stored in bytes differs
+        from file_key[0].
+    """
+    cdef PyObject *temp
+    cdef char *bytes_chr
+    cdef char *bytes_end_chr
+    cdef char *file_id_chr
+    cdef Py_ssize_t file_id_len
+    cdef char *colon_chr
+    cdef char *newline_chr
+    cdef char *next_field_chr
+
+    # The format is:
+    # <kind>: <file-id>\n<parent-id>\n<name>\n<revision-id>\nKIND_SPECIFIC
+    if not PyString_CheckExact(bytes):
+        raise TypeError('bytes must be a string.')
+    if not PyTuple_CheckExact(file_key):
+        raise TypeError('file_key must be a tuple')
+    if PyTuple_GET_SIZE(file_key) != 1:
+        raise ValueError('file_key should only have 1 entry')
+    # PyTuple_GET_ITEM is declared as returning a raw PyObject *, so cast it
+    # to an object before handing it to Python-level code.
+    temp = PyTuple_GET_ITEM(file_key, 0)
+    file_id = <object>temp
+    if not PyString_CheckExact(file_id):
+        raise TypeError('file_key should contain a single string.')
+    file_id_chr = PyString_AS_STRING(file_id)
+    file_id_len = PyString_GET_SIZE(file_id)
+    bytes_chr = PyString_AS_STRING(bytes)
+    bytes_end_chr = bytes_chr + PyString_GET_SIZE(bytes)
+
+    colon_chr = <char*>memchr(bytes_chr, c':', bytes_end_chr - bytes_chr)
+    if (colon_chr == NULL or bytes_end_chr - colon_chr < 2
+        or colon_chr[1] != c' '):
+        raise ValueError('bytes must contain a ": "')
+    # TODO: We could check that the kind field is valid
+    next_field_chr = colon_chr + 2
+    # The file-id and the newline that ends it must both lie inside the
+    # buffer before memcmp or indexing is allowed to touch them.
+    if bytes_end_chr - next_field_chr < file_id_len + 1:
+        raise ValueError('bytes is too short to contain the file_id: %r'
+                         % (bytes,))
+    if (memcmp(file_id_chr, next_field_chr, file_id_len)
+        or next_field_chr[file_id_len] != c'\n'):
+        file_id_from_bytes = PyString_FromStringAndSize(next_field_chr,
+                                                        file_id_len)
+        raise ValueError('file_id_key did not match file_id in bytes string:'
+                         '\n %s\n != %r' % (file_key, file_id_from_bytes))
+    next_field_chr = next_field_chr + file_id_len + 1
+    # Skip over the parent_id field.  Each search below is limited to the
+    # bytes remaining after next_field_chr, so memchr stays in the buffer.
+    newline_chr = <char*>memchr(next_field_chr, c'\n',
+                                bytes_end_chr - next_field_chr)
+    if newline_chr == NULL:
+        raise ValueError('Missing parent_id field')
+    # Skip over the name field to reach the revision_id.
+    next_field_chr = newline_chr + 1
+    newline_chr = <char*>memchr(next_field_chr, c'\n',
+                                bytes_end_chr - next_field_chr)
+    if newline_chr == NULL:
+        raise ValueError('Missing name field: %r' % (bytes,))
+    next_field_chr = newline_chr + 1
+    newline_chr = <char*>memchr(next_field_chr, c'\n',
+                                bytes_end_chr - next_field_chr)
+    if newline_chr == NULL:
+        # The revision_id may be the last field, running to the end of the
+        # value, but in that case it must not be empty.
+        if bytes_end_chr <= next_field_chr:
+            raise ValueError('Missing revision_id field: %r' % (bytes,))
+        newline_chr = bytes_end_chr
+    return PyString_FromStringAndSize(next_field_chr,
+                                      newline_chr - next_field_chr)
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py 2009-06-12 01:11:00 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py 2009-06-15 20:21:24 +0000
@@ -925,15 +925,17 @@
uninteresting_pid_root_keys.add(
inv.parent_id_basename_to_file_id.key())
bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
+ from bzrlib._groupcompress_pyx import _file_key_bytes_to_revision
+ bytes_to_info = _file_key_bytes_to_revision
chk_bytes = self.from_repository.chk_bytes
def _filter_id_to_entry():
for record, items in chk_map.iter_interesting_nodes(chk_bytes,
self._chk_id_roots, uninteresting_root_keys):
- for name, bytes in items:
+ for file_key, bytes in items:
# Note: we don't care about name_utf8, because we are always
# rich-root = True
- _, file_id, revision_id = bytes_to_info(bytes)
- self._text_keys.add((file_id, revision_id))
+ revision_id = bytes_to_info(file_key, bytes)
+ self._text_keys.add((file_key[0], revision_id))
if record is not None:
yield record
# Consumed
More information about the bazaar-commits mailing list