Rev 4414: Prototype a parser for bencoded revisions in lp:///~jameinel/bzr/revision_bencode_decoder
John Arbash Meinel
john at arbash-meinel.com
Thu Jun 4 21:01:40 BST 2009
At lp:///~jameinel/bzr/revision_bencode_decoder
------------------------------------------------------------
revno: 4414
revision-id: john at arbash-meinel.com-20090604200121-spqriscq81q1of6g
parent: john at arbash-meinel.com-20090604173830-e9j1rpv4euxkmzqr
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: revision_bencode_decoder
timestamp: Thu 2009-06-04 15:01:21 -0500
message:
Prototype a parser for bencoded revisions
This gives type safety, and can parse all of bzr.dev's revisions in 438 msec.
-------------- next part --------------
=== modified file '.bzrignore'
--- a/.bzrignore 2009-06-03 14:14:31 +0000
+++ b/.bzrignore 2009-06-04 20:01:21 +0000
@@ -41,6 +41,7 @@
bzrlib/_bencode_pyx.c
bzrlib/_btree_serializer_c.c
bzrlib/_chk_map_pyx.c
+bzrlib/_chk_serializer_pyx.c
bzrlib/_chunks_to_lines_pyx.c
bzrlib/_dirstate_helpers_c.c
bzrlib/_groupcompress_pyx.c
=== added file 'bzrlib/_chk_serializer_pyx.pyx'
--- a/bzrlib/_chk_serializer_pyx.pyx 1970-01-01 00:00:00 +0000
+++ b/bzrlib/_chk_serializer_pyx.pyx 2009-06-04 20:01:21 +0000
@@ -0,0 +1,297 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Pyrex implementation CHK_Serializer Revision decoding"""
+
+cdef extern from "python-compat.h":
+ pass
+
+cdef extern from "Python.h":
+ ctypedef int Py_ssize_t
+ int PyString_CheckExact(object o)
+ object PyString_FromStringAndSize(char *v, Py_ssize_t len)
+ char *PyString_AS_STRING(object o) except NULL
+ Py_ssize_t PyString_GET_SIZE(object o) except -1
+ object PyFloat_FromDouble(double)
+ object PyInt_FromLong(long)
+ object PyUnicode_DecodeUTF8Stateful(char *, Py_ssize_t, char *errors,
+ Py_ssize_t *consumed)
+
+ int PyList_Append(object, object) except -1
+
+cdef extern from "stdlib.h":
+ long strtol(char *, char **, int)
+ unsigned long strtoul(char *, char **, int)
+ double strtod(char *, char **)
+
+cdef extern from "string.h":
+ ctypedef unsigned long size_t
+ void *memcpy(void *dest, void *src, size_t count)
+ int memcmp(void *s1, void *s2, size_t n_bytes)
+
+from bzrlib import revision
+
+cdef class RevisionDecoder:
+ """Parse one bencoded revision text into a ``revision.Revision``.
+
+ The text is walked with raw C pointers. The object attributes below
+ are filled in as each key is encountered, and ``decode()`` passes them
+ to ``revision.Revision``.
+ """
+
+ # The string being parsed; 'cur' and 'end' point into its buffer.
+ cdef object _text
+ # Current parse position.
+ cdef char *cur
+ # One past the last byte still to be parsed (decode() moves this back by
+ # one to exclude the final 'e').
+ cdef char *end
+ # Length of _text, in bytes.
+ cdef Py_ssize_t size
+
+ cdef object committer # PyUnicode
+ cdef object properties # PyDict {PyString:PyUnicode}
+ cdef object timestamp # PyFloat
+ cdef object timezone # PyInt or PyNone
+ cdef object revision_id # PyString
+ cdef object parent_ids # PyList[PyString]
+ cdef object inventory_sha1 # PyString
+ cdef object message # PyUnicode
+
+ def __init__(self, text):
+     """Prepare to decode ``text``.
+
+     :param text: the bencoded revision; must be exactly a ``str``
+     :raises TypeError: if ``text`` is any other type
+     """
+     # Validate before touching any state: 'cur' and 'end' point into the
+     # buffer of whatever '_text' references, so '_text' must never be
+     # rebound to an object those pointers were not taken from.
+     if not PyString_CheckExact(text):
+         raise TypeError('only strings are supported.')
+     self._text = text
+     self.cur = PyString_AS_STRING(text)
+     self.size = PyString_GET_SIZE(text)
+     self.end = self.cur + self.size
+
+ cdef Py_ssize_t _get_str_len(self) except -1:
+     """Read the ``<digits>:`` prefix of a bencoded string.
+
+     On success self.cur is left pointing at the first byte of the string
+     payload.
+
+     :return: the declared payload length
+     :raises ValueError: if the prefix is malformed, or if the payload plus
+         at least one following byte does not fit before self.end
+     """
+     cdef Py_ssize_t thelen
+     cdef char* tail
+
+     if self.cur[0] < c'0' or self.cur[0] > c'9':
+         raise ValueError('string lengths are integers')
+     thelen = <Py_ssize_t>strtoul(self.cur, &tail, 10)
+     if tail == NULL or tail[0] != c':' or tail <= self.cur:
+         raise ValueError('string lengths are terminated by :')
+     if self.cur[0] == c'0' and tail - self.cur != 1:
+         raise ValueError('leading zeros not allowed')
+     # Move 'cur' past the :
+     self.cur = tail + 1
+     # Compare the length against the bytes remaining instead of computing
+     # 'cur + thelen', which can overflow the pointer for a huge length.
+     # The unsigned->signed cast above can also yield a negative value
+     # (including -1, which is this function's error sentinel).
+     if thelen < 0 or thelen >= self.end - self.cur:
+         raise ValueError('string length longer than remaining buffer')
+     return thelen
+
+ cdef _raise_unknown_key(self, char *key, Py_ssize_t key_len):
+     """Raise ValueError reporting the key that was not recognised.
+
+     :param key: pointer to the first byte of the key
+     :param key_len: number of bytes in the key
+     """
+     raise ValueError("unknown key: %s"
+                      % (PyString_FromStringAndSize(key, key_len),))
+
+ cdef Py_ssize_t _get_char(self, char **value) except -1:
+     """Consume one bencoded string and expose it as a raw byte range.
+
+     The length prefix is read first; self.cur then ends up just past the
+     last byte of the string.
+
+     :param value: out-parameter; value[0] receives a pointer to the first
+         byte of the string
+     :return: the number of bytes in the string
+     """
+     cdef Py_ssize_t n_bytes
+     cdef char *start
+
+     n_bytes = self._get_str_len()
+     start = self.cur
+     if start + n_bytes > self.end:
+         raise ValueError('Not enough bytes for value (need %d, have %d)'
+                          % (n_bytes, self.end - start))
+     self.cur = start + n_bytes
+     value[0] = start
+     return n_bytes
+
+ cdef object _get_utf8(self):
+     """Consume a bencoded string and return its bytes as a PyString.
+
+     No decoding is done: utf-8 values are kept as plain byte strings.
+     """
+     cdef char *start
+     cdef Py_ssize_t n_bytes
+
+     n_bytes = self._get_char(&start)
+     return PyString_FromStringAndSize(start, n_bytes)
+
+ cdef object _get_unicode(self):
+     """Consume a bencoded string and return it decoded from utf-8.
+
+     Decoding uses the "strict" error handler.
+     """
+     cdef char *start
+     cdef Py_ssize_t n_bytes
+
+     n_bytes = self._get_char(&start)
+     decoded = PyUnicode_DecodeUTF8Stateful(start, n_bytes, "strict", NULL)
+     return decoded
+
+ cdef object _get_int(self):
+     """Consume a bencoded integer (``i<digits>e``) and return a PyInt.
+
+     A leading '-' is accepted. self.cur is left just past the closing 'e'.
+
+     :raises ValueError: if there are fewer than 3 bytes left, the value
+         does not start with 'i' followed by '-' or a digit, or it is not
+         terminated by 'e'
+     """
+     # 'long' matches what strtol returns and what PyInt_FromLong takes, so
+     # the parsed value is not truncated through a narrower 'int'.
+     cdef long val
+     cdef char *tail
+
+     # The shortest possible integer is 3 bytes: 'i', one digit, 'e'.
+     if self.cur + 3 > self.end:
+         raise ValueError('not enough bytes for an integer')
+     if self.cur[0] != c'i':
+         raise ValueError('Not an integer')
+     self.cur = self.cur + 1
+     if self.cur[0] != c'-' and (self.cur[0] < c'0' or self.cur[0] > c'9'):
+         raise ValueError('First character is not a valid integer')
+     val = strtol(self.cur, &tail, 10)
+     # 'tail <= self.cur' means strtol consumed no digits at all.
+     if tail == NULL or tail <= self.cur or tail[0] != c'e':
+         raise ValueError('Integer not terminated with "e"')
+     self.cur = tail + 1
+     return PyInt_FromLong(val)
+
+ cdef object _get_timestamp(self):
+     """Consume the timestamp and return it as a PyFloat.
+
+     Timestamp is special because it is a float, but it is encoded as a
+     bencoded string.
+
+     :raises ValueError: if the string is 32 bytes or longer, or if it is
+         not, in its entirety, a number strtod can parse
+     """
+     cdef double timestamp
+     cdef char *val, *tail, *start
+     cdef Py_ssize_t val_len
+     cdef char buf[32]
+
+     val_len = self._get_char(&val)
+     # Strictly less than 32 so there is room for the terminator below.
+     if val_len >= 32:
+         raise ValueError('timestamp string length too long: %d'
+                          % (val_len,))
+     # We copy the buffer, because it isn't null terminated, and we don't
+     # know what strtod would do with a trailing 'e'
+     memcpy(buf, val, val_len)
+     # Terminate with a NUL byte. Writing the digit '0' here would both
+     # change the value and leave strtod reading past the copied bytes.
+     buf[val_len] = c'\0'
+     start = buf
+     timestamp = strtod(start, &tail)
+     # strtod must have consumed something, and must have stopped exactly
+     # at the terminator; anything else means the text was not a float.
+     if tail == start or tail != start + val_len:
+         raise ValueError('timestamp is not a valid float')
+     return PyFloat_FromDouble(timestamp)
+
+ cdef object _get_parent_ids(self):
+     """Consume a bencoded list of strings and return it as a PyList.
+
+     self.cur is left just past the list's closing 'e'.
+     """
+     if self.end < self.cur + 2:
+         raise ValueError('not enough space for parent_ids')
+     if self.cur[0] != c'l':
+         raise ValueError('parent_ids is encoded as a list')
+     # Step over the opening 'l'
+     self.cur = self.cur + 1
+     parent_ids = []
+     while True:
+         if self.cur[0] == c'e':
+             break
+         PyList_Append(parent_ids, self._get_utf8())
+     # Step over the closing 'e'
+     self.cur = self.cur + 1
+     return parent_ids
+
+ cdef object _get_properties(self):
+     """Consume a bencoded dict and return it as {PyString: PyUnicode}.
+
+     Keys are kept as byte strings; values are decoded from utf-8.
+     self.cur is left just past the dict's closing 'e'.
+
+     :raises ValueError: if fewer than 2 bytes remain or the value does not
+         start with 'd'
+     """
+     if self.cur + 2 > self.end:
+         raise ValueError('not enough space for properties')
+     if self.cur[0] != c'd':
+         raise ValueError('properties is encoded as a dict')
+     self.cur = self.cur + 1
+     props = {}
+     while self.cur[0] != c'e':
+         key = self._get_utf8()
+         value = self._get_unicode()
+         # Store the pair; without this the result is always an empty dict.
+         props[key] = value
+     self.cur = self.cur + 1
+     return props
+
+ cdef _handle_key(self, char *key, Py_ssize_t key_len):
+     """Parse the value that follows ``key`` and store it on self.
+
+     Dispatch is on the key length first and then on a memcmp, so no Python
+     string has to be built for the key on the normal path.
+
+     :param key: pointer to the first byte of the key
+     :param key_len: number of bytes in the key
+     :raises ValueError: if the key is not recognised, has already been
+         seen, or (for inventory-sha1) the value is not 40 bytes long
+     """
+     if key_len == 7:
+         if memcmp("message", key, key_len) == 0:
+             if self.message is not None:
+                 raise ValueError("message given 2x")
+             self.message = self._get_unicode()
+             return
+     elif key_len == 8:
+         if memcmp("timezone", key, key_len) == 0:
+             if self.timezone is not None:
+                 raise ValueError("timezone given 2x")
+             self.timezone = self._get_int()
+             return
+     elif key_len == 9:
+         if memcmp("timestamp", key, key_len) == 0:
+             if self.timestamp is not None:
+                 raise ValueError("timestamp given 2x")
+             self.timestamp = self._get_timestamp()
+             return
+         elif memcmp("committer", key, key_len) == 0:
+             if self.committer is not None:
+                 raise ValueError("committer given 2x")
+             self.committer = self._get_unicode()
+             return
+     elif key_len == 10:
+         if memcmp("properties", key, key_len) == 0:
+             if self.properties is not None:
+                 raise ValueError("properties given 2x")
+             self.properties = self._get_properties()
+             return
+         elif memcmp("parent-ids", key, key_len) == 0:
+             if self.parent_ids is not None:
+                 raise ValueError("parent_ids given 2x")
+             self.parent_ids = self._get_parent_ids()
+             return
+     elif key_len == 11:
+         if memcmp("revision-id", key, key_len) == 0:
+             if self.revision_id is not None:
+                 raise ValueError("revision_id given 2x")
+             self.revision_id = self._get_utf8()
+             return
+     elif key_len == 14:
+         if memcmp("inventory-sha1", key, key_len) == 0:
+             if self.inventory_sha1 is not None:
+                 raise ValueError("inventory_sha1 given 2x")
+             self.inventory_sha1 = self._get_utf8()
+             if PyString_GET_SIZE(self.inventory_sha1) != 40:
+                 raise ValueError('inventory_sha1 not of length 40')
+             return
+     # Every recognised key returned above; anything reaching here is
+     # either an unknown length or a length match with different bytes.
+     self._raise_unknown_key(key, key_len)
+
+ cdef _next(self):
+     """Consume one ``l<key><value>e`` pair and store its value on self.
+
+     :raises ValueError: if fewer than 2 bytes remain, the pair does not
+         start with 'l', or it is not closed with 'e'
+     """
+     cdef char *key_start
+     cdef Py_ssize_t key_size
+
+     if self.cur + 2 > self.end:
+         raise ValueError('no room for a key/value pair')
+     if self.cur[0] != c'l':
+         raise ValueError('a key value pair is encoded as a list.')
+     # Step over the opening 'l'
+     self.cur = self.cur + 1
+     key_size = self._get_char(&key_start)
+     self._handle_key(key_start, key_size)
+     if self.cur[0] == c'e':
+         # Step over the closing 'e'
+         self.cur = self.cur + 1
+     else:
+         raise ValueError('key/value pair was not terminated with "e"')
+
+ cdef _raise_if_None(self, char *name, object val):
+     """Raise ValueError naming ``name`` when ``val`` is None.
+
+     :param name: the attribute name to put in the error message
+     :param val: the value to check
+     """
+     if val is not None:
+         return
+     raise ValueError("%s not present" % (name,))
+
+ cdef _validate_attributes(self):
+ """Check that every required attribute was found in the text.
+
+ Attributes are checked in the order below, so the first one still set
+ to None is the one reported.
+
+ :raises ValueError: via _raise_if_None, for the first missing attribute
+ """
+ self._raise_if_None("committer", self.committer)
+ self._raise_if_None("properties", self.properties)
+ self._raise_if_None("timestamp", self.timestamp)
+ # timezone is allowed to be missing/None
+ self._raise_if_None("revision_id", self.revision_id)
+ self._raise_if_None("parent_ids", self.parent_ids)
+ self._raise_if_None("inventory_sha1", self.inventory_sha1)
+ self._raise_if_None("message", self.message)
+
+ def decode(self):
+     """Parse the whole text and build a ``revision.Revision`` from it.
+
+     The text must be a bencode list whose first item is exactly
+     ``l6:formati10ee``. Every remaining item is handed to ``_next()``,
+     and the collected attributes are validated before the Revision is
+     created.
+
+     :return: the ``revision.Revision`` built from the parsed attributes
+     :raises ValueError: if the text is empty, is not a list, does not
+         start with the format 10 marker, or fails any later check
+     """
+     if self.size <= 0:
+         raise ValueError('empty strings can\'t be decoded :)')
+     if not (self.cur[0] == c'l' and self.end[-1] == c'e'):
+         raise ValueError('Revisions are encoded as a bencode list.')
+     # Step over the outer list's opening 'l'
+     self.cur = self.cur + 1
+     # The first entry must be the format, set to 10
+     if self.end - self.cur < 14:
+         header_ok = False
+     else:
+         header_ok = (memcmp("l6:formati10ee", self.cur, 14) == 0)
+     if not header_ok:
+         raise ValueError('Revisions must start with format, and must'
+                          ' be format 10')
+     self.cur = self.cur + 14
+     # Exclude the outer list's closing 'e' from what is left to parse
+     self.end = self.end - 1
+     while self.cur < self.end:
+         self._next()
+     self._validate_attributes()
+     rev = revision.Revision(
+         committer=self.committer,
+         properties=self.properties,
+         timestamp=self.timestamp,
+         timezone=self.timezone,
+         revision_id=self.revision_id,
+         parent_ids=self.parent_ids,
+         inventory_sha1=self.inventory_sha1,
+         message=self.message,
+         )
+     return rev
+
+def read_revision_from_string(text):
+    """Decode ``text`` with a fresh RevisionDecoder and return the result.
+
+    :param text: the bencoded revision, as a ``str``
+    """
+    decoder = RevisionDecoder(text)
+    return decoder.decode()
=== modified file 'setup.py'
--- a/setup.py 2009-06-03 14:14:31 +0000
+++ b/setup.py 2009-06-04 20:01:21 +0000
@@ -262,6 +262,7 @@
add_pyrex_extension('bzrlib._bencode_pyx')
add_pyrex_extension('bzrlib._btree_serializer_c')
add_pyrex_extension('bzrlib._chk_map_pyx', libraries=['z'])
+add_pyrex_extension('bzrlib._chk_serializer_pyx')
add_pyrex_extension('bzrlib._chunks_to_lines_pyx')
add_pyrex_extension('bzrlib._groupcompress_pyx',
extra_source=['bzrlib/diff-delta.c'])
More information about the bazaar-commits
mailing list