Rev 4414: Prototype a parser for bencoded revisions in lp:///~jameinel/bzr/revision_bencode_decoder

Thu Jun 4 21:01:40 BST 2009

At lp:///~jameinel/bzr/revision_bencode_decoder

------------------------------------------------------------
revno: 4414
revision-id: john at arbash-meinel.com-20090604200121-spqriscq81q1of6g
parent: john at arbash-meinel.com-20090604173830-e9j1rpv4euxkmzqr
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: revision_bencode_decoder
timestamp: Thu 2009-06-04 15:01:21 -0500
message:
  Prototype a parser for bencoded revisions
  This gives type safety, and can parse all of bzr.dev's revisions in 438 msec.
-------------- next part --------------
=== modified file '.bzrignore'

--- a/.bzrignore	2009-06-03 14:14:31 +0000
+++ b/.bzrignore	2009-06-04 20:01:21 +0000
@@ -41,6 +41,7 @@
 bzrlib/_bencode_pyx.c
 bzrlib/_btree_serializer_c.c
 bzrlib/_chk_map_pyx.c
+bzrlib/_chk_serializer_pyx.c
 bzrlib/_chunks_to_lines_pyx.c
 bzrlib/_dirstate_helpers_c.c
 bzrlib/_groupcompress_pyx.c

=== added file 'bzrlib/_chk_serializer_pyx.pyx'
--- a/bzrlib/_chk_serializer_pyx.pyx	1970-01-01 00:00:00 +0000
+++ b/bzrlib/_chk_serializer_pyx.pyx	2009-06-04 20:01:21 +0000
@@ -0,0 +1,297 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""Pyrex implementation CHK_Serializer Revision decoding"""
+
+cdef extern from "python-compat.h":
+    pass
+
+cdef extern from "Python.h":
+    ctypedef int  Py_ssize_t
+    int PyString_CheckExact(object o)
+    object PyString_FromStringAndSize(char *v, Py_ssize_t len)
+    char *PyString_AS_STRING(object o) except NULL
+    Py_ssize_t PyString_GET_SIZE(object o) except -1
+    object PyFloat_FromDouble(double)
+    object PyInt_FromLong(long)
+    object PyUnicode_DecodeUTF8Stateful(char *, Py_ssize_t, char *errors,
+                                        Py_ssize_t *consumed)
+
+    int PyList_Append(object, object) except -1
+
+cdef extern from "stdlib.h":
+    long strtol(char *, char **, int)
+    unsigned long strtoul(char *, char **, int)
+    double strtod(char *, char **)
+
+cdef extern from "string.h":
+    ctypedef unsigned long size_t
+    void *memcpy(void *dest, void *src, size_t count)
+    int memcmp(void *s1, void *s2, size_t n_bytes)
+
+from bzrlib import revision
+
+cdef class RevisionDecoder:
+
+    cdef object _text
+    cdef char *cur
+    cdef char *end
+    cdef Py_ssize_t size
+
+    cdef object committer   # PyUnicode
+    cdef object properties  # PyDict {PyString:PyUnicode}
+    cdef object timestamp   # PyFloat
+    cdef object timezone    # PyInt or PyNone
+    cdef object revision_id # PyString
+    cdef object parent_ids  # PyList[PyString]
+    cdef object inventory_sha1 # PyString
+    cdef object message     # PyUnicode
+
+    def __init__(self, text):
+        self._text = text  # stored on the instance next to the raw pointers below
+        if not PyString_CheckExact(text):  # only an exact str is accepted
+            raise TypeError('only strings are supported.')
+        self.cur = PyString_AS_STRING(text)  # parse cursor starts at the first byte
+        self.size = PyString_GET_SIZE(text)
+        self.end = self.cur + self.size  # one past the last byte of text
+
+    cdef Py_ssize_t _get_str_len(self) except -1:
+        cdef Py_ssize_t thelen
+        cdef char* tail
+        # Parse "<digits>:" at cur; return the length and leave cur on the payload.
+        if self.cur[0] < c'0' or self.cur[0] > c'9':
+            raise ValueError('string lengths are integers')
+        thelen = <Py_ssize_t>strtoul(self.cur, &tail, 10)
+        if tail == NULL or tail[0] != c':' or tail <= self.cur:
+            raise ValueError('string lengths are terminated by :')
+        if self.cur[0] == c'0' and tail - self.cur != 1:
+            raise ValueError('leading zeros not allowed')
+        # Move 'cur' past the ':'.  A huge length can wrap negative in the cast.
+        self.cur = tail + 1
+        if thelen < 0 or thelen >= self.end - self.cur:  # no pointer overflow
+            raise ValueError('string length longer than remaining buffer')
+        return thelen
+
+    cdef _raise_unknown_key(self, char *key, Py_ssize_t key_len):
+        s = PyString_FromStringAndSize(key, key_len)  # copy exactly key_len bytes
+        raise ValueError("unknown key: %s" % (s,))  # this helper always raises
+
+    cdef Py_ssize_t _get_char(self, char **value) except -1:
+        """Validate a string value.
+
+        This reads the length prefix of the string, and moves self.cur to point
+        just past the end of the value.
+
+        :param value: out-parameter, set to point at the start of the value
+        :return: the length of the value
+        """
+        cdef Py_ssize_t value_len
+        value_len = self._get_str_len()  # leaves self.cur on the first value byte
+        if self.cur + value_len > self.end:
+            raise ValueError('Not enough bytes for value (need %d, have %d)'
+                             % (value_len, self.end - self.cur))
+        value[0] = self.cur
+        self.cur = self.cur + value_len
+        return value_len
+
+    cdef object _get_utf8(self):
+        # utf-8 values are just PyStrings
+        cdef Py_ssize_t value_len
+        cdef char *value
+        # Read one string value and copy its bytes out without decoding them.
+        value_len = self._get_char(&value)
+        return PyString_FromStringAndSize(value, value_len)
+
+    cdef object _get_unicode(self):
+        cdef Py_ssize_t value_len
+        cdef char *value
+        # Read one string value and decode it as strict UTF-8 into a unicode.
+        value_len = self._get_char(&value)
+        return PyUnicode_DecodeUTF8Stateful(value, value_len, "strict", NULL)
+
+    cdef object _get_int(self):
+        cdef long val
+        cdef char *tail
+        # Decode a bencoded integer "i<digits>e" at self.cur into a PyInt.
+        if self.cur + 3 > self.end:
+            raise ValueError('not enough bytes for an integer')
+        # A leading '-' is accepted below.  val is a C long so the strtol
+        # result reaches PyInt_FromLong without being narrowed to int.
+        if self.cur[0] != c'i':
+            raise ValueError('Not an integer')
+        self.cur = self.cur + 1
+        if self.cur[0] != c'-' and (self.cur[0] < c'0' or self.cur[0] > c'9'):
+            raise ValueError('First character is not a valid integer')
+        val = strtol(self.cur, &tail, 10)
+        if tail == NULL or tail <= self.cur or tail[0] != c'e':
+            raise ValueError('Integer not terminated with "e"')
+        self.cur = tail + 1  # step over the terminating 'e'
+        return PyInt_FromLong(val)
+
+    cdef object _get_timestamp(self):
+        cdef double timestamp
+        cdef char *val
+        cdef Py_ssize_t val_len
+        cdef char buf[32]
+        # Decode the timestamp value into a PyFloat.
+        # Timestamp is special because it is a float, but it is encoded as a
+        # string
+        val_len = self._get_char(&val)
+        if val_len >= 32:  # keep one byte of buf free for the terminator
+            raise ValueError('timestamp string length too long: %d'
+                             % (val_len,))
+        # We copy the buffer, because it isn't NUL terminated, and we don't
+        # know what strtod would do with a trailing 'e'
+        memcpy(buf, val, val_len)
+        buf[val_len] = 0  # NUL-terminate; c'0' is the digit, not a terminator
+        timestamp = strtod(buf, NULL)
+        return PyFloat_FromDouble(timestamp)
+
+    cdef object _get_parent_ids(self):
+        """Parent_ids is a list of strings."""
+        if self.cur + 2 > self.end:  # need at least the 'l' and the closing 'e'
+            raise ValueError('not enough space for parent_ids')
+        if self.cur[0] != c'l':
+            raise ValueError('parent_ids is encoded as a list')
+        self.cur = self.cur + 1
+        val = []
+        while self.cur[0] != c'e':  # each entry is a length-prefixed string
+            PyList_Append(val, self._get_utf8())
+        self.cur = self.cur + 1  # step over the list's closing 'e'
+        return val
+
+    cdef object _get_properties(self):  # decode a 'd...e' dict of properties
+        if self.cur + 2 > self.end:  # need at least the 'd' and the closing 'e'
+            raise ValueError('not enough space for properties')
+        if self.cur[0] != c'd':
+            raise ValueError('properties is encoded as a dict')
+        self.cur = self.cur + 1
+        props = {}
+        while self.cur[0] != c'e':
+            key = self._get_utf8()  # keys stay utf-8 str; must be read first
+            props[key] = self._get_unicode()  # store the pair (was discarded)
+        self.cur = self.cur + 1  # step over the dict's closing 'e'
+        return props
+
+    cdef _handle_key(self, char *key, Py_ssize_t key_len):  # parse key's value
+        # Dispatch on key_len first so each key needs at most two memcmp calls.
+        if key_len == 7:
+            if memcmp("message", key, key_len) == 0:
+                if self.message is not None:
+                    raise ValueError("message given 2x")
+                self.message = self._get_unicode()
+                return
+        elif key_len == 8:
+            if memcmp("timezone", key, key_len) == 0:
+                if self.timezone is not None:
+                    raise ValueError("timezone given 2x")
+                self.timezone = self._get_int()
+                return
+        elif key_len == 9:
+            if memcmp("timestamp", key, key_len) == 0:
+                if self.timestamp is not None:
+                    raise ValueError("timestamp given 2x")
+                self.timestamp = self._get_timestamp()
+                return
+            elif memcmp("committer", key, key_len) == 0:
+                if self.committer is not None:
+                    raise ValueError("committer given 2x")
+                self.committer = self._get_unicode()
+                return
+        elif key_len == 10:
+            if memcmp("properties", key, key_len) == 0:
+                if self.properties is not None:
+                    raise ValueError("properties given 2x")
+                self.properties = self._get_properties()
+                return
+            elif memcmp("parent-ids", key, key_len) == 0:
+                if self.parent_ids is not None:
+                    raise ValueError("parent_ids given 2x")
+                self.parent_ids = self._get_parent_ids()
+                return
+        elif key_len == 11:
+            if memcmp("revision-id", key, key_len) == 0:
+                if self.revision_id is not None:
+                    raise ValueError("revision_id given 2x")
+                self.revision_id = self._get_utf8()
+                return
+        elif key_len == 14:
+            if memcmp("inventory-sha1", key, key_len) == 0:
+                if self.inventory_sha1 is not None:
+                    raise ValueError("inventory_sha1 given 2x")
+                self.inventory_sha1 = self._get_utf8()
+                if PyString_GET_SIZE(self.inventory_sha1) != 40:
+                    raise ValueError('inventory_sha1 not of length 40')
+                return
+        self._raise_unknown_key(key, key_len)  # nothing matched: always raises
+
+    cdef _next(self):  # consume one "l<key><value>e" entry at self.cur
+        cdef char *key
+        cdef Py_ssize_t key_len
+        if self.end - self.cur < 2:
+            raise ValueError('no room for a key/value pair')
+        if self.cur[0] != c'l':
+            raise ValueError('a key value pair is encoded as a list.')
+        self.cur = self.cur + 1
+        key_len = self._get_char(&key)  # key points into the buffer, no copy
+        self._handle_key(key, key_len)  # parses the value and stores it on self
+        if self.cur[0] != c'e':
+            raise ValueError('key/value pair was not terminated with "e"')
+        self.cur = self.cur + 1  # step over the pair's closing 'e'
+
+    cdef _raise_if_None(self, char *name, object val):
+        if val is None:  # the field named by name was never set
+            raise ValueError("%s not present" % (name,))
+
+    cdef _validate_attributes(self):  # ValueError if a required field is None
+        self._raise_if_None("committer", self.committer)
+        self._raise_if_None("properties", self.properties)
+        self._raise_if_None("timestamp", self.timestamp)
+        # timezone is allowed to be missing/None
+        self._raise_if_None("revision_id", self.revision_id)
+        self._raise_if_None("parent_ids", self.parent_ids)
+        self._raise_if_None("inventory_sha1", self.inventory_sha1)
+        self._raise_if_None("message", self.message)
+
+    def decode(self):  # parse the whole text and build a revision.Revision
+        if self.size <= 0:
+            raise ValueError('empty strings can\'t be decoded :)')
+        if self.cur[0] != c'l' or self.end[-1] != c'e':  # outer 'l' ... 'e'
+            raise ValueError('Revisions are encoded as a bencode list.')
+        self.cur = self.cur + 1
+        # The first entry must be the format, set to 10
+        if (self.end - self.cur < 14
+            or memcmp("l6:formati10ee", self.cur, 14) != 0):
+            raise ValueError('Revisions must start with format, and must'
+                ' be format 10')
+        self.cur = self.cur + 14  # skip the 14-byte format entry just matched
+        self.end = self.end - 1  # exclude the outer list's closing 'e'
+        while self.cur < self.end:  # one key/value entry per iteration
+            self._next()
+        self._validate_attributes()  # every field except timezone is required
+        return revision.Revision(
+            committer=self.committer,
+            properties=self.properties,
+            timestamp=self.timestamp,
+            timezone=self.timezone,
+            revision_id=self.revision_id,
+            parent_ids=self.parent_ids,
+            inventory_sha1=self.inventory_sha1,
+            message=self.message,
+            )
+
+def read_revision_from_string(text):  # decode text with a fresh RevisionDecoder
+    return RevisionDecoder(text).decode()

=== modified file 'setup.py'
--- a/setup.py	2009-06-03 14:14:31 +0000
+++ b/setup.py	2009-06-04 20:01:21 +0000
@@ -262,6 +262,7 @@
 add_pyrex_extension('bzrlib._bencode_pyx')
 add_pyrex_extension('bzrlib._btree_serializer_c')
 add_pyrex_extension('bzrlib._chk_map_pyx', libraries=['z'])
+add_pyrex_extension('bzrlib._chk_serializer_pyx')
 add_pyrex_extension('bzrlib._chunks_to_lines_pyx')
 add_pyrex_extension('bzrlib._groupcompress_pyx',
                     extra_source=['bzrlib/diff-delta.c'])