Rev 4368: (Jelmer) Add Pyrex RIO implementation. in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Fri May 15 02:21:47 BST 2009

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 4368
revision-id: pqm at pqm.ubuntu.com-20090515012140-stwx16f974x0zogp
parent: pqm at pqm.ubuntu.com-20090514222335-85rfl946254b3vk2
parent: jelmer at samba.org-20090514235307-lrw4xwiane0vvse4
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2009-05-15 02:21:40 +0100
message:
  (Jelmer) Add Pyrex RIO implementation.
added:
  bzrlib/_rio_py.py              _rio_py.py-20090514104624-ied3d39oju8anmfz-1
  bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
  bzrlib/tests/test__rio.py      test__rio.py-20090514191748-cy74k8yj46gzoeq6-1
modified:
  .bzrignore                     bzrignore-20050311232317-81f7b71efa2db11a
  bzrlib/rio.py                  rio.py-20051128032247-770b120b34dfff60
  bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
  setup.py                       setup.py-20050314065409-02f8a0a6e3f9bc70
    ------------------------------------------------------------
    revno: 4354.3.18
    revision-id: jelmer at samba.org-20090514235307-lrw4xwiane0vvse4
    parent: jelmer at samba.org-20090514215739-ybv73i8mx0bgroq1
    parent: pqm at pqm.ubuntu.com-20090514222335-85rfl946254b3vk2
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Fri 2009-05-15 01:53:07 +0200
    message:
      Merge bzr.dev.
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      bzr                            bzr.py-20050313053754-5485f144c7006fa6
      bzrlib/_rio_py.py              _rio_py.py-20090514104624-ied3d39oju8anmfz-1
      bzrlib/branch.py               branch.py-20050309040759-e4baf4e0d046576e
      bzrlib/builtins.py             builtins.py-20050830033751-fc01482b9ca23183
      bzrlib/chk_map.py              chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
      bzrlib/commands.py             bzr.py-20050309040720-d10f4714595cf8c3
      bzrlib/errors.py               errors.py-20050309040759-20512168c4e14fbd
      bzrlib/foreign.py              foreign.py-20081112170002-olsxmandkk8qyfuq-1
      bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
      bzrlib/help_topics/en/eol.txt  eol.txt-20090327060429-todzdjmqt3bpv5r8-3
      bzrlib/osutils.py              osutils.py-20050309040759-eeaff12fbf77ac86
      bzrlib/rio.py                  rio.py-20051128032247-770b120b34dfff60
      bzrlib/tests/blackbox/test_commit.py test_commit.py-20060212094538-ae88fc861d969db0
      bzrlib/tests/blackbox/test_dpush.py test_dpush.py-20090108125928-st1td6le59g0vyv2-1
      bzrlib/tests/blackbox/test_switch.py test_switch.py-20071122111948-0c5en6uz92bwl76h-1
      bzrlib/tests/per_repository/test_fetch.py test_fetch.py-20070814052151-5cxha9slx4c93uog-1
      bzrlib/tests/test_chk_map.py   test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
      bzrlib/tests/test_foreign.py   test_foreign.py-20081125004048-ywb901edgp9lluxo-1
      bzrlib/tests/test_pack_repository.py test_pack_repository-20080801043947-eaw0e6h2gu75kwmy-1
      bzrlib/win32utils.py           win32console.py-20051021033308-123c6c929d04973d
      bzrlib/workingtree.py          workingtree.py-20050511021032-29b6ec0a681e02e3
      bzrlib/xml_serializer.py       xml.py-20050309040759-57d51586fdec365d
    ------------------------------------------------------------
    revno: 4354.3.17
    revision-id: jelmer at samba.org-20090514215739-ybv73i8mx0bgroq1
    parent: jelmer at samba.org-20090514213133-l7y49men7ny87qfa
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 23:57:39 +0200
    message:
      Ignore rio_pyx output files.
    modified:
      .bzrignore                     bzrignore-20050311232317-81f7b71efa2db11a
    ------------------------------------------------------------
    revno: 4354.3.16
    revision-id: jelmer at samba.org-20090514213133-l7y49men7ny87qfa
    parent: jelmer at samba.org-20090514212957-fdg0uxi17n5j93ui
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 23:31:33 +0200
    message:
      Add RIO tests.
    modified:
      bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
    ------------------------------------------------------------
    revno: 4354.3.15
    revision-id: jelmer at samba.org-20090514212957-fdg0uxi17n5j93ui
    parent: jelmer at samba.org-20090514212120-8dscngvopf7jf2i7
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 23:29:57 +0200
    message:
      Extend valid_tags tests a bit, test that stanza pairs contain the right types.
    modified:
      bzrlib/_rio_py.py              _rio_py.py-20090514104624-ied3d39oju8anmfz-1
      bzrlib/tests/test__rio.py      test__rio.py-20090514191748-cy74k8yj46gzoeq6-1
    ------------------------------------------------------------
    revno: 4354.3.14
    revision-id: jelmer at samba.org-20090514212120-8dscngvopf7jf2i7
    parent: jelmer at samba.org-20090514200427-2bapi6gexer78tcg
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 23:21:20 +0200
    message:
      Review feedback from John.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.13
    revision-id: jelmer at samba.org-20090514200427-2bapi6gexer78tcg
    parent: jelmer at samba.org-20090514192015-21xv89n5aof0tsjs
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 22:04:27 +0200
    message:
      Add more RIO tests, fix bugs in pyrex implementation.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
      bzrlib/tests/test__rio.py      test__rio.py-20090514191748-cy74k8yj46gzoeq6-1
    ------------------------------------------------------------
    revno: 4354.3.12
    revision-id: jelmer at samba.org-20090514192015-21xv89n5aof0tsjs
    parent: jelmer at samba.org-20090514191048-l5rbnuydsf2bgefc
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 21:20:15 +0200
    message:
      Add tests for _valid_tag.
    added:
      bzrlib/tests/test__rio.py      test__rio.py-20090514191748-cy74k8yj46gzoeq6-1
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.11
    revision-id: jelmer at samba.org-20090514191048-l5rbnuydsf2bgefc
    parent: jelmer at samba.org-20090514184420-44m8jxl8o6qbl5x6
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 21:10:48 +0200
    message:
      Use shared data area when parsing pairs in stanza.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.10
    revision-id: jelmer at samba.org-20090514184420-44m8jxl8o6qbl5x6
    parent: jelmer at samba.org-20090514180346-npwoc6ojwgj9dyzt
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 20:44:20 +0200
    message:
      Use Py_UNICODE in unicode RIO parser.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.9
    revision-id: jelmer at samba.org-20090514180346-npwoc6ojwgj9dyzt
    parent: jelmer at samba.org-20090514175623-72o2jjemf2jh7hnn
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 20:03:46 +0200
    message:
      Use PyList_Append.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.8
    revision-id: jelmer at samba.org-20090514175623-72o2jjemf2jh7hnn
    parent: jelmer at samba.org-20090514162218-j1dxoe8gs9ko4bed
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 19:56:23 +0200
    message:
      Review feedback from John:
       * look at individual characters rather than using strcmp.
       * check for ASCII values rather than using locale-dependent isalnum()
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.7
    revision-id: jelmer at samba.org-20090514162218-j1dxoe8gs9ko4bed
    parent: jelmer at samba.org-20090514160011-35db7q8uozmlp609
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 18:22:18 +0200
    message:
      Simplify unicode stanza reading, check for Type in valid_tag.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.6
    revision-id: jelmer at samba.org-20090514160011-35db7q8uozmlp609
    parent: jelmer at samba.org-20090514155428-yle6hyxul5fqfz9u
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 18:00:11 +0200
    message:
      Add note about creating copies of objects.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.5
    revision-id: jelmer at samba.org-20090514155428-yle6hyxul5fqfz9u
    parent: jelmer at samba.org-20090514154242-n8bquw2crer2yur0
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 17:54:28 +0200
    message:
      Fix formatting.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.4
    revision-id: jelmer at samba.org-20090514154242-n8bquw2crer2yur0
    parent: jelmer at samba.org-20090514135904-zcmvvzmgbdqzovun
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 17:42:42 +0200
    message:
      More work using C APIs rather than Python objects.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    ------------------------------------------------------------
    revno: 4354.3.3
    revision-id: jelmer at samba.org-20090514135904-zcmvvzmgbdqzovun
    parent: jelmer at samba.org-20090514111102-v9dxbsj3r83jkato
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 15:59:04 +0200
    message:
      More performance tweaks.
    modified:
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
      bzrlib/rio.py                  rio.py-20051128032247-770b120b34dfff60
    ------------------------------------------------------------
    revno: 4354.3.2
    revision-id: jelmer at samba.org-20090514111102-v9dxbsj3r83jkato
    parent: jelmer at samba.org-20090514110033-98buz7oz7lr5evlf
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 13:11:02 +0200
    message:
      Provide custom implementation of _read_stanza_utf8 in Pyrex.
    modified:
      bzrlib/_rio_py.py              _rio_py.py-20090514104624-ied3d39oju8anmfz-1
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
      setup.py                       setup.py-20050314065409-02f8a0a6e3f9bc70
    ------------------------------------------------------------
    revno: 4354.3.1
    revision-id: jelmer at samba.org-20090514110033-98buz7oz7lr5evlf
    parent: pqm at pqm.ubuntu.com-20090512090530-v9355ohetp61ltc1
    committer: Jelmer Vernooij <jelmer at samba.org>
    branch nick: rio-c
    timestamp: Thu 2009-05-14 13:00:33 +0200
    message:
      Move core RIO parsing functionality to _rio_py.py.
    added:
      bzrlib/_rio_py.py              _rio_py.py-20090514104624-ied3d39oju8anmfz-1
      bzrlib/_rio_pyx.pyx            _rio_pyx.pyx-20090514104636-8203jcqvfny56yrd-1
    modified:
      bzrlib/rio.py                  rio.py-20051128032247-770b120b34dfff60
=== modified file '.bzrignore'

--- a/.bzrignore	2009-04-09 20:23:07 +0000
+++ b/.bzrignore	2009-05-14 21:57:39 +0000
@@ -45,6 +45,7 @@
 bzrlib/_groupcompress_pyx.c
 bzrlib/_knit_load_data_c.c
 bzrlib/_readdir_pyx.c
+bzrlib/_rio_pyx.c
 bzrlib/_walkdirs_win32.c
 doc/en/release-notes/NEWS.txt
 doc/en/developer-guide/HACKING.txt

=== added file 'bzrlib/_rio_py.py'
--- a/bzrlib/_rio_py.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/_rio_py.py	2009-05-14 23:53:07 +0000
@@ -0,0 +1,77 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Python implementation of _read_stanza_*."""
+
+import re
+
+from bzrlib.rio import (
+    Stanza,
+    )
+
+_tag_re = re.compile(r'^[-a-zA-Z0-9_]+$')
+def _valid_tag(tag):
+    if type(tag) != str:
+        raise TypeError(tag)
+    return bool(_tag_re.match(tag))
+
+
+def _read_stanza_utf8(line_iter):
+    def iter_unicode_lines():
+        for line in line_iter:
+            if type(line) != str:
+                raise TypeError(line)
+            yield line.decode('utf-8')
+    return _read_stanza_unicode(iter_unicode_lines())
+
+
+def _read_stanza_unicode(unicode_iter):
+    stanza = Stanza()
+    tag = None
+    accum_value = None
+
+    # TODO: jam 20060922 This code should raise real, specific errors
+    #       rather than using 'assert' to process user input or raising
+    #       a generic ValueError.
+
+    for line in unicode_iter:
+        if line is None or line == u'':
+            break       # end of file
+        if line == u'\n':
+            break       # end of stanza
+        real_l = line
+        if line[0] == u'\t': # continues previous value
+            if tag is None:
+                raise ValueError('invalid continuation line %r' % real_l)
+            accum_value.append(u'\n' + line[1:-1])
+        else: # new tag:value line
+            if tag is not None:
+                stanza.add(tag, u''.join(accum_value))
+            try:
+                colon_index = line.index(u': ')
+            except ValueError:
+                raise ValueError('tag/value separator not found in line %r'
+                                 % real_l)
+            tag = str(line[:colon_index])
+            if not _valid_tag(tag):
+                raise ValueError("invalid rio tag %r" % (tag,))
+            accum_value = [line[colon_index+2:-1]]
+
+    if tag is not None: # add last tag-value
+        stanza.add(tag, u''.join(accum_value))
+        return stanza
+    else:     # didn't see any content
+        return None

=== added file 'bzrlib/_rio_pyx.pyx'
--- a/bzrlib/_rio_pyx.pyx	1970-01-01 00:00:00 +0000
+++ b/bzrlib/_rio_pyx.pyx	2009-05-14 21:21:20 +0000
@@ -0,0 +1,211 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Pyrex implementation of _read_stanza_*."""
+
+# Python 2.4 support
+cdef extern from "python-compat.h":
+    pass
+
+cdef extern from "malloc.h":
+    void *malloc(int)
+    void *realloc(void *, int)
+    void free(void *)
+
+cdef extern from "Python.h":
+    ctypedef int Py_ssize_t # Required for older pyrex versions
+    ctypedef int Py_UNICODE
+    char *PyString_AS_STRING(object s)
+    Py_ssize_t PyString_GET_SIZE(object t) except -1
+    object PyUnicode_DecodeUTF8(char *string, Py_ssize_t length, char *errors)
+    object PyString_FromStringAndSize(char *s, Py_ssize_t len)
+    int PyString_CheckExact(object)
+    int PyUnicode_CheckExact(object)
+    object PyUnicode_Join(object, object)
+    object PyUnicode_EncodeASCII(Py_UNICODE *, int, char *)
+    Py_UNICODE *PyUnicode_AS_UNICODE(object)
+    Py_UNICODE *PyUnicode_AsUnicode(object)
+    Py_ssize_t PyUnicode_GET_SIZE(object) except -1
+    int PyList_Append(object, object) except -1    
+    int Py_UNICODE_ISLINEBREAK(Py_UNICODE)
+    object PyUnicode_FromUnicode(Py_UNICODE *, int)
+    void *Py_UNICODE_COPY(Py_UNICODE *, Py_UNICODE *, int)
+
+cdef extern from "string.h":
+    void *memcpy(void *, void *, int)
+
+from bzrlib.rio import Stanza
+
+cdef int _valid_tag_char(char c):
+    return (c == c'_' or c == c'-' or 
+            (c >= c'a' and c <= c'z') or
+            (c >= c'A' and c <= c'Z') or
+            (c >= c'0' and c <= c'9'))
+
+
+def _valid_tag(tag):
+    cdef char *c_tag
+    cdef Py_ssize_t c_len
+    cdef int i
+    if not PyString_CheckExact(tag):
+        raise TypeError(tag)
+    c_tag = PyString_AS_STRING(tag)
+    c_len = PyString_GET_SIZE(tag)
+    if c_len < 1:
+        return False
+    for i from 0 <= i < c_len:
+        if not _valid_tag_char(c_tag[i]):
+            return False
+    return True
+
+
+cdef object _split_first_line_utf8(char *line, int len, 
+                                   char *value, Py_ssize_t *value_len):
+    cdef int i
+    for i from 0 <= i < len:
+        if line[i] == c':':
+            if line[i+1] != c' ':
+                raise ValueError("invalid tag in line %r" % line)
+            memcpy(value, line+i+2, len-i-2)
+            value_len[0] = len-i-2
+            return PyString_FromStringAndSize(line, i)
+    raise ValueError('tag/value separator not found in line %r' % line)
+
+
+cdef object _split_first_line_unicode(Py_UNICODE *line, int len, 
+                                      Py_UNICODE *value, Py_ssize_t *value_len):
+    cdef int i
+    for i from 0 <= i < len:
+        if line[i] == c':':
+            if line[i+1] != c' ':
+                raise ValueError("invalid tag in line %r" %
+                                 PyUnicode_FromUnicode(line, len))
+            memcpy(value, &line[i+2], (len-i-2) * sizeof(Py_UNICODE))
+            value_len[0] = len-i-2
+            return PyUnicode_EncodeASCII(line, i, "strict")
+    raise ValueError("tag/value separator not found in line %r" %
+                     PyUnicode_FromUnicode(line, len))
+
+
+def _read_stanza_utf8(line_iter):
+    cdef char *c_line
+    cdef Py_ssize_t c_len
+    cdef char *accum_value, *new_accum_value
+    cdef Py_ssize_t accum_len, accum_size
+    pairs = []
+    tag = None
+    accum_len = 0
+    accum_size = 4096
+    accum_value = <char *>malloc(accum_size)
+    if accum_value == NULL:
+        raise MemoryError
+    try:
+        for line in line_iter:
+            if line is None:
+                break # end of file
+            if not PyString_CheckExact(line):
+                raise TypeError("%r is not a plain string" % line)
+            c_line = PyString_AS_STRING(line)
+            c_len = PyString_GET_SIZE(line)
+            if c_len < 1:
+                break       # end of file
+            if c_len == 1 and c_line[0] == c"\n":
+                break       # end of stanza
+            if accum_len + c_len > accum_size:
+                accum_size = (accum_len + c_len)
+                new_accum_value = <char *>realloc(accum_value, accum_size)
+                if new_accum_value == NULL:
+                    raise MemoryError
+                else:
+                    accum_value = new_accum_value
+            if c_line[0] == c'\t': # continues previous value
+                if tag is None:
+                    raise ValueError('invalid continuation line %r' % line)
+                memcpy(accum_value+accum_len, c_line+1, c_len-1)
+                accum_len = accum_len + c_len-1
+            else: # new tag:value line
+                if tag is not None:
+                    PyList_Append(pairs, 
+                        (tag, PyUnicode_DecodeUTF8(accum_value, accum_len-1, 
+                                                   "strict")))
+                tag = _split_first_line_utf8(c_line, c_len, accum_value, 
+                                             &accum_len)
+                if not _valid_tag(tag):
+                    raise ValueError("invalid rio tag %r" % (tag,))
+        if tag is not None: # add last tag-value
+            PyList_Append(pairs, 
+                (tag, PyUnicode_DecodeUTF8(accum_value, accum_len-1, "strict")))
+            return Stanza.from_pairs(pairs)
+        else:     # didn't see any content
+            return None
+    finally:
+        free(accum_value)
+
+
+def _read_stanza_unicode(unicode_iter):
+    cdef Py_UNICODE *c_line
+    cdef int c_len
+    cdef Py_UNICODE *accum_value, *new_accum_value
+    cdef Py_ssize_t accum_len, accum_size
+    pairs = []
+    tag = None
+    accum_len = 0
+    accum_size = 4096
+    accum_value = <Py_UNICODE *>malloc(accum_size*sizeof(Py_UNICODE))
+    if accum_value == NULL:
+        raise MemoryError
+    try:
+        for line in unicode_iter:
+            if line is None:
+                break       # end of file
+            if not PyUnicode_CheckExact(line):
+                raise TypeError("%r is not a unicode string" % line)
+            c_line = PyUnicode_AS_UNICODE(line)
+            c_len = PyUnicode_GET_SIZE(line)
+            if c_len < 1:
+                break        # end of file
+            if Py_UNICODE_ISLINEBREAK(c_line[0]):
+                break       # end of stanza
+            if accum_len + c_len > accum_size:
+                accum_size = accum_len + c_len
+                new_accum_value = <Py_UNICODE *>realloc(accum_value, 
+                    accum_size*sizeof(Py_UNICODE))
+                if new_accum_value == NULL:
+                    raise MemoryError
+                else:
+                    accum_value = new_accum_value
+            if c_line[0] == c'\t': # continues previous value,
+                if tag is None:
+                    raise ValueError('invalid continuation line %r' % line)
+                memcpy(&accum_value[accum_len], &c_line[1],
+                    (c_len-1)*sizeof(Py_UNICODE))
+                accum_len = accum_len + (c_len-1)
+            else: # new tag:value line
+                if tag is not None:
+                    PyList_Append(pairs, 
+                        (tag, PyUnicode_FromUnicode(accum_value, accum_len-1)))
+                tag = _split_first_line_unicode(c_line, c_len, accum_value, 
+                                                &accum_len)
+                if not _valid_tag(tag):
+                    raise ValueError("invalid rio tag %r" % (tag,))
+        if tag is not None: # add last tag-value
+            PyList_Append(pairs,
+                    (tag, PyUnicode_FromUnicode(accum_value, accum_len-1)))
+            return Stanza.from_pairs(pairs)
+        else:     # didn't see any content
+            return None
+    finally:
+        free(accum_value)

=== modified file 'bzrlib/rio.py'
--- a/bzrlib/rio.py	2009-05-14 10:41:44 +0000
+++ b/bzrlib/rio.py	2009-05-14 23:53:07 +0000
@@ -129,6 +129,12 @@
                             % (value, type(value)))
         self.items.append((tag, value))
 
+    @classmethod
+    def from_pairs(cls, pairs):
+        ret = cls()
+        ret.items = pairs
+        return ret
+
     def __contains__(self, find_tag):
         """True if there is any field in this stanza with the given tag."""
         for tag, value in self.items:
@@ -191,16 +197,16 @@
 
         result = []
         for tag, value in self.items:
-            if value == '':
-                result.append(tag + ': \n')
-            elif '\n' in value:
+            if value == u'':
+                result.append(tag + u': \n')
+            elif u'\n' in value:
                 # don't want splitlines behaviour on empty lines
-                val_lines = value.split('\n')
-                result.append(tag + ': ' + val_lines[0] + '\n')
+                val_lines = value.split(u'\n')
+                result.append(tag + u': ' + val_lines[0] + u'\n')
                 for line in val_lines[1:]:
-                    result.append('\t' + line + '\n')
+                    result.append(u'\t' + line + u'\n')
             else:
-                result.append(tag + ': ' + value + '\n')
+                result.append(tag + u': ' + value + u'\n')
         return u''.join(result)
 
     def write(self, to_file):
@@ -236,9 +242,9 @@
             d[tag] = value
         return d
 
-_tag_re = re.compile(r'^[-a-zA-Z0-9_]+$')
+
 def valid_tag(tag):
-    return bool(_tag_re.match(tag))
+    return _valid_tag(tag)
 
 
 def read_stanza(line_iter):
@@ -254,8 +260,7 @@
 
     The raw lines must be in utf-8 encoding.
     """
-    unicode_iter = (line.decode('utf-8') for line in line_iter)
-    return read_stanza_unicode(unicode_iter)
+    return _read_stanza_utf8(line_iter)
 
 
 def read_stanza_unicode(unicode_iter):
@@ -275,42 +280,7 @@
     :return: A Stanza object if there are any lines in the file.
         None otherwise
     """
-    stanza = Stanza()
-    tag = None
-    accum_value = None
-
-    # TODO: jam 20060922 This code should raise real errors rather than
-    #       using 'assert' to process user input, or raising ValueError
-    #       rather than a more specific error.
-
-    for line in unicode_iter:
-        if line is None or line == u'':
-            break       # end of file
-        if line == u'\n':
-            break       # end of stanza
-        real_l = line
-        if line[0] == u'\t': # continues previous value
-            if tag is None:
-                raise ValueError('invalid continuation line %r' % real_l)
-            accum_value.append(u'\n' + line[1:-1])
-        else: # new tag:value line
-            if tag is not None:
-                stanza.add(tag, u''.join(accum_value))
-            try:
-                colon_index = line.index(u': ')
-            except ValueError:
-                raise ValueError('tag/value separator not found in line %r'
-                                 % real_l)
-            tag = str(line[:colon_index])
-            if not valid_tag(tag):
-                raise ValueError("invalid rio tag %r" % (tag,))
-            accum_value = [line[colon_index+2:-1]]
-
-    if tag is not None: # add last tag-value
-        stanza.add(tag, u''.join(accum_value))
-        return stanza
-    else:     # didn't see any content
-        return None
+    return _read_stanza_unicode(unicode_iter)
 
 
 def to_patch_lines(stanza, max_width=72):
@@ -399,3 +369,17 @@
     :return: a Stanza
     """
     return read_stanza(_patch_stanza_iter(line_iter))
+
+
+try:
+    from bzrlib._rio_pyx import (
+        _read_stanza_utf8,
+        _read_stanza_unicode,
+        _valid_tag,
+        )
+except ImportError:
+    from bzrlib._rio_py import (
+       _read_stanza_utf8,
+       _read_stanza_unicode,
+       _valid_tag,
+       )

=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py	2009-05-12 08:08:54 +0000
+++ b/bzrlib/tests/__init__.py	2009-05-14 21:31:33 +0000
@@ -3363,6 +3363,7 @@
                    'bzrlib.tests.test__chk_map',
                    'bzrlib.tests.test__dirstate_helpers',
                    'bzrlib.tests.test__groupcompress',
+                   'bzrlib.tests.test__rio',
                    'bzrlib.tests.test__walkdirs_win32',
                    'bzrlib.tests.test_ancestry',
                    'bzrlib.tests.test_annotate',

=== added file 'bzrlib/tests/test__rio.py'
--- a/bzrlib/tests/test__rio.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test__rio.py	2009-05-14 21:29:57 +0000
@@ -0,0 +1,196 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for _rio_*."""
+
+from bzrlib import (
+    rio,
+    tests,
+    )
+
+
+def load_tests(standard_tests, module, loader):
+    # parameterize all tests in this module
+    suite = loader.suiteClass()
+    import bzrlib._rio_py as py_module
+    scenarios = [('python', {'module': py_module})]
+    if CompiledRioFeature.available():
+        import bzrlib._rio_pyx as c_module
+        scenarios.append(('C', {'module': c_module}))
+    else:
+        # the compiled module isn't available, so we add a failing test
+        class FailWithoutFeature(tests.TestCase):
+            def test_fail(self):
+                self.requireFeature(CompiledRioFeature)
+        suite.addTest(loader.loadTestsFromTestCase(FailWithoutFeature))
+    tests.multiply_tests(standard_tests, scenarios, suite)
+    return suite
+
+
+class _CompiledRioFeature(tests.Feature):
+
+    def _probe(self):
+        try:
+            import bzrlib._rio_pyx
+        except ImportError:
+            return False
+        return True
+
+    def feature_name(self):
+        return 'bzrlib._rio_pyx'
+
+CompiledRioFeature = _CompiledRioFeature()
+
+
+class TestValidTag(tests.TestCase):
+
+    module = None # Filled in by test parameterization
+
+    def test_ok(self):
+        self.assertTrue(self.module._valid_tag("foo"))
+
+    def test_no_spaces(self):
+        self.assertFalse(self.module._valid_tag("foo bla"))
+
+    def test_numeric(self):
+        self.assertTrue(self.module._valid_tag("3foo423"))
+
+    def test_no_colon(self):
+        self.assertFalse(self.module._valid_tag("foo:bla"))
+    
+    def test_type_error(self):
+        self.assertRaises(TypeError, self.module._valid_tag, 423)
+
+    def test_empty(self):
+        self.assertFalse(self.module._valid_tag(""))
+
+    def test_unicode(self):
+        self.assertRaises(TypeError, self.module._valid_tag, u"foo")
+
+    def test_non_ascii_char(self):
+        self.assertFalse(self.module._valid_tag("\xb5"))
+
+
+class TestReadUTF8Stanza(tests.TestCase):
+
+    module = None # Filled in by test parameterization
+
+    def assertReadStanza(self, result, line_iter):
+        s = self.module._read_stanza_utf8(line_iter)
+        self.assertEquals(result, s)
+        if s is not None:
+            for tag, value in s.iter_pairs():
+                self.assertIsInstance(tag, str)
+                self.assertIsInstance(value, unicode)
+
+    def assertReadStanzaRaises(self, exception, line_iter):
+        self.assertRaises(exception, self.module._read_stanza_utf8, line_iter)
+
+    def test_no_string(self):
+        self.assertReadStanzaRaises(TypeError, [21323])
+
+    def test_empty(self):
+        self.assertReadStanza(None, [])
+
+    def test_none(self):
+        self.assertReadStanza(None, [""])
+
+    def test_simple(self):
+        self.assertReadStanza(rio.Stanza(foo="bar"), ["foo: bar\n", ""])
+
+    def test_multi_line(self):
+        self.assertReadStanza(rio.Stanza(foo="bar\nbla"), 
+                ["foo: bar\n", "\tbla\n"])
+
+    def test_repeated(self):
+        s = rio.Stanza()
+        s.add("foo", "bar")
+        s.add("foo", "foo")
+        self.assertReadStanza(s, ["foo: bar\n", "foo: foo\n"])
+
+    def test_invalid_early_colon(self):
+        self.assertReadStanzaRaises(ValueError, ["f:oo: bar\n"])
+
+    def test_invalid_tag(self):
+        self.assertReadStanzaRaises(ValueError, ["f%oo: bar\n"])
+
+    def test_continuation_too_early(self):
+        self.assertReadStanzaRaises(ValueError, ["\tbar\n"])
+
+    def test_large(self):
+        value = "bla" * 9000
+        self.assertReadStanza(rio.Stanza(foo=value),
+            ["foo: %s\n" % value])
+
+    def test_non_ascii_char(self):
+        self.assertReadStanza(rio.Stanza(foo=u"n\xe5me"),
+            [u"foo: n\xe5me\n".encode("utf-8")])
+
+
+class TestReadUnicodeStanza(tests.TestCase):
+
+    module = None # Filled in by test parameterization
+
+    def assertReadStanza(self, result, line_iter):
+        s = self.module._read_stanza_unicode(line_iter)
+        self.assertEquals(result, s)
+        if s is not None:
+            for tag, value in s.iter_pairs():
+                self.assertIsInstance(tag, str)
+                self.assertIsInstance(value, unicode)
+
+    def assertReadStanzaRaises(self, exception, line_iter):
+        self.assertRaises(exception, self.module._read_stanza_unicode,
+                          line_iter)
+
+    def test_no_string(self):
+        self.assertReadStanzaRaises(TypeError, [21323])
+
+    def test_empty(self):
+        self.assertReadStanza(None, [])
+
+    def test_none(self):
+        self.assertReadStanza(None, [u""])
+
+    def test_simple(self):
+        self.assertReadStanza(rio.Stanza(foo="bar"), [u"foo: bar\n", u""])
+
+    def test_multi_line(self):
+        self.assertReadStanza(rio.Stanza(foo="bar\nbla"), 
+                [u"foo: bar\n", u"\tbla\n"])
+
+    def test_repeated(self):
+        s = rio.Stanza()
+        s.add("foo", "bar")
+        s.add("foo", "foo")
+        self.assertReadStanza(s, [u"foo: bar\n", u"foo: foo\n"])
+
+    def test_invalid_early_colon(self):
+        self.assertReadStanzaRaises(ValueError, [u"f:oo: bar\n"])
+
+    def test_invalid_tag(self):
+        self.assertReadStanzaRaises(ValueError, [u"f%oo: bar\n"])
+
+    def test_continuation_too_early(self):
+        self.assertReadStanzaRaises(ValueError, [u"\tbar\n"])
+
+    def test_large(self):
+        value = u"bla" * 9000
+        self.assertReadStanza(rio.Stanza(foo=value),
+            [u"foo: %s\n" % value])
+
+    def test_non_ascii_char(self):
+        self.assertReadStanza(rio.Stanza(foo=u"n\xe5me"), [u"foo: n\xe5me\n"])

=== modified file 'setup.py'
--- a/setup.py	2009-05-08 15:27:44 +0000
+++ b/setup.py	2009-05-14 11:11:02 +0000
@@ -264,6 +264,7 @@
                     extra_source=['bzrlib/diff-delta.c'])
 add_pyrex_extension('bzrlib._chunks_to_lines_pyx')
 add_pyrex_extension('bzrlib._knit_load_data_c')
+add_pyrex_extension('bzrlib._rio_pyx')
 add_pyrex_extension('bzrlib._chk_map_pyx', libraries=['z'])
 if sys.platform == 'win32':
     add_pyrex_extension('bzrlib._dirstate_helpers_c',