[MERGE][RFC] Add simple revision serializer based on RIO.

Mon Apr 27 20:53:31 BST 2009

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

bb:comment Where is this tested?

Jelmer Vernooij wrote:
> === modified file 'bzrlib/chk_serializer.py'
> --- bzrlib/chk_serializer.py	2009-04-09 20:23:07 +0000
> +++ bzrlib/chk_serializer.py	2009-04-15 01:37:51 +0000
> @@ -16,14 +16,94 @@
>  
>  """Serializer object for CHK based inventory storage."""
>  
> +from cStringIO import (
> +    StringIO,
> +    )
> +
>  from bzrlib import (
> +    cache_utf8,
>      inventory,
> +    osutils,
> +    revision as _mod_revision,
> +    rio,
>      xml5,
>      xml6,
>      )
>  
> -
> -class CHKSerializerSubtree(xml6.Serializer_v6):
> +class RIORevisionSerializer1(object):
> +    """Simple line-based revision serializer. 
> +    
> +    This writes simple key/value pairs for all elements on the revision, 
> +    each on a separate line and using colons to separate key and value. Newlines
> +    are backslash-escaped where necessary.

I think this docstring describes your original implementation, not RIO.
 RIO does not backslash-escape newlines.

> +
> +    The commit message is always serialized last and can span multiple lines.
> +    """

Although this is accurate, I don't see a reason to guarantee it by
enshrining it in a docstring.

> +
> +    def write_revision(self, rev, f):
> +        decode_utf8 = cache_utf8.decode
> +        w = rio.RioWriter(f)
> +        s = rio.Stanza()

This method would be a lot shorter if you used the kwargs argument of
the Stanza constructor, e.g.

s = rio.Stanza(revision_id=rev.revision_id, timestamp=rev.timestamp,
               inventory_sha1=rev.inventory_sha1, ...

> +        revision_id = rev.revision_id
> +        if isinstance(revision_id, str):
> +            revision_id = decode_utf8(revision_id)

Shouldn't we define write_revision as either requiring unicode or utf-8
throughout?

> +        s.add("revision-id", revision_id)
> +        s.add("timestamp", "%.3f" % rev.timestamp)
> +        if rev.parent_ids:
> +            s.add("parent-ids", 
> +                " ".join([decode_utf8(p) for p in rev.parent_ids]))

RIO is designed to allow the same key to be used multiple times, so I
think it's more natural to handle it as follows:

for parent_id in rev.parent_ids:
    s.add("parent-id", parent_id)

> +        s.add("inventory-sha1", rev.inventory_sha1)
> +        s.add("committer", rev.committer)
> +        if rev.timezone is not None:
> +            s.add("timezone", str(rev.timezone))
> +        if rev.properties:
> +            revprops_stanza = rio.Stanza()
> +            for k, v in rev.properties.iteritems():
> +                if isinstance(v, str):
> +                    v = decode_utf8(v)
> +                revprops_stanza.add(decode_utf8(k), v)
> +            s.add("properties", revprops_stanza.to_unicode())
> +        s.add("message", rev.message)
> +        w.write_stanza(s)
> +
> +    def write_revision_to_string(self, rev):
> +        f = StringIO()
> +        self.write_revision(rev, f)
> +        return f.getvalue()
> +
> +    def read_revision(self, f):
> +        s = rio.read_stanza(f)
> +        rev = _mod_revision.Revision(None)
> +        for (field, value) in s.iter_pairs():
> +            if field == "revision-id":
> +                rev.revision_id = cache_utf8.encode(value)
> +            elif field == "parent-ids":
> +                rev.parent_ids = [cache_utf8.encode(p) for p in value.split(" ")]
> +            elif field == "committer":
> +                rev.committer = value
> +            elif field == "inventory-sha1":
> +                rev.inventory_sha1 = value
> +            elif field == "timezone":
> +                rev.timezone = int(value)
> +            elif field == "timestamp":
> +                rev.timestamp = float(value)
> +            elif field == "message":
> +                rev.message = value
> +            elif field == "properties":
> +                rev.properties = rio.read_stanza_unicode(
> +                    osutils.split_lines(value)).as_dict()
> +            else:
> +                raise AssertionError("Unknown field %s" % field)
> +            l = f.readline()
> +        return rev
> +
> +    def read_revision_from_string(self, xml_string):
> +        f = StringIO(xml_string)
> +        rev = self.read_revision(f)
> +        return rev
> +
> +
> +class CHKSerializerSubtree(RIORevisionSerializer1,xml6.Serializer_v6):

I think there should be a space after RIORevisionSerializer1.

Also, isn't this a massive change to the behaviour of CHKSerializerSubtree?

>      """A CHKInventory based serializer that supports tree references"""
>  
>      supported_kinds = set(['file', 'directory', 'symlink', 'tree-reference'])
> @@ -51,7 +131,7 @@
>          self.search_key_name = search_key_name
>  
>  
> -class CHKSerializer(xml5.Serializer_v5):
> +class CHKSerializer(RIORevisionSerializer1,xml5.Serializer_v5):

I think there should be a space after RIORevisionSerializer1.
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.9 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iEYEARECAAYFAkn2DTcACgkQ0F+nu1YWqI0dzwCghiBHFZkUmZwELmQLvur5YYih
i6oAn1VkzHAV04NV9tlhmKZjU9I5LCQ1
=4lpA
-----END PGP SIGNATURE-----