Rev 2513: Change read/iter_records to return a callable, add more validation, and in http://bazaar.launchpad.net/~bzr/bzr/container-format

Thu Jun 14 14:30:30 BST 2007

At http://bazaar.launchpad.net/~bzr/bzr/container-format

------------------------------------------------------------
revno: 2513
revision-id: andrew.bennetts at canonical.com-20070614132802-bas89f67tqq4p3s6
parent: andrew.bennetts at canonical.com-20070614055245-rtwk0vgz74fyyimo
parent: andrew.bennetts at canonical.com-20070614125513-nua0p6bw9cw3jeaq
committer: Andrew Bennetts <andrew.bennetts at canonical.com>
branch nick: container-format
timestamp: Thu 2007-06-14 23:28:02 +1000
message:
  Change read/iter_records to return a callable, add more validation, and
  improve docstrings.
modified:
  bzrlib/errors.py               errors.py-20050309040759-20512168c4e14fbd
  bzrlib/pack.py                 container.py-20070607160755-tr8zc26q18rn0jnb-1
  bzrlib/tests/test_errors.py    test_errors.py-20060210110251-41aba2deddf936a8
  bzrlib/tests/test_pack.py      test_container.py-20070607160755-tr8zc26q18rn0jnb-2
    ------------------------------------------------------------
    revno: 2512.1.2
    revision-id: andrew.bennetts at canonical.com-20070614125513-nua0p6bw9cw3jeaq
    parent: andrew.bennetts at canonical.com-20070614112338-6u3900u6nkag66u8
    committer: Andrew Bennetts <andrew.bennetts at canonical.com>
    branch nick: container-format
    timestamp: Thu 2007-06-14 22:55:13 +1000
    message:
      Docstring improvements.
    ------------------------------------------------------------
    revno: 2512.1.1
    revision-id: andrew.bennetts at canonical.com-20070614112338-6u3900u6nkag66u8
    parent: andrew.bennetts at canonical.com-20070614055245-rtwk0vgz74fyyimo
    committer: Andrew Bennetts <andrew.bennetts at canonical.com>
    branch nick: container-format
    timestamp: Thu 2007-06-14 21:23:38 +1000
    message:
      Return a callable instead of a str from read, and add more validation.
=== modified file 'bzrlib/errors.py'

--- a/bzrlib/errors.py	2007-06-14 05:52:45 +0000
+++ b/bzrlib/errors.py	2007-06-14 11:23:38 +0000
@@ -2178,3 +2178,11 @@
     def __init__(self, excess):
         self.excess = excess
 
+
+class DuplicateRecordNameError(ContainerError):
+
+    _fmt = "Container has multiple records with the same name: \"%(name)s\""
+
+    def __init__(self, name):
+        self.name = name
+

=== modified file 'bzrlib/pack.py'
--- a/bzrlib/pack.py	2007-06-14 05:52:45 +0000
+++ b/bzrlib/pack.py	2007-06-14 13:28:02 +0000
@@ -37,12 +37,26 @@
     name.
 
     :raises InvalidRecordError: if name is not valid.
+    :seealso: _check_name_encoding
     """
-    # XXX: consider checking that name is a str of valid UTF-8 too?
     if _whitespace_re.search(name) is not None:
         raise errors.InvalidRecordError("%r is not a valid name." % (name,))
 
 
+def _check_name_encoding(name):
+    """Check that 'name' is valid UTF-8.
+    
+    This is separate from _check_name because UTF-8 decoding is relatively
+    expensive, and we usually want to avoid it.
+
+    :raises InvalidRecordError: if name is not valid UTF-8.
+    """
+    try:
+        name.decode('utf-8')
+    except UnicodeDecodeError, e:
+        raise errors.InvalidRecordError(str(e))
+
+
 class ContainerWriter(object):
     """A class for writing containers."""
 
@@ -119,24 +133,55 @@
     def iter_records(self):
         """Iterate over the container, yielding each record as it is read.
 
-        Each yielded record will be a 2-tuple of (names, bytes), where names is
-        a ``list`` and bytes is a ``str``.
-
-        :raises UnknownContainerFormatError: if the format of the container is
-            unrecognised.
+        Each yielded record will be a 2-tuple of (names, callable), where names
+        is a ``list`` and bytes is a function that takes one argument,
+        ``max_length``.
+
+        You **must not** call the callable after advancing the interator to the
+        next record.  That is, this code is invalid::
+
+            record_iter = container.iter_records()
+            names1, callable1 = record_iter.next()
+            names2, callable2 = record_iter.next()
+            bytes1 = callable1(None)
+        
+        As it will give incorrect results and invalidate the state of the
+        ContainerReader.
+
+        :raises ContainerError: if any sort of containter corruption is
+            detected, e.g. UnknownContainerFormatError is the format of the
+            container is unrecognised.
+        :seealso: ContainerReader.read
         """
-        format = self._read_line()
-        if format != FORMAT_ONE:
-            raise errors.UnknownContainerFormatError(format)
+        self._read_format()
         return self._iter_records()
     
+    def iter_record_objects(self):
+        """Iterate over the container, yielding each record as it is read.
+
+        Each yielded record will be an object with ``read`` and ``validate``
+        methods.  Like with iter_records, it is not safe to use a record object
+        after advancing the iterator to yield next record.
+
+        :raises ContainerError: if any sort of containter corruption is
+            detected, e.g. UnknownContainerFormatError is the format of the
+            container is unrecognised.
+        :seealso: iter_records
+        """
+        self._read_format()
+        return self._iter_record_objects()
+    
     def _iter_records(self):
+        for record in self._iter_record_objects():
+            yield record.read()
+
+    def _iter_record_objects(self):
         while True:
             record_kind = self.reader_func(1)
             if record_kind == 'B':
                 # Bytes record.
                 reader = BytesRecordReader(self.reader_func)
-                yield reader.read()
+                yield reader
             elif record_kind == 'E':
                 # End marker.  There are no more records.
                 return
@@ -148,17 +193,32 @@
                 # Unknown record type.
                 raise errors.UnknownRecordTypeError(record_kind)
 
+    def _read_format(self):
+        format = self._read_line()
+        if format != FORMAT_ONE:
+            raise errors.UnknownContainerFormatError(format)
+
     def validate(self):
         """Validate this container and its records.
 
-        You can either validate or iter_records, you can't do both.
+        Validating consumes the data stream just like iter_records and
+        iter_record_objects, so you cannot call it after
+        iter_records/iter_record_objects.
 
         :raises ContainerError: if something is invalid.
         """
-        for names, bytes in self.iter_records():
-            # XXX: bytes is str, should be callable to get bytes.
-            # callable()
-            pass
+        all_names = set()
+        for record_names, read_bytes in self.iter_records():
+            read_bytes(None)
+            for name in record_names:
+                _check_name_encoding(name)
+                # Check that the name is unique.  Note that Python will refuse
+                # to decode non-shortest forms of UTF-8 encoding, so there is no
+                # risk that the same unicode string has been encoded two
+                # different ways.
+                if name in all_names:
+                    raise errors.DuplicateRecordNameError(name)
+                all_names.add(name)
         excess_bytes = self.reader_func(1)
         if excess_bytes != '':
             raise errors.ContainerHasExcessDataError(excess_bytes)
@@ -169,9 +229,13 @@
     def read(self):
         """Read this record.
 
-        You can either validate or read, you can't do both.
+        You can either validate or read a record, you can't do both.
 
-        :returns: (names, bytes)
+        :returns: A tuple of (names, callable).  The callable can be called
+            repeatedly to obtain the bytes for the record, with a max_length
+            argument.  If max_length is None, returns all the bytes.  Because
+            records can be arbitrarily large, using None is not recommended
+            unless you have reason to believe the content will fit in memory.
         """
         # Read the content length.
         length_line = self._read_line()
@@ -189,10 +253,20 @@
                 break
             _check_name(name)
             names.append(name)
-        bytes = self.reader_func(length)
-        if len(bytes) != length:
+
+        self._remaining_length = length
+        return names, self._content_reader
+
+    def _content_reader(self, max_length):
+        if max_length is None:
+            length_to_read = self._remaining_length
+        else:
+            length_to_read = min(max_length, self._remaining_length)
+        self._remaining_length -= length_to_read
+        bytes = self.reader_func(length_to_read)
+        if len(bytes) != length_to_read:
             raise errors.UnexpectedEndOfContainerError()
-        return names, bytes
+        return bytes
 
     def validate(self):
         """Validate this record.
@@ -201,4 +275,8 @@
 
         :raises ContainerError: if this record is invalid.
         """
-        self.read()
+        names, read_bytes = self.read()
+        for name in names:
+            _check_name_encoding(name)
+        read_bytes(None)
+

=== modified file 'bzrlib/tests/test_errors.py'
--- a/bzrlib/tests/test_errors.py	2007-06-14 05:52:45 +0000
+++ b/bzrlib/tests/test_errors.py	2007-06-14 11:23:38 +0000
@@ -300,6 +300,13 @@
             "Container has data after end marker: 'excess bytes'",
             str(e))
 
+    def test_duplicate_record_name_error(self):
+        """Test the formatting of DuplicateRecordNameError."""
+        e = errors.DuplicateRecordNameError(u"n\xe5me".encode('utf-8'))
+        self.assertEqual(
+            "Container has multiple records with the same name: \"n\xc3\xa5me\"",
+            str(e))
+
 
 class PassThroughError(errors.BzrError):
     

=== modified file 'bzrlib/tests/test_pack.py'
--- a/bzrlib/tests/test_pack.py	2007-06-14 05:52:45 +0000
+++ b/bzrlib/tests/test_pack.py	2007-06-14 11:23:38 +0000
@@ -136,19 +136,30 @@
         """
         reader = self.get_reader_for("Bazaar pack format 1\nB5\n\naaaaaE")
         expected_records = [([], 'aaaaa')]
-        self.assertEqual(expected_records, list(reader.iter_records()))
+        self.assertEqual(
+            expected_records,
+            [(names, read_bytes(None))
+             for (names, read_bytes) in reader.iter_records()])
 
     def test_validate_empty_container(self):
+        """validate does not raise an error for a container with no records."""
         reader = self.get_reader_for("Bazaar pack format 1\nE")
         # No exception raised
         reader.validate()
 
     def test_validate_non_empty_valid_container(self):
+        """validate does not raise an error for a container with a valid record.
+        """
         reader = self.get_reader_for("Bazaar pack format 1\nB3\nname\n\nabcE")
         # No exception raised
         reader.validate()
 
     def test_validate_bad_format(self):
+        """validate raises an error for unrecognised format strings.
+
+        It may raise either UnexpectedEndOfContainerError or
+        UnknownContainerFormatError, depending on exactly what the string is.
+        """
         inputs = ["", "x", "Bazaar pack format 1", "bad\n"]
         for input in inputs:
             reader = self.get_reader_for(input)
@@ -158,22 +169,47 @@
                 reader.validate)
 
     def test_validate_bad_record_marker(self):
+        """validate raises UnknownRecordTypeError for unrecognised record
+        types.
+        """
         reader = self.get_reader_for("Bazaar pack format 1\nX")
         self.assertRaises(errors.UnknownRecordTypeError, reader.validate)
 
     def test_validate_data_after_end_marker(self):
+        """validate raises ContainerHasExcessDataError if there are any bytes
+        after the end of the container.
+        """
         reader = self.get_reader_for("Bazaar pack format 1\nEcrud")
         self.assertRaises(
             errors.ContainerHasExcessDataError, reader.validate)
 
     def test_validate_no_end_marker(self):
+        """validate raises UnexpectedEndOfContainerError if there's no end of
+        container marker, even if the container up to this point has been valid.
+        """
         reader = self.get_reader_for("Bazaar pack format 1\n")
         self.assertRaises(
             errors.UnexpectedEndOfContainerError, reader.validate)
 
+    def test_validate_duplicate_name(self):
+        """validate raises DuplicateRecordNameError if the same name occurs
+        multiple times in the container.
+        """
+        reader = self.get_reader_for(
+            "Bazaar pack format 1\n"
+            "B0\nname\n\n"
+            "B0\nname\n\n"
+            "E")
+        self.assertRaises(errors.DuplicateRecordNameError, reader.validate)
+
+    def test_validate_undecodeable_name(self):
+        """Names that aren't valid UTF-8 cause validate to fail."""
+        reader = self.get_reader_for("Bazaar pack format 1\nB0\n\xcc\n\nE")
+        self.assertRaises(errors.InvalidRecordError, reader.validate)
+        
 
 class TestBytesRecordReader(tests.TestCase):
-    """Tests for parsing Bytes records with BytesRecordReader."""
+    """Tests for reading and validating Bytes records with BytesRecordReader."""
 
     def get_reader_for(self, bytes):
         stream = StringIO(bytes)
@@ -185,26 +221,26 @@
         names.
         """
         reader = self.get_reader_for("5\n\naaaaa")
-        names, bytes = reader.read()
+        names, get_bytes = reader.read()
         self.assertEqual([], names)
-        self.assertEqual('aaaaa', bytes)
+        self.assertEqual('aaaaa', get_bytes(None))
 
     def test_record_with_one_name(self):
         """Reading a Bytes record with one name returns a list of just that
         name.
         """
         reader = self.get_reader_for("5\nname1\n\naaaaa")
-        names, bytes = reader.read()
+        names, get_bytes = reader.read()
         self.assertEqual(['name1'], names)
-        self.assertEqual('aaaaa', bytes)
+        self.assertEqual('aaaaa', get_bytes(None))
 
     def test_record_with_two_names(self):
         """Reading a Bytes record with two names returns a list of both names.
         """
         reader = self.get_reader_for("5\nname1\nname2\n\naaaaa")
-        names, bytes = reader.read()
+        names, get_bytes = reader.read()
         self.assertEqual(['name1', 'name2'], names)
-        self.assertEqual('aaaaa', bytes)
+        self.assertEqual('aaaaa', get_bytes(None))
 
     def test_invalid_length(self):
         """If the length-prefix is not a number, parsing raises
@@ -225,16 +261,19 @@
         """
         complete_record = "6\nname\n\nabcdef"
         for count in range(0, len(complete_record)):
-            reader = self.get_reader_for(complete_record[:count])
-            # We don't use assertRaises to make diagnosing failures easier.
+            incomplete_record = complete_record[:count]
+            reader = self.get_reader_for(incomplete_record)
+            # We don't use assertRaises to make diagnosing failures easier
+            # (assertRaises doesn't allow a custom failure message).
             try:
-                reader.read()
+                names, read_bytes = reader.read()
+                read_bytes(None)
             except errors.UnexpectedEndOfContainerError:
                 pass
             else:
                 self.fail(
                     "UnexpectedEndOfContainerError not raised when parsing %r"
-                    % (input.getvalue()))
+                    % (incomplete_record,))
 
     def test_initial_eof(self):
         """EOF before any bytes read at all."""
@@ -251,7 +290,7 @@
         reader = self.get_reader_for("123\nname")
         self.assertRaises(errors.UnexpectedEndOfContainerError, reader.read)
 
-    def test_invalid_name_whitespace(self):
+    def test_read_invalid_name_whitespace(self):
         """Names must have no whitespace."""
         # A name with a space.
         reader = self.get_reader_for("0\nbad name\n\n")
@@ -266,21 +305,57 @@
         self.assertRaises(errors.InvalidRecordError, reader.read)
 
     def test_validate_whitespace_in_name(self):
-        reader = self.get_reader_for("0\nbad name\n\nE")
+        """Names must have no whitespace."""
+        reader = self.get_reader_for("0\nbad name\n\n")
         self.assertRaises(errors.InvalidRecordError, reader.validate)
 
     def test_validate_interrupted_prelude(self):
+        """EOF during reading a record's prelude causes validate to fail."""
         reader = self.get_reader_for("")
         self.assertRaises(
             errors.UnexpectedEndOfContainerError, reader.validate)
 
     def test_validate_interrupted_body(self):
+        """EOF during reading a record's body causes validate to fail."""
         reader = self.get_reader_for("1\n\n")
         self.assertRaises(
             errors.UnexpectedEndOfContainerError, reader.validate)
 
     def test_validate_unparseable_length(self):
+        """An unparseable record length causes validate to fail."""
         reader = self.get_reader_for("\n\n")
         self.assertRaises(
             errors.InvalidRecordError, reader.validate)
 
+    def test_validate_undecodeable_name(self):
+        """Names that aren't valid UTF-8 cause validate to fail."""
+        reader = self.get_reader_for("0\n\xcc\n\n")
+        self.assertRaises(errors.InvalidRecordError, reader.validate)
+
+    def test_read_max_length(self):
+        """If the max_length passed to the callable returned by read is not
+        None, then no more than that many bytes will be read.
+        """
+        reader = self.get_reader_for("6\n\nabcdef")
+        names, get_bytes = reader.read()
+        self.assertEqual('abc', get_bytes(3))
+
+    def test_read_no_max_length(self):
+        """If the max_length passed to the callable returned by read is None,
+        then all the bytes in the record will be read.
+        """
+        reader = self.get_reader_for("6\n\nabcdef")
+        names, get_bytes = reader.read()
+        self.assertEqual('abcdef', get_bytes(None))
+
+    def test_repeated_read_calls(self):
+        """Repeated calls to the callable returned from BytesRecordReader.read
+        will not read beyond the end of the record.
+        """
+        reader = self.get_reader_for("6\n\nabcdefB3\nnext-record\nXXX")
+        names, get_bytes = reader.read()
+        self.assertEqual('abcdef', get_bytes(None))
+        self.assertEqual('', get_bytes(None))
+        self.assertEqual('', get_bytes(99))
+
+