[MERGE, 165304] Allow insert_data_stream to insert differently annotated stream.

Fri Nov 30 05:34:44 GMT 2007

bb:tweak

Looks pretty good to me.

Robert Collins wrote:
> === modified file 'bzrlib/errors.py'
> --- bzrlib/errors.py	2007-11-29 18:41:01 +0000
> +++ bzrlib/errors.py	2007-11-29 22:24:49 +0000
> @@ -1332,6 +1332,7 @@
>  
>  
>  class KnitDataStreamIncompatible(KnitError):
> +    # Not raised anymore, as we can convert data streams

I guess you haven't deleted this for compatibility's sake?  Perhaps it should be
explicitly deprecated.

> @@ -1512,9 +1535,9 @@
>              return 'line-delta'
>  
>      def get_options(self, version_id):
> -        """Return a string represention options.
> +        """Return a string representing options.
>  
> -        e.g. foo,bar
> +        e.g. ['foo', 'bar']
>          """
>          return self._cache[version_id][1]
>  
> @@ -1748,9 +1771,9 @@
>              raise RevisionNotPresent(version_id, self)
>  
>      def get_options(self, version_id):
> -        """Return a string represention options.
> +        """Return a list representing options.
>  
> -        e.g. foo,bar
> +        e.g. ['foo', 'bar']
>          """

Should the first get_options docstring also have s/string/list/ ?

> +class _StreamAccess(object):
> +    """A Knit Access object that provides data from a datastream.
> +    
> +    It also provides a fallback to present as unannotated data, annotated data
> +    from a *backing* access object.
> +
> +    This is triggered by a index_memo which is pointing to a different index
> +    than this was constructed with, and is used to allow extracting full
> +    unannotated texts for insertion into annotated knits.
> +    """
> +
> +    def __init__(self, reader_callable, stream_index, backing_knit,
> +        orig_factory):
> +        """Create a _StreamAccess object.
> +
> +        :param reader_callable: The reader_callable from the datastream.
> +            This is called to buffer all the data immediately, for 
> +            random access.
> +        :param stream_index: The index the data stream this provides access to
> +            which will be present in native index_memo's.
> +        :param backing_knit: The knit object that will provide access to 
> +            annotated texts which are not available in the stream, so as to
> +            create unannotated texts.
> +        :param orig_factory: The original content factory used to generate the
> +            stream. This is used for checking whether the thunk code for
> +            supporting _copy_texts will generate the correct form of data.
> +        """
> +        self.data = reader_callable(None)
> +        self.stream_index = stream_index
> +        self.backing_knit = backing_knit
> +        self.orig_factory = orig_factory
> +
> +    def get_raw_records(self, memos_for_retrieval):
> +        """Get the raw bytes for a records.
> +
> +        :param memos_for_retrieval: An iterable containing the (index, pos, 
> +            length) memo for retrieving the bytes. The .knit method ignores
> +            the index as there is always only a single file.
> +        :return: An iterator over the bytes of the records.
> +        """
> +        # use a generator for memory friendliness
> +        for index, start, end in memos_for_retrieval:

“index, start, end” is different to the “index, pos, length” claimed by the
docstring.

> +            if index is self.stream_index:

This looks fishy; index is a string (or at least often is?  I'm not sure.), and
“is” comparisons on strings are often not safe.  Knowing you, you've probably
done this deliberately and carefully, but a comment explaining why this is ok
would be good, because it's not obvious to me.  Maybe it's because index is
sometimes not a string?  (what is it?)

> +                yield self.data[start:end]
> +                continue
> +            # we have been asked to thunk. This thunking only occurs when
> +            # we are obtaining plain texts from an annotated backing knit
> +            # so that _copy_texts will work.
> +            # We could improve performance here by scanning for where we need
> +            # to do this and using get_line_list, then interleaving the output
> +            # as desired. However, for now, this is sufficient.
> +            if (index[:6] != 'thunk:' or
> +                self.orig_factory.__class__ != KnitPlainFactory):
> +                raise errors.KnitCorrupt(self, 'Bad thunk request %r' % index)
> +            version_id = index[6:]
> +            lines = self.backing_knit.get_lines(version_id)
> +            line_bytes = ''.join(lines)
> +            digest = sha_string(line_bytes)
> +            if lines:
> +                if lines[-1][-1] != '\n':
> +                    lines[-1] = lines[-1] + '\n'
> +                    line_bytes += '\n'

Why is it necessary to make sure the final newline exists?

[...]
> +class _StreamIndex(object):

> +
> +    def get_method(self, version_id):
> +        """Return compression method of specified version."""
> +        try:
> +            options = self._by_version[version_id][0]
> +        except KeyError:
> +            # Strictly speaking this should checkin in the backing knit, but

“checkin in” is a typo I think.

> +    def get_options(self, version_id):
> +        """Return a string representing options.
> +
> +        e.g. ['foo', 'bar']
> +        """
> +        return self._by_version[version_id][0]

Does this return a string or a list?  The docstring is unclear.

> +    def get_position(self, version_id):
> +        """Return details needed to access the version.
> +        
> +        _StreamAccess has the data as a big array, so we return slice
> +        coordinates into that (as index_memo's are opaque outside the 
> +        index and matching access class.
> +
> +        :return: a tuple (None, start, end).
> +        """

There's an unclosed parenthesis in that docstring.

> +    def iter_parents(self, version_ids):
> +        """Iterate through the parents for many version ids.
> +
> +        :param version_ids: An iterable yielding version_ids.
> +        :return: An iterator that yields (version_id, parents). Requested 
> +            version_ids not present in the versioned file are simply skipped.
> +            The order is undefined, allowing for different optimisations in
> +            the underlying implementation.
> +        """
> +        result = []
> +        for version in version_ids:
> +            try:
> +                result.append((version, self._by_version[version][2]))
> +            except KeyError:
> +                pass
> +        return result

I'm slightly surprised that this isn't a generator, although this way obviously
satisfies the contract adequately.

> === modified file 'bzrlib/tests/test_knit.py'

> +    def assertKnitValuesEqual(self, left, right):
> +        """Assert that the texts and graph of left and right are the same."""

It also asserts that the annotations are the same.

> -        It will raise KnitDataStreamIncompatible.
> +        It will raise KnitDataStreamUnknown because the fallback code will fail
> +        to make a knit. In future we may need KnitDataStreamIncompatible again,
> +        for more exotic cases.
>          """

Ah, I guess that answers my earlier query about deprecating
KnitDataStreamIncompatible.  Perhaps this comment should be repeated in
errors.py?

-Andrew.