Rev 3055: Handle insert_data_stream of an unannotated stream into an annotated knit. in http://people.ubuntu.com/~robertc/baz2.0/knit.datastreamjoin

Fri Nov 30 03:07:15 GMT 2007

At http://people.ubuntu.com/~robertc/baz2.0/knit.datastreamjoin

------------------------------------------------------------
revno: 3055
revision-id:robertc at robertcollins.net-20071130030701-r0wm01t0a8qx29gk
parent: robertc at robertcollins.net-20071130005436-0qmx32hyti0jz0y6
committer: Robert Collins <robertc at robertcollins.net>
branch nick: knit.datastreamjoin
timestamp: Fri 2007-11-30 14:07:01 +1100
message:
  Handle insert_data_stream of an unannotated stream into an annotated knit.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
  bzrlib/tests/test_knit.py      test_knit.py-20051212171302-95d4c00dd5f11f2b
=== modified file 'NEWS'

--- a/NEWS	2007-11-30 00:54:36 +0000
+++ b/NEWS	2007-11-30 03:07:01 +0000
@@ -150,8 +150,8 @@
      (Robert Collins, #153789)
 
    * Operations pulling data from a smart server where the underlying
-     repositories have are respectively annotated an unannotated will now work.
-     (Robert Collins, #165106).
+     repositories are not both annotated/both unannotated will now work.
+     (Robert Collins, #165304).
 
    * Reconcile now shows progress bars. (Robert Collins, #159351)
 

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2007-11-30 00:54:36 +0000
+++ b/bzrlib/knit.py	2007-11-30 03:07:01 +0000
@@ -805,8 +805,8 @@
             factory = KnitAnnotateFactory()
         else:
             raise errors.KnitDataStreamUnknown(format)
-        access = _StreamAccess(reader_callable)
         index = _StreamIndex(data_list)
+        access = _StreamAccess(reader_callable, index, self, factory)
         return KnitVersionedFile(self.filename, self.transport,
             factory=factory, index=index, access_method=access)
 
@@ -2046,16 +2046,36 @@
 
 
 class _StreamAccess(object):
-    """A Knit Access object that provides data from a datastream."""
-
-    def __init__(self, reader_callable):
+    """A Knit Access object that provides data from a datastream.
+    
+    It also provides a fallback which presents annotated data from a *backing*
+    access object as unannotated data.
+
+    This is triggered by an index_memo which is pointing to a different index
+    than this was constructed with, and is used to allow extracting full
+    unannotated texts for insertion into annotated knits.
+    """
+
+    def __init__(self, reader_callable, stream_index, backing_knit,
+        orig_factory):
         """Create a _StreamAccess object.
 
         :param reader_callable: The reader_callable from the datastream.
             This is called to buffer all the data immediately, for 
             random access.
+        :param stream_index: The index of the data stream this provides access
+            to, which will be present in native index_memo's.
+        :param backing_knit: The knit object that will provide access to 
+            annotated texts which are not available in the stream, so as to
+            create unannotated texts.
+        :param orig_factory: The original content factory used to generate the
+            stream. This is used for checking whether the thunk code for
+            supporting _copy_texts will generate the correct form of data.
         """
         self.data = reader_callable(None)
+        self.stream_index = stream_index
+        self.backing_knit = backing_knit
+        self.orig_factory = orig_factory
 
     def get_raw_records(self, memos_for_retrieval):
         """Get the raw bytes for a records.
@@ -2066,8 +2086,39 @@
         :return: An iterator over the bytes of the records.
         """
         # use a generator for memory friendliness
-        for _, start, end in memos_for_retrieval:
-            yield self.data[start:end]
+        for index, start, end in memos_for_retrieval:
+            if index is self.stream_index:
+                yield self.data[start:end]
+                continue
+            # we have been asked to thunk. This thunking only occurs when
+            # we are obtaining plain texts from an annotated backing knit
+            # so that _copy_texts will work.
+            # We could improve performance here by scanning for where we need
+            # to do this and using get_line_list, then interleaving the output
+            # as desired. However, for now, this is sufficient.
+            if (index[:6] != 'thunk:' or
+                self.orig_factory.__class__ != KnitPlainFactory):
+                raise errors.KnitCorrupt(self, 'Bad thunk request %r' % index)
+            version_id = index[6:]
+            lines = self.backing_knit.get_lines(version_id)
+            line_bytes = ''.join(lines)
+            digest = sha_string(line_bytes)
+            if lines:
+                if lines[-1][-1] != '\n':
+                    lines[-1] = lines[-1] + '\n'
+                    line_bytes += '\n'
+            orig_options = list(self.backing_knit._index.get_options(version_id))
+            if 'fulltext' not in orig_options:
+                if 'line-delta' not in orig_options:
+                    raise errors.KnitCorrupt(self,
+                        'Unknown compression method %r' % orig_options)
+                orig_options.remove('line-delta')
+                orig_options.append('fulltext')
+            # We want plain data, because we expect to thunk only to allow text
+            # extraction.
+            size, bytes = self.backing_knit._data._record_to_data(version_id,
+                digest, lines, line_bytes)
+            yield bytes
 
 
 class _StreamIndex(object):
@@ -2108,6 +2159,21 @@
             graph[version] = parents
         return graph.keys()
 
+    def get_method(self, version_id):
+        """Return compression method of specified version."""
+        try:
+            options = self._by_version[version_id][0]
+        except KeyError:
+            # Strictly speaking this should check in the backing knit, but
+            # until we have a test to discriminate, this will do.
+            return 'fulltext'
+        if 'fulltext' in options:
+            return 'fulltext'
+        elif 'line-delta' in options:
+            return 'line-delta'
+        else:
+            raise errors.KnitIndexUnknownMethod(self, options)
+
     def get_options(self, version_id):
         """Return a string representing options.
 
@@ -2128,8 +2194,12 @@
 
         :return: a tuple (None, start, end).
         """
-        start, end = self._by_version[version_id][1]
-        return None, start, end
+        try:
+            start, end = self._by_version[version_id][1]
+            return self, start, end
+        except KeyError:
+            # Signal to the access object to handle this from the backing knit.
+            return ('thunk:%s' % version_id, None, None)
 
     def get_versions(self):
         """Get all the versions in the stream."""
@@ -2144,8 +2214,13 @@
             The order is undefined, allowing for different optimisations in
             the underlying implementation.
         """
-        return [(version, self._by_version[version][2]) for 
-            version in version_ids]
+        result = []
+        for version in version_ids:
+            try:
+                result.append((version, self._by_version[version][2]))
+            except KeyError:
+                pass
+        return result
 
 
 class _KnitData(object):
@@ -2408,7 +2483,7 @@
             for index, version in enumerate(to_process):
                 pb.update('Converting versioned data', index, total)
                 sha1, num_bytes, parent_text = self.target.add_lines(version,
-                    self.source.get_parents(version),
+                    self.source.get_parents_with_ghosts(version),
                     self.source.get_lines(version),
                     parent_texts=parent_cache)
                 parent_cache[version] = parent_text

=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py	2007-11-30 00:54:36 +0000
+++ b/bzrlib/tests/test_knit.py	2007-11-30 03:07:01 +0000
@@ -61,6 +61,7 @@
     )
 from bzrlib.transport import get_transport
 from bzrlib.transport.memory import MemoryTransport
+from bzrlib.tuned_gzip import GzipFile
 from bzrlib.util import bencode
 from bzrlib.weave import Weave
 
@@ -1830,6 +1831,19 @@
         target.insert_data_stream(source.get_data_stream(['text-b']))
         self.assertKnitValuesEqual(source, target)
 
+    def test_insert_data_stream_unannotated_annotated(self):
+        """Inserting an unannotated datastream to an annotated knit works."""
+        # case one - full texts.
+        source = self.make_test_knit(name='source', annotate=False)
+        target = self.make_test_knit(name='target', annotate=True)
+        source.add_lines('text-a', [], split_lines(TEXT_1))
+        target.insert_data_stream(source.get_data_stream(['text-a']))
+        self.assertKnitValuesEqual(source, target)
+        # case two - deltas.
+        source.add_lines('text-b', ['text-a'], split_lines(TEXT_2))
+        target.insert_data_stream(source.get_data_stream(['text-b']))
+        self.assertKnitValuesEqual(source, target)
+
     def test_insert_data_stream_records_already_present(self):
         """Insert a data stream where some records are alreday present in the
         target, and some not.  Only the new records are inserted.
@@ -1943,6 +1957,10 @@
         self.assertEqual(a_knit._data._access.data, expected_data)
         self.assertEqual(a_knit.filename, target_knit.filename)
         self.assertEqual(a_knit.transport, target_knit.transport)
+        self.assertEqual(a_knit._index, a_knit._data._access.stream_index)
+        self.assertEqual(target_knit, a_knit._data._access.backing_knit)
+        self.assertIsInstance(a_knit._data._access.orig_factory,
+            source_knit.factory.__class__)
 
     def test__knit_from_data_stream_empty(self):
         """Create a knit object from a datastream."""
@@ -2830,10 +2848,14 @@
         index = self.get_index(knit, knit.get_data_stream(versions))
         self.assertEqual(result, index.get_ancestry(ancestry_versions, False))
 
-    def assertIterParents(self, knit, versions, result):
+    def assertIterParents(self, knit, versions, parent_versions, result):
         """Check the result of an iter_parents call on knit."""
         index = self.get_index(knit, knit.get_data_stream(versions))
-        self.assertEqual(result, index.iter_parents(versions))
+        self.assertEqual(result, index.iter_parents(parent_versions))
+
+    def assertGetMethod(self, knit, versions, version, result):
+        index = self.get_index(knit, knit.get_data_stream(versions))
+        self.assertEqual(result, index.get_method(version))
 
     def assertGetOptions(self, knit, version, options):
         index = self.get_index(knit, knit.get_data_stream(version))
@@ -2843,6 +2865,12 @@
         index = self.get_index(knit, knit.get_data_stream(versions))
         self.assertEqual(result, index.get_position(version))
 
+    def assertGetPosition(self, knit, versions, version, result):
+        index = self.get_index(knit, knit.get_data_stream(versions))
+        if result[0] is None:
+            result = (index, result[1], result[2])
+        self.assertEqual(result, index.get_position(version))
+
     def assertGetParentsWithGhosts(self, knit, versions, version, parents):
         index = self.get_index(knit, knit.get_data_stream(versions))
         self.assertEqual(parents, index.get_parents_with_ghosts(version))
@@ -2890,14 +2918,27 @@
         self.assertIndexAncestry(knit, ['c', 'd'], ['a', 'b', 'c', 'd'],
             {'a':[], 'b':[], 'c':[], 'd':[]}.keys())
 
+    def test_get_method(self):
+        knit = self.make_knit_with_4_versions_2_dags()
+        self.assertGetMethod(knit, ['a'], 'a', 'fulltext')
+        self.assertGetMethod(knit, ['c'], 'c', 'line-delta')
+        # get_method on a basis that is not in the datastream (but in the
+        # backing knit) returns 'fulltext', because that's what we'll create as
+        # we thunk across.
+        self.assertGetMethod(knit, ['c'], 'b', 'fulltext')
+
     def test_iter_parents(self):
         knit = self.make_knit_with_4_versions_2_dags()
-        self.assertIterParents(knit, ['a'], [('a', [])])
-        self.assertIterParents(knit, ['a', 'b'], [('a', []), ('b', [])])
-        self.assertIterParents(knit, ['a', 'b', 'c'],
+        self.assertIterParents(knit, ['a'], ['a'], [('a', [])])
+        self.assertIterParents(knit, ['a', 'b'], ['a', 'b'],
+            [('a', []), ('b', [])])
+        self.assertIterParents(knit, ['a', 'b', 'c'], ['a', 'b', 'c'],
             [('a', []), ('b', []), ('c', ['b', 'a'])])
         self.assertIterParents(knit, ['a', 'b', 'c', 'd'],
+            ['a', 'b', 'c', 'd'],
             [('a', []), ('b', []), ('c', ['b', 'a']), ('d', ['e', 'f'])])
+        self.assertIterParents(knit, ['c'], ['a', 'b', 'c'],
+            [('c', ['b', 'a'])])
 
     def test_get_options(self):
         knit = self.make_knit_with_4_versions_2_dags()
@@ -2915,9 +2956,13 @@
         # _StreamAccess to use.
         self.assertGetPosition(knit, ['a'], 'a', (None, 0, 78))
         self.assertGetPosition(knit, ['a', 'c'], 'c', (None, 78, 156))
-
-
-class Test_StreamIndex(KnitTests):
+        # get_position on a text that is not in the datastream (but in the
+        # backing knit) returns ('thunk:versionid', None, None) - and then the
+        # access object can construct the relevant data as needed.
+        self.assertGetPosition(knit, ['a', 'c'], 'b', ('thunk:b', None, None))
+
+
+class Test_StreamAccess(KnitTests):
 
     def get_index_access(self, knit, stream):
         """Get a _StreamAccess from knit and stream."""
@@ -2961,3 +3006,27 @@
         self.assertGetRawRecords(knit, ['a', 'b'])
         self.assertGetRawRecords(knit, ['a'])
         self.assertGetRawRecords(knit, ['b'])
+    
+    def test_get_raw_record_from_backing_knit(self):
+        # the thunk layer should create an artificial A on-demand when needed.
+        source_knit = self.make_test_knit(name='plain', annotate=False)
+        target_knit = self.make_test_knit(name='annotated', annotate=True)
+        source_knit.add_lines("A", [], ["Foo\n"])
+        # Give the target A, so we can try to thunk across to it.
+        target_knit.join(source_knit)
+        index, access = self.get_index_access(target_knit,
+            source_knit.get_data_stream([]))
+        raw_data = list(access.get_raw_records([("thunk:A", None, None)]))[0]
+        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
+        self.assertEqual(
+            'version A 1 5d36b88bb697a2d778f024048bafabd443d74503\n'
+            'Foo\nend A\n',
+            df.read())
+
+    def test_asking_for_thunk_stream_is_not_plain_errors(self):
+        knit = self.make_test_knit(name='annotated', annotate=True)
+        knit.add_lines("A", [], ["Foo\n"])
+        index, access = self.get_index_access(knit,
+            knit.get_data_stream([]))
+        self.assertRaises(errors.KnitCorrupt,
+            list, access.get_raw_records([("thunk:A", None, None)]))