Rev 3055: Handle insert_data_stream of an unannotated stream into an annotated knit. in http://people.ubuntu.com/~robertc/baz2.0/knit.datastreamjoin
Robert Collins
robertc at robertcollins.net
Fri Nov 30 03:07:15 GMT 2007
At http://people.ubuntu.com/~robertc/baz2.0/knit.datastreamjoin
------------------------------------------------------------
revno: 3055
revision-id:robertc at robertcollins.net-20071130030701-r0wm01t0a8qx29gk
parent: robertc at robertcollins.net-20071130005436-0qmx32hyti0jz0y6
committer: Robert Collins <robertc at robertcollins.net>
branch nick: knit.datastreamjoin
timestamp: Fri 2007-11-30 14:07:01 +1100
message:
Handle insert_data_stream of an unannotated stream into an annotated knit.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
bzrlib/tests/test_knit.py test_knit.py-20051212171302-95d4c00dd5f11f2b
=== modified file 'NEWS'
--- a/NEWS 2007-11-30 00:54:36 +0000
+++ b/NEWS 2007-11-30 03:07:01 +0000
@@ -150,8 +150,8 @@
(Robert Collins, #153789)
* Operations pulling data from a smart server where the underlying
- repositories have are respectively annotated an unannotated will now work.
- (Robert Collins, #165106).
+ repositories are not both annotated/both unannotated will now work.
+ (Robert Collins, #165304).
* Reconcile now shows progress bars. (Robert Collins, #159351)
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py 2007-11-30 00:54:36 +0000
+++ b/bzrlib/knit.py 2007-11-30 03:07:01 +0000
@@ -805,8 +805,8 @@
factory = KnitAnnotateFactory()
else:
raise errors.KnitDataStreamUnknown(format)
- access = _StreamAccess(reader_callable)
index = _StreamIndex(data_list)
+ access = _StreamAccess(reader_callable, index, self, factory)
return KnitVersionedFile(self.filename, self.transport,
factory=factory, index=index, access_method=access)
@@ -2046,16 +2046,36 @@
class _StreamAccess(object):
- """A Knit Access object that provides data from a datastream."""
-
- def __init__(self, reader_callable):
+ """A Knit Access object that provides data from a datastream.
+
+ It also provides a fallback to present as unannotated data, annotated data
+ from a *backing* access object.
+
+ This is triggered by an index_memo which is pointing to a different index
+ than this was constructed with, and is used to allow extracting full
+ unannotated texts for insertion into annotated knits.
+ """
+
+ def __init__(self, reader_callable, stream_index, backing_knit,
+ orig_factory):
"""Create a _StreamAccess object.
:param reader_callable: The reader_callable from the datastream.
This is called to buffer all the data immediately, for
random access.
+ :param stream_index: The index of the data stream this provides access
+ to, which will be present in native index_memos.
+ :param backing_knit: The knit object that will provide access to
+ annotated texts which are not available in the stream, so as to
+ create unannotated texts.
+ :param orig_factory: The original content factory used to generate the
+ stream. This is used for checking whether the thunk code for
+ supporting _copy_texts will generate the correct form of data.
"""
self.data = reader_callable(None)
+ self.stream_index = stream_index
+ self.backing_knit = backing_knit
+ self.orig_factory = orig_factory
def get_raw_records(self, memos_for_retrieval):
"""Get the raw bytes for a records.
@@ -2066,8 +2086,39 @@
:return: An iterator over the bytes of the records.
"""
# use a generator for memory friendliness
- for _, start, end in memos_for_retrieval:
- yield self.data[start:end]
+ for index, start, end in memos_for_retrieval:
+ if index is self.stream_index:
+ yield self.data[start:end]
+ continue
+ # we have been asked to thunk. This thunking only occurs when
+ # we are obtaining plain texts from an annotated backing knit
+ # so that _copy_texts will work.
+ # We could improve performance here by scanning for where we need
+ # to do this and using get_line_list, then interleaving the output
+ # as desired. However, for now, this is sufficient.
+ if (index[:6] != 'thunk:' or
+ self.orig_factory.__class__ != KnitPlainFactory):
+ raise errors.KnitCorrupt(self, 'Bad thunk request %r' % index)
+ version_id = index[6:]
+ lines = self.backing_knit.get_lines(version_id)
+ line_bytes = ''.join(lines)
+ digest = sha_string(line_bytes)
+ if lines:
+ if lines[-1][-1] != '\n':
+ lines[-1] = lines[-1] + '\n'
+ line_bytes += '\n'
+ orig_options = list(self.backing_knit._index.get_options(version_id))
+ if 'fulltext' not in orig_options:
+ if 'line-delta' not in orig_options:
+ raise errors.KnitCorrupt(self,
+ 'Unknown compression method %r' % orig_options)
+ orig_options.remove('line-delta')
+ orig_options.append('fulltext')
+ # We want plain data, because we expect to thunk only to allow text
+ # extraction.
+ size, bytes = self.backing_knit._data._record_to_data(version_id,
+ digest, lines, line_bytes)
+ yield bytes
class _StreamIndex(object):
@@ -2108,6 +2159,21 @@
graph[version] = parents
return graph.keys()
+ def get_method(self, version_id):
+ """Return compression method of specified version."""
+ try:
+ options = self._by_version[version_id][0]
+ except KeyError:
+ # Strictly speaking this should check in the backing knit, but
+ # until we have a test to discriminate, this will do.
+ return 'fulltext'
+ if 'fulltext' in options:
+ return 'fulltext'
+ elif 'line-delta' in options:
+ return 'line-delta'
+ else:
+ raise errors.KnitIndexUnknownMethod(self, options)
+
def get_options(self, version_id):
"""Return a string representing options.
@@ -2128,8 +2194,12 @@
:return: a tuple (None, start, end).
"""
- start, end = self._by_version[version_id][1]
- return None, start, end
+ try:
+ start, end = self._by_version[version_id][1]
+ return self, start, end
+ except KeyError:
+ # Signal to the access object to handle this from the backing knit.
+ return ('thunk:%s' % version_id, None, None)
def get_versions(self):
"""Get all the versions in the stream."""
@@ -2144,8 +2214,13 @@
The order is undefined, allowing for different optimisations in
the underlying implementation.
"""
- return [(version, self._by_version[version][2]) for
- version in version_ids]
+ result = []
+ for version in version_ids:
+ try:
+ result.append((version, self._by_version[version][2]))
+ except KeyError:
+ pass
+ return result
class _KnitData(object):
@@ -2408,7 +2483,7 @@
for index, version in enumerate(to_process):
pb.update('Converting versioned data', index, total)
sha1, num_bytes, parent_text = self.target.add_lines(version,
- self.source.get_parents(version),
+ self.source.get_parents_with_ghosts(version),
self.source.get_lines(version),
parent_texts=parent_cache)
parent_cache[version] = parent_text
=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py 2007-11-30 00:54:36 +0000
+++ b/bzrlib/tests/test_knit.py 2007-11-30 03:07:01 +0000
@@ -61,6 +61,7 @@
)
from bzrlib.transport import get_transport
from bzrlib.transport.memory import MemoryTransport
+from bzrlib.tuned_gzip import GzipFile
from bzrlib.util import bencode
from bzrlib.weave import Weave
@@ -1830,6 +1831,19 @@
target.insert_data_stream(source.get_data_stream(['text-b']))
self.assertKnitValuesEqual(source, target)
+ def test_insert_data_stream_unannotated_annotated(self):
+ """Inserting an unannotated datastream to an annotated knit works."""
+ # case one - full texts.
+ source = self.make_test_knit(name='source', annotate=False)
+ target = self.make_test_knit(name='target', annotate=True)
+ source.add_lines('text-a', [], split_lines(TEXT_1))
+ target.insert_data_stream(source.get_data_stream(['text-a']))
+ self.assertKnitValuesEqual(source, target)
+ # case two - deltas.
+ source.add_lines('text-b', ['text-a'], split_lines(TEXT_2))
+ target.insert_data_stream(source.get_data_stream(['text-b']))
+ self.assertKnitValuesEqual(source, target)
+
def test_insert_data_stream_records_already_present(self):
"""Insert a data stream where some records are alreday present in the
target, and some not. Only the new records are inserted.
@@ -1943,6 +1957,10 @@
self.assertEqual(a_knit._data._access.data, expected_data)
self.assertEqual(a_knit.filename, target_knit.filename)
self.assertEqual(a_knit.transport, target_knit.transport)
+ self.assertEqual(a_knit._index, a_knit._data._access.stream_index)
+ self.assertEqual(target_knit, a_knit._data._access.backing_knit)
+ self.assertIsInstance(a_knit._data._access.orig_factory,
+ source_knit.factory.__class__)
def test__knit_from_data_stream_empty(self):
"""Create a knit object from a datastream."""
@@ -2830,10 +2848,14 @@
index = self.get_index(knit, knit.get_data_stream(versions))
self.assertEqual(result, index.get_ancestry(ancestry_versions, False))
- def assertIterParents(self, knit, versions, result):
+ def assertIterParents(self, knit, versions, parent_versions, result):
"""Check the result of an iter_parents call on knit."""
index = self.get_index(knit, knit.get_data_stream(versions))
- self.assertEqual(result, index.iter_parents(versions))
+ self.assertEqual(result, index.iter_parents(parent_versions))
+
+ def assertGetMethod(self, knit, versions, version, result):
+ index = self.get_index(knit, knit.get_data_stream(versions))
+ self.assertEqual(result, index.get_method(version))
def assertGetOptions(self, knit, version, options):
index = self.get_index(knit, knit.get_data_stream(version))
@@ -2843,6 +2865,12 @@
index = self.get_index(knit, knit.get_data_stream(versions))
self.assertEqual(result, index.get_position(version))
+ def assertGetPosition(self, knit, versions, version, result):
+ index = self.get_index(knit, knit.get_data_stream(versions))
+ if result[0] is None:
+ result = (index, result[1], result[2])
+ self.assertEqual(result, index.get_position(version))
+
def assertGetParentsWithGhosts(self, knit, versions, version, parents):
index = self.get_index(knit, knit.get_data_stream(versions))
self.assertEqual(parents, index.get_parents_with_ghosts(version))
@@ -2890,14 +2918,27 @@
self.assertIndexAncestry(knit, ['c', 'd'], ['a', 'b', 'c', 'd'],
{'a':[], 'b':[], 'c':[], 'd':[]}.keys())
+ def test_get_method(self):
+ knit = self.make_knit_with_4_versions_2_dags()
+ self.assertGetMethod(knit, ['a'], 'a', 'fulltext')
+ self.assertGetMethod(knit, ['c'], 'c', 'line-delta')
+ # get_method on a basis that is not in the datastream (but in the
+ # backing knit) returns 'fulltext', because that's what we'll create as
+ # we thunk across.
+ self.assertGetMethod(knit, ['c'], 'b', 'fulltext')
+
def test_iter_parents(self):
knit = self.make_knit_with_4_versions_2_dags()
- self.assertIterParents(knit, ['a'], [('a', [])])
- self.assertIterParents(knit, ['a', 'b'], [('a', []), ('b', [])])
- self.assertIterParents(knit, ['a', 'b', 'c'],
+ self.assertIterParents(knit, ['a'], ['a'], [('a', [])])
+ self.assertIterParents(knit, ['a', 'b'], ['a', 'b'],
+ [('a', []), ('b', [])])
+ self.assertIterParents(knit, ['a', 'b', 'c'], ['a', 'b', 'c'],
[('a', []), ('b', []), ('c', ['b', 'a'])])
self.assertIterParents(knit, ['a', 'b', 'c', 'd'],
+ ['a', 'b', 'c', 'd'],
[('a', []), ('b', []), ('c', ['b', 'a']), ('d', ['e', 'f'])])
+ self.assertIterParents(knit, ['c'], ['a', 'b', 'c'],
+ [('c', ['b', 'a'])])
def test_get_options(self):
knit = self.make_knit_with_4_versions_2_dags()
@@ -2915,9 +2956,13 @@
# _StreamAccess to use.
self.assertGetPosition(knit, ['a'], 'a', (None, 0, 78))
self.assertGetPosition(knit, ['a', 'c'], 'c', (None, 78, 156))
-
-
-class Test_StreamIndex(KnitTests):
+ # get_position on a text that is not in the datastream (but in the
+ # backing knit) returns ('thunk:versionid', None, None) - and then the
+ # access object can construct the relevant data as needed.
+ self.assertGetPosition(knit, ['a', 'c'], 'b', ('thunk:b', None, None))
+
+
+class Test_StreamAccess(KnitTests):
def get_index_access(self, knit, stream):
"""Get a _StreamAccess from knit and stream."""
@@ -2961,3 +3006,27 @@
self.assertGetRawRecords(knit, ['a', 'b'])
self.assertGetRawRecords(knit, ['a'])
self.assertGetRawRecords(knit, ['b'])
+
+ def test_get_raw_record_from_backing_knit(self):
+ # the thunk layer should create an artificial A on-demand when needed.
+ source_knit = self.make_test_knit(name='plain', annotate=False)
+ target_knit = self.make_test_knit(name='annotated', annotate=True)
+ source_knit.add_lines("A", [], ["Foo\n"])
+ # Give the target A, so we can try to thunk across to it.
+ target_knit.join(source_knit)
+ index, access = self.get_index_access(target_knit,
+ source_knit.get_data_stream([]))
+ raw_data = list(access.get_raw_records([("thunk:A", None, None)]))[0]
+ df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
+ self.assertEqual(
+ 'version A 1 5d36b88bb697a2d778f024048bafabd443d74503\n'
+ 'Foo\nend A\n',
+ df.read())
+
+ def test_asking_for_thunk_stream_is_not_plain_errors(self):
+ knit = self.make_test_knit(name='annotated', annotate=True)
+ knit.add_lines("A", [], ["Foo\n"])
+ index, access = self.get_index_access(knit,
+ knit.get_data_stream([]))
+ self.assertRaises(errors.KnitCorrupt,
+ list, access.get_raw_records([("thunk:A", None, None)]))
More information about the bazaar-commits
mailing list