Rev 3384: get_record_stream for fulltexts working (but note extreme memory use!). in http://people.ubuntu.com/~robertc/baz2.0/shallow-branch

Robert Collins robertc at robertcollins.net
Mon Jun 23 02:13:25 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/shallow-branch

------------------------------------------------------------
revno: 3384
revision-id: robertc at robertcollins.net-20080623011320-c5952pwuvi9m6hpp
parent: robertc at robertcollins.net-20080623001926-8vos9qr0xgbm74wu
committer: Robert Collins <robertc at robertcollins.net>
branch nick: stacking-knits
timestamp: Mon 2008-06-23 11:13:20 +1000
message:
  get_record_stream for fulltexts working (but note extreme memory use!).
modified:
  bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
  bzrlib/tests/test_knit.py      test_knit.py-20051212171302-95d4c00dd5f11f2b
  bzrlib/versionedfile.py        versionedfile.py-20060222045106-5039c71ee3b65490
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2008-06-23 00:19:26 +0000
+++ b/bzrlib/knit.py	2008-06-23 01:13:20 +0000
@@ -964,9 +964,13 @@
         text_map, contents_map = self._get_content_maps([key])
         return contents_map[key]
 
-    def _get_content_maps(self, keys):
+    def _get_content_maps(self, keys, nonlocal_keys=None):
         """Produce maps of text and KnitContents
         
+        :param keys: The keys to produce content maps for.
+        :param nonlocal_keys: An iterable of keys (possibly intersecting keys)
+            which are known to not be in this knit, but rather in one of the
+            fallback knits.
         :return: (text_map, content_map) where text_map contains the texts for
         the requested versions and content_map contains the KnitContents.
         """
@@ -976,12 +980,32 @@
         # final output.
         keys = list(keys)
         multiple_versions = len(keys) != 1
-        record_map = self._get_record_map(keys)
+        record_map = self._get_record_map(keys, allow_missing=True)
 
         text_map = {}
         content_map = {}
         final_content = {}
+        if nonlocal_keys is None:
+            nonlocal_keys = set()
+        else:
+            nonlocal_keys = frozenset(nonlocal_keys)
+        missing_keys = set(nonlocal_keys)
+        for source in self._fallback_vfs:
+            if not missing_keys:
+                break
+            for record in source.get_record_stream(missing_keys,
+                'unordered', True):
+                if record.storage_kind == 'absent':
+                    continue
+                missing_keys.remove(record.key)
+                bytes = record.get_bytes_as('fulltext')
+                lines = split_lines(record.get_bytes_as('fulltext'))
+                text_map[record.key] = lines
+                final_content[record.key] = PlainKnitContent(lines, record.key)
         for key in keys:
+            if key in nonlocal_keys:
+                # already handled
+                continue
             components = []
             cursor = key
             while cursor is not None:
@@ -1173,7 +1197,8 @@
         if include_delta_closure:
             # XXX: get_content_maps performs its own index queries; allow state
             # to be passed in.
-            text_map, _ = self._get_content_maps(present_keys)
+            text_map, _ = self._get_content_maps(present_keys,
+                needed_from_fallback - absent_keys)
             for key in present_keys:
                 yield FulltextContentFactory(key, global_map[key], None,
                     ''.join(text_map[key]))

=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py	2008-06-23 00:19:26 +0000
+++ b/bzrlib/tests/test_knit.py	2008-06-23 01:13:20 +0000
@@ -1424,6 +1424,92 @@
         self.assertEqual([("get_parent_map", set([key_basis, key_missing]))],
             basis.calls)
 
+    def test_get_record_stream_unordered_fulltexts(self):
+        # records from the test knit are answered without asking the basis:
+        basis, test = self.get_basis_and_test_knit()
+        key = ('foo',)
+        key_basis = ('bar',)
+        key_missing = ('missing',)
+        test.add_lines(key, (), ['foo\n'])
+        records = list(test.get_record_stream([key], 'unordered', True))
+        self.assertEqual(1, len(records))
+        self.assertEqual([], basis.calls)
+        # Missing (from test knit) objects are retrieved from the basis:
+        basis.add_lines(key_basis, (), ['foo\n', 'bar\n'])
+        basis.calls = []
+        records = list(test.get_record_stream([key_basis, key_missing],
+            'unordered', True))
+        self.assertEqual(2, len(records))
+        calls = list(basis.calls)
+        for record in records:
+            self.assertSubset([record.key], (key_basis, key_missing))
+            if record.key == key_missing:
+                self.assertIsInstance(record, AbsentContentFactory)
+            else:
+                reference = list(basis.get_record_stream([key_basis],
+                    'unordered', True))[0]
+                self.assertEqual(reference.key, record.key)
+                self.assertEqual(reference.sha1, record.sha1)
+                self.assertEqual(reference.storage_kind, record.storage_kind)
+                self.assertEqual(reference.get_bytes_as(reference.storage_kind),
+                    record.get_bytes_as(record.storage_kind))
+                self.assertEqual(reference.get_bytes_as('fulltext'),
+                    record.get_bytes_as('fulltext'))
+        # It's not strictly minimal, but it seems reasonable for now for it to
+        # ask which fallbacks have which parents.
+        self.assertEqual([
+            ("get_parent_map", set([key_basis, key_missing])),
+            ("get_record_stream", [key_basis], 'unordered', True)],
+            calls)
+
+    def test_get_record_stream_ordered_fulltexts(self):
+        # ordering is preserved down into the fallback store.
+        basis, test = self.get_basis_and_test_knit()
+        key = ('foo',)
+        key_basis = ('bar',)
+        key_basis_2 = ('quux',)
+        key_missing = ('missing',)
+        test.add_lines(key, (key_basis,), ['foo\n'])
+        # Missing (from test knit) objects are retrieved from the basis:
+        basis.add_lines(key_basis, (key_basis_2,), ['foo\n', 'bar\n'])
+        basis.add_lines(key_basis_2, (), ['quux\n'])
+        basis.calls = []
+        # ask for the keys in non-topological order
+        records = list(test.get_record_stream(
+            [key, key_basis, key_missing, key_basis_2], 'topological', True))
+        self.assertEqual(4, len(records))
+        results = []
+        for record in records:
+            self.assertSubset([record.key],
+                (key_basis, key_missing, key_basis_2, key))
+            if record.key == key_missing:
+                self.assertIsInstance(record, AbsentContentFactory)
+            else:
+                results.append((record.key, record.sha1, record.storage_kind,
+                    record.get_bytes_as('fulltext')))
+        calls = list(basis.calls)
+        order = [record[0] for record in results]
+        self.assertEqual([key_basis_2, key_basis, key], order)
+        for result in results:
+            if result[0] == key:
+                source = test
+            else:
+                source = basis
+            record = source.get_record_stream([result[0]], 'unordered',
+                True).next()
+            self.assertEqual(record.key, result[0])
+            self.assertEqual(record.sha1, result[1])
+            self.assertEqual(record.storage_kind, result[2])
+            self.assertEqual(record.get_bytes_as('fulltext'), result[3])
+        # It's not strictly minimal, but it seems reasonable for now for it to
+        # ask which fallbacks have which parents.
+        self.assertEqual([
+            ("get_parent_map", set([key_basis, key_basis_2, key_missing])),
+            # unordered is asked for by the underlying worker as it still
+            # buffers everything while answering - which is a problem!
+            ("get_record_stream", [key_basis_2, key_basis], 'unordered', True)],
+            calls)
+
     def test_get_record_stream_unordered_deltas(self):
         # records from the test knit are answered without asking the basis:
         basis, test = self.get_basis_and_test_knit()

=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py	2008-06-19 21:54:06 +0000
+++ b/bzrlib/versionedfile.py	2008-06-23 01:13:20 +0000
@@ -540,7 +540,7 @@
         return self._backing_vf.get_parent_map(keys)
 
     def get_record_stream(self, keys, sort_order, include_delta_closure):
-        self.calls.append(("get_record_stream", keys, sort_order,
+        self.calls.append(("get_record_stream", list(keys), sort_order,
             include_delta_closure))
         return self._backing_vf.get_record_stream(keys, sort_order,
             include_delta_closure)




More information about the bazaar-commits mailing list