Rev 4099: Handle inconsistent inventory data more gracefully at a small performance cost during fetch. in http://people.ubuntu.com/~robertc/baz2.0/fetch

Tue Mar 10 07:47:34 GMT 2009

At http://people.ubuntu.com/~robertc/baz2.0/fetch

------------------------------------------------------------
revno: 4099
revision-id: robertc at robertcollins.net-20090310074723-jgctuly1ziw23r7e
parent: pqm at pqm.ubuntu.com-20090309084556-9i2m12qlud2qcrtw
committer: Robert Collins <robertc at robertcollins.net>
branch nick: fetch
timestamp: Tue 2009-03-10 18:47:23 +1100
message:
  Handle inconsistent inventory data more gracefully at a small performance cost during fetch.
=== modified file 'bzrlib/repository.py'

--- a/bzrlib/repository.py	2009-03-06 10:01:37 +0000
+++ b/bzrlib/repository.py	2009-03-10 07:47:23 +0000
@@ -1453,6 +1453,26 @@
                 result[key] = True
         return result
 
+    def _inventory_xml_lines_for_keys(self, keys):
+        """Get a line iterator of the sort needed for finding references.
+
+        Not relevant for non-xml inventory repositories.
+
+        Ghosts in keys are ignored.
+
+        :param keys: The revision keys for the inventories to inspect.
+        :return: An iterator over (inventory line, revid) for the fulltexts of
+            all of the xml inventories specified by keys.
+        """
+        stream = self.inventories.get_record_stream(keys, 'unordered', True)
+        for record in stream:
+            if record.storage_kind != 'absent':
+                chunks = record.get_bytes_as('chunked')
+                revid = record.key[-1]
+                lines = osutils.chunks_to_lines(chunks)
+                for line in lines:
+                    yield line, revid
+
     def _find_file_ids_from_xml_inventory_lines(self, line_iterator,
         revision_ids):
         """Helper routine for fileids_altered_by_revision_ids.
@@ -1468,15 +1488,20 @@
         revision_ids. Each altered file-ids has the exact revision_ids that
         altered it listed explicitly.
         """
+        seen = set(self._find_text_key_references_from_xml_inventory_lines(
+                line_iterator).iterkeys())
+        # Note that revision_ids are revision keys.
+        parent_maps = self.revisions.get_parent_map(revision_ids)
+        parents = set()
+        map(parents.update, parent_maps.itervalues())
+        parents.difference_update(revision_ids)
+        parent_seen = set(self._find_text_key_references_from_xml_inventory_lines(
+            self._inventory_xml_lines_for_keys(parents)))
+        new_keys = seen - parent_seen
         result = {}
         setdefault = result.setdefault
-        for key in \
-            self._find_text_key_references_from_xml_inventory_lines(
-                line_iterator).iterkeys():
-            # once data is all ensured-consistent; then this is
-            # if revision_id == version_id
-            if key[-1:] in revision_ids:
-                setdefault(key[0], set()).add(key[-1])
+        for key in new_keys:
+            setdefault(key[0], set()).add(key[-1])
         return result
 
     def fileids_altered_by_revision_ids(self, revision_ids, _inv_weave=None):
@@ -3165,10 +3190,7 @@
                         # We don't copy the text for the root node unless the
                         # target supports_rich_root.
                         continue
-                    # TODO: Do we need:
-                    #       "if entry.revision == current_revision_id" ?
-                    if entry.revision == current_revision_id:
-                        text_keys.add((file_id, entry.revision))
+                    text_keys.add((file_id, entry.revision))
             revision = self.source.get_revision(current_revision_id)
             pending_deltas.append((basis_id, delta,
                 current_revision_id, revision.parent_ids))

=== modified file 'bzrlib/tests/interrepository_implementations/test_fetch.py'
--- a/bzrlib/tests/interrepository_implementations/test_fetch.py	2009-01-17 01:30:58 +0000
+++ b/bzrlib/tests/interrepository_implementations/test_fetch.py	2009-03-10 07:47:23 +0000
@@ -20,8 +20,10 @@
 import bzrlib
 from bzrlib import (
     errors,
+    inventory,
+    osutils,
     repository,
-    osutils,
+    versionedfile,
     )
 from bzrlib.errors import (
     NoSuchRevision,
@@ -73,6 +75,56 @@
         repo_b = self.make_to_repository('b')
         check_push_rev1(repo_b)
 
+    def test_fetch_inconsistent_last_changed_entries(self):
+        """If an inventory has odd data we should still get what it references.
+        
+        This test tests that we do fetch a file text created in a revision not
+        being fetched, but referenced from the revision we are fetching when the
+        adjacent revisions to the one being fetched do not reference that text.
+        """
+        tree = self.make_branch_and_tree('source')
+        revid = tree.commit('old')
+        to_repo = self.make_to_repository('to_repo')
+        to_repo.fetch(tree.branch.repository, revid)
+        # Make a broken revision and fetch it.
+        source = tree.branch.repository
+        source.lock_write()
+        self.addCleanup(source.unlock)
+        source.start_write_group()
+        try:
+            # We need two revisions: OLD and NEW. NEW will claim to need a file
+            # 'FOO' changed in 'OLD'. OLD will not have that file at all.
+            source.texts.insert_record_stream([
+                versionedfile.FulltextContentFactory(('foo', revid), (), None,
+                'contents')])
+            basis = source.revision_tree(revid)
+            parent_id = basis.path2id('')
+            entry = inventory.make_entry('file', 'foo-path', parent_id, 'foo')
+            entry.revision = revid
+            entry.text_size = len('contents')
+            entry.text_sha1 = osutils.sha_string('contents')
+            inv_sha1, _ = source.add_inventory_by_delta(revid, [
+                (None, 'foo-path', 'foo', entry)], 'new', [revid])
+            rev = Revision(timestamp=0,
+                           timezone=None,
+                           committer="Foo Bar <foo at example.com>",
+                           message="Message",
+                           inventory_sha1=inv_sha1,
+                           revision_id='new',
+                           parent_ids=[revid])
+            source.add_revision(rev.revision_id, rev)
+        except:
+            source.abort_write_group()
+            raise
+        else:
+            source.commit_write_group()
+        to_repo.fetch(source, 'new')
+        to_repo.lock_read()
+        self.addCleanup(to_repo.unlock)
+        self.assertEqual('contents',
+            to_repo.texts.get_record_stream([('foo', revid)],
+            'unordered', True).next().get_bytes_as('fulltext'))
+
     def test_fetch_missing_basis_text(self):
         """If fetching a delta, we should die if a basis is not present."""
         tree = self.make_branch_and_tree('tree')