[patch] check received weaves after join

Tue Mar 21 18:02:29 GMT 2006

Fix to give better detection of repositories which are either
inconsistent or (more likely?) look inconsistent because of http
arseclownery:

 * extends the fileid_involved to indicate which versions of a 
   file are introduced by a revision, making this the base for
   new calculations
 * adds a small test for this
 * removes the unused MissingText exception
 * adds a new specific CouldntPullFileVersion exception
   (Maybe this is too specific, or should be a subclass of some
   other exception?  The strongest rule I am sure of at the moment is
   that we should throw a different class for each situation distinct 
   enough to need a different explanation.)
 * causes fetch to check that weaves or knits received contain 
   everything they should after joining, and otherwise raise this 
   exception
 * bonus addition to xml serializer, not currently used

-- 
Martin
-------------- next part --------------
=== modified file 'a/bzrlib/errors.py'

--- a/bzrlib/errors.py	
+++ b/bzrlib/errors.py	
@@ -381,6 +381,13 @@
         BzrError.__init__(self, msg)
 
 
+class CouldntPullFileVersion(BzrNewError):
+    """Could not find version {%s} of {%s} to pull"""
+    def __init__(self, revision_id, file_id):
+        self.revision_id = revision_id
+        self.file_id = file_id
+
+
 class HistoryMissing(BzrError):
     def __init__(self, branch, object_type, object_id):
         self.branch = branch
@@ -692,17 +699,6 @@
     """
 
 
-class MissingText(BzrNewError):
-    """Branch %(base)s is missing revision %(text_revision)s of %(file_id)s"""
-
-    def __init__(self, branch, text_revision, file_id):
-        BzrNewError.__init__(self)
-        self.branch = branch
-        self.base = branch.base
-        self.text_revision = text_revision
-        self.file_id = file_id
-
-
 class DuplicateKey(BzrNewError):
     """Key %(key)s is already present in map"""
 

=== modified file 'a/bzrlib/fetch.py'
--- a/bzrlib/fetch.py	
+++ b/bzrlib/fetch.py	
@@ -34,7 +34,7 @@
 import bzrlib
 import bzrlib.errors as errors
 from bzrlib.errors import (InstallFailed, NoSuchRevision,
-                           MissingText)
+                           CouldntPullFileVersion)
 from bzrlib.trace import mutter
 from bzrlib.progress import ProgressBar, ProgressPhase
 from bzrlib.reconcile import RepoReconciler
@@ -157,10 +157,12 @@
     def _fetch_weave_texts(self, revs):
         texts_pb = bzrlib.ui.ui_factory.nested_progress_bar()
         try:
-            file_ids = self.from_repository.fileid_involved_by_set(revs)
+            to_fetch = {}
+            for file_id, file_revision_id in self.from_repository.file_versions_involved(revs):
+                to_fetch.setdefault(file_id, []).append(file_revision_id)
             count = 0
-            num_file_ids = len(file_ids)
-            for file_id in file_ids:
+            num_file_ids = len(to_fetch)
+            for file_id, file_revisions in to_fetch.items():
                 texts_pb.update("fetch texts", count, num_file_ids)
                 count +=1
                 try:
@@ -176,6 +178,8 @@
                         None,
                         self.from_repository.get_transaction(),
                         self.to_repository.get_transaction())
+                    to_weave = self.to_weaves.get_weave(file_id,
+                        self.to_repository.get_transaction())
                 else:
                     # destination has contents, must merge
                     from_weave = self.from_weaves.get_weave(file_id,
@@ -183,6 +187,11 @@
                     # we fetch all the texts, because texts do
                     # not reference anything, and its cheap enough
                     to_weave.join(from_weave)
+                # now make sure it has everything we need
+                for file_revision_id in file_revisions:
+                    if not to_weave.has_version(file_revision_id):
+                        raise CouldntPullFileVersion(file_revision_id,
+                                                     file_id)
         finally:
             texts_pb.finished()
 

=== modified file 'a/bzrlib/repository.py'
--- a/bzrlib/repository.py	
+++ b/bzrlib/repository.py	
@@ -326,6 +326,7 @@
         #       or better yet, change _fileid_involved_by_set so
         #       that it takes the inventory weave, rather than
         #       pulling it out by itself.
+        #       mbp 20060321 -- which line?
         return self._fileid_involved_by_set(changes)
 
     def _fileid_involved_by_set(self, changes):
@@ -341,41 +342,52 @@
         to have a single line per file/directory, and to have
         fileid="" and revision="" on that line.
         """
+        r = set()
+        for file_id, file_revision_id in self.file_versions_involved(changes):
+            r.add(file_id)
+        return r
+
+    def file_versions_involved(self, revision_ids):
+        """Find which file versions are created by a group of of revisions.
+
+        :param revision_ids: A collection of revision ids to check.
+        :return: [(file_id, file_revision_id)] for each revision of the file 
+            newly introduced by one of the revisions.
+
+        As with the other *_involved methods, this may return less than the 
+        strictly minimal list of involved versions.
+        """
         assert isinstance(self._format, (RepositoryFormat5,
                                          RepositoryFormat6,
                                          RepositoryFormat7,
                                          RepositoryFormatKnit1)), \
             "fileid_involved only supported for branches which store inventory as unnested xml"
-
         w = self.get_inventory_weave()
-        file_ids = set()
-
+        file_versions = []
         # this code needs to read every line in every inventory for the
-        # inventories [changes]. Seeing a line twice is ok. Seeing a line
+        # inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
         # not pesent in one of those inventories is unnecessary and not 
         # harmful because we are filtering by the revision id marker in the
         # inventory lines to only select file ids altered in one of those  
         # revisions. We dont need to see all lines in the inventory because
         # only those added in an inventory in rev X can contain a revision=X
         # line.
-        for line in w.iter_lines_added_or_present_in_versions(changes):
+        for line in w.iter_lines_added_or_present_in_versions(revision_ids):
             start = line.find('file_id="')+9
             if start < 9: continue
             end = line.find('"', start)
             assert end>= 0
             file_id = _unescape_xml(line[start:end])
-
-            # check if file_id is already present
-            if file_id in file_ids: continue
-
             start = line.find('revision="')+10
             if start < 10: continue
             end = line.find('"', start)
             assert end>= 0
             revision_id = _unescape_xml(line[start:end])
-            if revision_id in changes:
-                file_ids.add(file_id)
-        return file_ids
+            # TODO: perhaps we can just parse this single line using the usual
+            # xml parser mechanism?  mbp 20060321
+            if revision_id in revision_ids:
+                file_versions.append((file_id, revision_id))
+        return file_versions
 
     @needs_read_lock
     def get_inventory_weave(self):

=== modified file 'a/bzrlib/tests/repository_implementations/test_fileid_involved.py'
--- a/bzrlib/tests/repository_implementations/test_fileid_involved.py	
+++ b/bzrlib/tests/repository_implementations/test_fileid_involved.py	
@@ -161,6 +161,12 @@
         l = self.branch.repository.fileid_involved_by_set(set(["rev-<D>"]))
         self.assertEquals( sorted(map( lambda x: x[0], l )), ["b"])
 
+    def test_file_versions_involved(self):
+        """Report which versions of a file are touched by a set of revisions"""
+
+        l = self.branch.repository.file_versions_involved(set(["rev-B"]))
+        self.assertEquals(l, [('a-file-id-2006-01-01-abcd', 'rev-B')])
+
     def test_fileid_involved_compare(self):
 
         l1 = self.branch.repository.fileid_involved_between_revs("rev-E", "rev-<D>")

=== modified file 'a/bzrlib/tests/test_fetch.py'
--- a/bzrlib/tests/test_fetch.py	
+++ b/bzrlib/tests/test_fetch.py	
@@ -1,4 +1,4 @@
-# Copyright (C) 2005 by Canonical Ltd
+# Copyright (C) 2005, 2006 by Canonical Ltd
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -18,10 +18,10 @@
 import sys
 
 from bzrlib.branch import Branch
-from bzrlib.bzrdir import BzrDir
+from bzrlib.bzrdir import BzrDir, BzrDirMetaFormat1
 from bzrlib.builtins import merge
 import bzrlib.errors
-from bzrlib.tests import TestCaseWithTransport
+from bzrlib.tests import TestCaseWithTransport, TestCaseInTempDir
 from bzrlib.tests.HTTPTestUtil import TestCaseWithWebserver
 from bzrlib.tests.test_revision import make_branches
 from bzrlib.trace import mutter
@@ -237,3 +237,64 @@
         self.assertEqual(1, self._count_log_matches('branch-format', http_logs[0:1]))
         self.assertEqual(1, self._count_log_matches('revision-history', http_logs[1:2]))
         self.assertEqual(2, len(http_logs))
+
+
+class TestFetchDetectsLostHistory(TestCaseInTempDir):
+    """Detect missing dependent objects at fetch time.
+    
+    The particular case is that a misbehaving http cache might serve old 
+    versions of some files referred to by a revision.  That should be 
+    trapped at fetch time so that it does not propagate.
+    """
+
+    # XXX: test could usefully be repeated for various different formats, but
+    # will require some amount of specialization.  Testing on only weaves
+    # should test the basic detection code.
+
+    def setUp(self):
+        super(TestFetchDetectsLostHistory, self).setUp()
+        os.mkdir('a')
+        bzrdir_a = BzrDirMetaFormat1().initialize('a')
+        repo_a = bzrdir_a.create_repository()
+        self.branch_a = bzrdir_a.create_branch()
+        wt_a = bzrdir_a.create_workingtree()
+        self.build_tree(['a/onefile', 'a/twofile'])
+        wt_a.add(['onefile'], ['onefile-id'])
+        wt_a.commit('initial commit', rev_id='rev-1')
+        wt_a.remove(['onefile'])
+        wt_a.add(['twofile'], ['twofile-id'])
+        wt_a.commit('second commit', rev_id='rev-2')
+        self._remove_file_history(repo_a, 'onefile-id')
+        self._build_destination_dir()
+
+    def _remove_file_history(self, repo, file_id):
+        """Remove versions for one file to simulate transport corruption"""
+        from bzrlib.weave import Weave
+        text_store = repo.text_store
+        self.assertTrue(text_store.has_id(file_id))
+        # replace with an empty weave, as if there was somehow an old copy
+        # left around
+        text_store._put_weave(file_id, Weave(), repo.get_transaction())
+
+    def _build_destination_dir(self):
+        os.mkdir('b')
+        self.bzrdir_b = BzrDirMetaFormat1().initialize('b')
+        self.repo_b = self.bzrdir_b.create_repository()
+        self.branch_b = self.bzrdir_b.create_branch()
+        self.wt_b = self.bzrdir_b.create_workingtree()
+
+    def test_10_missing_current_file_version(self):
+        """Try to pull a tree where we can't construct even the current working tree"""
+        # typically InstallFailed at the moment, but I don't know if we should
+        # specifically require this? -- mbp 20060321
+        self.assertRaises(bzrlib.errors.InstallFailed,
+                          self.wt_b.pull,
+                          self.branch_a, stop_revision='revid-1')
+
+    def test_20_missing_file_version(self):
+        """Try to pull changes where a file version is missing, 
+        but not from the most recent tree.
+        """
+        self.assertRaises(bzrlib.errors.CouldntPullFileVersion,
+                          self.wt_b.pull,
+                          self.branch_a)

=== modified file 'a/bzrlib/xml_serializer.py'
--- a/bzrlib/xml_serializer.py	
+++ b/bzrlib/xml_serializer.py	
@@ -71,3 +71,6 @@
 
     def _read_element(self, f):
         return ElementTree().parse(f)
+
+    def read_inventory_entry_from_string(self, xml_string):
+        return self._unpack_entry(self._read_element(xml_string))

-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 191 bytes
Desc: Digital signature
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20060321/e0591485/attachment.pgp