[patch] refactor check, all_revision_ids

Martin Pool mbp at canonical.com
Thu Jun 1 05:26:03 BST 2006


This is the diff of some refactorings Robert and I did to make 
it easier to introduce worm/blob/storage; basically to remove the 
assumption that it is ever a good idea to retrieve the revision 
graph of the whole repository.

 - New methods Branch.check and Repository.check.  

   This allows for it to be implemented differently depending on the type
   of branch (consider svn branches) or storage format, though it does not
   vary at the moment.

 - Check methods return Result objects that can print themselves, 
   rather than printing output directly.

 - Cleaner separation between the consistency constraints of a branch
   and of a repository.

 - Repository.all_revision_ids is deprecated

 - Remove obsolete fileid_involved and fileid_involved_between_revs.
   (Should be using fileids_altered_by_revision_ids).



=== modified file 'bzrlib/branch.py'
--- bzrlib/branch.py	
+++ bzrlib/branch.py	
@@ -537,6 +537,34 @@
         if parent:
             destination.set_parent(parent)
 
+    @needs_read_lock
+    def check(self):
+        """Check consistency of the branch.
+
+        In particular this checks that revisions given in the revision-history
+        do actually match up in the revision graph, and that they're all 
+        present in the repository.
+
+        :return: A BranchCheckResult.
+        """
+        mainline_parent_id = None
+        for revision_id in self.revision_history():
+            try:
+                revision = self.repository.get_revision(revision_id)
+            except errors.NoSuchRevision, e:
+                raise BzrCheckError("mainline revision {%s} not in repository"
+                        % revision_id)
+            # In general the first entry on the revision history has no parents.
+            # But it's not illegal for it to have parents listed; this can happen
+            # in imports from Arch when the parents weren't reachable.
+            if mainline_parent_id is not None:
+                if mainline_parent_id not in revision.parent_ids:
+                    raise BzrCheckError("previous revision {%s} not listed among "
+                                        "parents of {%s}"
+                                        % (mainline_parent_id, revision_id))
+            mainline_parent_id = revision_id
+        return BranchCheckResult(self)
+
 
 class BranchFormat(object):
     """An encapsulation of the initialization and open routines for a format.
@@ -1334,6 +1362,26 @@
         return result
 
 
+class BranchCheckResult(object):
+    """Results of checking branch consistency.
+
+    :see: Branch.check
+    """
+
+    def __init__(self, branch):
+        self.branch = branch
+
+    def report_results(self, verbose):
+        """Report the check results via trace.note.
+        
+        :param verbose: Requests more detailed display of what was checked,
+            if any.
+        """
+        note('checked branch %s format %s',
+             self.branch.base,
+             self.branch._format)
+
+
 ######################################################################
 # predicates
 

=== modified file 'bzrlib/check.py'
--- bzrlib/check.py	
+++ bzrlib/check.py	
@@ -32,22 +32,19 @@
 # raising them.  If there's more than one exception it'd be good to see them
 # all.
 
+from bzrlib.errors import BzrCheckError, NoSuchRevision
+from bzrlib.symbol_versioning import *
+from bzrlib.trace import mutter, note, warning
 import bzrlib.ui
-from bzrlib.trace import note, warning
-from bzrlib.osutils import rename, sha_string, fingerprint_file
-from bzrlib.trace import mutter
-from bzrlib.errors import BzrCheckError, NoSuchRevision
-from bzrlib.inventory import ROOT_ID
 
 
 class Check(object):
-    """Check a branch"""
+    """Check a repository"""
 
     # The Check object interacts with InventoryEntry.check, etc.
 
-    def __init__(self, branch):
-        self.branch = branch
-        self.repository = branch.repository
+    def __init__(self, repository):
+        self.repository = repository
         self.checked_text_cnt = 0
         self.checked_rev_cnt = 0
         self.ghosts = []
@@ -60,17 +57,13 @@
         self.checked_weaves = {}
 
     def check(self):
-        self.branch.lock_read()
+        self.repository.lock_read()
         self.progress = bzrlib.ui.ui_factory.nested_progress_bar()
         try:
             self.progress.update('retrieving inventory', 0, 0)
             # do not put in init, as it should be done with progess,
             # and inside the lock.
-            self.inventory_weave = self.branch.repository.get_inventory_weave()
-            self.history = self.branch.revision_history()
-            if not len(self.history):
-                # nothing to see here
-                return
+            self.inventory_weave = self.repository.get_inventory_weave()
             self.plan_revisions()
             revno = 0
             self.check_weaves()
@@ -82,11 +75,11 @@
                 self.check_one_rev(rev_id)
         finally:
             self.progress.finished()
-            self.branch.unlock()
+            self.repository.unlock()
 
     def plan_revisions(self):
-        repository = self.branch.repository
-        self.planned_revisions = set(repository.all_revision_ids())
+        repository = self.repository
+        self.planned_revisions = set(repository._all_revision_ids())
         self.progress.clear()
         inventoried = set(self.inventory_weave.versions())
         awol = self.planned_revisions - inventoried
@@ -96,10 +89,9 @@
         self.planned_revisions = list(self.planned_revisions)
 
     def report_results(self, verbose):
-        note('checked branch %s format %s',
-             self.branch.base, 
-             self.branch._format)
-
+        note('checked repository %s format %s',
+             self.repository.bzrdir.root_transport,
+             self.repository._format)
         note('%6d revisions', self.checked_rev_cnt)
         note('%6d unique file texts', self.checked_text_cnt)
         note('%6d repeated file texts', self.repeated_text_cnt)
@@ -116,7 +108,7 @@
                 for ghost in self.ghosts:
                     note('      %s', ghost)
         if len(self.missing_parent_links):
-            note('%6d revisions missing parents in ancestry', 
+            note('%6d revisions missing parents in ancestry',
                  len(self.missing_parent_links))
             if verbose:
                 for link, linkers in self.missing_parent_links.items():
@@ -128,60 +120,31 @@
         """Check one revision.
 
         rev_id - the one to check
-
-        last_rev_id - the previous one on the mainline, if any.
         """
-
-        # mutter('    revision {%s}', rev_id)
-        branch = self.branch
-        try:
-            rev_history_position = self.history.index(rev_id)
-        except ValueError:
-            rev_history_position = None
-        last_rev_id = None
-        if rev_history_position:
-            rev = branch.repository.get_revision(rev_id)
-            if rev_history_position > 0:
-                last_rev_id = self.history[rev_history_position - 1]
-        else:
-            rev = branch.repository.get_revision(rev_id)
+        rev = self.repository.get_revision(rev_id)
                 
         if rev.revision_id != rev_id:
             raise BzrCheckError('wrong internal revision id in revision {%s}'
                                 % rev_id)
 
-        # check the previous history entry is a parent of this entry
-        if rev.parent_ids:
-            if last_rev_id is not None:
-                for parent_id in rev.parent_ids:
-                    if parent_id == last_rev_id:
-                        break
+        for parent in rev.parent_ids:
+            if not parent in self.planned_revisions:
+                missing_links = self.missing_parent_links.get(parent, [])
+                missing_links.append(rev_id)
+                self.missing_parent_links[parent] = missing_links
+                # list based so somewhat slow,
+                # TODO have a planned_revisions list and set.
+                if self.repository.has_revision(parent):
+                    missing_ancestry = self.repository.get_ancestry(parent)
+                    for missing in missing_ancestry:
+                        if (missing is not None 
+                            and missing not in self.planned_revisions):
+                            self.planned_revisions.append(missing)
                 else:
-                    raise BzrCheckError("previous revision {%s} not listed among "
-                                        "parents of {%s}"
-                                        % (last_rev_id, rev_id))
-            for parent in rev.parent_ids:
-                if not parent in self.planned_revisions:
-                    missing_links = self.missing_parent_links.get(parent, [])
-                    missing_links.append(rev_id)
-                    self.missing_parent_links[parent] = missing_links
-                    # list based so somewhat slow,
-                    # TODO have a planned_revisions list and set.
-                    if self.branch.repository.has_revision(parent):
-                        missing_ancestry = self.repository.get_ancestry(parent)
-                        for missing in missing_ancestry:
-                            if (missing is not None 
-                                and missing not in self.planned_revisions):
-                                self.planned_revisions.append(missing)
-                    else:
-                        self.ghosts.append(rev_id)
-        elif last_rev_id:
-            raise BzrCheckError("revision {%s} has no parents listed "
-                                "but preceded by {%s}"
-                                % (rev_id, last_rev_id))
+                    self.ghosts.append(rev_id)
 
         if rev.inventory_sha1:
-            inv_sha1 = branch.repository.get_inventory_sha1(rev_id)
+            inv_sha1 = self.repository.get_inventory_sha1(rev_id)
             if inv_sha1 != rev.inventory_sha1:
                 raise BzrCheckError('Inventory sha1 hash doesn\'t match'
                     ' value in revision {%s}' % rev_id)
@@ -196,21 +159,21 @@
         """
         n_weaves = 1
         weave_ids = []
-        if self.branch.repository.weave_store.listable():
-            weave_ids = list(self.branch.repository.weave_store)
+        if self.repository.weave_store.listable():
+            weave_ids = list(self.repository.weave_store)
             n_weaves = len(weave_ids)
         self.progress.update('checking weave', 0, n_weaves)
         self.inventory_weave.check(progress_bar=self.progress)
         for i, weave_id in enumerate(weave_ids):
             self.progress.update('checking weave', i, n_weaves)
-            w = self.branch.repository.weave_store.get_weave(weave_id,
-                    self.branch.repository.get_transaction())
+            w = self.repository.weave_store.get_weave(weave_id,
+                    self.repository.get_transaction())
             # No progress here, because it looks ugly.
             w.check()
             self.checked_weaves[weave_id] = True
 
     def _check_revision_tree(self, rev_id):
-        tree = self.branch.repository.revision_tree(rev_id)
+        tree = self.repository.revision_tree(rev_id)
         inv = tree.inventory
         seen_ids = {}
         for file_id in inv:
@@ -232,7 +195,17 @@
 
 
 def check(branch, verbose):
-    """Run consistency checks on a branch."""
-    checker = Check(branch)
-    checker.check()
-    checker.report_results(verbose)
+    """Run consistency checks on a branch.
+    
+    Results are reported through logging.
+    
+    :raise BzrCheckError: if there's a consistency error.
+    """
+    branch.lock_read()
+    try:
+        branch_result = branch.check()
+        repo_result = branch.repository.check([branch.last_revision()])
+    finally:
+        branch.unlock()
+    branch_result.report_results(verbose)
+    repo_result.report_results(verbose)

=== modified file 'bzrlib/repository.py'
--- bzrlib/repository.py	
+++ bzrlib/repository.py	
@@ -103,8 +103,27 @@
         """Return all the possible revisions that we could find."""
         return self.get_inventory_weave().versions()
 
-    @needs_read_lock
+    @deprecated_method(zero_nine)
     def all_revision_ids(self):
+        """Returns a list of all the revision ids in the repository. 
+
+        This is deprecated because code should generally work on the graph
+        reachable from a particular revision, and ignore any other revisions
+        that might be present.  There is no direct replacement method.
+
+        Possible arguments for having this function include upgrade, gc, 
+        statistics gathering and cloning of the whole repository.  On the 
+        other hand we can implement cloning, upgrade and gc by using the 
+        contained branches to see what data can be reached.
+
+        Allowing iteration of all revisions is hard to implement well on 
+        transports (like http) that don't allow listing directories, and 
+        anyhow scales poorly.
+        """
+        return self._all_revision_ids()
+
+    @needs_read_lock
+    def _all_revision_ids(self):
         """Returns a list of all the revision ids in the repository. 
 
         These are in as much topological order as the underlying store can 
@@ -322,7 +341,8 @@
                                          RepositoryFormat6,
                                          RepositoryFormat7,
                                          RepositoryFormatKnit1)), \
-            "fileid_involved only supported for branches which store inventory as unnested xml"
+            ("fileids_altered_by_revision_ids only supported for branches " 
+             "which store inventory as unnested xml, not on %r" % self)
         selected_revision_ids = set(revision_ids)
         w = self.get_inventory_weave()
         result = {}
@@ -413,7 +433,7 @@
         """
         result = Graph()
         if not revision_ids:
-            pending = set(self.all_revision_ids())
+            pending = set(self._all_revision_ids())
             required = set([])
         else:
             pending = set(revision_ids)
@@ -485,6 +505,10 @@
     @needs_read_lock
     def get_ancestry(self, revision_id):
         """Return a list of revision-ids integrated by a revision.
+
+        The first element of the list is always None, indicating the origin 
+        revision.  This might change when we have history horizons, or 
+        perhaps we should have a new API.
         
         This is topologically sorted.
         """
@@ -560,6 +584,28 @@
         return self._revision_store.get_signature_text(revision_id,
                                                        self.get_transaction())
 
+    @needs_read_lock
+    def check(self, revision_ids):
+        """Check consistency of all history of given revision_ids.
+
+        Different repository implementations should override _check().
+
+        :param revision_ids: A non-empty list of revision_ids whose ancestry
+             will be checked.  Typically the last revision_id of a branch.
+        """
+        if not revision_ids:
+            raise ValueError("revision_ids must be non-empty in %s.check" 
+                    % (self,))
+        return self._check(revision_ids)
+
+    def _check(self, revision_ids):
+        # Different repository implementations might want to override this to
+        # check in a way that's efficient for their storage and that catches
+        # typical types of error.
+        result = bzrlib.check.Check(self)
+        result.check()
+        return result
+
 
 class AllInOneRepository(Repository):
     """Legacy support - the repository behaviour for all-in-one branches."""
@@ -662,34 +708,11 @@
     """Knit format repository."""
 
     @needs_read_lock
-    def all_revision_ids(self):
+    def _all_revision_ids(self):
         """See Repository.all_revision_ids()."""
+        # Knits get the revision graph from the index of the revision knit, so
+        # it's always possible even if they're on an unlistable transport.
         return self._revision_store.all_revision_ids(self.get_transaction())
-
-    def fileid_involved_between_revs(self, from_revid, to_revid):
-        """Find file_id(s) which are involved in the changes between revisions.
-
-        This determines the set of revisions which are involved, and then
-        finds all file ids affected by those revisions.
-        """
-        vf = self._get_revision_vf()
-        from_set = set(vf.get_ancestry(from_revid))
-        to_set = set(vf.get_ancestry(to_revid))
-        changed = to_set.difference(from_set)
-        return self._fileid_involved_by_set(changed)
-
-    def fileid_involved(self, last_revid=None):
-        """Find all file_ids modified in the ancestry of last_revid.
-
-        :param last_revid: If None, last_revision() will be used.
-        """
-        if not last_revid:
-            changed = set(self.all_revision_ids())
-        else:
-            changed = set(self.get_ancestry(last_revid))
-        if None in changed:
-            changed.remove(None)
-        return self._fileid_involved_by_set(changed)
 
     @needs_read_lock
     def get_ancestry(self, revision_id):
@@ -745,7 +768,7 @@
         vf = self._get_revision_vf()
         versions = set(vf.versions())
         if not revision_ids:
-            pending = set(self.all_revision_ids())
+            pending = set(self._all_revision_ids())
             required = set([])
         else:
             pending = set(revision_ids)

=== modified file 'bzrlib/sign_my_commits.py'
--- bzrlib/sign_my_commits.py	
+++ bzrlib/sign_my_commits.py	
@@ -13,6 +13,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
 """Command which looks for unsigned commits by the current user, and signs them.
 """
 
@@ -31,6 +32,9 @@
 
     This does not sign commits that already have signatures.
     """
+    # Note that this signs everything on the branch's ancestry
+    # (both mainline and merged), but not other revisions that may be in the
+    # repository
 
     takes_options = [Option('dry-run'
                             , help='Don\'t actually sign anything, just print'
@@ -40,17 +44,13 @@
 
     def run(self, location=None, committer=None, dry_run=False):
         if location is None:
-            from bzrlib.workingtree import WorkingTree
-            # Open the containing directory
-            wt = WorkingTree.open_containing('.')[0]
-            b = wt.branch
+            bzrdir = bzrlib.bzrdir.BzrDir.open_containing('.')[0]
         else:
             # Passed in locations should be exact
-            from bzrlib.branch import Branch
-            b = Branch.open(location)
-        repo = getattr(b, 'repository', b)
-
-        config = bzrlib.config.BranchConfig(b)
+            bzrdir = bzrlib.bzrdir.BzrDir.open(location)
+        branch = bzrdir.open_branch()
+        repo = branch.repository
+        config = bzrlib.config.BranchConfig(branch)
 
         if committer is None:
             committer = config.username()
@@ -58,8 +58,7 @@
         gpg_strategy = bzrlib.gpg.GPGStrategy(config)
 
         count = 0
-        # return in partial topological order for the sake of reproducibility
-        for rev_id in repo.all_revision_ids():
+        for rev_id in repo.get_ancestry(branch.last_revision())[1:]:
             if repo.has_signature_for_revision_id(rev_id):
                 continue
             

=== modified file 'bzrlib/symbol_versioning.py'
--- bzrlib/symbol_versioning.py	
+++ bzrlib/symbol_versioning.py	
@@ -26,6 +26,7 @@
            'deprecated_passed',
            'warn', 'set_warning_method', 'zero_seven',
            'zero_eight',
+           'zero_nine',
            ]
 
 from warnings import warn
@@ -34,6 +35,7 @@
 DEPRECATED_PARAMETER = "A deprecated parameter marker."
 zero_seven = "%s was deprecated in version 0.7."
 zero_eight = "%s was deprecated in version 0.8."
+zero_nine = "%s was deprecated in version 0.9."
 
 
 def set_warning_method(method):

=== modified file 'bzrlib/tests/branch_implementations/test_branch.py'
--- bzrlib/tests/branch_implementations/test_branch.py	
+++ bzrlib/tests/branch_implementations/test_branch.py	
@@ -315,6 +315,14 @@
         tree = self.make_branch_and_tree('tree')
         text = tree.branch._format.get_format_description()
         self.failUnless(len(text))
+
+    def test_check_branch_report_results(self):
+        """Checking a branch produces results which can be printed"""
+        branch = self.make_branch('.')
+        result = branch.check()
+        # reports results through logging
+        result.report_results(verbose=True)
+        result.report_results(verbose=False)
 
 
 class ChrootedTests(TestCaseWithBranch):

=== modified file 'bzrlib/tests/repository_implementations/test_repository.py'
--- bzrlib/tests/repository_implementations/test_repository.py	
+++ bzrlib/tests/repository_implementations/test_repository.py	
@@ -1,4 +1,4 @@
-# (C) 2005, 2006 Canonical Ltd
+# Copyright (C) 2005, 2006 Canonical Ltd
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -270,6 +270,17 @@
         self.assertMessageRoundtrips(
             "All 8-bit chars: " +  ''.join([unichr(x) for x in range(256)]))
 
+    def test_check_repository(self):
+        """Check a fairly simple repository's history"""
+        tree = self.make_branch_and_tree('.')
+        tree.commit('initial empty commit', rev_id='a-rev',
+                    allow_pointless=True)
+        result = tree.branch.repository.check(['a-rev'])
+        # writes to log; should accept both verbose or non-verbose
+        result.report_results(verbose=True)
+        result.report_results(verbose=False)
+
+
 class TestCaseWithComplexRepository(TestCaseWithRepository):
 
     def setUp(self):
@@ -279,7 +290,7 @@
         # add a corrupt inventory 'orphan'
         # this may need some generalising for knits.
         inv_file = tree_a.branch.repository.control_weaves.get_weave(
-            'inventory', 
+            'inventory',
             tree_a.branch.repository.get_transaction())
         inv_file.add_lines('orphan', [], [])
         # add a real revision 'rev1'
@@ -293,11 +304,6 @@
         tree_a.add_pending_merge('ghost1')
         tree_a.add_pending_merge('ghost2')
         tree_a.commit('rev4', rev_id='rev4', allow_pointless=True)
-
-    def test_all_revision_ids(self):
-        # all_revision_ids -> all revisions
-        self.assertEqual(['rev1', 'rev2', 'rev3', 'rev4'],
-                         self.bzrdir.open_repository().all_revision_ids())
 
     def test_get_ancestry_missing_revision(self):
         # get_ancestry(revision that is in some data but not fully installed



-- 
Martin




More information about the bazaar mailing list