[MERGE] Reconcile can fix bad parent references

Mon Aug 27 01:17:12 BST 2007

bb:resubmit - details below.

In general this is great, its good work and mostly hangs together really
well. There are a few things, some minor, and some slightly larger
things - all laid out below. I think I've been clear about what needs to
be done vs what is just a preference - but consider it all
negotiable :).

The NEWS entry could be a little more informative. Perhaps add in that
these were caused back in bzr 0.8 timeframes and any branches that were
created back then should run 'bzr check' asap.

On Sun, 2007-08-26 at 18:58 -0400, Aaron Bentley wrote:
> 
> === added file 'bzrlib/tests/repository_implementations/test_check.py'
> ---
> bzrlib/tests/repository_implementations/test_check.py       1970-01-01
> 00:00:00 +0000
> +++
> bzrlib/tests/repository_implementations/test_check.py       2007-08-24
> 17:54:22 +0000
...

This idiom:
> +        repo = self.make_repository('.')
> +
> +        # make rev1a: A well-formed revision, containing 'file1'
> +        inv = inventory.Inventory(revision_id='rev1a')
> +        inv.root.revision = 'rev1a'
> +        self.add_file(repo, inv, 'file1', 'rev1a', [])
> +        repo.add_inventory('rev1a', inv, [])
> +        revision = _mod_revision.Revision('rev1a',
> +            committer='jrandom at example.com', timestamp=0,
> inventory_sha1='',
> +            timezone=0, message='foo', parent_ids=[])
> +        repo.add_revision('rev1a',revision, inv)

Won't work with packs, because there is no write group. If you insert
repo.lock_write()
repo.start_write_group()
...
repo.commit_write_group()
repo.unlock()

around your manual data insertion it will work fine.

> === modified file 'bzrlib/check.py'
> --- bzrlib/check.py     2007-08-22 22:32:23 +0000
> +++ bzrlib/check.py     2007-08-26 22:42:53 +0000
> @@ -32,6 +32,7 @@
>  # raising them.  If there's more than one exception it'd be good to
> see them
>  # all.
>  
> +from bzrlib import repository
>  from bzrlib.errors import BzrCheckError
>  import bzrlib.ui
>  from bzrlib.trace import note
> @@ -53,6 +54,8 @@
>          # maps (file-id, version) -> sha1; used by
> InventoryFile._check
>          self.checked_texts = {}
>          self.checked_weaves = {}
> +        self.revision_versions = {}
> +        self.unreferenced_ancestors = set()
>  
>      def check(self):
>          self.repository.lock_read()
> @@ -64,36 +67,37 @@
>              self.inventory_weave =
> self.repository.get_inventory_weave()
>              self.plan_revisions()
>              revno = 0
> -            self.check_weaves()
>              while revno < len(self.planned_revisions):
>                  rev_id = self.planned_revisions[revno]
>                  self.progress.update('checking revision', revno,
>                                       len(self.planned_revisions))
>                  revno += 1
>                  self.check_one_rev(rev_id)

A note here that the revisions are checked first so that XXXX would be
good. I'm guessing that the reason is so that the data in the revision
index is trustable.

> +            self.check_weaves()
>          finally:
>              self.progress.finished()
>              self.repository.unlock()

>      def plan_revisions(self):
>          repository = self.repository
> -        self.planned_revisions = set(repository.all_revision_ids())
> +        self.planned_revisions = repository.all_revision_ids()
>          self.progress.clear()
>          inventoried = set(self.inventory_weave.versions())
> -        awol = self.planned_revisions - inventoried
> +        awol = set(self.planned_revisions) - inventoried
>          if len(awol) > 0:
>              raise BzrCheckError('Stored revisions missing from
> inventory'
>                  '{%s}' % ','.join([f for f in awol]))
> -        self.planned_revisions = list(self.planned_revisions)

What do the changes to plan_revisions accomplish? all_revision_ids() is
not in any guaranteed topological order, so I don't see why changing it
from a set is beneficial.

> === modified file 'bzrlib/reconcile.py'
> --- bzrlib/reconcile.py 2006-10-05 05:37:25 +0000
> +++ bzrlib/reconcile.py 2007-08-26 22:29:28 +0000
> @@ -20,9 +20,13 @@
>  __all__ = ['reconcile', 'Reconciler', 'RepoReconciler',
> 'KnitReconciler']
>  
>  
> -from bzrlib import ui
> +from bzrlib import (
> +    graph,
> +    ui,
> +    repository,
> +    )
>  from bzrlib.trace import mutter
> -from bzrlib.tsort import TopoSorter
> +from bzrlib.tsort import TopoSorter, topo_sort
>  
>  
>  def reconcile(dir, other=None):
> @@ -276,8 +280,8 @@
>          """Perform the steps to reconcile this repository."""
>          if self.thorough:
>              self._load_indexes()
> -            # knits never suffer this
>              self._gc_inventory()
> +            self._fix_text_parents()
>  
>      def _load_indexes(self):
>          """Load indexes for the reconciliation."""
> @@ -337,3 +341,65 @@
>          self.garbage_inventories = len(garbage)
>          for revision_id in garbage:
>              mutter('Garbage inventory {%s} found.', revision_id)
> +
> +    def _fix_text_parents(self):
> +        """Fix bad versionedfile parent entries.
> +
> +        It is possible for the parents entrie in a versionedfile
> entry to be

Typo - 'entry'

> +        inconsistent with the values in the revision and inventory.
> +
> +        This method finds entries with such inconsistencies, corrects
> their
> +        parent lists, and replaces the versionedfile with a corrected
> version.
> +        """
> +        transaction = self.repo.get_transaction()
> +        revision_parents =
> repository._RevisionParentsProvider(self.repo)
> +        revision_graph = graph.Graph(revision_parents)
> +        revision_versions = {}
> +        for num, file_id in enumerate(self.repo.weave_store):
> +            self.pb.update('Fixing text parents', num,
> +                           len(self.repo.weave_store))
> +            vf = self.repo.weave_store.get_weave(file_id,
> transaction)
> +            bad_ancestors = self.repo.find_bad_ancestors(
> +                self.revisions.versions(), file_id, vf,
> revision_versions,
> +                revision_parents)
> +            if len(bad_ancestors) == 0:
> +                continue
> +            new_vf = self.repo.weave_store.get_empty('temp:%s' %
> file_id,
> +                self.transaction)
> +            new_parents = {}
> +            for version in vf.versions():
> +                parents = vf.get_parents(version)
> +                for parent_id in parents:
> +                    if (parent_id in bad_ancestors and
> +                        version in bad_ancestors[parent_id]):
> +                        parents = self._find_correct_parents(version,
> +                            file_id, revision_versions,
> revision_graph)
> +                        break
> +                new_parents[version] = parents
> +            for version in topo_sort(new_parents.items()):
> +                new_vf.add_lines(version, new_parents[version],
> +                                 vf.get_lines(version))
> +            self.repo.weave_store.copy(new_vf, file_id,
> self.transaction)
> +            self.repo.weave_store.delete('temp:%s' % file_id,
> self.transaction)
> +
> +    def _find_correct_parents(self, revision_id, file_id,
> revision_versions,
> +                              revision_graph):
> +        parents = []
> +        rev_parents = revision_graph.get_parents([revision_id])[0]
> +        if rev_parents is None:
> +            return []
> +        for parent_id in rev_parents:
> +            try:
> +                parent_id = revision_versions[parent_id][file_id]
> +            except KeyError:
> +                continue
> +            if parent_id not in parents:
> +                parents.append(parent_id)
> +        non_heads = set()
> +        for num, parent in enumerate(parents):
> +            for other_parent in parents[num+1:]:
> +                if revision_graph.is_ancestor(parent, other_parent):
> +                    non_heads.add(parent)
> +                if revision_graph.is_ancestor(other_parent, parent):
> +                    non_heads.add(other_parent)

This uses the nodes-of-file graphs are nodes-of-revision graph property
that John and I are debating whether we can use - because the edges are
different. Until we decide that that will give identical results we
should stay with the per file graph. Rather than asking you to rework
this to use the per-file graph, I'll see if I can sort out a proof of
heads equivalence across the two graphs today. Until then this isn't
safe to merge. I had previously suggested to Ian that he factor the same
logic out of inventory.py into a method on repository.

> +        return [p for p in parents if p not in non_heads]

> === modified file 'bzrlib/repository.py'
> --- bzrlib/repository.py        2007-08-22 05:28:32 +0000
> +++ bzrlib/repository.py        2007-08-24 18:15:50 +0000
> @@ -1112,6 +1112,66 @@
>                  [parents_provider,
> other_repository._make_parents_provider()])
>          return graph.Graph(parents_provider)
>  
> +    def _add_revision_text_version(self, tree, revision_versions):
> +        inv_revisions = {}
> +        revision_versions[tree.get_revision_id()] = inv_revisions
> +        for path, entry in tree.iter_entries_by_dir():
> +            inv_revisions[entry.file_id] = entry.revision
> +        return inv_revisions

This doesn't access self at all: What made repository seem like the best
place to put it ? This is what, a cache of revisions -> text version
mappings ? I'd be inclined to either have a standalone function, or an
object to represent the cache if you have more than 1 function related
to this data structure. Looking at find_bad_ancestors there the cache is
used and added to as well, so I would really prefer to see a class for
the cache, probably constructed with the repository as a data source:

FooCache:
    init(repository)
    cache(tree)
    get_text_version(file_id, revision_id)

This gives a home for the inner method in find_bad_ancestors

> +    def find_bad_ancestors(self, revision_ids, file_id,
> versionedfile,
> +                          revision_versions, parents_provider=None):
> +        """Search the versionedfile for ancestors that are not 

Just speculating here:
likewise find_bad_ancestors doesn't seem to fit the general Repository
API to me. Specifically, for packs we won't want to do it by versioned
file anyway, rather by revision - check all the texts introduced by
revision id X for their parent details. The very core of it looks like
something that I'll need to factor out - but I'm happy to do the changes
to find_bad_ancestors.

Is :

> +class _RevisionParentsProvider(object):

Really needed? If we check the revision is cached correctly at the start
of check, then we don't need a different lookup mechanism for checking
the texts, and that means we save a bunch of code.

If we do need it:

> +   """A parents provider that uses a repositoy's revision objects.

Typo                                      ^

> === modified file
> 'bzrlib/tests/repository_implementations/__init__.py'
> --- bzrlib/tests/repository_implementations/__init__.py 2007-08-17
> 05:16:14 +0000
> +++ bzrlib/tests/repository_implementations/__init__.py 2007-08-24
> 12:45:17 +0000
> @@ -99,6 +99,7 @@
>      result = TestSuite()
>      test_repository_implementations = [
>          'bzrlib.tests.repository_implementations.test_break_lock',
> +        'bzrlib.tests.repository_implementations.test_check',
> 
> 'bzrlib.tests.repository_implementations.test_commit_builder',
>          'bzrlib.tests.repository_implementations.test_fetch',
> 
> 'bzrlib.tests.repository_implementations.test_fileid_involved',
> 
> === modified file
> 'bzrlib/tests/repository_implementations/test_reconcile.py'
> ---
> bzrlib/tests/repository_implementations/test_reconcile.py   2007-08-22
> 05:28:32 +0000
> +++
> bzrlib/tests/repository_implementations/test_reconcile.py   2007-08-26
> 22:29:28 +0000
> @@ -19,9 +19,10 @@
>  
>  import bzrlib
>  import bzrlib.errors as errors
> -from bzrlib.inventory import Inventory
> +from bzrlib.inventory import Inventory, InventoryFile
>  from bzrlib.reconcile import reconcile, Reconciler
>  from bzrlib.revision import Revision
> +from bzrlib.repofmt.knitrepo import KnitRepository

> +    def test_reconcile_text_parents(self):
> +        repo = self.make_broken_repository()
> +        if not isinstance(repo, KnitRepository):
> +            raise TestSkipped("Format does not support text parent"
> +                              " reconciliation")

I think its better to check for an attribute like I did in my recent
reconcile patch - _reconcile_does_inventory_gc is what I used. This is
better than a subclass check because it allows foreign formats to be
tested without requiring them to spuriously subclass Knits.

In general, in implementation tests, whenever we see an import of a
specific format, alarm bells should go off.

-Rob
-- 
GPG key available at: <http://www.robertcollins.net/keys.txt>.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: This is a digitally signed message part
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20070827/61cf1489/attachment-0001.pgp