Rev 2996: Change check and reconcile to use the new _generate_text_key_index rather in http://people.ubuntu.com/~robertc/baz2.0/reconcile

Robert Collins robertc at robertcollins.net
Fri Nov 16 01:31:16 GMT 2007


At http://people.ubuntu.com/~robertc/baz2.0/reconcile

------------------------------------------------------------
revno: 2996
revision-id:robertc at robertcollins.net-20071116012859-909bq5as99bj7hx2
parent: robertc at robertcollins.net-20071115231348-0vjmr1e8a1x3rx1p
committer: Robert Collins <robertc at robertcollins.net>
branch nick: reconcile.memory
timestamp: Fri 2007-11-16 12:28:59 +1100
message:
  Change check and reconcile to use the new _generate_text_key_index rather
  than a _RevisionTextVersionCache in calculating the correct per-file
  parent information. This exposed some test errors which got changed, and
  reporting some aspects of inventory-text mismatches become harder to
  report (but easier to calculate for when we do start correcting it).
modified:
  bzrlib/check.py                check.py-20050309040759-f3a679400c06bcc1
  bzrlib/reconcile.py            reweave_inventory.py-20051108164726-1e5e0934febac06e
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
  bzrlib/tests/repository_implementations/__init__.py __init__.py-20060131092037-9564957a7d4a841b
  bzrlib/tests/repository_implementations/test_check.py test_check.py-20070824124512-38g4d135gcqxo4zb-1
=== modified file 'bzrlib/check.py'
--- a/bzrlib/check.py	2007-11-15 22:38:17 +0000
+++ b/bzrlib/check.py	2007-11-16 01:28:59 +0000
@@ -56,11 +56,8 @@
         # maps (file-id, version) -> sha1; used by InventoryFile._check
         self.checked_texts = {}
         self.checked_weaves = {}
-        self.revision_versions = _mod_repository._RevisionTextVersionCache(
-            self.repository)
-        self.unreferenced_ancestors = set()
+        self.unreferenced_versions = set()
         self.inconsistent_parents = []
-        self.dangling_versions = set()
 
     def check(self):
         self.repository.lock_read()
@@ -81,7 +78,7 @@
                 revno += 1
                 self.check_one_rev(rev_id)
             # check_weaves is done after the revision scan so that
-            # revision_versions is pre-populated
+            # revision index is known to be valid.
             self.check_weaves()
         finally:
             self.progress.finished()
@@ -113,8 +110,8 @@
         note('%6d file-ids', len(self.checked_weaves))
         note('%6d unique file texts', self.checked_text_cnt)
         note('%6d repeated file texts', self.repeated_text_cnt)
-        note('%6d unreferenced text ancestors',
-             len(self.unreferenced_ancestors))
+        note('%6d unreferenced text versions',
+             len(self.unreferenced_versions))
         if self.missing_inventory_sha_cnt:
             note('%6d revisions are missing inventory_sha1',
                  self.missing_inventory_sha_cnt)
@@ -135,8 +132,8 @@
                     for linker in linkers:
                         note('       * %s', linker)
             if verbose:
-                for file_id, revision_id in self.unreferenced_ancestors:
-                    log_error('unreferenced ancestor: {%s} in %s', revision_id,
+                for file_id, revision_id in self.unreferenced_versions:
+                    log_error('unreferenced version: {%s} in %s', revision_id,
                         file_id)
         if len(self.inconsistent_parents):
             note('%6d inconsistent parents', len(self.inconsistent_parents))
@@ -157,9 +154,6 @@
                         '       %s has wrong parents in index: '
                         '%r should be %r',
                         revision_id, index_parents, actual_parents)
-        if self.dangling_versions:
-            note('%6d file versions are not referenced by their inventory',
-                 len(self.dangling_versions))
 
     def check_one_rev(self, rev_id):
         """Check one revision.
@@ -216,24 +210,18 @@
             # No progress here, because it looks ugly.
             w.check()
             result = weave_checker.check_file_version_parents(w, weave_id,
-                self.planned_revisions, self.revision_versions)
-            bad_parents, dangling_versions = result
+                self.planned_revisions)
+            bad_parents, unused_versions = result
             bad_parents = bad_parents.items()
-            for revision_id, (weave_parents,correct_parents) in bad_parents:
+            for revision_id, (weave_parents, correct_parents) in bad_parents:
                 self.inconsistent_parents.append(
                     (revision_id, weave_id, weave_parents, correct_parents))
-                if weave_parents is None:
-                    weave_parents = []
-                unreferenced_parents = set(weave_parents)-set(correct_parents)
-                for unreferenced_parent in unreferenced_parents:
-                    self.unreferenced_ancestors.add(
-                        (weave_id, unreferenced_parent))
-            self.dangling_versions.update(dangling_versions)
+            for revision_id in unused_versions:
+                self.unreferenced_versions.add((weave_id, revision_id))
             self.checked_weaves[weave_id] = True
 
     def _check_revision_tree(self, rev_id):
         tree = self.repository.revision_tree(rev_id)
-        self.revision_versions.add_revision_text_versions(tree)
         inv = tree.inventory
         seen_ids = {}
         for file_id in inv:

=== modified file 'bzrlib/reconcile.py'
--- a/bzrlib/reconcile.py	2007-11-15 22:38:17 +0000
+++ b/bzrlib/reconcile.py	2007-11-16 01:28:59 +0000
@@ -376,45 +376,24 @@
         parent lists, and replaces the versionedfile with a corrected version.
         """
         transaction = self.repo.get_transaction()
-        revision_versions = repository._RevisionTextVersionCache(self.repo)
         versions = self.revisions.versions()
         mutter('Prepopulating revision text cache with %d revisions',
                 len(versions))
-        revision_versions.prepopulate_revs(versions)
-        used_file_versions = revision_versions.used_file_versions()
         vf_checker = self.repo.get_versioned_file_checker()
-        for num, file_id in enumerate(self.repo.weave_store):
+        # List all weaves before altering, to avoid race conditions when we
+        # delete unused weaves.
+        weaves = list(enumerate(self.repo.weave_store))
+        for num, file_id in weaves:
             self.pb.update('Fixing text parents', num,
                            len(self.repo.weave_store))
             vf = self.repo.weave_store.get_weave(file_id, transaction)
-            versions_with_bad_parents, dangling_file_versions = \
+            versions_with_bad_parents, unused_versions = \
                 vf_checker.check_file_version_parents(vf, file_id,
-                vf.versions(), revision_versions)
+                vf.versions())
             if (len(versions_with_bad_parents) == 0 and
-                len(dangling_file_versions) == 0):
+                len(unused_versions) == 0):
                 continue
             full_text_versions = set()
-            unused_versions = set()
-            for dangling_version in dangling_file_versions:
-                version = dangling_version[1]
-                if dangling_version in used_file_versions:
-                    # This version *is* used by some revision, even though it
-                    # isn't used by its own revision!  We make sure any
-                    # revision referencing it is stored as a fulltext
-                    # This avoids bug 155730: it means that clients looking at
-                    # inventories to determine the versions to fetch will not
-                    # miss a required version.  (So clients can assume that if
-                    # they have a complete revision graph, and fetch all file
-                    # versions named by those revisions inventories, then they
-                    # will not have any missing parents for 'delta' knit
-                    # records.)
-                    # XXX: A better, but more difficult and slower fix would be
-                    # to rewrite the inventories referencing this version.
-                    full_text_versions.add(version)
-                else:
-                    # This version is totally unreferenced.  It should be
-                    # removed.
-                    unused_versions.add(version)
             self._fix_text_parent(file_id, vf, versions_with_bad_parents,
                 full_text_versions, unused_versions)
 
@@ -429,14 +408,18 @@
             self.transaction)
         new_parents = {}
         for version in vf.versions():
-            if version in versions_with_bad_parents:
+            if version in unused_versions:
+                continue
+            elif version in versions_with_bad_parents:
                 parents = versions_with_bad_parents[version][1]
             else:
                 parents = vf.get_parents(version)
             new_parents[version] = parents
+        if not len(new_parents):
+            # No used versions, remove the VF.
+            self.repo.weave_store.delete(file_id, self.transaction)
+            return
         for version in TopoSorter(new_parents.items()).iter_topo_order():
-            if version in unused_versions:
-                continue
             lines = vf.get_lines(version)
             parents = new_parents[version]
             if parents and (parents[0] in full_text_versions):

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2007-11-15 23:13:48 +0000
+++ b/bzrlib/repository.py	2007-11-16 01:28:59 +0000
@@ -2812,108 +2812,23 @@
     return _unescape_re.sub(_unescaper, data)
 
 
-class _RevisionTextVersionCache(object):
-    """A cache of the versionedfile versions for revision and file-id."""
-
-    def __init__(self, repository):
-        self.repository = repository
-        self.revision_versions = {}
-        self.revision_parents = {}
-        self.repo_graph = self.repository.get_graph()
-        # XXX: RBC: I haven't tracked down what uses this, but it would be
-        # better to use the headscache directly I think.
-        self.heads = graph.HeadsCache(self.repo_graph).heads
-
-    def add_revision_text_versions(self, tree):
-        """Cache text version data from the supplied revision tree"""
-        inv_revisions = {}
-        for path, entry in tree.iter_entries_by_dir():
-            inv_revisions[entry.file_id] = entry.revision
-        self.revision_versions[tree.get_revision_id()] = inv_revisions
-        return inv_revisions
-
-    def get_text_version(self, file_id, revision_id):
-        """Determine the text version for a given file-id and revision-id"""
-        try:
-            inv_revisions = self.revision_versions[revision_id]
-        except KeyError:
-            try:
-                tree = self.repository.revision_tree(revision_id)
-            except errors.RevisionNotPresent:
-                self.revision_versions[revision_id] = inv_revisions = {}
-            else:
-                inv_revisions = self.add_revision_text_versions(tree)
-        return inv_revisions.get(file_id)
-
-    def prepopulate_revs(self, revision_ids):
-        # Filter out versions that we don't have an inventory for, so that the
-        # revision_trees() call won't fail.
-        inv_weave = self.repository.get_inventory_weave()
-        revs = [r for r in revision_ids if inv_weave.has_version(r)]
-        # XXX: this loop is very similar to
-        # bzrlib.fetch.Inter1and2Helper.iter_rev_trees.
-        while revs:
-            mutter('%d revisions left to prepopulate', len(revs))
-            for tree in self.repository.revision_trees(revs[:100]):
-                if tree.inventory.revision_id is None:
-                    tree.inventory.revision_id = tree.get_revision_id()
-                self.add_revision_text_versions(tree)
-            revs = revs[100:]
-
-    def get_parents(self, revision_id):
-        try:
-            return self.revision_parents[revision_id]
-        except KeyError:
-            parents = self.repository.get_parents([revision_id])[0]
-            self.revision_parents[revision_id] = parents
-            return parents
-
-    def used_file_versions(self):
-        """Return a set of (revision_id, file_id) pairs for each file version
-        referenced by any inventory cached by this _RevisionTextVersionCache.
-
-        If the entire repository has been cached, this can be used to find all
-        file versions that are actually referenced by inventories.  Thus any
-        other file version is completely unused and can be removed safely.
-        """
-        result = set()
-        for inventory_summary in self.revision_versions.itervalues():
-            result.update(inventory_summary.items())
-        return result
-
-
 class VersionedFileChecker(object):
 
     def __init__(self, repository):
         self.repository = repository
+        self.text_index = self.repository._generate_text_key_index()
     
     def calculate_file_version_parents(self, revision_id, file_id):
         """Calculate the correct parents for a file version according to
         the inventories.
         """
-        text_revision = self.revision_versions.get_text_version(
-            file_id, revision_id)
-        if text_revision is None:
-            return None
-        parents_of_text_revision = self.revision_versions.get_parents(
-            text_revision)
-        parents_from_inventories = []
-        for parent in parents_of_text_revision:
-            if parent == _mod_revision.NULL_REVISION:
-                continue
-            introduced_in = self.revision_versions.get_text_version(file_id,
-                    parent)
-            if introduced_in is not None:
-                parents_from_inventories.append(introduced_in)
-        heads = set(self.revision_versions.heads(parents_from_inventories))
-        new_parents = []
-        for parent in parents_from_inventories:
-            if parent in heads and parent not in new_parents:
-                new_parents.append(parent)
-        return tuple(new_parents)
+        parent_keys = self.text_index[(file_id, revision_id)]
+        if parent_keys == [_mod_revision.NULL_REVISION]:
+            return ()
+        # strip the file_id, for the weave api
+        return tuple([revision_id for file_id, revision_id in parent_keys])
 
-    def check_file_version_parents(self, weave, file_id, planned_revisions,
-        revision_versions):
+    def check_file_version_parents(self, weave, file_id, planned_revisions):
         """Check the parents stored in a versioned file are correct.
 
         It also detects file versions that are not referenced by their
@@ -2926,26 +2841,20 @@
             revision_id) tuples for versions that are present in this versioned
             file, but not used by the corresponding inventory.
         """
-        # store the current task in instance variables.
-        self.planned_revisions = planned_revisions
-        self.revision_versions = revision_versions
         wrong_parents = {}
-        dangling_file_versions = set()
-        for num, revision_id in enumerate(self.planned_revisions):
-            correct_parents = self.calculate_file_version_parents(
-                revision_id, file_id)
-            if correct_parents is None:
-                continue
-            text_revision = self.revision_versions.get_text_version(
-                file_id, revision_id)
+        unused_versions = set()
+        for num, revision_id in enumerate(planned_revisions):
             try:
-                knit_parents = tuple(weave.get_parents(revision_id))
-            except errors.RevisionNotPresent:
-                knit_parents = None
-            if text_revision != revision_id:
-                # This file version is not referenced by its corresponding
-                # inventory!
-                dangling_file_versions.add((file_id, revision_id))
-            if correct_parents != knit_parents:
-                wrong_parents[revision_id] = (knit_parents, correct_parents)
-        return wrong_parents, dangling_file_versions
+                correct_parents = self.calculate_file_version_parents(
+                    revision_id, file_id)
+            except KeyError:
+                # we were asked to investigate a non-existant version.
+                unused_versions.add(revision_id)
+            else:
+                try:
+                    knit_parents = tuple(weave.get_parents(revision_id))
+                except errors.RevisionNotPresent:
+                    knit_parents = None
+                if correct_parents != knit_parents:
+                    wrong_parents[revision_id] = (knit_parents, correct_parents)
+        return wrong_parents, unused_versions

=== modified file 'bzrlib/tests/repository_implementations/__init__.py'
--- a/bzrlib/tests/repository_implementations/__init__.py	2007-11-15 02:07:37 +0000
+++ b/bzrlib/tests/repository_implementations/__init__.py	2007-11-16 01:28:59 +0000
@@ -169,7 +169,7 @@
         return self.populated_parents()
 
     def check_regexes(self, repo):
-        return ["0 unreferenced text ancestors"]
+        return ["0 unreferenced text versions"]
 
     def populate_repository(self, repo):
         # make rev1a: A well-formed revision, containing 'a-file'
@@ -201,24 +201,24 @@
     """
 
     def all_versions_after_reconcile(self):
-        return ('rev1a', 'rev1b', 'rev2')
+        return ('rev1a', 'rev2')
 
     def populated_parents(self):
         return (
             ((), 'rev1a'),
-            ((), 'rev1b'),
-            (('rev1a', 'rev1b'), 'rev2'))
+            ((), 'rev1b'), # Will be gc'd
+            (('rev1a', 'rev1b'), 'rev2')) # Will have parents trimmed
 
     def corrected_parents(self):
         return (
             ((), 'rev1a'),
-            ((), 'rev1b'),
+            (None, 'rev1b'),
             (('rev1a',), 'rev2'))
 
     def check_regexes(self, repo):
         return [r"\* a-file-id version rev2 has parents \('rev1a', 'rev1b'\) "
                 r"but should have \('rev1a',\)",
-                "1 unreferenced text ancestors",
+                "0 unreferenced text versions",
                 ]
 
     def populate_repository(self, repo):
@@ -284,8 +284,6 @@
     def check_regexes(self, repo):
         return [r"\* a-file-id version rev3 has parents "
                 r"\('rev1c',\) but should have \(\)",
-                # Also check reporting of unreferenced ancestors
-                r"unreferenced ancestor: {rev1c} in a-file-id",
                 ]
 
     def populate_repository(self, repo):
@@ -369,22 +367,21 @@
         if repo.supports_rich_root():
             # TREE_ROOT will be wrong; but we're not testing it. so just adjust
             # the expected count of errors.
-            count = 11
+            count = 9
         else:
-            count = 5
+            count = 3
         return [
             "%d inconsistent parents" % count,
-            r"a-file-id version rev2 has parents \('rev1a',\) "
-            r"but should have \(\)",
-            r"a-file-id version rev2b has parents \('rev1a',\) "
-            r"but should have \(\)",
+            # will be gc'd
+            r"unreferenced version: {rev2} in a-file-id",
+            r"unreferenced version: {rev2b} in a-file-id",
+            # will be corrected
             r"a-file-id version rev3 has parents \('rev2',\) "
             r"but should have \('rev1a',\)",
             r"a-file-id version rev5 has parents \('rev2', 'rev2c'\) "
             r"but should have \('rev2c',\)",
             r"a-file-id version rev4 has parents \('rev2',\) "
             r"but should have \('rev1a',\)",
-            "2 file versions are not referenced by their inventory",
             ]
 
     def populate_repository(self, repo):
@@ -511,7 +508,7 @@
             )
 
     def corrected_fulltexts(self):
-        return ['rev4']
+        return ['rev2']
 
     def check_regexes(self, repo):
         return []
@@ -681,15 +678,13 @@
         if repo.supports_rich_root():
             # TREE_ROOT will be wrong; but we're not testing it. so just adjust
             # the expected count of errors.
-            count = 4
+            count = 3
         else:
-            count = 2
+            count = 1
         return (
             "%d inconsistent parents" % count,
             r"\* a-file-id version current has parents "
             r"\('modified-something-else',\) but should have \('basis',\)",
-            r"\* a-file-id version modified-something-else has parents "
-            r"\('basis',\) but should have \(\)",
             )
 
     def populate_repository(self, repo):

=== modified file 'bzrlib/tests/repository_implementations/test_check.py'
--- a/bzrlib/tests/repository_implementations/test_check.py	2007-10-17 09:39:41 +0000
+++ b/bzrlib/tests/repository_implementations/test_check.py	2007-11-16 01:28:59 +0000
@@ -23,7 +23,6 @@
     inventory,
     revision as _mod_revision,
     )
-from bzrlib.repository import _RevisionTextVersionCache
 from bzrlib.tests import TestNotApplicable
 from bzrlib.tests.repository_implementations import TestCaseWithRepository
 from bzrlib.tests.repository_implementations.helpers import (



More information about the bazaar-commits mailing list