Rev 2996: Change check and reconcile to use the new _generate_text_key_index rather in http://people.ubuntu.com/~robertc/baz2.0/reconcile
Robert Collins
robertc at robertcollins.net
Fri Nov 16 01:31:16 GMT 2007
At http://people.ubuntu.com/~robertc/baz2.0/reconcile
------------------------------------------------------------
revno: 2996
revision-id:robertc at robertcollins.net-20071116012859-909bq5as99bj7hx2
parent: robertc at robertcollins.net-20071115231348-0vjmr1e8a1x3rx1p
committer: Robert Collins <robertc at robertcollins.net>
branch nick: reconcile.memory
timestamp: Fri 2007-11-16 12:28:59 +1100
message:
Change check and reconcile to use the new _generate_text_key_index rather
than a _RevisionTextVersionCache in calculating the correct per-file
parent information. This exposed some test errors which got changed, and
reporting some aspects of inventory-text mismatches become harder to
report (but easier to calculate for when we do start correcting it).
modified:
bzrlib/check.py check.py-20050309040759-f3a679400c06bcc1
bzrlib/reconcile.py reweave_inventory.py-20051108164726-1e5e0934febac06e
bzrlib/repository.py rev_storage.py-20051111201905-119e9401e46257e3
bzrlib/tests/repository_implementations/__init__.py __init__.py-20060131092037-9564957a7d4a841b
bzrlib/tests/repository_implementations/test_check.py test_check.py-20070824124512-38g4d135gcqxo4zb-1
=== modified file 'bzrlib/check.py'
--- a/bzrlib/check.py 2007-11-15 22:38:17 +0000
+++ b/bzrlib/check.py 2007-11-16 01:28:59 +0000
@@ -56,11 +56,8 @@
# maps (file-id, version) -> sha1; used by InventoryFile._check
self.checked_texts = {}
self.checked_weaves = {}
- self.revision_versions = _mod_repository._RevisionTextVersionCache(
- self.repository)
- self.unreferenced_ancestors = set()
+ self.unreferenced_versions = set()
self.inconsistent_parents = []
- self.dangling_versions = set()
def check(self):
self.repository.lock_read()
@@ -81,7 +78,7 @@
revno += 1
self.check_one_rev(rev_id)
# check_weaves is done after the revision scan so that
- # revision_versions is pre-populated
+ # revision index is known to be valid.
self.check_weaves()
finally:
self.progress.finished()
@@ -113,8 +110,8 @@
note('%6d file-ids', len(self.checked_weaves))
note('%6d unique file texts', self.checked_text_cnt)
note('%6d repeated file texts', self.repeated_text_cnt)
- note('%6d unreferenced text ancestors',
- len(self.unreferenced_ancestors))
+ note('%6d unreferenced text versions',
+ len(self.unreferenced_versions))
if self.missing_inventory_sha_cnt:
note('%6d revisions are missing inventory_sha1',
self.missing_inventory_sha_cnt)
@@ -135,8 +132,8 @@
for linker in linkers:
note(' * %s', linker)
if verbose:
- for file_id, revision_id in self.unreferenced_ancestors:
- log_error('unreferenced ancestor: {%s} in %s', revision_id,
+ for file_id, revision_id in self.unreferenced_versions:
+ log_error('unreferenced version: {%s} in %s', revision_id,
file_id)
if len(self.inconsistent_parents):
note('%6d inconsistent parents', len(self.inconsistent_parents))
@@ -157,9 +154,6 @@
' %s has wrong parents in index: '
'%r should be %r',
revision_id, index_parents, actual_parents)
- if self.dangling_versions:
- note('%6d file versions are not referenced by their inventory',
- len(self.dangling_versions))
def check_one_rev(self, rev_id):
"""Check one revision.
@@ -216,24 +210,18 @@
# No progress here, because it looks ugly.
w.check()
result = weave_checker.check_file_version_parents(w, weave_id,
- self.planned_revisions, self.revision_versions)
- bad_parents, dangling_versions = result
+ self.planned_revisions)
+ bad_parents, unused_versions = result
bad_parents = bad_parents.items()
- for revision_id, (weave_parents,correct_parents) in bad_parents:
+ for revision_id, (weave_parents, correct_parents) in bad_parents:
self.inconsistent_parents.append(
(revision_id, weave_id, weave_parents, correct_parents))
- if weave_parents is None:
- weave_parents = []
- unreferenced_parents = set(weave_parents)-set(correct_parents)
- for unreferenced_parent in unreferenced_parents:
- self.unreferenced_ancestors.add(
- (weave_id, unreferenced_parent))
- self.dangling_versions.update(dangling_versions)
+ for revision_id in unused_versions:
+ self.unreferenced_versions.add((weave_id, revision_id))
self.checked_weaves[weave_id] = True
def _check_revision_tree(self, rev_id):
tree = self.repository.revision_tree(rev_id)
- self.revision_versions.add_revision_text_versions(tree)
inv = tree.inventory
seen_ids = {}
for file_id in inv:
=== modified file 'bzrlib/reconcile.py'
--- a/bzrlib/reconcile.py 2007-11-15 22:38:17 +0000
+++ b/bzrlib/reconcile.py 2007-11-16 01:28:59 +0000
@@ -376,45 +376,24 @@
parent lists, and replaces the versionedfile with a corrected version.
"""
transaction = self.repo.get_transaction()
- revision_versions = repository._RevisionTextVersionCache(self.repo)
versions = self.revisions.versions()
mutter('Prepopulating revision text cache with %d revisions',
len(versions))
- revision_versions.prepopulate_revs(versions)
- used_file_versions = revision_versions.used_file_versions()
vf_checker = self.repo.get_versioned_file_checker()
- for num, file_id in enumerate(self.repo.weave_store):
+ # List all weaves before altering, to avoid race conditions when we
+ # delete unused weaves.
+ weaves = list(enumerate(self.repo.weave_store))
+ for num, file_id in weaves:
self.pb.update('Fixing text parents', num,
len(self.repo.weave_store))
vf = self.repo.weave_store.get_weave(file_id, transaction)
- versions_with_bad_parents, dangling_file_versions = \
+ versions_with_bad_parents, unused_versions = \
vf_checker.check_file_version_parents(vf, file_id,
- vf.versions(), revision_versions)
+ vf.versions())
if (len(versions_with_bad_parents) == 0 and
- len(dangling_file_versions) == 0):
+ len(unused_versions) == 0):
continue
full_text_versions = set()
- unused_versions = set()
- for dangling_version in dangling_file_versions:
- version = dangling_version[1]
- if dangling_version in used_file_versions:
- # This version *is* used by some revision, even though it
- # isn't used by its own revision! We make sure any
- # revision referencing it is stored as a fulltext
- # This avoids bug 155730: it means that clients looking at
- # inventories to determine the versions to fetch will not
- # miss a required version. (So clients can assume that if
- # they have a complete revision graph, and fetch all file
- # versions named by those revisions inventories, then they
- # will not have any missing parents for 'delta' knit
- # records.)
- # XXX: A better, but more difficult and slower fix would be
- # to rewrite the inventories referencing this version.
- full_text_versions.add(version)
- else:
- # This version is totally unreferenced. It should be
- # removed.
- unused_versions.add(version)
self._fix_text_parent(file_id, vf, versions_with_bad_parents,
full_text_versions, unused_versions)
@@ -429,14 +408,18 @@
self.transaction)
new_parents = {}
for version in vf.versions():
- if version in versions_with_bad_parents:
+ if version in unused_versions:
+ continue
+ elif version in versions_with_bad_parents:
parents = versions_with_bad_parents[version][1]
else:
parents = vf.get_parents(version)
new_parents[version] = parents
+ if not len(new_parents):
+ # No used versions, remove the VF.
+ self.repo.weave_store.delete(file_id, self.transaction)
+ return
for version in TopoSorter(new_parents.items()).iter_topo_order():
- if version in unused_versions:
- continue
lines = vf.get_lines(version)
parents = new_parents[version]
if parents and (parents[0] in full_text_versions):
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2007-11-15 23:13:48 +0000
+++ b/bzrlib/repository.py 2007-11-16 01:28:59 +0000
@@ -2812,108 +2812,23 @@
return _unescape_re.sub(_unescaper, data)
-class _RevisionTextVersionCache(object):
- """A cache of the versionedfile versions for revision and file-id."""
-
- def __init__(self, repository):
- self.repository = repository
- self.revision_versions = {}
- self.revision_parents = {}
- self.repo_graph = self.repository.get_graph()
- # XXX: RBC: I haven't tracked down what uses this, but it would be
- # better to use the headscache directly I think.
- self.heads = graph.HeadsCache(self.repo_graph).heads
-
- def add_revision_text_versions(self, tree):
- """Cache text version data from the supplied revision tree"""
- inv_revisions = {}
- for path, entry in tree.iter_entries_by_dir():
- inv_revisions[entry.file_id] = entry.revision
- self.revision_versions[tree.get_revision_id()] = inv_revisions
- return inv_revisions
-
- def get_text_version(self, file_id, revision_id):
- """Determine the text version for a given file-id and revision-id"""
- try:
- inv_revisions = self.revision_versions[revision_id]
- except KeyError:
- try:
- tree = self.repository.revision_tree(revision_id)
- except errors.RevisionNotPresent:
- self.revision_versions[revision_id] = inv_revisions = {}
- else:
- inv_revisions = self.add_revision_text_versions(tree)
- return inv_revisions.get(file_id)
-
- def prepopulate_revs(self, revision_ids):
- # Filter out versions that we don't have an inventory for, so that the
- # revision_trees() call won't fail.
- inv_weave = self.repository.get_inventory_weave()
- revs = [r for r in revision_ids if inv_weave.has_version(r)]
- # XXX: this loop is very similar to
- # bzrlib.fetch.Inter1and2Helper.iter_rev_trees.
- while revs:
- mutter('%d revisions left to prepopulate', len(revs))
- for tree in self.repository.revision_trees(revs[:100]):
- if tree.inventory.revision_id is None:
- tree.inventory.revision_id = tree.get_revision_id()
- self.add_revision_text_versions(tree)
- revs = revs[100:]
-
- def get_parents(self, revision_id):
- try:
- return self.revision_parents[revision_id]
- except KeyError:
- parents = self.repository.get_parents([revision_id])[0]
- self.revision_parents[revision_id] = parents
- return parents
-
- def used_file_versions(self):
- """Return a set of (revision_id, file_id) pairs for each file version
- referenced by any inventory cached by this _RevisionTextVersionCache.
-
- If the entire repository has been cached, this can be used to find all
- file versions that are actually referenced by inventories. Thus any
- other file version is completely unused and can be removed safely.
- """
- result = set()
- for inventory_summary in self.revision_versions.itervalues():
- result.update(inventory_summary.items())
- return result
-
-
class VersionedFileChecker(object):
def __init__(self, repository):
self.repository = repository
+ self.text_index = self.repository._generate_text_key_index()
def calculate_file_version_parents(self, revision_id, file_id):
"""Calculate the correct parents for a file version according to
the inventories.
"""
- text_revision = self.revision_versions.get_text_version(
- file_id, revision_id)
- if text_revision is None:
- return None
- parents_of_text_revision = self.revision_versions.get_parents(
- text_revision)
- parents_from_inventories = []
- for parent in parents_of_text_revision:
- if parent == _mod_revision.NULL_REVISION:
- continue
- introduced_in = self.revision_versions.get_text_version(file_id,
- parent)
- if introduced_in is not None:
- parents_from_inventories.append(introduced_in)
- heads = set(self.revision_versions.heads(parents_from_inventories))
- new_parents = []
- for parent in parents_from_inventories:
- if parent in heads and parent not in new_parents:
- new_parents.append(parent)
- return tuple(new_parents)
+ parent_keys = self.text_index[(file_id, revision_id)]
+ if parent_keys == [_mod_revision.NULL_REVISION]:
+ return ()
+ # strip the file_id, for the weave api
+ return tuple([revision_id for file_id, revision_id in parent_keys])
- def check_file_version_parents(self, weave, file_id, planned_revisions,
- revision_versions):
+ def check_file_version_parents(self, weave, file_id, planned_revisions):
"""Check the parents stored in a versioned file are correct.
It also detects file versions that are not referenced by their
@@ -2926,26 +2841,20 @@
revision_id) tuples for versions that are present in this versioned
file, but not used by the corresponding inventory.
"""
- # store the current task in instance variables.
- self.planned_revisions = planned_revisions
- self.revision_versions = revision_versions
wrong_parents = {}
- dangling_file_versions = set()
- for num, revision_id in enumerate(self.planned_revisions):
- correct_parents = self.calculate_file_version_parents(
- revision_id, file_id)
- if correct_parents is None:
- continue
- text_revision = self.revision_versions.get_text_version(
- file_id, revision_id)
+ unused_versions = set()
+ for num, revision_id in enumerate(planned_revisions):
try:
- knit_parents = tuple(weave.get_parents(revision_id))
- except errors.RevisionNotPresent:
- knit_parents = None
- if text_revision != revision_id:
- # This file version is not referenced by its corresponding
- # inventory!
- dangling_file_versions.add((file_id, revision_id))
- if correct_parents != knit_parents:
- wrong_parents[revision_id] = (knit_parents, correct_parents)
- return wrong_parents, dangling_file_versions
+ correct_parents = self.calculate_file_version_parents(
+ revision_id, file_id)
+ except KeyError:
+ # we were asked to investigate a non-existant version.
+ unused_versions.add(revision_id)
+ else:
+ try:
+ knit_parents = tuple(weave.get_parents(revision_id))
+ except errors.RevisionNotPresent:
+ knit_parents = None
+ if correct_parents != knit_parents:
+ wrong_parents[revision_id] = (knit_parents, correct_parents)
+ return wrong_parents, unused_versions
=== modified file 'bzrlib/tests/repository_implementations/__init__.py'
--- a/bzrlib/tests/repository_implementations/__init__.py 2007-11-15 02:07:37 +0000
+++ b/bzrlib/tests/repository_implementations/__init__.py 2007-11-16 01:28:59 +0000
@@ -169,7 +169,7 @@
return self.populated_parents()
def check_regexes(self, repo):
- return ["0 unreferenced text ancestors"]
+ return ["0 unreferenced text versions"]
def populate_repository(self, repo):
# make rev1a: A well-formed revision, containing 'a-file'
@@ -201,24 +201,24 @@
"""
def all_versions_after_reconcile(self):
- return ('rev1a', 'rev1b', 'rev2')
+ return ('rev1a', 'rev2')
def populated_parents(self):
return (
((), 'rev1a'),
- ((), 'rev1b'),
- (('rev1a', 'rev1b'), 'rev2'))
+ ((), 'rev1b'), # Will be gc'd
+ (('rev1a', 'rev1b'), 'rev2')) # Will have parents trimmed
def corrected_parents(self):
return (
((), 'rev1a'),
- ((), 'rev1b'),
+ (None, 'rev1b'),
(('rev1a',), 'rev2'))
def check_regexes(self, repo):
return [r"\* a-file-id version rev2 has parents \('rev1a', 'rev1b'\) "
r"but should have \('rev1a',\)",
- "1 unreferenced text ancestors",
+ "0 unreferenced text versions",
]
def populate_repository(self, repo):
@@ -284,8 +284,6 @@
def check_regexes(self, repo):
return [r"\* a-file-id version rev3 has parents "
r"\('rev1c',\) but should have \(\)",
- # Also check reporting of unreferenced ancestors
- r"unreferenced ancestor: {rev1c} in a-file-id",
]
def populate_repository(self, repo):
@@ -369,22 +367,21 @@
if repo.supports_rich_root():
# TREE_ROOT will be wrong; but we're not testing it. so just adjust
# the expected count of errors.
- count = 11
+ count = 9
else:
- count = 5
+ count = 3
return [
"%d inconsistent parents" % count,
- r"a-file-id version rev2 has parents \('rev1a',\) "
- r"but should have \(\)",
- r"a-file-id version rev2b has parents \('rev1a',\) "
- r"but should have \(\)",
+ # will be gc'd
+ r"unreferenced version: {rev2} in a-file-id",
+ r"unreferenced version: {rev2b} in a-file-id",
+ # will be corrected
r"a-file-id version rev3 has parents \('rev2',\) "
r"but should have \('rev1a',\)",
r"a-file-id version rev5 has parents \('rev2', 'rev2c'\) "
r"but should have \('rev2c',\)",
r"a-file-id version rev4 has parents \('rev2',\) "
r"but should have \('rev1a',\)",
- "2 file versions are not referenced by their inventory",
]
def populate_repository(self, repo):
@@ -511,7 +508,7 @@
)
def corrected_fulltexts(self):
- return ['rev4']
+ return ['rev2']
def check_regexes(self, repo):
return []
@@ -681,15 +678,13 @@
if repo.supports_rich_root():
# TREE_ROOT will be wrong; but we're not testing it. so just adjust
# the expected count of errors.
- count = 4
+ count = 3
else:
- count = 2
+ count = 1
return (
"%d inconsistent parents" % count,
r"\* a-file-id version current has parents "
r"\('modified-something-else',\) but should have \('basis',\)",
- r"\* a-file-id version modified-something-else has parents "
- r"\('basis',\) but should have \(\)",
)
def populate_repository(self, repo):
=== modified file 'bzrlib/tests/repository_implementations/test_check.py'
--- a/bzrlib/tests/repository_implementations/test_check.py 2007-10-17 09:39:41 +0000
+++ b/bzrlib/tests/repository_implementations/test_check.py 2007-11-16 01:28:59 +0000
@@ -23,7 +23,6 @@
inventory,
revision as _mod_revision,
)
-from bzrlib.repository import _RevisionTextVersionCache
from bzrlib.tests import TestNotApplicable
from bzrlib.tests.repository_implementations import TestCaseWithRepository
from bzrlib.tests.repository_implementations.helpers import (
More information about the bazaar-commits
mailing list