Rev 4360: Start checking file texts in a single pass. in http://people.ubuntu.com/~robertc/baz2.0/check
Robert Collins
robertc at robertcollins.net
Tue Jun 16 01:38:00 BST 2009
At http://people.ubuntu.com/~robertc/baz2.0/check
------------------------------------------------------------
revno: 4360
revision-id: robertc at robertcollins.net-20090616003755-pmlsfdnx8e5obnwm
parent: robertc at robertcollins.net-20090615043336-g1x5l817wmssg66s
committer: Robert Collins <robertc at robertcollins.net>
branch nick: check
timestamp: Tue 2009-06-16 10:37:55 +1000
message:
Start checking file texts in a single pass.
=== modified file 'bzrlib/check.py'
--- a/bzrlib/check.py 2009-06-01 03:33:15 +0000
+++ b/bzrlib/check.py 2009-06-16 00:37:55 +0000
@@ -71,8 +71,6 @@
self.missing_parent_links = {}
self.missing_inventory_sha_cnt = 0
self.missing_revision_cnt = 0
- # maps (file-id, version) -> sha1; used by InventoryFile._check
- self.checked_texts = {}
self.checked_weaves = set()
self.unreferenced_versions = set()
self.inconsistent_parents = []
@@ -300,20 +298,18 @@
storebar.finished()
def _check_weaves(self, storebar):
- storebar.update('inventory', 0, 4)
- # do not put in init, as it should be done with progess,
- # and inside the lock.
- inventory_weave = self.repository.inventories
- inventory_weave.check(progress_bar=storebar)
- storebar.update('text-deltas', 1)
- self.repository.texts.check(progress_bar=storebar)
- storebar.update('text-index', 2)
+ storebar.update('text-index', 0, 2)
weave_checker = self.repository._get_versioned_file_checker(
- text_key_references=self.text_key_references,
ancestors=self.ancestors)
- storebar.update('file-graph', 3)
- result = weave_checker.check_file_version_parents(
- self.repository.texts)
+ storebar.update('file-graph', 1)
+ if self.repository._format.fast_deltas:
+ # We haven't considered every fileid instance so far.
+ result = weave_checker.check_file_version_parents(
+ self.repository.texts)
+ else:
+ result = weave_checker.check_file_version_parents(
+ text_key_references=self.text_key_references,
+ self.repository.texts)
self.checked_weaves = weave_checker.file_ids
bad_parents, unused_versions = result
bad_parents = bad_parents.items()
@@ -328,7 +324,7 @@
self.unreferenced_versions.update(unused_versions)
def _add_entry_to_text_key_references(self, inv, entry):
- if not self.rich_roots and entry == inv.root:
+ if not self.rich_roots and entry.name == '':
return
key = (entry.file_id, entry.revision)
self.text_key_references.setdefault(key, False)
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-04-22 17:18:45 +0000
+++ b/bzrlib/groupcompress.py 2009-06-16 00:37:55 +0000
@@ -1029,11 +1029,14 @@
reannotate(parent_lines, chunks, key, None, head_cache))
return parent_cache[key]
- def check(self, progress_bar=None):
+ def check(self, progress_bar=None, keys=None):
"""See VersionedFiles.check()."""
- keys = self.keys()
- for record in self.get_record_stream(keys, 'unordered', True):
- record.get_bytes_as('fulltext')
+ if keys is None:
+ keys = self.keys()
+ for record in self.get_record_stream(keys, 'unordered', True):
+ record.get_bytes_as('fulltext')
+ else:
+ return self.get_record_stream(keys, 'unordered', True)
def _check_add(self, key, lines, random_id, check_content):
"""check that version_id and lines are safe to add."""
=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py 2009-05-12 07:27:33 +0000
+++ b/bzrlib/inventory.py 2009-06-16 00:37:55 +0000
@@ -262,7 +262,7 @@
def versionable_kind(kind):
return (kind in ('file', 'directory', 'symlink', 'tree-reference'))
- def check(self, checker, rev_id, inv, tree):
+ def check(self, checker, rev_id, inv):
"""Check this inventory entry is intact.
This is a template method, override _check for kind specific
@@ -274,18 +274,18 @@
:param rev_id: Revision id from which this InventoryEntry was loaded.
Not necessarily the last-changed revision for this file.
:param inv: Inventory from which the entry was loaded.
- :param tree: RevisionTree for this entry.
"""
if self.parent_id is not None:
if not inv.has_id(self.parent_id):
raise BzrCheckError('missing parent {%s} in inventory for revision {%s}'
% (self.parent_id, rev_id))
- self._check(checker, rev_id, tree)
+ checker._add_entry_to_text_key_references(inv, self)
+ self._check(checker, rev_id)
- def _check(self, checker, rev_id, tree):
+ def _check(self, checker, rev_id):
"""Check this inventory entry for kind specific errors."""
- raise BzrCheckError('unknown entry kind %r in revision {%s}' %
- (self.kind, rev_id))
+ checker._report_items.append(
+ 'unknown entry kind %r in revision {%s}' % (self.kind, rev_id))
def copy(self):
"""Clone this inventory entry."""
@@ -404,7 +404,7 @@
'text_id', 'parent_id', 'children', 'executable',
'revision', 'symlink_target', 'reference_revision']
- def _check(self, checker, rev_id, tree):
+ def _check(self, checker, rev_id):
"""See InventoryEntry._check"""
def __init__(self, file_id):
@@ -433,11 +433,16 @@
'text_id', 'parent_id', 'children', 'executable',
'revision', 'symlink_target', 'reference_revision']
- def _check(self, checker, rev_id, tree):
+ def _check(self, checker, rev_id):
"""See InventoryEntry._check"""
- if self.text_sha1 is not None or self.text_size is not None or self.text_id is not None:
- raise BzrCheckError('directory {%s} has text in revision {%s}'
+ if (self.text_sha1 is not None or self.text_size is not None or
+ self.text_id is not None):
+ checker._report_items.append('directory {%s} has text in revision {%s}'
% (self.file_id, rev_id))
+ # Directories are stored as ''.
+ checker.add_pending_item(rev_id,
+ ('texts', self.file_id, self.revision), 'text',
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709')
def copy(self):
other = InventoryDirectory(self.file_id, self.name, self.parent_id)
@@ -476,24 +481,16 @@
'text_id', 'parent_id', 'children', 'executable',
'revision', 'symlink_target', 'reference_revision']
- def _check(self, checker, tree_revision_id, tree):
+ def _check(self, checker, tree_revision_id):
"""See InventoryEntry._check"""
- key = (self.file_id, self.revision)
- if key in checker.checked_texts:
- prev_sha = checker.checked_texts[key]
- if prev_sha == self.text_sha1:
- return
- raise BzrCheckError(
- 'mismatched sha1 on {%s} in {%s} (%s != %s) %r' %
- (self.file_id, tree_revision_id, prev_sha, self.text_sha1,
- t))
-
- # We can't check the length, because Weave doesn't store that
- # information, and the whole point of looking at the weave's
- # sha1sum is that we don't have to extract the text.
- if (self.text_sha1 != tree._repository.texts.get_sha1s([key])[key]):
- raise BzrCheckError('text {%s} version {%s} wrong sha1' % key)
- checker.checked_texts[key] = self.text_sha1
+ # TODO: check size too.
+ checker.add_pending_item(tree_revision_id,
+ ('texts', self.file_id, self.revision), 'text',
+ self.text_sha1)
+ if self.text_size is None:
+ checker._report_items.append(
+ 'fileid {%s} in {%s} has None for text_size' % (self.file_id,
+ tree_revision_id))
def copy(self):
other = InventoryFile(self.file_id, self.name, self.parent_id)
@@ -597,14 +594,20 @@
'text_id', 'parent_id', 'children', 'executable',
'revision', 'symlink_target', 'reference_revision']
- def _check(self, checker, rev_id, tree):
+ def _check(self, checker, rev_id):
"""See InventoryEntry._check"""
if self.text_sha1 is not None or self.text_size is not None or self.text_id is not None:
- raise BzrCheckError('symlink {%s} has text in revision {%s}'
+ checker._report_items.append(
+ 'symlink {%s} has text in revision {%s}'
% (self.file_id, rev_id))
if self.symlink_target is None:
- raise BzrCheckError('symlink {%s} has no target in revision {%s}'
+ checker._report_items.append(
+ 'symlink {%s} has no target in revision {%s}'
% (self.file_id, rev_id))
+ # Symlinks are stored as ''
+ checker.add_pending_item(tree_revision_id,
+ ('texts', self.file_id, self.revision), 'text',
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709')
def copy(self):
other = InventoryLink(self.file_id, self.name, self.parent_id)
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2009-06-01 03:33:15 +0000
+++ b/bzrlib/repository.py 2009-06-16 00:37:55 +0000
@@ -1153,37 +1153,115 @@
def _check_inventories(self, checker):
"""Check the inventories found from the revision scan.
- This checks all data that is tree-shape and not file-content.
+ This is responsible for verifying the sha1 of inventories and
+ creating a pending_keys set that covers data referenced by inventories.
"""
- revbar = ui.ui_factory.nested_progress_bar()
+ bar = ui.ui_factory.nested_progress_bar()
+ try:
+ self._do_check_inventories(checker, bar)
+ finally:
+ bar.finished()
+
+ def _do_check_inventories(self, checker, bar):
+ """Helper for _check_inventories."""
revno = 0
+ keys = {'chk_bytes':set(), 'inventories':set(), 'texts':set()}
+ kinds = ['chk_bytes', 'texts']
count = len(checker.pending_keys)
+ bar.update("inventories", 0, 2)
current_keys = checker.pending_keys
checker.pending_keys = {}
- keys = set()
+ # Accumulate current checks.
for key in current_keys:
- if key[0] != 'inventories':
- checker._report_items.append('unknown key type %r' % key)
- keys.add(key[1:])
- # XXX: below is to-go code that accesses texts one at a time.
- try:
- while revno < len(checker.planned_revisions):
- rev_id = checker.planned_revisions[revno]
- revbar.update('checking revision', revno,
- len(checker.planned_revisions))
- revno += 1
- try:
- tree = self.revision_tree(rev_id)
- except errors.NoSuchRevision:
- self._report_items.append(
- "Missing inventory for revision {%s}" % rev_id)
- inv = tree.inventory
+ if key[0] != 'inventories' and key[0] not in kinds:
+ checker._report_items.append('unknown key type %r' % (key,))
+ keys[key[0]].add(key[1:])
+ if keys['inventories']:
+ # NB: output order *should* be roughly sorted - topo or
+ # inverse topo depending on repository - either way decent
+ # to just delta against. However, pre-CHK formats didn't
+ # try to optimise inventory layout on disk. As such the
+ # pre-CHK code path does not use inventory deltas.
+ last_object = None
+ for record in self.inventories.check(keys=keys['inventories']):
+ if record.storage_kind == 'absent':
+ checker._report_items.append(
+ 'Missing inventory {%s}' % (record.key,))
+ else:
+ last_object = self._check_record('inventories', record,
+ checker, last_object,
+ current_keys[('inventories',) + record.key])
+ del keys['inventories']
+ else:
+ return
+ bar.update("texts", 1)
+ while (checker.pending_keys or keys['chk_bytes']
+ or keys['texts']):
+ # Something to check.
+ current_keys = checker.pending_keys
+ checker.pending_keys = {}
+ # Accumulate current checks.
+ for key in current_keys:
+ if key[0] not in kinds:
+ checker._report_items.append('unknown key type %r' % (key,))
+ keys[key[0]].add(key[1:])
+ # Check the outermost kind only - inventories || chk_bytes || texts
+ for kind in kinds:
+ if keys[kind]:
+ last_object = None
+ for record in getattr(self, kind).check(keys=keys[kind]):
+ if record.storage_kind == 'absent':
+ checker._report_items.append(
+ 'Missing inventory {%s}' % (record.key,))
+ else:
+ last_object = self._check_record(kind, record,
+ checker, last_object, current_keys[(kind,) + record.key])
+ keys[kind] = set()
+ break
+
+ def _check_record(self, kind, record, checker, last_object, item_data):
+ """Check a single text from this repository."""
+ if kind == 'inventories':
+ rev_id = record.key[0]
+ inv = self.deserialise_inventory(rev_id,
+ record.get_bytes_as('fulltext'))
+ if last_object is not None:
+ delta = inv._make_delta(last_object)
+ for old_path, path, file_id, ie in delta:
+ if ie is None:
+ continue
+ ie.check(checker, rev_id, inv)
+ else:
for path, ie in inv.iter_entries():
- checker._add_entry_to_text_key_references(inv, ie)
- file_id = ie.file_id
- ie.check(checker, rev_id, inv, tree)
- finally:
- revbar.finished()
+ ie.check(checker, rev_id, inv)
+ if self._format.fast_deltas:
+ return inv
+ elif kind == 'chk_bytes':
+ # No code written to check chk_bytes for this repo format.
+ checker._report_items.append(
+ 'unsupported key type chk_bytes for %s' % (record.key,))
+ elif kind == 'texts':
+ self._check_text(record, checker, item_data)
+ else:
+ checker._report_items.append(
+ 'unknown key type %s for %s' % (kind, record.key))
+
+ def _check_text(self, record, checker, item_data):
+ """Check a single text."""
+ # Check it is extractable.
+ # TODO: check length.
+ if record.storage_kind == 'chunked':
+ chunks = record.get_bytes_as(record.storage_kind)
+ sha1 = osutils.sha_strings(chunks)
+ length = sum(map(len, chunks))
+ else:
+ content = record.get_bytes_as('fulltext')
+ sha1 = osutils.sha_string(content)
+ length = len(content)
+ if item_data and sha1 != item_data[1]:
+ checker._report_items.append(
+ 'sha1 mismatch: %s has sha1 %s expected %s referenced by %s' %
+ (record.key, sha1, item_data[1], item_data[2]))
@staticmethod
def create(a_bzrdir):
More information about the bazaar-commits
mailing list