Rev 4360: Start checking file texts in a single pass. in http://people.ubuntu.com/~robertc/baz2.0/check

Robert Collins robertc at robertcollins.net
Tue Jun 16 01:38:00 BST 2009


At http://people.ubuntu.com/~robertc/baz2.0/check

------------------------------------------------------------
revno: 4360
revision-id: robertc at robertcollins.net-20090616003755-pmlsfdnx8e5obnwm
parent: robertc at robertcollins.net-20090615043336-g1x5l817wmssg66s
committer: Robert Collins <robertc at robertcollins.net>
branch nick: check
timestamp: Tue 2009-06-16 10:37:55 +1000
message:
  Start checking file texts in a single pass.
=== modified file 'bzrlib/check.py'
--- a/bzrlib/check.py	2009-06-01 03:33:15 +0000
+++ b/bzrlib/check.py	2009-06-16 00:37:55 +0000
@@ -71,8 +71,6 @@
         self.missing_parent_links = {}
         self.missing_inventory_sha_cnt = 0
         self.missing_revision_cnt = 0
-        # maps (file-id, version) -> sha1; used by InventoryFile._check
-        self.checked_texts = {}
         self.checked_weaves = set()
         self.unreferenced_versions = set()
         self.inconsistent_parents = []
@@ -300,20 +298,18 @@
             storebar.finished()
 
     def _check_weaves(self, storebar):
-        storebar.update('inventory', 0, 4)
-        # do not put in init, as it should be done with progess,
-        # and inside the lock.
-        inventory_weave = self.repository.inventories
-        inventory_weave.check(progress_bar=storebar)
-        storebar.update('text-deltas', 1)
-        self.repository.texts.check(progress_bar=storebar)
-        storebar.update('text-index', 2)
+        storebar.update('text-index', 0, 2)
         weave_checker = self.repository._get_versioned_file_checker(
-            text_key_references=self.text_key_references,
             ancestors=self.ancestors)
-        storebar.update('file-graph', 3)
-        result = weave_checker.check_file_version_parents(
-            self.repository.texts)
+        storebar.update('file-graph', 1)
+        if self.repository._format.fast_deltas:
+            # We haven't considered every fileid instance so far.
+            result = weave_checker.check_file_version_parents(
+                self.repository.texts)
+        else:
+            result = weave_checker.check_file_version_parents(
+                text_key_references=self.text_key_references,
+                self.repository.texts)
         self.checked_weaves = weave_checker.file_ids
         bad_parents, unused_versions = result
         bad_parents = bad_parents.items()
@@ -328,7 +324,7 @@
         self.unreferenced_versions.update(unused_versions)
 
     def _add_entry_to_text_key_references(self, inv, entry):
-        if not self.rich_roots and entry == inv.root:
+        if not self.rich_roots and entry.name == '':
             return
         key = (entry.file_id, entry.revision)
         self.text_key_references.setdefault(key, False)

=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-04-22 17:18:45 +0000
+++ b/bzrlib/groupcompress.py	2009-06-16 00:37:55 +0000
@@ -1029,11 +1029,14 @@
                 reannotate(parent_lines, chunks, key, None, head_cache))
         return parent_cache[key]
 
-    def check(self, progress_bar=None):
+    def check(self, progress_bar=None, keys=None):
         """See VersionedFiles.check()."""
-        keys = self.keys()
-        for record in self.get_record_stream(keys, 'unordered', True):
-            record.get_bytes_as('fulltext')
+        if keys is None:
+            keys = self.keys()
+            for record in self.get_record_stream(keys, 'unordered', True):
+                record.get_bytes_as('fulltext')
+        else:
+            return self.get_record_stream(keys, 'unordered', True)
 
     def _check_add(self, key, lines, random_id, check_content):
         """check that version_id and lines are safe to add."""

=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py	2009-05-12 07:27:33 +0000
+++ b/bzrlib/inventory.py	2009-06-16 00:37:55 +0000
@@ -262,7 +262,7 @@
     def versionable_kind(kind):
         return (kind in ('file', 'directory', 'symlink', 'tree-reference'))
 
-    def check(self, checker, rev_id, inv, tree):
+    def check(self, checker, rev_id, inv):
         """Check this inventory entry is intact.
 
         This is a template method, override _check for kind specific
@@ -274,18 +274,18 @@
         :param rev_id: Revision id from which this InventoryEntry was loaded.
              Not necessarily the last-changed revision for this file.
         :param inv: Inventory from which the entry was loaded.
-        :param tree: RevisionTree for this entry.
         """
         if self.parent_id is not None:
             if not inv.has_id(self.parent_id):
                 raise BzrCheckError('missing parent {%s} in inventory for revision {%s}'
                         % (self.parent_id, rev_id))
-        self._check(checker, rev_id, tree)
+        checker._add_entry_to_text_key_references(inv, self)
+        self._check(checker, rev_id)
 
-    def _check(self, checker, rev_id, tree):
+    def _check(self, checker, rev_id):
         """Check this inventory entry for kind specific errors."""
-        raise BzrCheckError('unknown entry kind %r in revision {%s}' %
-                            (self.kind, rev_id))
+        checker._report_items.append(
+            'unknown entry kind %r in revision {%s}' % (self.kind, rev_id))
 
     def copy(self):
         """Clone this inventory entry."""
@@ -404,7 +404,7 @@
                  'text_id', 'parent_id', 'children', 'executable',
                  'revision', 'symlink_target', 'reference_revision']
 
-    def _check(self, checker, rev_id, tree):
+    def _check(self, checker, rev_id):
         """See InventoryEntry._check"""
 
     def __init__(self, file_id):
@@ -433,11 +433,16 @@
                  'text_id', 'parent_id', 'children', 'executable',
                  'revision', 'symlink_target', 'reference_revision']
 
-    def _check(self, checker, rev_id, tree):
+    def _check(self, checker, rev_id):
         """See InventoryEntry._check"""
-        if self.text_sha1 is not None or self.text_size is not None or self.text_id is not None:
-            raise BzrCheckError('directory {%s} has text in revision {%s}'
+        if (self.text_sha1 is not None or self.text_size is not None or
+            self.text_id is not None):
+            checker._report_items.append('directory {%s} has text in revision {%s}'
                                 % (self.file_id, rev_id))
+        # Directories are stored as ''.
+        checker.add_pending_item(rev_id,
+            ('texts', self.file_id, self.revision), 'text',
+             'da39a3ee5e6b4b0d3255bfef95601890afd80709')
 
     def copy(self):
         other = InventoryDirectory(self.file_id, self.name, self.parent_id)
@@ -476,24 +481,16 @@
                  'text_id', 'parent_id', 'children', 'executable',
                  'revision', 'symlink_target', 'reference_revision']
 
-    def _check(self, checker, tree_revision_id, tree):
+    def _check(self, checker, tree_revision_id):
         """See InventoryEntry._check"""
-        key = (self.file_id, self.revision)
-        if key in checker.checked_texts:
-            prev_sha = checker.checked_texts[key]
-            if prev_sha == self.text_sha1:
-                return
-            raise BzrCheckError(
-                'mismatched sha1 on {%s} in {%s} (%s != %s) %r' %
-                (self.file_id, tree_revision_id, prev_sha, self.text_sha1,
-                 t))
-
-        # We can't check the length, because Weave doesn't store that
-        # information, and the whole point of looking at the weave's
-        # sha1sum is that we don't have to extract the text.
-        if (self.text_sha1 != tree._repository.texts.get_sha1s([key])[key]):
-            raise BzrCheckError('text {%s} version {%s} wrong sha1' % key)
-        checker.checked_texts[key] = self.text_sha1
+        # TODO: check size too.
+        checker.add_pending_item(tree_revision_id,
+            ('texts', self.file_id, self.revision), 'text',
+             self.text_sha1)
+        if self.text_size is None:
+            checker._report_items.append(
+                'fileid {%s} in {%s} has None for text_size' % (self.file_id,
+                tree_revision_id))
 
     def copy(self):
         other = InventoryFile(self.file_id, self.name, self.parent_id)
@@ -597,14 +594,20 @@
                  'text_id', 'parent_id', 'children', 'executable',
                  'revision', 'symlink_target', 'reference_revision']
 
-    def _check(self, checker, rev_id, tree):
+    def _check(self, checker, rev_id):
         """See InventoryEntry._check"""
         if self.text_sha1 is not None or self.text_size is not None or self.text_id is not None:
-            raise BzrCheckError('symlink {%s} has text in revision {%s}'
+            checker._report_items.append(
+               'symlink {%s} has text in revision {%s}'
                     % (self.file_id, rev_id))
         if self.symlink_target is None:
-            raise BzrCheckError('symlink {%s} has no target in revision {%s}'
+            checker._report_items.append(
+                'symlink {%s} has no target in revision {%s}'
                     % (self.file_id, rev_id))
+        # Symlinks are stored as ''
+        checker.add_pending_item(tree_revision_id,
+            ('texts', self.file_id, self.revision), 'text',
+             'da39a3ee5e6b4b0d3255bfef95601890afd80709')
 
     def copy(self):
         other = InventoryLink(self.file_id, self.name, self.parent_id)

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2009-06-01 03:33:15 +0000
+++ b/bzrlib/repository.py	2009-06-16 00:37:55 +0000
@@ -1153,37 +1153,115 @@
     def _check_inventories(self, checker):
         """Check the inventories found from the revision scan.
         
-        This checks all data that is tree-shape and not file-content.
+        This is responsible for verifying the sha1 of inventories and
+        creating a pending_keys set that covers data referenced by inventories.
         """
-        revbar = ui.ui_factory.nested_progress_bar()
+        bar = ui.ui_factory.nested_progress_bar()
+        try:
+            self._do_check_inventories(checker, bar)
+        finally:
+            bar.finished()
+
+    def _do_check_inventories(self, checker, bar):
+        """Helper for _check_inventories."""
         revno = 0
+        keys = {'chk_bytes':set(), 'inventories':set(), 'texts':set()}
+        kinds = ['chk_bytes', 'texts']
         count = len(checker.pending_keys)
+        bar.update("inventories", 0, 2)
         current_keys = checker.pending_keys
         checker.pending_keys = {}
-        keys = set()
+        # Accumulate current checks.
         for key in current_keys:
-            if key[0] != 'inventories':
-                checker._report_items.append('unknown key type %r' % key)
-            keys.add(key[1:])
-        # XXX: below is to-go code that accesses texts one at a time.
-        try:
-            while revno < len(checker.planned_revisions):
-                rev_id = checker.planned_revisions[revno]
-                revbar.update('checking revision', revno,
-                    len(checker.planned_revisions))
-                revno += 1
-                try:
-                    tree = self.revision_tree(rev_id)
-                except errors.NoSuchRevision:
-                    self._report_items.append(
-                        "Missing inventory for revision {%s}" % rev_id)
-                inv = tree.inventory
+            if key[0] != 'inventories' and key[0] not in kinds:
+                checker._report_items.append('unknown key type %r' % (key,))
+            keys[key[0]].add(key[1:])
+        if keys['inventories']:
+            # NB: output order *should* be roughly sorted - topo or
+            # inverse topo depending on repository - either way decent
+            # to just delta against. However, pre-CHK formats didn't
+            # try to optimise inventory layout on disk. As such the
+            # pre-CHK code path does not use inventory deltas.
+            last_object = None
+            for record in self.inventories.check(keys=keys['inventories']):
+                if record.storage_kind == 'absent':
+                    checker._report_items.append(
+                        'Missing inventory {%s}' % (record.key,))
+                else:
+                    last_object = self._check_record('inventories', record,
+                        checker, last_object,
+                        current_keys[('inventories',) + record.key])
+            del keys['inventories']
+        else:
+            return
+        bar.update("texts", 1)
+        while (checker.pending_keys or keys['chk_bytes']
+            or keys['texts']):
+            # Something to check.
+            current_keys = checker.pending_keys
+            checker.pending_keys = {}
+            # Accumulate current checks.
+            for key in current_keys:
+                if key[0] not in kinds:
+                    checker._report_items.append('unknown key type %r' % (key,))
+                keys[key[0]].add(key[1:])
+            # Check the outermost kind only - inventories || chk_bytes || texts
+            for kind in kinds:
+                if keys[kind]:
+                    last_object = None
+                    for record in getattr(self, kind).check(keys=keys[kind]):
+                        if record.storage_kind == 'absent':
+                            checker._report_items.append(
+                                'Missing inventory {%s}' % (record.key,))
+                        else:
+                            last_object = self._check_record(kind, record,
+                                checker, last_object, current_keys[(kind,) + record.key])
+                    keys[kind] = set()
+                    break
+
+    def _check_record(self, kind, record, checker, last_object, item_data):
+        """Check a single text from this repository."""
+        if kind == 'inventories':
+            rev_id = record.key[0]
+            inv = self.deserialise_inventory(rev_id,
+                record.get_bytes_as('fulltext'))
+            if last_object is not None:
+                delta = inv._make_delta(last_object)
+                for old_path, path, file_id, ie in delta:
+                    if ie is None:
+                        continue
+                    ie.check(checker, rev_id, inv)
+            else:
                 for path, ie in inv.iter_entries():
-                    checker._add_entry_to_text_key_references(inv, ie)
-                    file_id = ie.file_id
-                    ie.check(checker, rev_id, inv, tree)
-        finally:
-            revbar.finished()
+                    ie.check(checker, rev_id, inv)
+            if self._format.fast_deltas:
+                return inv
+        elif kind == 'chk_bytes':
+            # No code written to check chk_bytes for this repo format.
+            checker._report_items.append(
+                'unsupported key type chk_bytes for %s' % (record.key,))
+        elif kind == 'texts':
+            self._check_text(record, checker, item_data)
+        else:
+            checker._report_items.append(
+                'unknown key type %s for %s' % (kind, record.key))
+
+    def _check_text(self, record, checker, item_data):
+        """Check a single text."""
+        # Check it is extractable.
+        # TODO: check length.
+        if record.storage_kind == 'chunked':
+            chunks = record.get_bytes_as(record.storage_kind)
+            sha1 = osutils.sha_strings(chunks)
+            length = sum(map(len, chunks))
+        else:
+            content = record.get_bytes_as('fulltext')
+            sha1 = osutils.sha_string(content)
+            length = len(content)
+        if item_data and sha1 != item_data[1]:
+            checker._report_items.append(
+                'sha1 mismatch: %s has sha1 %s expected %s referenced by %s' %
+                (record.key, sha1, item_data[1], item_data[2]))
 
     @staticmethod
     def create(a_bzrdir):




More information about the bazaar-commits mailing list