Rev 5377: (spiv) Add hidden option 'bzr reconcile --canonicalize-chks' to repair repos in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Mon Aug 16 07:46:19 BST 2010


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5377 [merge]
revision-id: pqm at pqm.ubuntu.com-20100816064617-wizstoapjbffkj05
parent: pqm at pqm.ubuntu.com-20100811024926-irizklc3vvd9h45p
parent: andrew.bennetts at canonical.com-20100812041907-lgauj5jm4voyjxbm
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2010-08-16 07:46:17 +0100
message:
  (spiv) Add hidden option 'bzr reconcile --canonicalize-chks' to repair repos
   affected by bug 522637. (Andrew Bennetts)
modified:
  bzrlib/builtins.py             builtins.py-20050830033751-fc01482b9ca23183
  bzrlib/reconcile.py            reweave_inventory.py-20051108164726-1e5e0934febac06e
  bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
  bzrlib/versionedfile.py        versionedfile.py-20060222045106-5039c71ee3b65490
=== modified file 'bzrlib/builtins.py'
--- a/bzrlib/builtins.py	2010-08-07 20:31:22 +0000
+++ b/bzrlib/builtins.py	2010-08-11 05:48:09 +0000
@@ -1592,11 +1592,17 @@
 
     _see_also = ['check']
     takes_args = ['branch?']
+    takes_options = [
+        Option('canonicalize-chks',
+               help='Make sure CHKs are in canonical form (repairs '
+                    'bug 522637).',
+               hidden=True),
+        ]
 
-    def run(self, branch="."):
+    def run(self, branch=".", canonicalize_chks=False):
         from bzrlib.reconcile import reconcile
         dir = bzrdir.BzrDir.open(branch)
-        reconcile(dir)
+        reconcile(dir, canonicalize_chks=canonicalize_chks)
 
 
 class cmd_revision_history(Command):

=== modified file 'bzrlib/reconcile.py'
--- a/bzrlib/reconcile.py	2010-05-20 18:23:17 +0000
+++ b/bzrlib/reconcile.py	2010-08-12 04:19:07 +0000
@@ -36,7 +36,7 @@
 from bzrlib.versionedfile import AdapterFactory, FulltextContentFactory
 
 
-def reconcile(dir, other=None):
+def reconcile(dir, canonicalize_chks=False):
     """Reconcile the data in dir.
 
     Currently this is limited to a inventory 'reweave'.
@@ -46,18 +46,19 @@
     Directly using Reconciler is recommended for library users that
     desire fine grained control or analysis of the found issues.
 
-    :param other: another bzrdir to reconcile against.
+    :param canonicalize_chks: Make sure CHKs are in canonical form.
     """
-    reconciler = Reconciler(dir, other=other)
+    reconciler = Reconciler(dir, canonicalize_chks=canonicalize_chks)
     reconciler.reconcile()
 
 
 class Reconciler(object):
     """Reconcilers are used to reconcile existing data."""
 
-    def __init__(self, dir, other=None):
+    def __init__(self, dir, other=None, canonicalize_chks=False):
         """Create a Reconciler."""
         self.bzrdir = dir
+        self.canonicalize_chks = canonicalize_chks
 
     def reconcile(self):
         """Perform reconciliation.
@@ -98,7 +99,15 @@
         ui.ui_factory.note('Reconciling repository %s' %
             self.repo.user_url)
         self.pb.update("Reconciling repository", 0, 1)
-        repo_reconciler = self.repo.reconcile(thorough=True)
+        if self.canonicalize_chks:
+            try:
+                self.repo.reconcile_canonicalize_chks
+            except AttributeError:
+                raise errors.BzrError(
+                    "%s cannot canonicalize CHKs." % (self.repo,))
+            repo_reconciler = self.repo.reconcile_canonicalize_chks()
+        else:
+            repo_reconciler = self.repo.reconcile(thorough=True)
         self.inconsistent_parents = repo_reconciler.inconsistent_parents
         self.garbage_inventories = repo_reconciler.garbage_inventories
         if repo_reconciler.aborted:
@@ -496,6 +505,12 @@
     #  - unlock the names list
     # https://bugs.launchpad.net/bzr/+bug/154173
 
+    def __init__(self, repo, other=None, thorough=False,
+            canonicalize_chks=False):
+        super(PackReconciler, self).__init__(repo, other=other,
+            thorough=thorough)
+        self.canonicalize_chks = canonicalize_chks
+
     def _reconcile_steps(self):
         """Perform the steps to reconcile this repository."""
         if not self.thorough:
@@ -509,8 +524,12 @@
         total_inventories = len(list(
             collection.inventory_index.combined_index.iter_all_entries()))
         if len(all_revisions):
-            new_pack =  self.repo._reconcile_pack(collection, packs,
-                ".reconcile", all_revisions, self.pb)
+            if self.canonicalize_chks:
+                reconcile_meth = self.repo._canonicalize_chks_pack
+            else:
+                reconcile_meth = self.repo._reconcile_pack
+            new_pack = reconcile_meth(collection, packs, ".reconcile",
+                all_revisions, self.pb)
             if new_pack is not None:
                 self._discard_and_save(packs)
         else:

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2010-05-14 13:25:05 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2010-08-11 06:14:09 +0000
@@ -32,11 +32,13 @@
     revision as _mod_revision,
     trace,
     ui,
+    versionedfile,
     )
 from bzrlib.btree_index import (
     BTreeGraphIndex,
     BTreeBuilder,
     )
+from bzrlib.decorators import needs_write_lock
 from bzrlib.groupcompress import (
     _GCGraphIndex,
     GroupCompressVersionedFiles,
@@ -425,8 +427,11 @@
         self._copy_stream(source_vf, target_vf, inventory_keys,
                           'inventories', self._get_filtered_inv_stream, 2)
 
+    def _get_chk_vfs_for_copy(self):
+        return self._build_vfs('chk', False, False)
+
     def _copy_chk_texts(self):
-        source_vf, target_vf = self._build_vfs('chk', False, False)
+        source_vf, target_vf = self._get_chk_vfs_for_copy()
         # TODO: This is technically spurious... if it is a performance issue,
         #       remove it
         total_keys = source_vf.keys()
@@ -578,6 +583,111 @@
         return new_pack.data_inserted() and self._data_changed
 
 
+class GCCHKCanonicalizingPacker(GCCHKPacker):
+    """A packer that ensures inventories have canonical-form CHK maps.
+    
+    Ideally this would be part of reconcile, but it's very slow and rarely
+    needed.  (It repairs repositories affected by
+    https://bugs.launchpad.net/bzr/+bug/522637).
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(GCCHKCanonicalizingPacker, self).__init__(*args, **kwargs)
+        self._data_changed = False
+    
+    def _exhaust_stream(self, source_vf, keys, message, vf_to_stream, pb_offset):
+        """Create and exhaust a stream, but don't insert it.
+        
+        This is useful to get the side-effects of generating a stream.
+        """
+        self.pb.update('scanning %s' % (message,), pb_offset)
+        child_pb = ui.ui_factory.nested_progress_bar()
+        try:
+            list(vf_to_stream(source_vf, keys, message, child_pb))
+        finally:
+            child_pb.finished()
+
+    def _copy_inventory_texts(self):
+        source_vf, target_vf = self._build_vfs('inventory', True, True)
+        source_chk_vf, target_chk_vf = self._get_chk_vfs_for_copy()
+        inventory_keys = source_vf.keys()
+        # First, copy the existing CHKs on the assumption that most of them
+        # will be correct.  This will save us from having to reinsert (and
+        # recompress) these records later at the cost of perhaps preserving a
+        # few unused CHKs. 
+        # (Iterate but don't insert _get_filtered_inv_stream to populate the
+        # variables needed by GCCHKPacker._copy_chk_texts.)
+        self._exhaust_stream(source_vf, inventory_keys, 'inventories',
+                self._get_filtered_inv_stream, 2)
+        GCCHKPacker._copy_chk_texts(self)
+        # Now copy and fix the inventories, and any regenerated CHKs.
+        def chk_canonicalizing_inv_stream(source_vf, keys, message, pb=None):
+            return self._get_filtered_canonicalizing_inv_stream(
+                source_vf, keys, message, pb, source_chk_vf, target_chk_vf)
+        self._copy_stream(source_vf, target_vf, inventory_keys,
+                          'inventories', chk_canonicalizing_inv_stream, 4)
+
+    def _copy_chk_texts(self):
+        # No-op; in this class this happens during _copy_inventory_texts.
+        pass
+
+    def _get_filtered_canonicalizing_inv_stream(self, source_vf, keys, message,
+            pb=None, source_chk_vf=None, target_chk_vf=None):
+        """Filter the texts of inventories, regenerating CHKs to make sure they
+        are canonical.
+        """
+        total_keys = len(keys)
+        target_chk_vf = versionedfile.NoDupeAddLinesDecorator(target_chk_vf)
+        def _filtered_inv_stream():
+            stream = source_vf.get_record_stream(keys, 'groupcompress', True)
+            search_key_name = None
+            for idx, record in enumerate(stream):
+                # Inventories should always be with revisions; assume success.
+                bytes = record.get_bytes_as('fulltext')
+                chk_inv = inventory.CHKInventory.deserialise(
+                    source_chk_vf, bytes, record.key)
+                if pb is not None:
+                    pb.update('inv', idx, total_keys)
+                chk_inv.id_to_entry._ensure_root()
+                if search_key_name is None:
+                    # Find the name corresponding to the search_key_func
+                    search_key_reg = chk_map.search_key_registry
+                    for search_key_name, func in search_key_reg.iteritems():
+                        if func == chk_inv.id_to_entry._search_key_func:
+                            break
+                canonical_inv = inventory.CHKInventory.from_inventory(
+                    target_chk_vf, chk_inv,
+                    maximum_size=chk_inv.id_to_entry._root_node._maximum_size,
+                    search_key_name=search_key_name)
+                if chk_inv.id_to_entry.key() != canonical_inv.id_to_entry.key():
+                    trace.mutter(
+                        'Non-canonical CHK map for id_to_entry of inv: %s '
+                        '(root is %s, should be %s)' % (chk_inv.revision_id,
+                        chk_inv.id_to_entry.key()[0],
+                        canonical_inv.id_to_entry.key()[0]))
+                    self._data_changed = True
+                p_id_map = chk_inv.parent_id_basename_to_file_id
+                p_id_map._ensure_root()
+                canon_p_id_map = canonical_inv.parent_id_basename_to_file_id
+                if p_id_map.key() != canon_p_id_map.key():
+                    trace.mutter(
+                        'Non-canonical CHK map for parent_id_to_basename of '
+                        'inv: %s (root is %s, should be %s)'
+                        % (chk_inv.revision_id, p_id_map.key()[0],
+                           canon_p_id_map.key()[0]))
+                    self._data_changed = True
+                yield versionedfile.ChunkedContentFactory(record.key,
+                        record.parents, record.sha1,
+                        canonical_inv.to_lines())
+            # We have finished processing all of the inventory records, we
+            # don't need these sets anymore
+        return _filtered_inv_stream()
+
+    def _use_pack(self, new_pack):
+        """Override _use_pack to check for reconcile having changed content."""
+        return new_pack.data_inserted() and self._data_changed
+
+
 class GCRepositoryPackCollection(RepositoryPackCollection):
 
     pack_factory = GCPack
@@ -999,10 +1109,24 @@
         finally:
             pb.finished()
 
+    @needs_write_lock
+    def reconcile_canonicalize_chks(self):
+        """Reconcile this repository to make sure all CHKs are in canonical
+        form.
+        """
+        from bzrlib.reconcile import PackReconciler
+        reconciler = PackReconciler(self, thorough=True, canonicalize_chks=True)
+        reconciler.reconcile()
+        return reconciler
+
     def _reconcile_pack(self, collection, packs, extension, revs, pb):
         packer = GCCHKReconcilePacker(collection, packs, extension)
         return packer.pack(pb)
 
+    def _canonicalize_chks_pack(self, collection, packs, extension, revs, pb):
+        packer = GCCHKCanonicalizingPacker(collection, packs, extension, revs)
+        return packer.pack(pb)
+
     def _get_source(self, to_format):
         """Return a source for streaming from this repository."""
         if self._format._serializer == to_format._serializer:

=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py	2010-08-11 01:27:46 +0000
+++ b/bzrlib/versionedfile.py	2010-08-16 06:46:17 +0000
@@ -1725,6 +1725,46 @@
                 yield (l, key)
 
 
+class NoDupeAddLinesDecorator(object):
+    """Decorator for a VersionedFiles that skips doing an add_lines if the key
+    is already present.
+    """
+
+    def __init__(self, store):
+        self._store = store
+
+    def add_lines(self, key, parents, lines, parent_texts=None,
+            left_matching_blocks=None, nostore_sha=None, random_id=False,
+            check_content=True):
+        """See VersionedFiles.add_lines.
+        
+        This implementation may return None as the third element of the return
+        value when the original store wouldn't.
+        """
+        if nostore_sha:
+            raise NotImplementedError(
+                "NoDupeAddLinesDecorator.add_lines does not implement the "
+                "nostore_sha behaviour.")
+        if key[-1] is None:
+            sha1 = osutils.sha_strings(lines)
+            key = ("sha1:" + sha1,)
+        else:
+            sha1 = None
+        if key in self._store.get_parent_map([key]):
+            # This key has already been inserted, so don't do it again.
+            if sha1 is None:
+                sha1 = osutils.sha_strings(lines)
+            return sha1, sum(map(len, lines)), None
+        return self._store.add_lines(key, parents, lines,
+                parent_texts=parent_texts,
+                left_matching_blocks=left_matching_blocks,
+                nostore_sha=nostore_sha, random_id=random_id,
+                check_content=check_content)
+
+    def __getattr__(self, name):
+        return getattr(self._store, name)
+
+
 def network_bytes_to_kind_and_offset(network_bytes):
     """Strip of a record kind from the front of network_bytes.
 




More information about the bazaar-commits mailing list