Rev 2970: * ``pack-0.92`` repositories can now be reconciled. in http://people.ubuntu.com/~robertc/baz2.0/reconcile

Robert Collins robertc at robertcollins.net
Thu Nov 29 04:14:28 GMT 2007


At http://people.ubuntu.com/~robertc/baz2.0/reconcile

------------------------------------------------------------
revno: 2970
revision-id:robertc at robertcollins.net-20071129041406-4zzhzzpihmspiu9k
parent: robertc at robertcollins.net-20071129014512-qwfvultxzo9w6ot4
committer: Robert Collins <robertc at robertcollins.net>
branch nick: reconcile.packs
timestamp: Thu 2007-11-29 15:14:06 +1100
message:
  * ``pack-0.92`` repositories can now be reconciled.
    (Robert Collins, #154173)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
=== modified file 'NEWS'
--- a/NEWS	2007-11-28 01:35:49 +0000
+++ b/NEWS	2007-11-29 04:14:06 +0000
@@ -31,6 +31,9 @@
      tree roots that's recorded for all other directories.
      (Aaron Bentley, #164639)
 
+   * ``pack-0.92`` repositories can now be reconciled.
+     (Robert Collins, #154173)
+
    * ``switch`` command added for changing the branch a lightweight checkout
      is associated with and updating the tree to reflect the latest content
      accordingly. This command was previously part of the BzrTools plug-in.

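For readers who want to exercise the new behaviour: reconciling a pack-0.92 repository now also repairs incorrect text parents. The snippet below is an illustrative sketch only (not part of this commit); the path is a placeholder and the ``thorough`` keyword is an assumption about the reconcile() signature. The ``bzr reconcile`` command gives the same result from the command line.

    from bzrlib import bzrdir

    control = bzrdir.BzrDir.open('/path/to/repo')   # placeholder location
    repository = control.open_repository()
    # Assumed keyword: request the more expensive checks, including text parents.
    repository.reconcile(thorough=True)
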
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2007-11-29 01:45:12 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2007-11-29 04:14:06 +0000
@@ -373,6 +373,14 @@
                 self.pack_transport, self.name,
                 time.time() - self.start_time)
 
+    def flush(self):
+        """Flush any current data."""
+        if self._buffer[1]:
+            bytes = ''.join(self._buffer[0])
+            self.write_stream.write(bytes)
+            self._hash.update(bytes)
+            self._buffer[:] = [[], 0]
+
     def index_name(self, index_type, name):
         """Get the disk name of an index type for pack name 'name'."""
         return name + NewPack.index_definitions[index_type][0]
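The ``flush`` method added above drains the pack's write buffer: ``self._buffer`` holds a list of byte chunks plus a running byte count, and flushing joins the chunks, writes them in one call, updates the running hash, and resets the buffer. A free-standing sketch of that pattern with hypothetical names (this is not the bzrlib NewPack class):

    import hashlib
    from cStringIO import StringIO

    class BufferedWriter(object):
        """Illustrative only; mirrors the [chunks, length] buffer layout above."""

        def __init__(self, write_stream):
            self.write_stream = write_stream
            self._hash = hashlib.md5()
            self._buffer = [[], 0]      # [list of byte chunks, buffered byte count]

        def write(self, bytes):
            # Accumulate small writes rather than hitting the transport each time.
            self._buffer[0].append(bytes)
            self._buffer[1] += len(bytes)

        def flush(self):
            if self._buffer[1]:
                data = ''.join(self._buffer[0])
                self.write_stream.write(data)
                self._hash.update(data)
                self._buffer[:] = [[], 0]

    writer = BufferedWriter(StringIO())
    writer.write('record one\n')
    writer.flush()
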
@@ -531,6 +539,10 @@
         # What text keys to copy. None for 'all texts'. This is set by
         # _copy_inventory_texts
         self._text_filter = None
+        self._extra_init()
+
+    def _extra_init(self):
+        """A template hook to allow extending the constructor trivially."""
 
     def pack(self, pb=None):
         """Create a new pack by reading data from other packs.
@@ -621,49 +633,21 @@
         inv_lines = self._copy_nodes_graph(inv_nodes, inventory_index_map,
             self.new_pack._writer, self.new_pack.inventory_index, output_lines=True)
         if self.revision_ids:
-            fileid_revisions = self._pack_collection.repo._find_file_ids_from_xml_inventory_lines(
-                inv_lines, self.revision_ids)
-            text_filter = []
-            for fileid, file_revids in fileid_revisions.iteritems():
-                text_filter.extend(
-                    [(fileid, file_revid) for file_revid in file_revids])
+            self._process_inventory_lines(inv_lines)
         else:
             # eat the iterator to cause it to execute.
             list(inv_lines)
-            text_filter = None
+            self._text_filter = None
         if 'pack' in debug.debug_flags:
             mutter('%s: create_pack: inventories copied: %s%s %d items t+%6.3fs',
                 time.ctime(), self._pack_collection._upload_transport.base,
                 self.new_pack.random_name,
                 self.new_pack.inventory_index.key_count(),
                 time.time() - new_pack.start_time)
-        self._text_filter = text_filter
 
-    def _create_pack_from_packs(self):
-        self.pb.update("Opening pack", 0, 5)
-        self.new_pack = self.open_pack()
-        new_pack = self.new_pack
-        # buffer data - we won't be reading-back during the pack creation and
-        # this makes a significant difference on sftp pushes.
-        new_pack.set_write_cache_size(1024*1024)
-        if 'pack' in debug.debug_flags:
-            plain_pack_list = ['%s%s' % (a_pack.pack_transport.base, a_pack.name)
-                for a_pack in self.packs]
-            if self.revision_ids is not None:
-                rev_count = len(self.revision_ids)
-            else:
-                rev_count = 'all'
-            mutter('%s: create_pack: creating pack from source packs: '
-                '%s%s %s revisions wanted %s t=0',
-                time.ctime(), self._pack_collection._upload_transport.base, new_pack.random_name,
-                plain_pack_list, rev_count)
-        self._copy_revision_texts()
-        self._copy_inventory_texts()
+    def _copy_text_texts(self):
         # select text keys
-        text_index_map = self._pack_collection._packs_list_to_pack_map_and_index_list(
-            self.packs, 'text_index')[0]
-        text_nodes = self._pack_collection._index_contents(text_index_map,
-            self._text_filter)
+        text_index_map, text_nodes = self._get_text_nodes()
         if self._text_filter is not None:
             # We could return the keys copied as part of the return value from
             # _copy_nodes_graph but this doesn't work all that well with the
@@ -683,12 +667,30 @@
         # copy text keys and adjust values
         self.pb.update("Copying content texts", 3)
         list(self._copy_nodes_graph(text_nodes, text_index_map,
-            new_pack._writer, new_pack.text_index))
+            self.new_pack._writer, self.new_pack.text_index))
+        self._log_copied_texts()
+
+    def _create_pack_from_packs(self):
+        self.pb.update("Opening pack", 0, 5)
+        self.new_pack = self.open_pack()
+        new_pack = self.new_pack
+        # buffer data - we won't be reading-back during the pack creation and
+        # this makes a significant difference on sftp pushes.
+        new_pack.set_write_cache_size(1024*1024)
         if 'pack' in debug.debug_flags:
-            mutter('%s: create_pack: file texts copied: %s%s %d items t+%6.3fs',
+            plain_pack_list = ['%s%s' % (a_pack.pack_transport.base, a_pack.name)
+                for a_pack in self.packs]
+            if self.revision_ids is not None:
+                rev_count = len(self.revision_ids)
+            else:
+                rev_count = 'all'
+            mutter('%s: create_pack: creating pack from source packs: '
+                '%s%s %s revisions wanted %s t=0',
                 time.ctime(), self._pack_collection._upload_transport.base, new_pack.random_name,
-                new_pack.text_index.key_count(),
-                time.time() - new_pack.start_time)
+                plain_pack_list, rev_count)
+        self._copy_revision_texts()
+        self._copy_inventory_texts()
+        self._copy_text_texts()
         # select signature keys
         signature_filter = self._revision_keys # same keyspace
         signature_index_map = self._pack_collection._packs_list_to_pack_map_and_index_list(
@@ -832,6 +834,38 @@
                 pb.update("Copied record", record_index)
                 record_index += 1
 
+    def _external_compression_parents_of_new_texts(self):
+        keys = set()
+        refs = set()
+        for node in self.new_pack.text_index.iter_all_entries():
+            keys.add(node[1])
+            refs.update(node[3][1])
+        return refs - keys
+
+    def _get_text_nodes(self):
+        text_index_map = self._pack_collection._packs_list_to_pack_map_and_index_list(
+            self.packs, 'text_index')[0]
+        return text_index_map, self._pack_collection._index_contents(text_index_map,
+            self._text_filter)
+
+    def _log_copied_texts(self):
+        if 'pack' in debug.debug_flags:
+            mutter('%s: create_pack: file texts copied: %s%s %d items t+%6.3fs',
+                time.ctime(), self._pack_collection._upload_transport.base,
+                self.new_pack.random_name,
+                self.new_pack.text_index.key_count(),
+                time.time() - self.new_pack.start_time)
+
+    def _process_inventory_lines(self, inv_lines):
+        """Use up the inv_lines generator and setup a text key filter."""
+        repo = self._pack_collection.repo
+        fileid_revisions = repo._find_file_ids_from_xml_inventory_lines(
+            inv_lines, self.revision_ids)
+        text_filter = []
+        for fileid, file_revids in fileid_revisions.iteritems():
+            text_filter.extend([(fileid, file_revid) for file_revid in file_revids])
+        self._text_filter = text_filter
+
     def _use_pack(self, new_pack):
         """Return True if new_pack should be used.
 
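The ``_external_compression_parents_of_new_texts`` helper introduced above is a set difference over the new pack's text index: every compression parent that is referenced by a copied node but not itself present. A small self-contained sketch, using plain tuples in place of GraphIndex entries and assuming the node layout ``(index, key, value, (text_parents, compression_parents))`` used elsewhere in this code:

    def external_compression_parents(nodes):
        """Return compression parents referenced by nodes but absent from them."""
        keys = set()
        refs = set()
        for index, key, value, (text_parents, compression_parents) in nodes:
            keys.add(key)
            refs.update(compression_parents)
        return refs - keys

    # Hypothetical data: 'rev2' delta-compresses against 'rev1', which was not copied.
    nodes = [
        (None, ('file-id', 'rev2'), '100 20',
         ((('file-id', 'rev1'),), (('file-id', 'rev1'),))),
    ]
    print external_compression_parents(nodes)   # set([('file-id', 'rev1')])
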
@@ -848,9 +882,120 @@
     regenerated.
     """
 
+    def _extra_init(self):
+        self._data_changed = False
+
+    def _process_inventory_lines(self, inv_lines):
+        """Generate a text key reference map rather for reconciling with."""
+        repo = self._pack_collection.repo
+        refs = repo._find_text_key_references_from_xml_inventory_lines(
+            inv_lines)
+        self._text_refs = refs
+        # during reconcile we:
+        #  - convert unreferenced texts to full texts
+        #  - correct texts which reference a text not copied to be full texts
+        #  - copy all others as-is but with corrected parents.
+        #  - so at this point we don't know enough to decide what becomes a full
+        #    text.
+        self._text_filter = None
+
+    def _copy_text_texts(self):
+        """generate what texts we should have and then copy."""
+        self.pb.update("Copying content texts", 3)
+        # we have several major tasks here:
+        # 1) generate the ideal index
+        repo = self._pack_collection.repo
+        ideal_index = repo._generate_text_key_index(self._text_refs)
+        # 2) generate a text_nodes list that contains all the deltas that can
+        #    be used as-is, with corrected parents.
+        ok_nodes = []
+        bad_texts = []
+        discarded_nodes = []
+        NULL_REVISION = _mod_revision.NULL_REVISION
+        text_index_map, text_nodes = self._get_text_nodes()
+        for node in text_nodes:
+            # 0 - index
+            # 1 - key 
+            # 2 - value
+            # 3 - refs
+            try:
+                ideal_parents = tuple(ideal_index[node[1]])
+            except KeyError:
+                discarded_nodes.append(node)
+                self._data_changed = True
+            else:
+                if ideal_parents == (NULL_REVISION,):
+                    ideal_parents = ()
+                if ideal_parents == node[3][0]:
+                    # no change needed.
+                    ok_nodes.append(node)
+                elif ideal_parents[0:1] == node[3][0][0:1]:
+                    # the left most parent is the same, or there are no parents
+                    # today. Either way, we can preserve the representation as
+                    # long as we change the refs to be inserted.
+                    ok_nodes.append((node[0], node[1], node[2],
+                        (ideal_parents, node[3][1])))
+                    self._data_changed = True
+                else:
+                    # Reinsert this text completely
+                    bad_texts.append((node[1], ideal_parents))
+                    self._data_changed = True
+        # we're finished with some data.
+        del ideal_index
+        del text_nodes
+        # 3) bulk copy the ok data
+        list(self._copy_nodes_graph(ok_nodes, text_index_map,
+            self.new_pack._writer, self.new_pack.text_index))
+        # 4) ad hoc copy all the other texts.
+        transaction = repo.get_transaction()
+        file_id_index = GraphIndexPrefixAdapter(
+            self.new_pack.text_index,
+            ('blank', ), 1,
+            add_nodes_callback=self.new_pack.text_index.add_nodes)
+        knit_index = KnitGraphIndex(file_id_index,
+            add_callback=file_id_index.add_nodes,
+            deltas=True, parents=True)
+        output_knit = knit.KnitVersionedFile('reconcile-texts',
+            self._pack_collection.transport,
+            None,
+            index=knit_index,
+            access_method=_PackAccess(
+                {self.new_pack.text_index:self.new_pack.access_tuple()},
+                (self.new_pack._writer, self.new_pack.text_index)),
+            factory=knit.KnitPlainFactory())
+        for key, parent_keys in bad_texts:
+            # Adding this text may read delta parents back from the new pack.
+            # A possible improvement would be to catch errors on short reads
+            # and only flush then.
+            self.new_pack.flush()
+            parents = []
+            for parent_key in parent_keys:
+                if parent_key[0] != key[0]:
+                    # Graph parents must match the fileid
+                    raise errors.BzrError('Mismatched key parent %r:%r' %
+                        (key, parent_keys))
+                parents.append(parent_key[1])
+            source_weave = repo.weave_store.get_weave(key[0], transaction)
+            text_lines = source_weave.get_lines(key[1])
+            # adapt the 'knit' to the current file_id.
+            file_id_index = GraphIndexPrefixAdapter(
+                self.new_pack.text_index,
+                (key[0], ), 1,
+                add_nodes_callback=self.new_pack.text_index.add_nodes)
+            knit_index._graph_index = file_id_index
+            knit_index._add_callback = file_id_index.add_nodes
+            output_knit.add_lines_with_ghosts(
+                key[1], parents, text_lines, random_id=True, check_content=False)
+        # 5) check that nothing inserted has a reference outside the keyspace.
+        missing_text_keys = self._external_compression_parents_of_new_texts()
+        if missing_text_keys:
+            raise errors.BzrError('Reference to missing compression parents %r'
+                % (missing_text_keys,))
+        self._log_copied_texts()
+
     def _use_pack(self, new_pack):
         """Override _use_pack to check for reconcile having changed content."""
-        self._data_changed = False
         # XXX: we might be better checking this at the copy time.
         original_inventory_keys = set()
         inv_index = self._pack_collection.inventory_index.combined_index
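The loop in ``_copy_text_texts`` above sorts every text node into buckets by comparing its recorded parents against the ideal index: reusable as-is, reusable once the parent references are corrected, or in need of full reinsertion, while unreferenced texts are discarded. A stripped-down sketch of that decision, using plain string keys and hypothetical data rather than real index nodes:

    NULL_REVISION = 'null:'

    def classify(nodes, ideal_index):
        """Split (key, recorded_parents) pairs into reuse/fix/reinsert/discard buckets."""
        ok, fix_parents, reinsert, discard = [], [], [], []
        for key, recorded_parents in nodes:
            try:
                ideal = tuple(ideal_index[key])
            except KeyError:
                discard.append(key)                # not referenced by any inventory
                continue
            if ideal == (NULL_REVISION,):
                ideal = ()
            if ideal == recorded_parents:
                ok.append(key)                     # delta and parents both reusable
            elif ideal[0:1] == recorded_parents[0:1]:
                fix_parents.append((key, ideal))   # same left-hand parent: keep the delta
            else:
                reinsert.append((key, ideal))      # must be re-stored as a full text
        return ok, fix_parents, reinsert, discard

    # 'a' and 'b' are reusable, 'c' needs a full reinsert, 'd' is unreferenced.
    ideal_index = {'a': [NULL_REVISION], 'b': ['a'], 'c': ['a']}
    nodes = [('a', ()), ('b', ('a',)), ('c', ('b',)), ('d', ())]
    print classify(nodes, ideal_index)
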
@@ -1590,7 +1735,7 @@
         self._transaction = None
         # for tests
         self._reconcile_does_inventory_gc = True
-        self._reconcile_fixes_text_parents = False
+        self._reconcile_fixes_text_parents = True
         self._reconcile_backsup_inventory = False
 
     def _abort_write_group(self):

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2007-11-28 01:35:49 +0000
+++ b/bzrlib/repository.py	2007-11-29 04:14:06 +0000
@@ -1242,7 +1242,7 @@
                 raise errors.NoSuchIdInRepository(self, file_id)
             yield callable_data, weave.get_lines(revision_id)
 
-    def _generate_text_key_index(self):
+    def _generate_text_key_index(self, text_key_references=None):
         """Generate a new text key index for the repository.
 
         This is an expensive function that will take considerable time to run.
@@ -1254,7 +1254,8 @@
         # All revisions, to find inventory parents.
         revision_graph = self.get_revision_graph_with_ghosts()
         ancestors = revision_graph.get_ancestors()
-        text_key_references = self.find_text_key_references()
+        if text_key_references is None:
+            text_key_references = self.find_text_key_references()
         pb = ui.ui_factory.nested_progress_bar()
         try:
             return self._do_generate_text_key_index(ancestors,
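The repository.py change lets a caller that has already walked the inventories (as the reconcile packer does while copying them) pass its text key reference map in, so the inventories are only scanned once. A free-standing sketch of that optional-precomputation pattern; the helper names below are placeholders, not bzrlib API:

    def find_text_key_references(inventories):
        """Placeholder for the expensive scan over every inventory."""
        refs = {}
        for inventory in inventories:
            for file_id, revision_id in inventory:
                refs[(file_id, revision_id)] = True
        return refs

    def generate_text_key_index(inventories, text_key_references=None):
        """Accept a precomputed reference map so callers can skip the rescan."""
        if text_key_references is None:
            text_key_references = find_text_key_references(inventories)
        # ... derive per-text parent keys from the reference map ...
        return sorted(text_key_references)

    inventories = [[('f1', 'r1')], [('f1', 'r2'), ('f2', 'r2')]]
    precomputed = find_text_key_references(inventories)   # gathered during the copy
    print generate_text_key_index(inventories, text_key_references=precomputed)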


