Rev 2736: Filter out texts and signatures not referenced by the revisions being copied during pack to pack fetching. in http://people.ubuntu.com/~robertc/baz2.0/repository

Robert Collins robertc at robertcollins.net
Fri Aug 24 07:56:24 BST 2007


At http://people.ubuntu.com/~robertc/baz2.0/repository

------------------------------------------------------------
revno: 2736
revision-id: robertc at robertcollins.net-20070824065606-9mfmnd3rync2r6i6
parent: robertc at robertcollins.net-20070824055312-duv6jw3ed0ls1gq3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: repository
timestamp: Fri 2007-08-24 16:56:06 +1000
message:
  Filter out texts and signatures not referenced by the revisions being copied during pack to pack fetching.
modified:
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2007-08-24 05:53:12 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2007-08-24 06:56:06 +0000
@@ -257,15 +257,23 @@
         # at this point.
         inv_lines = self._copy_nodes_graph(inv_nodes, inventory_index_map,
             writer, inv_index, output_lines=True)
-        for line in inv_lines:
-            pass
+        if revision_ids:
+            fileid_revisions = self.repo._find_file_ids_from_xml_inventory_lines(
+                inv_lines, revision_ids)
+            text_filter = []
+            for fileid, file_revids in fileid_revisions.iteritems():
+                text_filter.extend(
+                    [(fileid, file_revid) for file_revid in file_revids])
+        else:
+            list(inv_lines)
+            text_filter = None
         if 'fetch' in debug.debug_flags:
             mutter('%s: create_pack: inventories copied: %s%s %d items t+%6.3fs',
                 time.ctime(), self.repo._upload_transport.base, random_name,
                 inv_index.key_count(),
                 time.time() - start_time)
         # select text keys
-        text_nodes = self._index_contents(text_index_map)
+        text_nodes = self._index_contents(text_index_map, text_filter)
         # copy text keys and adjust values
         list(self._copy_nodes_graph(text_nodes, text_index_map, writer,
             text_index))
@@ -275,7 +283,9 @@
                 text_index.key_count(),
                 time.time() - start_time)
         # select signature keys
-        signature_nodes = self._index_contents(signature_index_map)
+        signature_filter = revision_keys # same keyspace
+        signature_nodes = self._index_contents(signature_index_map,
+            signature_filter)
         # copy signature keys and adjust values
         self._copy_nodes(signature_nodes, signature_index_map, writer, signature_index)
         if 'fetch' in debug.debug_flags:

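With revision_ids supplied, the hunk above streams the inventory lines being copied through _find_file_ids_from_xml_inventory_lines to learn exactly which (file_id, revision_id) text keys the fetched revisions reference, then hands that list to _index_contents so only the referenced texts are copied. Signatures need no inventory scan: they live in the revision keyspace, so revision_keys filters them directly. A minimal sketch (not bzrlib code; flatten_text_filter is a hypothetical name) of how the text filter is flattened from the helper's result:

    def flatten_text_filter(fileid_revisions):
        # fileid_revisions: {file_id: iterable of revision ids}, as
        # returned by the inventory-scanning helper.
        text_filter = []
        for file_id, file_revids in fileid_revisions.items():
            text_filter.extend(
                (file_id, revid) for revid in file_revids)
        return text_filter

    # flatten_text_filter({'f1': ['r1', 'r2']})
    # -> [('f1', 'r1'), ('f1', 'r2')]
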
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2007-08-24 03:14:11 +0000
+++ b/bzrlib/repository.py	2007-08-24 06:56:06 +0000
@@ -663,20 +663,18 @@
                                                          signature,
                                                          self.get_transaction())
 
-    def fileids_altered_by_revision_ids(self, revision_ids):
-        """Find the file ids and versions affected by revisions.
-
-        :param revisions: an iterable containing revision ids.
+    def _find_file_ids_from_xml_inventory_lines(self, line_iterator,
+        revision_ids):
+        """Helper routine for fileids_altered_by_revision_ids.
+
+        This performs the translation of xml lines to revision ids.
+
+        :param line_iterator: An iterator of lines
+        :param revision_ids: The revision ids to filter for.
         :return: a dictionary mapping altered file-ids to an iterable of
+        revision_ids. Each altered file-id has the exact revision_ids that
         altered it listed explicitly.
         """
-        assert self._serializer.support_altered_by_hack, \
-            ("fileids_altered_by_revision_ids only supported for branches " 
-             "which store inventory as unnested xml, not on %r" % self)
-        selected_revision_ids = set(osutils.safe_revision_id(r)
-                                    for r in revision_ids)
-        w = self.get_inventory_weave()
         result = {}
 
         # this code needs to read every new line in every inventory for the
@@ -698,44 +696,62 @@
         search = self._file_ids_altered_regex.search
         unescape = _unescape_xml
         setdefault = result.setdefault
+        for line in line_iterator:
+            match = search(line)
+            if match is None:
+                continue
+            # One call to match.group() returning multiple items is quite a
+            # bit faster than 2 calls to match.group() each returning 1
+            file_id, revision_id = match.group('file_id', 'revision_id')
+
+            # Inlining the cache lookups helps a lot when you make 170,000
+            # lines and 350k ids, versus 8.4 unique ids.
+            # Using a cache helps in 2 ways:
+            #   1) Avoids unnecessary decoding calls
+            #   2) Re-uses cached strings, which helps in future set and
+            #      equality checks.
+            # (2) is enough that removing encoding entirely along with
+            # the cache (so we are using plain strings) results in no
+            # performance improvement.
+            try:
+                revision_id = unescape_revid_cache[revision_id]
+            except KeyError:
+                unescaped = unescape(revision_id)
+                unescape_revid_cache[revision_id] = unescaped
+                revision_id = unescaped
+
+            if revision_id in revision_ids:
+                try:
+                    file_id = unescape_fileid_cache[file_id]
+                except KeyError:
+                    unescaped = unescape(file_id)
+                    unescape_fileid_cache[file_id] = unescaped
+                    file_id = unescaped
+                setdefault(file_id, set()).add(revision_id)
+        return result
+
+    def fileids_altered_by_revision_ids(self, revision_ids):
+        """Find the file ids and versions affected by revisions.
+
+        :param revision_ids: an iterable containing revision ids.
+        :return: a dictionary mapping altered file-ids to an iterable of
+        revision_ids. Each altered file-id has the exact revision_ids that
+        altered it listed explicitly.
+        """
+        assert self._serializer.support_altered_by_hack, \
+            ("fileids_altered_by_revision_ids only supported for branches " 
+             "which store inventory as unnested xml, not on %r" % self)
+        selected_revision_ids = set(osutils.safe_revision_id(r)
+                                    for r in revision_ids)
+        w = self.get_inventory_weave()
         pb = ui.ui_factory.nested_progress_bar()
         try:
-            for line in w.iter_lines_added_or_present_in_versions(
-                                        selected_revision_ids, pb=pb):
-                match = search(line)
-                if match is None:
-                    continue
-                # One call to match.group() returning multiple items is quite a
-                # bit faster than 2 calls to match.group() each returning 1
-                file_id, revision_id = match.group('file_id', 'revision_id')
-
-                # Inlining the cache lookups helps a lot when you make 170,000
-                # lines and 350k ids, versus 8.4 unique ids.
-                # Using a cache helps in 2 ways:
-                #   1) Avoids unnecessary decoding calls
-                #   2) Re-uses cached strings, which helps in future set and
-                #      equality checks.
-                # (2) is enough that removing encoding entirely along with
-                # the cache (so we are using plain strings) results in no
-                # performance improvement.
-                try:
-                    revision_id = unescape_revid_cache[revision_id]
-                except KeyError:
-                    unescaped = unescape(revision_id)
-                    unescape_revid_cache[revision_id] = unescaped
-                    revision_id = unescaped
-
-                if revision_id in selected_revision_ids:
-                    try:
-                        file_id = unescape_fileid_cache[file_id]
-                    except KeyError:
-                        unescaped = unescape(file_id)
-                        unescape_fileid_cache[file_id] = unescaped
-                        file_id = unescaped
-                    setdefault(file_id, set()).add(revision_id)
+            return self._find_file_ids_from_xml_inventory_lines(
+                w.iter_lines_added_or_present_in_versions(
+                    selected_revision_ids, pb=pb),
+                selected_revision_ids)
         finally:
             pb.finished()
-        return result
 
     def iter_files_bytes(self, desired_files):
         """Iterate through file versions.

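The repository.py change is a refactoring: the xml-line scan is pulled out of fileids_altered_by_revision_ids into the new _find_file_ids_from_xml_inventory_lines helper, so the pack fetcher can feed it the inventory lines it is already streaming, while the public method keeps its assertion, revision-id normalisation, and progress bar and simply delegates. A standalone sketch of the scan-and-cache pattern (the regex and unescape function here are simplified stand-ins for bzrlib's internals, not its actual ones):

    import re
    from xml.sax.saxutils import unescape

    _ids_search = re.compile(
        r'file_id="(?P<file_id>[^"]+)"'
        r'.*revision="(?P<revision_id>[^"]+)"').search

    def find_file_ids(line_iterator, revision_ids):
        unescape_revid_cache = {}
        unescape_fileid_cache = {}
        result = {}
        for line in line_iterator:
            match = _ids_search(line)
            if match is None:
                continue
            file_id, revision_id = match.group('file_id', 'revision_id')
            # Decode each distinct id once; the cached string is then
            # reused for the membership test and result sets below.
            try:
                revision_id = unescape_revid_cache[revision_id]
            except KeyError:
                revision_id = unescape_revid_cache.setdefault(
                    revision_id, unescape(revision_id))
            if revision_id in revision_ids:
                try:
                    file_id = unescape_fileid_cache[file_id]
                except KeyError:
                    file_id = unescape_fileid_cache.setdefault(
                        file_id, unescape(file_id))
                result.setdefault(file_id, set()).add(revision_id)
        return result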

