Rev 2736: Filter out texts and signatures not referenced by the revisions being copied during pack to pack fetching. in http://people.ubuntu.com/~robertc/baz2.0/repository
Robert Collins
robertc at robertcollins.net
Fri Aug 24 07:56:24 BST 2007
At http://people.ubuntu.com/~robertc/baz2.0/repository
------------------------------------------------------------
revno: 2736
revision-id: robertc at robertcollins.net-20070824065606-9mfmnd3rync2r6i6
parent: robertc at robertcollins.net-20070824055312-duv6jw3ed0ls1gq3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: repository
timestamp: Fri 2007-08-24 16:56:06 +1000
message:
Filter out texts and signatures not referenced by the revisions being copied during pack to pack fetching.
modified:
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/repository.py rev_storage.py-20051111201905-119e9401e46257e3
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2007-08-24 05:53:12 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2007-08-24 06:56:06 +0000
@@ -257,15 +257,23 @@
# at this point.
inv_lines = self._copy_nodes_graph(inv_nodes, inventory_index_map,
writer, inv_index, output_lines=True)
- for line in inv_lines:
- pass
+ if revision_ids:
+ fileid_revisions = self.repo._find_file_ids_from_xml_inventory_lines(
+ inv_lines, revision_ids)
+ text_filter = []
+ for fileid, file_revids in fileid_revisions.iteritems():
+ text_filter.extend(
+ [(fileid, file_revid) for file_revid in file_revids])
+ else:
+ list(inv_lines)
+ text_filter = None
if 'fetch' in debug.debug_flags:
mutter('%s: create_pack: inventories copied: %s%s %d items t+%6.3fs',
time.ctime(), self.repo._upload_transport.base, random_name,
inv_index.key_count(),
time.time() - start_time)
# select text keys
- text_nodes = self._index_contents(text_index_map)
+ text_nodes = self._index_contents(text_index_map, text_filter)
# copy text keys and adjust values
list(self._copy_nodes_graph(text_nodes, text_index_map, writer,
text_index))
@@ -275,7 +283,9 @@
text_index.key_count(),
time.time() - start_time)
# select signature keys
- signature_nodes = self._index_contents(signature_index_map)
+ signature_filter = revision_keys # same keyspace
+ signature_nodes = self._index_contents(signature_index_map,
+ signature_filter)
# copy signature keys and adjust values
self._copy_nodes(signature_nodes, signature_index_map, writer, signature_index)
if 'fetch' in debug.debug_flags:
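
[Editor's note on the pack_repo.py hunk above] The no-op drain of inv_lines is replaced by a pass that extracts (file_id, revision_id) pairs from the copied inventory XML and hands them to _index_contents as a key filter; revision_keys is reused directly as the signature filter because signature keys share the revision keyspace. Below is a minimal, illustrative sketch of that key-filtering idea; filtered_index_contents and the literal entry tuples are hypothetical stand-ins, not the real bzrlib GraphIndex API.

# Illustrative sketch only: this helper and the entry tuples are hypothetical
# stand-ins for bzrlib's index iteration, not its real API.

def filtered_index_contents(index_entries, key_filter=None):
    """Yield index entries, optionally restricted to the keys in key_filter.

    key_filter=None keeps the old unfiltered behaviour; with a filter, only
    the named keys survive, which is what keeps unreferenced texts and
    signatures out of the newly created pack.
    """
    if key_filter is None:
        for entry in index_entries:
            yield entry
        return
    wanted = set(key_filter)
    for entry in index_entries:
        if entry[0] in wanted:
            yield entry

# Text keys are (file_id, revision_id) tuples; signature keys are plain
# revision keys, so revision_keys can double as the signature filter.
entries = [
    (('file-a', 'rev-1'), 'value-1'),
    (('file-a', 'rev-2'), 'value-2'),
    (('file-b', 'rev-3'), 'value-3'),
]
text_filter = [('file-a', 'rev-2')]
assert [e[0] for e in filtered_index_contents(entries, text_filter)] == \
    [('file-a', 'rev-2')]
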
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2007-08-24 03:14:11 +0000
+++ b/bzrlib/repository.py 2007-08-24 06:56:06 +0000
@@ -663,20 +663,18 @@
signature,
self.get_transaction())
- def fileids_altered_by_revision_ids(self, revision_ids):
- """Find the file ids and versions affected by revisions.
-
- :param revisions: an iterable containing revision ids.
+ def _find_file_ids_from_xml_inventory_lines(self, line_iterator,
+ revision_ids):
+ """Helper routine for fileids_altered_by_revision_ids.
+
+ This performs the translation of xml lines to revision ids.
+
+ :param line_iterator: An iterator of lines
+ :param revision_ids: The revision ids to filter for.
:return: a dictionary mapping altered file-ids to an iterable of
revision_ids. Each altered file-ids has the exact revision_ids that
altered it listed explicitly.
"""
- assert self._serializer.support_altered_by_hack, \
- ("fileids_altered_by_revision_ids only supported for branches "
- "which store inventory as unnested xml, not on %r" % self)
- selected_revision_ids = set(osutils.safe_revision_id(r)
- for r in revision_ids)
- w = self.get_inventory_weave()
result = {}
# this code needs to read every new line in every inventory for the
@@ -698,44 +696,62 @@
search = self._file_ids_altered_regex.search
unescape = _unescape_xml
setdefault = result.setdefault
+ for line in line_iterator:
+ match = search(line)
+ if match is None:
+ continue
+ # One call to match.group() returning multiple items is quite a
+ # bit faster than 2 calls to match.group() each returning 1
+ file_id, revision_id = match.group('file_id', 'revision_id')
+
+ # Inlining the cache lookups helps a lot when you make 170,000
+ # lines and 350k ids, versus 8.4 unique ids.
+ # Using a cache helps in 2 ways:
+ # 1) Avoids unnecessary decoding calls
+ # 2) Re-uses cached strings, which helps in future set and
+ # equality checks.
+ # (2) is enough that removing encoding entirely along with
+ # the cache (so we are using plain strings) results in no
+ # performance improvement.
+ try:
+ revision_id = unescape_revid_cache[revision_id]
+ except KeyError:
+ unescaped = unescape(revision_id)
+ unescape_revid_cache[revision_id] = unescaped
+ revision_id = unescaped
+
+ if revision_id in revision_ids:
+ try:
+ file_id = unescape_fileid_cache[file_id]
+ except KeyError:
+ unescaped = unescape(file_id)
+ unescape_fileid_cache[file_id] = unescaped
+ file_id = unescaped
+ setdefault(file_id, set()).add(revision_id)
+ return result
+
+ def fileids_altered_by_revision_ids(self, revision_ids):
+ """Find the file ids and versions affected by revisions.
+
+ :param revisions: an iterable containing revision ids.
+ :return: a dictionary mapping altered file-ids to an iterable of
+ revision_ids. Each altered file-ids has the exact revision_ids that
+ altered it listed explicitly.
+ """
+ assert self._serializer.support_altered_by_hack, \
+ ("fileids_altered_by_revision_ids only supported for branches "
+ "which store inventory as unnested xml, not on %r" % self)
+ selected_revision_ids = set(osutils.safe_revision_id(r)
+ for r in revision_ids)
+ w = self.get_inventory_weave()
pb = ui.ui_factory.nested_progress_bar()
try:
- for line in w.iter_lines_added_or_present_in_versions(
- selected_revision_ids, pb=pb):
- match = search(line)
- if match is None:
- continue
- # One call to match.group() returning multiple items is quite a
- # bit faster than 2 calls to match.group() each returning 1
- file_id, revision_id = match.group('file_id', 'revision_id')
-
- # Inlining the cache lookups helps a lot when you make 170,000
- # lines and 350k ids, versus 8.4 unique ids.
- # Using a cache helps in 2 ways:
- # 1) Avoids unnecessary decoding calls
- # 2) Re-uses cached strings, which helps in future set and
- # equality checks.
- # (2) is enough that removing encoding entirely along with
- # the cache (so we are using plain strings) results in no
- # performance improvement.
- try:
- revision_id = unescape_revid_cache[revision_id]
- except KeyError:
- unescaped = unescape(revision_id)
- unescape_revid_cache[revision_id] = unescaped
- revision_id = unescaped
-
- if revision_id in selected_revision_ids:
- try:
- file_id = unescape_fileid_cache[file_id]
- except KeyError:
- unescaped = unescape(file_id)
- unescape_fileid_cache[file_id] = unescaped
- file_id = unescaped
- setdefault(file_id, set()).add(revision_id)
+ return self._find_file_ids_from_xml_inventory_lines(
+ w.iter_lines_added_or_present_in_versions(
+ selected_revision_ids, pb=pb),
+ selected_revision_ids)
finally:
pb.finished()
- return result
def iter_files_bytes(self, desired_files):
"""Iterate through file versions.