Rev 5379: Rework things a bit so the logic can be shared. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576
John Arbash Meinel
john at arbash-meinel.com
Tue Aug 10 21:03:52 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576
------------------------------------------------------------
revno: 5379
revision-id: john at arbash-meinel.com-20100810200344-6muerwvkafqu7w47
parent: john at arbash-meinel.com-20100810190229-uzdp8oqsa6a67i0n
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-send-mem-614576
timestamp: Tue 2010-08-10 15:03:44 -0500
message:
Rework things a bit so the logic can be shared.
It turns out that some of the peak memory usage actually occurs during the
inventory-to-string and string-to-bundle translations. So re-use the refcount logic there.
This actually does show a decrease in peak memory.
Specifically 'cd bzr.dev; bzr send ../2.2' drops from 221MB peak to 156MB.
We don't speed anything up (16.5s both ways), but peak memory is quite
a bit better.
-------------- next part --------------
=== modified file 'bzrlib/bundle/serializer/v4.py'
--- a/bzrlib/bundle/serializer/v4.py 2009-08-04 14:08:32 +0000
+++ b/bzrlib/bundle/serializer/v4.py 2010-08-10 20:03:44 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -30,11 +30,49 @@
serializer,
trace,
ui,
+ versionedfile as _mod_versionedfile,
)
from bzrlib.bundle import bundle_data, serializer as bundle_serializer
from bzrlib import bencode
+class _MPDiffInventoryGenerator(_mod_versionedfile._MPDiffGenerator):
+ """Generate Inventory diffs for serialized inventories."""
+
+ def __init__(self, repo, inventory_keys):
+ super(_MPDiffInventoryGenerator, self).__init__(repo.inventories,
+ inventory_keys)
+ self.repo = repo
+ self.sha1s = {}
+
+ def iter_diffs(self):
+ """Compute the diffs one at a time."""
+ # This is instead of compute_diffs() since we guarantee our ordering of
+ # inventories, we don't have to do any buffering
+ self._find_needed_keys()
+ # We actually use a slightly different ordering. We grab all of the
+ # parents first, and then grab the ordered requests.
+ needed_ids = [k[-1] for k in self.present_parents]
+ needed_ids.extend([k[-1] for k in self.ordered_keys])
+ inv_to_str = self.repo._serializer.write_inventory_to_string
+ for inv in self.repo.iter_inventories(needed_ids):
+ revision_id = inv.revision_id
+ key = (revision_id,)
+ if key in self.present_parents:
+ # Not a key we will transmit, which is a shame, since it means
+ # bundles don't work with stacked branches
+ parent_ids = None
+ else:
+ parent_ids = [k[-1] for k in self.parent_map[key]]
+ as_bytes = inv_to_str(inv)
+ self._process_one_record(key, (as_bytes,))
+ if parent_ids is None:
+ continue
+ diff = self.diffs.pop(key)
+ sha1 = osutils.sha_string(as_bytes)
+ yield revision_id, parent_ids, sha1, diff
+
+
class BundleWriter(object):
"""Writer for bundle-format files.
@@ -348,48 +386,10 @@
the other side.
"""
inventory_key_order = [(r,) for r in revision_order]
- parent_map = self.repository.inventories.get_parent_map(
- inventory_key_order)
- missing_keys = set(inventory_key_order).difference(parent_map)
- if missing_keys:
- raise errors.RevisionNotPresent(list(missing_keys)[0],
- self.repository.inventories)
- inv_to_str = self.repository._serializer.write_inventory_to_string
- # Make sure that we grab the parent texts first
- just_parents = set()
- map(just_parents.update, parent_map.itervalues())
- just_parents.difference_update(parent_map)
- # Ignore ghost parents
- present_parents = self.repository.inventories.get_parent_map(
- just_parents)
- ghost_keys = just_parents.difference(present_parents)
- needed_inventories = list(present_parents) + inventory_key_order
- needed_inventories = [k[-1] for k in needed_inventories]
- all_lines = {}
- for inv in self.repository.iter_inventories(needed_inventories):
- revision_id = inv.revision_id
- key = (revision_id,)
- as_bytes = inv_to_str(inv)
- # The sha1 is validated as the xml/textual form, not as the
- # form-in-the-repository
- sha1 = osutils.sha_string(as_bytes)
- as_lines = osutils.split_lines(as_bytes)
- del as_bytes
- all_lines[key] = as_lines
- if key in just_parents:
- # We don't transmit those entries
- continue
- # Create an mpdiff for this text, and add it to the output
- parent_keys = parent_map[key]
- # See the comment in VF.make_mpdiffs about how this effects
- # ordering when there are ghosts present. I think we have a latent
- # bug
- parent_lines = [all_lines[p_key] for p_key in parent_keys
- if p_key not in ghost_keys]
- diff = multiparent.MultiParent.from_lines(
- as_lines, parent_lines)
+ generator = _MPDiffInventoryGenerator(self.repository,
+ inventory_key_order)
+ for revision_id, parent_ids, sha1, diff in generator.iter_diffs():
text = ''.join(diff.to_patch())
- parent_ids = [k[-1] for k in parent_keys]
self.bundle.add_multiparent_record(text, sha1, parent_ids,
'inventory', revision_id, None)
=== modified file 'bzrlib/tests/test_versionedfile.py'
--- a/bzrlib/tests/test_versionedfile.py 2010-08-10 18:07:54 +0000
+++ b/bzrlib/tests/test_versionedfile.py 2010-08-10 20:03:44 +0000
@@ -59,6 +59,7 @@
# It is returned, but we don't really care as we won't extract it
self.assertEqual({('one',): 1}, refcount)
self.assertEqual([('one',)], sorted(gen.ghost_parents))
+ self.assertEqual([], sorted(gen.present_parents))
def test_raises_on_ghost_keys(self):
# If the requested key is a ghost, then we have a problem
@@ -74,6 +75,7 @@
self.assertEqual(sorted([('one',), ('two',), ('three',)]),
sorted(needed_keys))
self.assertEqual({('one',): 2, ('two',): 1}, refcount)
+ self.assertEqual([('one',)], sorted(gen.present_parents))
def test_process_contents(self):
vf = self.make_three_vf()
@@ -90,7 +92,7 @@
self.assertEqual(('one',), record.key)
# one is not needed in the output, but it is needed by children. As
# such, it should end up in the various caches
- gen._process_one_record(record)
+ gen._process_one_record(record.key, record.get_bytes_as('chunked'))
# The chunks should be cached, the refcount untouched
self.assertEqual([('one',)], gen.chunks.keys())
self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
@@ -99,7 +101,7 @@
# three
record = stream.next()
self.assertEqual(('two',), record.key)
- gen._process_one_record(record)
+ gen._process_one_record(record.key, record.get_bytes_as('chunked'))
# Both are now cached, and the diff for two has been extracted, and
# one's refcount has been updated. two has been removed from the
# parent_map
@@ -113,7 +115,7 @@
# caches
record = stream.next()
self.assertEqual(('three',), record.key)
- gen._process_one_record(record)
+ gen._process_one_record(record.key, record.get_bytes_as('chunked'))
# Both are now cached, and the diff for two has been extracted, and
# one's refcount has been updated
self.assertEqual([], gen.chunks.keys())
=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py 2010-08-10 19:02:29 +0000
+++ b/bzrlib/versionedfile.py 2010-08-10 20:03:44 +0000
@@ -251,18 +251,20 @@
# should check for them
refcounts = {}
setdefault = refcounts.setdefault
- maybe_ghosts = set()
+ just_parents = set()
for child_key, parent_keys in parent_map.iteritems():
if not parent_keys:
- # Ghost? We should never get here
+ # parent_keys may be None if a given VersionedFile claims to
+ # not support graph operations.
continue
- maybe_ghosts.update(p for p in parent_keys if p not in needed_keys)
+ just_parents.update(parent_keys)
needed_keys.update(parent_keys)
for p in parent_keys:
refcounts[p] = setdefault(p, 0) + 1
- # Remove any parents that are actually ghosts
- self.ghost_parents = maybe_ghosts.difference(
- self.vf.get_parent_map(maybe_ghosts))
+ just_parents.difference_update(parent_map)
+ # Remove any parents that are actually ghosts from the needed set
+ self.present_parents = set(self.vf.get_parent_map(just_parents))
+ self.ghost_parents = just_parents.difference(self.present_parents)
needed_keys.difference_update(self.ghost_parents)
self.needed_keys = needed_keys
self.refcounts = refcounts
@@ -283,12 +285,12 @@
parent_lines, left_parent_blocks)
self.diffs[key] = diff
- def _process_one_record(self, record):
- this_chunks = record.get_bytes_as('chunked')
- if record.key in self.parent_map:
+ def _process_one_record(self, key, this_chunks):
+ parent_keys = None
+ if key in self.parent_map:
# This record should be ready to diff, since we requested
# content in 'topological' order
- parent_keys = self.parent_map.pop(record.key)
+ parent_keys = self.parent_map.pop(key)
# If a VersionedFile claims 'no-graph' support, then it may return
# None for any parent request, so we replace it with an empty tuple
if parent_keys is None:
@@ -315,11 +317,11 @@
lines = osutils.chunks_to_lines(this_chunks)
# Since we needed the lines, we'll go ahead and cache them this way
this_chunks = lines
- self._compute_diff(record.key, parent_lines, lines)
+ self._compute_diff(key, parent_lines, lines)
del lines
# Is this content required for any more children?
- if record.key in self.refcounts:
- self.chunks[record.key] = this_chunks
+ if key in self.refcounts:
+ self.chunks[key] = this_chunks
def _extract_diffs(self):
needed_keys, refcounts = self._find_needed_keys()
@@ -327,7 +329,8 @@
'topological', True):
if record.storage_kind == 'absent':
raise errors.RevisionNotPresent(record.key, self.vf)
- self._process_one_record(record)
+ self._process_one_record(record.key,
+ record.get_bytes_as('chunked'))
# At this point, we should have *no* remaining content
assert not self.parent_map
assert set(self.refcounts) == self.ghost_parents
More information about the bazaar-commits
mailing list