Rev 5379: Rework things a bit so the logic can be shared. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576

Tue Aug 10 21:03:52 BST 2010

At http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576

------------------------------------------------------------
revno: 5379
revision-id: john at arbash-meinel.com-20100810200344-6muerwvkafqu7w47
parent: john at arbash-meinel.com-20100810190229-uzdp8oqsa6a67i0n
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-send-mem-614576
timestamp: Tue 2010-08-10 15:03:44 -0500
message:
  Rework things a bit so the logic can be shared.
  
  It turns out that some of the peak memory is actually during the inventory
  to string to bundle translations. So re-use the refcount logic there.
  This actually does show a decrease in peak memory.
  Specifically 'cd bzr.dev; bzr send ../2.2' drops from 221MB peak to 156MB.
  
  We don't speed anything up (16.5s both ways), but peak memory is quite
  a bit better.
-------------- next part --------------
=== modified file 'bzrlib/bundle/serializer/v4.py'

--- a/bzrlib/bundle/serializer/v4.py	2009-08-04 14:08:32 +0000
+++ b/bzrlib/bundle/serializer/v4.py	2010-08-10 20:03:44 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -30,11 +30,49 @@
     serializer,
     trace,
     ui,
+    versionedfile as _mod_versionedfile,
     )
 from bzrlib.bundle import bundle_data, serializer as bundle_serializer
 from bzrlib import bencode
 
 
+class _MPDiffInventoryGenerator(_mod_versionedfile._MPDiffGenerator):
+    """Generate Inventory diffs serialized inventories."""
+
+    def __init__(self, repo, inventory_keys):
+        super(_MPDiffInventoryGenerator, self).__init__(repo.inventories,
+            inventory_keys)
+        self.repo = repo
+        self.sha1s = {}
+
+    def iter_diffs(self):
+        """Compute the diffs one at a time."""
+        # This is instead of compute_diffs() since we guarantee our ordering of
+        # inventories, we don't have to do any buffering
+        self._find_needed_keys()
+        # We actually use a slightly different ordering. We grab all of the
+        # parents first, and then grab the ordered requests.
+        needed_ids = [k[-1] for k in self.present_parents]
+        needed_ids.extend([k[-1] for k in self.ordered_keys])
+        inv_to_str = self.repo._serializer.write_inventory_to_string
+        for inv in self.repo.iter_inventories(needed_ids):
+            revision_id = inv.revision_id
+            key = (revision_id,)
+            if key in self.present_parents:
+                # Not a key we will transmit, which is a shame, since because
+                # of that bundles don't work with stacked branches
+                parent_ids = None
+            else:
+                parent_ids = [k[-1] for k in self.parent_map[key]]
+            as_bytes = inv_to_str(inv)
+            self._process_one_record(key, (as_bytes,))
+            if parent_ids is None:
+                continue
+            diff = self.diffs.pop(key)
+            sha1 = osutils.sha_string(as_bytes)
+            yield revision_id, parent_ids, sha1, diff
+
+
 class BundleWriter(object):
     """Writer for bundle-format files.
 
@@ -348,48 +386,10 @@
         the other side.
         """
         inventory_key_order = [(r,) for r in revision_order]
-        parent_map = self.repository.inventories.get_parent_map(
-                            inventory_key_order)
-        missing_keys = set(inventory_key_order).difference(parent_map)
-        if missing_keys:
-            raise errors.RevisionNotPresent(list(missing_keys)[0],
-                                            self.repository.inventories)
-        inv_to_str = self.repository._serializer.write_inventory_to_string
-        # Make sure that we grab the parent texts first
-        just_parents = set()
-        map(just_parents.update, parent_map.itervalues())
-        just_parents.difference_update(parent_map)
-        # Ignore ghost parents
-        present_parents = self.repository.inventories.get_parent_map(
-                            just_parents)
-        ghost_keys = just_parents.difference(present_parents)
-        needed_inventories = list(present_parents) + inventory_key_order
-        needed_inventories = [k[-1] for k in needed_inventories]
-        all_lines = {}
-        for inv in self.repository.iter_inventories(needed_inventories):
-            revision_id = inv.revision_id
-            key = (revision_id,)
-            as_bytes = inv_to_str(inv)
-            # The sha1 is validated as the xml/textual form, not as the
-            # form-in-the-repository
-            sha1 = osutils.sha_string(as_bytes)
-            as_lines = osutils.split_lines(as_bytes)
-            del as_bytes
-            all_lines[key] = as_lines
-            if key in just_parents:
-                # We don't transmit those entries
-                continue
-            # Create an mpdiff for this text, and add it to the output
-            parent_keys = parent_map[key]
-            # See the comment in VF.make_mpdiffs about how this effects
-            # ordering when there are ghosts present. I think we have a latent
-            # bug
-            parent_lines = [all_lines[p_key] for p_key in parent_keys
-                            if p_key not in ghost_keys]
-            diff = multiparent.MultiParent.from_lines(
-                as_lines, parent_lines)
+        generator = _MPDiffInventoryGenerator(self.repository,
+                                              inventory_key_order)
+        for revision_id, parent_ids, sha1, diff in generator.iter_diffs():
             text = ''.join(diff.to_patch())
-            parent_ids = [k[-1] for k in parent_keys]
             self.bundle.add_multiparent_record(text, sha1, parent_ids,
                                                'inventory', revision_id, None)
 

=== modified file 'bzrlib/tests/test_versionedfile.py'
--- a/bzrlib/tests/test_versionedfile.py	2010-08-10 18:07:54 +0000
+++ b/bzrlib/tests/test_versionedfile.py	2010-08-10 20:03:44 +0000
@@ -59,6 +59,7 @@
         # It is returned, but we don't really care as we won't extract it
         self.assertEqual({('one',): 1}, refcount)
         self.assertEqual([('one',)], sorted(gen.ghost_parents))
+        self.assertEqual([], sorted(gen.present_parents))
 
     def test_raises_on_ghost_keys(self):
         # If the requested key is a ghost, then we have a problem
@@ -74,6 +75,7 @@
         self.assertEqual(sorted([('one',), ('two',), ('three',)]),
                          sorted(needed_keys))
         self.assertEqual({('one',): 2, ('two',): 1}, refcount)
+        self.assertEqual([('one',)], sorted(gen.present_parents))
 
     def test_process_contents(self):
         vf = self.make_three_vf()
@@ -90,7 +92,7 @@
         self.assertEqual(('one',), record.key)
         # one is not needed in the output, but it is needed by children. As
         # such, it should end up in the various caches
-        gen._process_one_record(record)
+        gen._process_one_record(record.key, record.get_bytes_as('chunked'))
         # The chunks should be cached, the refcount untouched
         self.assertEqual([('one',)], gen.chunks.keys())
         self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
@@ -99,7 +101,7 @@
         # three
         record = stream.next()
         self.assertEqual(('two',), record.key)
-        gen._process_one_record(record)
+        gen._process_one_record(record.key, record.get_bytes_as('chunked'))
         # Both are now cached, and the diff for two has been extracted, and
         # one's refcount has been updated. two has been removed from the
         # parent_map
@@ -113,7 +115,7 @@
         # caches
         record = stream.next()
         self.assertEqual(('three',), record.key)
-        gen._process_one_record(record)
+        gen._process_one_record(record.key, record.get_bytes_as('chunked'))
         # Both are now cached, and the diff for two has been extracted, and
         # one's refcount has been updated
         self.assertEqual([], gen.chunks.keys())

=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py	2010-08-10 19:02:29 +0000
+++ b/bzrlib/versionedfile.py	2010-08-10 20:03:44 +0000
@@ -251,18 +251,20 @@
         # should check for them
         refcounts = {}
         setdefault = refcounts.setdefault
-        maybe_ghosts = set()
+        just_parents = set()
         for child_key, parent_keys in parent_map.iteritems():
             if not parent_keys:
-                # Ghost? We should never get here
+                # parent_keys may be None if a given VersionedFile claims to
+                # not support graph operations.
                 continue
-            maybe_ghosts.update(p for p in parent_keys if p not in needed_keys)
+            just_parents.update(parent_keys)
             needed_keys.update(parent_keys)
             for p in parent_keys:
                 refcounts[p] = setdefault(p, 0) + 1
-        # Remove any parents that are actually ghosts
-        self.ghost_parents = maybe_ghosts.difference(
-                                self.vf.get_parent_map(maybe_ghosts))
+        just_parents.difference_update(parent_map)
+        # Remove any parents that are actually ghosts from the needed set
+        self.present_parents = set(self.vf.get_parent_map(just_parents))
+        self.ghost_parents = just_parents.difference(self.present_parents)
         needed_keys.difference_update(self.ghost_parents)
         self.needed_keys = needed_keys
         self.refcounts = refcounts
@@ -283,12 +285,12 @@
                     parent_lines, left_parent_blocks)
         self.diffs[key] = diff
 
-    def _process_one_record(self, record):
-        this_chunks = record.get_bytes_as('chunked')
-        if record.key in self.parent_map:
+    def _process_one_record(self, key, this_chunks):
+        parent_keys = None
+        if key in self.parent_map:
             # This record should be ready to diff, since we requested
             # content in 'topological' order
-            parent_keys = self.parent_map.pop(record.key)
+            parent_keys = self.parent_map.pop(key)
             # If a VersionedFile claims 'no-graph' support, then it may return
             # None for any parent request, so we replace it with an empty tuple
             if parent_keys is None:
@@ -315,11 +317,11 @@
             lines = osutils.chunks_to_lines(this_chunks)
             # Since we needed the lines, we'll go ahead and cache them this way
             this_chunks = lines
-            self._compute_diff(record.key, parent_lines, lines)
+            self._compute_diff(key, parent_lines, lines)
             del lines
         # Is this content required for any more children?
-        if record.key in self.refcounts:
-            self.chunks[record.key] = this_chunks
+        if key in self.refcounts:
+            self.chunks[key] = this_chunks
 
     def _extract_diffs(self):
         needed_keys, refcounts = self._find_needed_keys()
@@ -327,7 +329,8 @@
                                                 'topological', True):
             if record.storage_kind == 'absent':
                 raise errors.RevisionNotPresent(record.key, self.vf)
-            self._process_one_record(record)
+            self._process_one_record(record.key,
+                                     record.get_bytes_as('chunked'))
         # At this point, we should have *no* remaining content
         assert not self.parent_map
         assert set(self.refcounts) == self.ghost_parents