Rev 5376: Create a multi-parent diff generator class. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576

Tue Aug 10 19:08:02 BST 2010

At http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576

------------------------------------------------------------
revno: 5376
revision-id: john at arbash-meinel.com-20100810180754-jf7zhjlkoaxpqsrr
parent: john at arbash-meinel.com-20100810170137-30ug1zkmjah15saf
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-send-mem-614576
timestamp: Tue 2010-08-10 13:07:54 -0500
message:
  Create a multi-parent diff generator class.
  
  This just encapsulates the state. That way we don't have to
  hold all texts in memory, but we can hold *some* texts in memory
  until we are done with them.
-------------- next part --------------
=== modified file 'bzrlib/tests/__init__.py'

--- a/bzrlib/tests/__init__.py	2010-08-07 16:42:09 +0000
+++ b/bzrlib/tests/__init__.py	2010-08-10 18:07:54 +0000
@@ -3828,6 +3828,7 @@
         'bzrlib.tests.test_urlutils',
         'bzrlib.tests.test_version',
         'bzrlib.tests.test_version_info',
+        'bzrlib.tests.test_versionedfile',
         'bzrlib.tests.test_weave',
         'bzrlib.tests.test_whitebox',
         'bzrlib.tests.test_win32utils',

=== added file 'bzrlib/tests/test_versionedfile.py'
--- a/bzrlib/tests/test_versionedfile.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_versionedfile.py	2010-08-10 18:07:54 +0000
@@ -0,0 +1,137 @@
+# Copyright (C) 2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for VersionedFile classes"""
+
+from bzrlib import (
+    errors,
+    groupcompress,
+    multiparent,
+    tests,
+    versionedfile,
+    )
+
+
+class Test_MPDiffGenerator(tests.TestCaseWithMemoryTransport):
+    # Should this be a per vf test?
+
+    def make_vf(self):
+        t = self.get_transport('')
+        factory = groupcompress.make_pack_factory(True, True, 1)
+        return factory(t)
+
+    def make_three_vf(self):
+        vf = self.make_vf()
+        vf.add_lines(('one',), (), ['first\n'])
+        vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+        vf.add_lines(('three',), [('one',), ('two',)],
+                    ['first\n', 'second\n', 'third\n'])
+        return vf
+
+    def test_finds_parents(self):
+        vf = self.make_three_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('three',)])
+        needed_keys, refcount = gen._find_needed_keys()
+        self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+                         sorted(needed_keys))
+        self.assertEqual({('one',): 1, ('two',): 1}, refcount)
+
+    def test_ignores_ghost_parents(self):
+        # If a parent is a ghost, it is just ignored
+        vf = self.make_vf()
+        vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+        gen = versionedfile._MPDiffGenerator(vf, [('two',)])
+        needed_keys, refcount = gen._find_needed_keys()
+        self.assertEqual(sorted([('two',)]), sorted(needed_keys))
+        # It is returned, but we don't really care as we won't extract it
+        self.assertEqual({('one',): 1}, refcount)
+        self.assertEqual([('one',)], sorted(gen.ghost_parents))
+
+    def test_raises_on_ghost_keys(self):
+        # If the requested key is a ghost, then we have a problem
+        vf = self.make_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('one',)])
+        self.assertRaises(errors.RevisionNotPresent,
+                          gen._find_needed_keys)
+
+    def test_refcount_multiple_children(self):
+        vf = self.make_three_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+        needed_keys, refcount = gen._find_needed_keys()
+        self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+                         sorted(needed_keys))
+        self.assertEqual({('one',): 2, ('two',): 1}, refcount)
+
+    def test_process_contents(self):
+        vf = self.make_three_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+        gen._find_needed_keys()
+        self.assertEqual({('two',): (('one',),),
+                          ('three',): (('one',), ('two',))},
+                         gen.parent_map)
+        self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+        self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+                         sorted(gen.needed_keys))
+        stream = vf.get_record_stream(gen.needed_keys, 'topological', True)
+        record = stream.next()
+        self.assertEqual(('one',), record.key)
+        # one is not needed in the output, but it is needed by children. As
+        # such, it should end up in the various caches
+        gen._process_one_record(record)
+        # The chunks should be cached, the refcount untouched
+        self.assertEqual([('one',)], gen.chunks.keys())
+        self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+        self.assertEqual([], gen.diffs.keys())
+        # Next we get 'two', which is something we output, but also needed for
+        # three
+        record = stream.next()
+        self.assertEqual(('two',), record.key)
+        gen._process_one_record(record)
+        # Both are now cached, and the diff for two has been extracted, and
+        # one's refcount has been updated. two has been removed from the
+        # parent_map
+        self.assertEqual(sorted([('one',), ('two',)]),
+                         sorted(gen.chunks.keys()))
+        self.assertEqual({('one',): 1, ('two',): 1}, gen.refcounts)
+        self.assertEqual([('two',)], gen.diffs.keys())
+        self.assertEqual({('three',): (('one',), ('two',))},
+                         gen.parent_map)
+        # Finally 'three', which allows us to remove all parents from the
+        # caches
+        record = stream.next()
+        self.assertEqual(('three',), record.key)
+        gen._process_one_record(record)
+        # Both are now cached, and the diff for two has been extracted, and
+        # one's refcount has been updated
+        self.assertEqual([], gen.chunks.keys())
+        self.assertEqual({}, gen.refcounts)
+        self.assertEqual(sorted([('two',), ('three',)]),
+                         sorted(gen.diffs.keys()))
+
+    def test_compute_diffs(self):
+        vf = self.make_three_vf()
+        # The content is in the order requested, even if it isn't topological
+        gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',),
+                                                  ('one',)])
+        diffs = gen.compute_diffs()
+        expected_diffs = [
+            multiparent.MultiParent([multiparent.ParentText(0, 0, 0, 1),
+                                     multiparent.NewText(['second\n'])]),
+            multiparent.MultiParent([multiparent.ParentText(1, 0, 0, 2),
+                                     multiparent.NewText(['third\n'])]),
+            multiparent.MultiParent([multiparent.NewText(['first\n'])]),
+            ]
+        self.assertEqual(expected_diffs, diffs)

=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py	2010-07-30 20:44:45 +0000
+++ b/bzrlib/versionedfile.py	2010-08-10 18:07:54 +0000
@@ -206,6 +206,134 @@
             yield record
 
 
+class _MPDiffGenerator(object):
+    """Pull out the functionality for generating mp_diffs."""
+
+    def __init__(self, vf, keys):
+        self.vf = vf
+        # This is the order the keys were requested in
+        self.ordered_keys = tuple(keys)
+        # keys + their parents, what we need to compute the diffs
+        self.needed_keys = ()
+        # Map from key: mp_diff
+        self.diffs = {}
+        # Map from key: parents_needed (may have ghosts)
+        self.parent_map = {}
+        # Parents that aren't present
+        self.ghost_parents = ()
+        # Map from parent_key => number of children for this text
+        self.refcounts = {}
+        # Content chunks that are cached while we still need them
+        self.chunks = {}
+
+    def _find_needed_keys(self):
+        """Find the set of keys we need to request.
+
+        This includes all the original keys passed in, and the non-ghost
+        parents of those keys.
+
+        :return: (needed_keys, refcounts)
+            needed_keys is the set of all texts we need to extract
+            refcounts is a dict of {key: num_children} letting us know when we
+                no longer need to cache a given parent text
+        """
+        # All the keys and their parents
+        needed_keys = set(self.ordered_keys)
+        parent_map = self.vf.get_parent_map(needed_keys)
+        self.parent_map = parent_map
+        # TODO: Should we be using a different construct here? I think this
+        #       uses difference_update internally, and we expect the result to
+        #       be tiny
+        missing_keys = needed_keys.difference(parent_map)
+        if missing_keys:
+            raise errors.RevisionNotPresent(list(missing_keys)[0], self.vf)
+        # Parents that might be missing. They are allowed to be ghosts, but we
+        # should check for them
+        refcounts = {}
+        setdefault = refcounts.setdefault
+        maybe_ghosts = set()
+        for child_key, parent_keys in parent_map.iteritems():
+            if not parent_keys:
+                # Ghost? We should never get here
+                continue
+            maybe_ghosts.update(p for p in parent_keys if p not in needed_keys)
+            needed_keys.update(parent_keys)
+            for p in parent_keys:
+                refcounts[p] = setdefault(p, 0) + 1
+        # Remove any parents that are actually ghosts
+        self.ghost_parents = maybe_ghosts.difference(
+                                self.vf.get_parent_map(maybe_ghosts))
+        needed_keys.difference_update(self.ghost_parents)
+        self.needed_keys = needed_keys
+        self.refcounts = refcounts
+        return needed_keys, refcounts
+
+    def _compute_diff(self, key, parent_lines, lines):
+        """Compute a single mp_diff, and store it in self._diffs"""
+        if len(parent_lines) > 0:
+            # XXX: _extract_blocks is not usefully defined anywhere...
+            #      It was meant to extract the left-parent diff without
+            #      having to recompute it for Knit content (pack-0.92,
+            #      etc). That seems to have regressed somewhere
+            left_parent_blocks = self.vf._extract_blocks(key,
+                parent_lines[0], lines)
+        else:
+            left_parent_blocks = None
+        diff = multiparent.MultiParent.from_lines(lines,
+                    parent_lines, left_parent_blocks)
+        self.diffs[key] = diff
+
+    def _process_one_record(self, record):
+        this_chunks = record.get_bytes_as('chunked')
+        if record.key in self.parent_map:
+            # This record should be ready to diff, since we requested
+            # content in 'topological' order
+            parent_keys = self.parent_map.pop(record.key)
+            parent_lines = []
+            for p in parent_keys:
+                # Alternatively we could check p not in self.needed_keys, but
+                # ghost_parents should be tiny versus huge
+                if p in self.ghost_parents:
+                    continue
+                refcount = self.refcounts[p]
+                if refcount == 1: # Last child reference
+                    self.refcounts.pop(p)
+                    parent_chunks = self.chunks.pop(p)
+                else:
+                    self.refcounts[p] = refcount - 1
+                    parent_chunks = self.chunks[p]
+                # TODO: Should we cache the line form? We did the
+                #       computation to get it, but storing it this way will
+                #       be less memory efficient...
+                parent_lines.append(osutils.chunks_to_lines(parent_chunks))
+            lines = osutils.chunks_to_lines(this_chunks)
+            # TODO: Should we be caching lines instead of chunks?
+            #       Higher-memory, but avoids double extracting.
+            #       If we have good topological sorting, we shouldn't have
+            #       much pending stuff cached...
+            ## this_chunks = lines
+            self._compute_diff(record.key, parent_lines, lines)
+        # Is this content required for any more children?
+        if record.key in self.refcounts:
+            self.chunks[record.key] = this_chunks
+
+    def _extract_diffs(self):
+        needed_keys, refcounts = self._find_needed_keys()
+        for record in self.vf.get_record_stream(needed_keys,
+                                                'topological', True):
+            if record.storage_kind == 'absent':
+                raise errors.RevisionNotPresent(record.key, self.vf)
+            self._process_one_record(record)
+        # At this point, we should have *no* remaining content
+        assert not self.parent_map
+        assert set(self.refcounts) == self.ghost_parents
+        
+    def compute_diffs(self):
+        self._extract_diffs()
+        dpop = self.diffs.pop
+        return [dpop(k) for k in self.ordered_keys]
+
+
 class VersionedFile(object):
     """Versioned text file storage.
 
@@ -1077,6 +1205,10 @@
             # between the creation and the application of the mpdiff.
             parent_lines = [lines[p] for p in parents if p in knit_keys]
             if len(parent_lines) > 0:
+                # XXX: _extract_blocks is not usefully defined anywhere... It
+                #       was meant to extract the left-parent diff without
+                #       having to recompute it for Knit content (pack-0.92,
+                #       etc)
                 left_parent_blocks = self._extract_blocks(key, parent_lines[0],
                     target)
             else: