Rev 5376: Create a multi-parent diff generator class. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576
John Arbash Meinel
john at arbash-meinel.com
Tue Aug 10 19:08:02 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-send-mem-614576
------------------------------------------------------------
revno: 5376
revision-id: john at arbash-meinel.com-20100810180754-jf7zhjlkoaxpqsrr
parent: john at arbash-meinel.com-20100810170137-30ug1zkmjah15saf
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-send-mem-614576
timestamp: Tue 2010-08-10 13:07:54 -0500
message:
Create a multi-parent diff generator class.
This just encapsulates the state. That way we don't have to
hold all texts in memory, but we can hold *some* texts in memory
until we are done with them.
-------------- next part --------------
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2010-08-07 16:42:09 +0000
+++ b/bzrlib/tests/__init__.py 2010-08-10 18:07:54 +0000
@@ -3828,6 +3828,7 @@
'bzrlib.tests.test_urlutils',
'bzrlib.tests.test_version',
'bzrlib.tests.test_version_info',
+ 'bzrlib.tests.test_versionedfile',
'bzrlib.tests.test_weave',
'bzrlib.tests.test_whitebox',
'bzrlib.tests.test_win32utils',
=== added file 'bzrlib/tests/test_versionedfile.py'
--- a/bzrlib/tests/test_versionedfile.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_versionedfile.py 2010-08-10 18:07:54 +0000
@@ -0,0 +1,137 @@
+# Copyright (C) 2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for VersionedFile classes"""
+
+from bzrlib import (
+ errors,
+ groupcompress,
+ multiparent,
+ tests,
+ versionedfile,
+ )
+
+
+class Test_MPDiffGenerator(tests.TestCaseWithMemoryTransport):
+ # Should this be a per vf test?
+
+ def make_vf(self):
+ t = self.get_transport('')
+ factory = groupcompress.make_pack_factory(True, True, 1)
+ return factory(t)
+
+ def make_three_vf(self):
+ vf = self.make_vf()
+ vf.add_lines(('one',), (), ['first\n'])
+ vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+ vf.add_lines(('three',), [('one',), ('two',)],
+ ['first\n', 'second\n', 'third\n'])
+ return vf
+
+ def test_finds_parents(self):
+ vf = self.make_three_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('three',)])
+ needed_keys, refcount = gen._find_needed_keys()
+ self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+ sorted(needed_keys))
+ self.assertEqual({('one',): 1, ('two',): 1}, refcount)
+
+ def test_ignores_ghost_parents(self):
+ # If a parent is a ghost, it is just ignored
+ vf = self.make_vf()
+ vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+ gen = versionedfile._MPDiffGenerator(vf, [('two',)])
+ needed_keys, refcount = gen._find_needed_keys()
+ self.assertEqual(sorted([('two',)]), sorted(needed_keys))
+ # It is returned, but we don't really care as we won't extract it
+ self.assertEqual({('one',): 1}, refcount)
+ self.assertEqual([('one',)], sorted(gen.ghost_parents))
+
+ def test_raises_on_ghost_keys(self):
+ # If the requested key is a ghost, then we have a problem
+ vf = self.make_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('one',)])
+ self.assertRaises(errors.RevisionNotPresent,
+ gen._find_needed_keys)
+
+ def test_refcount_multiple_children(self):
+ vf = self.make_three_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+ needed_keys, refcount = gen._find_needed_keys()
+ self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+ sorted(needed_keys))
+ self.assertEqual({('one',): 2, ('two',): 1}, refcount)
+
+ def test_process_contents(self):
+ vf = self.make_three_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+ gen._find_needed_keys()
+ self.assertEqual({('two',): (('one',),),
+ ('three',): (('one',), ('two',))},
+ gen.parent_map)
+ self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+ self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+ sorted(gen.needed_keys))
+ stream = vf.get_record_stream(gen.needed_keys, 'topological', True)
+ record = stream.next()
+ self.assertEqual(('one',), record.key)
+ # one is not needed in the output, but it is needed by children. As
+ # such, it should end up in the various caches
+ gen._process_one_record(record)
+ # The chunks should be cached, the refcount untouched
+ self.assertEqual([('one',)], gen.chunks.keys())
+ self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+ self.assertEqual([], gen.diffs.keys())
+ # Next we get 'two', which is something we output, but also needed for
+ # three
+ record = stream.next()
+ self.assertEqual(('two',), record.key)
+ gen._process_one_record(record)
+ # Both are now cached, and the diff for two has been extracted, and
+ # one's refcount has been updated. two has been removed from the
+ # parent_map
+ self.assertEqual(sorted([('one',), ('two',)]),
+ sorted(gen.chunks.keys()))
+ self.assertEqual({('one',): 1, ('two',): 1}, gen.refcounts)
+ self.assertEqual([('two',)], gen.diffs.keys())
+ self.assertEqual({('three',): (('one',), ('two',))},
+ gen.parent_map)
+ # Finally 'three', which allows us to remove all parents from the
+ # caches
+ record = stream.next()
+ self.assertEqual(('three',), record.key)
+ gen._process_one_record(record)
+ # Both are now cached, and the diff for two has been extracted, and
+ # one's refcount has been updated
+ self.assertEqual([], gen.chunks.keys())
+ self.assertEqual({}, gen.refcounts)
+ self.assertEqual(sorted([('two',), ('three',)]),
+ sorted(gen.diffs.keys()))
+
+ def test_compute_diffs(self):
+ vf = self.make_three_vf()
+ # The content is in the order requested, even if it isn't topological
+ gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',),
+ ('one',)])
+ diffs = gen.compute_diffs()
+ expected_diffs = [
+ multiparent.MultiParent([multiparent.ParentText(0, 0, 0, 1),
+ multiparent.NewText(['second\n'])]),
+ multiparent.MultiParent([multiparent.ParentText(1, 0, 0, 2),
+ multiparent.NewText(['third\n'])]),
+ multiparent.MultiParent([multiparent.NewText(['first\n'])]),
+ ]
+ self.assertEqual(expected_diffs, diffs)
=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py 2010-07-30 20:44:45 +0000
+++ b/bzrlib/versionedfile.py 2010-08-10 18:07:54 +0000
@@ -206,6 +206,134 @@
yield record
+class _MPDiffGenerator(object):
+ """Pull out the functionality for generating mp_diffs."""
+
+ def __init__(self, vf, keys):
+ self.vf = vf
+ # This is the order the keys were requested in
+ self.ordered_keys = tuple(keys)
+ # keys + their parents, what we need to compute the diffs
+ self.needed_keys = ()
+ # Map from key: mp_diff
+ self.diffs = {}
+ # Map from key: parents_needed (may have ghosts)
+ self.parent_map = {}
+ # Parents that aren't present
+ self.ghost_parents = ()
+ # Map from parent_key => number of children for this text
+ self.refcounts = {}
+ # Content chunks that are cached while we still need them
+ self.chunks = {}
+
+ def _find_needed_keys(self):
+ """Find the set of keys we need to request.
+
+ This includes all the original keys passed in, and the non-ghost
+ parents of those keys.
+
+ :return: (needed_keys, refcounts)
+ needed_keys is the set of all texts we need to extract
+ refcounts is a dict of {key: num_children} letting us know when we
+ no longer need to cache a given parent text
+ """
+ # All the keys and their parents
+ needed_keys = set(self.ordered_keys)
+ parent_map = self.vf.get_parent_map(needed_keys)
+ self.parent_map = parent_map
+ # TODO: Should we be using a different construct here? I think this
+ # uses difference_update internally, and we expect the result to
+ # be tiny
+ missing_keys = needed_keys.difference(parent_map)
+ if missing_keys:
+ raise errors.RevisionNotPresent(list(missing_keys)[0], self.vf)
+ # Parents that might be missing. They are allowed to be ghosts, but we
+ # should check for them
+ refcounts = {}
+ setdefault = refcounts.setdefault
+ maybe_ghosts = set()
+ for child_key, parent_keys in parent_map.iteritems():
+ if not parent_keys:
+ # Ghost? We should never get here
+ continue
+ maybe_ghosts.update(p for p in parent_keys if p not in needed_keys)
+ needed_keys.update(parent_keys)
+ for p in parent_keys:
+ refcounts[p] = setdefault(p, 0) + 1
+ # Remove any parents that are actually ghosts
+ self.ghost_parents = maybe_ghosts.difference(
+ self.vf.get_parent_map(maybe_ghosts))
+ needed_keys.difference_update(self.ghost_parents)
+ self.needed_keys = needed_keys
+ self.refcounts = refcounts
+ return needed_keys, refcounts
+
+ def _compute_diff(self, key, parent_lines, lines):
+ """Compute a single mp_diff, and store it in self._diffs"""
+ if len(parent_lines) > 0:
+ # XXX: _extract_blocks is not usefully defined anywhere...
+ # It was meant to extract the left-parent diff without
+ # having to recompute it for Knit content (pack-0.92,
+ # etc). That seems to have regressed somewhere
+ left_parent_blocks = self.vf._extract_blocks(key,
+ parent_lines[0], lines)
+ else:
+ left_parent_blocks = None
+ diff = multiparent.MultiParent.from_lines(lines,
+ parent_lines, left_parent_blocks)
+ self.diffs[key] = diff
+
+ def _process_one_record(self, record):
+ this_chunks = record.get_bytes_as('chunked')
+ if record.key in self.parent_map:
+ # This record should be ready to diff, since we requested
+ # content in 'topological' order
+ parent_keys = self.parent_map.pop(record.key)
+ parent_lines = []
+ for p in parent_keys:
+ # Alternatively we could check p not in self.needed_keys, but
+ # ghost_parents should be tiny versus huge
+ if p in self.ghost_parents:
+ continue
+ refcount = self.refcounts[p]
+ if refcount == 1: # Last child reference
+ self.refcounts.pop(p)
+ parent_chunks = self.chunks.pop(p)
+ else:
+ self.refcounts[p] = refcount - 1
+ parent_chunks = self.chunks[p]
+ # TODO: Should we cache the line form? We did the
+ # computation to get it, but storing it this way will
+ # be less memory efficient...
+ parent_lines.append(osutils.chunks_to_lines(parent_chunks))
+ lines = osutils.chunks_to_lines(this_chunks)
+ # TODO: Should we be caching lines instead of chunks?
+ # Higher-memory, but avoids double extracting.
+ # If we have good topological sorting, we shouldn't have
+ # much pending stuff cached...
+ ## this_chunks = lines
+ self._compute_diff(record.key, parent_lines, lines)
+ # Is this content required for any more children?
+ if record.key in self.refcounts:
+ self.chunks[record.key] = this_chunks
+
+ def _extract_diffs(self):
+ needed_keys, refcounts = self._find_needed_keys()
+ for record in self.vf.get_record_stream(needed_keys,
+ 'topological', True):
+ if record.storage_kind == 'absent':
+ raise errors.RevisionNotPresent(record.key, self.vf)
+ self._process_one_record(record)
+ # At this point, we should have *no* remaining content
+ assert not self.parent_map
+ assert set(self.refcounts) == self.ghost_parents
+
+ def compute_diffs(self):
+ self._extract_diffs()
+ dpop = self.diffs.pop
+ return [dpop(k) for k in self.ordered_keys]
+
+
class VersionedFile(object):
"""Versioned text file storage.
@@ -1077,6 +1205,10 @@
# between the creation and the application of the mpdiff.
parent_lines = [lines[p] for p in parents if p in knit_keys]
if len(parent_lines) > 0:
+ # XXX: _extract_blocks is not usefully defined anywhere... It
+ # was meant to extract the left-parent diff without
+ # having to recompute it for Knit content (pack-0.92,
+ # etc)
left_parent_blocks = self._extract_blocks(key, parent_lines[0],
target)
else:
More information about the bazaar-commits
mailing list