[MERGE] log performance

Fri Jun 23 05:48:56 BST 2006

On 22 Jun 2006, Aaron Bentley <aaron.bentley at utoronto.ca> wrote:
> Hi all,
> 
> This patch optimizes performance of log formatters that don't show
> merges (i.e. --short and --line).
> 
> 1. It adds a new codepath for selecting the revisions to display
> 2. It does not retrieve revisions that will not be displayed
> 
> In my tests (which have a 1:1 ratio of merges to mainline), it goes from
> 190 ms to 109ms, that is 1.74x as fast.
> 
> Aaron

well done :)

> === modified file bzrlib/benchmarks/__init__.py // last-changed:aaron.bentley at u
> ... toronto.ca-20060623021716-c9b81f6432adf745
> --- bzrlib/benchmarks/__init__.py
> +++ bzrlib/benchmarks/__init__.py
> @@ -74,6 +74,37 @@
>                  tree.unlock()
>          return tree
>  
> +    def make_heavily_merged_tree(self, directory_name='.'):
> +        """Create a tree with an egregious number of commits.
> +        
> +        No files change are included.
> +        """

"egregious" means "outstandingly bad" but I don't think it's really
right here - having 250 cross merges is just large, not egregious.

It might (?) be good to add another sentence about what the structure of
merges are, in case anyone cares: two branches, one repeatedly merges
single commits from the other.

> +        tree = BzrDir.create_standalone_workingtree(directory_name)
> +        tree.lock_write()
> +        tree.branch.lock_write()
> +        tree.branch.repository.lock_write()
> +        tree2 = tree.bzrdir.sprout('tree2').open_workingtree()
> +        tree2.lock_write()
> +        try:
> +            for i in xrange(250):
> +                revision_id = tree.commit('no-changes commit %d-a' % i)
> +                tree2.branch.fetch(tree.branch, revision_id)
> +                tree2.set_pending_merges([revision_id])
> +                revision_id = tree2.commit('no-changes commit %d-b' % i)
> +                tree.branch.fetch(tree2.branch, revision_id)
> +                tree.set_pending_merges([revision_id])
> +        finally:
> +            try:
> +                try:
> +                    tree.branch.repository.unlock()
> +                finally:
> +                    tree.branch.unlock()
> +            finally:
> +                tree.unlock()
> +                tree2.unlock()
> +        tree.set_pending_merges([])
> +        return tree

These won't unwind properly in some cases; if you're going to go to the
trouble of cleaning up (which is good) you might as well do it properly,
which means

  tree.lock_write()
  try:
    tree.branch.lock_write()
    finally:
      tree.branch.unlock()
  finally:
    tree.unlock()

etc, with each acquisition/try/finally nested at its own level.  For
example this will unwind properly in a failure to sprout.  I suggest
this for a few reasons: firstly it sets a good example for others looking
at the code; secondly it makes it easier to read (without thinking "now
why are there all those things inside the finally block); thirdly it's
just more correct.

> +
>  
>  def test_suite():
>      """Build and return a TestSuite which contains benchmark tests only."""
> 
> === modified file bzrlib/benchmarks/bench_log.py // last-changed:aaron.bentley@
> ... utoronto.ca-20060623021716-c9b81f6432adf745
> --- bzrlib/benchmarks/bench_log.py
> +++ bzrlib/benchmarks/bench_log.py
> @@ -47,6 +47,12 @@
>          lf = log_formatter('long', to_file=StringIO())
>          self.time(show_log, tree.branch, lf, direction='reverse')
>  
> +    def test_merge_log(self):
> +        """Run log in a tree with many merges"""
> +        tree = self.make_heavily_merged_tree()
> +        lf = log_formatter('short', to_file=StringIO())
> +        self.time(show_log, tree.branch, lf, direction='reverse')
> +
>      def test_log_screenful(self):
>          """Simulate log --long|less"""
>          self.screenful_tester('long')
> 
> === modified file bzrlib/log.py
> --- bzrlib/log.py
> +++ bzrlib/log.py
> @@ -206,35 +206,25 @@
>      cut_revs = which_revs[(start_revision-1):(end_revision)]
>      if not cut_revs:
>          return
> +
> +    # convert the revision history to a dictionary:
> +    rev_nos = dict([(k, v) for v, k in cut_revs])
> +

You can actually just say rev_nos = dict(cut_revs) if cut_revs is a
sequence of pairs, and that will avoid building a list.

>      # override the mainline to look like the revision history.
>      mainline_revs = [revision_id for index, revision_id in cut_revs]
>      if cut_revs[0][0] == 1:
>          mainline_revs.insert(0, None)
>      else:
>          mainline_revs.insert(0, which_revs[start_revision-2][1])
> -
> -    merge_sorted_revisions = merge_sort(
> -        branch.repository.get_revision_graph(mainline_revs[-1]),
> -        mainline_revs[-1],
> -        mainline_revs)
> -
> -    if direction == 'reverse':
> -        cut_revs.reverse()
> -    elif direction == 'forward':
> -        # forward means oldest first.
> -        merge_sorted_revisions.reverse()
> +    if getattr(lf, 'show_merge', False) is False:
> +        no_merges = True 
>      else:
> -        raise ValueError('invalid direction %r' % direction)
> -
> -    revision_history = branch.revision_history()
> -
> -    # convert the revision history to a dictionary:
> -    rev_nos = {}
> -    for index, rev_id in cut_revs:
> -        rev_nos[rev_id] = index
> +        no_merges = False
> +    view_revisions = list(get_view_revisions(mainline_revs, rev_nos, branch,
> +                          direction, no_merges=no_merges))
>  
>      def iter_revisions():
> -        revision_ids = [r for s, r, m, e in merge_sorted_revisions]
> +        revision_ids = [r for r, n, d in view_revisions]
>          num = 9
>          while revision_ids:
>              revisions = branch.repository.get_revisions(revision_ids[:num])
> @@ -247,8 +237,8 @@
>          for revision in revisions:
>              yield revision
>      # now we just print all the revisions
> -    for ((sequence, rev_id, merge_depth, end_of_merge), rev) in \
> -        izip(merge_sorted_revisions, iter_revisions()):
> +    for ((rev_id, revno, merge_depth), rev) in \
> +         izip(view_revisions, iter_revisions()):
>  
>          if searchRE:
>              if not searchRE.search(rev.message):
> @@ -267,11 +257,41 @@
>                  # although we calculated it, throw it away without display
>                  delta = None
>  
> -            lf.show(rev_nos[rev_id], rev, delta)
> -        elif hasattr(lf, 'show_merge'):
> +            lf.show(revno, rev, delta)
> +        else:
>              lf.show_merge(rev, merge_depth)

There's some nice reductions here.  I'm not 100% comfortable with the
hasattr -- I generally like to reserve it for cases where you're really
doing "dynamic" lookups, though it's fuzzy in Python and I won't veto
it.  But is it the presence of the attribute or its value that matters?
If the second, as I'd expect, shouldn't the second call be getattr too?

> +def get_view_revisions(mainline_revs, rev_nos, branch, direction,
> +                       no_merges=False):
> +    """Produce an iterator of revisions to show
> +    :return: an iterator of (revision_id, revno, merge_depth)
> +    (if there is no revno for a revision, None is supplied)
> +    """
> +    if no_merges is True:
> +        revision_ids = mainline_revs[1:]
> +        if direction == 'reverse':
> +            revision_ids.reverse()
> +        for revision_id in revision_ids:
> +            yield revision_id, rev_nos[revision_id], 0
> +        return
> +    merge_sorted_revisions = merge_sort(
> +        branch.repository.get_revision_graph(mainline_revs[-1]),
> +        mainline_revs[-1],
> +        mainline_revs)
> +
> +    if direction == 'forward':
> +        # forward means oldest first.
> +        merge_sorted_revisions.reverse()
> +    elif direction != 'reverse':
> +        raise ValueError('invalid direction %r' % direction)
> +
> +    revision_history = branch.revision_history()
> +
> +    for sequence, rev_id, merge_depth, end_of_merge in merge_sorted_revisions:
> +        yield rev_id, rev_nos.get(rev_id), merge_depth
> +
> +
>  def deltas_for_log_dummy(branch, which_revs):
>      """Return all the revisions without intermediate deltas.

Rather than an inverted-sense boolean why not make it just
include_merges=True?

> === modified file bzrlib/tests/test_log.py
> @@ -301,3 +305,38 @@
>          logfile.seek(0)
>          log_contents = logfile.read()
>          self.assertEqualDiff(log_contents, '1: Line-Log-Formatte... 2005-11-23 add a\n')
> +
> +    def test_get_view_revisions(self):
> +        """Test the get_view_revisions method"""
> +        wt = self.make_branch_and_tree('tree1')
> +        wt.commit('commit one', rev_id='1')
> +        wt.commit('commit two', rev_id='2')
> +        wt.commit('commit three', rev_id='3')
> +        mainline_revs = [None, '1', '2', '3']
> +        rev_nos = {'1': 1, '2': 2, '3': 3, '4b': 4}
> +        revisions = list(get_view_revisions(mainline_revs, rev_nos, wt.branch,
> +                                            'forward'))
> +        self.assertEqual(revisions, [('1', 1, 0), ('2', 2, 0), ('3', 3, 0)])
> +        revisions2 = list(get_view_revisions(mainline_revs, rev_nos, wt.branch,
> +                                             'forward', no_merges=True))
> +        self.assertEqual(revisions, revisions2)
> +        revisions = list(get_view_revisions(mainline_revs, rev_nos, wt.branch,
> +                                            'reverse'))
> +        self.assertEqual(revisions, [('3', 3, 0), ('2', 2, 0), ('1', 1, 0), ])
> +        revisions2 = list(get_view_revisions(mainline_revs, rev_nos, wt.branch,
> +                                             'reverse', no_merges=True))
> +        self.assertEqual(revisions, revisions2)
> +        tree2 = wt.bzrdir.sprout('tree2').open_workingtree()
> +        tree2.commit('four-a', rev_id='4a')
> +        wt.branch.fetch(tree2.branch, '4a')
> +        wt.add_pending_merge('4a')
> +        wt.commit('four-b', rev_id='4b')
> +        mainline_revs.append('4b')
> +        revisions = list(get_view_revisions(mainline_revs, rev_nos, wt.branch,
> +                                            'forward'))
> +        self.assertEqual(revisions, [('1', 1, 0), ('2', 2, 0), ('3', 3, 0),
> +                                     ('4a', None, 1), ('4b', 4, 0)])
> +        revisions = list(get_view_revisions(mainline_revs, rev_nos, wt.branch,
> +                                             'forward', no_merges=True))
> +        self.assertEqual(revisions, [('1', 1, 0), ('2', 2, 0), ('3', 3, 0),
> +                                     ('4b', 4, 0)])

This test is a bit long, therefore harder to read and see just what it's
testing.  Could you perhaps split out the setup of building the
histories and have each assertion within a separate test?

+1 to merge with those changes.

Thanks
-- 
Martin