[rfc] [patch] speeding up bzr log with a tree delta iterator

John A Meinel john at arbash-meinel.com
Wed Jan 18 13:48:54 GMT 2006


Denys Duchier wrote:
> 1. I moved Branch.revision_delta_iter to (delta.py) compare_trees_iter
> 2. compare_trees_iter accepts either a revno or a revid
> 3. I removed obsolete deltas_for_log* and Branch.get_revision_delta
> 4. I added test_delta_iter
> 
> the iterator is written in a way that it is ready to yield more information if
> so desired (such as revnos and revids).
> 
> new mods in revnos=1522,1523 of my bzr.fix.log branch:
> 
>              http://delta.univ-orleans.fr/~duchier/bzr/bzr.fix.log
> 
> Here is the compound patch against bzr.dev:
> 
> 
> 
> ------------------------------------------------------------------------
> 
> === modified file 'bzrlib/branch.py'
> --- bzrlib/branch.py	
> +++ bzrlib/branch.py	
> @@ -259,27 +259,6 @@
>      def get_revision(self, revision_id):
>          """Return the Revision object for a named revision"""
>          raise NotImplementedError('get_revision is abstract')

Does this need to be deprecated, rather than just removed? As far as I
can tell, log.py was the only user, and we haven't yet entered stable
mode, so I'm fine with removing it.
But it isn't very hard just to have '@deprecated(zero_seven)'

> -
> -    def get_revision_delta(self, revno):
> -        """Return the delta for one revision.
> -
> -        The delta is relative to its mainline predecessor, or the
> -        empty tree for revision 1.
> -        """
> -        assert isinstance(revno, int)
> -        rh = self.revision_history()
> -        if not (1 <= revno <= len(rh)):
> -            raise InvalidRevisionNumber(revno)
> -
> -        # revno is 1-based; list is 0-based
> -
> -        new_tree = self.revision_tree(rh[revno-1])
> -        if revno == 1:
> -            old_tree = EmptyTree()
> -        else:
> -            old_tree = self.revision_tree(rh[revno-2])
> -
> -        return compare_trees(old_tree, new_tree)
>  
>      def get_revision_sha1(self, revision_id):
>          """Hash the stored value of a revision, and return it."""
> 
> === modified file 'bzrlib/delta.py'
> --- bzrlib/delta.py	
> +++ bzrlib/delta.py	
> @@ -16,6 +16,8 @@
>  
>  from bzrlib.inventory import InventoryEntry
>  from bzrlib.trace import mutter
> +from bzrlib.errors import InvalidRevisionNumber, InvalidRevisionId
> +from bzrlib.tree import EmptyTree
>  
>  class TreeDelta(object):
>      """Describes changes from one tree to another.
> @@ -257,3 +259,76 @@
>      delta.unchanged.sort()
>  
>      return delta
> +
> +
> +def compare_trees_iter(branch, revno_or_revid, reverse=False):
> +    """returns an iterator over successive tree deltas.
> +
> +    branch
> +        the branch object over whose revision history we must iterate
> +
> +    revno_or_revid
> +        where to start the iteration
> +
> +    reverse
> +        indicates whether to iterate backward, i.e. toward the origin;
> +        default is to iterate forward
> +
> +    Each delta is always with respect to the preceding revision. So if you
> +    start at revno=N, the first delta will be between revisions N-1 and N.
> +    """
> +    rh = branch.revision_history()
> +    rh_len = len(rh)
> +    if isinstance(revno_or_revid, int):
> +        revno = revno_or_revid
> +        if not (1 <= revno <= rh_len):
> +            raise InvalidRevisionNumber(revno)
> +    else:
> +        try:
> +            revno = rh.index(revno_or_revid) + 1
> +        except ValueError:
> +            raise InvalidRevisionId(revno_or_revid)
> +    if reverse:
> +        return _compare_trees_iter_backward(branch, revno, rh, rh_len)
> +    else:
> +        return _compare_trees_iter_forward( branch, revno, rh, rh_len)
> +

Since this function *requires* revision history, and will only iterate
through entries that exist on it. I'm okay with it just taking a revno.
I think I would prefer that to using isinstance().
The original goal was to try and get away from restricting ourselves to
the revision-history. (It would be nice to get the log for an arbitrary
revision using the 'bzr log -r revid:...' syntax.
But that doesn't have to happen now.

> +
> +def _compare_trees_iter_forward(branch, revno, rh, rh_len):
> +    if revno == 1:
> +        lo_revno = 0
> +        lo_revid = None
> +        lo_tree  = EmptyTree()
> +    else:
> +        lo_revno = revno-1
> +        lo_revid = rh[lo_revno-1]
> +        lo_tree  = branch.revision_tree(lo_revid)
> +    while revno <= rh_len:
> +        hi_revno = revno
> +        hi_revid = rh[revno-1]
> +        hi_tree  = branch.revision_tree(hi_revid)
> +        yield compare_trees(lo_tree, hi_tree)
> +        lo_revno = hi_revno
> +        lo_revid = hi_revid
> +        lo_tree  = hi_tree
> +        revno += 1
> +
> +
> +def _compare_trees_iter_backward(branch, revno, rh, rh_len):
> +    hi_revno = revno
> +    hi_revid = rh[revno-1]
> +    hi_tree  = branch.revision_tree(hi_revid)
> +    while 1 <= revno:
> +        if revno == 1:
> +            lo_revno = 0
> +            lo_revid = None
> +            lo_tree  = EmptyTree()
> +        else:
> +            lo_revno = revno-1
> +            lo_revid = rh[lo_revno-1]
> +            lo_tree  = branch.revision_tree(lo_revid)
> +        yield compare_trees(lo_tree, hi_tree)
> +        hi_tree  = lo_tree
> +        hi_revno = lo_revno
> +        hi_revid = lo_revid
> +        revno -= 1
> 
> === modified file 'bzrlib/log.py'
> --- bzrlib/log.py	
> +++ bzrlib/log.py	
> @@ -54,7 +54,7 @@
>  
>  import bzrlib.errors as errors
>  from bzrlib.tree import EmptyTree
> -from bzrlib.delta import compare_trees
> +from bzrlib.delta import compare_trees_iter
>  from bzrlib.trace import mutter
>  import re
>  
> @@ -113,17 +113,6 @@
>      return rh
>  
>  
> -def _get_revision_delta(branch, revno):
> -    """Return the delta for a mainline revision.
> -    
> -    This is used to show summaries in verbose logs, and also for finding 
> -    revisions which touch a given file."""
> -    # XXX: What are we supposed to do when showing a summary for something 
> -    # other than a mainline revision.  The delta to it's first parent, or
> -    # (more useful) the delta to a nominated other revision.
> -    return branch.get_revision_delta(revno)
> -
> -
>  def show_log(branch,
>               lf,
>               specific_fileid=None,
> @@ -211,9 +200,13 @@
>          raise ValueError('invalid direction %r' % direction)
>  
>      revision_history = branch.revision_history()
> +    delta_iter = None
>      for revno, rev_id in cut_revs:
>          if verbose or specific_fileid:
> -            delta = _get_revision_delta(branch, revno)
> +            if delta_iter is None:
> +                delta_iter = compare_trees_iter(
> +                    branch, revno, reverse=(direction=='reverse'))
> +            delta = delta_iter.next()
>              

The reason I wanted to return 'revno, rev_id' from compare_trees_iter,
was because you are not guaranteed that cut_revs matches perfectly with
what compare_trees_iter is iterating over. They *should* be the same,
but it seems like you should be passing in the revisions to iterate
over, rather than expecting it to line up with you.

What about changing the interface to that?

compare_trees_iter(branch, revisions)

You don't even need a direction, because you can just reverse the list
before hand.
This also allows us to handle the case where we want some
non-revision-history path.

>          if specific_fileid:
>              if not delta.touches_file_id(specific_fileid):
> @@ -249,88 +242,6 @@
>                      continue
>                  pending.extend(rev.parent_ids)
>                  lf.show_merge(rev)
> -
> -
> -def deltas_for_log_dummy(branch, which_revs):
> -    """Return all the revisions without intermediate deltas.
> -
> -    Useful for log commands that won't need the delta information.
> -    """
> -    
> -    for revno, revision_id in which_revs:
> -        yield revno, branch.get_revision(revision_id), None
> -
> -
> -def deltas_for_log_reverse(branch, which_revs):
> -    """Compute deltas for display in latest-to-earliest order.
> -
> -    branch
> -        Branch to traverse
> -
> -    which_revs
> -        Sequence of (revno, revision_id) for the subset of history to examine
> -
> -    returns 
> -        Sequence of (revno, rev, delta)
> -
> -    The delta is from the given revision to the next one in the
> -    sequence, which makes sense if the log is being displayed from
> -    newest to oldest.
> -    """
> -    last_revno = last_revision_id = last_tree = None
> -    for revno, revision_id in which_revs:
> -        this_tree = branch.revision_tree(revision_id)
> -        this_revision = branch.get_revision(revision_id)
> -        
> -        if last_revno:
> -            yield last_revno, last_revision, compare_trees(this_tree, last_tree, False)
> -
> -        this_tree = EmptyTree(branch.get_root_id())
> -
> -        last_revno = revno
> -        last_revision = this_revision
> -        last_tree = this_tree
> -
> -    if last_revno:
> -        if last_revno == 1:
> -            this_tree = EmptyTree(branch.get_root_id())
> -        else:
> -            this_revno = last_revno - 1
> -            this_revision_id = branch.revision_history()[this_revno]
> -            this_tree = branch.revision_tree(this_revision_id)
> -        yield last_revno, last_revision, compare_trees(this_tree, last_tree, False)
> -
> -
> -def deltas_for_log_forward(branch, which_revs):
> -    """Compute deltas for display in forward log.
> -
> -    Given a sequence of (revno, revision_id) pairs, return
> -    (revno, rev, delta).
> -
> -    The delta is from the given revision to the next one in the
> -    sequence, which makes sense if the log is being displayed from
> -    newest to oldest.
> -    """
> -    last_revno = last_revision_id = last_tree = None
> -    prev_tree = EmptyTree(branch.get_root_id())
> -
> -    for revno, revision_id in which_revs:
> -        this_tree = branch.revision_tree(revision_id)
> -        this_revision = branch.get_revision(revision_id)
> -
> -        if not last_revno:
> -            if revno == 1:
> -                last_tree = EmptyTree(branch.get_root_id())
> -            else:
> -                last_revno = revno - 1
> -                last_revision_id = branch.revision_history()[last_revno]
> -                last_tree = branch.revision_tree(last_revision_id)
> -
> -        yield revno, this_revision, compare_trees(last_tree, this_tree, False)
> -
> -        last_revno = revno
> -        last_revision = this_revision
> -        last_tree = this_tree
>  

Again, do we delete, or do we deprecate. Since these haven't been used
for many months, I think we can just delete them.

>  
>  class LogFormatter(object):
> 
> === modified file 'bzrlib/tests/test_log.py'
> --- bzrlib/tests/test_log.py	
> +++ bzrlib/tests/test_log.py	
> @@ -273,3 +273,222 @@
>  added:
>    a
>  ''')
> +
> +    def test_delta_iter(self):
> +        b = Branch.initialize(u'.')
> +        b.nick='delta.iter'
> +        wt = b.working_tree()
> +        open('A', 'wb').write('A')
> +        wt.add('A', 'A')
> +        wt.commit('committing A'
> +                  , rev_id='a1'
> +                  , timestamp=1132586655.459960938
> +                  , timezone=-6*3600
> +                  , committer='Joe Foo <joe at foo.com>')
> +        open('B', 'wb').write('B')
> +        wt.add('B', 'B')
> +        wt.commit('committing B'
> +                  , rev_id='a2'
> +                  , timestamp=1132586842.411175966
> +                  , timezone=-6*3600
> +                  , committer='Joe Foo <joe at foo.com>')
> +        open('C', 'wb').write('C')
> +        wt.add('C', 'C')
> +        wt.commit('committing C'
> +                  , rev_id='a3'
> +                  , timestamp=1132587176.835228920
> +                  , timezone=-6*3600
> +                  , committer='Joe Foo <joe at foo.com>')
> +        open('A', 'wb').write('AA')
> +        wt.commit('modifying A'
> +                  , rev_id='a4'
> +                  , timestamp=1132587200.835228920
> +                  , timezone=-6*3600
> +                  , committer='Joe Foo <joe at foo.com>')
> +        
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf)
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 4
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:33:20 -0600
> +message:
> +  modifying A
> +------------------------------------------------------------
> +revno: 3
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:32:56 -0600
> +message:
> +  committing C
> +------------------------------------------------------------
> +revno: 2
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:27:22 -0600
> +message:
> +  committing B
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, direction="forward")
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +------------------------------------------------------------
> +revno: 2
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:27:22 -0600
> +message:
> +  committing B
> +------------------------------------------------------------
> +revno: 3
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:32:56 -0600
> +message:
> +  committing C
> +------------------------------------------------------------
> +revno: 4
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:33:20 -0600
> +message:
> +  modifying A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A')
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 4
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:33:20 -0600
> +message:
> +  modifying A
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', direction="forward")
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +------------------------------------------------------------
> +revno: 4
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:33:20 -0600
> +message:
> +  modifying A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', start_revision=1, end_revision=4)
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 4
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:33:20 -0600
> +message:
> +  modifying A
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', start_revision=1, end_revision=4, direction="forward")
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +------------------------------------------------------------
> +revno: 4
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:33:20 -0600
> +message:
> +  modifying A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', start_revision=1, end_revision=3)
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', start_revision=1, end_revision=3, direction="forward")
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 1
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:24:15 -0600
> +message:
> +  committing A
> +""")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', start_revision=2, end_revision=3)
> +        self.assertEquals(sio.getvalue(), "")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='A', start_revision=2, end_revision=3, direction="forward")
> +        self.assertEquals(sio.getvalue(), "")
> +        sio = StringIO()
> +        lf = LongLogFormatter(to_file=sio)
> +        show_log(b, lf, specific_fileid='B')
> +        self.assertEquals(sio.getvalue(), """\
> +------------------------------------------------------------
> +revno: 2
> +committer: Joe Foo <joe at foo.com>
> +branch nick: delta.iter
> +timestamp: Mon 2005-11-21 09:27:22 -0600
> +message:
> +  committing B
> +""")
> 
> 
> 
> ------------------------------------------------------------------------
> 
> 
> --Denys

The rest looks good to me.

John
=:->

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 249 bytes
Desc: OpenPGP digital signature
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20060118/965a0e29/attachment.pgp 


More information about the bazaar mailing list