[MERGE] Common dominator ancestor picker

Sat Feb 18 04:30:38 GMT 2006

Aaron Bentley wrote:
> Hi all,
> 
> I've implemented an ancestor picker based on the closest common
> dominator.  This speeds up find-merge-base by a factor of 10 on the bzr
> source tree, and is likely to do even better as the bzr ancestry gets
> bigger.
> 
> The old algorithm scaled linearly with the number of nodes in the
> ancestry.  This algorithm scales linearly with the number of nodes
> between each revision and their closest common dominator.  I invented it
> myself, so let me know if there's anything wrong with the theory.
> 
> Not only is this algorithm faster, but it's also more correct, as it
> will handle criss-cross merges better.
> 
> Monotone's is slightly fancier; they find the common merge that is a
> dominator of one.  I don't think the difference will be significant, and
> we can always implement it later.
> 
> Could someone give me a review?  If approved, I'll stick it in my
> integration.
> 
> Aaron

Hopefully this will be a more complete review...

------------------------------------------------------------------------

=== modified file 'bzrlib/revision.py'

--- bzrlib/revision.py	
+++ bzrlib/revision.py	
@@ -279,20 +279,6 @@
     return root, ancestors, descendants, common


-def common_ancestor(revision_a, revision_b, revision_source):
-    try:
-        root, ancestors, descendants, common = \
-            combined_graph(revision_a, revision_b, revision_source)
-    except bzrlib.errors.NoCommonRoot:
-        raise bzrlib.errors.NoCommonAncestor(revision_a, revision_b)
-
-    distances = node_distances (descendants, ancestors, root)
-    farthest = select_farthest(distances, common)
-    if farthest is None or farthest == NULL_REVISION:
-        raise bzrlib.errors.NoCommonAncestor(revision_a, revision_b)
-    return farthest
-
-
 class MultipleRevisionSources(object):
     """Proxy that looks in multiple branches for revisions."""
     def __init__(self, *args):
@@ -350,3 +336,94 @@
         next = best_ancestor(next)
     path.reverse()
     return path
+
+
+def dominators(revision_id, revision_source):
+    """Return the dominators of a revision.
+    This is based on the observation that in a DAG, the dominators of x
+    are x + common_dominators(parents(x)).  This applies recursively,
of course.
+    """
+    while True:
+        yield revision_id
+        if revision_id is NULL_REVISION:
+            return
+        try:
+            revision = revision_source.get_revision(revision_id)
+        except NoSuchRevision:
+            return
+
+        if len(revision.parent_ids) == 0:
+            revision_id = NULL_REVISION
+        elif len(revision.parent_ids) == 1:
+            (revision_id,) = revision.parent_ids
+        else:
+            revision_id = common_dominator(*revision.parent_ids)
+

So, as mentioned you need to pass the revision source here. Also, all
the common_dominator code takes the revision source as the first
argument, to be consistent, it might be good to have 'dominator' do the
same.

+
+def common_dominator(revision_source, *revision_ids):
+    """Find the common dominator of a number of revision ids.
+    This will walk a subgraph, and ends early when it finds the common
+    dominator.  It should visit each node no more than twice.
+    """
+    if len(revision_ids) == 1:
+        return revision_ids[0]
+    elif len(revision_ids) == 2:
+        return common_dominator_pair(revision_source, *revision_ids)
+    else:
+        left_revision = common_dominator(revision_source, revision_ids[:2])
+        right_revision = common_dominator(revision_source,
revision_ids[2:])
+        return common_dominator_pair(revision_source, left_revision,
+                                     right_revision)


Here, I think you should do:

left_revision = common_dominator_pair(revision_source,
	revision_ids[0], revision_ids[1])
right_revision = common_dominator(revision_source,
	*revision_ids[2:])
return common_dominator_pair(revision_source,
	left_revision, right_revision)


You need to pass revisions as a list of arguments to common_dominator
for the right revision, and you probably should just call
common_dominator_pair for 'left_revision' since you know you are passing
2 entries, and it will just call _pair for you.


+
+
+def common_dominator_pair(revision_source, left, right):
+    """Returns the common dominator of a pair of revisions, or
NULL_REVISION
+    if they have no common dominator.
+    """
+    seen = set()
+    iter_left = dominators(left, revision_source)
+    iter_right = dominators(right, revision_source)
+    first_left = None
+    first_right = None
+    last_revision_id = None
+    while True:
+        if iter_left is None:
+            if iter_right is None:
+                raise AssertionError("Unreachable code")
+        else:
+            try:
+                left_revision_id = iter_left.next()
+                if first_left is None:
+                    first_left = left_revision_id
+            except StopIteration:
+                # Handle ghosts by backtracking
+                return first_right
+            if left_revision_id is NULL_REVISION:
+                left_iter = None
+            if left_revision_id in seen:
+                return left_revision_id
+            else:
+                seen.add(left_revision_id)
+        if iter_right is not None:
+            try:
+                right_revision_id = iter_right.next()
+            except StopIteration:
+                # Handle ghosts by backtracking
+                return first_left
+            if right_revision_id is NULL_REVISION:
+                iter_right = None
+            if right_revision_id in seen:
+                return right_revision_id
+            else:
+                seen.add(right_revision_id)
+

You probably don't need the else for:

if foo:
  return X
else:
  do something

You really just need:

if foo:
  return X
do something


+
+def common_ancestor(revision_a, revision_b, revision_source):
+    """Find the closest unambiguous common ancestor of two revisions.
+    In the case of criss-cross merges, this algorithm will ignore the
criss-
+    crossed ancestors.
+    """
+    cd = common_dominator(revision_source, revision_a, revision_b)
+    if cd is NULL_REVISION:
+        raise bzrlib.errors.NoCommonAncestor(revision_a, revision_b)
+    return cd

=== modified file 'bzrlib/tests/test_revision.py'
--- bzrlib/tests/test_revision.py	
+++ bzrlib/tests/test_revision.py	
@@ -48,6 +48,23 @@

     so A is missing b6 at the start
     and B is missing a3, a4, a5
+
+    or graphically
+      a0
+      |
+      a1
+      |
+      a2
+      | \
+      |  b3
+      |  |
+      |  b4
+      | /|
+      a3 b5
+      |  |
+      a4 |
+      | X|
+      a5 b6
     """
     tree1 = self.make_branch_and_tree("branch1")
     br1 = tree1.branch
@@ -212,6 +229,14 @@
 class TestCommonAncestor(TestCaseWithTransport):
     """Test checking whether a revision is an ancestor of another
revision"""

+    @staticmethod
+    def assertCommonAncestor(ancestor, rev_a, rev_b, revision_source):
+        from bzrlib.revision import common_ancestor
+        anc = common_ancestor(rev_a, rev_b, revision_source)
+        if anc != ancestor:
+            raise AssertionError("Common ancestor of %s and %s should
be %s, not %s" %
+                                 (rev_a, rev_b, ancestor, anc))
+

Wouldn't it be better to write this as:

def assertCommonAncestor(self, ancestor, rev_a, rev_b, source):
  ...
  if anc != ancestor:
    self.fail('Common ancestor of %s and %s should be %s, not %s'
              % (rev_a, rev_b, ancestor, anc))

I realize it is about the same thing, but I prefer 'self.fail' to raise
AssertionError.

     def test_old_common_ancestor(self):
         """Pick a resonable merge base using the old functionality"""
         from bzrlib.revision import old_common_ancestor as common_ancestor
@@ -278,14 +303,16 @@
                           revisions[1])
         self.assertEqual(common_ancestor(revisions[2], revisions_2[4],
sources),
                           revisions[2])
+        self.assertCommonAncestor(revisions_2[4], revisions[3],
revisions_2[4],
+                                  sources)
         self.assertEqual(common_ancestor(revisions[3], revisions_2[4],
sources),
                           revisions_2[4])
         self.assertEqual(common_ancestor(revisions[4], revisions_2[5],
sources),
                           revisions_2[4])
-        self.assertEqual(common_ancestor(revisions[5], revisions_2[6],
sources),
-                          revisions[4]) # revisions_2[5] is equally valid
-        self.assertEqual(common_ancestor(revisions_2[6], revisions[5],
sources),
-                          revisions[4]) # revisions_2[5] is equally valid
+        self.assertCommonAncestor(revisions_2[4], revisions[5],
revisions_2[6],
+                                  sources)
+        self.assertCommonAncestor(revisions_2[4], revisions_2[6],
revisions[5],
+                                  sources)

     def test_combined(self):
         """combined_graph


You also have some lines which are longer than 80 characters. But the
basic concept looks good.

John
=:->

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 249 bytes
Desc: OpenPGP digital signature
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20060217/3ca8950f/attachment.pgp