Rev 4611: Bring in the optimized tsort code. in http://bazaar.launchpad.net/~jameinel/bzr/1.19-known-graph-sorted

John Arbash Meinel john at arbash-meinel.com
Thu Aug 13 22:30:16 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/1.19-known-graph-sorted

------------------------------------------------------------
revno: 4611 [merge]
revision-id: john at arbash-meinel.com-20090813213007-4wqkou02fo0tegay
parent: john at arbash-meinel.com-20090813205913-fygdc0ycyio81i03
parent: john at arbash-meinel.com-20090805140544-7prpni18c6dnlqnn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.19-known-graph-sorted
timestamp: Thu 2009-08-13 16:30:07 -0500
message:
  Bring in the optimized tsort code.
  
  It turns out we needed a way to detect that there was graph cycles.
  The easiest way to do it, is by noticing that we end up with a node
  that has no gdfo. That means it has a parent that was not reached
  before the child was reached, when walking from nodes with no known parents.
  I'm 90% sure that this is correct, but arguably it should be detected
  when creating the KnownGraph, and not during topo_sort.
modified:
  bzrlib/_known_graph_pyx.pyx    _known_graph_pyx.pyx-20090610194911-yjk73td9hpjilas0-1
  bzrlib/tests/test__known_graph.py test__known_graph.py-20090610185421-vw8vfda2cgnckgb1-2
  bzrlib/tests/test_tsort.py     testtsort.py-20051025073946-27da871c394d5be4
  bzrlib/tsort.py                tsort.py-20051025073946-7808f6aaf7d07208
-------------- next part --------------
=== modified file 'bzrlib/_known_graph_pyx.pyx'
--- a/bzrlib/_known_graph_pyx.pyx	2009-08-13 20:59:13 +0000
+++ b/bzrlib/_known_graph_pyx.pyx	2009-08-13 21:30:07 +0000
@@ -45,12 +45,77 @@
     void Py_INCREF(object)
 
 
-from bzrlib import revision
+from bzrlib import errors, revision
 
 cdef object NULL_REVISION
 NULL_REVISION = revision.NULL_REVISION
 
 
+def topo_sort(graph):
+    cdef Py_ssize_t last_tip, pos
+    cdef PyObject *temp_key, *temp_value
+
+    graph = dict(graph)
+    # this is the stack storing on which the sorted nodes are pushed.
+    node_name_stack = []
+
+    # count the number of children for every node in the graph
+    num_children = dict.fromkeys(graph.iterkeys(), 0)
+    pos = 0
+    while PyDict_Next(graph, &pos, NULL, &temp_value):
+        parents = <object>temp_value
+        for parent in parents: # pretty sure this is a tuple
+            temp_value = PyDict_GetItem(num_children, parent)
+            if temp_value != NULL: # Ignore ghosts
+                n = (<object>temp_value) + 1
+                PyDict_SetItem(num_children, parent, n)
+    # keep track of nodes without children in a separate list
+    tips = []
+    pos = 0
+    while PyDict_Next(num_children, &pos, &temp_key, &temp_value):
+        value = <object>temp_value
+        if value == 0:
+            node_name = <object>temp_key
+            PyList_Append(tips, node_name)
+
+    graph_pop = graph.pop
+    last_tip = len(tips) - 1
+    while last_tip >= 0:
+        # pick a node without a child and add it to the stack.
+        temp_key = PyList_GET_ITEM(tips, last_tip)
+        node_name = <object>temp_key
+        last_tip -= 1
+        PyList_Append(node_name_stack, node_name)
+
+        # the parents of the node lose it as a child; if it was the last
+        # child, add the parent to the list of childless nodes.
+        parents = graph_pop(node_name)
+        for parent in parents:
+            temp_value = PyDict_GetItem(num_children, parent)
+            if temp_value == NULL:
+                # Ghost parent, skip it
+                continue
+            n = (<object>temp_value) - 1
+            PyDict_SetItem(num_children, parent, n)
+            if n == 0:
+                last_tip += 1
+                if PyList_GET_SIZE(tips) > last_tip:
+                    Py_INCREF(parent)
+                    PyList_SetItem(tips, last_tip, parent)
+                else:
+                    PyList_Append(tips, parent)
+
+    # if there are still nodes left in the graph,
+    # that means that there is a cycle
+    if graph:
+        raise errors.GraphCycleError(graph)
+
+    # the nodes where pushed on the stack child first, so this list needs to be
+    # reversed before returning it.
+    node_name_stack.reverse()
+    return node_name_stack
+
+
 cdef class _KnownGraphNode:
     """Represents a single object in the known graph."""
 
@@ -337,14 +402,55 @@
         return heads
 
     def topo_sort(self):
-        cdef _KnownGraphNode node, parent
-        from bzrlib import tsort
-        as_parent_map = {}
-        for node in self._nodes.itervalues():
-            if node.parents is None:
-                continue
-            parent_keys = []
-            for parent in node.parents:
-                parent_keys.append(parent.key)
-            as_parent_map[node.key] = parent_keys
-        return tsort.topo_sort(as_parent_map)
+        """Return the nodes in topological order.
+
+        All parents must occur before all children.
+        """
+        # This is, for the most part, the same iteration order that we used for
+        # _find_gdfo, consider finding a way to remove the duplication
+        # In general, we find the 'tails' (nodes with no parents), and then
+        # walk to the children. For children that have all of their parents
+        # yielded, we queue up the child to be yielded as well.
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode child
+        cdef PyObject *temp
+        cdef Py_ssize_t pos
+        cdef int replace
+        cdef Py_ssize_t last_item
+
+        pending = self._find_tails()
+        if PyList_GET_SIZE(pending) == 0 and len(self._nodes) > 0:
+            raise errors.GraphCycleError(self._nodes)
+
+        topo_order = []
+
+        last_item = PyList_GET_SIZE(pending) - 1
+        while last_item >= 0:
+            # Avoid pop followed by push, instead, peek, and replace
+            # timing shows this is 930ms => 770ms for OOo
+            node = _get_list_node(pending, last_item)
+            last_item = last_item - 1
+            if node.parents is not None:
+                # We don't include ghost parents
+                PyList_Append(topo_order, node.key)
+            for pos from 0 <= pos < PyList_GET_SIZE(node.children):
+                child = _get_list_node(node.children, pos)
+                if child.gdfo == -1:
+                    # We know we have a graph cycle because a node has a parent
+                    # which we couldn't find
+                    raise errors.GraphCycleError(self._nodes)
+                child.seen = child.seen + 1
+                if child.seen == PyTuple_GET_SIZE(child.parents):
+                    # All parents of this child have been yielded, queue this
+                    # one to be yielded as well
+                    last_item = last_item + 1
+                    if last_item < PyList_GET_SIZE(pending):
+                        Py_INCREF(child) # SetItem steals a ref
+                        PyList_SetItem(pending, last_item, child)
+                    else:
+                        PyList_Append(pending, child)
+                    # We have queued this node, we don't need to track it
+                    # anymore
+                    child.seen = 0
+        # We started from the parents, so we don't need to do anymore work
+        return topo_order

=== modified file 'bzrlib/tests/test__known_graph.py'
--- a/bzrlib/tests/test__known_graph.py	2009-08-13 20:59:13 +0000
+++ b/bzrlib/tests/test__known_graph.py	2009-08-13 21:30:07 +0000
@@ -93,6 +93,9 @@
         """
         graph = self.module.KnownGraph(ancestry)
         sort_result = graph.topo_sort()
+        # We should have an entry in sort_result for every entry present in the
+        # graph.
+        self.assertEqual(len(ancestry), len(sort_result))
         node_idx = dict((node, idx) for idx, node in enumerate(sort_result))
         for node in sort_result:
             parents = ancestry[node]
@@ -268,6 +271,15 @@
                                     2: [0]})
         self.assertRaises(errors.GraphCycleError, g.topo_sort)
 
+    def test_topo_sort_cycle_with_tail(self):
+        """TopoSort traps graph with longer cycle"""
+        g = self.module.KnownGraph({0: [1],
+                                    1: [2],
+                                    2: [3, 4],
+                                    3: [0],
+                                    4: []})
+        self.assertRaises(errors.GraphCycleError, g.topo_sort)
+
     def test_topo_sort_1(self):
         """TopoSort simple nontrivial graph"""
         self.assertTopoSortOrder({0: [3],

=== modified file 'bzrlib/tests/test_tsort.py'
--- a/bzrlib/tests/test_tsort.py	2009-08-13 20:59:13 +0000
+++ b/bzrlib/tests/test_tsort.py	2009-08-13 21:30:07 +0000
@@ -77,6 +77,15 @@
                                         1: [2],
                                         2: [0]}.items())
 
+    def test_topo_sort_cycle_with_tail(self):
+        """TopoSort traps graph with longer cycle"""
+        self.assertSortAndIterateRaise(GraphCycleError,
+                                       {0: [1],
+                                        1: [2],
+                                        2: [3, 4],
+                                        3: [0],
+                                        4: []}.items())
+
     def test_tsort_1(self):
         """TopoSort simple nontrivial graph"""
         self.assertSortAndIterate({0: [3],

=== modified file 'bzrlib/tsort.py'
--- a/bzrlib/tsort.py	2009-08-09 21:05:57 +0000
+++ b/bzrlib/tsort.py	2009-08-13 21:30:07 +0000
@@ -31,17 +31,18 @@
 
     graph -- sequence of pairs of node->parents_list.
 
-    The result is a list of node names, such that all parents come before
-    their children.
+    The result is a list of node names, such that all parents come before their
+    children.
 
     node identifiers can be any hashable object, and are typically strings.
 
     This function has the same purpose as the TopoSorter class, but uses a
-    different algorithm to sort the graph. That means that while both return a list
-    with parents before their child nodes, the exact ordering can be different.
+    different algorithm to sort the graph. That means that while both return a
+    list with parents before their child nodes, the exact ordering can be
+    different.
 
-    topo_sort is faster when the whole list is needed, while when iterating over a
-    part of the list, TopoSorter.iter_topo_order should be used.
+    topo_sort is faster when the whole list is needed, while when iterating
+    over a part of the list, TopoSorter.iter_topo_order should be used.
     """
     # store a dict of the graph.
     graph = dict(graph)
@@ -56,7 +57,8 @@
             if parent in node_child_count:
                 node_child_count[parent] += 1
     # keep track of nodes without children in a separate list
-    nochild_nodes = deque([node for (node, n) in node_child_count.iteritems() if n == 0])
+    nochild_nodes = deque([node for (node, n) in node_child_count.iteritems()
+                                 if n == 0])
 
     graph_pop = graph.pop
     node_stack_append = node_stack.append



More information about the bazaar-commits mailing list