Rev 4611: Bring in the optimized tsort code. in http://bazaar.launchpad.net/~jameinel/bzr/1.19-known-graph-sorted
John Arbash Meinel
john at arbash-meinel.com
Thu Aug 13 22:30:16 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/1.19-known-graph-sorted
------------------------------------------------------------
revno: 4611 [merge]
revision-id: john at arbash-meinel.com-20090813213007-4wqkou02fo0tegay
parent: john at arbash-meinel.com-20090813205913-fygdc0ycyio81i03
parent: john at arbash-meinel.com-20090805140544-7prpni18c6dnlqnn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.19-known-graph-sorted
timestamp: Thu 2009-08-13 16:30:07 -0500
message:
Bring in the optimized tsort code.
It turns out we needed a way to detect that there was graph cycles.
The easiest way to do it, is by noticing that we end up with a node
that has no gdfo. That means it has a parent that was not reached
before the child was reached, when walking from nodes with no known parents.
I'm 90% sure that this is correct, but arguably it should be detected
when creating the KnownGraph, and not during topo_sort.
modified:
bzrlib/_known_graph_pyx.pyx _known_graph_pyx.pyx-20090610194911-yjk73td9hpjilas0-1
bzrlib/tests/test__known_graph.py test__known_graph.py-20090610185421-vw8vfda2cgnckgb1-2
bzrlib/tests/test_tsort.py testtsort.py-20051025073946-27da871c394d5be4
bzrlib/tsort.py tsort.py-20051025073946-7808f6aaf7d07208
-------------- next part --------------
=== modified file 'bzrlib/_known_graph_pyx.pyx'
--- a/bzrlib/_known_graph_pyx.pyx 2009-08-13 20:59:13 +0000
+++ b/bzrlib/_known_graph_pyx.pyx 2009-08-13 21:30:07 +0000
@@ -45,12 +45,77 @@
void Py_INCREF(object)
-from bzrlib import revision
+from bzrlib import errors, revision
cdef object NULL_REVISION
NULL_REVISION = revision.NULL_REVISION
+def topo_sort(graph):
+ cdef Py_ssize_t last_tip, pos
+ cdef PyObject *temp_key, *temp_value
+
+ graph = dict(graph)
+ # this is the stack storing on which the sorted nodes are pushed.
+ node_name_stack = []
+
+ # count the number of children for every node in the graph
+ num_children = dict.fromkeys(graph.iterkeys(), 0)
+ pos = 0
+ while PyDict_Next(graph, &pos, NULL, &temp_value):
+ parents = <object>temp_value
+ for parent in parents: # pretty sure this is a tuple
+ temp_value = PyDict_GetItem(num_children, parent)
+ if temp_value != NULL: # Ignore ghosts
+ n = (<object>temp_value) + 1
+ PyDict_SetItem(num_children, parent, n)
+ # keep track of nodes without children in a separate list
+ tips = []
+ pos = 0
+ while PyDict_Next(num_children, &pos, &temp_key, &temp_value):
+ value = <object>temp_value
+ if value == 0:
+ node_name = <object>temp_key
+ PyList_Append(tips, node_name)
+
+ graph_pop = graph.pop
+ last_tip = len(tips) - 1
+ while last_tip >= 0:
+ # pick a node without a child and add it to the stack.
+ temp_key = PyList_GET_ITEM(tips, last_tip)
+ node_name = <object>temp_key
+ last_tip -= 1
+ PyList_Append(node_name_stack, node_name)
+
+ # the parents of the node lose it as a child; if it was the last
+ # child, add the parent to the list of childless nodes.
+ parents = graph_pop(node_name)
+ for parent in parents:
+ temp_value = PyDict_GetItem(num_children, parent)
+ if temp_value == NULL:
+ # Ghost parent, skip it
+ continue
+ n = (<object>temp_value) - 1
+ PyDict_SetItem(num_children, parent, n)
+ if n == 0:
+ last_tip += 1
+ if PyList_GET_SIZE(tips) > last_tip:
+ Py_INCREF(parent)
+ PyList_SetItem(tips, last_tip, parent)
+ else:
+ PyList_Append(tips, parent)
+
+ # if there are still nodes left in the graph,
+ # that means that there is a cycle
+ if graph:
+ raise errors.GraphCycleError(graph)
+
+ # the nodes where pushed on the stack child first, so this list needs to be
+ # reversed before returning it.
+ node_name_stack.reverse()
+ return node_name_stack
+
+
cdef class _KnownGraphNode:
"""Represents a single object in the known graph."""
@@ -337,14 +402,55 @@
return heads
def topo_sort(self):
- cdef _KnownGraphNode node, parent
- from bzrlib import tsort
- as_parent_map = {}
- for node in self._nodes.itervalues():
- if node.parents is None:
- continue
- parent_keys = []
- for parent in node.parents:
- parent_keys.append(parent.key)
- as_parent_map[node.key] = parent_keys
- return tsort.topo_sort(as_parent_map)
+ """Return the nodes in topological order.
+
+ All parents must occur before all children.
+ """
+ # This is, for the most part, the same iteration order that we used for
+ # _find_gdfo, consider finding a way to remove the duplication
+ # In general, we find the 'tails' (nodes with no parents), and then
+ # walk to the children. For children that have all of their parents
+ # yielded, we queue up the child to be yielded as well.
+ cdef _KnownGraphNode node
+ cdef _KnownGraphNode child
+ cdef PyObject *temp
+ cdef Py_ssize_t pos
+ cdef int replace
+ cdef Py_ssize_t last_item
+
+ pending = self._find_tails()
+ if PyList_GET_SIZE(pending) == 0 and len(self._nodes) > 0:
+ raise errors.GraphCycleError(self._nodes)
+
+ topo_order = []
+
+ last_item = PyList_GET_SIZE(pending) - 1
+ while last_item >= 0:
+ # Avoid pop followed by push, instead, peek, and replace
+ # timing shows this is 930ms => 770ms for OOo
+ node = _get_list_node(pending, last_item)
+ last_item = last_item - 1
+ if node.parents is not None:
+ # We don't include ghost parents
+ PyList_Append(topo_order, node.key)
+ for pos from 0 <= pos < PyList_GET_SIZE(node.children):
+ child = _get_list_node(node.children, pos)
+ if child.gdfo == -1:
+ # We know we have a graph cycle because a node has a parent
+ # which we couldn't find
+ raise errors.GraphCycleError(self._nodes)
+ child.seen = child.seen + 1
+ if child.seen == PyTuple_GET_SIZE(child.parents):
+ # All parents of this child have been yielded, queue this
+ # one to be yielded as well
+ last_item = last_item + 1
+ if last_item < PyList_GET_SIZE(pending):
+ Py_INCREF(child) # SetItem steals a ref
+ PyList_SetItem(pending, last_item, child)
+ else:
+ PyList_Append(pending, child)
+ # We have queued this node, we don't need to track it
+ # anymore
+ child.seen = 0
+ # We started from the parents, so we don't need to do anymore work
+ return topo_order
=== modified file 'bzrlib/tests/test__known_graph.py'
--- a/bzrlib/tests/test__known_graph.py 2009-08-13 20:59:13 +0000
+++ b/bzrlib/tests/test__known_graph.py 2009-08-13 21:30:07 +0000
@@ -93,6 +93,9 @@
"""
graph = self.module.KnownGraph(ancestry)
sort_result = graph.topo_sort()
+ # We should have an entry in sort_result for every entry present in the
+ # graph.
+ self.assertEqual(len(ancestry), len(sort_result))
node_idx = dict((node, idx) for idx, node in enumerate(sort_result))
for node in sort_result:
parents = ancestry[node]
@@ -268,6 +271,15 @@
2: [0]})
self.assertRaises(errors.GraphCycleError, g.topo_sort)
+ def test_topo_sort_cycle_with_tail(self):
+ """TopoSort traps graph with longer cycle"""
+ g = self.module.KnownGraph({0: [1],
+ 1: [2],
+ 2: [3, 4],
+ 3: [0],
+ 4: []})
+ self.assertRaises(errors.GraphCycleError, g.topo_sort)
+
def test_topo_sort_1(self):
"""TopoSort simple nontrivial graph"""
self.assertTopoSortOrder({0: [3],
=== modified file 'bzrlib/tests/test_tsort.py'
--- a/bzrlib/tests/test_tsort.py 2009-08-13 20:59:13 +0000
+++ b/bzrlib/tests/test_tsort.py 2009-08-13 21:30:07 +0000
@@ -77,6 +77,15 @@
1: [2],
2: [0]}.items())
+ def test_topo_sort_cycle_with_tail(self):
+ """TopoSort traps graph with longer cycle"""
+ self.assertSortAndIterateRaise(GraphCycleError,
+ {0: [1],
+ 1: [2],
+ 2: [3, 4],
+ 3: [0],
+ 4: []}.items())
+
def test_tsort_1(self):
"""TopoSort simple nontrivial graph"""
self.assertSortAndIterate({0: [3],
=== modified file 'bzrlib/tsort.py'
--- a/bzrlib/tsort.py 2009-08-09 21:05:57 +0000
+++ b/bzrlib/tsort.py 2009-08-13 21:30:07 +0000
@@ -31,17 +31,18 @@
graph -- sequence of pairs of node->parents_list.
- The result is a list of node names, such that all parents come before
- their children.
+ The result is a list of node names, such that all parents come before their
+ children.
node identifiers can be any hashable object, and are typically strings.
This function has the same purpose as the TopoSorter class, but uses a
- different algorithm to sort the graph. That means that while both return a list
- with parents before their child nodes, the exact ordering can be different.
+ different algorithm to sort the graph. That means that while both return a
+ list with parents before their child nodes, the exact ordering can be
+ different.
- topo_sort is faster when the whole list is needed, while when iterating over a
- part of the list, TopoSorter.iter_topo_order should be used.
+ topo_sort is faster when the whole list is needed, while when iterating
+ over a part of the list, TopoSorter.iter_topo_order should be used.
"""
# store a dict of the graph.
graph = dict(graph)
@@ -56,7 +57,8 @@
if parent in node_child_count:
node_child_count[parent] += 1
# keep track of nodes without children in a separate list
- nochild_nodes = deque([node for (node, n) in node_child_count.iteritems() if n == 0])
+ nochild_nodes = deque([node for (node, n) in node_child_count.iteritems()
+ if n == 0])
graph_pop = graph.pop
node_stack_append = node_stack.append
More information about the bazaar-commits
mailing list