Rev 4390: Get an initial pyrex implementation. in http://bazaar.launchpad.net/~jameinel/bzr/1.16-better_heads

Wed Jun 10 20:56:25 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/1.16-better_heads

------------------------------------------------------------
revno: 4390
revision-id: john at arbash-meinel.com-20090610195616-alggpr46n0bmjjhf
parent: john at arbash-meinel.com-20090610185802-wsybzjfil447yhy2
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.16-better_heads
timestamp: Wed 2009-06-10 14:56:16 -0500
message:
  Get an initial pyrex implementation.
  
  Initial results show it to actually be slightly slower than pure python.
-------------- next part --------------
=== modified file '.bzrignore'

--- a/.bzrignore	2009-06-10 03:56:49 +0000
+++ b/.bzrignore	2009-06-10 19:56:16 +0000
@@ -45,6 +45,7 @@
 bzrlib/_dirstate_helpers_c.c
 bzrlib/_groupcompress_pyx.c
 bzrlib/_knit_load_data_c.c
+bzrlib/_known_graph_pyx.c
 bzrlib/_readdir_pyx.c
 bzrlib/_rio_pyx.c
 bzrlib/_walkdirs_win32.c

=== modified file 'bzrlib/_known_graph_py.py'
--- a/bzrlib/_known_graph_py.py	2009-06-10 18:58:02 +0000
+++ b/bzrlib/_known_graph_py.py	2009-06-10 19:56:16 +0000
@@ -18,6 +18,7 @@
 """
 
 import heapq
+
 from bzrlib import (
     revision,
     )
@@ -61,8 +62,10 @@
         self._nodes = {}
         # Maps {sorted(revision_id, revision_id): heads}
         self._known_heads = {}
+        self.do_cache = do_cache
         self._initialize_nodes(parent_map)
-        self.do_cache = do_cache
+        self._find_linear_dominators()
+        self._find_gdfo()
 
     def _initialize_nodes(self, parent_map):
         """Populate self._nodes.
@@ -87,8 +90,6 @@
                     parent_node = _KnownGraphNode(parent_key, None)
                     nodes[parent_key] = parent_node
                 parent_node.child_keys.append(key)
-        self._find_linear_dominators()
-        self._find_gdfo()
 
     def _find_linear_dominators(self):
         """For each node in the set, find any linear dominators.
@@ -279,9 +280,9 @@
         heappop = heapq.heappop
         heappush = heapq.heappush
         while queue and len(candidate_nodes) > 1:
-            _, next = heappop(queue)
-            # assert next.ancestor_of is not None
-            next_ancestor_of = next.ancestor_of
+            _, node = heappop(queue)
+            # assert node.ancestor_of is not None
+            next_ancestor_of = node.ancestor_of
             if len(next_ancestor_of) == num_candidates:
                 # This node is now considered 'common'
                 # Make sure all parent nodes are marked as such
@@ -294,7 +295,7 @@
                     if parent_node.ancestor_of is not None:
                         parent_node.ancestor_of = next_ancestor_of
                 continue
-            if next.parent_keys is None:
+            if node.parent_keys is None:
                 # This is a ghost
                 continue
             # Now project the current nodes ancestor list to the parent nodes,
@@ -302,13 +303,13 @@
             # Note: using linear_dominator speeds things up quite a bit
             #       enough that we actually start to be slightly faster
             #       than the default heads() implementation
-            if next.linear_dominator != next.key:
+            if node.linear_dominator != node.key:
                 # We are at the tip of a long linear region
                 # We know that there is nothing between here and the tail
                 # that is interesting, so skip to the end
-                parent_keys = [next.linear_dominator]
+                parent_keys = [node.linear_dominator]
             else:
-                parent_keys = next.parent_keys
+                parent_keys = node.parent_keys
             for parent_key in parent_keys:
                 if parent_key in candidate_nodes:
                     candidate_nodes.pop(parent_key)
@@ -332,9 +333,3 @@
                 node.ancestor_of = None
         cleanup()
         return frozenset(candidate_nodes)
-
-    def get_parent_map(self, keys):
-        # Thunk to match the Graph._parents_provider api.
-        nodes = [self._nodes[key] for key in keys]
-        return dict((node.key, node.parent_keys)
-                    for node in nodes if node.parent_keys is not None)

=== added file 'bzrlib/_known_graph_pyx.pyx'
--- a/bzrlib/_known_graph_pyx.pyx	1970-01-01 00:00:00 +0000
+++ b/bzrlib/_known_graph_pyx.pyx	2009-06-10 19:56:16 +0000
@@ -0,0 +1,419 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Implementation of Graph algorithms when we have already loaded everything.
+"""
+
+cdef extern from "python-compat.h":
+    pass
+
+cdef extern from "Python.h":
+    ctypedef int Py_ssize_t
+    ctypedef struct PyObject:
+        pass
+
+import heapq
+
+from bzrlib import revision
+
+
+cdef class _KnownGraphNode:
+    """Represents a single object in the known graph."""
+
+    cdef object key
+    # Ideally, we could change this into fixed arrays, rather than get the
+    # extra allocations. Consider that 99% of all revisions will have <= 2
+    # parents, we may want to put in the effort
+    cdef list parents
+    cdef list children
+    cdef _KnownGraphNode linear_dominator_node
+    cdef public long dominator_distance
+    cdef public long gdfo
+    # This could also be simplified
+    cdef object ancestor_of
+
+    def __init__(self, key, parents):
+        self.key = key
+        if parents is None:
+            self.parents = parents
+        else:
+            if not isinstance(parents, list):
+                raise TypeError('parents must be a list')
+            for parent in parents:
+                if not isinstance(parent, _KnownGraphNode):
+                    raise TypeError('parents must be a list of _KnownGraphNode')
+        self.parents = parents
+        self.children = []
+        # oldest ancestor, such that no parents between here and there have >1
+        # child or >1 parent.
+        self.linear_dominator_node = None
+        self.dominator_distance = 0
+        # Greatest distance from origin
+        self.gdfo = -1
+        # This will become a tuple of known heads that have this node as an
+        # ancestor
+        self.ancestor_of = None
+
+    property child_keys:
+        def __get__(self):
+            cdef list keys
+            cdef _KnownGraphNode child
+
+            keys = []
+            for child in self.children:
+                keys.append(child.key)
+            return keys
+
+    property linear_dominator:
+        def __get__(self):
+            if self.linear_dominator_node is None:
+                return None
+            else:
+                return self.linear_dominator_node.key
+
+    cdef clear_references(self):
+        self.parents = None
+        self.children = None
+        self.linear_dominator_node = None
+
+    def __repr__(self):
+        parent_keys = []
+        for parent in self.parents:
+            parent_keys.append(parent.key)
+        child_keys = []
+        for child in self.children:
+            child_keys.append(child.key)
+        return '%s(%s  gdfo:%s par:%s child:%s dom:%s %s)' % (
+            self.__class__.__name__, self.key, self.gdfo,
+            parent_keys, child_keys,
+            self.linear_dominator, self.dominator_distance)
+
+
+# TODO: slab allocate all _KnownGraphNode objects.
+#       We already know how many we are going to need...
+
+cdef class KnownGraph:
+    """This is a class which assumes we already know the full graph."""
+
+    cdef public object _nodes
+    cdef dict _known_heads
+    cdef public int do_cache
+
+    def __init__(self, parent_map, do_cache=True):
+        """Create a new KnownGraph instance.
+
+        :param parent_map: A dictionary mapping key => parent_keys
+        """
+        self._nodes = {}
+        # Maps {sorted(revision_id, revision_id): heads}
+        self._known_heads = {}
+        self.do_cache = int(do_cache)
+        self._initialize_nodes(parent_map)
+        self._find_linear_dominators()
+        self._find_gdfo()
+
+    def __dealloc__(self):
+        cdef _KnownGraphNode child
+
+        for child in self._nodes.itervalues():
+            child.clear_references()
+
+    def _initialize_nodes(self, parent_map):
+        """Populate self._nodes.
+
+        After this has finished, self._nodes will have an entry for every entry
+        in parent_map. Ghosts will have a parent_keys = None, all nodes found
+        will also have .child_keys populated with all known child_keys.
+        """
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode parent_node
+        cdef list parent_nodes
+
+        for key, parent_keys in parent_map.iteritems():
+            parent_nodes = []
+            for parent_key in parent_keys:
+                try:
+                    parent_node = self._nodes[parent_key]
+                except KeyError:
+                    parent_node = _KnownGraphNode(parent_key, None)
+                    self._nodes[parent_key] = parent_node
+                parent_nodes.append(parent_node)
+            if key in self._nodes:
+                node = self._nodes[key]
+                assert node.parents is None
+                node.parents = parent_nodes
+            else:
+                node = _KnownGraphNode(key, parent_nodes)
+                self._nodes[key] = node
+            for parent_node in parent_nodes:
+                parent_node.children.append(node)
+
+    cdef object _check_is_linear(self, _KnownGraphNode node):
+        """Check to see if a given node is part of a linear chain."""
+        cdef _KnownGraphNode parent_node
+        if node.parents is None or len(node.parents) != 1:
+            # This node is either a ghost, a tail, or has multiple parents
+            # It its own dominator
+            node.linear_dominator_node = node
+            node.dominator_distance = 0
+            return None
+        parent_node = node.parents[0]
+        if len(parent_node.children) > 1:
+            # The parent has multiple children, so *this* node is the
+            # dominator
+            node.linear_dominator_node = node
+            node.dominator_distance = 0
+            return None
+        # The parent is already filled in, so add and continue
+        if parent_node.linear_dominator_node is not None:
+            node.linear_dominator_node = parent_node.linear_dominator_node
+            node.dominator_distance = parent_node.dominator_distance + 1
+            return None
+        # We don't know this node, or its parent node, so start walking to
+        # next
+        return parent_node
+
+    def _find_linear_dominators(self):
+        """For each node in the set, find any linear dominators.
+
+        For any given node, the 'linear dominator' is an ancestor, such that
+        all parents between this node and that one have a single parent, and a
+        single child. So if A->B->C->D then B,C,D all have a linear dominator
+        of A. Because there are no interesting siblings, we can quickly skip to
+        the nearest dominator when doing comparisons.
+        """
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode next_node
+        cdef list stack
+
+        for node in self._nodes.itervalues():
+            # The parent is not filled in, so walk until we get somewhere
+            if node.linear_dominator_node is not None: #already done
+                continue
+            next_node = self._check_is_linear(node)
+            if next_node is None:
+                # Nothing more needs to be done
+                continue
+            stack = []
+            while next_node is not None:
+                stack.append(node)
+                node = next_node
+                next_node = self._check_is_linear(node)
+            # The stack now contains the linear chain, and 'node' should have
+            # been labeled
+            assert node.linear_dominator_node is not None
+            dominator = node.linear_dominator_node
+            while stack:
+                next_node = stack.pop()
+                next_node.linear_dominator_node = dominator
+                next_node.dominator_distance = node.dominator_distance + 1
+                node = next_node
+
+    cdef list _find_tails(self):
+        cdef list tails
+        cdef _KnownGraphNode node
+        tails = []
+
+        for node in self._nodes.itervalues():
+            if not node.parents:
+                tails.append(node)
+        return tails
+
+    def _find_gdfo(self):
+        # TODO: Consider moving the tails search into the first-pass over the
+        #       data, inside _find_linear_dominators
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode child_node
+        cdef _KnownGraphNode parent_node
+
+        tails = self._find_tails()
+        todo = []
+        heappush = heapq.heappush
+        heappop = heapq.heappop
+        for node in tails:
+            node.gdfo = 1
+            heappush(todo, (1, node))
+        processed = 0
+        max_gdfo = len(self._nodes) + 1
+        while todo:
+            gdfo, node = heappop(todo)
+            processed += 1
+            if node.gdfo != -1 and gdfo < node.gdfo:
+                # This node was reached from a longer path, we assume it was
+                # enqued correctly with the longer gdfo, so don't continue
+                # processing now
+                continue
+            next_gdfo = gdfo + 1
+            assert next_gdfo <= max_gdfo
+            for child_node in node.children:
+                if child_node.gdfo < next_gdfo:
+                    # Only enque children when all of their parents have been
+                    # resolved
+                    for parent_node in child_node.parents:
+                        # We know that 'this' parent is counted
+                        if parent_node is not node:
+                            if parent_node.gdfo == -1:
+                                break
+                    else:
+                        child_node.gdfo = next_gdfo
+                        heappush(todo, (next_gdfo, child_node))
+
+    def heads(self, keys):
+        """Return the heads from amongst keys.
+
+        This is done by searching the ancestries of each key.  Any key that is
+        reachable from another key is not returned; all the others are.
+
+        This operation scales with the relative depth between any two keys. If
+        any two keys are completely disconnected all ancestry of both sides
+        will be retrieved.
+
+        :param keys: An iterable of keys.
+        :return: A set of the heads. Note that as a set there is no ordering
+            information. Callers will need to filter their input to create
+            order if they need it.
+        """
+        cdef dict candidate_nodes
+        cdef dict dom_to_node
+
+        candidate_nodes = {}
+        for key in keys:
+            candidate_nodes[key] = self._nodes[key]
+        if revision.NULL_REVISION in candidate_nodes:
+            # NULL_REVISION is only a head if it is the only entry
+            candidate_nodes.pop(revision.NULL_REVISION)
+            if not candidate_nodes:
+                return set([revision.NULL_REVISION])
+        if len(candidate_nodes) < 2:
+            return frozenset(candidate_nodes)
+        heads_key = frozenset(candidate_nodes)
+        try:
+            heads = self._known_heads[heads_key]
+            return heads
+        except KeyError:
+            pass # compute it ourselves
+        ## dominator = None
+        ## # TODO: We could optimize for the len(candidate_nodes) > 2 by checking
+        ## #       for *any* pair-wise matching, and then eliminating one of the
+        ## #       nodes trivially. However, the fairly common case is just 2
+        ## #       keys, so we'll focus on that, first
+        ## for node in candidate_nodes.itervalues():
+        ##     if dominator is None:
+        ##         dominator = node.linear_dominator
+        ##     elif dominator != node.linear_dominator:
+        ##         break
+        ## else:
+        ##     # In 'time bzr annotate NEWS' this only catches *one* item, so it
+        ##     # probably isn't worth the optimization
+        ##     # All of these nodes have the same linear_dominator, which means
+        ##     # they are in a line, the head is just the one with the highest
+        ##     # distance
+        ##     def get_distance(key):
+        ##         return self._nodes[key].dominator_distance
+        ##     def get_linear_head():
+        ##         return max(candidate_nodes, key=get_distance)
+        ##     return set([get_linear_head()])
+        # Check the linear dominators of these keys, to see if we already
+        # know the heads answer
+        # dom_key = []
+        # for node in candidate_nodes.itervalues():
+        #     dom_key.append(node.linear_dominator.key)
+        # dom_key = frozenset(dom_key)
+        # if dom_key in self._known_heads:
+        #     dom_to_node = dict([(node.linear_dominator, key)
+        #                    for key, node in candidate_nodes.iteritems()])
+        #     # map back into the original keys
+        #     heads = self._known_heads[dom_key]
+        #     heads = frozenset([dom_to_node[key] for key in heads])
+        #     return heads
+        heads = self._heads_from_candidate_nodes(candidate_nodes)
+        # if self.do_cache:
+        #     self._known_heads[heads_key] = heads
+        #     # Cache the dominator heads
+        #     if dom_key is not None:
+        #         dom_heads = frozenset([candidate_nodes[key].linear_dominator
+        #                                for key in heads])
+        #         self._known_heads[dom_key] = dom_heads
+        return heads
+
+    def _heads_from_candidate_nodes(self, dict candidate_nodes):
+        cdef list to_cleanup
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode parent_node
+        cdef int num_candidates
+
+        queue = []
+        to_cleanup = []
+        for node in candidate_nodes.itervalues():
+            assert node.ancestor_of is None
+            node.ancestor_of = (node.key,)
+            queue.append((-node.gdfo, node))
+            to_cleanup.append(node)
+        heapq.heapify(queue)
+        # These are nodes that we determined are 'common' that we are no longer
+        # walking
+        # Now we walk nodes until all nodes that are being walked are 'common'
+        num_candidates = len(candidate_nodes)
+        nodes = self._nodes
+        heappop = heapq.heappop
+        heappush = heapq.heappush
+        while queue and len(candidate_nodes) > 1:
+            _, node = heappop(queue)
+            if len(node.ancestor_of) == num_candidates:
+                # This node is now considered 'common'
+                # Make sure all parent nodes are marked as such
+                for parent_node in node.parents:
+                    if parent_node.ancestor_of is not None:
+                        parent_node.ancestor_of = node.ancestor_of
+                if node.linear_dominator_node is not node:
+                    parent_node = node.linear_dominator_node
+                    if parent_node.ancestor_of is not None:
+                        parent_node.ancestor_of = node.ancestor_of
+                continue
+            if node.parents is None:
+                # This is a ghost
+                continue
+            # Now project the current nodes ancestor list to the parent nodes,
+            # and queue them up to be walked
+            # Note: using linear_dominator speeds things up quite a bit
+            #       enough that we actually start to be slightly faster
+            #       than the default heads() implementation
+            if node.linear_dominator_node is not node:
+                # We are at the tip of a long linear region
+                # We know that there is nothing between here and the tail
+                # that is interesting, so skip to the end
+                parents = [node.linear_dominator_node]
+            else:
+                parents = node.parents
+            for parent_node in node.parents:
+                if parent_node.key in candidate_nodes:
+                    candidate_nodes.pop(parent_node.key)
+                    if len(candidate_nodes) <= 1:
+                        break
+                if parent_node.ancestor_of is None:
+                    # This node hasn't been walked yet
+                    parent_node.ancestor_of = node.ancestor_of
+                    # Enqueue this node
+                    heappush(queue, (-parent_node.gdfo, parent_node))
+                    to_cleanup.append(parent_node)
+                elif parent_node.ancestor_of != node.ancestor_of:
+                    # Combine to get the full set of parents
+                    all_ancestors = set(parent_node.ancestor_of)
+                    all_ancestors.update(node.ancestor_of)
+                    parent_node.ancestor_of = tuple(sorted(all_ancestors))
+        for node in to_cleanup:
+            node.ancestor_of = None
+        return frozenset(candidate_nodes)

=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2009-06-10 18:58:02 +0000
+++ b/bzrlib/graph.py	2009-06-10 19:56:16 +0000
@@ -1657,6 +1657,7 @@
     return result
 
 
+_counters = [0, 0, 0, 0, 0]
 try:
     from bzrlib._known_graph_pyx import KnownGraph
 except ImportError:

=== modified file 'setup.py'
--- a/setup.py	2009-06-10 03:56:49 +0000
+++ b/setup.py	2009-06-10 19:56:16 +0000
@@ -266,6 +266,7 @@
 add_pyrex_extension('bzrlib._groupcompress_pyx',
                     extra_source=['bzrlib/diff-delta.c'])
 add_pyrex_extension('bzrlib._knit_load_data_c')
+add_pyrex_extension('bzrlib._known_graph_pyx')
 add_pyrex_extension('bzrlib._rio_pyx')
 if sys.platform == 'win32':
     add_pyrex_extension('bzrlib._dirstate_helpers_c',

=== modified file 'tools/time_graph.py'
--- a/tools/time_graph.py	2009-06-10 16:31:43 +0000
+++ b/tools/time_graph.py	2009-06-10 19:56:16 +0000
@@ -4,7 +4,15 @@
 import time
 import sys
 import optparse
-from bzrlib import branch, commands, graph, ui, trace
+from bzrlib import (
+    branch,
+    commands,
+    graph,
+    ui,
+    trace,
+    _known_graph_py,
+    _known_graph_pyx,
+    )
 from bzrlib.ui import text
 
 p = optparse.OptionParser()
@@ -58,7 +66,7 @@
 
 print '      %d combinations' % (len(combinations),)
 t1 = time.clock()
-known_g = graph.KnownGraph(parent_map)
+known_g = _known_graph_py.KnownGraph(parent_map)
 if opts.lsprof is not None:
     h_known = commands.apply_lsprofiled(opts.lsprof,
         all_heads_comp, known_g, combinations)
@@ -67,6 +75,16 @@
 t2 = time.clock()
 print "Known: %.3fs" % (t2-t1,)
 print "  %s" % (graph._counters,)
+t1 = time.clock()
+known_g = _known_graph_pyx.KnownGraph(parent_map)
+if opts.lsprof is not None:
+    h_known = commands.apply_lsprofiled(opts.lsprof,
+        all_heads_comp, known_g, combinations)
+else:
+    h_known = all_heads_comp(known_g, combinations)
+t2 = time.clock()
+print "Known (pyx): %.3fs" % (t2-t1,)
+print "  %s" % (graph._counters,)
 simple_g = graph.Graph(graph.DictParentsProvider(parent_map))
 graph._counters[1] = 0
 graph._counters[2] = 0