[MERGE] faster topo_sort
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 24 17:44:23 GMT 2006
Robert Collins wrote:
> On Sat, 2006-02-25 at 00:13 +1100, Robert Collins wrote:
>
>
>> So - really - I'd like to replace the current topo_sort routine with
>> this one.
>
> In further news here is another edition of this patch, with the
> topological_sort in a function and with the tsort tests running against
> it... they pass.
>
> Rob
>
>
>
> ------------------------------------------------------------------------
>
> === added file 'bzrlib/reconcile.py'
> --- /dev/null
> +++ bzrlib/reconcile.py
> @@ -0,0 +1,279 @@
> +# (C) 2005, 2006 Canonical Limited.
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> +
> +"""Plugin to fix some potential data errors in a branch.
> +
> +This makes sure that the inventory weave's DAG of ancestry is correct so that
> +attempts to fetch the branch over http, or certain merge operations cope
> +correctly.
> +
> +This is most likely needed if you have used fetch-ghosts from bzrlib to
> +resolve ghosts after a baz (or otherwise) import and you get bizarre behaviour
> +when either exporting over http or when merging from other translated branches.
> +"""
> +
> +
> +import bzrlib.branch
> +import bzrlib.errors as errors
> +import bzrlib.progress
> +from bzrlib.trace import mutter
> +import bzrlib.ui as ui
> +from bzrlib.weavefile import write_weave_v5 as w5
> +
> +
> +
> +def reconcile(dir):
> + """Reconcile the data in dir.
> +
> + Currently this is limited to a inventory 'reweave'.
> +
> + This is a convenience method, and the public api, for using a
> + Reconciler object.
> + """
> + reconciler = Reconciler(dir)
> + reconciler.reconcile()
> +
> +
> +def topological_sort(graph):
> + """Topological sort a graph.
> +
> + graph -- sequence of pairs of node->parents_list.
> +
> + The result is a list of node names, such that all parents come before
> + their children.
> +
> + Nodes at the same depth are returned in sorted order.
> +
> + node identifiers can be any hashable object, and are typically strings.
> + """
> + sorter = TopoSorter()
> + sorter.sort(graph)
> + return sorter._work_queue
> +
-1 on sneaking into 'sorter._work_queue'. Either sort() should return
the sorted list, or you should have another function to get it. But this
doesn't seem like the right way.
Also, if this is replacing 'topo_sort', shouldn't it be in bzrlib.tsort
rather than bzrlib.reconcile?
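To be concrete about the first point, the shape I'd expect is roughly
this (a plain recursive sketch just to show the interface -- it is not
a replacement for your flattened implementation, and it does no cycle
detection):

def topological_sort(graph):
    """Topologically sort a graph of (node, parents_list) pairs.

    Returns the sorted list directly: every parent comes before its
    children.  Assumes every parent is itself a node in the graph.
    """
    graph = dict(graph)
    result = []       # nodes in parents-before-children order
    finished = set()  # nodes already appended to result

    def visit(node):
        if node in finished:
            return
        for parent in graph[node]:
            visit(parent)
        finished.add(node)
        result.append(node)

    for node in graph:
        visit(node)
    return result

Your flattened version can keep exactly the same machinery; sort()
just needs to return self._work_queue at the end (or a module-level
wrapper can do it).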
> +
> +class TopoSorter(object):
> +
> + def sort(self, graph):
> + # the total set of revisions to process
> + self._rev_graph = dict(graph)
> + ### if debugging:
> + # self._original_graph = dict(graph)
> +
> + # the current line of history being processed.
> + # once we add in per inventory root ids this will involve a single
> + # pass within the revision knit at that point.
> + # these contain respectively the revision id in the call stack
> + self._rev_stack = []
> + # a set of the revisions on the stack, for cycle detection
> + self._rev_set = set()
> + # and the not_ghost_parents.
> + self._pending_stack = []
> + # actual queue to reinsert. This is a single pass queue we can use to
> + # regenerate the inventory versioned file, and holds
> + # (rev_id, non_ghost_parents) entries
> + self._work_queue = []
> + # this is a set for fast lookups of queued revisions
> + self._work_set = set()
> + # total steps = 1 read per revision + one insert into the inventory
> +
> + # now we do a depth first search of the revision graph until its empty.
> + # this gives us a topological order across all revisions in
> + # self._all_revisions.
> + # This is flattened to avoid deep recursion:
> + # At each step in the call graph we need to know which parent we are on.
> + # we do this by having three variables for each stack frame:
> + # revision_id being descended into (self._rev_stack)
> + # current queue of parents to descend into
> + # (self._pending_stack)
> + # putting all the parents into a pending list, left to
> + # right and processing the right most one each time. The same parent
> + # may occur at multiple places in the stack - thats fine. We check its
> + # not in the output before processing. However revisions cannot
> + # appear twice.
> + # the algorithm is roughly:
> + # for revision, parents in _rev_graph.items():
> + # if revision in self._all_revisions:
> + # continue
> + # add_revision_parents(revision, parents)
> + # def add_revision_parents(revision, parents)
> + # fpr parent in parents:
> + # add_revision_parents(parent, parent.parents)
Typo, s/fpr/for/
> + # self._all_revisions.append(revision)
> + # self._all_revision_parents.appent(parents)
Typo, s/appent/append/
> + # however we tune this to visit fragmens of the graph
fragments
> + # and not double-visit entries.
> + # nevertheless the output is a
> + # self._work_queue of revisions, nonghost parents in
> + # topological order and
> + # a self._work_set which is a complete set of revisions.
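A tiny worked example of the ordering this is meant to produce, using
the module-level function from the patch (or from bzrlib.tsort if it
moves there):

from bzrlib.reconcile import topological_sort

# D merges B and C, which both descend from the root A.
graph = [
    ('D', ['B', 'C']),
    ('B', ['A']),
    ('C', ['A']),
    ('A', []),
]
# A must come first and D last; either ['A', 'B', 'C', 'D'] or
# ['A', 'C', 'B', 'D'] is a valid topological order.
print topological_sort(graph)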
> + while self._rev_graph:
> + rev_id, parents = self._rev_graph.iteritems().next()
> + # rev_id
> + self._stack_revision(rev_id, parents)
> + while self._rev_stack:
> + # loop until this call completes.
> + parents_to_visit = self._pending_stack[-1]
> + # if all parents are done, the revision is done
> + if not parents_to_visit:
> + # append the revision to the topo sorted list
> + self._unstack_revision()
> + else:
> + while self._pending_stack[-1]:
> + # recurse depth first into a single parent
> + next_rev_id = self._pending_stack[-1].pop()
> + if next_rev_id in self._work_set:
> + # this parent was completed by a child on the
> + # call stack. skip it.
> + continue
> + # push it into the call stack
> + self._stack_revision(next_rev_id, self._rev_graph[next_rev_id])
> + # and do not continue processing parents until this 'call'
> + # has recursed.
> + break
> +
> + # we are now recursing down a call stack.
> + # its a width first search which
> +
> +### Useful if fiddling with this code.
> +### # cross check
> +### for index in range(len(self._work_queue)):
> +### rev = self._work_queue[index]
> +### for left_index in range(index):
> +### if rev in self.original_graph[self._work_queue[left_index]]:
> +### print "revision in parent list of earlier revision"
> +### import pdb;pdb.set_trace()
> +
> + def _stack_revision(self, rev_id, parents):
> + """Add rev_id to the pending revision stack."""
> + # detect cycles:
> + if rev_id in self._rev_set:
> + # we only supply the revisions that led to the cycle. This isn't
> + # minimal though... but it is usually a subset of the entire graph
> + # and easier to debug.
> + raise errors.GraphCycleError(self._rev_set)
> +
> + self._rev_stack.append(rev_id)
> + self._rev_set.add(rev_id)
> + self._pending_stack.append(list(parents))
> +
> + def _unstack_revision(self):
> + """A revision has been completed.
> +
> + The revision is added to the work queue, and the data for
> + it popped from the call stack.
> + """
> + rev_id = self._rev_stack.pop()
> + self._rev_set.remove(rev_id)
> + self._pending_stack.pop()
> + self._work_queue.append(rev_id)
> + self._work_set.add(rev_id)
> + # and remove it from the rev graph as its now complete
> + self._rev_graph.pop(rev_id)
> +
Why are you removing the rev_id from the _rev_set? Everything else looks
good.
> +
> +class Reconciler(object):
> + """Reconcilers are used to reconcile existing data.
> +
> + Currently this is limited to a single repository, and consists
> + of an inventory reweave with revision cross-checks.
> + """
> +
> + def __init__(self, dir):
> + self.bzrdir = dir
> +
> + def reconcile(self):
> + """Actually perform the reconciliation."""
> + self.pb = ui.ui_factory.progress_bar()
> + self.repo = self.bzrdir.open_repository()
> + self.repo.lock_write()
> + try:
> + self.pb.note('Reconciling repository %s',
> + self.repo.bzrdir.root_transport.base)
> + self._reweave_inventory()
> + finally:
> + self.repo.unlock()
> + self.pb.note('Reconciliation complete.')
> +
> + def _reweave_inventory(self):
> + """Regenerate the inventory weave for the repository from scratch."""
> + self.pb.update('Reading inventory data.')
> + self.inventory = self.repo.get_inventory_weave()
> + self.repo.control_weaves.put_weave('inventory.backup',
> + self.inventory,
> + self.repo.get_transaction())
> + self.pb.note('Backup Inventory created.')
> + # asking for '' should never return a non-empty weave
> + new_inventory = self.repo.control_weaves.get_weave_or_empty('',
> + self.repo.get_transaction())
> +
> + # the total set of revisions to process
> + self.pending = set([file_id for file_id in self.repo.revision_store])
> +
> + # total steps = 1 read per revision + one insert into the inventory
> + self.total = len(self.pending) * 2
> + self.count = 0
> +
> + # mapping from revision_id to parents
> + self._rev_graph = {}
> + # we need the revision id of each revision and its available parents list
> + for rev_id in self.pending:
> + # put a revision into the graph.
> + self._graph_revision(rev_id)
> +
> + ordered_rev_ids = topological_sort(self._rev_graph.items())
> + self._work_queue = [(rev_id, self._rev_graph[rev_id]) for
> + rev_id in ordered_rev_ids]
> +
If you are always going to pop things off of the left (pop(0)), you
probably want to use a deque. List.pop(0) requires moving all entries,
because it is just an array.
It probably would be even better just to do:
for rev_id, parents in self._work_queue:
And iterate over them, rather than pop'ing them off. (Since you never
push more on anyway)
I also would still like to see you check that the inventory actually
needs re-weaving rather than just forcing it all the time.
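To spell that out, here are the two alternatives I mean, as a
standalone sketch with a made-up queue rather than the patch context:

from collections import deque

work_queue = [('rev-1', []), ('rev-2', ['rev-1']), ('rev-3', ['rev-2'])]

# Option 1: deque.popleft() is O(1), whereas list.pop(0) shifts every
# remaining element on each call.
queue = deque(work_queue)
while queue:
    rev_id, parents = queue.popleft()
    print rev_id, parents

# Option 2: nothing is appended while draining, so plain iteration is
# simpler still and leaves the original list intact.
for rev_id, parents in work_queue:
    print rev_id, parents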
> + # we have topological order of revisions and non ghost parents ready.
> + while self._work_queue:
> + rev_id, parents = self._work_queue.pop(0)
> + # double check this really is in topological order.
> + unavailable = [p for p in parents if p not in new_inventory]
> + assert len(unavailable) == 0
> + # this entry has all the non ghost parents in the inventory
> + # file already.
> + self._reweave_step('adding inventories')
> + new_inventory.add(rev_id, parents, self.inventory.get(rev_id))
> +
> + # if this worked, the set of new_inventory.names should equal
> + # self.pending
> + assert set(new_inventory.names()) == self.pending
> + self.pb.update('Writing weave')
> + self.repo.control_weaves.put_weave('inventory',
> + new_inventory,
> + self.repo.get_transaction())
> + self.inventory = None
> + self.pb.note('Inventory regenerated.')
> +
> + def _graph_revision(self, rev_id):
> + """Load a revision into the revision graph."""
> + # pick a random revision
> + # analyse revision id rev_id and put it in the stack.
> + self._reweave_step('loading revisions')
> + rev = self.repo.get_revision(rev_id)
> + assert rev.revision_id == rev_id
> + parents = []
> + for parent in rev.parent_ids:
> + if parent in self.inventory:
> + parents.append(parent)
> + else:
> + mutter('found ghost %s', parent)
> + self._rev_graph[rev_id] = parents
> +
> + def _reweave_step(self, message):
> + """Mark a single step of regeneration complete."""
> + self.pb.update(message, self.count, self.total)
> + self.count += 1
>
I didn't check all the tests, but with those changes, +1 on the code.
John
=:->