Rev 3725: Change the per-file log algorithm dramatically. in http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/lighter_log_file
John Arbash Meinel
john at arbash-meinel.com
Fri Sep 19 04:21:40 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/lighter_log_file
------------------------------------------------------------
revno: 3725
revision-id: john at arbash-meinel.com-20080919032139-pdcuw9ryqatpqdn9
parent: john at arbash-meinel.com-20080919013023-31adhm4mt3obrjst
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: lighter_log_file
timestamp: Thu 2008-09-18 22:21:39 -0500
message:
Change the per-file log algorithm dramatically.
Now we use the merge_sort information to decide when to show
another entry. It seems this changes what is actually logged:
sometimes a branch merges the trunk, and in that case it brings in
changes to a file which the feature branch did not have before.
However, for people using 'bzr log file', I believe those are
uninteresting nodes.
Memory consumption is good (240MB, mostly from text_index._buffer_all),
and speed is good (28s, also mostly in _buffer_all).
Changing the code to request one key at a time avoids the _buffer_all
call, but at the expense of lots of bisect operations. In some cases
that is faster and saves memory (23s, 88MB), but on bzr.dev it is
slower.
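For reference, a minimal standalone sketch of the merge-stack filtering
described above (not bzrlib code: filter_view_revisions and the
'modified' set are hypothetical stand-ins for
_filter_revisions_touching_file_id and the text-index lookup, and
revisions are assumed to arrive as (rev_id, revno, depth) tuples in
reverse merge_sort order, newest first):

    def filter_view_revisions(view_revisions, modified):
        # current_merge_stack[d] holds the revision at merge depth d
        # that will (transitively) merge whatever revision we are
        # currently looking at; entries are set to None once emitted.
        result = []
        current_merge_stack = [None]
        for info in view_revisions:
            rev_id, revno, depth = info
            if depth == len(current_merge_stack):
                current_merge_stack.append(info)
            else:
                del current_merge_stack[depth + 1:]
                current_merge_stack[-1] = info
            if rev_id in modified:
                # Emit this revision plus any merge points above it
                # that have not been emitted yet.
                for idx, node in enumerate(current_merge_stack):
                    if node is not None:
                        result.append(node)
                        current_merge_stack[idx] = None
        return result

    # A trunk merge (revno 2) of a branch that touched the file in
    # 1.1.1 is kept, while revno 1, which did not touch it, is dropped:
    revs = [('rev-2', '2', 0), ('rev-1.1.1', '1.1.1', 1), ('rev-1', '1', 0)]
    print(filter_view_revisions(revs, set(['rev-1.1.1'])))
    # -> [('rev-2', '2', 0), ('rev-1.1.1', '1.1.1', 1)]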
-------------- next part --------------
=== modified file 'bzrlib/log.py'
--- a/bzrlib/log.py 2008-09-18 20:43:43 +0000
+++ b/bzrlib/log.py 2008-09-19 03:21:39 +0000
@@ -269,7 +269,8 @@
         view_revisions = _filter_revisions_touching_file_id(branch,
                                                             specific_fileid,
                                                             mainline_revs,
-                                                            view_revisions)
+                                                            view_revisions,
+                                                            direction)
     # rebase merge_depth - unless there are no revisions or
     # either the first or last revision have merge_depth = 0.
@@ -537,7 +538,7 @@
 def _filter_revisions_touching_file_id(branch, file_id, mainline_revisions,
-                                       view_revs_iter):
+                                       view_revs_iter, direction):
     """Return the list of revision ids which touch a given file id.
 
     The function filters view_revisions and returns a subset.
@@ -557,18 +558,7 @@
     :return: A list of (revision_id, dotted_revno, merge_depth) tuples.
     """
     # find all the revisions that change the specific file
-    # build the ancestry of each revision in the graph
-    #  - only listing the ancestors that change the specific file.
-    graph = branch.repository.get_graph()
-    # This asks for all mainline revisions, which means we only have to spider
-    # sideways, rather than depth history. That said, its still size-of-history
-    # and should be addressed.
-    # mainline_revisions always includes an extra revision at the beginning, so
-    # don't request it.
-    parent_map = dict(((key, value) for key, value in
-        graph.iter_ancestry(mainline_revisions[1:]) if value is not None))
-    sorted_rev_list = tsort.topo_sort(parent_map)
-    text_keys = [(file_id, rev_id) for rev_id in sorted_rev_list]
+    text_keys = [(file_id, rev_id) for rev_id, revno, depth in view_revs_iter]
     # Do a direct lookup of all possible text keys, and figure out which ones
     # are actually present, and then convert it back to revision_ids, since the
     # file_id prefix is shared by everything.
@@ -578,99 +568,31 @@
     modified_text_revisions = set(key[1] for key in text_parent_map)
     del text_parent_map
     del text_keys
-    # Algorithms tried:
-    #  a single dictionary mapping tree_revision_id => file_ancestry
-    #    file_ancestry_as_tuple      50.3s  272MB
-    #    file_ancestry_as_frozenset  42.1s  409MB
-    #    file_ancestry_as_mixed (this was bad, because it gets confused and
-    #      cannot match up revisions properly)
-    #  a dictionary tree_revision_id => offset, and a list of offset =>
-    #    file_ancestry
-    #    file_ancestry_as_tuple      50.3s  272MB
-    #    file_ancestry_as_mixed      42.8s  399MB
-    #    file_ancestry_as_frozenset  42.1s  410MB
-    #  a dictionary tree_revision_id => file_rev_id, and another dict
-    #    file_rev_id => file_ancestry
-    #    file_ancestry_as_tuple      50.4s  272MB
-    #    file_ancestry_as_mixed      43.3s  399MB
-    #    file_ancestry_as_frozenset  41.6s  410MB
-
-    # ancestry_values contains a pointer from a revision_id to either a tuple,
-    # or a frozenset() of a given per-file ancestry.
-    ancestry_values = {_mod_revision.NULL_REVISION: frozenset()}
-    for rev in sorted_rev_list:
-        parents = parent_map[rev]
-        rev_ancestry = None
-        if rev in modified_text_revisions:
-            # We know this will be a new node
-            rev_ancestry = [rev]
-        for parent in parents:
-            if parent not in ancestry_values:
-                # parent is a Ghost, which won't be present in
-                # sorted_rev_list, but we may access it later, so create an
-                # empty node for it
-                ancestry_values[parent] = frozenset()
-            if rev_ancestry is None:
-                # We don't have anything worked out yet, so just copy this
-                # parent's ancestry_pointer
-                rev_ancestry = ancestry_values[parent]
-            else:
-                # Check to see if this other parent introduces any new
-                # revisions. If it doesn't then we don't need to create a new
-                # set.
-                parent_ancestry = ancestry_values[parent]
-                if rev_ancestry is None:
-                    rev_ancestry = ancestry_values[rev_pointer]
-                if parent_ancestry is rev_ancestry:
-                    # They both point to the same ancestry value, so we know
-                    # there is nothing new
-                    continue
-                parent_ancestry = frozenset(parent_ancestry)
-                new_revisions = parent_ancestry.difference(rev_ancestry)
-                if new_revisions:
-                    # We need something that we can modify. We can use a simple
-                    # list, because we are only adding the 'new_revisions', so
-                    # we know that we won't have duplicates.
-                    if not isinstance(rev_ancestry, list):
-                        rev_ancestry = list(rev_ancestry)
-                    rev_ancestry.extend(new_revisions)
-        if rev_ancestry is None:
-            rev_ancestry = frozenset()
-        elif isinstance(rev_ancestry, list):
-            # We can use "tuple()" here to save memory at the cost of CPU, or
-            # use frozenset() which saves CPU but consumes RAM.
-            # We can also 'switch' based on the length of the ancestry.
-            # That gives the most direct control over memory versus CPU.
-            # For now, just going with the more performant frozenset()
-            # Also, note that both tuple and frozenset just reference the
-            # passed in object if they are of the same type. (Unlike list and
-            # set which create a new copy.)
-            #   if isinstance(x, tuple):
-            #       assert x is tuple(x)
-            # You have the same:
-            #   if isinstance(y, frozenset):
-            #       assert y is frozenset(y)
-            rev_ancestry = frozenset(rev_ancestry)
-        ancestry_values[rev] = rev_ancestry
-
-    def is_merging_rev(r):
-        parents = parent_map[r]
-        if len(parents) > 1:
-            leftparent = parents[0]
-            left_ancestry = ancestry_values[leftparent]
-            for rightparent in parents[1:]:
-                right_ancestry = ancestry_values[rightparent]
-                if left_ancestry is right_ancestry:
-                    continue
-                left_ancestry = frozenset(left_ancestry)
-                if not left_ancestry.issuperset(right_ancestry):
-                    return True
-        return False
-
-    # filter from the view the revisions that did not change or merge
-    # the specific file
-    result = [(r, n, d) for r, n, d in view_revs_iter
-              if r in modified_text_revisions or is_merging_rev(r)]
+
+    result = []
+    if direction == 'forward':
+        view_revs_iter = reverse_by_depth(view_revs_iter)
+    # Track what revisions will merge the current revision, replace entries
+    # with 'None' when they have been added to result
+    current_merge_stack = [None]
+    for info in view_revs_iter:
+        rev_id, revno, depth = info
+        assert depth <= len(current_merge_stack)
+        if depth == len(current_merge_stack):
+            current_merge_stack.append(info)
+        else:
+            del current_merge_stack[depth + 1:]
+            current_merge_stack[-1] = info
+
+        if rev_id in modified_text_revisions:
+            # This needs to be logged, along with the extra revisions
+            for idx in xrange(len(current_merge_stack)):
+                node = current_merge_stack[idx]
+                if node is not None:
+                    result.append(node)
+                    current_merge_stack[idx] = None
+    if direction == 'forward':
+        result = reverse_by_depth(result)
     return result
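As an aside, the identity claim in the removed comments above (that
tuple() and frozenset() return the passed-in object unchanged when it
is already exactly that type, unlike list() and set(), which always
copy) is easy to verify in CPython:

    t = (1, 2, 3)
    assert tuple(t) is t        # an exact tuple is returned as-is
    f = frozenset(t)
    assert frozenset(f) is f    # likewise for an exact frozenset
    l = [1, 2, 3]
    assert list(l) is not l     # list() always builds a new list
    s = set(l)
    assert set(s) is not s      # set() always builds a new set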