Rev 3718: Instead of using a dictionary for cached ancestries, in http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/lighter_log_file
John Arbash Meinel
john at arbash-meinel.com
Thu Sep 18 20:17:52 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/lighter_log_file
------------------------------------------------------------
revno: 3718
revision-id: john at arbash-meinel.com-20080918191751-wk23agjzhkiw2wo5
parent: john at arbash-meinel.com-20080918190406-r75lzwnmjjth94h6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: lighter_log_file
timestamp: Thu 2008-09-18 14:17:51 -0500
message:
Instead of using a dictionary for cached ancestries,
use a list. This also lets us use integer offsets in our mapping,
rather than revision ids.
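
A rough standalone sketch of the data-structure change described above (hypothetical names, not the actual bzrlib code): rather than mapping each whole-tree revision_id straight to its per-file ancestry frozenset, the ancestries live in a list and a separate dict maps revision_id to an integer offset into that list, so revisions with identical ancestries share one entry and can be compared by offset alone.

# Sketch only; record()/share() are illustrative helpers, not bzrlib APIs.
ancestry_values = [frozenset()]   # offset 0 is the empty per-file ancestry
ancestry_pointers = {}            # whole-tree revision_id -> offset

def record(rev_id, ancestry):
    # Append a new distinct ancestry and point rev_id at it.
    ancestry_pointers[rev_id] = len(ancestry_values)
    ancestry_values.append(frozenset(ancestry))

def share(rev_id, other_rev_id):
    # rev_id's per-file ancestry is identical to other_rev_id's.
    ancestry_pointers[rev_id] = ancestry_pointers[other_rev_id]

record('rev-1', ['rev-1'])
record('rev-2', ['rev-1', 'rev-2'])
share('rev-1.1.1', 'rev-1')       # a revision that adds nothing new
assert ancestry_pointers['rev-1.1.1'] == ancestry_pointers['rev-1']

Equal offsets mean identical ancestries, so "did this merge bring anything new?" can often be answered with an integer comparison instead of building and comparing frozensets.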
-------------- next part --------------
=== modified file 'bzrlib/log.py'
--- a/bzrlib/log.py 2008-09-18 19:04:06 +0000
+++ b/bzrlib/log.py 2008-09-18 19:17:51 +0000
@@ -580,57 +580,85 @@
del text_parent_map
del text_keys
tmodified_text_revisions = time.time()
- # ancestry_values contains a pointer from a revision_id to either a tuple,
- # or a frozenset() of a given per-file ancestry.
- ancestry_values = {_mod_revision.NULL_REVISION: frozenset()}
- _hits = [0, 0, 0, 0, 0, 0, 0, 0]
+ # Algorithms tried:
+ # a single dictionary mapping tree_revision_id => file_ancestry
+ # file_ancestry_as_tuple
+ # file_ancestry_as_frozenset
+ # file_ancestry_as_mixed (this was bad, because it gets confused and
+ # cannot match up revisions properly)
+ # a dictionary tree_revision_id => offset, and a list of offset =>
+ # file_ancestry
+ # file_ancestry_as_tuple 50.3s 272MB
+ # file_ancestry_as_frozenset 42.1s 410MB
+ # file_ancestry_as_mixed 42.8s 399MB
+ # a dictionary tree_revision_id => file_rev_id, and another dict
+ # file_rev_id => file_ancestry
+
+ # ancestry_values is a list of per-file ancestries. When we need a new
+ # one, we just append it to the list.
+ ancestry_values = [frozenset()]
+ # ancestry_pointers maps from a whole-tree revision_id into the
+ # ancestry_values list. (So given a whole-tree revision_id, what is the
+ # offset to the per-file ancestry for that revision.)
+ ancestry_pointers = {}
+ _hits = [0, 0, 0, 0, 0, 0, 0]
for rev in sorted_rev_list:
parents = parent_map[rev]
+ rev_pointer = None
rev_ancestry = None
if rev in modified_text_revisions:
# We know this will be a new node
rev_ancestry = [rev]
+ rev_pointer = len(ancestry_values)
for parent in parents:
- if parent not in ancestry_values:
+ if parent not in ancestry_pointers:
# parent is a Ghost, which won't be present in
# sorted_rev_list, but we may access it later, so create an
# empty node for it
- ancestry_values[parent] = frozenset()
- if rev_ancestry is None:
+ ancestry_pointers[parent] = 0
+ if rev_pointer is None:
# We don't have anything worked out yet, so just copy this
# parent's ancestry_pointer
- rev_ancestry = ancestry_values[parent]
+ rev_pointer = ancestry_pointers[parent]
else:
# Check to see if this other parent introduces any new
# revisions. If it doesn't then we don't need to create a new
# set.
- parent_ancestry = ancestry_values[parent]
- if rev_ancestry is None:
- rev_ancestry = ancestry_values[rev_pointer]
- if parent_ancestry is rev_ancestry:
+ parent_pointer = ancestry_pointers[parent]
+ if parent_pointer == rev_pointer:
# They both point to the same ancestry value, so we know
# there is nothing new
_hits[5] += 1
continue
- parent_ancestry = frozenset(parent_ancestry)
+ if rev_ancestry is None:
+ rev_ancestry = ancestry_values[rev_pointer]
+ parent_ancestry = frozenset(ancestry_values[parent_pointer])
+ # We had to compute the frozenset, so just cache it
+ # ancestry_values[parent_pointer] = parent_ancestry
new_revisions = parent_ancestry.difference(rev_ancestry)
if new_revisions:
# We need something that we can modify. We can use a simple
# list, because we are only adding the 'new_revisions', so
# we know that we won't have duplicates.
if not isinstance(rev_ancestry, list):
+ _hits[6] += 1
rev_ancestry = list(rev_ancestry)
rev_ancestry.extend(new_revisions)
- _hits[6] += 1
- else:
- _hits[7] += 1
- if rev_ancestry is None:
- rev_ancestry = frozenset()
- elif isinstance(rev_ancestry, list):
- # We can use "tuple()" here to save memory at the cost of CPU, or
- # use frozenset() which saves CPU but consumes RAM.
- rev_ancestry = frozenset(rev_ancestry)
- ancestry_values[rev] = rev_ancestry
+ # We will be creating a new value
+ rev_pointer = len(ancestry_values)
+ if rev_pointer is None:
+ # Point to the empty set
+ rev_pointer = 0
+ elif rev_pointer == len(ancestry_values):
+ # We are creating a new node; update the value cache.
+ assert rev_ancestry is not None
+ # We use frozenset() here because we know that we will need to
+ # compute it for 99% of the revisions. Alternatively, tuple() would
+ # save memory at the cost of re-computing the ancestry set on
+ # demand.
+ ancestry_values.append(frozenset(rev_ancestry))
+ assert rev_pointer < len(ancestry_values)
+ ancestry_pointers[rev] = rev_pointer
trev_ancestry = time.time()
@@ -640,22 +668,26 @@
if len(parents) > 1:
_hits[1] += 1
leftparent = parents[0]
- left_ancestry = ancestry_values[leftparent]
+ left_pointer = ancestry_pointers[leftparent]
+ left_ancestry = None
for rightparent in parents[1:]:
- right_ancestry = ancestry_values[rightparent]
- if left_ancestry is right_ancestry:
+ right_pointer = ancestry_pointers[rightparent]
+ if right_pointer != left_pointer:
+ _hits[3] += 1
+ if left_ancestry is None:
+ left_ancestry = ancestry_values[left_pointer]
+ left_ancestry = frozenset(left_ancestry)
+ right_ancestry = ancestry_values[right_pointer]
+ if not left_ancestry.issuperset(right_ancestry):
+ _hits[4] += 1
+ return True
+ else:
_hits[2] += 1
- continue
- _hits[3] += 1
- left_ancestry = frozenset(left_ancestry)
- if not left_ancestry.issuperset(right_ancestry):
- _hits[4] += 1
- return True
return False
trace.note('Found %s nodes, and %s unique values for %s view_revs'
' modified_text_revisions: %s',
- 0, len(ancestry_values),
+ len(ancestry_pointers), len(ancestry_values),
len(view_revs_iter), len(modified_text_revisions))
# filter from the view the revisions that did not change or merge
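
For reference, the superset check in the second hunk boils down to roughly the following (a sketch with hypothetical names, not the actual bzrlib function): when both parents point at the same offset the comparison is skipped entirely, and the left parent's frozenset is only built lazily once a differing pointer is seen.

# Sketch of the pointer-based merge check; names are illustrative only.
def merge_adds_new_file_revisions(parents, ancestry_pointers, ancestry_values):
    # True if any right-hand parent has per-file revisions that the
    # left-hand parent's ancestry does not already contain.
    left_pointer = ancestry_pointers[parents[0]]
    left_ancestry = None
    for rightparent in parents[1:]:
        right_pointer = ancestry_pointers[rightparent]
        if right_pointer == left_pointer:
            # Same offset => same ancestry; nothing new from this parent.
            continue
        if left_ancestry is None:
            # Build the comparable set lazily, only when pointers differ.
            left_ancestry = frozenset(ancestry_values[left_pointer])
        if not left_ancestry.issuperset(ancestry_values[right_pointer]):
            return True
    return False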