Rev 32: Merge my changes to include better id matching. in http://bazaar.launchpad.net/~bzr/bzr-stats/trunk
John Arbash Meinel
john at arbash-meinel.com
Fri Jan 15 21:50:23 GMT 2010
At http://bazaar.launchpad.net/~bzr/bzr-stats/trunk
------------------------------------------------------------
revno: 32 [merge]
revision-id: john at arbash-meinel.com-20100115215000-um0yp6r000l79go7
parent: jelmer at samba.org-20090717180709-5e1i4jjrdb9aqx22
parent: john at arbash-meinel.com-20100115213257-jupa04qkt5fdkm82
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Fri 2010-01-15 15:50:00 -0600
message:
Merge my changes to include better id matching.
modified:
__init__.py __init__.py-20060629132721-mkbaty0vfk4y3v59-1
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py 2009-07-17 18:07:09 +0000
+++ b/__init__.py 2010-01-15 21:32:57 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005-2008 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -23,6 +23,7 @@
config,
errors,
option,
+ trace,
tsort,
ui,
workingtree,
@@ -32,19 +33,7 @@
""")
-def find_fullnames(lst):
- """Find the fullnames for a list committer names."""
-
- counts = {}
- for committer in lst:
- fullname = config.parse_username(committer)[0]
- counts.setdefault(fullname, 0)
- counts[fullname] += 1
- return sorted(((count, name) for name,count in counts.iteritems()),
- reverse=True)
-
-
-def collapse_by_person(committers):
+def collapse_by_person(revisions, canonical_committer):
"""The committers list is sorted by email, fix it up by person.
Some people commit with a similar username, but different email
@@ -55,65 +44,116 @@
So take the most common username for each email address, and
combine them into one new list.
"""
- # Just an indirection so that multiple names can reference
- # the same record information
- name_to_counter = {}
- # indirection back to real information
- # [[full_rev_list], {email:count}, {fname:count}]
- counter_to_info = {}
- counter = 0
- for email, revs in committers.iteritems():
- authors = []
- for rev in revs:
- authors += rev.get_apparent_authors()
- fullnames = find_fullnames(authors)
- match = None
- for count, fullname in fullnames:
- if fullname and fullname in name_to_counter:
- # We found a match
- match = name_to_counter[fullname]
- break
-
- if match:
- # One of the names matched, we need to collapse to records
- record = counter_to_info[match]
- record[0].extend(revs)
- record[1][email] = len(revs)
- for count, fullname in fullnames:
- name_to_counter[fullname] = match
- record[2].setdefault(fullname, 0)
- record[2][fullname] += count
- else:
- # just add this one to the list
- counter += 1
- for count, fullname in fullnames:
- if fullname:
- name_to_counter[fullname] = counter
- fname_map = dict((fullname, count) for count, fullname in fullnames)
- counter_to_info[counter] = [revs, {email:len(revs)}, fname_map]
- return sorted(((len(revs), revs, email, fname)
- for revs, email, fname in counter_to_info.values()), reverse=True)
-
-
-def sort_by_committer(a_repo, revids):
- committers = {}
+ # Map from canonical committer to
+ # {committer: ([rev_list], {email: count}, {fname:count})}
+ committer_to_info = {}
+ for rev in revisions:
+ authors = rev.get_apparent_authors()
+ for author in authors:
+ username, email = config.parse_username(author)
+ canon_author = canonical_committer[(username, email)]
+ info = committer_to_info.setdefault(canon_author, ([], {}, {}))
+ info[0].append(rev)
+ info[1][email] = info[1].setdefault(email, 0) + 1
+ info[2][username] = info[2].setdefault(username, 0) + 1
+ res = [(len(revs), revs, emails, fnames)
+ for revs, emails, fnames in committer_to_info.itervalues()]
+ res.sort(reverse=True)
+ return res
+
+
+def collapse_email_and_users(email_users, combo_count):
+ """Combine the mapping of User Name to email and email to User Name.
+
+ If a given User Name is used for multiple emails, try to map it all to one
+ entry.
+ """
+ id_to_combos = {}
+ username_to_id = {}
+ email_to_id = {}
+ id_counter = 0
+
+ def collapse_ids(old_id, new_id, new_combos):
+ old_combos = id_to_combos.pop(old_id)
+ new_combos.update(old_combos)
+ for old_user, old_email in old_combos:
+ if (old_user and old_user != user):
+ old_user_id = username_to_id[old_user]
+ assert old_user_id in (old_id, new_id)
+ username_to_id[old_user] = new_id
+ if (old_email and old_email != email):
+ old_email_id = email_to_id[old_email]
+ assert old_email_id in (old_id, new_id)
+ email_to_id[old_email] = cur_id
+ for email, usernames in email_users.iteritems():
+ assert email not in email_to_id
+ if not email:
+ # We use a different algorithm for usernames that have no email
+ # address, we just try to match by username, and not at all by
+ # email
+ for user in usernames:
+ if not user:
+ continue # The mysterious ('', '') user
+ user_id = username_to_id.get(user)
+ if user_id is None:
+ id_counter += 1
+ user_id = id_counter
+ username_to_id[user] = user_id
+ id_to_combos[user_id] = id_combos = set()
+ else:
+ id_combos = id_combos[user_id]
+ id_combos.add((user, email))
+ continue
+
+ id_counter += 1
+ cur_id = id_counter
+ id_to_combos[cur_id] = id_combos = set()
+ email_to_id[email] = cur_id
+
+ for user in usernames:
+ combo = (user, email)
+ id_combos.add(combo)
+ if not user:
+ # We don't match on empty usernames
+ continue
+ user_id = username_to_id.get(user)
+ if user_id is not None:
+ # This UserName was matched to a cur_id
+ if user_id != cur_id:
+ # And it is a different identity than the current email
+ collapse_ids(user_id, cur_id, id_combos)
+ username_to_id[user] = cur_id
+ combo_to_best_combo = {}
+ for cur_id, combos in id_to_combos.iteritems():
+ best_combo = sorted(combos,
+ key=lambda x:combo_count[x],
+ reverse=True)[0]
+ for combo in combos:
+ combo_to_best_combo[combo] = best_combo
+ return combo_to_best_combo
+
+
+def get_revisions_and_committers(a_repo, revids):
+ """Get the Revision information, and the best-match for committer."""
+
+ email_users = {} # user@email.com => set of User Names
+ combo_count = {}
pb = ui.ui_factory.nested_progress_bar()
try:
- pb.note('getting revisions')
+ trace.note('getting revisions')
revisions = a_repo.get_revisions(revids)
for count, rev in enumerate(revisions):
pb.update('checking', count, len(revids))
for author in rev.get_apparent_authors():
- username = config.parse_username(author)
- if username[1] == '':
- email = username[0]
- else:
- email = username[1]
- committers.setdefault(email, []).append(rev)
+ # XXX: There is a chance sometimes with svn imports that the
+ # full name and email can BOTH be blank.
+ username, email = config.parse_username(author)
+ email_users.setdefault(email, set()).add(username)
+ combo = (username, email)
+ combo_count[combo] = combo_count.setdefault(combo, 0) + 1
finally:
pb.finished()
-
- return committers
+ return revisions, collapse_email_and_users(email_users, combo_count)
def get_info(a_repo, revision):
@@ -121,15 +161,14 @@
pb = ui.ui_factory.nested_progress_bar()
a_repo.lock_read()
try:
- pb.note('getting ancestry')
+ trace.note('getting ancestry')
ancestry = a_repo.get_ancestry(revision)[1:]
-
- committers = sort_by_committer(a_repo, ancestry)
+ revs, canonical_committer = get_revisions_and_committers(a_repo, ancestry)
finally:
a_repo.unlock()
pb.finished()
- return collapse_by_person(committers)
+ return collapse_by_person(revs, canonical_committer)
def get_diff_info(a_repo, start_rev, end_rev):
@@ -138,7 +177,6 @@
This lets us figure out what has actually changed between 2 revisions.
"""
pb = ui.ui_factory.nested_progress_bar()
- committers = {}
a_repo.lock_read()
try:
pb.note('getting ancestry 1')
@@ -146,23 +184,12 @@
pb.note('getting ancestry 2')
ancestry = a_repo.get_ancestry(end_rev)[1:]
ancestry = [rev for rev in ancestry if rev not in start_ancestry]
- pb.note('getting revisions')
- revisions = a_repo.get_revisions(ancestry)
-
- for count, rev in enumerate(revisions):
- pb.update('checking', count, len(ancestry))
- for author in rev.get_apparent_authors():
- try:
- email = config.extract_email_address(author)
- except errors.BzrError:
- email = author
- committers.setdefault(email, []).append(rev)
+ revs, canonical_committer = sort_by_committer(a_repo, ancestry)
finally:
a_repo.unlock()
pb.finished()
- info = collapse_by_person(committers)
- return info
+ return collapse_by_person(revs, canonical_committer)
def display_info(info, to_file, gather_class_stats=None):
@@ -176,9 +203,7 @@
sorted_fullnames = sorted(((count, fullname)
for fullname,count in fullnames.iteritems()),
reverse=True)
- # There is a chance sometimes with svn imports that the full name and
- # email can BOTH be blank.
- if sorted_fullnames[0][1] == '':
+ if sorted_fullnames[0][1] == '' and sorted_emails[0][1] == '':
to_file.write('%4d %s\n'
% (count, 'Unknown'))
else:
@@ -186,15 +211,15 @@
% (count, sorted_fullnames[0][1],
sorted_emails[0][1]))
if len(sorted_fullnames) > 1:
- print ' Other names:'
- for count, fname in sorted_fullnames[1:]:
+ to_file.write(' Other names:\n')
+ for count, fname in sorted_fullnames:
to_file.write(' %4d ' % (count,))
if fname == '':
to_file.write("''\n")
else:
to_file.write("%s\n" % (fname,))
if len(sorted_emails) > 1:
- print ' Other email addresses:'
+ to_file.write(' Other email addresses:\n')
for count, email in sorted_emails:
to_file.write(' %4d ' % (count,))
if email == '':
More information about the bazaar-commits
mailing list