Rev 29: Tremendously faster, but gives different results wrt ghosts, in http://bzr.arbash-meinel.com/plugins/history_db
John Arbash Meinel
john at arbash-meinel.com
Fri Apr 2 23:44:32 BST 2010
At http://bzr.arbash-meinel.com/plugins/history_db
------------------------------------------------------------
revno: 29
revision-id: john at arbash-meinel.com-20100402224358-yk2z3ukcwo504v66
parent: john at arbash-meinel.com-20100402222713-05ufbktsvladp3lm
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-02 17:43:58 -0500
message:
Tremendously faster, but gives different results wrt ghosts.
Specifically, the merge_sort result omits ghosts, so they aren't in the dotted_revno
table. However the *time* spent is great. Down to 45ms to get the entire
ancestry for an arbitrary branch. (vs 161ms using the mainline-then-merged
method, and 197ms for the walk-db-parent table method.)
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py 2010-04-02 22:27:13 +0000
+++ b/__init__.py 2010-04-02 22:43:58 +0000
@@ -156,6 +156,7 @@
_ancestry_walk_types.register('db-rev-id', None)
_ancestry_walk_types.register('db-db-id', None)
_ancestry_walk_types.register('db-range', None)
+_ancestry_walk_types.register('db-dotted', None)
_ancestry_walk_types.register('bzr-iter-anc', None)
_ancestry_walk_types.register('bzr-kg', None)
@@ -182,21 +183,23 @@
if method.startswith('db'):
query = history_db.Querier(db, b)
if method == 'db-db-id':
- count = query.walk_ancestry_db_ids()
+ ancestors = query.walk_ancestry_db_ids()
elif method == 'db-rev-id':
- count = query.walk_ancestry()
+ ancestors = query.walk_ancestry()
+ elif method == 'db-dotted':
+ ancestors = query.walk_ancestry_range_and_dotted()
else:
assert method == 'db-range'
- count = query.walk_ancestry_range()
+ ancestors = query.walk_ancestry_range()
trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
elif method == 'bzr-iter-anc':
g = b.repository.get_graph()
- count = len(list(g.iter_ancestry([b.last_revision()])))
+ ancestors = list(g.iter_ancestry([b.last_revision()]))
elif method == 'bzr-kg':
kg = b.repository.revisions.get_known_graph_ancestry(
[(b.last_revision(),)])
- count = len(kg._nodes)
- self.outf.write('Found %d ancestors\n' % (count,))
+ ancestors = len(kg._nodes)
+ self.outf.write('Found %d ancestors\n' % (len(ancestors),))
trace.note('Time: %.3fs' % (time.time() - t,))
commands.register_command(cmd_create_history_db)
=== modified file 'history_db.py'
--- a/history_db.py 2010-04-02 22:27:13 +0000
+++ b/history_db.py 2010-04-02 22:43:58 +0000
@@ -414,7 +414,7 @@
next_parents = [p[0] for p in parents if p[0] not in all]
all.update(next_parents)
remaining.extend(next_parents)
- return len(all)
+ return all
def walk_ancestry_db_ids(self):
_exec = self._cursor.execute
@@ -424,14 +424,14 @@
remaining = [db_id]
while remaining:
self._stats['num_steps'] += 1
- next = remaining[:1000]
+ next = remaining[:100]
remaining = remaining[len(next):]
res = _exec("SELECT parent FROM parent WHERE child in (%s)"
% (', '.join('?'*len(next))), tuple(next))
next_p = [p[0] for p in res if p[0] not in all_ancestors]
all_ancestors.update(next_p)
remaining.extend(next_p)
- return len(all_ancestors)
+ return all_ancestors
def walk_ancestry_range(self):
"""Walk the whole ancestry.
@@ -439,20 +439,68 @@
Use the mainline_parent_range/mainline_parent table to speed things up.
"""
_exec = self._cursor.execute
- all_ancestors = set()
- db_id = self._get_db_id(self._branch_tip_rev_id)
- all_ancestors.add(db_id)
- remaining = [db_id]
+ # All we are doing is pre-seeding the search with all the mainline
+ # revisions, we could probably do more with interleaving calls to
+ # mainline with calls to parents but this is easier to write :)
+ all_mainline = self.walk_mainline_using_ranges()
+ t = time.time()
+ all_ancestors = set(all_mainline)
+ remaining = list(all_mainline)
while remaining:
self._stats['num_steps'] += 1
- next = remaining[:1000]
+ next = remaining[:100]
remaining = remaining[len(next):]
res = _exec("SELECT parent FROM parent WHERE child in (%s)"
% (', '.join('?'*len(next))), tuple(next))
next_p = [p[0] for p in res if p[0] not in all_ancestors]
all_ancestors.update(next_p)
remaining.extend(next_p)
- return len(all_ancestors)
+ self._stats['query_time'] += (time.time() - t)
+ # Using this shortcut to grab the mainline first helps, but not a lot.
+ # Probably because the limiting factor is the 'child in (...)' step,
+ # which is 100 entries or so. (note that setting the range to :1000
+ # shows a failure, which indicates the old code path was definitely
+ # capped at a maximum range.)
+ # 1.719s walk_ancestry
+ # 0.198s walk_ancestry_db_ids
+ # 0.164s walk_mainline_using_ranges
+ return all_ancestors
+
+ def walk_ancestry_range_and_dotted(self):
+ """Walk the whole ancestry.
+
+ Use the information from the dotted_revno table and the mainline_parent
+ table to speed things up.
+ """
+ db_id = self._get_db_id(self._branch_tip_rev_id)
+ all_ancestors = set()
+ t = time.time()
+ while db_id is not None:
+ self._stats['num_steps'] += 1
+ range_res = self._cursor.execute(
+ "SELECT pkey, tail"
+ " FROM mainline_parent_range"
+ " WHERE head = ?"
+ " ORDER BY count DESC LIMIT 1",
+ (db_id,)).fetchone()
+ if range_res is None:
+ next_db_id = self._get_lh_parent_db_id(db_id)
+ merged_revs = self._cursor.execute(
+ "SELECT merged_revision FROM dotted_revno"
+ " WHERE tip_revision = ?",
+ (db_id,)).fetchall()
+ all_ancestors.update([r[0] for r in merged_revs])
+ else:
+ pkey, next_db_id = range_res
+ merged_revs = self._cursor.execute(
+ "SELECT merged_revision FROM dotted_revno, mainline_parent"
+ " WHERE tip_revision = mainline_parent.revision"
+ " AND mainline_parent.range = ?",
+ (pkey,)).fetchall()
+ all_ancestors.update([r[0] for r in merged_revs])
+ db_id = next_db_id
+ self._stats['query_time'] += (time.time() - t)
+ return all_ancestors
def heads(self, revision_ids):
"""Compute Graph.heads() on the given data."""
More information about the bazaar-commits
mailing list