Rev 101: Change how lookups for dotted revno<=>revision_id work. in http://bzr.arbash-meinel.com/branches/bzr/history_db/trunk
John Arbash Meinel
john at arbash-meinel.com
Thu Apr 15 20:10:42 BST 2010
At http://bzr.arbash-meinel.com/branches/bzr/history_db/trunk
------------------------------------------------------------
revno: 101
revision-id: john at arbash-meinel.com-20100415191015-ttshezx11mr3w4fd
parent: john at arbash-meinel.com-20100415173653-ikarwcl4xou56wa4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Thu 2010-04-15 14:10:15 -0500
message:
Change how lookups for dotted revno<=>revision_id work.
Instead of just searching until you find an entry, find the range where
the entry is present, and then load all revnos from that range.
This has a very positive effect on the 'files' page, dropping it from 10+s
down to 3.8s.
It has a negative effect on 'bzr log' times, as we now page in more stuff
than we use. (Mostly because iter_merge_sorted gets called if we actually
need any of those merged revisions.)
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py 2010-04-15 17:11:43 +0000
+++ b/__init__.py 2010-04-15 19:10:15 +0000
@@ -445,15 +445,16 @@
t1 = time.clock()
revision_id_map = query.get_dotted_revno_range_multi([revision_id])
t2 = time.clock()
- trace.note('history_db rev=>dotted %s took %.3fs, %.3fs to init,'
- ' %.3fs to query' % (revision_id_map.values(),
- t2-t0, t1-t0, t2-t1))
self._partial_revision_id_to_revno_cache.update(revision_id_map)
if revision_id not in revision_id_map:
trace.mutter('history_db failed to find a mapping for {%s},'
'falling back' % (revision_id,))
- return _orig_do_rev_id_to_dotted(self, revno)
+ return _orig_do_rev_id_to_dotted(self, revision_id)
+ revno = revision_id_map[revision_id]
+ trace.note('history_db rev=>dotted %s added %s took %.3fs, %.3fs to init,'
+ ' %.3fs to query' % (revno, len(revision_id_map),
+ t2-t0, t1-t0, t2-t1))
return revision_id_map[revision_id]
=== modified file 'history_db.py'
--- a/history_db.py 2010-04-15 17:36:53 +0000
+++ b/history_db.py 2010-04-15 19:10:15 +0000
@@ -1310,48 +1310,88 @@
t = time.time()
tip_db_id = self._get_db_id(self._branch_tip_rev_id)
db_ids = set()
- db_id_to_rev_id = {}
+ # TODO: We could probably do this cheaper, such as using
+ # ensure_revisions, or some other N-way query
for rev_id in revision_ids:
db_id = self._get_db_id(rev_id)
if db_id is None:
import pdb; pdb.set_trace()
db_ids.add(db_id)
- db_id_to_rev_id[db_id] = rev_id
revnos = {}
while tip_db_id is not None and db_ids:
- self._stats['num_steps'] += 1
- range_res = self._cursor.execute(
- "SELECT pkey, tail"
- " FROM mainline_parent_range"
- " WHERE head = ?"
- " ORDER BY count DESC LIMIT 1",
- (tip_db_id,)).fetchone()
- if range_res is None:
- revno_res = self._cursor.execute(_add_n_params(
- "SELECT merged_revision, revno FROM dotted_revno"
+ tip_db_id, next_db_id, range_key = self._find_next_range(tip_db_id,
+ list(db_ids), 'merged_revision')
+ if range_key is None:
+ revno_res = self._cursor.execute(
+ "SELECT merged_revision, revision_id, revno"
+ " FROM dotted_revno, revision"
" WHERE tip_revision = ?"
- " AND merged_revision IN (%s)",
- len(db_ids)),
- [tip_db_id] + list(db_ids)).fetchall()
+ " AND merged_revision = db_id",
+ [tip_db_id]).fetchall()
next_db_id = self._get_lh_parent_db_id(tip_db_id)
else:
- pkey, next_db_id = range_res
- revno_res = self._cursor.execute(_add_n_params(
- "SELECT merged_revision, revno"
- " FROM dotted_revno, mainline_parent"
+ revno_res = self._cursor.execute(
+ "SELECT merged_revision, revision_id, revno"
+ " FROM dotted_revno, revision, mainline_parent"
" WHERE tip_revision = mainline_parent.revision"
- " AND mainline_parent.range = ?"
- " AND merged_revision IN (%s)",
- len(db_ids)),
- [pkey] + list(db_ids)).fetchall()
+ " AND merged_revision = db_id"
+ " AND mainline_parent.range = ?",
+ [range_key]).fetchall()
tip_db_id = next_db_id
- for db_id, revno in revno_res:
+ self._stats['dotted_revno_loaded'] += len(revno_res)
+ for db_id, rev_id, revno in revno_res:
db_ids.discard(db_id)
- revnos[db_id_to_rev_id[db_id]] = tuple(map(int,
- revno.split('.')))
+ revnos[rev_id] = tuple(map(int, revno.split('.')))
self._stats['query_time'] += (time.time() - t)
return revnos
+ def _find_next_range(self, tip_db_id, look_for, query_field):
+ """Step back through mainline, finding any child entries.
+
+ You can search for revnos or revision_ids this way. For example,
+ look_for can be a list of revno_strs, and then query_field would be
+ "revno". Or look_for could be db_ids, and then query_field would
+ be "merged_revision".
+
+ :param tip_db_id: The current search tip
+ :param look_for: A list of things we want to find
+ :param query_field: The field name that we are looking for 'look_for'
+ :return: (tip_db_id, next_db_id, range_key)
+ """
+ # This doesn't actually pull out the data, it just walks quickly until
+ # it finds something
+ while tip_db_id is not None:
+ self._stats['num_steps'] += 1
+ range_res = self._cursor.execute(
+ "SELECT pkey, tail"
+ " FROM mainline_parent_range"
+ " WHERE head = ?"
+ " ORDER BY count DESC LIMIT 1",
+ (tip_db_id,)).fetchone()
+ if range_res is None:
+ range_key = None
+ search_res = self._cursor.execute(_add_n_params(
+ "SELECT 1"
+ " FROM dotted_revno"
+ " WHERE tip_revision = ?"
+ " AND " + query_field + " IN (%s)"
+ " LIMIT 1", len(look_for)),
+ [tip_db_id] + look_for).fetchone()
+ next_db_id = self._get_lh_parent_db_id(tip_db_id)
+ else:
+ range_key, next_db_id = range_res
+ search_res = self._cursor.execute(_add_n_params(
+ "SELECT 1"
+ " FROM dotted_revno, mainline_parent"
+ " WHERE tip_revision = mainline_parent.revision"
+ " AND mainline_parent.range = ?"
+ " AND " + query_field + " IN (%s)"
+ " LIMIT 1", len(look_for)),
+ [range_key] + look_for).fetchone()
+ if search_res is not None:
+ return tip_db_id, next_db_id, range_key
+ tip_db_id = next_db_id
+
def get_revision_ids(self, revnos):
"""Map from a dotted-revno back into a revision_id."""
t = time.time()
@@ -1361,33 +1401,28 @@
revno_strs = set(['.'.join(map(str, revno)) for revno in revnos])
revno_map = {}
while tip_db_id is not None and revno_strs:
- self._stats['num_steps'] += 1
- range_res = self._cursor.execute(
- "SELECT pkey, tail"
- " FROM mainline_parent_range"
- " WHERE head = ?"
- " ORDER BY count DESC LIMIT 1",
- (tip_db_id,)).fetchone()
- if range_res is None:
- revision_res = self._cursor.execute(_add_n_params(
+ tip_db_id, next_db_id, range_key = self._find_next_range(tip_db_id,
+ list(revno_strs), 'revno')
+ # We found a tip/range that contains the entries we are interested
+ # in. Load-it-up
+ if range_key is None:
+ revision_res = self._cursor.execute(
"SELECT revision_id, revno"
" FROM dotted_revno, revision"
" WHERE merged_revision = revision.db_id"
- " tip_revision = ?"
- " AND revno IN (%s)", len(revno_strs)),
- [tip_db_id] + list(revno_strs)).fetchall()
+ " tip_revision = ?",
+ [tip_db_id]).fetchall()
next_db_id = self._get_lh_parent_db_id(tip_db_id)
else:
- pkey, next_db_id = range_res
- revision_res = self._cursor.execute(_add_n_params(
+ revision_res = self._cursor.execute(
"SELECT revision_id, revno"
" FROM dotted_revno, mainline_parent, revision"
" WHERE tip_revision = mainline_parent.revision"
" AND merged_revision = revision.db_id"
- " AND mainline_parent.range = ?"
- " AND revno IN (%s)", len(revno_strs)),
- [pkey] + list(revno_strs)).fetchall()
+ " AND mainline_parent.range = ?",
+ [range_key]).fetchall()
tip_db_id = next_db_id
+ self._stats['get-rev-imported'] += len(revision_res)
for revision_id, revno_str in revision_res:
dotted = tuple(map(int, revno_str.split('.')))
revno_strs.discard(revno_str)
More information about the bazaar-commits
mailing list