Rev 24: Some updates to the mainline_parent tables, and a new query that uses it. in http://bzr.arbash-meinel.com/plugins/history_db
John Arbash Meinel
john at arbash-meinel.com
Fri Apr 2 22:53:48 BST 2010
At http://bzr.arbash-meinel.com/plugins/history_db
------------------------------------------------------------
revno: 24
revision-id: john at arbash-meinel.com-20100402215313-a2k8n1nbpan8n8cs
parent: john at arbash-meinel.com-20100402211723-8c8s905eifd0olgj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-02 16:53:13 -0500
message:
Some updates to the mainline_parent tables, and a new query that uses it.
Turns out that partial denormalization makes a big difference.
Specifically, the time to walk the bzr mainline drops from ~300ms down to
120ms, using only 70 round trips (vs 5000 walking one-by-one).
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py 2010-04-02 20:17:33 +0000
+++ b/__init__.py 2010-04-02 21:53:13 +0000
@@ -97,6 +97,12 @@
trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
+_mainline_walk_types = registry.Registry()
+_mainline_walk_types.register('db-rev-id', None)
+_mainline_walk_types.register('db-db-id', None)
+_mainline_walk_types.register('db-range', None)
+_mainline_walk_types.register('bzr', None)
+
class cmd_walk_mainline(commands.Command):
"""Walk the mainline of the branch."""
@@ -104,37 +110,43 @@
help='Use this as the database for storage'),
option.Option('directory', type=unicode, short_name='d',
help='Import this location instead of "."'),
- option.Option('use-db-ids',
- help='Do the queries using database ids'),
- option.Option('in-bzr', help="Use the bzr graph."),
+ option.RegistryOption('method',
+ help='How do you want to do the walking.',
+ converter=str, registry=_mainline_walk_types)
]
- def run(self, directory='.', db=None, in_bzr=False, use_db_ids=False):
+ def run(self, directory='.', db=None, method=None):
from bzrlib.plugins.history_db import history_db
from bzrlib import branch
+ import pprint
+ import time
+ t = time.time()
b = branch.Branch.open(directory)
b.lock_read()
try:
- if in_bzr:
- import time
- t = time.time()
- b.revision_history()
- trace.note('Time: %.3fs' % (time.time() - t,))
- else:
+ if method.startswith('db'):
query = history_db.Querier(db, b)
- if use_db_ids:
- query.walk_mainline_db_ids()
+ if method == 'db-db-id':
+ mainline = query.walk_mainline_db_ids()
+ elif method == 'db-rev-id':
+ mainline = query.walk_mainline()
else:
- query.walk_mainline()
- import pprint
+ assert method == 'db-range'
+ mainline = query.walk_mainline_using_ranges()
trace.note('Stats:\n%s' % (pprint.pformat(dict(query._stats)),))
+ else:
+ assert method == 'bzr'
+ mainline = b.revision_history()
finally:
b.unlock()
+ self.outf.write('Found %d revs\n' % (len(mainline),))
+ trace.note('Time: %.3fs' % (time.time() - t,))
# Time to walk bzr mainline
- # bzr 31packs 683ms
- # bzr 1pack 320ms
- # db rev_ids 295ms
- # db db_ids 236ms
+ # bzr 13packs 646ms
+ # bzr 1pack 406ms
+ # db rev_ids 381ms
+ # db db_ids 331ms
+ # db range 118ms
_ancestry_walk_types = registry.Registry()
=== modified file 'history_db.py'
--- a/history_db.py 2010-04-02 21:17:23 +0000
+++ b/history_db.py 2010-04-02 21:53:13 +0000
@@ -183,9 +183,17 @@
# Note that inserting head and tail into mainline_parent is redundant,
# since the data is available. But I'm sure it will make the *queries*
# much easier.
+ # TODO: So far the queries actually seem worse being inclusive, because
+ # you have to worry about the overlap. Needs a bit more time to
+ # figure out whether making head or tail exclusive is better. ATM
+ # I'm thinking tail, but I haven't figured out how to handle that
+ # first revision. One option is to allow tail to be NULL,
+ # indicating the end of the whole range. Or to give it something
+ # like 'null:'...
self._cursor.executemany(
- "INSERT INTO mainline_parent (range, revision)"
- " VALUES (?, ?)", [(range_key, d) for d in range_db_ids])
+ "INSERT INTO mainline_parent (range, revision, dist)"
+ " VALUES (?, ?, ?)",
+ [(range_key, d, idx) for idx, d in enumerate(range_db_ids)])
def build_mainline_cache(self):
"""Given the current branch, cache mainline information."""
@@ -249,6 +257,11 @@
self._branch_tip_rev_id = a_branch.last_revision()
self._stats = defaultdict(lambda: 0)
+ def _get_db_id(self, revision_id):
+ return self._cursor.execute('SELECT db_id FROM revision'
+ ' WHERE revision_id = ?',
+ (revision_id,)).fetchone()[0]
+
def _get_lh_parent_rev_id(self, revision_id):
parent_res = self._cursor.execute("""
SELECT p.revision_id
@@ -347,20 +360,50 @@
all_ids.append(cur_id)
cur_id = self._get_lh_parent_rev_id(cur_id)
self._stats['query_time'] += (time.time() - t)
- return
+ return all_ids
def walk_mainline_db_ids(self):
"""Walk the db, and grab all the mainline identifiers."""
t = time.time()
- db_id = self._cursor.execute('SELECT db_id FROM revision'
- ' WHERE revision_id = ?',
- (self._branch_tip_rev_id,)).fetchone()[0]
+ db_id = self._get_db_id(self._branch_tip_rev_id)
all_ids = []
while db_id is not None:
all_ids.append(db_id)
db_id = self._get_lh_parent_db_id(db_id)
self._stats['query_time'] += (time.time() - t)
- return
+ return all_ids
+
+ def walk_mainline_using_ranges(self):
+ t = time.time()
+ db_id = self._get_db_id(self._branch_tip_rev_id)
+ all_ids = []
+ while db_id is not None:
+ self._stats['num_steps'] += 1
+ all_ids.append(db_id)
+ range_res = self._cursor.execute(
+ "SELECT pkey, tail"
+ " FROM mainline_parent_range"
+ " WHERE head = ?"
+ " ORDER BY count DESC LIMIT 1",
+ (db_id,)).fetchone()
+ if range_res is None:
+ # No range, so switch to using by-parent search
+ db_id = self._get_lh_parent_db_id(db_id)
+ else:
+ # We have a range, so read in the whole range, and append the
+ # info. Note that we already have db_id itself, so
+ range_key, tail_db_id = range_res
+ range_db_ids = self._cursor.execute(
+ "SELECT revision FROM mainline_parent"
+ " WHERE range = ? ORDER BY dist ASC",
+ (range_key,)).fetchall()
+ db_ids = [r[0] for r in range_db_ids]
+ assert db_ids[0] == db_id
+ assert db_ids[-1] == tail_db_id
+ all_ids.extend(db_ids[1:-1])
+ db_id = tail_db_id
+ self._stats['query_time'] += (time.time() - t)
+ return all_ids
def walk_ancestry(self):
"""Walk all parents of the given revision."""
@@ -384,8 +427,7 @@
def walk_ancestry_db_ids(self):
_exec = self._cursor.execute
all_ancestors = set()
- db_id = _exec("SELECT db_id FROM revision WHERE revision_id = ?",
- (self._branch_tip_rev_id,)).fetchone()[0]
+ db_id = self._get_db_id(self._branch_tip_rev_id)
all_ancestors.add(db_id)
remaining = [db_id]
while remaining:
=== modified file 'history_denormalization.txt'
--- a/history_denormalization.txt 2010-04-02 20:17:33 +0000
+++ b/history_denormalization.txt 2010-04-02 21:53:13 +0000
@@ -95,7 +95,8 @@
ON mainline_parent_range (head);
CREATE TABLE mainline_parent (
range INTEGER REFERENCES mainline_parent_range NOT NULL,
- revision INTEGER REFERENCES revision NOT NULL
+ revision INTEGER REFERENCES revision NOT NULL,
+ dist INTEGER NOT NULL -- Offset from head, so we know the order
-- Not adding the constraint at this time, but it is logically there
-- CONSTRAINT mainline_parent_rev_unique UNIQUE (range, revision)
);
=== modified file 'schema.py'
--- a/schema.py 2010-04-02 20:17:33 +0000
+++ b/schema.py 2010-04-02 21:53:13 +0000
@@ -92,7 +92,10 @@
mainline_parent_t = """
CREATE TABLE mainline_parent (
range INTEGER REFERENCES mainline_parent_range NOT NULL,
- revision INTEGER REFERENCES revision NOT NULL
+ revision INTEGER REFERENCES revision NOT NULL,
+ dist INTEGER NOT NULL -- Offset from head, so we preserve the order
+ -- Not adding the constraint at this time, but it is logically there
+ -- CONSTRAINT mainline_parent_rev_unique UNIQUE (range, revision)
);
"""
_create_statements.append(mainline_parent_t)
More information about the bazaar-commits
mailing list