Rev 25: Change the data to be [head, tail) (include head, exclude tail). in http://bzr.arbash-meinel.com/plugins/history_db
John Arbash Meinel
john at arbash-meinel.com
Fri Apr 2 23:17:35 BST 2010
At http://bzr.arbash-meinel.com/plugins/history_db
------------------------------------------------------------
revno: 25
revision-id: john at arbash-meinel.com-20100402221701-t83vc338x9ycoy3n
parent: john at arbash-meinel.com-20100402215313-a2k8n1nbpan8n8cs
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-02 17:17:01 -0500
message:
Change the data to be [head, tail) (include head, exclude tail).
It should be simpler. It also turns out that my approximate query times were
too high. Specifically, it was including the time to open the Branch, etc. Which
turns out to be non-trivial (80-100ms).
Factoring that out, the *query* time to walk the bzr.dev mainline is down to
18ms. << 600ms to do it using the bzr indices.
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py 2010-04-02 21:53:13 +0000
+++ b/__init__.py 2010-04-02 22:17:01 +0000
@@ -142,11 +142,14 @@
self.outf.write('Found %d revs\n' % (len(mainline),))
trace.note('Time: %.3fs' % (time.time() - t,))
# Time to walk bzr mainline
+ # Outer includes the branch.open time, Query is just the time we spend
+ # walking the database, etc.
+ # Outer Query
# bzr 13packs 646ms
# bzr 1pack 406ms
- # db rev_ids 381ms
- # db db_ids 331ms
- # db range 118ms
+ # db rev_ids 381ms 296ms
+ # db db_ids 331ms 243ms
+ # db range 118ms 18ms
_ancestry_walk_types = registry.Registry()
=== modified file 'history_db.py'
--- a/history_db.py 2010-04-02 21:53:13 +0000
+++ b/history_db.py 2010-04-02 22:17:01 +0000
@@ -165,31 +165,16 @@
return None
return parent_res[0]
- def _insert_range(self, range_db_ids):
+ def _insert_range(self, range_db_ids, tail_db_id):
head_db_id = range_db_ids[0]
- tail_db_id = range_db_ids[-1]
self._cursor.execute("INSERT INTO mainline_parent_range"
" (head, tail, count) VALUES (?, ?, ?)",
(head_db_id, tail_db_id, len(range_db_ids)))
- # Isn't there a dbapi to get back the row we just inserted?
- range_key = self._cursor.execute(
- "SELECT pkey FROM mainline_parent_range"
- " WHERE head = ? AND tail = ?",
- (head_db_id, tail_db_id)).fetchone()[0]
+ # Note: This works for sqlite, does it work for pgsql?
+ range_key = self._cursor.lastrowid
self._stats['ranges_inserted'] += 1
- # Note that head and tail will get double values. The start range will
- # have head, then tail, the next range will have head == tail, etc.
+ # Note that 'tail' is explicitly not included in the range
self._stats['revs_in_ranges'] += len(range_db_ids)
- # Note that inserting head and tail into mainline_parent is redundant,
- # since the data is available. But I'm sure it will make the *queries*
- # much easier.
- # TODO: So far the queries actually seem worse being inclusive, because
- # you have to worry about the overlap. Needs a bit more time to
- # figure out whether making head or tail exclusive is better. ATM
- # I'm thinking tail, but I haven't figured out how to handle that
- # first revision. One option is to allow tail to be NULL,
- # indicating the end of the whole range. Or to give it something
- # like 'null:'...
self._cursor.executemany(
"INSERT INTO mainline_parent (range, revision, dist)"
" VALUES (?, ?, ?)",
@@ -220,25 +205,22 @@
range_db_ids = []
try:
while cur_db_id is not None:
- range_db_ids.append(cur_db_id)
if self._check_range_exists_for(cur_db_id):
# This already exists as a valid 'head' in the range cache,
# so we can assume that all parents are already covered in
# reasonable ranges.
break
- if len(range_db_ids) > 100:
- # We've stepped far enough, at 101 values, we cover 100
- # revisions (rev 1000 to rev 1100, etc), because the range
- # is inclusive
- self._insert_range(range_db_ids[:75])
+ if len(range_db_ids) >= 100:
+ # We have a 100 node range
+ self._insert_range(range_db_ids, cur_db_id)
# We want to start a new range, with
# new_range.head == last_range.tail
- range_db_ids = range_db_ids[74:-1]
- continue
+ range_db_ids = []
+ range_db_ids.append(cur_db_id)
parent_db_id = self._get_lh_parent_db_id(cur_db_id)
cur_db_id = parent_db_id
if len(range_db_ids) > 1:
- self._insert_range(range_db_ids)
+ self._insert_range(range_db_ids, cur_db_id)
except:
self._db_conn.rollback()
raise
@@ -379,7 +361,6 @@
all_ids = []
while db_id is not None:
self._stats['num_steps'] += 1
- all_ids.append(db_id)
range_res = self._cursor.execute(
"SELECT pkey, tail"
" FROM mainline_parent_range"
@@ -388,6 +369,7 @@
(db_id,)).fetchone()
if range_res is None:
# No range, so switch to using by-parent search
+ all_ids.append(db_id)
db_id = self._get_lh_parent_db_id(db_id)
else:
# We have a range, so read in the whole range, and append the
@@ -399,8 +381,7 @@
(range_key,)).fetchall()
db_ids = [r[0] for r in range_db_ids]
assert db_ids[0] == db_id
- assert db_ids[-1] == tail_db_id
- all_ids.extend(db_ids[1:-1])
+ all_ids.extend(db_ids)
db_id = tail_db_id
self._stats['query_time'] += (time.time() - t)
return all_ids
=== modified file 'history_denormalization.txt'
--- a/history_denormalization.txt 2010-04-02 21:53:13 +0000
+++ b/history_denormalization.txt 2010-04-02 22:17:01 +0000
@@ -88,7 +88,7 @@
CREATE TABLE mainline_parent_range (
pkey INTEGER PRIMARY KEY AUTOINCREMENT,
head INTEGER REFERENCES revision NOT NULL,
- tail INTEGER REFERENCES revision NOT NULL,
+ tail INTEGER REFERENCES revision, -- NULL indicates start-of-history
count INTEGER NOT NULL
);
CREATE INDEX mainline_parent_range_head_index
=== modified file 'schema.py'
--- a/schema.py 2010-04-02 21:53:13 +0000
+++ b/schema.py 2010-04-02 22:17:01 +0000
@@ -77,7 +77,8 @@
CREATE TABLE mainline_parent_range (
pkey INTEGER PRIMARY KEY AUTOINCREMENT,
head INTEGER REFERENCES revision NOT NULL,
- tail INTEGER REFERENCES revision NOT NULL,
+ tail INTEGER REFERENCES revision, -- NULL indicates start-of-history
+ -- tail is *not* included in the mainline_parent table
count INTEGER NOT NULL -- num in range, inclusive
);
"""
More information about the bazaar-commits
mailing list