Rev 41: Got some implementation and tests showing it is working. in http://bzr.arbash-meinel.com/plugins/history_db
John Arbash Meinel
john at arbash-meinel.com
Mon Apr 5 21:32:44 BST 2010
At http://bzr.arbash-meinel.com/plugins/history_db
------------------------------------------------------------
revno: 41
revision-id: john at arbash-meinel.com-20100405203239-v64mx9kanm0cayfz
parent: john at arbash-meinel.com-20100405160148-1frpoq89fj77dhbw
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Mon 2010-04-05 15:32:39 -0500
message:
Got some implementation and tests showing it is working.
The test is a bit overbroad, but it is a bit hard to set up the state.
So instead I'm just testing how the state changes at each step.
-------------- next part --------------
=== modified file 'history_db.py'
--- a/history_db.py 2010-04-04 19:19:42 +0000
+++ b/history_db.py 2010-04-05 20:32:39 +0000
@@ -35,6 +35,25 @@
NULL_PARENTS = (revision.NULL_REVISION,)
+
+def _n_params(n):
+ """Create a query string representing N parameters.
+
+ n=1 => ?
+ n=2 => ?, ?
+ etc.
+ """
+ return ', '.join('?'*n)
+
+
+def _add_n_params(query, n):
+ """Add n parameters to the query string.
+
+ the query should have a single '%s' in it to be expanded.
+ """
+ return query % (_n_params(n),)
+
+
class Importer(object):
"""Import data from bzr into the history_db."""
@@ -80,6 +99,14 @@
(tip_rev_id,)).fetchone()
return res[0] > 0
+ def _is_imported_db_id(self, tip_db_id):
+ res = self._cursor.execute(
+ "SELECT count(*) FROM dotted_revno"
+ " WHERE tip_revision = ?"
+ " AND tip_revision = merged_revision",
+ (tip_db_id,)).fetchone()
+ return res[0] > 0
+
def _insert_nodes(self, tip_rev_id, nodes):
"""Insert all of the nodes mentioned into the database."""
self._stats['_insert_node_calls'] += 1
@@ -104,6 +131,10 @@
def _update_parents(self, nodes):
"""Update parent information for all these nodes."""
# Get the keys and their parents
+ # TODO: handle ghosts somehow, the current table structure won't
+ # distinguish between valid roots and roots that are ghosts.
+ # Note, though, that merge_sort also prunes ghosts, so you have
+ # to find them some other way.
parent_map = dict(
(n.key[0], [p[0] for p in self._graph.get_parent_keys(n.key)])
for n in nodes)
@@ -126,6 +157,12 @@
def do_import(self, expand_all=False):
if self._incremental:
self._update_ancestry(self._branch_tip_rev_id)
+ tip_db_id = self._rev_id_to_db_id[self._branch_tip_rev_id]
+ (needed_mainline,
+ imported_mainline_db_id) = self._find_needed_mainline(tip_db_id)
+ # TODO: if imported_mainline_id is None, then we may as well just
+ # switch to full import, rather than incremental
+ self._import_mainline(needed_mainline, imported_mainline_db_id)
merge_sorted = self._import_tip(self._branch_tip_rev_id)
if not expand_all:
return
@@ -205,6 +242,15 @@
self._db_conn.commit()
return merge_sorted
+ def _import_mainline(self, mainline_db_ids, pre_mainline_id):
+ """Fill out the dotted_revno table for these mainline_db_ids.
+
+ :param mainline_db_ids: A list of mainline database ids, starting with
+ the most recent revision.
+ :param pre_mainline_id: The mainline db_id that *has* been imported
+ (may be None). Should just be parent(mainline_db_ids[-1])
+ """
+
def _update_ancestry(self, new_tip_rev_id):
"""Walk the parents of this tip, updating 'revision' and 'parent'
@@ -216,6 +262,18 @@
self._insert_parent_map(parent_map)
self._db_conn.commit()
+ def _find_needed_mainline(self, new_tip_db_id):
+ """Find mainline revisions that need to be filled out.
+
+ :return: ([mainline_not_imported], most_recent_imported)
+ """
+ db_id = new_tip_db_id
+ needed = []
+ while db_id is not None and not self._is_imported_db_id(db_id):
+ needed.append(db_id)
+ db_id = self._get_lh_parent_db_id(db_id)
+ return needed, db_id
+
def _find_known_ancestors(self, new_tip_rev_id):
"""Starting at tip, find ancestors we already have"""
needed = [new_tip_rev_id]
@@ -241,6 +299,9 @@
parent_map.update(pmap)
parent_ids = pmap.get(rev_id, ())
if not parent_ids or parent_ids == NULL_PARENTS:
+ # XXX: We should handle 'not parent_ids' differently, because
+ # that means they are a ghost. Currently the table cannot
+ # distinguish between a ghost and a root revision.
# We can insert this rev directly, because we know its gdfo,
# as it has no parents.
parent_map[rev_id] = ()
@@ -392,6 +453,211 @@
self._db_conn.commit()
+class _IncrementalImporter(object):
+ """Context for importing partial history."""
+ # Note: all of the ids in this object are database ids. the revision_ids
+ # should have already been imported before we get to this step.
+
+ def __init__(self, importer, mainline_db_ids, pre_mainline_id):
+ self._importer = importer
+ self._mainline_db_ids = mainline_db_ids
+ # For now
+ assert pre_mainline_id is not None
+ self._imported_mainline_id = pre_mainline_id
+ self._cursor = importer._cursor
+
+ # db_id => gdfo
+ self._known_gdfo = {}
+ # db_ids that we know are ancestors of mainline_db_ids that are not
+ # ancestors of pre_mainline_id
+ self._interesting_ancestor_ids = set(self._mainline_db_ids)
+
+ # Information from the dotted_revno table for revisions that are in the
+ # already-imported mainline.
+ self._imported_dotted_revno_info = {}
+ # This is the gdfo of the current mainline revision search tip. This is
+ # the threshold such that
+ self._imported_gdfo = None
+
+ # Revisions that we are walking, to see if they are interesting, or
+ # already imported.
+ self._search_tips = None
+
+ def _get_initial_search_tips(self):
+ """Grab the right-hand parents of all the interesting mainline.
+
+ We know we already searched all of the left-hand parents, so just grab
+ the right-hand parents.
+ """
+ # TODO: Split this into a loop, since sqlite has a maximum number of
+ # parameters.
+ res = self._cursor.execute(_add_n_params(
+ "SELECT parent, gdfo FROM parent, revision"
+ " WHERE parent.parent = revision.db_id"
+ " AND parent_idx != 0"
+ " AND child IN (%s)", len(self._mainline_db_ids)),
+ self._mainline_db_ids).fetchall()
+ self._search_tips = set([r[0] for r in res])
+ known_gdfo = self._known_gdfo
+ known_gdfo.update(res)
+ res = self._cursor.execute(
+ "SELECT gdfo FROM revision WHERE db_id = ?",
+ [self._imported_mainline_id]).fetchone()
+ imported_gdfo = res[0]
+ self._imported_gdfo = imported_gdfo
+ known_gdfo[self._imported_mainline_id] = imported_gdfo
+
+ def _split_search_tips_by_gdfo(self, unknown):
+ """For these search tips, mark ones 'interesting' based on gdfo.
+
+ All search tips are ancestors of _mainline_db_ids. So if their gdfo
+ indicates that they could not be merged into already imported
+ revisions, then we know automatically that they are
+ new-and-interesting. Further, if they are present in
+ _imported_dotted_revno_info, then we know they are not interesting, and
+ we will stop searching them.
+
+ Otherwise, we don't know for sure which category they fall into, so
+ we return them for further processing.
+
+ :return: still_unknown - search tips that aren't known to be
+ interesting, and aren't known to be in the ancestry of already
+ imported revisions.
+ """
+ still_unknown = []
+ min_gdfo = None
+ for db_id in unknown:
+ gdfo = self._known_gdfo[db_id]
+ if gdfo >= self._imported_gdfo:
+ self._interesting_ancestor_ids.add(db_id)
+ else:
+ # TODO: This is probably a reasonable location to check
+ # self._imported_dotted_revno_info to see if this search
+ # tip is known to already exist in the ancestry.
+ if db_id in self._imported_dotted_revno_info:
+ # This should be removed as a search tip, we know it isn't
+ # interesting, it is an ancestor of an imported revision
+ self._search_tips.remove(db_id)
+ else:
+ still_unknown.append(db_id)
+ return still_unknown
+
+ def _split_interesting_using_children(self, unknown_search_tips):
+ """Find children of these search tips.
+
+ For each search tip, we find all of its known children. We then filter
+ the children by:
+ a) child is ignored if child in _interesting_ancestor_ids
+ b) child is ignored if gdfo(child) > _imported_gdfo
+ or (gdfo(child) == _imported_gdfo and child !=
+ _imported_mainline_id)
+ The reason for the extra check is because for the ancestry
+ left-to-be-searched, with tip at _imported_mainline_id, *only*
+ _imported_mainline_id is allowed to have gdfo = _imported_gdfo.
+ for each search tip, if there are no interesting children, then this
+ search tip is definitely interesting (there is no path for it to be
+ merged into a previous mainline entry.)
+
+ :return: still_unknown
+ still_unknown are the search tips that still have children that
+ could possibly be merged.
+ """
+ interesting = self._interesting_ancestor_ids
+ parent_child_res = self._cursor.execute(_add_n_params(
+ "SELECT parent, child FROM parent"
+ " WHERE parent IN (%s)",
+ len(unknown_search_tips)), unknown_search_tips).fetchall()
+ parent_to_children = {}
+ already_imported = set()
+ for parent, child in parent_child_res:
+ if (child in self._imported_dotted_revno_info
+ or child == self._imported_mainline_id):
+ # This child is already imported, so obviously the parent is,
+ # too.
+ already_imported.add(parent)
+ already_imported.add(child)
+ parent_to_children.setdefault(parent, []).append(child)
+ self._search_tips.difference_update(already_imported)
+ possibly_merged_children = set(
+ [c for p, c in parent_child_res
+ if c not in interesting and p not in already_imported])
+ known_gdfo = self._known_gdfo
+ unknown_gdfos = [c for c in possibly_merged_children
+ if c not in known_gdfo]
+ # TODO: Is it more wasteful to join this table early, or is it better
+ # because we avoid having to pass N parameters back in?
+ # TODO: Would sorting the db ids help? They are the primary key for the
+ # table, so we could potentially fetch in a better order.
+ if unknown_gdfos:
+ res = self._cursor.execute(_add_n_params(
+ "SELECT db_id, gdfo FROM revision WHERE db_id IN (%s)",
+ len(unknown_gdfos)), unknown_gdfos)
+ known_gdfo.update(res)
+ min_gdfo = self._imported_gdfo
+ # Remove all of the children who have gdfo >= min. We already handled
+ # the == min case in the first loop.
+ possibly_merged_children.difference_update(
+ [c for c in possibly_merged_children if known_gdfo[c] >= min_gdfo])
+ still_unknown = []
+ for parent in unknown_search_tips:
+ if parent in already_imported:
+ continue
+ for c in parent_to_children[parent]:
+ if c in possibly_merged_children:
+ still_unknown.append(parent)
+ break
+ else: # All children could not be possibly merged
+ interesting.add(parent)
+ return still_unknown
+
+ def _step_mainline(self):
+ """Move the mainline pointer by one, updating the data."""
+ bork
+
+ def _step_search_tips(self):
+ """Move the search tips to their parents."""
+ res = self._cursor.execute(_add_n_params(
+ "SELECT parent, gdfo FROM parent, revision"
+ " WHERE parent=db_id AND child IN (%s)",
+ len(self._search_tips)), list(self._search_tips)).fetchall()
+ # XXX: Filter out search tips that we've already searched via a
+ # different path, either entries already in the old _search_tips,
+ # or something in _interesting_ancestor_ids, etc. Note that by
+ # construction, everything in _search_tips should be in
+ # _interesting_ancestor_ids...
+ self._search_tips = set([r[0] for r in res])
+ # TODO: For search tips we will be removing, we don't need to join
+ # against revision since we should already have them. There may
+ # be other ways that we already know gdfo. It may be cheaper to
+ # check first.
+ self._known_gdfo.update(res)
+
+ def do_import(self):
+ self._get_initial_search_tips()
+ while self._search_tips:
+ # We don't know whether these search tips are known interesting, or
+ # known uninteresting
+ unknown = self._search_tips
+ while unknown:
+ unknown = self._split_search_tips_by_gdfo(unknown)
+ if unknown:
+ unknown = self._split_interesting_using_children(unknown)
+ # The current search tips are the 'newest' possible tips right
+ # now. If we can't classify them as definitely being
+ # interesting, then we need to step the mainline until we can.
+ # This means that the current search tips have children that
+ # could be merged into an earlier mainline, walk the mainline
+ # to see if we can resolve that.
+ # Note that relying strictly on gdfo is a bit of a waste here,
+ # because you may have a rev with 10 children before it lands
+ # in mainline, but all 11 revs will be in the dotted_revno
+ # cache for that mainline.
+ self._step_mainline()
+ # All search_tips are known to either be interesting or
+ # uninteresting. Walk any search tips that remain.
+ self._step_search_tips()
+
+
class Querier(object):
"""Perform queries on an existing history db."""
@@ -553,22 +819,22 @@
" ORDER BY count DESC LIMIT 1",
(tip_db_id,)).fetchone()
if range_res is None:
- revno_res = self._cursor.execute(
+ revno_res = self._cursor.execute(_add_n_params(
"SELECT merged_revision, revno FROM dotted_revno"
" WHERE tip_revision = ?"
- " AND merged_revision IN (%s)"
- % (', '.join('?'*len(db_ids))),
+ " AND merged_revision IN (%s)",
+ len(db_ids)),
[tip_db_id] + list(db_ids)).fetchall()
next_db_id = self._get_lh_parent_db_id(tip_db_id)
else:
pkey, next_db_id = range_res
- revno_res = self._cursor.execute(
+ revno_res = self._cursor.execute(_add_n_params(
"SELECT merged_revision, revno"
" FROM dotted_revno, mainline_parent"
" WHERE tip_revision = mainline_parent.revision"
" AND mainline_parent.range = ?"
- " AND merged_revision IN (%s)"
- % (', '.join('?'*len(db_ids))),
+ " AND merged_revision IN (%s)",
+ len(db_ids)),
[pkey] + list(db_ids)).fetchall()
tip_db_id = next_db_id
for db_id, revno in revno_res:
@@ -671,8 +937,9 @@
self._stats['num_steps'] += 1
next = remaining[:100]
remaining = remaining[len(next):]
- res = _exec("SELECT parent FROM parent WHERE child in (%s)"
- % (', '.join('?'*len(next))), tuple(next))
+ res = _exec(_add_n_params(
+ "SELECT parent FROM parent WHERE child in (%s)",
+ len(next)), next)
next_p = [p[0] for p in res if p[0] not in all_ancestors]
all_ancestors.update(next_p)
remaining.extend(next_p)
@@ -695,8 +962,9 @@
self._stats['num_steps'] += 1
next = remaining[:100]
remaining = remaining[len(next):]
- res = _exec("SELECT parent FROM parent WHERE child in (%s)"
- % (', '.join('?'*len(next))), tuple(next))
+ res = _exec(_add_n_params(
+ "SELECT parent FROM parent WHERE child in (%s)",
+ len(next)), next)
next_p = [p[0] for p in res if p[0] not in all_ancestors]
all_ancestors.update(next_p)
remaining.extend(next_p)
=== modified file 'history_denormalization.txt'
--- a/history_denormalization.txt 2010-04-05 16:01:48 +0000
+++ b/history_denormalization.txt 2010-04-05 20:32:39 +0000
@@ -252,7 +252,7 @@
.. [#] Dotted revnos are numbered based on the current branch tip revision.
However, there is a stability guarantee that we can exploit. If a revision
X is in the ancestry of tip T, then the dotted revno for X will be the same
- for all descendents of T.
+ for all descendants of T.
Partial ``merge_sort`` Computation
@@ -459,15 +459,14 @@
Starting at K, we grab F & J. gdfo(F) = 5. gdfo(K) = 7, gdfo(J) = 6. Since
gdfo(J) = 6, we know that it cannot be merged into F (nor can any of its
- descendents). So J gets marked as "interesting" (it is an ancestor of K),
- but "complete" (it could not have been merged into a mainline revision
- that we have not fetched).
+ descendants). So J gets marked as "interesting" (it is an ancestor of K
+ that cannot be an ancestor of F).
Next we step to H & I, both with gdfo = 5. These also get marked
- "interesting & complete". We then walk to G gdfo=4. We check the known
+ "interesting". We then walk to G gdfo=4. We check the known
children of G, and see that they are all marked either complete, or have
gdfo >= 5 (the hypothetical children off of ':' all have to have gdfo >=
- 5.) The only other possible answer here would be that F was an immediate
+ 5.[#]) The only other possible answer here would be that F was an immediate
descendent of G. G then gets marked "interesting & complete". We then
walk to C gdfo=3. We then see that C has a child E that is not interesting
& complete, and has a gdfo < 5. We could probably take a couple different
@@ -484,6 +483,14 @@
to page in the dotted_revno info anyway, because we'll use it to generate
the revnos for G, etc.
+.. [#] There is a case where things aren't as clear cut. If G were a little
+ bit older (say GDFO = 3), and G were merged into a different branch
+ that was also small (X <= [A, G]), then we would see that G is merged
+ into something new enough to cause confusion. The only way I can see to
+ handle that is to finally observe that X is not part of this branch.
+ However, I think the only way to observe that is to walk far enough
+ back, which is what we would do anyway.
+
..
vim: ft=rst tw=78 ai
=== modified file 'test_importer.py'
--- a/test_importer.py 2010-04-05 13:54:47 +0000
+++ b/test_importer.py 2010-04-05 20:32:39 +0000
@@ -230,3 +230,59 @@
('K', 0): 'J', ('L', 0): 'J', ('M', 0): 'K',
('M', 1): 'L', ('N', 0): 'I', ('N', 1): 'M',
}, parent_map)
+
+ def test__incremental_importer_step_by_step(self):
+ b = self.make_interesting_branch()
+ b._tip_revision = 'D' # Something older
+ importer = history_db.Importer(':memory:', b, incremental=False)
+ importer.do_import()
+ D_id = importer._rev_id_to_db_id['D']
+ self.assertEqual(1, importer._cursor.execute(
+ "SELECT count(*) FROM dotted_revno"
+ " WHERE tip_revision = merged_revision"
+ " AND tip_revision = ?", (D_id,)).fetchone()[0])
+ # Now work on just importing G
+ importer._update_ancestry('G')
+ G_id = importer._rev_id_to_db_id['G']
+ needed_mainline, imported_db_id = importer._find_needed_mainline(G_id)
+ self.assertEqual([G_id], needed_mainline)
+ self.assertEqual(D_id, imported_db_id)
+ inc_importer = history_db._IncrementalImporter(importer,
+ needed_mainline, imported_db_id)
+ self.assertEqual(set(needed_mainline),
+ inc_importer._interesting_ancestor_ids)
+ # This should populate ._search_tips, as well as gdfo information
+ inc_importer._get_initial_search_tips()
+ F_id = importer._rev_id_to_db_id['F']
+ self.assertEqual(set([F_id]), inc_importer._search_tips)
+ self.assertEqual(4, inc_importer._imported_gdfo)
+ self.assertEqual(D_id, inc_importer._imported_mainline_id)
+ self.assertEqual({F_id: 4, D_id: 4}, inc_importer._known_gdfo)
+ # D_id has gdfo >= F_id, so we know it isn't merged. So
+ # _split_search_tips_by_gdfo should return nothing, and update
+ # _interesting_ancestor_ids
+ self.assertEqual([], inc_importer._split_search_tips_by_gdfo([F_id]))
+ self.assertEqual(set([G_id, F_id]),
+ inc_importer._interesting_ancestor_ids)
+ # Since we know that all search tips are interesting, we walk them
+ inc_importer._step_search_tips()
+ E_id = importer._rev_id_to_db_id['E']
+ self.assertEqual(set([E_id]), inc_importer._search_tips)
+ # Now when we go to _split_search_tips_by_gdfo, we aren't positive that
+ # E wasn't merged, it should tell us so.
+ self.assertEqual([E_id],
+ inc_importer._split_search_tips_by_gdfo([E_id]))
+ # And it should not have updated search tips or ancestor_ids
+ self.assertEqual(set([G_id, F_id]),
+ inc_importer._interesting_ancestor_ids)
+ self.assertEqual(set([E_id]), inc_importer._search_tips)
+ # Checking children should notice that no children have gdfo < F, so E
+ # is auto-marked as interesting.
+ self.assertEqual([],
+ inc_importer._split_interesting_using_children([E_id]))
+ self.assertEqual(set([E_id, G_id, F_id]),
+ inc_importer._interesting_ancestor_ids)
+ # Since we know all search tips are interesting, step again.
+ inc_importer._step_search_tips()
+ B_id = importer._rev_id_to_db_id['B']
+ self.assertEqual(set([B_id]), inc_importer._search_tips)
More information about the bazaar-commits
mailing list