Rev 414: HistoryDB is now roughly integrated. Probably still needs some tuning. in http://bazaar.launchpad.net/~jameinel/loggerhead/history_db
John Arbash Meinel
john at arbash-meinel.com
Fri Apr 30 23:27:31 BST 2010
At http://bazaar.launchpad.net/~jameinel/loggerhead/history_db
------------------------------------------------------------
revno: 414
revision-id: john at arbash-meinel.com-20100430222722-zgzx4a0d92xvhwii
parent: john at arbash-meinel.com-20100427154552-ebb7psd96xwmgo0e
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Fri 2010-04-30 17:27:22 -0500
message:
HistoryDB is now roughly integrated. Probably still needs some tuning.
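For readers skimming the diff, the core of the change is how History now
obtains its Querier. A minimal sketch of that flow, using the names from
the diff below (an illustration of the intent, not the exact history_db
plugin API):

    import os
    import threading

    from bzrlib.plugins.history_db import history_db, _get_querier

    importer_lock = threading.Lock()

    def querier_for(branch, cache_path):
        # Prefer a Querier the branch already has configured; otherwise
        # fall back to a per-branch database in loggerhead's cache dir.
        querier = _get_querier(branch)
        if querier is None:
            querier = history_db.Querier(
                os.path.join(cache_path, 'historydb.sql'), branch)
        # sqlite is single-writer, so importers must be serialized.
        querier.set_importer_lock(importer_lock)
        # Record the current branch tip up front; the rest of the code
        # can then assume the database is populated.
        querier.ensure_branch_tip()
        return querier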
-------------- next part --------------
=== modified file 'loggerhead/apps/branch.py'
--- a/loggerhead/apps/branch.py 2010-04-14 17:54:32 +0000
+++ b/loggerhead/apps/branch.py 2010-04-30 22:27:22 +0000
@@ -79,7 +79,10 @@
revinfo_disk_cache = RevInfoDiskCache(cache_path)
return History(
self.branch, self.graph_cache, file_cache=file_cache,
- revinfo_disk_cache=revinfo_disk_cache, cache_key=self.friendly_name)
+ revinfo_disk_cache=revinfo_disk_cache,
+ cache_key=self.friendly_name,
+ cache_path=cache_path,
+ )
def url(self, *args, **kw):
if isinstance(args[0], list):
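With the new keyword in place, constructing a History for a read-locked
branch looks like this (hypothetical cache directory and cache key; the
other arguments are unchanged from before):

    cache_path = '/var/cache/loggerhead/myproject'   # assumed example path
    history = History(
        branch, graph_cache, file_cache=file_cache,
        revinfo_disk_cache=RevInfoDiskCache(cache_path),
        cache_key='myproject',
        cache_path=cache_path,  # new: enables the Querier fallback
    )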
=== modified file 'loggerhead/history.py'
--- a/loggerhead/history.py 2010-04-27 15:45:52 +0000
+++ b/loggerhead/history.py 2010-04-30 22:27:22 +0000
@@ -31,6 +31,7 @@
import bisect
import datetime
import logging
+import os
import re
import textwrap
import threading
@@ -45,6 +46,12 @@
from loggerhead import util
from loggerhead.wholehistory import compute_whole_history_data
+from bzrlib.plugins.history_db import (
+ history_db,
+ _get_history_db_path,
+ _get_querier,
+ )
+
def is_branch(folder):
try:
@@ -206,6 +213,7 @@
# to hang.
revision_graph_locks = {}
revision_graph_check_lock = threading.Lock()
+history_db_importer_lock = threading.Lock()
class History(object):
"""Decorate a branch to provide information for rendering.
@@ -217,75 +225,14 @@
:ivar _file_change_cache: An object that caches information about the
files that changed between two revisions.
- :ivar _rev_info: A list of information about revisions. This is by far
- the most cryptic data structure in loggerhead. At the top level, it
- is a list of 3-tuples [(merge-info, where-merged, parents)].
- `merge-info` is (seq, revid, merge_depth, revno_str, end_of_merge) --
- like a merged sorted list, but the revno is stringified.
- `where-merged` is a tuple of revisions that have this revision as a
- non-lefthand parent. Finally, `parents` is just the usual list of
- parents of this revision.
- :ivar _rev_indices: A dictionary mapping each revision id to the index of
- the information about it in _rev_info.
+ :ivar _querier: A HistoryDB.Querier instance, allowing us to query for
+ information in the ancestry of the branch.
:ivar _revno_revid: A dictionary mapping stringified revnos to revision
ids.
"""
- def _load_whole_history_data(self, caches, cache_key):
- """Set the attributes relating to the whole history of the branch.
-
- :param caches: a list of caches with interfaces like
- `RevInfoMemoryCache` and be ordered from fastest to slowest.
- :param cache_key: the key to use with the caches.
- """
- self._rev_indices = None
- self._rev_info = None
-
- missed_caches = []
- def update_missed_caches():
- for cache in missed_caches:
- cache.set(cache_key, self.last_revid, self._rev_info)
-
- # Theoretically, it's possible for two threads to race in creating
- # the Lock() object for their branch, so we put a lock around
- # creating the per-branch Lock().
- revision_graph_check_lock.acquire()
- try:
- if cache_key not in revision_graph_locks:
- revision_graph_locks[cache_key] = threading.Lock()
- finally:
- revision_graph_check_lock.release()
-
- revision_graph_locks[cache_key].acquire()
- try:
- for cache in caches:
- data = cache.get(cache_key, self.last_revid)
- if data is not None:
- self._rev_info = data
- update_missed_caches()
- break
- else:
- missed_caches.append(cache)
- else:
- whole_history_data = compute_whole_history_data(self._branch)
- self._rev_info, self._rev_indices = whole_history_data
- update_missed_caches()
- finally:
- revision_graph_locks[cache_key].release()
-
- if self._rev_indices is not None:
- self._revno_revid = {}
- for ((_, revid, _, revno_str, _), _, _) in self._rev_info:
- self._revno_revid[revno_str] = revid
- else:
- self._revno_revid = {}
- self._rev_indices = {}
- for ((seq, revid, _, revno_str, _), _, _) in self._rev_info:
- self._rev_indices[revid] = seq
- self._revno_revid[revno_str] = revid
-
def __init__(self, branch, whole_history_data_cache, file_cache=None,
- revinfo_disk_cache=None, cache_key=None):
+ revinfo_disk_cache=None, cache_key=None, cache_path=None):
assert branch.is_locked(), (
"Can only construct a History object with a read-locked branch.")
if file_cache is not None:
@@ -296,16 +243,24 @@
self._branch = branch
self._branch_tags = None
self._inventory_cache = {}
+        self._querier = _get_querier(branch)
+        if self._querier is None:
+            # History-db is not configured for this branch; create our own
+            # Querier against a database in loggerhead's cache directory.
+            assert cache_path is not None
+            self._querier = history_db.Querier(
+                os.path.join(cache_path, 'historydb.sql'), branch)
+        # sqlite is single-writer, so serialize concurrent updates.
+        # The old whole-history code also took a per-branch lock here, for
+        # performance reasons, even without a disk requirement.
+        self._querier.set_importer_lock(history_db_importer_lock)
+        # TODO: Is this premature? It makes the rest of the code simpler...
+        self._querier.ensure_branch_tip()
self._branch_nick = self._branch.get_config().get_nickname()
self.log = logging.getLogger('loggerhead.%s' % (self._branch_nick,))
self.last_revid = branch.last_revision()
- caches = [RevInfoMemoryCache(whole_history_data_cache)]
- if revinfo_disk_cache:
- caches.append(revinfo_disk_cache)
- self._load_whole_history_data(caches, cache_key)
-
@property
def has_revisions(self):
return not bzrlib.revision.is_null(self.last_revid)
@@ -314,41 +269,74 @@
return self._branch.get_config()
def get_revno(self, revid):
- if revid not in self._rev_indices:
- # ghost parent?
- return 'unknown'
- seq = self._rev_indices[revid]
- revno = self._rev_info[seq][0][3]
- return revno
+        if revid is None:
+            return 'unknown'
+        try:
+            revnos = self._querier.get_dotted_revno_range_multi([revid])
+            dotted_revno = revnos[revid]
+        except KeyError:
+            # Not in the ancestry of this branch (e.g. a ghost parent).
+            return 'unknown'
+        return '.'.join(map(str, dotted_revno))
+
+ def get_dotted_to_rev_id(self, dotted_revnos):
+ # TODO: Create a memory cache, doing bi-directional mapping, possibly
+ # persisting between HTTP requests.
+ return self._querier.get_revision_ids(dotted_revnos)
+
+ def _get_lh_parent(self, revid):
+ """Get the left-hand parent of a given revision id."""
+ # TODO: Move this into a public method on Querier
+ # TODO: Possibly look into caching some of this info in memory, and
+ # between HTTP requests.
+ self._querier.ensure_branch_tip()
+ return self._querier._get_lh_parent_rev_id(revid)
+
+ def _get_children(self, revid):
+ """Get the children of the given revision id."""
+ # XXX: We should be filtering this based on self._branch's ancestry...
+ # TODO: We also should be using a method on Querier, instead of doing
+ # it ourselves
+ c = self._querier._get_cursor()
+ res = c.execute("SELECT c.revision_id"
+ " FROM revision p, parent, revision c"
+ " WHERE child = c.db_id"
+ " AND parent = p.db_id"
+ " AND p.revision_id = ?",
+ (revid,)).fetchall()
+ return [r[0] for r in res]
def get_revids_from(self, revid_list, start_revid):
"""
Yield the mainline (wrt start_revid) revisions that merged each
revid in revid_list.
"""
+ tip_revid = start_revid
if revid_list is None:
- revid_list = [r[0][1] for r in self._rev_info]
+ # This returns the mainline of start_revid
+ # TODO: We could use Querier for this
+ # Note: Don't use self._branch.revision_history, as that always
+ # grabs the full history, and we now support stopping early.
+ history = self._branch.repository.iter_reverse_revision_history(
+ start_revid)
+ for rev_id in history:
+ yield rev_id
+ return
revid_set = set(revid_list)
- revid = start_revid
- def introduced_revisions(revid):
- r = set([revid])
- seq = self._rev_indices[revid]
- md = self._rev_info[seq][0][2]
- i = seq + 1
- while i < len(self._rev_info) and self._rev_info[i][0][2] > md:
- r.add(self._rev_info[i][0][1])
- i += 1
- return r
- while True:
- if bzrlib.revision.is_null(revid):
- return
- if introduced_revisions(revid) & revid_set:
- yield revid
- parents = self._rev_info[self._rev_indices[revid]][2]
- if len(parents) == 0:
- return
- revid = parents[0]
+ while tip_revid is not None and revid_set:
+ parent_revid = self._get_lh_parent(tip_revid)
+ # TODO: Consider caching this, especially between HTTP requests
+ introduced = self._querier.iter_merge_sorted_revisions(
+ start_revision_id=tip_revid, stop_revision_id=parent_revid)
+            introduced_revs = set([i[0] for i in introduced
+                                   if i[0] in revid_set])
+ if introduced_revs:
+ revid_set.difference_update(introduced_revs)
+ yield tip_revid
+ tip_revid = parent_revid
def get_short_revision_history_by_fileid(self, file_id):
# FIXME: would be awesome if we could get, for a folder, the list of
@@ -366,79 +354,7 @@
del possible_keys, next_keys
return revids
- def get_revision_history_since(self, revid_list, date):
- # if a user asks for revisions starting at 01-sep, they mean inclusive,
- # so start at midnight on 02-sep.
- date = date + datetime.timedelta(days=1)
- # our revid list is sorted in REVERSE date order,
- # so go thru some hoops here...
- revid_list.reverse()
- index = bisect.bisect(_RevListToTimestamps(revid_list,
- self._branch.repository),
- date)
- if index == 0:
- return []
- revid_list.reverse()
- index = -index
- return revid_list[index:]
-
- def get_search_revid_list(self, query, revid_list):
- """
- given a "quick-search" query, try a few obvious possible meanings:
-
- - revision id or # ("128.1.3")
- - date (US style "mm/dd/yy", earth style "dd-mm-yy", or \
-iso style "yyyy-mm-dd")
- - comment text as a fallback
-
- and return a revid list that matches.
- """
- # FIXME: there is some silliness in this action. we have to look up
- # all the relevant changes (time-consuming) only to return a list of
- # revids which will be used to fetch a set of changes again.
-
- # if they entered a revid, just jump straight there;
- # ignore the passed-in revid_list
- revid = self.fix_revid(query)
- if revid is not None:
- if isinstance(revid, unicode):
- revid = revid.encode('utf-8')
- changes = self.get_changes([revid])
- if (changes is not None) and (len(changes) > 0):
- return [revid]
-
- date = None
- m = self.us_date_re.match(query)
- if m is not None:
- date = datetime.datetime(util.fix_year(int(m.group(3))),
- int(m.group(1)),
- int(m.group(2)))
- else:
- m = self.earth_date_re.match(query)
- if m is not None:
- date = datetime.datetime(util.fix_year(int(m.group(3))),
- int(m.group(2)),
- int(m.group(1)))
- else:
- m = self.iso_date_re.match(query)
- if m is not None:
- date = datetime.datetime(util.fix_year(int(m.group(1))),
- int(m.group(2)),
- int(m.group(3)))
- if date is not None:
- if revid_list is None:
- # if no limit to the query was given,
- # search only the direct-parent path.
- revid_list = list(self.get_revids_from(None, self.last_revid))
- return self.get_revision_history_since(revid_list, date)
-
revno_re = re.compile(r'^[\d\.]+$')
- # the date regex are without a final '$' so that queries like
- # "2006-11-30 12:15" still mostly work. (i think it's better to give
- # them 90% of what they want instead of nothing at all.)
- us_date_re = re.compile(r'^(\d{1,2})/(\d{1,2})/(\d\d(\d\d?))')
- earth_date_re = re.compile(r'^(\d{1,2})-(\d{1,2})-(\d\d(\d\d?))')
- iso_date_re = re.compile(r'^(\d\d\d\d)-(\d\d)-(\d\d)')
def fix_revid(self, revid):
# if a "revid" is actually a dotted revno, convert it to a revid
@@ -448,8 +364,14 @@
return self.last_revid
try:
if self.revno_re.match(revid):
- revid = self._revno_revid[revid]
+ revno = tuple(map(int, revid.split('.')))
+ val = self.get_dotted_to_rev_id([revno])[revno]
+ # XXX: Do this more cleanly
+ if val is None:
+ raise KeyError
+ revid = val
except KeyError:
raise bzrlib.errors.NoSuchRevision(self._branch_nick, revid)
return revid
@@ -528,6 +450,8 @@
def get_inventory(self, revid):
if revid not in self._inventory_cache:
+ # TODO: This cache is unbounded, though only used for a single http
+ # request. Consider what we might do to limit this.
self._inventory_cache[revid] = (
self._branch.repository.get_inventory(revid))
return self._inventory_cache[revid]
@@ -554,11 +478,11 @@
merge_point = []
while True:
- children = self._rev_info[self._rev_indices[revid]][1]
+ children = self._get_children(revid)
nexts = []
for child in children:
- child_parents = self._rev_info[self._rev_indices[child]][2]
- if child_parents[0] == revid:
+ child_lh_parent = self._get_lh_parent(child)
+ if child_lh_parent == revid:
nexts.append(child)
else:
merge_point.append(child)
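The deleted _rev_info/_rev_indices machinery answered mainline queries by
scanning a merge-sorted list of the entire history. The new get_revids_from
instead walks one left-hand parent at a time and asks the Querier which
revisions each mainline step introduced, so it can stop early. A
self-contained sketch of that walk, with a toy dict-based graph standing in
for the real Querier:

    def walk_mainline(tip, wanted, lh_parent, merged_by):
        # Yield mainline revids (tip first) that introduced a wanted rev.
        #   lh_parent: revid -> left-hand parent revid (None at origin).
        #   merged_by: mainline revid -> set of revids it introduced.
        wanted = set(wanted)
        while tip is not None and wanted:
            introduced = merged_by.get(tip, set()) & wanted
            if introduced:
                wanted -= introduced
                yield tip
            tip = lh_parent[tip]

    # Toy history: mainline C <- B0 <- A0, where B0 merged branch rev A.
    lh_parent = {'C': 'B0', 'B0': 'A0', 'A0': None}
    merged_by = {'C': set(['C']), 'B0': set(['B0', 'A']), 'A0': set(['A0'])}
    print list(walk_mainline('C', ['A'], lh_parent, merged_by))  # ['B0']

Because the loop exits as soon as every wanted revision is accounted for,
queries near the branch tip no longer pay for the whole ancestry.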