Rev 418: Bring in the updated history-db changes. in http://bazaar.launchpad.net/~jameinel/loggerhead/integration

John Arbash Meinel john at arbash-meinel.com
Mon May 3 21:43:10 BST 2010


At http://bazaar.launchpad.net/~jameinel/loggerhead/integration

------------------------------------------------------------
revno: 418 [merge]
revision-id: john at arbash-meinel.com-20100503204255-xqqrtvdt33k3v768
parent: john at arbash-meinel.com-20100430222947-dmvjm1w2ve317p7q
parent: john at arbash-meinel.com-20100503203439-hmztxy1pa9nw6d7s
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: integration
timestamp: Mon 2010-05-03 15:42:55 -0500
message:
  Bring in the updated history-db changes.
modified:
  loggerhead/history.py          history.py-20061211064342-102iqirsciyvgtcf-5
-------------- next part --------------
=== modified file 'loggerhead/history.py'
--- a/loggerhead/history.py	2010-04-30 22:29:47 +0000
+++ b/loggerhead/history.py	2010-05-03 20:42:55 +0000
@@ -36,6 +36,7 @@
 import textwrap
 import threading
 
+from bzrlib import lru_cache
 import bzrlib.branch
 import bzrlib.delta
 import bzrlib.errors
@@ -207,6 +208,53 @@
         finally:
             self._lock.release()
 
+
+_raw_revno_revid_cache = lru_cache.LRUCache(10000)
+_revno_revid_lock = threading.RLock()
+
+
+class RevnoRevidMemoryCache(object):
+    """A store that maps revnos to revids based on the branch it is in.
+    """
+
+    def __init__(self, cache, lock, branch_tip):
+        # Note: what we'd really like is something that knew how long it takes
+        # to produce a revno * how often it is accessed. Since some revnos
+        # take 100x longer to produce than others. Could we cheat and just loop
+        # on __getitem__ ?
+        # There are also other possible layouts. A per-branch cache, with an
+        # LRU around the whole thing, etc. I chose this for simplicity.
+        self._branch_tip = branch_tip
+        self._cache = cache
+        # lru_cache is not thread-safe, so we need to lock all accesses.
+        # It is even modified when doing a get() on it.
+        self._lock = lock
+
+    def get(self, key):
+        """Return the data associated with `key`.
+        Otherwise return None.
+
+        :param key: Can be a revno_str or a revid.
+        """
+        self._lock.acquire()
+        try:
+            cached = self._cache.get((self._branch_tip, key))
+        finally:
+            self._lock.release()
+        return cached
+
+    def set(self, revid, revno_str):
+        """Store `data` under `key`.
+        """
+        self._lock.acquire()
+        try:
+            # TODO: StaticTuples ? Probably only useful if we cache more than
+            #       10k of them. 100k/1M is probably useful.
+            self._cache[(self._branch_tip, revid)] = revno_str
+            self._cache[(self._branch_tip, revno_str)] = revid
+        finally:
+            self._lock.release()
+
 # Used to store locks that prevent multiple threads from building a 
 # revision graph for the same branch at the same time, because that can
 # cause severe performance issues that are so bad that the system seems
@@ -245,6 +293,8 @@
         self._branch = branch
         self._branch_tags = None
         self._inventory_cache = {}
+        # Map from (tip_revision, revision_id) => revno_str
+        # and from (tip_revision, revno_str) => revision_id
         self._querier = _get_querier(branch)
         if self._querier is None:
             assert cache_path is not None
@@ -263,6 +313,8 @@
         self.log = logging.getLogger('loggerhead.%s' % (self._branch_nick,))
 
         self.last_revid = branch.last_revision()
+        self._revno_revid_cache = RevnoRevidMemoryCache(_raw_revno_revid_cache,
+            _revno_revid_lock, self._branch.last_revision())
 
     @property
     def has_revisions(self):
@@ -274,20 +326,51 @@
     def get_revno(self, revid):
         if revid is None:
             return 'unknown'
-        try:
-            revnos = self._querier.get_dotted_revno_range_multi([revid])
-            dotted_revno = revnos[revid]
-        except: # ???
-            import pdb; pdb.set_trace()
-            import sys
-            e = sys.exc_info()
-            return 'unknown'
-        return '.'.join(map(str, dotted_revno))
-
-    def get_dotted_to_rev_id(self, dotted_revnos):
-        # TODO: Create a memory cache, doing bi-directional mapping, possibly
-        #       persisting between HTTP requests.
-        return self._querier.get_revision_ids(dotted_revnos)
+        revno_str = self._revno_revid_cache.get(revid)
+        if revno_str is not None:
+            return revno_str
+        revnos = self._querier.get_dotted_revno_range_multi([revid])
+        # TODO: Should probably handle KeyError?
+        dotted_revno = revnos[revid]
+        revno_str = '.'.join(map(str, dotted_revno))
+        self._revno_revid_cache.set(revid, revno_str)
+        return revno_str
+
+    def get_revnos(self, revids):
+        """Get a map of revid => revno for all revisions."""
+        revno_map = {}
+        unknown = []
+        for revid in revids:
+            if revid is None:
+                revno_map[revid] = 'unknown'
+                continue
+            revno_str = self._revno_revid_cache.get(revid)
+            if revno_str is not None:
+                revno_map[revid] = revno_str
+                continue
+            unknown.append(revid)
+        if not unknown:
+            return revno_map
+        # querier returns dotted revno tuples
+        query_revno_map = self._querier.get_dotted_revno_range_multi(
+                            unknown)
+        for revid, dotted_revno in query_revno_map.iteritems():
+            revno_str = '.'.join(map(str, dotted_revno))
+            self._revno_revid_cache.set(revid, revno_str)
+            revno_map[revid] = revno_str
+        return revno_map
+
+    def get_revid_for_revno(self, revno_str):
+        revid = self._revno_revid_cache.get(revno_str)
+        if revid is not None:
+            return revid
+        dotted_revno = tuple(map(int, revno_str.split('.')))
+        revnos = self._querier.get_revision_ids([dotted_revno])
+        revnos = dict([('.'.join(map(str, drn)), ri)
+                       for drn, ri in revnos.iteritems()])
+        for revno_str, revid in revnos:
+            self._revno_revid_cache.set(revid, revno_str)
+        return revnos[revno_str]
 
     def _get_lh_parent(self, revid):
         """Get the left-hand parent of a given revision id."""
@@ -324,8 +407,8 @@
             #       grabs the full history, and we now support stopping early.
             history = self._branch.repository.iter_reverse_revision_history(
                             start_revid)
-            for rev_id in history:
-                yield rev_id
+            for revid in history:
+                yield revid
             return
         revid_set = set(revid_list)
 
@@ -367,8 +450,7 @@
             return self.last_revid
         try:
             if self.revno_re.match(revid):
-                revno = tuple(map(int, revid.split('.')))
-                val = self.get_dotted_to_rev_id([revno])[revno]
+                val = self.get_revid_for_revno([revid])[revid]
                 # XXX: Do this more cleanly
                 if val is None:
                     raise KeyError
@@ -584,26 +666,31 @@
         if len(changes) == 0:
             return changes
 
-        # some data needs to be recalculated each time, because it may
-        # change as new revisions are added.
+        needed_revnos = set()
+        for change in changes:
+            needed_revnos.add(change.revid)
+            needed_revnos.update([p_id for p_id in change.parents])
+        revno_map = self.get_revnos(needed_revnos)
+
         def merge_points_callback(a_change, attr):
             merge_revids = self.simplify_merge_point_list(
                                self.get_merge_point_list(a_change.revid))
-            return [util.Container(revid=r, revno=self.get_revno(r))
+            if not merge_revids:
+                return []
+            revno_map = self.get_revnos(merge_revids)
+            return [util.Container(revid=r, revno=revno_map[r])
                     for r in merge_revids]
 
+        parity = 0
         for change in changes:
             if self._show_merge_points:
                 change._set_property('merge_points', merge_points_callback)
             else:
                 change.merge_points = []
             if len(change.parents) > 0:
-                change.parents = [util.Container(revid=r,
-                    revno=self.get_revno(r)) for r in change.parents]
-            change.revno = self.get_revno(change.revid)
-
-        parity = 0
-        for change in changes:
+                change.parents = [util.Container(revid=r, revno=revno_map[r])
+                                  for r in change.parents]
+            change.revno = revno_map[change.revid]
             change.parity = parity
             parity ^= 1
 
@@ -613,7 +700,7 @@
         # FIXME: deprecated method in getting a null revision
         revid_list = filter(lambda revid: not bzrlib.revision.is_null(revid),
                             revid_list)
-        parent_map = self._branch.repository.get_graph().get_parent_map(
+        parent_map = self._branch.repository.get_parent_map(
                          revid_list)
         # We need to return the answer in the same order as the input,
         # less any ghosts.



More information about the bazaar-commits mailing list