[RFC] ancestry cache

John A Meinel john at arbash-meinel.com
Fri Feb 17 02:44:28 GMT 2006


Aaron Bentley wrote:
> Hi all,
> 
> I've been fiddling with progress bars for merge application, and it
> turns out we spend a lot of our time selecting a merge base.  What
> surprised me is that the largest part of this is the time generating a
> graph of common ancestors.
> 
> So I've implemented a prototype ancestry cache.  Patch attached.  Does
> this seem worth pursuing?
> 
> I benchmarked the results.  They weren't as good as I hoped, but they do
> improve things some.  I'm counting real time, because this is an
> IO-limited operation.
> 
> Without the patch, the average of 4 runs was 7.45 seconds
> With patch, the average of 4 runs was 6.45.  So the patch improves
> overall merge performance by ~ 15%.  Perhaps more, if you assume cold cache.
> 
> We can probably speed the common ancestor detection by up to 50% by
> using a somewhat more complicated algorithm.
> 
> This data is optional cache data, so it's compatible with existing
> branch formats, and doesn't need to be locked.  It can only differ from
> the repository if revisions are removed from the repository.
> 
> It is a better source of ancestry data than the inventory weave, because
> it is accurate even in the presence of ghosts
> 
> Aaron

------------------------------------------------------------------------

=== modified file 'bzrlib/revision.py'
--- bzrlib/revision.py	
+++ bzrlib/revision.py	
@@ -17,10 +17,13 @@
 # TODO: Some kind of command-line display of revision properties:
 # perhaps show them in log -v and allow them as options to the commit
command.

+from StringIO import StringIO
+
 import bzrlib.errors
 from bzrlib.graph import node_distances, select_farthest, all_descendants
 from bzrlib.osutils import contains_whitespace
 from bzrlib.progress import DummyProgress
+from bzrlib.trace import note

 NULL_REVISION="null:"

@@ -189,6 +192,51 @@
         raise bzrlib.errors.AmbiguousBase((a_closest[0], b_closest[0]))
     return a_closest[0]

+
+class AncestryCache(object):
+    def __init__(self, revision_source):
+        object.__init__(self)
+        self.parent_ids = {}
+        self.revision_source = revision_source
+        self.try_read_cache()
+
+    def try_read_cache(self):
+        self._read_cache()
+
+    def _read_cache(self):
+        try:
+            count = 0
+            for line in self.revision_source.control_files.get('x-pcache'):
+                revision_id, parent_id = line.rstrip('\n').split('/')
+                if revision_id not in self.parent_ids:
+                    self.parent_ids[revision_id] = []
+                self.parent_ids[revision_id].append(parent_id)
+        except bzrlib.errors.NoSuchFile:
+            return
+
+    def _flush_cache(self):
+        output = StringIO()
+        count = 0
+        for revision_id, parent_ids in self.parent_ids.iteritems():
+            for parent_id in parent_ids:
+                output.write('%s/%s\n' % (revision_id, parent_id))
+        output.seek(0)
+        self.revision_source.control_files.put('x-pcache', output)
+
+    def finalize(self):
+        self._flush_cache()
+        self.revision_source = None
+
+    def get_parent_ids(self, revision_id):
+        revision_id = str(revision_id)
+        if revision_id not in self.parent_ids.keys():
+            revision = self.revision_source.get_revision(revision_id)
+            self.parent_ids[revision_id] = revision.parent_ids
+        else:
+            pass
+        return self.parent_ids[revision_id]
+

I realize this has the advantage that it handles ghost revisions. But is
this genuinely faster than reading the Weave prefix of inventory.weave.
It actually seems like the weave prefix should be faster because you
have fewer bytes to read.

Also, in my store-escaping branch, I was thinking to allow '/' as a
valid character in revision ids (then we wouldn't have to escape Arch
fully qualified patch names), it is easy enough to escape them to write
them onto disk. But I don't really care.


 def revision_graph(revision, revision_source):
     """Produce a graph of the ancestry of the specified revision.
     Return root, ancestors map, descendants map
@@ -204,57 +252,60 @@
     lines = [revision]
     root = None
     descendants[revision] = {}
-    while len(lines) > 0:
-        new_lines = set()
-        for line in lines:
-            if line == NULL_REVISION:
-                parents = []
-                root = NULL_REVISION
-            else:
-                try:
-                    rev = revision_source.get_revision(line)
-                    parents = list(rev.parent_ids)
-                    if len(parents) == 0:
-                        parents = [NULL_REVISION]
-                except bzrlib.errors.NoSuchRevision:
-                    if line == revision:
-                        raise
-                    parents = None
-            if parents is not None:
-                for parent in parents:
-                    if parent not in ancestors:
-                        new_lines.add(parent)
-                    if parent not in descendants:
-                        descendants[parent] = {}
-                    descendants[parent][line] = 1
-            if parents is not None:
-                ancestors[line] = set(parents)
-        lines = new_lines
-    if root is None:
-        # The history for revision becomes inaccessible without
-        # actually hitting a no-parents revision. This then
-        # makes these asserts below trigger. So, if root is None
-        # determine the actual root by walking the accessible tree
-        # and then stash NULL_REVISION at the end.
-        root = NULL_REVISION
-        descendants[root] = {}
-        # for every revision, check we can access at least
-        # one parent, if we cant, add NULL_REVISION and
-        # a link
-        for rev in ancestors:
-            if len(ancestors[rev]) == 0:
-                raise RuntimeError('unreachable code ?!')
-            ok = False
-            for parent in ancestors[rev]:
-                if parent in ancestors:
-                    ok = True
-            if ok:
-                continue
-            descendants[root][rev] = 1
-            ancestors[rev].add(root)
-        ancestors[root] = set()
-    assert root not in descendants[root]
-    assert root not in ancestors[root]
+    a_cache = AncestryCache(revision_source)
+    try:
+        while len(lines) > 0:
+            new_lines = set()
+            for line in lines:
+                if line == NULL_REVISION:
+                    parents = []
+                    root = NULL_REVISION
+                else:
+                    try:
+                        parents = a_cache.get_parent_ids(line)
+                        if len(parents) == 0:
+                            parents = [NULL_REVISION]
+                    except bzrlib.errors.NoSuchRevision:
+                        if line == revision:
+                            raise
+                        parents = None
+                if parents is not None:
+                    for parent in parents:
+                        if parent not in ancestors:
+                            new_lines.add(parent)
+                        if parent not in descendants:
+                            descendants[parent] = {}
+                        descendants[parent][line] = 1
+                if parents is not None:
+                    ancestors[line] = set(parents)
+            lines = new_lines
+        if root is None:
+            # The history for revision becomes inaccessible without
+            # actually hitting a no-parents revision. This then
+            # makes these asserts below trigger. So, if root is None
+            # determine the actual root by walking the accessible tree
+            # and then stash NULL_REVISION at the end.
+            root = NULL_REVISION
+            descendants[root] = {}
+            # for every revision, check we can access at least
+            # one parent, if we cant, add NULL_REVISION and
+            # a link
+            for rev in ancestors:
+                if len(ancestors[rev]) == 0:
+                    raise RuntimeError('unreachable code ?!')
+                ok = False
+                for parent in ancestors[rev]:
+                    if parent in ancestors:
+                        ok = True
+                if ok:
+                    continue
+                descendants[root][rev] = 1
+                ancestors[rev].add(root)
+            ancestors[root] = set()
+        assert root not in descendants[root]
+        assert root not in ancestors[root]
+    finally:
+        a_cache.finalize()
     return root, ancestors, descendants


It seems like this makes a_cache actually important, since it doesn't
look anywhere else for the parents.

Otherwise it would seem that you would use:

try:
  parents = a_cache.get_parent_ids(line)
except NoSuchRevision:
  rev = repository.get_revision(line)
  parents = rev.parent_ids

if len(parents) == 0:
  parents = [NULL_REVISION]


John
=:->

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 249 bytes
Desc: OpenPGP digital signature
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20060216/b407fa7f/attachment.pgp 


More information about the bazaar mailing list