Rev 33: Experiment with multi-parents and multi-child base selection algorithms. in http://bzr.arbash-meinel.com/plugins/xdelta_test

John Arbash Meinel john at arbash-meinel.com
Fri Jun 22 18:45:11 BST 2007


At http://bzr.arbash-meinel.com/plugins/xdelta_test

------------------------------------------------------------
revno: 33
revision-id: john at arbash-meinel.com-20070622174504-abk2qp1s9wqkb40a
parent: john at arbash-meinel.com-20070622154902-5rvjc20mrrt373zg
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: xdelta_test
timestamp: Fri 2007-06-22 12:45:04 -0500
message:
  Experiment with multi-parents and multi-child base selection algorithms.
  They seem to genuinely shrink the delta texts.
modified:
  base_selection_algorithms.py   base_selection_algor-20070528163119-4mg41krgj6fz5xen-1
  bench_important_algorithms/bench_all_texts.py bench_all_texts.py-20070528175343-sl9y3m4xrylwr5n9-3
  compression_algorithms.py      compression_algorith-20070528163119-4mg41krgj6fz5xen-2
-------------- next part --------------
=== modified file 'base_selection_algorithms.py'
--- a/base_selection_algorithms.py	2007-05-29 14:32:09 +0000
+++ b/base_selection_algorithms.py	2007-06-22 17:45:04 +0000
@@ -33,6 +33,18 @@
 base_selection_algorithms.append(lh_parent)
 
 
+def multi_parent(ancestry_graph, offsets, texts):
+    base = {}
+    for version_id, parents in ancestry_graph.iteritems():
+        if not parents:
+            base[version_id] = None
+        else:
+            base[version_id] = tuple(parents)
+    return base
+
+base_selection_algorithms.append(multi_parent)
+
+
 def rh_parent(ancestry_graph, offsets, texts):
     base = {}
     for version_id, parents in ancestry_graph.iteritems():
@@ -107,6 +119,37 @@
 base_selection_algorithms.append(lh_child_child)
 
 
+def multi_child(ancestry_graph, offsets, texts):
+    """Find a close compression relative.
+
+    We delta against all children.
+
+    :param ancestry_graph: dict of (revision_id => [parents])
+    :param offsets: dict of (revision_id => offset_in_list)
+    :param texts: List of full texts
+    """
+    children = {}
+    for version_id, parents in ancestry_graph.iteritems():
+        if not parents:
+            continue
+        offset = offsets[version_id]
+        for p in parents:
+            children.setdefault(p, []).append((offset, version_id))
+
+    base = {}
+    for version_id in ancestry_graph:
+        if version_id in children:
+            # reverse=True has been shown to be slightly (1%?) smaller than
+            # reverse=False.
+            base_version = tuple(x[1] for x in sorted(children[version_id], reverse=True))
+        else:
+            base_version = None
+        base[version_id] = base_version
+    return base
+
+base_selection_algorithms.append(multi_child)
+
+
 # This algorithm seems to be "illegal" in that it doesn't give a proper
 # fulltext base to start from. (ie if you only had the compressed output, you
 # could not generate any texts.)
@@ -200,4 +243,8 @@
 base_selection_algorithms.append(by_size_base_longer)
 
 best_base_algorithms = [lh_parent, lh_child_child, linear_base_after,
-                        by_size_base_longer]
+                        by_size_base_longer,
+                        # Not all code paths can use these yet
+                        #multi_parent,
+                        #multi_child,
+                        ]

=== modified file 'bench_important_algorithms/bench_all_texts.py'
--- a/bench_important_algorithms/bench_all_texts.py	2007-06-22 15:49:02 +0000
+++ b/bench_important_algorithms/bench_all_texts.py	2007-06-22 17:45:04 +0000
@@ -46,8 +46,14 @@
             if base is None:
                 out = self.compress_algorithm.compress_one(text)
             else:
-                base_offset = offsets[base]
-                base_text = texts[base_offset]
+                # This will be highly inefficient for lots of algorithms,
+                # since it will create a full delta, but it *is* what xdelta
+                # wants. Maybe we should just skip everything else.
+                if isinstance(base, tuple):
+                    base_text = ''.join(texts[offsets[b]] for b in base)
+                else:
+                    base_offset = offsets[base]
+                    base_text = texts[base_offset]
                 out = self.compress_algorithm.delta(base_text, text)
             compressed_texts.append(out)
         return compressed_texts
@@ -67,8 +73,14 @@
                 out = self.compress_algorithm.decompress_one(c_text,
                                                     len(texts[offset]))
             else:
-                base_offset = offsets[base]
-                base_text = texts[base_offset]
+                # This will be highly inefficient for lots of algorithms,
+                # since it will create a full delta, but it *is* what xdelta
+                # wants. Maybe we should just skip everything else.
+                if isinstance(base, tuple):
+                    base_text = ''.join(texts[offsets[b]] for b in base)
+                else:
+                    base_offset = offsets[base]
+                    base_text = texts[base_offset]
                 out = self.compress_algorithm.apply_deltas(base_text,
                                                            [c_text])
             decompressed_texts.append(out)

=== modified file 'compression_algorithms.py'
--- a/compression_algorithms.py	2007-06-22 15:49:02 +0000
+++ b/compression_algorithms.py	2007-06-22 17:45:04 +0000
@@ -364,5 +364,6 @@
                           #'xd3-NOCOMPRESS',
                           'xd3-NOCOMPRESS+zlib',
                           #'bdiff-one',
-                          'bdiff-one+zlib', 'bdiff-multi+zlib',
+                          'bdiff-one+zlib',
+                          'bdiff-multi+zlib',
                          ]]



More information about the bazaar-commits mailing list