Rev 13: Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting.) In http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

Tue Jul 15 17:22:11 BST 2008

At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

------------------------------------------------------------
revno: 13
revision-id: robertc at robertcollins.net-20080715162205-f55unkim2al9pv2z
parent: robertc at robertcollins.net-20080715154818-94t8nveufs3scvay
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-16 02:22:05 +1000
message:
  Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting.)
modified:
  groupcompress.py               groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  tests/test_groupcompress.py    test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
=== modified file 'groupcompress.py'

--- a/groupcompress.py	2008-07-15 15:48:18 +0000
+++ b/groupcompress.py	2008-07-15 16:22:05 +0000
@@ -17,6 +17,7 @@
 
 """Core compression logic for compressing streams of related files."""
 
+from itertools import izip
 from cStringIO import StringIO
 import zlib
 
@@ -145,6 +146,7 @@
         new_lines = []
         new_lines.append('label: %s\n' % label)
         new_lines.append('sha1: %s\n' % sha1)
+        index_lines = [False, False]
         pos = 0
         line_locations = self.line_locations
         accumulator = []
@@ -164,7 +166,18 @@
                         start_byte = 0
                     else:
                         start_byte = self.line_offsets[copy_start - 1]
-                    new_lines.append("c,%d,%d\n" % (start_byte, stop_byte - start_byte))
+                    bytes = stop_byte - start_byte
+                    copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+                    insert_instruction = "i,%d\n" % copy_len
+                    if (bytes + len(insert_instruction) >
+                        len(copy_control_instruction)):
+                        new_lines.append(copy_control_instruction)
+                        index_lines.append(False)
+                    else:
+                        # inserting is shorter than copying, so insert.
+                        new_lines.append(insert_instruction)
+                        new_lines.extend(lines[new_start:new_start+copy_len])
+                        index_lines.extend([False]*(copy_len + 1))
                     copying = False
                     new_start = pos
                     new_len = 1
@@ -186,18 +199,33 @@
                             start_byte = 0
                         else:
                             start_byte = self.line_offsets[copy_start - 1]
-                        new_lines.append("c,%d,%d\n" % (start_byte, stop_byte - start_byte))
+                        bytes = stop_byte - start_byte
+                        copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+                        insert_instruction = "i,%d\n" % copy_len
+                        if (bytes + len(insert_instruction) >
+                            len(copy_control_instruction)):
+                            new_lines.append(copy_control_instruction)
+                            index_lines.append(False)
+                        else:
+                            # inserting is shorter than copying, so insert.
+                            new_lines.append(insert_instruction)
+                            new_lines.extend(lines[new_start:new_start+copy_len])
+                            index_lines.extend([False]*(copy_len + 1))
                         copy_len = 1
                         copy_ends = next
+                        new_start = pos
                 else:
                     # Flush
                     if new_len:
                         new_lines.append("i,%d\n" % new_len)
                         new_lines.extend(lines[new_start:new_start+new_len])
+                        index_lines.append(False)
+                        index_lines.extend([True]*new_len)
                     # setup a copy
                     copy_len = 1
                     copy_ends = line_locations[line][1]
                     copying = True
+                    new_start = pos
             pos += 1
         if copying:
             copy_start = min(copy_ends) - copy_len
@@ -206,13 +234,25 @@
                 start_byte = 0
             else:
                 start_byte = self.line_offsets[copy_start - 1]
-            new_lines.append("c,%d,%d\n" % (start_byte, stop_byte - start_byte))
+            bytes = stop_byte - start_byte
+            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+            insert_instruction = "i,%d\n" % copy_len
+            if (bytes + len(insert_instruction) >
+                len(copy_control_instruction)):
+                new_lines.append(copy_control_instruction)
+                index_lines.append(False)
+            else:
+                # inserting is shorter than copying, so insert.
+                new_lines.append(insert_instruction)
+                new_lines.extend(lines[new_start:new_start+copy_len])
+                index_lines.extend([False]*(copy_len + 1))
         elif new_len:
             new_lines.append("i,%d\n" % new_len)
             new_lines.extend(lines[new_start:new_start+new_len])
-
+            index_lines.append(False)
+            index_lines.extend([True]*new_len)
         delta_start = (self.endpoint, len(self.lines))
-        self.output_lines(new_lines)
+        self.output_lines(new_lines, index_lines)
         trim_encoding_newline(lines)
         self.input_bytes += sum(map(len, lines))
         delta_end = (self.endpoint, len(self.lines))
@@ -235,17 +275,24 @@
         sha1 = sha_strings(lines)
         return lines, sha1
 
-    def output_lines(self, new_lines):
+    def output_lines(self, new_lines, index_lines):
+        """Output some lines.
+
+        :param new_lines: The lines to output.
+        :param index_lines: A boolean flag for each line - when True, index
+            that line.
+        """
         endpoint = self.endpoint
         offset = len(self.lines)
-        for pos, line in enumerate(new_lines):
+        for (pos, line), index in izip(enumerate(new_lines), index_lines):
             self.lines.append(line)
             endpoint += len(line)
             self.line_offsets.append(endpoint)
-            indices, next_lines = self.line_locations.setdefault(line,
-                (set(), set()))
-            indices.add(pos + offset)
-            next_lines.add(pos + offset + 1)
+            if index:
+                indices, next_lines = self.line_locations.setdefault(line,
+                    (set(), set()))
+                indices.add(pos + offset)
+                next_lines.add(pos + offset + 1)
         self.endpoint = endpoint
 
     def ratio(self):

=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py	2008-07-15 15:48:18 +0000
+++ b/tests/test_groupcompress.py	2008-07-15 16:22:05 +0000
@@ -89,9 +89,9 @@
             # add the line different
             'i,1\n',
             'different\n',
-            # copy the line \n. Note that when we filter on encoding-overhead
-            # this will become a fresh insert instead
-            'c,79,1\n',
+            # Insert the trailing newline.
+            'i,1\n',
+            '\n'
             ])
         self.assertEqual(expected_lines, compressor.lines)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
@@ -120,9 +120,9 @@
             'c,72,7\n',
             # copy the lines different, moredifferent
             'c,154,24\n',
-            # copy the line \n. Note that when we filter on encoding-overhead
-            # this will become a fresh insert instead
-            'c,79,1\n',
+            # Insert the trailing newline.
+            'i,1\n',
+            '\n'
             ])
         self.assertEqualDiff(''.join(expected_lines), ''.join(compressor.lines))
         self.assertEqual(sum(map(len, expected_lines)), end_point)