Rev 13: Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting). in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
Robert Collins
robertc at robertcollins.net
Tue Jul 15 17:22:11 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
------------------------------------------------------------
revno: 13
revision-id: robertc at robertcollins.net-20080715162205-f55unkim2al9pv2z
parent: robertc at robertcollins.net-20080715154818-94t8nveufs3scvay
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-16 02:22:05 +1000
message:
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
modified:
groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
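In outline, the compressor now prices both encodings before emitting an instruction for a matched range: a copy costs the length of its "c,<start>,<bytes>\n" control line, while a fresh insert costs the "i,<count>\n" control line plus the raw bytes it repeats. The following stand-alone sketch of that decision rule uses a hypothetical choose_instruction helper; it is not part of the plugin, just the comparison the diff below performs inline:

    def choose_instruction(start_byte, stop_byte, copy_len):
        # Emit a copy only when it is strictly cheaper than
        # re-inserting the matched bytes verbatim.
        bytes = stop_byte - start_byte
        copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
        insert_instruction = "i,%d\n" % copy_len
        if bytes + len(insert_instruction) > len(copy_control_instruction):
            return copy_control_instruction
        return insert_instruction

    # A one-byte match far into the group: "c,123456,1\n" costs 11 bytes,
    # but "i,1\n" plus the byte itself costs only 5, so inserting wins.
    assert choose_instruction(123456, 123457, 1) == "i,1\n"
    # A 100-byte match is far cheaper to copy than to repeat.
    assert choose_instruction(0, 100, 3) == "c,0,100\n"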
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2008-07-15 15:48:18 +0000
+++ b/groupcompress.py 2008-07-15 16:22:05 +0000
@@ -17,6 +17,7 @@
 
 """Core compression logic for compressing streams of related files."""
 
+from itertools import izip
 from cStringIO import StringIO
 import zlib
 
@@ -145,6 +146,7 @@
         new_lines = []
         new_lines.append('label: %s\n' % label)
         new_lines.append('sha1: %s\n' % sha1)
+        index_lines = [False, False]
         pos = 0
         line_locations = self.line_locations
         accumulator = []
@@ -164,7 +166,18 @@
                         start_byte = 0
                     else:
                         start_byte = self.line_offsets[copy_start - 1]
-                    new_lines.append("c,%d,%d\n" % (start_byte, stop_byte - start_byte))
+                    bytes = stop_byte - start_byte
+                    copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+                    insert_instruction = "i,%d\n" % copy_len
+                    if (bytes + len(insert_instruction) >
+                        len(copy_control_instruction)):
+                        new_lines.append(copy_control_instruction)
+                        index_lines.append(False)
+                    else:
+                        # inserting is shorter than copying, so insert.
+                        new_lines.append(insert_instruction)
+                        new_lines.extend(lines[new_start:new_start+copy_len])
+                        index_lines.extend([False]*(copy_len + 1))
                     copying = False
                     new_start = pos
                     new_len = 1
@@ -186,18 +199,33 @@
                             start_byte = 0
                         else:
                             start_byte = self.line_offsets[copy_start - 1]
-                        new_lines.append("c,%d,%d\n" % (start_byte, stop_byte - start_byte))
+                        bytes = stop_byte - start_byte
+                        copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+                        insert_instruction = "i,%d\n" % copy_len
+                        if (bytes + len(insert_instruction) >
+                            len(copy_control_instruction)):
+                            new_lines.append(copy_control_instruction)
+                            index_lines.append(False)
+                        else:
+                            # inserting is shorter than copying, so insert.
+                            new_lines.append(insert_instruction)
+                            new_lines.extend(lines[new_start:new_start+copy_len])
+                            index_lines.extend([False]*(copy_len + 1))
                         copy_len = 1
                         copy_ends = next
+                        new_start = pos
                 else:
                     # Flush
                     if new_len:
                         new_lines.append("i,%d\n" % new_len)
                         new_lines.extend(lines[new_start:new_start+new_len])
+                        index_lines.append(False)
+                        index_lines.extend([True]*new_len)
                     # setup a copy
                     copy_len = 1
                     copy_ends = line_locations[line][1]
                     copying = True
+                    new_start = pos
             pos += 1
         if copying:
             copy_start = min(copy_ends) - copy_len
@@ -206,13 +234,25 @@
                 start_byte = 0
             else:
                 start_byte = self.line_offsets[copy_start - 1]
-            new_lines.append("c,%d,%d\n" % (start_byte, stop_byte - start_byte))
+            bytes = stop_byte - start_byte
+            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+            insert_instruction = "i,%d\n" % copy_len
+            if (bytes + len(insert_instruction) >
+                len(copy_control_instruction)):
+                new_lines.append(copy_control_instruction)
+                index_lines.append(False)
+            else:
+                # inserting is shorter than copying, so insert.
+                new_lines.append(insert_instruction)
+                new_lines.extend(lines[new_start:new_start+copy_len])
+                index_lines.extend([False]*(copy_len + 1))
         elif new_len:
             new_lines.append("i,%d\n" % new_len)
             new_lines.extend(lines[new_start:new_start+new_len])
-
+            index_lines.append(False)
+            index_lines.extend([True]*new_len)
         delta_start = (self.endpoint, len(self.lines))
-        self.output_lines(new_lines)
+        self.output_lines(new_lines, index_lines)
         trim_encoding_newline(lines)
         self.input_bytes += sum(map(len, lines))
         delta_end = (self.endpoint, len(self.lines))
@@ -235,17 +275,24 @@
         sha1 = sha_strings(lines)
         return lines, sha1
 
-    def output_lines(self, new_lines):
+    def output_lines(self, new_lines, index_lines):
+        """Output some lines.
+
+        :param new_lines: The lines to output.
+        :param index_lines: A boolean flag for each line - when True, index
+            that line.
+        """
         endpoint = self.endpoint
         offset = len(self.lines)
-        for pos, line in enumerate(new_lines):
+        for (pos, line), index in izip(enumerate(new_lines), index_lines):
             self.lines.append(line)
             endpoint += len(line)
             self.line_offsets.append(endpoint)
-            indices, next_lines = self.line_locations.setdefault(line,
-                (set(), set()))
-            indices.add(pos + offset)
-            next_lines.add(pos + offset + 1)
+            if index:
+                indices, next_lines = self.line_locations.setdefault(line,
+                    (set(), set()))
+                indices.add(pos + offset)
+                next_lines.add(pos + offset + 1)
         self.endpoint = endpoint
 
     def ratio(self):
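The output_lines() change above is what keeps the new cheap inserts from polluting the match index: every output line still lands in self.lines, but only lines flagged True in index_lines are recorded in line_locations, so control instructions and bytes that merely repeat already-indexed content never become sources for future copy ranges. A minimal stand-alone sketch of that filtering (index_flagged_lines is a hypothetical name; the izip fallback is only so the sketch runs outside Python 2):

    try:
        from itertools import izip  # Python 2, as in the plugin
    except ImportError:
        izip = zip  # Python 3 fallback

    def index_flagged_lines(line_locations, lines, flags, offset=0):
        # Mirror of the loop in output_lines(): only flagged lines are
        # added to the (indices, next_lines) sets used to find copies.
        for (pos, line), index in izip(enumerate(lines), flags):
            if index:
                indices, next_lines = line_locations.setdefault(
                    line, (set(), set()))
                indices.add(pos + offset)
                next_lines.add(pos + offset + 1)

    locations = {}
    index_flagged_lines(locations, ['i,1\n', 'new\n'], [False, True])
    # The control line stays unindexed; the inserted text is indexed.
    assert 'i,1\n' not in locations and 'new\n' in locations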
=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py 2008-07-15 15:48:18 +0000
+++ b/tests/test_groupcompress.py 2008-07-15 16:22:05 +0000
@@ -89,9 +89,9 @@
             # add the line different
             'i,1\n',
             'different\n',
-            # copy the line \n. Note that when we filter on encoding-overhead
-            # this will become a fresh insert instead
-            'c,79,1\n',
+            # Insert the trailing newline.
+            'i,1\n',
+            '\n'
             ])
         self.assertEqual(expected_lines, compressor.lines)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
@@ -120,9 +120,9 @@
             'c,72,7\n',
             # copy the lines different, moredifferent
             'c,154,24\n',
-            # copy the line \n. Note that when we filter on encoding-overhead
-            # this will become a fresh insert instead
-            'c,79,1\n',
+            # Insert the trailing newline.
+            'i,1\n',
+            '\n'
             ])
         self.assertEqualDiff(''.join(expected_lines), ''.join(compressor.lines))
         self.assertEqual(sum(map(len, expected_lines)), end_point)
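For reading the expected_lines above: 'c,<start>,<bytes>' copies a byte range out of the group's accumulated output, and 'i,<count>' inserts the next <count> lines verbatim, so the lone trailing newline now costs 'i,1\n' plus one byte (5 bytes) instead of the 7-byte 'c,79,1\n' copy reference. A toy reader for just these two instructions may make the expectations concrete; this is an illustrative assumption, not the plugin's actual decoder, which also deals with the label and sha1 header lines:

    def apply_instructions(source, delta_lines):
        # source: the accumulated group text that copy ranges refer into.
        out = []
        i = 0
        while i < len(delta_lines):
            op = delta_lines[i]
            i += 1
            if op.startswith('c,'):
                start, length = map(int, op[2:].split(','))
                out.append(source[start:start + length])
            elif op.startswith('i,'):
                count = int(op[2:])
                out.extend(delta_lines[i:i + count])
                i += count
        return ''.join(out)

    # The rewritten expectation: insert one line, the bare newline.
    assert apply_instructions('', ['i,1\n', '\n']) == '\n'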