Rev 4220: Clean up GroupCompressor.compress(). in file:///home/vila/src/bzr/experimental/gc-py-bbc/
Vincent Ladeuil
v.ladeuil+lp at free.fr
Mon Mar 30 16:18:29 BST 2009
At file:///home/vila/src/bzr/experimental/gc-py-bbc/
------------------------------------------------------------
revno: 4220
revision-id: v.ladeuil+lp at free.fr-20090330151829-s4dpgyurlshjvie0
parent: v.ladeuil+lp at free.fr-20090330123827-v4gwe3ki83lq51rm
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: python-groupcompress
timestamp: Mon 2009-03-30 17:18:29 +0200
message:
Clean up GroupCompressor.compress().
* bzrlib/tests/test_groupcompress.py:
GroupCompressor.compress() calls updated.
* bzrlib/groupcompress.py:
(_CommonGroupCompressor._compress): Get rid of the returned
length.
(PythonGroupCompressor._compress): Slight changes to match the
pyrex implementation more closely.
(GroupCompressVersionedFiles._insert_record_stream.flush):
GroupCompressor.compress() calls updated.
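
For reference, a minimal sketch of the call-site change, assuming a
bzrlib tree with this revision applied (all names below come from the
hunks that follow):

    from bzrlib.groupcompress import PythonGroupCompressor

    compressor = PythonGroupCompressor()
    # compress() now returns a 4-tuple: the trailing output length is
    # gone from the return value.
    sha1, start_point, end_point, kind = compressor.compress(
        ('label',), 'strange\ncommon\n', None)
    # A caller that still needs a size can derive it from the offsets,
    # as the updated delta_ratio comment in the groupcompress.py hunk
    # does:
    length = end_point - start_point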
=== modified file 'BRANCH.TODO'
--- a/BRANCH.TODO 2009-03-25 17:20:33 +0000
+++ b/BRANCH.TODO 2009-03-30 15:18:29 +0000
@@ -19,6 +19,3 @@
* Investigate sha1 overhead. Currently when doing 'bzr pack' we sha once on
extraction, and then one more time on insertion. Removing both calls saves
6s out of 32s for 'bzr pack' (and 8s=>5.5s for 'repository-details')
-
- * don't require the pyx.c version or ~600 tests errors out,
- write a pure python version instead
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-30 11:49:32 +0000
+++ b/bzrlib/groupcompress.py 2009-03-30 15:18:29 +0000
@@ -406,8 +406,8 @@
end_point = 0
for factory in self._factories:
bytes = factory.get_bytes_as('fulltext')
- (found_sha1, start_point, end_point, type,
- length) = compressor.compress(factory.key, bytes, factory.sha1)
+ (found_sha1, start_point, end_point,
+ type) = compressor.compress(factory.key, bytes, factory.sha1)
# Now update this factory with the new offsets, etc
factory.sha1 = found_sha1
factory._start = start_point
@@ -614,7 +614,8 @@
if key[-1] is None:
key = key[:-1] + ('sha1:' + sha1,)
- return self._compress(key, bytes, sha1, len(bytes) / 2, soft)
+ (sha1, start, end, type) = self._compress(key, bytes, sha1, len(bytes) / 2, soft)
+ return sha1, start, end, type
def _compress(self, key, bytes, sha1, max_delta_size, soft=False):
"""Compress lines with label key.
@@ -632,9 +633,8 @@
:param soft: Do a 'soft' compression. This means that we require larger
ranges to match to be considered for a copy command.
- :return: The sha1 of lines, the start and end offsets in the delta, the
- type ('fulltext' or 'delta') and the number of bytes accumulated in
- the group output so far.
+ :return: The sha1 of lines, the start and end offsets in the delta, and
+ the type ('fulltext' or 'delta').
"""
raise NotImplementedError(self._compress)
@@ -703,7 +703,7 @@
def __init__(self):
"""Create a GroupCompressor.
- :param delta: If False, do not compress records.
+ Used only if the pyrex version is not available.
"""
super(PythonGroupCompressor, self).__init__()
self._delta_index = LinesDeltaIndex([])
@@ -712,35 +712,34 @@
def _compress(self, key, bytes, sha1, max_delta_size, soft=False):
"""see _CommonGroupCompressor._compress"""
- bytes_length = len(bytes)
+ input_len = len(bytes)
new_lines = osutils.split_lines(bytes)
- out_lines, index_lines = self._delta_index.make_delta(new_lines,
- bytes_length=bytes_length, soft=soft)
+ out_lines, index_lines = self._delta_index.make_delta(
+ new_lines, bytes_length=input_len, soft=soft)
delta_length = sum(map(len, out_lines))
if delta_length > max_delta_size:
# The delta is longer than the fulltext, insert a fulltext
type = 'fulltext'
- out_lines = ['f', encode_base128_int(bytes_length)]
+ out_lines = ['f', encode_base128_int(input_len)]
out_lines.extend(new_lines)
index_lines = [False, False]
index_lines.extend([True] * len(new_lines))
- out_length = len(out_lines[1]) + bytes_length + 1
else:
# this is a worthy delta, output it
type = 'delta'
out_lines[0] = 'd'
# Update the delta_length to include those two encoded integers
out_lines[1] = encode_base128_int(delta_length)
- out_length = len(out_lines[3]) + 1 + delta_length
- start = self.endpoint # Before insertion
- chunk_start = len(self._delta_index.lines)
+ # Before insertion
+ start = self.endpoint
+ chunk_start = len(self.chunks)
self._delta_index.extend_lines(out_lines, index_lines)
self.endpoint = self._delta_index.endpoint
- self.input_bytes += bytes_length
- chunk_end = len(self._delta_index.lines)
+ self.input_bytes += input_len
+ chunk_end = len(self.chunks)
self.labels_deltas[key] = (start, chunk_start,
self.endpoint, chunk_end)
- return sha1, start, self.endpoint, type, out_length
+ return sha1, start, self.endpoint, type
class PyrexGroupCompressor(_CommonGroupCompressor):
@@ -784,14 +783,12 @@
type = 'fulltext'
enc_length = encode_base128_int(len(bytes))
len_mini_header = 1 + len(enc_length)
- length = len(bytes) + len_mini_header
self._delta_index.add_source(bytes, len_mini_header)
new_chunks = ['f', enc_length, bytes]
else:
type = 'delta'
enc_length = encode_base128_int(len(delta))
len_mini_header = 1 + len(enc_length)
- length = len(delta) + len_mini_header
new_chunks = ['d', enc_length, delta]
self._delta_index.add_delta_source(delta, len_mini_header)
# Before insertion
@@ -807,7 +804,7 @@
raise AssertionError('the delta index is out of sync'
'with the output lines %s != %s'
% (self._delta_index._source_offset, self.endpoint))
- return sha1, start, self.endpoint, type, length
+ return sha1, start, self.endpoint, type
def _output_chunks(self, new_chunks):
"""Output some chunks.
@@ -1321,7 +1318,6 @@
self._compressor = GroupCompressor()
last_prefix = None
- last_fulltext_len = None
max_fulltext_len = 0
max_fulltext_prefix = None
insert_manager = None
@@ -1335,8 +1331,8 @@
raise errors.RevisionNotPresent(record.key, self)
if random_id:
if record.key in inserted_keys:
- trace.note('Insert claimed random_id=True, but then inserted'
- ' %r two times', record.key)
+ trace.note('Insert claimed random_id=True,'
+ ' but then inserted %r two times', record.key)
continue
inserted_keys.add(record.key)
if reuse_blocks:
@@ -1379,11 +1375,11 @@
if max_fulltext_len < len(bytes):
max_fulltext_len = len(bytes)
max_fulltext_prefix = prefix
- (found_sha1, start_point, end_point, type,
- length) = self._compressor.compress(record.key,
- bytes, record.sha1, soft=soft,
- nostore_sha=nostore_sha)
- # delta_ratio = float(len(bytes)) / length
+ (found_sha1, start_point, end_point,
+ type) = self._compressor.compress(record.key,
+ bytes, record.sha1, soft=soft,
+ nostore_sha=nostore_sha)
+ # delta_ratio = float(len(bytes)) / (end_point - start_point)
# Check if we want to continue to include that text
if (prefix == max_fulltext_prefix
and end_point < 2 * max_fulltext_len):
@@ -1402,10 +1398,9 @@
self._compressor.pop_last()
flush()
max_fulltext_len = len(bytes)
- (found_sha1, start_point, end_point, type,
- length) = self._compressor.compress(record.key,
- bytes, record.sha1)
- last_fulltext_len = length
+ (found_sha1, start_point, end_point,
+ type) = self._compressor.compress(record.key, bytes,
+ record.sha1)
if record.key[-1] is None:
key = record.key[:-1] + ('sha1:' + found_sha1,)
else:
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-30 11:49:32 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-30 15:18:29 +0000
@@ -72,7 +72,7 @@
def test_one_nosha_delta(self):
# diff against NUKK
compressor = self.compressor()
- sha1, start_point, end_point, _, _ = compressor.compress(('label',),
+ sha1, start_point, end_point, _ = compressor.compress(('label',),
'strange\ncommon\n', None)
self.assertEqual(sha_string('strange\ncommon\n'), sha1)
expected_lines = 'f' '\x0f' 'strange\ncommon\n'
@@ -105,10 +105,10 @@
# Knit fetching will try to reconstruct texts locally which results in
# reading something that is in the compressor stream already.
compressor = self.compressor()
- sha1_1, _, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _ = compressor.compress(('label',),
'strange\ncommon long line\nthat needs a 16 byte match\n', None)
expected_lines = list(compressor.chunks)
- sha1_2, _, end_point, _, _ = compressor.compress(('newlabel',),
+ sha1_2, _, end_point, _ = compressor.compress(('newlabel',),
'common long line\nthat needs a 16 byte match\ndifferent\n', None)
# get the first out
self.assertEqual(('strange\ncommon long line\n'
@@ -146,10 +146,10 @@
def test_two_nosha_delta(self):
compressor = self.compressor()
- sha1_1, _, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _ = compressor.compress(('label',),
'strange\ncommon long line\nthat needs a 16 byte match\n', None)
expected_lines = list(compressor.chunks)
- sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
+ sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
'common long line\nthat needs a 16 byte match\ndifferent\n', None)
self.assertEqual(sha_string('common long line\n'
'that needs a 16 byte match\n'
@@ -171,12 +171,12 @@
# The first interesting test: make a change that should use lines from
# both parents.
compressor = self.compressor()
- sha1_1, _, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _ = compressor.compress(('label',),
'strange\ncommon very very long line\nwith some extra text\n', None)
- sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
+ sha1_2, _, _, _ = compressor.compress(('newlabel',),
'different\nmoredifferent\nand then some more\n', None)
expected_lines = list(compressor.chunks)
- sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
+ sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
'new\ncommon very very long line\nwith some extra text\n'
'different\nmoredifferent\nand then some more\n',
None)
@@ -225,10 +225,10 @@
def test_two_nosha_delta(self):
compressor = self.compressor()
- sha1_1, _, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _ = compressor.compress(('label',),
'strange\ncommon long line\nthat needs a 16 byte match\n', None)
expected_lines = list(compressor.chunks)
- sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
+ sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
'common long line\nthat needs a 16 byte match\ndifferent\n', None)
self.assertEqual(sha_string('common long line\n'
'that needs a 16 byte match\n'
@@ -250,12 +250,12 @@
# The first interesting test: make a change that should use lines from
# both parents.
compressor = self.compressor()
- sha1_1, _, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _ = compressor.compress(('label',),
'strange\ncommon very very long line\nwith some extra text\n', None)
- sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
+ sha1_2, _, _, _ = compressor.compress(('newlabel',),
'different\nmoredifferent\nand then some more\n', None)
expected_lines = list(compressor.chunks)
- sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
+ sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
'new\ncommon very very long line\nwith some extra text\n'
'different\nmoredifferent\nand then some more\n',
None)
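
As a usage note, the delta path under the new return shape looks like
this sketch (same assumptions as above; sha_string is taken to be
bzrlib.osutils.sha_string, since the test module's import block is not
part of this diff):

    from bzrlib.groupcompress import PythonGroupCompressor
    from bzrlib.osutils import sha_string

    compressor = PythonGroupCompressor()
    sha1_1, _, _, _ = compressor.compress(('label',),
        'strange\ncommon long line\nthat needs a 16 byte match\n', None)
    sha1_2, start_point, end_point, kind = compressor.compress(
        ('newlabel',),
        'common long line\nthat needs a 16 byte match\ndifferent\n', None)
    # The second text should compress as a delta against the first, as
    # test_two_nosha_delta asserts above.
    assert kind == 'delta'
    assert sha1_2 == sha_string(
        'common long line\nthat needs a 16 byte match\ndifferent\n')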