Rev 4220: Clean up GroupCompressor.compress(). in file:///home/vila/src/bzr/experimental/gc-py-bbc/

Vincent Ladeuil v.ladeuil+lp at free.fr
Mon Mar 30 16:18:29 BST 2009


At file:///home/vila/src/bzr/experimental/gc-py-bbc/

------------------------------------------------------------
revno: 4220
revision-id: v.ladeuil+lp at free.fr-20090330151829-s4dpgyurlshjvie0
parent: v.ladeuil+lp at free.fr-20090330123827-v4gwe3ki83lq51rm
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: python-groupcompress
timestamp: Mon 2009-03-30 17:18:29 +0200
message:
  Clean up GroupCompressor.compress().
  
  * bzrlib/tests/test_groupcompress.py:
  GroupCompressor.compress() calls updated.
  
  * bzrlib/groupcompress.py:
  (_CommonGroupCompressor._compress): Get rid of the returned
  length.
  (PythonGroupCompressor._compress): Slight changes to match the
  pyrex implementation more closely.
  (GroupCompressVersionedFiles._insert_record_stream.flush):
  GroupCompressor.compress() calls updated.
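
The net effect for callers is that compress() now returns a 4-tuple
(sha1, start_point, end_point, type) instead of the old 5-tuple ending
with the output length; where that length is still wanted it can be
recomputed from the returned offsets, as the updated delta_ratio comment
in the diff below illustrates. A minimal sketch of the new calling
convention (it assumes a bzrlib tree containing this revision is
importable; the key and text are purely illustrative and taken from the
test suite):

    from bzrlib.groupcompress import GroupCompressor

    compressor = GroupCompressor()
    text = 'strange\ncommon\n'
    # The trailing length value is gone; only these four are returned
    # ('kind' is named 'type' in the bzrlib code).
    sha1, start_point, end_point, kind = compressor.compress(
        ('label',), text, None)
    # The first text in a group is stored as a fulltext.
    assert kind == 'fulltext'
    # This record's contribution to the group output can still be
    # derived from the offsets if needed:
    record_length = end_point - start_point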
-------------- next part --------------
=== modified file 'BRANCH.TODO'
--- a/BRANCH.TODO	2009-03-25 17:20:33 +0000
+++ b/BRANCH.TODO	2009-03-30 15:18:29 +0000
@@ -19,6 +19,3 @@
  * Investigate sha1 overhead. Currently when doing 'bzr pack' we sha once on
    extraction, and then one more time on insertion. Removing both calls saves
    6s out of 32s for 'bzr pack' (and 8s=>5.5s for 'repository-details')
-
- * don't require the pyx.c version or ~600 tests errors out,
-   write a pure python version instead

=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-30 11:49:32 +0000
+++ b/bzrlib/groupcompress.py	2009-03-30 15:18:29 +0000
@@ -406,8 +406,8 @@
         end_point = 0
         for factory in self._factories:
             bytes = factory.get_bytes_as('fulltext')
-            (found_sha1, start_point, end_point, type,
-             length) = compressor.compress(factory.key, bytes, factory.sha1)
+            (found_sha1, start_point, end_point,
+             type) = compressor.compress(factory.key, bytes, factory.sha1)
             # Now update this factory with the new offsets, etc
             factory.sha1 = found_sha1
             factory._start = start_point
@@ -614,7 +614,8 @@
         if key[-1] is None:
             key = key[:-1] + ('sha1:' + sha1,)
 
-        return self._compress(key, bytes, sha1, len(bytes) / 2, soft)
+        (sha1, start, end, type) = self._compress(key, bytes, sha1, len(bytes) / 2, soft)
+        return sha1, start, end, type
 
     def _compress(self, key, bytes, sha1, max_delta_size, soft=False):
         """Compress lines with label key.
@@ -632,9 +633,8 @@
         :param soft: Do a 'soft' compression. This means that we require larger
             ranges to match to be considered for a copy command.
 
-        :return: The sha1 of lines, the start and end offsets in the delta, the
-            type ('fulltext' or 'delta') and the number of bytes accumulated in
-            the group output so far.
+        :return: The sha1 of lines, the start and end offsets in the delta, and
+            the type ('fulltext' or 'delta').
         """
         raise NotImplementedError(self._compress)
 
@@ -703,7 +703,7 @@
     def __init__(self):
         """Create a GroupCompressor.
 
-        :param delta: If False, do not compress records.
+        Used only if the pyrex version is not available.
         """
         super(PythonGroupCompressor, self).__init__()
         self._delta_index = LinesDeltaIndex([])
@@ -712,35 +712,34 @@
 
     def _compress(self, key, bytes, sha1, max_delta_size, soft=False):
         """see _CommonGroupCompressor._compress"""
-        bytes_length = len(bytes)
+        input_len = len(bytes)
         new_lines = osutils.split_lines(bytes)
-        out_lines, index_lines = self._delta_index.make_delta(new_lines,
-            bytes_length=bytes_length, soft=soft)
+        out_lines, index_lines = self._delta_index.make_delta(
+            new_lines, bytes_length=input_len, soft=soft)
         delta_length = sum(map(len, out_lines))
         if delta_length > max_delta_size:
             # The delta is longer than the fulltext, insert a fulltext
             type = 'fulltext'
-            out_lines = ['f', encode_base128_int(bytes_length)]
+            out_lines = ['f', encode_base128_int(input_len)]
             out_lines.extend(new_lines)
             index_lines = [False, False]
             index_lines.extend([True] * len(new_lines))
-            out_length = len(out_lines[1]) + bytes_length + 1
         else:
             # this is a worthy delta, output it
             type = 'delta'
             out_lines[0] = 'd'
             # Update the delta_length to include those two encoded integers
             out_lines[1] = encode_base128_int(delta_length)
-            out_length = len(out_lines[3]) + 1 + delta_length
-        start = self.endpoint # Before insertion
-        chunk_start = len(self._delta_index.lines)
+        # Before insertion
+        start = self.endpoint
+        chunk_start = len(self.chunks)
         self._delta_index.extend_lines(out_lines, index_lines)
         self.endpoint = self._delta_index.endpoint
-        self.input_bytes += bytes_length
-        chunk_end = len(self._delta_index.lines)
+        self.input_bytes += input_len
+        chunk_end = len(self.chunks)
         self.labels_deltas[key] = (start, chunk_start,
                                    self.endpoint, chunk_end)
-        return sha1, start, self.endpoint, type, out_length
+        return sha1, start, self.endpoint, type
 
 
 class PyrexGroupCompressor(_CommonGroupCompressor):
@@ -784,14 +783,12 @@
             type = 'fulltext'
             enc_length = encode_base128_int(len(bytes))
             len_mini_header = 1 + len(enc_length)
-            length = len(bytes) + len_mini_header
             self._delta_index.add_source(bytes, len_mini_header)
             new_chunks = ['f', enc_length, bytes]
         else:
             type = 'delta'
             enc_length = encode_base128_int(len(delta))
             len_mini_header = 1 + len(enc_length)
-            length = len(delta) + len_mini_header
             new_chunks = ['d', enc_length, delta]
             self._delta_index.add_delta_source(delta, len_mini_header)
         # Before insertion
@@ -807,7 +804,7 @@
             raise AssertionError('the delta index is out of sync'
                 'with the output lines %s != %s'
                 % (self._delta_index._source_offset, self.endpoint))
-        return sha1, start, self.endpoint, type, length
+        return sha1, start, self.endpoint, type
 
     def _output_chunks(self, new_chunks):
         """Output some chunks.
@@ -1321,7 +1318,6 @@
             self._compressor = GroupCompressor()
 
         last_prefix = None
-        last_fulltext_len = None
         max_fulltext_len = 0
         max_fulltext_prefix = None
         insert_manager = None
@@ -1335,8 +1331,8 @@
                 raise errors.RevisionNotPresent(record.key, self)
             if random_id:
                 if record.key in inserted_keys:
-                    trace.note('Insert claimed random_id=True, but then inserted'
-                               ' %r two times', record.key)
+                    trace.note('Insert claimed random_id=True,'
+                               ' but then inserted %r two times', record.key)
                     continue
                 inserted_keys.add(record.key)
             if reuse_blocks:
@@ -1379,11 +1375,11 @@
             if max_fulltext_len < len(bytes):
                 max_fulltext_len = len(bytes)
                 max_fulltext_prefix = prefix
-            (found_sha1, start_point, end_point, type,
-             length) = self._compressor.compress(record.key,
-                bytes, record.sha1, soft=soft,
-                nostore_sha=nostore_sha)
-            # delta_ratio = float(len(bytes)) / length
+            (found_sha1, start_point, end_point,
+             type) = self._compressor.compress(record.key,
+                                               bytes, record.sha1, soft=soft,
+                                               nostore_sha=nostore_sha)
+            # delta_ratio = float(len(bytes)) / (end_point - start_point)
             # Check if we want to continue to include that text
             if (prefix == max_fulltext_prefix
                 and end_point < 2 * max_fulltext_len):
@@ -1402,10 +1398,9 @@
                 self._compressor.pop_last()
                 flush()
                 max_fulltext_len = len(bytes)
-                (found_sha1, start_point, end_point, type,
-                 length) = self._compressor.compress(record.key,
-                    bytes, record.sha1)
-                last_fulltext_len = length
+                (found_sha1, start_point, end_point,
+                 type) = self._compressor.compress(record.key, bytes,
+                                                   record.sha1)
             if record.key[-1] is None:
                 key = record.key[:-1] + ('sha1:' + found_sha1,)
             else:

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-30 11:49:32 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-30 15:18:29 +0000
@@ -72,7 +72,7 @@
     def test_one_nosha_delta(self):
         # diff against NUKK
         compressor = self.compressor()
-        sha1, start_point, end_point, _, _ = compressor.compress(('label',),
+        sha1, start_point, end_point, _ = compressor.compress(('label',),
             'strange\ncommon\n', None)
         self.assertEqual(sha_string('strange\ncommon\n'), sha1)
         expected_lines = 'f' '\x0f' 'strange\ncommon\n'
@@ -105,10 +105,10 @@
         # Knit fetching will try to reconstruct texts locally which results in
         # reading something that is in the compressor stream already.
         compressor = self.compressor()
-        sha1_1, _, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
         expected_lines = list(compressor.chunks)
-        sha1_2, _, end_point, _, _ = compressor.compress(('newlabel',),
+        sha1_2, _, end_point, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         # get the first out
         self.assertEqual(('strange\ncommon long line\n'
@@ -146,10 +146,10 @@
 
     def test_two_nosha_delta(self):
         compressor = self.compressor()
-        sha1_1, _, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
         expected_lines = list(compressor.chunks)
-        sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
+        sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         self.assertEqual(sha_string('common long line\n'
                                     'that needs a 16 byte match\n'
@@ -171,12 +171,12 @@
         # The first interesting test: make a change that should use lines from
         # both parents.
         compressor = self.compressor()
-        sha1_1, _, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _ = compressor.compress(('label',),
             'strange\ncommon very very long line\nwith some extra text\n', None)
-        sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
+        sha1_2, _, _, _ = compressor.compress(('newlabel',),
             'different\nmoredifferent\nand then some more\n', None)
         expected_lines = list(compressor.chunks)
-        sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
+        sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
             'new\ncommon very very long line\nwith some extra text\n'
             'different\nmoredifferent\nand then some more\n',
             None)
@@ -225,10 +225,10 @@
 
     def test_two_nosha_delta(self):
         compressor = self.compressor()
-        sha1_1, _, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
         expected_lines = list(compressor.chunks)
-        sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
+        sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         self.assertEqual(sha_string('common long line\n'
                                     'that needs a 16 byte match\n'
@@ -250,12 +250,12 @@
         # The first interesting test: make a change that should use lines from
         # both parents.
         compressor = self.compressor()
-        sha1_1, _, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _ = compressor.compress(('label',),
             'strange\ncommon very very long line\nwith some extra text\n', None)
-        sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
+        sha1_2, _, _, _ = compressor.compress(('newlabel',),
             'different\nmoredifferent\nand then some more\n', None)
         expected_lines = list(compressor.chunks)
-        sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
+        sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
             'new\ncommon very very long line\nwith some extra text\n'
             'different\nmoredifferent\nand then some more\n',
             None)


