Rev 3924: Change the attribute from 'lines' to 'chunks' to make it more in http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

John Arbash Meinel john at arbash-meinel.com
Fri Mar 27 21:47:25 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam

------------------------------------------------------------
revno: 3924
revision-id: john at arbash-meinel.com-20090327214708-sy13r2m4cu0qn72k
parent: john at arbash-meinel.com-20090327212932-psi820dh5qc5zthq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: vilajam
timestamp: Fri 2009-03-27 16:47:08 -0500
message:
  Change the attribute from 'lines' to 'chunks' to make it more
  obvious that entries aren't guaranteed to end with '\n'.
  Also, remove more of the duplication between the 2 compressor
  implementations by moving shared code into the common base class.
-------------- next part --------------
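For readers who don't live in this code: the compressor lifecycle, as
exercised by the diff and tests below, is roughly compress() one or more
texts, optionally extract() them back out, then flush() to finish the
group. A rough usage sketch (hypothetical key and text; signatures as of
this revision):

    from bzrlib.groupcompress import PythonGroupCompressor

    compressor = PythonGroupCompressor()
    # compress() returns (sha1, start_point, end_point, type, length)
    sha1, start, end, kind, length = compressor.compress(
        ('key1',), 'some\ntext\n', None)
    # Texts can be pulled back out before the group is finished.
    text, _ = compressor.extract(('key1',))
    # flush() joins self.chunks into the block content; after this the
    # compressor should no longer be used.
    block = compressor.flush()
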
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-27 21:29:32 +0000
+++ b/bzrlib/groupcompress.py	2009-03-27 21:47:08 +0000
@@ -610,11 +610,12 @@
 
     def __init__(self):
         """Create a GroupCompressor."""
-        self.lines = []
+        self.chunks = []
         self._last = None
         self.endpoint = 0
         self.input_bytes = 0
         self.labels_deltas = {}
+        self._delta_index = None # Set by the children
         self._block = GroupCompressBlock()
 
     def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
@@ -688,7 +689,7 @@
         :return: An iterable over bytes and the sha1.
         """
         delta_details = self.labels_deltas[key]
-        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
+        delta_chunks = self.chunks[delta_details[0][1]:delta_details[1][1]]
         stored_bytes = ''.join(delta_chunks)
         # TODO: Fix this, we shouldn't really be peeking here
         entry = self._block._entries[key]
@@ -707,7 +708,7 @@
             if entry.type != 'delta':
                 raise ValueError('Unknown entry type: %s' % (entry.type,))
             # XXX: This is inefficient at best
-            source = ''.join(self.lines)
+            source = ''.join(self.chunks)
             if stored_bytes[0] != 'd':
                 raise ValueError('Entry type claims delta, bytes claim %s'
                                  % (stored_bytes[0],))
@@ -724,6 +725,17 @@
                              % (entry.sha1, bytes_sha1))
         return bytes, entry.sha1
 
+    def flush(self):
+        """Finish this group, creating a formatted stream.
+
+        After calling this, the compressor should no longer be used
+        """
+        content = ''.join(self.chunks)
+        self.chunks = None
+        self._delta_index = None
+        self._block.set_content(content)
+        return self._block
+
     def pop_last(self):
         """Call this if you want to 'revoke' the last compression.
 
@@ -731,7 +743,7 @@
         more compression.
         """
         self._delta_index = None
-        del self.lines[self._last[0]:]
+        del self.chunks[self._last[0]:]
         self.endpoint = self._last[1]
         self._last = None
 
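
For context on what extract() is parsing above: each entry in the group
is framed as a one-byte type ('f' for fulltext, 'd' for delta) followed
by a base-128 varint giving the payload length, then the payload itself.
A minimal standalone sketch of that framing (local reimplementations for
illustration; the real code uses encode_base128_int/decode_base128_int,
see the import hunk near the end of this diff):

    def encode_base128_int(val):
        """Encode an integer 7 bits at a time, low bits first."""
        data = []
        while val >= 0x80:
            data.append(chr((val & 0x7F) | 0x80))
            val >>= 7
        data.append(chr(val))
        return ''.join(data)

    def decode_base128_int(data):
        """Return (value, number_of_bytes_consumed)."""
        offset = val = shift = 0
        while True:
            byte = ord(data[offset])
            offset += 1
            val |= (byte & 0x7F) << shift
            if not byte & 0x80:
                return val, offset
            shift += 7

    # Frame a fulltext the way extract() expects to find it:
    text = 'strange\ncommon\n'
    stored = 'f' + encode_base128_int(len(text)) + text
    length, offset = decode_base128_int(stored[1:10])
    assert length + 1 + offset == len(stored)
    assert stored[offset + 1:] == text
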
@@ -748,15 +760,15 @@
         :param delta: If False, do not compress records.
         """
         super(PythonGroupCompressor, self).__init__()
-        self.line_locations = LinesDeltaIndex([])
-        self.lines = self.line_locations.lines
-        self._present_prefixes = set()
+        self._delta_index = LinesDeltaIndex([])
+        # The actual content is managed by LinesDeltaIndex
+        self.chunks = self._delta_index.lines
 
     def _compress(self, key, bytes, sha1, max_delta_size, soft=False):
         """see _CommonGroupCompressor._compress"""
         bytes_length = len(bytes)
         new_lines = osutils.split_lines(bytes)
-        out_lines, index_lines = self.line_locations.make_delta(new_lines,
+        out_lines, index_lines = self._delta_index.make_delta(new_lines,
             bytes_length=bytes_length, soft=soft)
         delta_length = sum(map(len, out_lines))
         if delta_length > max_delta_size:
@@ -777,35 +789,14 @@
         self._block.add_entry(key, type=type, sha1=sha1,
                               start=self.endpoint, length=out_length)
         start = self.endpoint # Before insertion
-        delta_start = (self.endpoint, len(self.lines))
-        self.output_lines(out_lines, index_lines)
+        delta_start = (start, len(self._delta_index.lines))
+        self._delta_index.extend_lines(out_lines, index_lines)
+        self.endpoint = self._delta_index.endpoint
         self.input_bytes += bytes_length
-        delta_end = (self.endpoint, len(self.lines))
+        delta_end = (self.endpoint, len(self._delta_index.lines))
         self.labels_deltas[key] = (delta_start, delta_end)
         return sha1, start, self.endpoint, type, out_length
 
-    def flush(self):
-        self._block.set_content(''.join(self.lines))
-        return self._block
-
-    def output_lines(self, new_lines, index_lines):
-        """Output some lines.
-
-        :param new_lines: The lines to output.
-        :param index_lines: A boolean flag for each line - when True, index
-            that line.
-        """
-        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
-        #                          if val and new_lines[idx] == '\n']
-        # if indexed_newlines:
-        #     import pdb; pdb.set_trace()
-        self._last = (len(self.lines), self.endpoint)
-        endpoint = self.endpoint
-        self.line_locations.extend_lines(new_lines, index_lines)
-        for line in new_lines:
-            endpoint += len(line)
-        self.endpoint = endpoint
-
 
 class PyrexGroupCompressor(_CommonGroupCompressor):
     """Produce a serialised group of compressed texts.
@@ -825,7 +816,6 @@
 
     def __init__(self):
         super(PyrexGroupCompressor, self).__init__()
-        self.num_keys = 0
         self._delta_index = DeltaIndex()
 
     def _compress(self, key, bytes, sha1, max_delta_size, soft=False):
@@ -862,11 +852,10 @@
         self._block.add_entry(key, type=type, sha1=sha1,
                               start=self.endpoint, length=length)
         start = self.endpoint # Before insertion
-        delta_start = (self.endpoint, len(self.lines))
-        self.num_keys += 1
-        self.output_chunks(new_chunks)
+        delta_start = (self.endpoint, len(self.chunks))
+        self._output_chunks(new_chunks)
         self.input_bytes += input_len
-        delta_end = (self.endpoint, len(self.lines))
+        delta_end = (self.endpoint, len(self.chunks))
         self.labels_deltas[key] = (delta_start, delta_end)
         if not self._delta_index._source_offset == self.endpoint:
             raise AssertionError('the delta index is out of sync'
@@ -874,64 +863,14 @@
                 % (self._delta_index._source_offset, self.endpoint))
         return sha1, start, self.endpoint, type, length
 
-    def extract(self, key):
-        """Extract a key previously added to the compressor.
-
-        :param key: The key to extract.
-        :return: An iterable over bytes and the sha1.
-        """
-        delta_details = self.labels_deltas[key]
-        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
-        stored_bytes = ''.join(delta_chunks)
-        # TODO: Fix this, we shouldn't really be peeking here
-        entry = self._block._entries[key]
-        if entry.type == 'fulltext':
-            if stored_bytes[0] != 'f':
-                raise ValueError('Index claimed fulltext, but stored bytes'
-                                 ' indicate %s' % (stored_bytes[0],))
-            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
-            if fulltext_len + 1 + offset != len(stored_bytes):
-                raise ValueError('Index claimed fulltext len, but stored bytes'
-                                 ' claim %s != %s'
-                                 % (len(stored_bytes),
-                                    fulltext_len + 1 + offset))
-            bytes = stored_bytes[offset + 1:]
-        else:
-            if entry.type != 'delta':
-                raise ValueError('Unknown entry type: %s' % (entry.type,))
-            # XXX: This is inefficient at best
-            source = ''.join(self.lines)
-            if stored_bytes[0] != 'd':
-                raise ValueError('Entry type claims delta, bytes claim %s'
-                                 % (stored_bytes[0],))
-            delta_len, offset = decode_base128_int(stored_bytes[1:10])
-            if delta_len + 1 + offset != len(stored_bytes):
-                raise ValueError('Index claimed delta len, but stored bytes'
-                                 ' claim %s != %s'
-                                 % (len(stored_bytes),
-                                    delta_len + 1 + offset))
-            bytes = apply_delta(source, stored_bytes[offset + 1:])
-        bytes_sha1 = osutils.sha_string(bytes)
-        if entry.sha1 != bytes_sha1:
-            raise ValueError('Recorded sha1 != measured %s != %s'
-                             % (entry.sha1, bytes_sha1))
-        return bytes, entry.sha1
-
-    def flush(self):
-        """Finish this group, creating a formatted stream."""
-        content = ''.join(self.lines)
-        self.lines = None
-        self._block.set_content(content)
-        return self._block
-
-    def output_chunks(self, new_chunks):
+    def _output_chunks(self, new_chunks):
         """Output some chunks.
 
         :param new_chunks: The chunks to output.
         """
-        self._last = (len(self.lines), self.endpoint)
+        self._last = (len(self.chunks), self.endpoint)
         endpoint = self.endpoint
-        self.lines.extend(new_chunks)
+        self.chunks.extend(new_chunks)
         endpoint += sum(map(len, new_chunks))
         self.endpoint = endpoint
 
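
The _last bookkeeping in _output_chunks is what makes pop_last() (in the
base class, above) work: before each append the compressor records
(number of chunks, byte endpoint), so revoking the last compression is
just a list truncation plus an endpoint reset. The pattern in isolation
(a hypothetical stand-alone class, not part of the diff):

    class ChunkBuffer(object):
        """Append chunk lists, remembering enough to undo the last append."""

        def __init__(self):
            self.chunks = []
            self.endpoint = 0   # total bytes appended so far
            self._last = None   # (chunk count, endpoint) before last append

        def append(self, new_chunks):
            self._last = (len(self.chunks), self.endpoint)
            self.chunks.extend(new_chunks)
            self.endpoint += sum(map(len, new_chunks))

        def pop_last(self):
            """Revoke the most recent append()."""
            del self.chunks[self._last[0]:]
            self.endpoint = self._last[1]
            self._last = None

    buf = ChunkBuffer()
    buf.append(['abc', 'de'])
    buf.append(['fgh'])
    buf.pop_last()
    assert buf.chunks == ['abc', 'de'] and buf.endpoint == 5
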
@@ -1504,32 +1443,6 @@
                 start_new_block = True
             else:
                 start_new_block = False
-            # if type == 'fulltext':
-            #     # If this is the first text, we don't do anything
-            #     if self._compressor.num_keys > 1:
-            #         if prefix is not None and prefix != last_prefix:
-            #             # We just inserted a fulltext for a different prefix
-            #             # (aka file-id).
-            #             if end_point > 512 * 1024:
-            #                 start_new_block = True
-            #             # TODO: Consider packing several small texts together
-            #             #       maybe only flush if end_point > some threshold
-            #             # if end_point > 512 * 1024 or len(bytes) <
-            #             #     start_new_block = true
-            #         else:
-            #             # We just added a fulltext, part of the same file-id
-            #             if (end_point > 2*1024*1024
-            #                 and end_point > 5*max_fulltext_len):
-            #                 start_new_block = True
-            #     last_fulltext_len = len(bytes)
-            # else:
-            #     delta_ratio = float(len(bytes)) / length
-            #     if delta_ratio < 3: # Not much compression
-            #         if end_point > 1*1024*1024:
-            #             start_new_block = True
-            #     elif delta_ratio < 10: # 10:1 compression
-            #         if end_point > 4*1024*1024:
-            #             start_new_block = True
             last_prefix = prefix
             if start_new_block:
                 self._compressor.pop_last()
@@ -1783,7 +1696,6 @@
     apply_delta,
     encode_base128_int,
     decode_base128_int,
-    encode_copy_instruction,
     LinesDeltaIndex,
     )
 try:

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-27 20:12:12 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-27 21:47:08 +0000
@@ -67,7 +67,7 @@
 
     def test_empty_delta(self):
         compressor = self.compressor()
-        self.assertEqual([], compressor.lines)
+        self.assertEqual([], compressor.chunks)
 
     def test_one_nosha_delta(self):
         # diff against NUKK
@@ -76,7 +76,7 @@
             'strange\ncommon\n', None)
         self.assertEqual(sha_string('strange\ncommon\n'), sha1)
         expected_lines = 'f' '\x0f' 'strange\ncommon\n'
-        self.assertEqual(expected_lines, ''.join(compressor.lines))
+        self.assertEqual(expected_lines, ''.join(compressor.chunks))
         self.assertEqual(0, start_point)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
 
@@ -90,7 +90,7 @@
         self.assertEqual('fulltext', kind)
         self.assertEqual(groupcompress._null_sha1, sha1)
         self.assertEqual(0, compressor.endpoint)
-        self.assertEqual([], compressor.lines)
+        self.assertEqual([], compressor.chunks)
         # Even after adding some content
         compressor.compress(('content',), 'some\nbytes\n', None)
         self.assertTrue(compressor.endpoint > 0)
@@ -107,7 +107,7 @@
         compressor = self.compressor()
         sha1_1, _, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
-        expected_lines = list(compressor.lines)
+        expected_lines = list(compressor.chunks)
         sha1_2, _, end_point, _, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         # get the first out
@@ -148,7 +148,7 @@
         compressor = self.compressor()
         sha1_1, _, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
-        expected_lines = list(compressor.lines)
+        expected_lines = list(compressor.chunks)
         sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         self.assertEqual(sha_string('common long line\n'
@@ -164,7 +164,7 @@
             # add the line different, and the trailing newline
             '\x0adifferent\n', # insert 10 bytes
             ])
-        self.assertEqualDiffEncoded(expected_lines, compressor.lines)
+        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
 
     def test_three_nosha_delta(self):
@@ -175,7 +175,7 @@
             'strange\ncommon very very long line\nwith some extra text\n', None)
         sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
             'different\nmoredifferent\nand then some more\n', None)
-        expected_lines = list(compressor.lines)
+        expected_lines = list(compressor.chunks)
         sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
             'new\ncommon very very long line\nwith some extra text\n'
             'different\nmoredifferent\nand then some more\n',
@@ -196,7 +196,7 @@
             # Copy of second parent 'different' range
             '\x91\x3c\x2b' # copy, offset 0x3c, 0x2b bytes
             ])
-        self.assertEqualDiffEncoded(expected_lines, compressor.lines)
+        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
 
 
@@ -227,7 +227,7 @@
         compressor = self.compressor()
         sha1_1, _, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
-        expected_lines = list(compressor.lines)
+        expected_lines = list(compressor.chunks)
         sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         self.assertEqual(sha_string('common long line\n'
@@ -243,7 +243,7 @@
             # add the line different, and the trailing newline
             '\x0adifferent\n', # insert 10 bytes
             ])
-        self.assertEqualDiffEncoded(expected_lines, compressor.lines)
+        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
 
     def test_three_nosha_delta(self):
@@ -254,7 +254,7 @@
             'strange\ncommon very very long line\nwith some extra text\n', None)
         sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
             'different\nmoredifferent\nand then some more\n', None)
-        expected_lines = list(compressor.lines)
+        expected_lines = list(compressor.chunks)
         sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
             'new\ncommon very very long line\nwith some extra text\n'
             'different\nmoredifferent\nand then some more\n',
@@ -275,7 +275,7 @@
             # Copy of second parent 'different' range
             '\x91\x3c\x2b' # copy, offset 0x3c, 0x2b bytes
             ])
-        self.assertEqualDiffEncoded(expected_lines, compressor.lines)
+        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
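
The instruction bytes annotated in these expected strings appear to
follow a git-style scheme: a byte with the high bit set is a copy
instruction whose low four bits select offset bytes and next three bits
select length bytes (little-endian), while a byte in 1..127 means
"insert that many literal bytes that follow". A hedged decoder sketch of
just such an instruction stream (my own helper; it ignores the record
and length headers that the real apply_delta handles):

    def apply_delta_sketch(source, delta):
        out = []
        pos = 0
        while pos < len(delta):
            cmd = ord(delta[pos])
            pos += 1
            if cmd & 0x80:
                # Copy: gather the offset/length bytes flagged in cmd.
                offset = length = 0
                for i in range(4):
                    if cmd & (1 << i):
                        offset |= ord(delta[pos]) << (i * 8)
                        pos += 1
                for i in range(3):
                    if cmd & (1 << (4 + i)):
                        length |= ord(delta[pos]) << (i * 8)
                        pos += 1
                out.append(source[offset:offset + length])
            else:
                # Insert: cmd literal bytes follow the command byte.
                out.append(delta[pos:pos + cmd])
                pos += cmd
        return ''.join(out)

    # '\x91\x3c\x2b' above: cmd 0x91 -> one offset byte (0x3c) and one
    # length byte (0x2b), i.e. copy 0x2b bytes from source offset 0x3c.
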
 
 


