Rev 2: Core proof of concept working. in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

Robert Collins robertc at robertcollins.net
Sun Jul 6 04:15:57 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

------------------------------------------------------------
revno: 2
revision-id: robertc at robertcollins.net-20080706031554-j10ybun5rn5fjiei
parent: robertc at robertcollins.net-20080705181540-mua3hkzj7c9qa5f6
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sun 2008-07-06 13:15:54 +1000
message:
  Core proof of concept working.
modified:
  groupcompress.py               groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  tests/test_groupcompress.py    test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2008-07-05 18:15:40 +0000
+++ b/groupcompress.py	2008-07-06 03:15:54 +0000
@@ -17,12 +17,116 @@
 
 """Core compression logic for compressing streams of related files."""
 
-
-from bzrlib import pack
+from bzrlib import diff, pack, patiencediff
 from bzrlib.knit import _DirectPackAccess
+from bzrlib.osutils import (
+    contains_whitespace,
+    contains_linebreaks,
+    sha_string,
+    sha_strings,
+    split_lines,
+    )
 from bzrlib.plugins.index2.repofmt import InMemoryBTree
-from bzrlib.versionedfile import VersionedFiles
-
+from bzrlib.versionedfile import (
+    FulltextContentFactory,
+    VersionedFiles,
+    )
+
+
+def parse(lines):
+    result = []
+    lines = iter(lines)
+    next = lines.next
+    print next(), next() # consume (and echo, for debugging) the label and sha1 lines
+    for header in lines:
+        start, end, count = [int(n) for n in header.split(',')]
+        contents = [next() for i in xrange(count)]
+        result.append((start, end, count, contents))
+    return result
+
+def apply_delta(basis, delta):
+    """Apply delta to this object to become new_version_id."""
+    lines = []
+    last_offset = 0
+    # eq ranges occur where gaps occur
+    # start, end refer to offsets in basis
+    for start, end, count, delta_lines in delta:
+        if last_offset != start: # copy an eq range
+            lines.extend(basis[last_offset:start])
+        lines[start:end] = delta_lines
+        last_offset = end
+    if last_offset != len(basis):
+        lines.extend(basis[last_offset:])
+    trim_encoding_newline(lines)
+    return lines
+
+
+def trim_encoding_newline(lines):
+    if lines[-1] == '\n':
+        del lines[-1]
+    else:
+        lines[-1] = lines[-1][:-1]
+
+
+class GroupCompressor(object):
+    """Produce a serialised group of compressed texts."""
+
+    def __init__(self, delta=True):
+        """Create a GroupCompressor.
+
+        :param delta: If False, do not compress records.
+        """
+        self._delta = delta
+        self.lines = []
+        self.endpoint = 0
+        self.input_bytes = 0
+
+    def compress(self, key, lines, expected_sha):
+        """Compress lines with label key.
+
+        :param key: A key tuple. It is stored in the output
+            for identification of the text during decompression.
+        :param lines: The lines to be compressed. Must be split
+            on \n, with the \n preserved.
+        :param expected_sha: If non-None, the sha the lines are believed to
+            have. During compression the sha is calculated; a mismatch will
+            cause an error.
+        :return: The sha1 of lines, and the number of bytes accumulated in
+            the group output so far.
+        """
+        sha1 = sha_strings(lines)
+        label = '\x00'.join(key)
+        # setup good encoding for trailing \n support.
+        if not lines or lines[-1].endswith('\n'):
+            lines.append('\n')
+        else:
+            lines[-1] = lines[-1] + '\n'
+        new_lines = []
+        new_lines.append('label: %s\n' % label)
+        new_lines.append('sha1: %s\n' % sha1)
+        if 0:
+            delta_seq = diff.difflib.SequenceMatcher(
+                None, self.lines, lines)
+        else:
+            delta_seq = patiencediff.PatienceSequenceMatcher(
+                None, self.lines, lines)
+        diff_hunks = []
+        for op in delta_seq.get_opcodes():
+            if op[0] == 'equal':
+                continue
+            diff_hunks.append((op[1], op[2], op[4]-op[3], lines[op[3]:op[4]]))
+        for start, end, count, new in diff_hunks:
+            new_lines.append('%d,%d,%d\n' % (start, end, count))
+            new_lines.extend(new)
+        self.endpoint += sum(map(len, new_lines))
+        self.lines.extend(new_lines)
+        trim_encoding_newline(lines)
+        self.input_bytes += sum(map(len, lines))
+        return sha1, self.endpoint
+
+    def ratio(self):
+        """Return the overall compression ratio."""
+        return float(self.input_bytes) / float(self.endpoint)
 
 def make_pack_factory(graph, delta, keylength):
     """Create a factory for creating a pack based groupcompress.
@@ -39,11 +143,6 @@
         ref_length = 0
         if graph:
             ref_length += 1
-        if delta:
-            ref_length += 1
-            max_delta_chain = 200
-        else:
-            max_delta_chain = 0
         graph_index = InMemoryBTree(reference_lists=ref_length,
             key_elements=keylength)
         stream = transport.open_write_stream('newpack')
@@ -53,8 +152,7 @@
             deltas=delta, add_callback=graph_index.add_nodes)
         access = _DirectPackAccess({})
         access.set_writer(writer, graph_index, (transport, 'newpack'))
-        result = GroupCompressVersionedFiles(index, access,
-            max_delta_chain=max_delta_chain)
+        result = GroupCompressVersionedFiles(index, access, delta)
         result.stream = stream
         result.writer = writer
         return result
@@ -69,13 +167,106 @@
 class GroupCompressVersionedFiles(VersionedFiles):
     """A group-compress based VersionedFiles implementation."""
 
-    def __init__(self, index, access, max_delta_chain=-1):
+    def __init__(self, index, access, delta=True):
         """Create a GroupCompressVersionedFiles object.
 
         :param index: The index object storing access and graph data.
         :param access: The access object storing raw data.
-        """
-
+        :param delta: Whether to delta compress or just entropy compress.
+        """
+        self._index = index
+        self._access = access
+        self._delta = delta
+
+    def add_lines(self, key, parents, lines, parent_texts=None,
+        left_matching_blocks=None, nostore_sha=None, random_id=False,
+        check_content=True):
+        """Add a text to the store.
+
+        :param key: The key tuple of the text to add.
+        :param parents: The parents key tuples of the text to add.
+        :param lines: A list of lines. Each line must be a bytestring. And all
+            of them except the last must be terminated with \n and contain no
+            other \n's. The last line may either contain no \n's or a single
+            terminating \n. If the lines list does not meet this constraint, the add
+            routine may error or may succeed - but you will be unable to read
+            the data back accurately. (Checking the lines have been split
+            correctly is expensive and extremely unlikely to catch bugs so it
+            is not done at runtime unless check_content is True.)
+        :param parent_texts: An optional dictionary containing the opaque 
+            representations of some or all of the parents of version_id to
+            allow delta optimisations.  VERY IMPORTANT: the texts must be those
+            returned by add_lines or data corruption can be caused.
+        :param left_matching_blocks: a hint about which areas are common
+            between the text and its left-hand-parent.  The format is
+            the SequenceMatcher.get_matching_blocks format.
+        :param nostore_sha: Raise ExistingContent and do not add the lines to
+            the versioned file if the digest of the lines matches this.
+        :param random_id: If True a random id has been selected rather than
+            an id determined by some deterministic process such as a converter
+            from a foreign VCS. When True the backend may choose not to check
+            for uniqueness of the resulting key within the versioned file, so
+            this should only be done when the result is expected to be unique
+            anyway.
+        :param check_content: If True, the lines supplied are verified to be
+            bytestrings that are correctly formed lines.
+        :return: The text sha1, the number of bytes in the text, and an opaque
+                 representation of the inserted version which can be provided
+                 back to future add_lines calls in the parent_texts dictionary.
+        """
+        self._index._check_write_ok()
+        self._check_add(key, lines, random_id, check_content)
+        if parents is None:
+            # The caller might pass None if there is no graph data, but kndx
+            # indexes can't directly store that, so we give them
+            # an empty tuple instead.
+            parents = ()
+        # Double handling for now; make it work first, optimise later.
+        bytes = ''.join(lines)
+        record = FulltextContentFactory(key, parents, None, bytes)
+        sha1 = self._insert_record_stream([record])
+        return sha1, len(bytes), None
+
+    def _check_add(self, key, lines, random_id, check_content):
+        """check that version_id and lines are safe to add."""
+        version_id = key[-1]
+        if contains_whitespace(version_id):
+            raise InvalidRevisionId(version_id, self)
+        self.check_not_reserved_id(version_id)
+        # TODO: If random_id==False and the key is already present, we should
+        # probably check that the existing content is identical to what is
+        # being inserted, and otherwise raise an exception.  This would make
+        # the bundle code simpler.
+        if check_content:
+            self._check_lines_not_unicode(lines)
+            self._check_lines_are_lines(lines)
+
+    def insert_record_stream(self, stream):
+        """Insert a record stream into this container.
+
+        :param stream: A stream of records to insert. 
+        :return: None
+        :seealso VersionedFiles.get_record_stream:
+        """
+        self._insert_record_stream(stream)
+
+    def _insert_record_stream(self, stream):
+        """Internal core to insert a record stream into this container.
+
+        This helper function has a different interface than insert_record_stream
+        to allow add_lines to be minimal, but still return the needed data.
+
+        :param stream: A stream of records to insert. 
+        :return: An iterator over the sha1 of the inserted records.
+        :seealso insert_record_stream:
+        :seealso add_lines:
+        """
+        compressor = GroupCompressor(self._delta)
+        # This will go up to fulltexts for gc to gc fetching, which isn't
+        # ideal.
+        for record in stream:
+            found_sha1, end_point = compressor.compress(record.key,
+                split_lines(record.get_bytes_as('fulltext')), record.sha1)
 
 class _GCGraphIndex(object):
     """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
@@ -108,3 +299,8 @@
         self.has_graph = parents
         self._is_locked = is_locked
 
+    def _check_write_ok(self):
+        """Assert if writes are not permitted."""
+        if not self._is_locked():
+            raise errors.ObjectNotLocked(self)
+
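
For readers following the wire format by eye: each record in the group is a
"label: ..." line and a "sha1: ..." line followed by hunks of the form
"start,end,count", where start/end are offsets into the basis (the group lines
so far) and count replacement lines follow. The snippet below is a standalone
paraphrase of apply_delta() and trim_encoding_newline() above, plain Python
with no bzrlib imports; the basis and delta values are taken from
test_two_nosha_delta in the tests below, with the sha1 value abbreviated.

    # Standalone paraphrase of apply_delta()/trim_encoding_newline() above;
    # start/end index into the basis lines, and the hunk's new lines replace
    # that range in the reconstructed text.
    def apply_delta(basis, delta):
        lines = []
        last_offset = 0
        for start, end, count, delta_lines in delta:
            if last_offset != start:        # an equal range: copy it from the basis
                lines.extend(basis[last_offset:start])
            lines[start:end] = delta_lines  # splice in the replacement lines
            last_offset = end
        if last_offset != len(basis):
            lines.extend(basis[last_offset:])
        # undo the trailing-\n sentinel that compress() appended
        if lines[-1] == '\n':
            del lines[-1]
        else:
            lines[-1] = lines[-1][:-1]
        return lines

    # The group after compressing ('label',) ['strange\n', 'common\n']
    # (sha1 abbreviated):
    basis = ['label: label\n', 'sha1: ...\n', '0,0,3\n',
             'strange\n', 'common\n', '\n']
    # The delta recorded for ('newlabel',) ['common\n', 'different\n'], as in
    # test_two_nosha_delta below: drop basis[0:4], keep basis[4:5] ('common\n'),
    # then insert 'different\n' at offset 5.
    delta = [(0, 4, 0, []), (5, 5, 1, ['different\n'])]
    assert apply_delta(basis, delta) == ['common\n', 'different\n']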

=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py	2008-07-05 18:15:40 +0000
+++ b/tests/test_groupcompress.py	2008-07-06 03:15:54 +0000
@@ -20,6 +20,7 @@
 import zlib
 
 from bzrlib import tests
+from bzrlib.osutils import sha_strings
 from bzrlib.plugins.groupcompress import errors, groupcompress
 from bzrlib.tests import (
     TestCaseWithTransport,
@@ -47,3 +48,111 @@
     return standard_tests
 
 
+class TestGroupCompressor(TestCaseWithTransport):
+    """Tests for GroupCompressor"""
+
+    def test_empty_delta(self):
+        compressor = groupcompress.GroupCompressor(True)
+        self.assertEqual([], compressor.lines)
+
+    def test_one_nosha_delta(self):
+        # diff against NULL
+        compressor = groupcompress.GroupCompressor(True)
+        sha1, end_point = compressor.compress(('label',),
+            ['strange\n', 'common\n'], None)
+        self.assertEqual(sha_strings(['strange\n', 'common\n']), sha1)
+        expected_lines = [
+            'label: label\n',
+            'sha1: %s\n' % sha1,
+            '0,0,3\n',
+            'strange\n',
+            'common\n',
+            '\n', # the last \n in a text is removed, which allows safe
+            # serialisation of lines without trailing \n.
+            ]
+        self.assertEqual(expected_lines, compressor.lines)
+        self.assertEqual(sum(map(len, expected_lines)), end_point)
+
+    def test_two_nosha_delta(self):
+        compressor = groupcompress.GroupCompressor(True)
+        sha1_1, _ = compressor.compress(('label',),
+            ['strange\n', 'common\n'], None)
+        sha1_2, end_point = compressor.compress(('newlabel',),
+            ['common\n', 'different\n'], None)
+        self.assertEqual(sha_strings(['common\n', 'different\n']), sha1_2)
+        expected_lines = [
+            'label: label\n',
+            'sha1: %s\n' % sha1_1,
+            '0,0,3\n',
+            'strange\n',
+            'common\n',
+            '\n',
+            'label: newlabel\n',
+            'sha1: %s\n' % sha1_2,
+            # Delete what we don't want. Perhaps we want an implicit
+            # delete all to keep from bloating with useless delete
+            # instructions.
+            '0,4,0\n',
+            # add the new lines
+            '5,5,1\n',
+            'different\n',
+            ]
+        self.assertEqual(expected_lines, compressor.lines)
+        self.assertEqual(sum(map(len, expected_lines)), end_point)
+
+    def test_three_nosha_delta(self):
+        # The first interesting test: make a change that should use lines from
+        # both parents.
+        compressor = groupcompress.GroupCompressor(True)
+        sha1_1, end_point = compressor.compress(('label',),
+            ['strange\n', 'common\n'], None)
+        sha1_2, _ = compressor.compress(('newlabel',),
+            ['common\n', 'different\n', 'moredifferent\n'], None)
+        sha1_3, end_point = compressor.compress(('label3',),
+            ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
+        self.assertEqual(
+            sha_strings(['new\n', 'common\n', 'different\n', 'moredifferent\n']),
+            sha1_3)
+        expected_lines = [
+            'label: label\n',
+            'sha1: %s\n' % sha1_1,
+            '0,0,3\n',
+            'strange\n',
+            'common\n',
+            '\n',
+            'label: newlabel\n',
+            'sha1: %s\n' % sha1_2,
+            # Delete what we don't want. Perhaps we want an implicit
+            # delete all to keep from bloating with useless delete
+            # instructions.
+            '0,4,0\n',
+            # add the new lines
+            '5,5,2\n',
+            'different\n',
+            'moredifferent\n',
+            'label: label3\n',
+            'sha1: %s\n' % sha1_3,
+            # Delete what we don't want. Perhaps we want an implicit
+            # delete all to keep from bloating with useless delete
+            # instructions.
+            # replace 'strange' with 'new'
+            '0,4,1\n',
+            'new\n',
+            # delete from after common up to different
+            '5,10,0\n',
+            # add new \n
+            '12,12,1\n',
+            '\n',
+            ]
+        self.assertEqualDiff(''.join(expected_lines), ''.join(compressor.lines))
+        self.assertEqual(sum(map(len, expected_lines)), end_point)
+
+    def test_stats(self):
+        compressor = groupcompress.GroupCompressor(True)
+        compressor.compress(('label',),
+            ['strange\n', 'common\n'], None)
+        compressor.compress(('newlabel',),
+            ['common\n', 'different\n', 'moredifferent\n'], None)
+        compressor.compress(('label3',),
+            ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
+        self.assertAlmostEqual(0.3, compressor.ratio(), 1)
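
A back-of-the-envelope check of the 0.3 in test_stats: ratio() is
input_bytes / endpoint, i.e. raw text bytes over serialised group bytes, so at
this proof-of-concept stage the group is roughly three times larger than the
input, because the label and sha1 header lines dominate such tiny texts. The
byte counts below are hand-tallied from the expected_lines in
test_three_nosha_delta, assuming 40-character hex sha1 digests:

    # Hand-tallied from test_three_nosha_delta's expected_lines; each
    # 'sha1: ...\n' line is 47 bytes with a 40-hex digest.
    input_bytes = 15 + 31 + 35   # the three raw texts fed to compress()
    endpoint = 82 + 99 + 87      # the three serialised records in the group
    print(float(input_bytes) / endpoint)  # ~0.302, hence assertAlmostEqual(0.3, ..., 1)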



