Rev 2: Core proof of concept working. in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
Robert Collins
robertc at robertcollins.net
Sun Jul 6 04:15:57 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
------------------------------------------------------------
revno: 2
revision-id: robertc at robertcollins.net-20080706031554-j10ybun5rn5fjiei
parent: robertc at robertcollins.net-20080705181540-mua3hkzj7c9qa5f6
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sun 2008-07-06 13:15:54 +1000
message:
Core proof of concept working.
modified:
groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2008-07-05 18:15:40 +0000
+++ b/groupcompress.py 2008-07-06 03:15:54 +0000
@@ -17,12 +17,116 @@
"""Core compression logic for compressing streams of related files."""
-
-from bzrlib import pack
+from bzrlib import diff, errors, pack, patiencediff
from bzrlib.knit import _DirectPackAccess
+from bzrlib.osutils import (
+ contains_whitespace,
+ contains_linebreaks,
+ sha_string,
+ sha_strings,
+ split_lines,
+ )
from bzrlib.plugins.index2.repofmt import InMemoryBTree
-from bzrlib.versionedfile import VersionedFiles
-
+from bzrlib.versionedfile import (
+ FulltextContentFactory,
+ VersionedFiles,
+ )
+
+
+def parse(lines):
+    """Parse a serialised delta block into (start, end, count, lines) hunks."""
+    result = []
+    lines = iter(lines)
+    next = lines.next
+    # The first two lines are the 'label:' and 'sha1:' headers; skip them.
+    next()
+    next()
+ for header in lines:
+ start, end, count = [int(n) for n in header.split(',')]
+ contents = [next() for i in xrange(count)]
+ result.append((start, end, count, contents))
+ return result
+
+def apply_delta(basis, delta):
+    """Apply delta hunks to the basis lines, returning the new text's lines."""
+ lines = []
+ last_offset = 0
+ # eq ranges occur where gaps occur
+ # start, end refer to offsets in basis
+ for start, end, count, delta_lines in delta:
+ if last_offset != start: # copy an eq range
+ lines.extend(basis[last_offset:start])
+        # start/end index the basis, so the new lines are appended here
+        # rather than spliced in at basis offsets.
+        lines.extend(delta_lines)
+ last_offset = end
+ if last_offset != len(basis):
+ lines.extend(basis[last_offset:])
+ trim_encoding_newline(lines)
+ return lines
+
+
+def trim_encoding_newline(lines):
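+    """Remove the padding newline added to support texts with no trailing \n."""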
+ if lines[-1] == '\n':
+ del lines[-1]
+ else:
+ lines[-1] = lines[-1][:-1]
+
+
+class GroupCompressor(object):
+ """Produce a serialised group of compressed texts."""
+
+ def __init__(self, delta=True):
+ """Create a GroupCompressor.
+
+        :param delta: If False, do not delta-compress records.
+ """
+ self._delta = delta
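+        # The serialised group lines, the byte length of that output so far
+        # (endpoint), and the raw input bytes consumed (input_bytes).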
+ self.lines = []
+ self.endpoint = 0
+ self.input_bytes = 0
+
+ def compress(self, key, lines, expected_sha):
+ """Compress lines with label key.
+
+ :param key: A key tuple. It is stored in the output
+ for identification of the text during decompression.
+        :param lines: The lines to be compressed. Must be split
+            on \n, with the \n preserved.
+        :param expected_sha: If non-None, the sha the lines are believed to
+ have. During compression the sha is calculated; a mismatch will
+ cause an error.
+ :return: The sha1 of lines, and the number of bytes accumulated in
+ the group output so far.
+ """
+ sha1 = sha_strings(lines)
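+        # Key tuples are joined with NUL bytes into a single label string.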
+ label = '\x00'.join(key)
+        # Set up the encoding so trailing-\n differences can round-trip.
+ if not lines or lines[-1].endswith('\n'):
+ lines.append('\n')
+ else:
+ lines[-1] = lines[-1] + '\n'
+ new_lines = []
+ new_lines.append('label: %s\n' % label)
+ new_lines.append('sha1: %s\n' % sha1)
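+        # Debug switch: set to 1 to compare difflib's SequenceMatcher
+        # against patiencediff, the default matcher.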
+ if 0:
+ delta_seq = diff.difflib.SequenceMatcher(
+ None, self.lines, lines)
+ else:
+ delta_seq = patiencediff.PatienceSequenceMatcher(
+ None, self.lines, lines)
+ diff_hunks = []
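+        # Each non-equal opcode becomes a hunk: start/end index the lines
+        # already in the group (the basis), count is how many new lines follow.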
+ for op in delta_seq.get_opcodes():
+ if op[0] == 'equal':
+ continue
+ diff_hunks.append((op[1], op[2], op[4]-op[3], lines[op[3]:op[4]]))
+ for start, end, count, new in diff_hunks:
+ new_lines.append('%d,%d,%d\n' % (start, end, count))
+ new_lines.extend(new)
+ self.endpoint += sum(map(len, new_lines))
+ self.lines.extend(new_lines)
+ trim_encoding_newline(lines)
+ self.input_bytes += sum(map(len, lines))
+ return sha1, self.endpoint
+
+ def ratio(self):
+ """Return the overall compression ratio."""
+ return float(self.input_bytes) / float(self.endpoint)
def make_pack_factory(graph, delta, keylength):
"""Create a factory for creating a pack based groupcompress.
@@ -39,11 +143,6 @@
ref_length = 0
if graph:
ref_length += 1
- if delta:
- ref_length += 1
- max_delta_chain = 200
- else:
- max_delta_chain = 0
graph_index = InMemoryBTree(reference_lists=ref_length,
key_elements=keylength)
stream = transport.open_write_stream('newpack')
@@ -53,8 +152,7 @@
deltas=delta, add_callback=graph_index.add_nodes)
access = _DirectPackAccess({})
access.set_writer(writer, graph_index, (transport, 'newpack'))
- result = GroupCompressVersionedFiles(index, access,
- max_delta_chain=max_delta_chain)
+ result = GroupCompressVersionedFiles(index, access, delta)
result.stream = stream
result.writer = writer
return result
@@ -69,13 +167,106 @@
class GroupCompressVersionedFiles(VersionedFiles):
"""A group-compress based VersionedFiles implementation."""
- def __init__(self, index, access, max_delta_chain=-1):
+ def __init__(self, index, access, delta=True):
"""Create a GroupCompressVersionedFiles object.
:param index: The index object storing access and graph data.
:param access: The access object storing raw data.
- """
-
+ :param delta: Whether to delta compress or just entropy compress.
+ """
+ self._index = index
+ self._access = access
+ self._delta = delta
+
+ def add_lines(self, key, parents, lines, parent_texts=None,
+ left_matching_blocks=None, nostore_sha=None, random_id=False,
+ check_content=True):
+ """Add a text to the store.
+
+ :param key: The key tuple of the text to add.
+ :param parents: The parents key tuples of the text to add.
+ :param lines: A list of lines. Each line must be a bytestring. And all
+ of them except the last must be terminated with \n and contain no
+ other \n's. The last line may either contain no \n's or a single
+            terminating \n. If the lines list does not meet this constraint the add
+ routine may error or may succeed - but you will be unable to read
+ the data back accurately. (Checking the lines have been split
+ correctly is expensive and extremely unlikely to catch bugs so it
+ is not done at runtime unless check_content is True.)
+ :param parent_texts: An optional dictionary containing the opaque
+ representations of some or all of the parents of version_id to
+ allow delta optimisations. VERY IMPORTANT: the texts must be those
+ returned by add_lines or data corruption can be caused.
+ :param left_matching_blocks: a hint about which areas are common
+ between the text and its left-hand-parent. The format is
+ the SequenceMatcher.get_matching_blocks format.
+ :param nostore_sha: Raise ExistingContent and do not add the lines to
+ the versioned file if the digest of the lines matches this.
+ :param random_id: If True a random id has been selected rather than
+ an id determined by some deterministic process such as a converter
+ from a foreign VCS. When True the backend may choose not to check
+ for uniqueness of the resulting key within the versioned file, so
+ this should only be done when the result is expected to be unique
+ anyway.
+ :param check_content: If True, the lines supplied are verified to be
+ bytestrings that are correctly formed lines.
+ :return: The text sha1, the number of bytes in the text, and an opaque
+ representation of the inserted version which can be provided
+ back to future add_lines calls in the parent_texts dictionary.
+ """
+ self._index._check_write_ok()
+ self._check_add(key, lines, random_id, check_content)
+ if parents is None:
+ # The caller might pass None if there is no graph data, but kndx
+ # indexes can't directly store that, so we give them
+ # an empty tuple instead.
+ parents = ()
+        # Double handling for now: wrap the text in a fulltext record and
+        # push it through the stream insertion path. Optimise later.
+ bytes = ''.join(lines)
+ record = FulltextContentFactory(key, parents, None, bytes)
+        sha1 = self._insert_record_stream([record])[0]
+ return sha1, len(bytes), None
+
+ def _check_add(self, key, lines, random_id, check_content):
+ """check that version_id and lines are safe to add."""
+ version_id = key[-1]
+ if contains_whitespace(version_id):
+            raise errors.InvalidRevisionId(version_id, self)
+ self.check_not_reserved_id(version_id)
+ # TODO: If random_id==False and the key is already present, we should
+ # probably check that the existing content is identical to what is
+ # being inserted, and otherwise raise an exception. This would make
+ # the bundle code simpler.
+ if check_content:
+ self._check_lines_not_unicode(lines)
+ self._check_lines_are_lines(lines)
+
+ def insert_record_stream(self, stream):
+ """Insert a record stream into this container.
+
+ :param stream: A stream of records to insert.
+ :return: None
+ :seealso VersionedFiles.get_record_stream:
+ """
+ self._insert_record_stream(stream)
+
+ def _insert_record_stream(self, stream):
+ """Internal core to insert a record stream into this container.
+
+        This helper function has a different interface from
+        insert_record_stream, so that add_lines can be minimal but still
+        return the data it needs.
+
+ :param stream: A stream of records to insert.
+ :return: An iterator over the sha1 of the inserted records.
+ :seealso insert_record_stream:
+ :seealso add_lines:
+ """
+ compressor = GroupCompressor(self._delta)
+        # This expands everything to fulltexts, even for groupcompress to
+        # groupcompress fetching, which isn't ideal.
+        sha1s = []
+        for record in stream:
+            found_sha1, end_point = compressor.compress(record.key,
+                split_lines(record.get_bytes_as('fulltext')), record.sha1)
+            sha1s.append(found_sha1)
+        return sha1s
class _GCGraphIndex(object):
"""Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
@@ -108,3 +299,8 @@
self.has_graph = parents
self._is_locked = is_locked
+ def _check_write_ok(self):
+ """Assert if writes are not permitted."""
+ if not self._is_locked():
+ raise errors.ObjectNotLocked(self)
+
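A quick sketch of the new helpers in action (not part of the diff; the
'demo' label and the zeroed sha1 below are placeholder values):

    from bzrlib.plugins.groupcompress import groupcompress

    # A hand-written block in the shape compress() emits: a label line, a
    # sha1 line, then 'start,end,count' hunk headers, each followed by
    # count new lines. start/end index the basis lines.
    delta_lines = [
        'label: demo\n',
        'sha1: 0000000000000000000000000000000000000000\n',
        '1,2,1\n',
        'new middle\n',
        ]
    # The basis keeps the padding '\n' that compress() appends;
    # apply_delta() trims it off again on the way out.
    basis = ['first\n', 'old middle\n', 'last\n', '\n']
    hunks = groupcompress.parse(delta_lines)
    # hunks == [(1, 2, 1, ['new middle\n'])]
    print groupcompress.apply_delta(basis, hunks)
    # -> ['first\n', 'new middle\n', 'last\n']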
=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py 2008-07-05 18:15:40 +0000
+++ b/tests/test_groupcompress.py 2008-07-06 03:15:54 +0000
@@ -20,6 +20,7 @@
import zlib
from bzrlib import tests
+from bzrlib.osutils import sha_strings
from bzrlib.plugins.groupcompress import errors, groupcompress
from bzrlib.tests import (
TestCaseWithTransport,
@@ -47,3 +48,111 @@
return standard_tests
+class TestGroupCompressor(TestCaseWithTransport):
+ """Tests for GroupCompressor"""
+
+ def test_empty_delta(self):
+ compressor = groupcompress.GroupCompressor(True)
+ self.assertEqual([], compressor.lines)
+
+ def test_one_nosha_delta(self):
+        # diff against NULL
+ compressor = groupcompress.GroupCompressor(True)
+ sha1, end_point = compressor.compress(('label',),
+ ['strange\n', 'common\n'], None)
+ self.assertEqual(sha_strings(['strange\n', 'common\n']), sha1)
+ expected_lines = [
+ 'label: label\n',
+ 'sha1: %s\n' % sha1,
+ '0,0,3\n',
+ 'strange\n',
+ 'common\n',
+ '\n', # the last \n in a text is removed, which allows safe
+ # serialisation of lines without trailing \n.
+ ]
+ self.assertEqual(expected_lines, compressor.lines)
+ self.assertEqual(sum(map(len, expected_lines)), end_point)
+
+ def test_two_nosha_delta(self):
+ compressor = groupcompress.GroupCompressor(True)
+ sha1_1, _ = compressor.compress(('label',),
+ ['strange\n', 'common\n'], None)
+ sha1_2, end_point = compressor.compress(('newlabel',),
+ ['common\n', 'different\n'], None)
+ self.assertEqual(sha_strings(['common\n', 'different\n']), sha1_2)
+ expected_lines = [
+ 'label: label\n',
+ 'sha1: %s\n' % sha1_1,
+ '0,0,3\n',
+ 'strange\n',
+ 'common\n',
+ '\n',
+ 'label: newlabel\n',
+ 'sha1: %s\n' % sha1_2,
+ # Delete what we don't want. Perhaps we want an implicit
+ # delete all to keep from bloating with useless delete
+ # instructions.
+ '0,4,0\n',
+ # add the new lines
+ '5,5,1\n',
+ 'different\n',
+ ]
+ self.assertEqual(expected_lines, compressor.lines)
+ self.assertEqual(sum(map(len, expected_lines)), end_point)
+
+ def test_three_nosha_delta(self):
+ # The first interesting test: make a change that should use lines from
+ # both parents.
+ compressor = groupcompress.GroupCompressor(True)
+ sha1_1, end_point = compressor.compress(('label',),
+ ['strange\n', 'common\n'], None)
+ sha1_2, _ = compressor.compress(('newlabel',),
+ ['common\n', 'different\n', 'moredifferent\n'], None)
+ sha1_3, end_point = compressor.compress(('label3',),
+ ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
+ self.assertEqual(
+ sha_strings(['new\n', 'common\n', 'different\n', 'moredifferent\n']),
+ sha1_3)
+ expected_lines = [
+ 'label: label\n',
+ 'sha1: %s\n' % sha1_1,
+ '0,0,3\n',
+ 'strange\n',
+ 'common\n',
+ '\n',
+ 'label: newlabel\n',
+ 'sha1: %s\n' % sha1_2,
+ # Delete what we don't want. Perhaps we want an implicit
+ # delete all to keep from bloating with useless delete
+ # instructions.
+ '0,4,0\n',
+ # add the new lines
+ '5,5,2\n',
+ 'different\n',
+ 'moredifferent\n',
+ 'label: label3\n',
+ 'sha1: %s\n' % sha1_3,
+ # Delete what we don't want. Perhaps we want an implicit
+ # delete all to keep from bloating with useless delete
+ # instructions.
+ # replace 'strange' with 'new'
+ '0,4,1\n',
+ 'new\n',
+            # delete from after common up to different
+ '5,10,0\n',
+ # add new \n
+ '12,12,1\n',
+ '\n',
+ ]
+ self.assertEqualDiff(''.join(expected_lines), ''.join(compressor.lines))
+ self.assertEqual(sum(map(len, expected_lines)), end_point)
+
+ def test_stats(self):
+ compressor = groupcompress.GroupCompressor(True)
+ compressor.compress(('label',),
+ ['strange\n', 'common\n'], None)
+ compressor.compress(('newlabel',),
+ ['common\n', 'different\n', 'moredifferent\n'], None)
+ compressor.compress(('label3',),
+ ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
+ self.assertAlmostEqual(0.3, compressor.ratio(), 1)
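
A note on the 0.3 asserted in test_stats: ratio() is input_bytes /
endpoint, i.e. raw text bytes over serialised group bytes, so a value
below 1.0 means the group is currently larger than its inputs. At this
scale that is expected, because every record carries a 'label:' line and
a 47-byte 'sha1:' line. Back of the envelope, from the texts above:

    input_bytes = (len('strange\ncommon\n')
        + len('common\ndifferent\nmoredifferent\n')
        + len('new\ncommon\ndifferent\nmoredifferent\n'))
    # 15 + 31 + 35 = 81 bytes of raw text, against roughly 270 bytes of
    # group output (labels, sha1 lines, hunk headers, new lines), so
    # 81 / 270.0 is about 0.3, matching the assertAlmostEqual above.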