Rev 4471: (jam) Update the GroupCompressBlock to compress via chunks, in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Tue Jun 23 05:02:27 BST 2009
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 4471
revision-id: pqm at pqm.ubuntu.com-20090623040223-yvjujkc4sd3me6gd
parent: pqm at pqm.ubuntu.com-20090623003517-lrjel82rf7q6qjlc
parent: john at arbash-meinel.com-20090622191305-hp87vzgnodon7g9x
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2009-06-23 05:02:23 +0100
message:
(jam) Update the GroupCompressBlock to compress via chunks,
shaves a memory copy of the raw file content during commit.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
bzrlib/tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
------------------------------------------------------------
revno: 4469.1.4
revision-id: john at arbash-meinel.com-20090622191305-hp87vzgnodon7g9x
parent: john at arbash-meinel.com-20090622191032-vw061b72zzowbfx8
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-gc-single-mem
timestamp: Mon 2009-06-22 14:13:05 -0500
message:
NEWS entry update, now that gc formats only create 1 copy of the text.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
------------------------------------------------------------
revno: 4469.1.3
revision-id: john at arbash-meinel.com-20090622191032-vw061b72zzowbfx8
parent: john at arbash-meinel.com-20090622183008-pofo16w8y3at5jjv
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-gc-single-mem
timestamp: Mon 2009-06-22 14:10:32 -0500
message:
Notes on why we do it the way we do.
modified:
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
------------------------------------------------------------
revno: 4469.1.2
revision-id: john at arbash-meinel.com-20090622183008-pofo16w8y3at5jjv
parent: john at arbash-meinel.com-20090622181004-0rsmfqcnhk48fq88
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-gc-single-mem
timestamp: Mon 2009-06-22 13:30:08 -0500
message:
The only caller already knows the content length, so make the api such that
you are required to pass it in.
It isn't particularly more difficult to do so, and it saves the time of calling len()
on thousands of strings during pack.
modified:
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
bzrlib/tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
------------------------------------------------------------
revno: 4469.1.1
revision-id: john at arbash-meinel.com-20090622181004-0rsmfqcnhk48fq88
parent: pqm at pqm.ubuntu.com-20090622171120-fuxez9ylfqpxynqn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-gc-single-mem
timestamp: Mon 2009-06-22 13:10:04 -0500
message:
Add a set_content_chunked member to GroupCompressBlock.
modified:
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
bzrlib/tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
=== modified file 'NEWS'
--- a/NEWS 2009-06-22 21:55:37 +0000
+++ b/NEWS 2009-06-23 04:02:23 +0000
@@ -43,9 +43,10 @@
(Martin Pool, #339385)
* Reduced memory consumption during ``bzr commit`` of large files. For
- pre 2a formats, should be down to ~3x the size of a file, and for
- ``--2a`` formats should be down to exactly 2x the size. Related to bug
- #109114. (John Arbash Meinel)
+ pre 2a formats, should be down to ~3x the size of a file.
+ For ``--2a`` format repositories, it is down to the size of the file
+ content plus the size of the compressed text. Related to bug #109114.
+ (John Arbash Meinel)
* Repositories using CHK pages (which includes the new 2a format) will no
longer error during commit or push operations when an autopack operation
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-06-22 15:47:25 +0000
+++ b/bzrlib/groupcompress.py 2009-06-22 19:10:32 +0000
@@ -108,6 +108,7 @@
self._z_content_length = None
self._content_length = None
self._content = None
+ self._content_chunks = None
def __len__(self):
# This is the maximum number of bytes this object will reference if
@@ -137,6 +138,10 @@
% (num_bytes, self._content_length))
# Expand the content if required
if self._content is None:
+ if self._content_chunks is not None:
+ self._content = ''.join(self._content_chunks)
+ self._content_chunks = None
+ if self._content is None:
if self._z_content is None:
raise AssertionError('No content to decompress')
if self._z_content == '':
@@ -273,22 +278,55 @@
bytes = apply_delta_to_source(self._content, content_start, end)
return bytes
+ def set_chunked_content(self, content_chunks, length):
+ """Set the content of this block to the given chunks."""
+ # If we have lots of short lines, it may be more efficient to join
+ # the content ahead of time. If the content is <10MiB, we don't really
+ # care about the extra memory consumption, so we can just pack it and
+ # be done. However, timing showed 18s => 17.9s for repacking 1k revs of
+ # mysql, which is below the noise margin
+ self._content_length = length
+ self._content_chunks = content_chunks
+ self._content = None
+ self._z_content = None
+
def set_content(self, content):
"""Set the content of this block."""
self._content_length = len(content)
self._content = content
self._z_content = None
+ def _create_z_content_using_lzma(self):
+ if self._content_chunks is not None:
+ self._content = ''.join(self._content_chunks)
+ self._content_chunks = None
+ if self._content is None:
+ raise AssertionError('Nothing to compress')
+ self._z_content = pylzma.compress(self._content)
+ self._z_content_length = len(self._z_content)
+
+ def _create_z_content_from_chunks(self):
+ compressor = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION)
+ compressed_chunks = map(compressor.compress, self._content_chunks)
+ compressed_chunks.append(compressor.flush())
+ self._z_content = ''.join(compressed_chunks)
+ self._z_content_length = len(self._z_content)
+
+ def _create_z_content(self):
+ if self._z_content is not None:
+ return
+ if _USE_LZMA:
+ self._create_z_content_using_lzma()
+ return
+ if self._content_chunks is not None:
+ self._create_z_content_from_chunks()
+ return
+ self._z_content = zlib.compress(self._content)
+ self._z_content_length = len(self._z_content)
+
def to_bytes(self):
"""Encode the information into a byte stream."""
- compress = zlib.compress
- if _USE_LZMA:
- compress = pylzma.compress
- if self._z_content is None:
- if self._content is None:
- raise AssertionError('Nothing to compress')
- self._z_content = compress(self._content)
- self._z_content_length = len(self._z_content)
+ self._create_z_content()
if _USE_LZMA:
header = self.GCB_LZ_HEADER
else:
@@ -762,10 +800,9 @@
# for 'commit' down to ~1x the size of the largest file, at a
# cost of increased complexity within this code. 2x is still <<
# 3x the size of the largest file, so we are doing ok.
- content = ''.join(self.chunks)
+ self._block.set_chunked_content(self.chunks, self.endpoint)
self.chunks = None
self._delta_index = None
- self._block.set_content(content)
return self._block
def pop_last(self):
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-06-10 03:56:49 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-06-22 18:30:08 +0000
@@ -363,6 +363,15 @@
raw_bytes = zlib.decompress(remaining_bytes)
self.assertEqual(content, raw_bytes)
+ # we should get the same results if using the chunked version
+ gcb = groupcompress.GroupCompressBlock()
+ gcb.set_chunked_content(['this is some content\n'
+ 'this content will be compressed\n'],
+ len(content))
+ old_bytes = bytes
+ bytes = gcb.to_bytes()
+ self.assertEqual(old_bytes, bytes)
+
def test_partial_decomp(self):
content_chunks = []
# We need a sufficient amount of data so that zlib.decompress has
More information about the bazaar-commits
mailing list