Rev 6329: (mbp) PackWriter: remove one possibly large string join (bug 890085 incr) in file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/
Patch Queue Manager
pqm at pqm.ubuntu.com
Tue Nov 29 20:02:55 UTC 2011
At file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 6329 [merge]
revision-id: pqm at pqm.ubuntu.com-20111129200254-ptnbnkq94ye7l463
parent: pqm at pqm.ubuntu.com-20111129183153-u0gz7nl7s0j208kt
parent: mbp at canonical.com-20111129091354-zcwnzn3cy1jfzqju
committer: Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2011-11-29 20:02:54 +0000
message:
(mbp) PackWriter: remove one possibly large string join (bug 890085 incr)
(Martin Pool)
modified:
bzrlib/pack.py container.py-20070607160755-tr8zc26q18rn0jnb-1
bzrlib/tests/test_pack.py test_container.py-20070607160755-tr8zc26q18rn0jnb-2
=== modified file 'bzrlib/pack.py'
--- a/bzrlib/pack.py 2011-09-07 11:46:16 +0000
+++ b/bzrlib/pack.py 2011-11-29 09:13:54 +0000
@@ -75,14 +75,12 @@
"""Return the bytes to finish a container."""
return "E"
- def bytes_record(self, bytes, names):
- """Return the bytes for a Bytes record with the given name and
- contents.
- """
+ def bytes_header(self, length, names):
+ """Return the header for a Bytes record."""
# Kind marker
byte_sections = ["B"]
# Length
- byte_sections.append(str(len(bytes)) + "\n")
+ byte_sections.append(str(length) + "\n")
# Names
for name_tuple in names:
# Make sure we're writing valid names. Note that we will leave a
@@ -92,18 +90,17 @@
byte_sections.append('\x00'.join(name_tuple) + "\n")
# End of headers
byte_sections.append("\n")
- # Finally, the contents.
- byte_sections.append(bytes)
- # XXX: This causes a memory copy of bytes in size, but is usually
- # faster than two write calls (12 vs 13 seconds to output a gig of
- # 1k records.) - results may differ on significantly larger records
- # like .iso's but as they should be rare in any case and thus not
- # likely to be the common case. The biggest issue is causing extreme
- # memory pressure in that case. One possibly improvement here is to
- # check the size of the content before deciding to join here vs call
- # write twice.
return ''.join(byte_sections)
+ def bytes_record(self, bytes, names):
+ """Return the bytes for a Bytes record with the given name and
+ contents.
+
+ If the content may be large, construct the header separately and then
+ stream out the contents.
+ """
+ return self.bytes_header(len(bytes), names) + bytes
+
class ContainerWriter(object):
"""A class for writing containers to a file.
@@ -113,6 +110,10 @@
introduced by the begin() and end() methods.
"""
+ # Join up headers with the body if writing fewer than this many bytes:
+ # trades off memory usage and copying to do less IO ops.
+ _JOIN_WRITES_THRESHOLD = 100000
+
def __init__(self, write_func):
"""Constructor.
@@ -151,8 +152,13 @@
and thus are only suitable for use by a ContainerReader.
"""
current_offset = self.current_offset
- serialised_record = self._serialiser.bytes_record(bytes, names)
- self.write_func(serialised_record)
+ length = len(bytes)
+ if length < self._JOIN_WRITES_THRESHOLD:
+ self.write_func(self._serialiser.bytes_header(length, names)
+ + bytes)
+ else:
+ self.write_func(self._serialiser.bytes_header(length, names))
+ self.write_func(bytes)
self.records_written += 1
# return a memo of where we wrote data to allow random access.
return current_offset, self.current_offset - current_offset
=== modified file 'bzrlib/tests/test_pack.py'
--- a/bzrlib/tests/test_pack.py 2009-06-20 01:17:38 +0000
+++ b/bzrlib/tests/test_pack.py 2011-11-29 09:13:54 +0000
@@ -64,6 +64,11 @@
errors.InvalidRecordError,
serialiser.bytes_record, 'bytes', [('bad name',)])
+ def test_bytes_record_header(self):
+ serialiser = pack.ContainerSerialiser()
+ record = serialiser.bytes_header(32, [('name1',), ('name2',)])
+ self.assertEqual('B32\nname1\nname2\n\n', record)
+
class TestContainerWriter(tests.TestCase):
@@ -126,6 +131,7 @@
def test_add_bytes_record_one_name(self):
"""Add a bytes record with one name."""
self.writer.begin()
+
offset, length = self.writer.add_bytes_record(
'abc', names=[('name1', )])
self.assertEqual((42, 13), (offset, length))
@@ -133,6 +139,33 @@
'Bazaar pack format 1 (introduced in 0.18)\n'
'B3\nname1\n\nabc')
+ def test_add_bytes_record_split_writes(self):
+ """Write a large record which does multiple IOs"""
+
+ writes = []
+ real_write = self.writer.write_func
+
+ def record_writes(bytes):
+ writes.append(bytes)
+ return real_write(bytes)
+
+ self.writer.write_func = record_writes
+ self.writer._JOIN_WRITES_THRESHOLD = 2
+
+ self.writer.begin()
+ offset, length = self.writer.add_bytes_record(
+ 'abcabc', names=[('name1', )])
+ self.assertEqual((42, 16), (offset, length))
+ self.assertOutput(
+ 'Bazaar pack format 1 (introduced in 0.18)\n'
+ 'B6\nname1\n\nabcabc')
+
+ self.assertEquals([
+ 'Bazaar pack format 1 (introduced in 0.18)\n',
+ 'B6\nname1\n\n',
+ 'abcabc'],
+ writes)
+
def test_add_bytes_record_two_names(self):
"""Add a bytes record with two names."""
self.writer.begin()
@@ -207,8 +240,8 @@
def test_construct(self):
"""Test constructing a ContainerReader.
- This uses None as the output stream to show that the constructor doesn't
- try to use the input stream.
+ This uses None as the output stream to show that the constructor
+ doesn't try to use the input stream.
"""
reader = pack.ContainerReader(None)
@@ -259,7 +292,8 @@
def test_validate_empty_container(self):
"""validate does not raise an error for a container with no records."""
- reader = self.get_reader_for("Bazaar pack format 1 (introduced in 0.18)\nE")
+ reader = self.get_reader_for(
+ "Bazaar pack format 1 (introduced in 0.18)\nE")
# No exception raised
reader.validate()
@@ -705,5 +739,3 @@
parser.accept_bytes("6\n\nabcdef")
self.assertEqual([([], 'abcdef')], parser.read_pending_records())
self.assertEqual([], parser.read_pending_records())
-
-
More information about the bazaar-commits
mailing list