Rev 6329: (mbp) PackWriter: remove one possibly large string join (bug 890085 incr) in file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/

Tue Nov 29 20:02:55 UTC 2011

At file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 6329 [merge]
revision-id: pqm at pqm.ubuntu.com-20111129200254-ptnbnkq94ye7l463
parent: pqm at pqm.ubuntu.com-20111129183153-u0gz7nl7s0j208kt
parent: mbp at canonical.com-20111129091354-zcwnzn3cy1jfzqju
committer: Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2011-11-29 20:02:54 +0000
message:
  (mbp) PackWriter: remove one possibly large string join (bug 890085 incr)
   (Martin Pool)
modified:
  bzrlib/pack.py                 container.py-20070607160755-tr8zc26q18rn0jnb-1
  bzrlib/tests/test_pack.py      test_container.py-20070607160755-tr8zc26q18rn0jnb-2
=== modified file 'bzrlib/pack.py'

--- a/bzrlib/pack.py	2011-09-07 11:46:16 +0000
+++ b/bzrlib/pack.py	2011-11-29 09:13:54 +0000
@@ -75,14 +75,12 @@
         """Return the bytes to finish a container."""
         return "E"
 
-    def bytes_record(self, bytes, names):
-        """Return the bytes for a Bytes record with the given name and
-        contents.
-        """
+    def bytes_header(self, length, names):
+        """Return the header for a Bytes record."""
         # Kind marker
         byte_sections = ["B"]
         # Length
-        byte_sections.append(str(len(bytes)) + "\n")
+        byte_sections.append(str(length) + "\n")
         # Names
         for name_tuple in names:
             # Make sure we're writing valid names.  Note that we will leave a
@@ -92,18 +90,17 @@
             byte_sections.append('\x00'.join(name_tuple) + "\n")
         # End of headers
         byte_sections.append("\n")
-        # Finally, the contents.
-        byte_sections.append(bytes)
-        # XXX: This causes a memory copy of bytes in size, but is usually
-        # faster than two write calls (12 vs 13 seconds to output a gig of
-        # 1k records.) - results may differ on significantly larger records
-        # like .iso's but as they should be rare in any case and thus not
-        # likely to be the common case. The biggest issue is causing extreme
-        # memory pressure in that case. One possibly improvement here is to
-        # check the size of the content before deciding to join here vs call
-        # write twice.
         return ''.join(byte_sections)
 
+    def bytes_record(self, bytes, names):
+        """Return the bytes for a Bytes record with the given name and
+        contents.
+
+        If the content may be large, construct the header separately and then
+        stream out the contents.
+        """
+        return self.bytes_header(len(bytes), names) + bytes
+
 
 class ContainerWriter(object):
     """A class for writing containers to a file.
@@ -113,6 +110,10 @@
         introduced by the begin() and end() methods.
     """
 
+    # Join up headers with the body if writing fewer than this many bytes:
+    # trades off memory usage and copying to do less IO ops.
+    _JOIN_WRITES_THRESHOLD = 100000
+
     def __init__(self, write_func):
         """Constructor.
 
@@ -151,8 +152,13 @@
             and thus are only suitable for use by a ContainerReader.
         """
         current_offset = self.current_offset
-        serialised_record = self._serialiser.bytes_record(bytes, names)
-        self.write_func(serialised_record)
+        length = len(bytes)
+        if length < self._JOIN_WRITES_THRESHOLD:
+            self.write_func(self._serialiser.bytes_header(length, names)
+                + bytes)
+        else:
+            self.write_func(self._serialiser.bytes_header(length, names))
+            self.write_func(bytes)
         self.records_written += 1
         # return a memo of where we wrote data to allow random access.
         return current_offset, self.current_offset - current_offset

=== modified file 'bzrlib/tests/test_pack.py'
--- a/bzrlib/tests/test_pack.py	2009-06-20 01:17:38 +0000
+++ b/bzrlib/tests/test_pack.py	2011-11-29 09:13:54 +0000
@@ -64,6 +64,11 @@
             errors.InvalidRecordError,
             serialiser.bytes_record, 'bytes', [('bad name',)])
 
+    def test_bytes_record_header(self):
+        serialiser = pack.ContainerSerialiser()
+        record = serialiser.bytes_header(32, [('name1',), ('name2',)])
+        self.assertEqual('B32\nname1\nname2\n\n', record)
+
 
 class TestContainerWriter(tests.TestCase):
 
@@ -126,6 +131,7 @@
     def test_add_bytes_record_one_name(self):
         """Add a bytes record with one name."""
         self.writer.begin()
+
         offset, length = self.writer.add_bytes_record(
             'abc', names=[('name1', )])
         self.assertEqual((42, 13), (offset, length))
@@ -133,6 +139,33 @@
             'Bazaar pack format 1 (introduced in 0.18)\n'
             'B3\nname1\n\nabc')
 
+    def test_add_bytes_record_split_writes(self):
+        """Write a large record which does multiple IOs"""
+
+        writes = []
+        real_write = self.writer.write_func
+
+        def record_writes(bytes):
+            writes.append(bytes)
+            return real_write(bytes)
+
+        self.writer.write_func = record_writes
+        self.writer._JOIN_WRITES_THRESHOLD = 2
+
+        self.writer.begin()
+        offset, length = self.writer.add_bytes_record(
+            'abcabc', names=[('name1', )])
+        self.assertEqual((42, 16), (offset, length))
+        self.assertOutput(
+            'Bazaar pack format 1 (introduced in 0.18)\n'
+            'B6\nname1\n\nabcabc')
+
+        self.assertEquals([
+            'Bazaar pack format 1 (introduced in 0.18)\n',
+            'B6\nname1\n\n',
+            'abcabc'],
+            writes)
+
     def test_add_bytes_record_two_names(self):
         """Add a bytes record with two names."""
         self.writer.begin()
@@ -207,8 +240,8 @@
     def test_construct(self):
         """Test constructing a ContainerReader.
 
-        This uses None as the output stream to show that the constructor doesn't
-        try to use the input stream.
+        This uses None as the output stream to show that the constructor
+        doesn't try to use the input stream.
         """
         reader = pack.ContainerReader(None)
 
@@ -259,7 +292,8 @@
 
     def test_validate_empty_container(self):
         """validate does not raise an error for a container with no records."""
-        reader = self.get_reader_for("Bazaar pack format 1 (introduced in 0.18)\nE")
+        reader = self.get_reader_for(
+            "Bazaar pack format 1 (introduced in 0.18)\nE")
         # No exception raised
         reader.validate()
 
@@ -705,5 +739,3 @@
         parser.accept_bytes("6\n\nabcdef")
         self.assertEqual([([], 'abcdef')], parser.read_pending_records())
         self.assertEqual([], parser.read_pending_records())
-
-