Rev 3791: (jam) 'bzr pack' now passes optimization flags down to the index builder in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Wed Oct 22 21:18:23 BST 2008


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 3791
revision-id: pqm at pqm.ubuntu.com-20081022201819-s0a7gbf7wrsgn2q7
parent: pqm at pqm.ubuntu.com-20081022194407-i8gphy9hg1sj48ib
parent: john at arbash-meinel.com-20081022192642-3llptswnqir54glz
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Wed 2008-10-22 21:18:19 +0100
message:
  (jam) 'bzr pack' now passes optimization flags down to the index
  builder
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/btree_index.py          index.py-20080624222253-p0x5f92uyh5hw734-7
  bzrlib/chunk_writer.py         chunk_writer.py-20080630234519-6ggn4id17nipovny-1
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
  bzrlib/tests/test_chunk_writer.py test_chunk_writer.py-20080630234519-6ggn4id17nipovny-2
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
  bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
    ------------------------------------------------------------
    revno: 3777.5.7
    revision-id: john at arbash-meinel.com-20081022192642-3llptswnqir54glz
    parent: john at arbash-meinel.com-20081022192527-t9vrlcebnylxyv4f
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Wed 2008-10-22 14:26:42 -0500
    message:
      NEWS entry about index optimization.
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
    ------------------------------------------------------------
    revno: 3777.5.6
    revision-id: john at arbash-meinel.com-20081022192527-t9vrlcebnylxyv4f
    parent: john at arbash-meinel.com-20081022192446-gh0f5zijpmurmdmg
    parent: pqm at pqm.ubuntu.com-20081021231845-k119hl1icewguq50
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Wed 2008-10-22 14:25:27 -0500
    message:
      Merge bzr.dev 3789
    added:
      bzrlib/tests/fake_command.py   fake_command.py-20081021195002-r9v65tgxx63c25v9-1
      doc/developers/cycle.txt       cycle.txt-20081017031739-rw24r0cywm2ok3xu-1
      tools/packaging/lp-upload-release lpuploadrelease-20081020075647-56zdf9z6yav1bx81-1
    modified:
      Makefile                       Makefile-20050805140406-d96e3498bb61c5bb
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      bzrlib/branch.py               branch.py-20050309040759-e4baf4e0d046576e
      bzrlib/commands.py             bzr.py-20050309040720-d10f4714595cf8c3
      bzrlib/config.py               config.py-20051011043216-070c74f4e9e338e8
      bzrlib/errors.py               errors.py-20050309040759-20512168c4e14fbd
      bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
      bzrlib/patches.py              patches.py-20050727183609-378c1cc5972ce908
      bzrlib/plugins/launchpad/account.py account.py-20071011033320-50y6vfftywf4yllw-1
      bzrlib/plugins/launchpad/lp_directory.py lp_indirect.py-20070126012204-de5rugwlt22c7u7e-1
      bzrlib/plugins/launchpad/test_account.py test_account.py-20071011033320-50y6vfftywf4yllw-2
      bzrlib/plugins/launchpad/test_lp_directory.py test_lp_indirect.py-20070126002743-oyle362tzv9cd8mi-1
      bzrlib/tests/blackbox/test_command_encoding.py test_command_encoding.py-20060106032110-45431fd2ce9ff21f
      bzrlib/tests/test_branch.py    test_branch.py-20060116013032-97819aa07b8ab3b5
      bzrlib/tests/test_commands.py  test_command.py-20051019190109-3b17be0f52eaa7a8
      bzrlib/tests/test_config.py    testconfig.py-20051011041908-742d0c15d8d8c8eb
      bzrlib/tests/test_knit.py      test_knit.py-20051212171302-95d4c00dd5f11f2b
      bzrlib/tests/test_patches.py   test_patches.py-20051231203844-f4974d20f6aea09c
      bzrlib/tests/test_plugins.py   plugins.py-20050622075746-32002b55e5e943e9
      bzrlib/tests/test_remote.py    test_remote.py-20060720103555-yeeg2x51vn0rbtdp-2
      bzrlib/tests/test_sftp_transport.py testsftp.py-20051027032739-247570325fec7e7e
      bzrlib/tests/test_store.py     teststore.py-20050826022702-f6caadb647395769
      bzrlib/tests/test_transform.py test_transaction.py-20060105172520-b3ffb3946550e6c4
      bzrlib/transform.py            transform.py-20060105172343-dd99e54394d91687
      bzrlib/transport/ftp/__init__.py ftp.py-20051116161804-58dc9506548c2a53
      bzrlib/transport/remote.py     ssh.py-20060608202016-c25gvf1ob7ypbus6-1
      bzrlib/transport/sftp.py       sftp.py-20051019050329-ab48ce71b7e32dfe
      bzrlib/transport/ssh.py        ssh.py-20060824042150-0s9787kng6zv1nwq-1
      doc/developers/HACKING.txt     HACKING-20050805200004-2a5dc975d870f78c
      doc/developers/index.txt       index.txt-20070508041241-qznziunkg0nffhiw-1
      doc/developers/releasing.txt   releasing.txt-20080502015919-fnrcav8fwy8ccibu-1
      setup.py                       setup.py-20050314065409-02f8a0a6e3f9bc70
    ------------------------------------------------------------
    revno: 3777.5.5
    revision-id: john at arbash-meinel.com-20081022192446-gh0f5zijpmurmdmg
    parent: john at arbash-meinel.com-20081016185822-3zwdbkphgacdz9s5
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Wed 2008-10-22 14:24:46 -0500
    message:
      Up-call to the parent as suggested by Andrew.
    modified:
      bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
    ------------------------------------------------------------
    revno: 3777.5.4
    revision-id: john at arbash-meinel.com-20081016185822-3zwdbkphgacdz9s5
    parent: john at arbash-meinel.com-20081015214003-n96hr05ylrwlgdvi
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Thu 2008-10-16 13:58:22 -0500
    message:
      OptimisingPacker now sets the optimize flags for the indexes being built.
    modified:
      bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
      bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
    ------------------------------------------------------------
    revno: 3777.5.3
    revision-id: john at arbash-meinel.com-20081015214003-n96hr05ylrwlgdvi
    parent: john at arbash-meinel.com-20081015213410-g19sy2rpgxcl2sew
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Wed 2008-10-15 16:40:03 -0500
    message:
      Add Builder.set_optimize(for_size=True) for GraphIndexBuilder and BTreeBuilder.
    modified:
      bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
      bzrlib/tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
      bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
    ------------------------------------------------------------
    revno: 3777.5.2
    revision-id: john at arbash-meinel.com-20081015213410-g19sy2rpgxcl2sew
    parent: john at arbash-meinel.com-20081015212739-ap2uunpg6rjkypc1
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Wed 2008-10-15 16:34:10 -0500
    message:
      Change the name to ChunkWriter.set_optimize()
      
      Also allow it to be passed during __init__ and pass it in from
      BTreeBuilder.
    modified:
      bzrlib/btree_index.py          index.py-20080624222253-p0x5f92uyh5hw734-7
      bzrlib/chunk_writer.py         chunk_writer.py-20080630234519-6ggn4id17nipovny-1
      bzrlib/tests/test_chunk_writer.py test_chunk_writer.py-20080630234519-6ggn4id17nipovny-2
    ------------------------------------------------------------
    revno: 3777.5.1
    revision-id: john at arbash-meinel.com-20081015212739-ap2uunpg6rjkypc1
    parent: pqm at pqm.ubuntu.com-20081014031836-0pn8u98igc7gvtv0
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: btree_optimize
    timestamp: Wed 2008-10-15 16:27:39 -0500
    message:
      Add ChunkWriter.optimize(for_size=True)
    modified:
      bzrlib/chunk_writer.py         chunk_writer.py-20080630234519-6ggn4id17nipovny-1
      bzrlib/tests/test_chunk_writer.py test_chunk_writer.py-20080630234519-6ggn4id17nipovny-2
=== modified file 'NEWS'
--- a/NEWS	2008-10-22 19:09:16 +0000
+++ b/NEWS	2008-10-22 20:18:19 +0000
@@ -19,6 +19,10 @@
     * ``bzr dump-btree`` is a hidden command introduced to allow dumping
       the contents of a compressed btree file.  (John Arbash Meinel)
 
+    * ``bzr pack`` now tells the index builders to optimize for size. For
+      btree index repositories, this can save 25% of the index size
+      (mostly in the text indexes). (John Arbash Meinel)
+
     * default username for bzr+ssh and sftp can be configured in
       authentication.conf. (Aaron Bentley)
 

=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2008-09-26 07:09:50 +0000
+++ b/bzrlib/btree_index.py	2008-10-15 21:34:10 +0000
@@ -139,6 +139,7 @@
         self._nodes = {}
         # Indicate it hasn't been built yet
         self._nodes_by_key = None
+        self._optimize_for_size = False
 
     def add_node(self, key, value, references=()):
         """Add a node to the index.
@@ -276,7 +277,8 @@
                     length = _PAGE_SIZE
                     if internal_row.nodes == 0:
                         length -= _RESERVED_HEADER_BYTES # padded
-                    internal_row.writer = chunk_writer.ChunkWriter(length, 0)
+                    internal_row.writer = chunk_writer.ChunkWriter(length, 0,
+                        optimize_for_size=self._optimize_for_size)
                     internal_row.writer.write(_INTERNAL_FLAG)
                     internal_row.writer.write(_INTERNAL_OFFSET +
                         str(rows[pos + 1].nodes) + "\n")
@@ -284,7 +286,8 @@
             length = _PAGE_SIZE
             if rows[-1].nodes == 0:
                 length -= _RESERVED_HEADER_BYTES # padded
-            rows[-1].writer = chunk_writer.ChunkWriter(length)
+            rows[-1].writer = chunk_writer.ChunkWriter(length,
+                optimize_for_size=self._optimize_for_size)
             rows[-1].writer.write(_LEAF_FLAG)
         if rows[-1].writer.write(line):
             # this key did not fit in the node:
@@ -313,7 +316,8 @@
                 # This will be padded, hence the -100
                 new_row.writer = chunk_writer.ChunkWriter(
                     _PAGE_SIZE - _RESERVED_HEADER_BYTES,
-                    reserved_bytes)
+                    reserved_bytes,
+                    optimize_for_size=self._optimize_for_size)
                 new_row.writer.write(_INTERNAL_FLAG)
                 new_row.writer.write(_INTERNAL_OFFSET +
                     str(rows[1].nodes - 1) + "\n")

=== modified file 'bzrlib/chunk_writer.py'
--- a/bzrlib/chunk_writer.py	2008-08-28 20:13:31 +0000
+++ b/bzrlib/chunk_writer.py	2008-10-15 21:34:10 +0000
@@ -47,51 +47,53 @@
     #    In testing, some values for bzr.dev::
     #        repack  time  MB   max   full
     #         1       7.5  4.6  1140  0
-    #         2       8.4  4.2  1036  1          6.8
+    #         2       8.4  4.2  1036  1
     #         3       9.8  4.1  1012  278
     #         4      10.8  4.1  728   945
     #        20      11.1  4.1  0     1012
     #        repack = 0
-    #        zsync   time  MB    repack  max_z   time w/ add_node
-    #         0       6.7  24.7  0       6270    5.0
-    #         1       6.5  13.2  0       3342    4.3
-    #         2       6.6   9.6  0       2414    4.9
-    #         5       6.5   6.2  0       1549    4.8
-    #         6       6.5   5.8  1       1435    4.8
-    #         7       6.6   5.5  19      1337    4.8
-    #         8       6.7   5.3  81      1220    4.4
-    #        10       6.8   5.0  260     967     5.3
-    #        11       6.8   4.9  366     839     5.3
-    #        12       6.9   4.8  454     731     5.1
-    #        15       7.2   4.7  704     450     5.8
-    #        20       7.7   4.6  1133    7       5.8
+    #        zsync   time  MB    repack  stop_for_z
+    #         0       5.0  24.7  0       6270
+    #         1       4.3  13.2  0       3342
+    #         2       4.9   9.6  0       2414
+    #         5       4.8   6.2  0       1549
+    #         6       4.8   5.8  1       1435
+    #         7       4.8   5.5  19      1337
+    #         8       4.4   5.3  81      1220
+    #        10       5.3   5.0  260     967
+    #        11       5.3   4.9  366     839
+    #        12       5.1   4.8  454     731
+    #        15       5.8   4.7  704     450
+    #        20       5.8   4.6  1133    7
 
     #    In testing, some values for mysql-unpacked::
     #                next_bytes estim
-    #        repack  time  MB    hit_max full
-    #         1      51.7  15.4  3913  0
-    #         2      54.4  13.7  3467  0         35.4
-    #        20      67.0  13.4  0     3380      46.7
+    #        repack  time  MB    full    stop_for_repack
+    #         1            15.4  0       3913
+    #         2      35.4  13.7  0       346
+    #        20      46.7  13.4  3380    0
     #        repack=0
-    #        zsync                               time w/ add_node
-    #         0      47.7 116.5  0       29782   29.5
-    #         1      48.5  60.2  0       15356   27.8
-    #         2      48.1  42.4  0       10822   27.8
-    #         5      48.3  25.5  0       6491    26.8
-    #         6      48.0  23.2  13      5896    27.3
-    #         7      48.1  21.6  29      5451    27.5
-    #         8      48.1  20.3  52      5108    27.1
-    #        10      46.9  18.6  195     4526    29.4
-    #        11      48.8  18.0  421     4143    29.2
-    #        12      47.4  17.5  702     3738    28.0
-    #        15      49.6  16.5  1223    2969    28.9
-    #        20      48.9  15.7  2182    1810    29.6
-    #        30            15.4  3891    23      31.4
-
-    _max_repack = 0
-    _max_zsync = 8
-
-    def __init__(self, chunk_size, reserved=0):
+    #        zsync                       stop_for_z
+    #         0      29.5 116.5  0       29782
+    #         1      27.8  60.2  0       15356
+    #         2      27.8  42.4  0       10822
+    #         5      26.8  25.5  0       6491
+    #         6      27.3  23.2  13      5896
+    #         7      27.5  21.6  29      5451
+    #         8      27.1  20.3  52      5108
+    #        10      29.4  18.6  195     4526
+    #        11      29.2  18.0  421     4143
+    #        12      28.0  17.5  702     3738
+    #        15      28.9  16.5  1223    2969
+    #        20      29.6  15.7  2182    1810
+    #        30      31.4  15.4  3891    23
+
+    # Tuple of (num_repack_attempts, num_zsync_attempts)
+    # num_zsync_attempts only has meaning if num_repack_attempts is 0.
+    _repack_opts_for_speed = (0, 8)
+    _repack_opts_for_size = (20, 0)
+
+    def __init__(self, chunk_size, reserved=0, optimize_for_size=False):
         """Create a ChunkWriter to write chunk_size chunks.
 
         :param chunk_size: The total byte count to emit at the end of the
@@ -110,6 +112,8 @@
         self.num_zsync = 0
         self.unused_bytes = None
         self.reserved_size = reserved
+        # Default is to make building fast rather than compact
+        self.set_optimize(for_size=optimize_for_size)
 
     def finish(self):
         """Finish the chunk.
@@ -141,6 +145,19 @@
             self.bytes_list.append("\x00" * nulls_needed)
         return self.bytes_list, self.unused_bytes, nulls_needed
 
+    def set_optimize(self, for_size=True):
+        """Change how we optimize our writes.
+
+        :param for_size: If True, optimize for minimum space usage, otherwise
+            optimize for fastest writing speed.
+        :return: None
+        """
+        if for_size:
+            opts = ChunkWriter._repack_opts_for_size
+        else:
+            opts = ChunkWriter._repack_opts_for_speed
+        self._max_repack, self._max_zsync = opts
+
     def _recompress_all_bytes_in(self, extra_bytes=None):
         """Recompress the current bytes_in, and optionally more.
 

=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2008-09-21 14:48:37 +0000
+++ b/bzrlib/index.py	2008-10-15 21:40:03 +0000
@@ -84,6 +84,7 @@
         self._nodes = {}
         self._nodes_by_key = None
         self._key_length = key_elements
+        self._optimize_for_size = False
 
     def _check_key(self, key):
         """Raise BadIndexKey if key is not a valid key for this index."""
@@ -278,6 +279,17 @@
                 (len(result.getvalue()), expected_bytes))
         return result
 
+    def set_optimize(self, for_size=True):
+        """Change how the builder tries to optimize the result.
+
+        :param for_size: Tell the builder to try and make the index as small as
+            possible.
+        :return: None
+        """
+        # GraphIndexBuilder itself doesn't pay attention to the flag yet, but
+        # other builders do.
+        self._optimize_for_size = for_size
+
 
 class GraphIndex(object):
     """An index for data with embedded graphs.

=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2008-10-01 05:40:45 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2008-10-22 19:24:46 +0000
@@ -971,6 +971,16 @@
         # TODO: combine requests in the same index that are in ascending order.
         return total, requests
 
+    def open_pack(self):
+        """Open a pack for the pack we are creating."""
+        new_pack = super(OptimisingPacker, self).open_pack()
+        # Turn on the optimization flags for all the index builders.
+        new_pack.revision_index.set_optimize(for_size=True)
+        new_pack.inventory_index.set_optimize(for_size=True)
+        new_pack.text_index.set_optimize(for_size=True)
+        new_pack.signature_index.set_optimize(for_size=True)
+        return new_pack
+
 
 class ReconcilePacker(Packer):
     """A packer which regenerates indices etc as it copies.

=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2008-08-28 20:13:31 +0000
+++ b/bzrlib/tests/test_btree_index.py	2008-10-15 21:40:03 +0000
@@ -434,6 +434,13 @@
         self.assertEqual(sorted(nodes), nodes)
         self.assertEqual(16, len(nodes))
 
+    def test_set_optimize(self):
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        builder.set_optimize(for_size=True)
+        self.assertTrue(builder._optimize_for_size)
+        builder.set_optimize(for_size=False)
+        self.assertFalse(builder._optimize_for_size)
+
     def test_spill_index_stress_2_2(self):
         # test that references and longer keys don't confuse things.
         builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2,

=== modified file 'bzrlib/tests/test_chunk_writer.py'
--- a/bzrlib/tests/test_chunk_writer.py	2008-08-22 05:54:44 +0000
+++ b/bzrlib/tests/test_chunk_writer.py	2008-10-15 21:34:10 +0000
@@ -39,6 +39,24 @@
         # Only a zlib header.
         self.assertEqual(4088, padding)
 
+    def test_optimize_for_speed(self):
+        writer = chunk_writer.ChunkWriter(4096)
+        writer.set_optimize(for_size=False)
+        self.assertEqual(chunk_writer.ChunkWriter._repack_opts_for_speed,
+                         (writer._max_repack, writer._max_zsync))
+        writer = chunk_writer.ChunkWriter(4096, optimize_for_size=False)
+        self.assertEqual(chunk_writer.ChunkWriter._repack_opts_for_speed,
+                         (writer._max_repack, writer._max_zsync))
+
+    def test_optimize_for_size(self):
+        writer = chunk_writer.ChunkWriter(4096)
+        writer.set_optimize(for_size=True)
+        self.assertEqual(chunk_writer.ChunkWriter._repack_opts_for_size,
+                         (writer._max_repack, writer._max_zsync))
+        writer = chunk_writer.ChunkWriter(4096, optimize_for_size=True)
+        self.assertEqual(chunk_writer.ChunkWriter._repack_opts_for_size,
+                         (writer._max_repack, writer._max_zsync))
+
     def test_some_data(self):
         writer = chunk_writer.ChunkWriter(4096)
         writer.write("foo bar baz quux\n")

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2008-09-02 17:52:00 +0000
+++ b/bzrlib/tests/test_index.py	2008-10-15 21:40:03 +0000
@@ -350,6 +350,13 @@
         builder.add_node(('k', 'ey'), 'data', ([('reference', 'tokey')], ))
         builder.add_node(('reference', 'tokey'), 'data', ([],))
 
+    def test_set_optimize(self):
+        builder = GraphIndexBuilder(reference_lists=1, key_elements=2)
+        builder.set_optimize(for_size=True)
+        self.assertTrue(builder._optimize_for_size)
+        builder.set_optimize(for_size=False)
+        self.assertFalse(builder._optimize_for_size)
+
 
 class TestGraphIndex(TestCaseWithMemoryTransport):
 

=== modified file 'bzrlib/tests/test_repository.py'
--- a/bzrlib/tests/test_repository.py	2008-09-29 07:03:55 +0000
+++ b/bzrlib/tests/test_repository.py	2008-10-16 18:58:22 +0000
@@ -998,6 +998,24 @@
     # thus there are not yet any tests.
 
 
+class TestOptimisingPacker(TestCaseWithTransport):
+    """Tests for the OptimisingPacker class."""
+
+    def get_pack_collection(self):
+        repo = self.make_repository('.')
+        return repo._pack_collection
+
+    def test_open_pack_will_optimise(self):
+        packer = pack_repo.OptimisingPacker(self.get_pack_collection(),
+                                            [], '.test')
+        new_pack = packer.open_pack()
+        self.assertIsInstance(new_pack, pack_repo.NewPack)
+        self.assertTrue(new_pack.revision_index._optimize_for_size)
+        self.assertTrue(new_pack.inventory_index._optimize_for_size)
+        self.assertTrue(new_pack.text_index._optimize_for_size)
+        self.assertTrue(new_pack.signature_index._optimize_for_size)
+
+
 class TestInterDifferingSerializer(TestCaseWithTransport):
 
     def test_progress_bar(self):




More information about the bazaar-commits mailing list