Rev 4772: We don't have to pad 'short' records. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-btree-no-padding

Thu Oct 29 16:15:58 GMT 2009

At http://bazaar.launchpad.net/~jameinel/bzr/2.1-btree-no-padding

------------------------------------------------------------
revno: 4772
revision-id: john at arbash-meinel.com-20091029161543-tdqlm2l4e2z5o7le
parent: pqm at pqm.ubuntu.com-20091026155954-r9gw2rizkikw7cg7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-btree-no-padding
timestamp: Thu 2009-10-29 11:15:43 -0500
message:
  We don't have to pad 'short' records.
  
  When writing a row, we reserve 120 bytes from the first node so that we
  can write our 'B+Tree Graph Index' signature and other meta-information.
  For the root node, we don't always use the 120 bytes, and for non-root
  rows, we don't use that data at all. So we usually pad back that
  record. However, for indexes that fit entirely in the root record,
  we don't pad them to 4096, and it turns out we don't need to pad
  them with the spare 120 bytes either.
  
  I was doing a test with lots of 'chained' btree indexes, and this
  extra padding ended up being 4.6M => 4.3M of wasted space. I imagine
  that bzr-search will have a similar issue with tiny indexes.
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'

--- a/bzrlib/btree_index.py	2009-10-19 15:39:25 +0000
+++ b/bzrlib/btree_index.py	2009-10-29 16:15:43 +0000
@@ -411,7 +411,8 @@
             # Special case the first node as it may be prefixed
             node = row.spool.read(_PAGE_SIZE)
             result.write(node[reserved:])
-            result.write("\x00" * (reserved - position))
+            if len(node) == _PAGE_SIZE:
+                result.write("\x00" * (reserved - position))
             position = 0 # Only the root row actually has an offset
             copied_len = osutils.pumpfile(row.spool, result)
             if copied_len != (row.nodes - 1) * _PAGE_SIZE:

=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2009-10-20 04:25:27 +0000
+++ b/bzrlib/tests/test_btree_index.py	2009-10-29 16:15:43 +0000
@@ -161,7 +161,7 @@
         temp_file = builder.finish()
         content = temp_file.read()
         del temp_file
-        self.assertEqual(158, len(content))
+        self.assertEqual(131, len(content))
         self.assertEqual(
             "B+Tree Graph Index 2\nnode_ref_lists=0\nkey_elements=1\nlen=5\n"
             "row_lengths=1\n",
@@ -185,7 +185,7 @@
         temp_file = builder.finish()
         content = temp_file.read()
         del temp_file
-        self.assertEqual(264, len(content))
+        self.assertEqual(238, len(content))
         self.assertEqual(
             "B+Tree Graph Index 2\nnode_ref_lists=2\nkey_elements=2\nlen=10\n"
             "row_lengths=1\n",
@@ -251,7 +251,7 @@
         temp_file = builder.finish()
         content = temp_file.read()
         del temp_file
-        self.assertEqual(181, len(content))
+        self.assertEqual(155, len(content))
         self.assertEqual(
             "B+Tree Graph Index 2\nnode_ref_lists=0\nkey_elements=1\nlen=10\n"
             "row_lengths=1\n",
@@ -718,7 +718,7 @@
         # The entire index should have been read, as it is one page long.
         self.assertEqual([('readv', 'index', [(0, size)], False, None)],
             transport._activity)
-        self.assertEqual(1199, size)
+        self.assertEqual(1173, size)
 
     def test__read_nodes_no_size_one_page_reads_once(self):
         self.make_index(nodes=[(('key',), 'value', ())])
@@ -772,7 +772,7 @@
         # The entire index should have been read linearly.
         self.assertEqual([('readv', 'index', [(0, size)], False, None)],
             transport._activity)
-        self.assertEqual(1514, size)
+        self.assertEqual(1488, size)
 
     def test_validate_two_pages(self):
         builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)