Rev 4173: Restore the ability to spill, but prepare a flag to disable it. in lp:///~jameinel/bzr/1.14-btree_spill

John Arbash Meinel john at arbash-meinel.com
Sat Mar 21 02:47:13 GMT 2009


At lp:///~jameinel/bzr/1.14-btree_spill

------------------------------------------------------------
revno: 4173
revision-id: john at arbash-meinel.com-20090321024706-drh9950i56vb484i
parent: john at arbash-meinel.com-20090320174406-nd2kq4i77fmfu3ss
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.14-btree_spill
timestamp: Fri 2009-03-20 21:47:06 -0500
message:
  Restore the ability to spill, but prepare a flag to disable it.
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2009-03-20 17:44:06 +0000
+++ b/bzrlib/btree_index.py	2009-03-21 02:47:06 +0000
@@ -140,6 +140,7 @@
         # Indicate it hasn't been built yet
         self._nodes_by_key = None
         self._optimize_for_size = False
+        self._combine_spilled_indices = True
 
     def add_node(self, key, value, references=()):
         """Add a node to the index.
@@ -180,8 +181,17 @@
         combine mem with the first and second indexes, creating a new one of
         size 4x. On the fifth create a single new one, etc.
         """
-        (new_backing_file,
-         size) = self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
+        iterators_to_combine = [self._iter_mem_nodes()]
+        pos = -1
+        for pos, backing in enumerate(self._backing_indices):
+            if backing is None:
+                pos -= 1
+                break
+            iterators_to_combine.append(backing.iter_all_entries())
+        backing_pos = pos + 1
+        new_backing_file, size = \
+            self._write_nodes(self._iter_smallest(iterators_to_combine),
+                              allow_optimize=False)
         dir_path, base_name = osutils.split(new_backing_file.name)
         # Note: The transport here isn't strictly needed, because we will use
         #       direct access to the new_backing._file object
@@ -189,7 +199,11 @@
                                       base_name, size)
         # GC will clean up the file
         new_backing._file = new_backing_file
-        self._backing_indices.append(new_backing)
+        if len(self._backing_indices) == backing_pos:
+            self._backing_indices.append(None)
+        self._backing_indices[backing_pos] = new_backing
+        for pos in range(backing_pos):
+            self._backing_indices[pos] = None
         self._keys = set()
         self._nodes = {}
         self._nodes_by_key = None

=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2009-03-19 20:21:23 +0000
+++ b/bzrlib/tests/test_btree_index.py	2009-03-21 02:47:06 +0000
@@ -363,60 +363,155 @@
         self.assertEqual(1, len(builder._nodes))
         self.assertEqual(1, len(builder._keys))
         self.assertIs(None, builder._nodes_by_key)
-        # And spills to a second backing index
-        builder.add_node(*nodes[3])
-        self.assertEqual(0, len(builder._nodes))
-        self.assertEqual(0, len(builder._keys))
-        self.assertIs(None, builder._nodes_by_key)
-        self.assertEqual(2, len(builder._backing_indices))
-        for i in xrange(2):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        # The next spills to the next slot
-        builder.add_node(*nodes[4])
-        builder.add_node(*nodes[5])
-        self.assertEqual(0, len(builder._nodes))
-        self.assertEqual(0, len(builder._keys))
-        self.assertIs(None, builder._nodes_by_key)
-        self.assertEqual(3, len(builder._backing_indices))
-        for i in xrange(3):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        # Next spill combines
-        builder.add_node(*nodes[6])
-        builder.add_node(*nodes[7])
-        self.assertEqual(4, len(builder._backing_indices))
-        for i in xrange(4):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        # And so forth - counting up in binary.
-        builder.add_node(*nodes[8])
-        builder.add_node(*nodes[9])
-        self.assertEqual(5, len(builder._backing_indices))
-        for i in xrange(5):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        builder.add_node(*nodes[10])
-        builder.add_node(*nodes[11])
-        self.assertEqual(6, len(builder._backing_indices))
-        for i in xrange(6):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        builder.add_node(*nodes[12])
-        # Test that memory and disk are both used for query methods; and that
-        # None is skipped over happily.
-        self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
-            list(builder.iter_all_entries()))
-        # Two nodes - one memory one disk
-        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
-            set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
-        self.assertEqual(13, builder.key_count())
-        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
-            set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
-        builder.add_node(*nodes[13])
-        self.assertEqual(7, len(builder._backing_indices))
-        for i in xrange(7):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        builder.add_node(*nodes[14])
-        builder.add_node(*nodes[15])
-        self.assertEqual(8, len(builder._backing_indices))
-        for i in xrange(8):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
+        # And spills to a second backing index combing all
+        builder.add_node(*nodes[3])
+        self.assertEqual(0, len(builder._nodes))
+        self.assertEqual(0, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        self.assertEqual(2, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        # The next spills to the 2-len slot
+        builder.add_node(*nodes[4])
+        builder.add_node(*nodes[5])
+        self.assertEqual(0, len(builder._nodes))
+        self.assertEqual(0, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        self.assertEqual(2, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        # Next spill combines
+        builder.add_node(*nodes[6])
+        builder.add_node(*nodes[7])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        # And so forth - counting up in binary.
+        builder.add_node(*nodes[8])
+        builder.add_node(*nodes[9])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        builder.add_node(*nodes[10])
+        builder.add_node(*nodes[11])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        builder.add_node(*nodes[12])
+        # Test that memory and disk are both used for query methods; and that
+        # None is skipped over happily.
+        self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
+            list(builder.iter_all_entries()))
+        # Two nodes - one memory one disk
+        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+            set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
+        self.assertEqual(13, builder.key_count())
+        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+            set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
+        builder.add_node(*nodes[13])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        builder.add_node(*nodes[14])
+        builder.add_node(*nodes[15])
+        self.assertEqual(4, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(None, builder._backing_indices[2])
+        self.assertEqual(16, builder._backing_indices[3].key_count())
+        # Now finish, and check we got a correctly ordered tree
+        transport = self.get_transport('')
+        size = transport.put_file('index', builder.finish())
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        nodes = list(index.iter_all_entries())
+        self.assertEqual(sorted(nodes), nodes)
+        self.assertEqual(16, len(nodes))
+
+    def test_spill_index_stress_1_1_no_combine(self):
+        builder = btree_index.BTreeBuilder(key_elements=1, spill_at=2)
+        builder._combine_spilled_indices = False
+        nodes = [node[0:2] for node in self.make_nodes(16, 1, 0)]
+        builder.add_node(*nodes[0])
+        # Test the parts of the index that take up memory are doing so
+        # predictably.
+        self.assertEqual(1, len(builder._nodes))
+        self.assertEqual(1, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        builder.add_node(*nodes[1])
+        self.assertEqual(0, len(builder._nodes))
+        self.assertEqual(0, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        self.assertEqual(1, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        # now back to memory
+        builder.add_node(*nodes[2])
+        self.assertEqual(1, len(builder._nodes))
+        self.assertEqual(1, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        # And spills to a second backing index but doesn't combine
+        builder.add_node(*nodes[3])
+        self.assertEqual(0, len(builder._nodes))
+        self.assertEqual(0, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        self.assertEqual(2, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        # The next spills to the 2-len slot
+        builder.add_node(*nodes[4])
+        builder.add_node(*nodes[5])
+        self.assertEqual(0, len(builder._nodes))
+        self.assertEqual(0, len(builder._keys))
+        self.assertIs(None, builder._nodes_by_key)
+        self.assertEqual(2, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        # Next spill combines
+        builder.add_node(*nodes[6])
+        builder.add_node(*nodes[7])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        # And so forth - counting up in binary.
+        builder.add_node(*nodes[8])
+        builder.add_node(*nodes[9])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        builder.add_node(*nodes[10])
+        builder.add_node(*nodes[11])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        builder.add_node(*nodes[12])
+        # Test that memory and disk are both used for query methods; and that
+        # None is skipped over happily.
+        self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
+            list(builder.iter_all_entries()))
+        # Two nodes - one memory one disk
+        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+            set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
+        self.assertEqual(13, builder.key_count())
+        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+            set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
+        builder.add_node(*nodes[13])
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        builder.add_node(*nodes[14])
+        builder.add_node(*nodes[15])
+        self.assertEqual(4, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(None, builder._backing_indices[2])
+        self.assertEqual(16, builder._backing_indices[3].key_count())
         # Now finish, and check we got a correctly ordered tree
         transport = self.get_transport('')
         size = transport.put_file('index', builder.finish())
@@ -458,39 +553,43 @@
         self.assertNotEqual({}, builder._nodes_by_key)
         # We should have a new entry
         self.assertNotEqual(old, builder._nodes_by_key)
-        # And spills to a second backing index
+        # And spills to a second backing index combing all
         builder.add_node(*nodes[3])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
         self.assertIs(None, builder._nodes_by_key)
         self.assertEqual(2, len(builder._backing_indices))
-        for i in xrange(2):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        # The next spills to yet another slot
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        # The next spills to the 2-len slot
         builder.add_node(*nodes[4])
         builder.add_node(*nodes[5])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
         self.assertIs(None, builder._nodes_by_key)
-        self.assertEqual(3, len(builder._backing_indices))
-        for i in xrange(3):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
+        self.assertEqual(2, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        # Next spill combines
         builder.add_node(*nodes[6])
         builder.add_node(*nodes[7])
-        self.assertEqual(4, len(builder._backing_indices))
-        for i in xrange(4):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
-        # And so forth
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(8, builder._backing_indices[2].key_count())
+        # And so forth - counting up in binary.
         builder.add_node(*nodes[8])
         builder.add_node(*nodes[9])
-        self.assertEqual(5, len(builder._backing_indices))
-        for i in xrange(5):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(8, builder._backing_indices[2].key_count())
         builder.add_node(*nodes[10])
         builder.add_node(*nodes[11])
-        self.assertEqual(6, len(builder._backing_indices))
-        for i in xrange(6):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        self.assertEqual(8, builder._backing_indices[2].key_count())
         builder.add_node(*nodes[12])
         # Test that memory and disk are both used for query methods; and that
         # None is skipped over happily.
@@ -503,14 +602,17 @@
         self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
             set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
         builder.add_node(*nodes[13])
-        self.assertEqual(7, len(builder._backing_indices))
-        for i in xrange(7):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
+        self.assertEqual(3, len(builder._backing_indices))
+        self.assertEqual(2, builder._backing_indices[0].key_count())
+        self.assertEqual(4, builder._backing_indices[1].key_count())
+        self.assertEqual(8, builder._backing_indices[2].key_count())
         builder.add_node(*nodes[14])
         builder.add_node(*nodes[15])
-        self.assertEqual(8, len(builder._backing_indices))
-        for i in xrange(7):
-            self.assertEqual(2, builder._backing_indices[i].key_count())
+        self.assertEqual(4, len(builder._backing_indices))
+        self.assertEqual(None, builder._backing_indices[0])
+        self.assertEqual(None, builder._backing_indices[1])
+        self.assertEqual(None, builder._backing_indices[2])
+        self.assertEqual(16, builder._backing_indices[3].key_count())
         # Now finish, and check we got a correctly ordered tree
         transport = self.get_transport('')
         size = transport.put_file('index', builder.finish())



More information about the bazaar-commits mailing list