Rev 4173: Restore the ability to spill, but prepare a flag to disable it. in lp:///~jameinel/bzr/1.14-btree_spill
John Arbash Meinel
john at arbash-meinel.com
Sat Mar 21 02:47:13 GMT 2009
At lp:///~jameinel/bzr/1.14-btree_spill
------------------------------------------------------------
revno: 4173
revision-id: john at arbash-meinel.com-20090321024706-drh9950i56vb484i
parent: john at arbash-meinel.com-20090320174406-nd2kq4i77fmfu3ss
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.14-btree_spill
timestamp: Fri 2009-03-20 21:47:06 -0500
message:
Restore the ability to spill, but prepare a flag to disable it.
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2009-03-20 17:44:06 +0000
+++ b/bzrlib/btree_index.py 2009-03-21 02:47:06 +0000
@@ -140,6 +140,7 @@
# Indicate it hasn't been built yet
self._nodes_by_key = None
self._optimize_for_size = False
+ self._combine_spilled_indices = True
def add_node(self, key, value, references=()):
"""Add a node to the index.
@@ -180,8 +181,17 @@
combine mem with the first and second indexes, creating a new one of
size 4x. On the fifth create a single new one, etc.
"""
- (new_backing_file,
- size) = self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
+ iterators_to_combine = [self._iter_mem_nodes()]
+ pos = -1
+ for pos, backing in enumerate(self._backing_indices):
+ if backing is None:
+ pos -= 1
+ break
+ iterators_to_combine.append(backing.iter_all_entries())
+ backing_pos = pos + 1
+ new_backing_file, size = \
+ self._write_nodes(self._iter_smallest(iterators_to_combine),
+ allow_optimize=False)
dir_path, base_name = osutils.split(new_backing_file.name)
# Note: The transport here isn't strictly needed, because we will use
# direct access to the new_backing._file object
@@ -189,7 +199,11 @@
base_name, size)
# GC will clean up the file
new_backing._file = new_backing_file
- self._backing_indices.append(new_backing)
+ if len(self._backing_indices) == backing_pos:
+ self._backing_indices.append(None)
+ self._backing_indices[backing_pos] = new_backing
+ for pos in range(backing_pos):
+ self._backing_indices[pos] = None
self._keys = set()
self._nodes = {}
self._nodes_by_key = None
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2009-03-19 20:21:23 +0000
+++ b/bzrlib/tests/test_btree_index.py 2009-03-21 02:47:06 +0000
@@ -363,60 +363,155 @@
self.assertEqual(1, len(builder._nodes))
self.assertEqual(1, len(builder._keys))
self.assertIs(None, builder._nodes_by_key)
- # And spills to a second backing index
- builder.add_node(*nodes[3])
- self.assertEqual(0, len(builder._nodes))
- self.assertEqual(0, len(builder._keys))
- self.assertIs(None, builder._nodes_by_key)
- self.assertEqual(2, len(builder._backing_indices))
- for i in xrange(2):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- # The next spills to the next slot
- builder.add_node(*nodes[4])
- builder.add_node(*nodes[5])
- self.assertEqual(0, len(builder._nodes))
- self.assertEqual(0, len(builder._keys))
- self.assertIs(None, builder._nodes_by_key)
- self.assertEqual(3, len(builder._backing_indices))
- for i in xrange(3):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- # Next spill combines
- builder.add_node(*nodes[6])
- builder.add_node(*nodes[7])
- self.assertEqual(4, len(builder._backing_indices))
- for i in xrange(4):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- # And so forth - counting up in binary.
- builder.add_node(*nodes[8])
- builder.add_node(*nodes[9])
- self.assertEqual(5, len(builder._backing_indices))
- for i in xrange(5):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- builder.add_node(*nodes[10])
- builder.add_node(*nodes[11])
- self.assertEqual(6, len(builder._backing_indices))
- for i in xrange(6):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- builder.add_node(*nodes[12])
- # Test that memory and disk are both used for query methods; and that
- # None is skipped over happily.
- self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
- list(builder.iter_all_entries()))
- # Two nodes - one memory one disk
- self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
- set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
- self.assertEqual(13, builder.key_count())
- self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
- set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
- builder.add_node(*nodes[13])
- self.assertEqual(7, len(builder._backing_indices))
- for i in xrange(7):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- builder.add_node(*nodes[14])
- builder.add_node(*nodes[15])
- self.assertEqual(8, len(builder._backing_indices))
- for i in xrange(8):
- self.assertEqual(2, builder._backing_indices[i].key_count())
+ # And spills to a second backing index combing all
+ builder.add_node(*nodes[3])
+ self.assertEqual(0, len(builder._nodes))
+ self.assertEqual(0, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ self.assertEqual(2, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ # The next spills to the 2-len slot
+ builder.add_node(*nodes[4])
+ builder.add_node(*nodes[5])
+ self.assertEqual(0, len(builder._nodes))
+ self.assertEqual(0, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ self.assertEqual(2, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ # Next spill combines
+ builder.add_node(*nodes[6])
+ builder.add_node(*nodes[7])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ # And so forth - counting up in binary.
+ builder.add_node(*nodes[8])
+ builder.add_node(*nodes[9])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ builder.add_node(*nodes[10])
+ builder.add_node(*nodes[11])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ builder.add_node(*nodes[12])
+ # Test that memory and disk are both used for query methods; and that
+ # None is skipped over happily.
+ self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
+ list(builder.iter_all_entries()))
+ # Two nodes - one memory one disk
+ self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+ set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
+ self.assertEqual(13, builder.key_count())
+ self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+ set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
+ builder.add_node(*nodes[13])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ builder.add_node(*nodes[14])
+ builder.add_node(*nodes[15])
+ self.assertEqual(4, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(None, builder._backing_indices[2])
+ self.assertEqual(16, builder._backing_indices[3].key_count())
+ # Now finish, and check we got a correctly ordered tree
+ transport = self.get_transport('')
+ size = transport.put_file('index', builder.finish())
+ index = btree_index.BTreeGraphIndex(transport, 'index', size)
+ nodes = list(index.iter_all_entries())
+ self.assertEqual(sorted(nodes), nodes)
+ self.assertEqual(16, len(nodes))
+
+ def test_spill_index_stress_1_1_no_combine(self):
+ builder = btree_index.BTreeBuilder(key_elements=1, spill_at=2)
+ builder._combine_spilled_indices = False
+ nodes = [node[0:2] for node in self.make_nodes(16, 1, 0)]
+ builder.add_node(*nodes[0])
+ # Test the parts of the index that take up memory are doing so
+ # predictably.
+ self.assertEqual(1, len(builder._nodes))
+ self.assertEqual(1, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ builder.add_node(*nodes[1])
+ self.assertEqual(0, len(builder._nodes))
+ self.assertEqual(0, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ self.assertEqual(1, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ # now back to memory
+ builder.add_node(*nodes[2])
+ self.assertEqual(1, len(builder._nodes))
+ self.assertEqual(1, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ # And spills to a second backing index but doesn't combine
+ builder.add_node(*nodes[3])
+ self.assertEqual(0, len(builder._nodes))
+ self.assertEqual(0, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ self.assertEqual(2, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ # The next spills to the 2-len slot
+ builder.add_node(*nodes[4])
+ builder.add_node(*nodes[5])
+ self.assertEqual(0, len(builder._nodes))
+ self.assertEqual(0, len(builder._keys))
+ self.assertIs(None, builder._nodes_by_key)
+ self.assertEqual(2, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ # Next spill combines
+ builder.add_node(*nodes[6])
+ builder.add_node(*nodes[7])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ # And so forth - counting up in binary.
+ builder.add_node(*nodes[8])
+ builder.add_node(*nodes[9])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ builder.add_node(*nodes[10])
+ builder.add_node(*nodes[11])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ builder.add_node(*nodes[12])
+ # Test that memory and disk are both used for query methods; and that
+ # None is skipped over happily.
+ self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
+ list(builder.iter_all_entries()))
+ # Two nodes - one memory one disk
+ self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+ set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
+ self.assertEqual(13, builder.key_count())
+ self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
+ set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
+ builder.add_node(*nodes[13])
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ builder.add_node(*nodes[14])
+ builder.add_node(*nodes[15])
+ self.assertEqual(4, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(None, builder._backing_indices[2])
+ self.assertEqual(16, builder._backing_indices[3].key_count())
# Now finish, and check we got a correctly ordered tree
transport = self.get_transport('')
size = transport.put_file('index', builder.finish())
@@ -458,39 +553,43 @@
self.assertNotEqual({}, builder._nodes_by_key)
# We should have a new entry
self.assertNotEqual(old, builder._nodes_by_key)
- # And spills to a second backing index
+ # And spills to a second backing index combing all
builder.add_node(*nodes[3])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
self.assertIs(None, builder._nodes_by_key)
self.assertEqual(2, len(builder._backing_indices))
- for i in xrange(2):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- # The next spills to yet another slot
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ # The next spills to the 2-len slot
builder.add_node(*nodes[4])
builder.add_node(*nodes[5])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
self.assertIs(None, builder._nodes_by_key)
- self.assertEqual(3, len(builder._backing_indices))
- for i in xrange(3):
- self.assertEqual(2, builder._backing_indices[i].key_count())
+ self.assertEqual(2, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ # Next spill combines
builder.add_node(*nodes[6])
builder.add_node(*nodes[7])
- self.assertEqual(4, len(builder._backing_indices))
- for i in xrange(4):
- self.assertEqual(2, builder._backing_indices[i].key_count())
- # And so forth
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(8, builder._backing_indices[2].key_count())
+ # And so forth - counting up in binary.
builder.add_node(*nodes[8])
builder.add_node(*nodes[9])
- self.assertEqual(5, len(builder._backing_indices))
- for i in xrange(5):
- self.assertEqual(2, builder._backing_indices[i].key_count())
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(8, builder._backing_indices[2].key_count())
builder.add_node(*nodes[10])
builder.add_node(*nodes[11])
- self.assertEqual(6, len(builder._backing_indices))
- for i in xrange(6):
- self.assertEqual(2, builder._backing_indices[i].key_count())
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ self.assertEqual(8, builder._backing_indices[2].key_count())
builder.add_node(*nodes[12])
# Test that memory and disk are both used for query methods; and that
# None is skipped over happily.
@@ -503,14 +602,17 @@
self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
builder.add_node(*nodes[13])
- self.assertEqual(7, len(builder._backing_indices))
- for i in xrange(7):
- self.assertEqual(2, builder._backing_indices[i].key_count())
+ self.assertEqual(3, len(builder._backing_indices))
+ self.assertEqual(2, builder._backing_indices[0].key_count())
+ self.assertEqual(4, builder._backing_indices[1].key_count())
+ self.assertEqual(8, builder._backing_indices[2].key_count())
builder.add_node(*nodes[14])
builder.add_node(*nodes[15])
- self.assertEqual(8, len(builder._backing_indices))
- for i in xrange(7):
- self.assertEqual(2, builder._backing_indices[i].key_count())
+ self.assertEqual(4, len(builder._backing_indices))
+ self.assertEqual(None, builder._backing_indices[0])
+ self.assertEqual(None, builder._backing_indices[1])
+ self.assertEqual(None, builder._backing_indices[2])
+ self.assertEqual(16, builder._backing_indices[3].key_count())
# Now finish, and check we got a correctly ordered tree
transport = self.get_transport('')
size = transport.put_file('index', builder.finish())
More information about the bazaar-commits
mailing list