Rev 35: Make bloom creation optional using BTreeIndex._default_use_blooms, add 20 reserved bytes for index headers (fixes blow out at 1000000 keys), reduce memory use during conversion, and make the convert present a progress bar. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Mon Jul 14 14:10:18 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 35
revision-id: robertc at robertcollins.net-20080714131017-yzik8lo3yucc5zhr
parent: robertc at robertcollins.net-20080714130856-rr86csw7zcgr3141
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-07-14 23:10:17 +1000
message:
Make bloom creation optional using BTreeIndex._default_use_blooms, add 20 reserved bytes for index headers (fixes blow out at 1000000 keys), reduce memory use during conversion, and make the convert present a progress bar.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
repofmt.py repofmt.py-20080701113732-m1iu3n94ikbxdelb-1
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
tests/test_repofmt.py test_repofmt.py-20080704030345-bza6rrd6nf4sdmyy-1
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-13 20:34:32 +0000
+++ b/btree_index.py 2008-07-14 13:10:17 +0000
@@ -43,7 +43,7 @@
_INTERNAL_FLAG = "type=internal\n"
_INTERNAL_OFFSET = "offset="
-_RESERVED_HEADER_BYTES = 100
+_RESERVED_HEADER_BYTES = 120
_PAGE_SIZE = 4096
_BLOOM_DISK_SIZE = 256*8 # Internal nodes get a 256 byte bloom
_BLOOM_BUILD_SIZE = 8192*8 # Seems to work for lots of nodes
@@ -300,7 +300,10 @@
# A stack with the number of nodes of each size. 0 is the root node
# and must always be 1 (if there are any nodes in the tree).
self.row_lengths = []
- global_bloom = BloomSHA1(_GLOBAL_BLOOM_START_SIZE)
+ if BTreeGraphIndex._default_use_blooms:
+ global_bloom = BloomSHA1(_GLOBAL_BLOOM_START_SIZE)
+ else:
+ global_bloom = None
def add_key(string_key, key_line):
"""Add a key to the current chunk.
@@ -387,14 +390,15 @@
row.bloom.insert(string_key)
# Keep the bloom fairly empty. It seems we need better than
# 16 b/e to avoid accidentally filling at a lower size
- global_bloom.insert(string_key)
- if (global_bloom._num_entries * _GLOBAL_BLOOM_RESIZE_THRESHOLD
- > global_bloom._num_bits):
- global_bloom.resize(global_bloom._num_bits *
- _GLOBAL_BLOOM_GROW_FACTOR)
- if 'index' in debug.debug_flags:
- trace.mutter('resizing global bloom: %s',
- row.bloom)
+ if global_bloom is not None:
+ global_bloom.insert(string_key)
+ if (global_bloom._num_entries * _GLOBAL_BLOOM_RESIZE_THRESHOLD
+ > global_bloom._num_bits):
+ global_bloom.resize(global_bloom._num_bits *
+ _GLOBAL_BLOOM_GROW_FACTOR)
+ if 'index' in debug.debug_flags:
+ trace.mutter('resizing global bloom: %s',
+ row.bloom)
# Loop over all nodes adding them to the bottom row
# (rows[-1]). When we finish a chunk in a row,
# propogate the key that didn't fit (comes after the chunk) to the
@@ -424,18 +428,17 @@
lines.append(_OPTION_LEN + str(key_count) + '\n')
row_lengths = [row.nodes for row in rows]
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
+ num_bloom_pages = 0
if len(rows) > 1:
- # We only write a global bloom if there is an internal node
- optimal_num_bits = max(_PAGE_SIZE*8, global_bloom._num_entries*9.6)
- global_bloom.resize(optimal_num_bits)
- if 'index' in debug.debug_flags:
- # 9.6 == 1% error, with k=5
- trace.note('O global: %r', global_bloom)
- num_bloom_pages = global_bloom._num_bytes / _PAGE_SIZE
- lines.append(_OPTION_BLOOM_PAGES + str(num_bloom_pages) + '\n')
- else:
- num_bloom_pages = 0
- lines.append(_OPTION_BLOOM_PAGES + '0\n')
+ if global_bloom is not None:
+ # We only write a global bloom if there is an internal node
+ optimal_num_bits = max(_PAGE_SIZE*8, global_bloom._num_entries*9.6)
+ global_bloom.resize(optimal_num_bits)
+ if 'index' in debug.debug_flags:
+ # 9.6 == 1% error, with k=5
+ trace.note('O global: %r', global_bloom)
+ num_bloom_pages = global_bloom._num_bytes / _PAGE_SIZE
+ lines.append(_OPTION_BLOOM_PAGES + str(num_bloom_pages) + '\n')
result.writelines(lines)
position = sum(map(len, lines))
root_row = True
=== modified file 'repofmt.py'
--- a/repofmt.py 2008-07-13 20:34:32 +0000
+++ b/repofmt.py 2008-07-14 13:10:17 +0000
@@ -27,8 +27,11 @@
from bzrlib import debug, errors, pack, repository
from bzrlib.index import GraphIndex, GraphIndexBuilder
from bzrlib.inter import InterObject
-from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder,
- FixedMemoryGraphIndex
+from bzrlib.plugins.index2.btree_index import (
+ BTreeBuilder,
+ BTreeGraphIndex,
+ FixedMemoryGraphIndex,
+ )
from bzrlib.knit import (
_KnitGraphIndex,
KnitVersionedFiles,
@@ -48,6 +51,7 @@
ReconcilePacker,
OptimisingPacker,
)
+from bzrlib import ui
def open_pack(self):
@@ -382,14 +386,14 @@
"""Convert source to be of format target."""
try:
self._prep_conversion()
- self._convert()
+ self._convert(pb)
self._pivot()
except:
self._cancel_conversion()
raise
self._complete()
- def _convert(self):
+ def _convert(self, pb):
"""Do the time consuming portions of conversion.
This should not alter live data, just prepare new disk structures for
@@ -405,15 +409,26 @@
index_sizes = {}
for index in indices:
keys = index.key_count()
- if len(keys) > 100000 and type(index) == GraphIndex:
+ if keys > 100000 and type(index) == GraphIndex:
# Avoid pathological memory scaling
index = FixedMemoryGraphIndex(index._transport, index._name,
index._size)
+ index.key_count()
key_length = index._key_length
ref_lists = index.node_ref_lists
builder = BTreeBuilder(reference_lists=ref_lists, key_elements=key_length)
- for node in index.iter_all_entries():
+ for pos, node in enumerate(index.iter_all_entries()):
builder.add_node(*node[1:])
+ if pb is not None:
+ pb.update("Copying", pos, keys)
+ # reset index.
+ index._bisect_nodes = None
+ index._nodes = None
+ index._parsed_byte_map = []
+ index._parsed_key_map = []
+ index._key_count = None
+ index._keys_by_offset = None
+ index._nodes_by_key = None
size = self.source._transport.put_file('newindices/' + index._name,
builder.finish())
sizes = index_sizes.setdefault(index._name[:-4],
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-13 17:39:05 +0000
+++ b/tests/test_btree_index.py 2008-07-14 13:10:17 +0000
@@ -69,6 +69,18 @@
# test names here are suffixed by the key length and reference list count
# that they test.
+ def setUp(self):
+ TestCaseWithTransport.setUp(self)
+ self._original_default = btree_index.BTreeGraphIndex._default_use_blooms
+ self._original_header = btree_index._RESERVED_HEADER_BYTES
+ # test with blooms as thats what our sizes are adjusted for
+ def restore():
+ btree_index._RESERVED_HEADER_BYTES = self._original_header
+ btree_index.BTreeGraphIndex._default_use_blooms = self._original_default
+ self.addCleanup(restore)
+ btree_index.BTreeGraphIndex._default_use_blooms = True
+ btree_index._RESERVED_HEADER_BYTES = 100
+
def make_nodes(self, count, key_elements, reference_lists):
"""Generate count*key_elements sample nodes."""
keys = []
@@ -817,6 +829,7 @@
self.assertEqual(nodes[30], bare_nodes[0])
# Should have read the root node, one bloom page, then one leaf page:
self.assertEqual([('readv', 'index', [(0, 4096)], False, None),
+ ('readv', 'index', [(4096, 4096), ], False, None),
('readv', 'index', [(8192, 4096), ], False, None)],
transport._activity)
=== modified file 'tests/test_repofmt.py'
--- a/tests/test_repofmt.py 2008-07-04 04:23:30 +0000
+++ b/tests/test_repofmt.py 2008-07-14 13:10:17 +0000
@@ -196,13 +196,13 @@
def test__cancel_after_convert(self):
self.inter._prep_conversion()
- self.inter._convert()
+ self.inter._convert(None)
self.inter._cancel_conversion()
self.assertUnModified()
def test_cancel_after_pivot(self):
self.inter._prep_conversion()
- self.inter._convert()
+ self.inter._convert(None)
self.inter._pivot()
self.inter._cancel_conversion()
self.assertUnModified()
@@ -210,7 +210,7 @@
def test__complete(self):
# after converting, complete moves stuff into place
self.inter._prep_conversion()
- self.inter._convert()
+ self.inter._convert(None)
self.inter._pivot()
self.inter._complete()
self.assertNamesUnlocked()
@@ -223,7 +223,7 @@
def test__convert_empty(self):
self.inter._prep_conversion()
- self.inter._convert()
+ self.inter._convert(None)
self.assertIndicesConverted()
self.inter.unlock()
@@ -234,13 +234,13 @@
tree.commit('post 1')
tree.commit('post 2')
self.inter._prep_conversion()
- self.inter._convert()
+ self.inter._convert(None)
self.assertIndicesConverted()
self.inter.unlock()
def test__pivot(self):
self.inter._prep_conversion()
- self.inter._convert()
+ self.inter._convert(None)
# track the calls made
self.inter.source._transport = get_transport('trace+' +
self.inter.source._transport.base)
@@ -275,27 +275,38 @@
raise Exception(name)
return log_error
+ def make_error_log_call_1(self, name, calls):
+ def log_error(arg):
+ calls.append((name, arg))
+ raise Exception(name)
+ return log_error
+
def make_log_call(self, name, calls):
def log_call():
calls.append(name)
return log_call
+ def make_log_call_1(self, name, calls):
+ def log_call(arg):
+ calls.append((name, arg))
+ return log_call
+
def test_convert_noerror(self):
# setup lambdas to trap what convert calls:
calls = []
self.inter._prep_conversion = self.make_log_call('prep', calls)
- self.inter._convert = self.make_log_call('convert', calls)
+ self.inter._convert = self.make_log_call_1('convert', calls)
self.inter._pivot = self.make_log_call('pivot', calls)
self.inter._complete = self.make_log_call('complete', calls)
self.inter._cancel_conversion = self.make_log_call('cancel', calls)
self.inter.convert(None)
- self.assertEqual(['prep', 'convert', 'pivot', 'complete'], calls)
+ self.assertEqual(['prep', ('convert', None), 'pivot', 'complete'], calls)
def test_convert_prep_error_cancels(self):
# setup lambdas to trap what convert calls:
calls = []
self.inter._prep_conversion = self.make_error_log_call('prep', calls)
- self.inter._convert = self.make_log_call('convert', calls)
+ self.inter._convert = self.make_log_call_1('convert', calls)
self.inter._pivot = self.make_log_call('pivot', calls)
self.inter._complete = self.make_log_call('complete', calls)
self.inter._cancel_conversion = self.make_log_call('cancel', calls)
@@ -307,37 +318,37 @@
# setup lambdas to trap what convert calls:
calls = []
self.inter._prep_conversion = self.make_log_call('prep', calls)
- self.inter._convert = self.make_error_log_call('convert', calls)
+ self.inter._convert = self.make_error_log_call_1('convert', calls)
self.inter._pivot = self.make_log_call('pivot', calls)
self.inter._complete = self.make_log_call('complete', calls)
self.inter._cancel_conversion = self.make_log_call('cancel', calls)
error = self.assertRaises(Exception, self.inter.convert, None)
self.assertEqual(error.args, ('convert',))
- self.assertEqual(['prep', 'convert', 'cancel'], calls)
+ self.assertEqual(['prep', ('convert', None), 'cancel'], calls)
def test_convert_pivot_error_cancels(self):
# setup lambdas to trap what convert calls:
calls = []
self.inter._prep_conversion = self.make_log_call('prep', calls)
- self.inter._convert = self.make_log_call('convert', calls)
+ self.inter._convert = self.make_log_call_1('convert', calls)
self.inter._pivot = self.make_error_log_call('pivot', calls)
self.inter._complete = self.make_log_call('complete', calls)
self.inter._cancel_conversion = self.make_log_call('cancel', calls)
error = self.assertRaises(Exception, self.inter.convert, None)
self.assertEqual(error.args, ('pivot',))
- self.assertEqual(['prep', 'convert', 'pivot', 'cancel'], calls)
+ self.assertEqual(['prep', ('convert', None), 'pivot', 'cancel'], calls)
def test_convert_complete_error_does_not_cancel(self):
# setup lambdas to trap what convert calls:
calls = []
self.inter._prep_conversion = self.make_log_call('prep', calls)
- self.inter._convert = self.make_log_call('convert', calls)
+ self.inter._convert = self.make_log_call_1('convert', calls)
self.inter._pivot = self.make_log_call('pivot', calls)
self.inter._complete = self.make_error_log_call('complete', calls)
self.inter._cancel_conversion = self.make_log_call('cancel', calls)
error = self.assertRaises(Exception, self.inter.convert, None)
self.assertEqual(error.args, ('complete',))
- self.assertEqual(['prep', 'convert', 'pivot', 'complete'], calls)
+ self.assertEqual(['prep', ('convert', None), 'pivot', 'complete'], calls)
class TestReplacementConverter(TestCaseWithTransport):
More information about the bazaar-commits
mailing list