Rev 39: round the last page of indices to avoid massive slack space in bzr search indices. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Mon Jul 14 18:43:54 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 39
revision-id: robertc at robertcollins.net-20080714174346-vb2vt10rx60ok0gr
parent: robertc at robertcollins.net-20080714150358-ms3m7vxxvm7c85oy
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2008-07-15 03:43:46 +1000
message:
round the last page of indices to avoid massive slack space in bzr search indices.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
chunk_writer.py chunk_writer.py-20080630234519-6ggn4id17nipovny-1
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
tests/test_chunk_writer.py test_chunk_writer.py-20080630234519-6ggn4id17nipovny-2
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-14 15:03:58 +0000
+++ b/btree_index.py 2008-07-14 17:43:46 +0000
@@ -111,13 +111,17 @@
self.spool = tempfile.TemporaryFile()
self.writer = None
- def finish_node(self):
- byte_lines, _ = self.writer.finish()
+ def finish_node(self, pad=True):
+ byte_lines, _, padding = self.writer.finish()
if self.nodes == 0:
# padded note:
self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
+ skipped_bytes = 0
+ if not pad and padding:
+ del byte_lines[-1]
+ skipped_bytes = padding
self.spool.writelines(byte_lines)
- if self.spool.tell() % _PAGE_SIZE != 0:
+ if (self.spool.tell() + skipped_bytes) % _PAGE_SIZE != 0:
raise AssertionError("incorrect node length")
self.nodes += 1
self.writer = None
@@ -135,7 +139,9 @@
self.bloom = deepcopy(current_global_bloom)
self.bloom.resize(_BLOOM_BUILD_SIZE)
- def finish_node(self):
+ def finish_node(self, pad=True):
+ if not pad:
+ raise AssertionError("Must pad internal nodes only.")
if 'index' in debug.debug_flags:
trace.mutter("Finishing node with bloom filter %r.", self.bloom)
if self.bloom is not None:
@@ -420,7 +426,8 @@
'\t'.join(flattened_references), node[2]))
add_key(string_key, line)
for row in reversed(rows):
- row.finish_node()
+ pad = (type(row) != _LeafBuilderRow)
+ row.finish_node(pad=pad)
result = tempfile.NamedTemporaryFile()
lines = [_BTSIGNATURE]
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
@@ -459,8 +466,9 @@
position = 0 # Only the root row actually has an offset
copied_len = osutils.pumpfile(row.spool, result)
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
- import pdb;pdb.set_trace()
- raise AssertionError("Not enough data copied")
+ if type(row) != _LeafBuilderRow:
+ import pdb;pdb.set_trace()
+ raise AssertionError("Not enough data copied")
if root_row and num_bloom_pages > 0:
root_row = False
bloom_str = global_bloom._array.tostring()
@@ -1276,6 +1284,8 @@
# Avoid doing this again
self._size = len(start)
size = min(_PAGE_SIZE, self._size)
+ else:
+ size = min(size, self._size - offset)
ranges.append((offset, size))
if not ranges:
return
=== modified file 'chunk_writer.py'
--- a/chunk_writer.py 2008-07-02 14:45:38 +0000
+++ b/chunk_writer.py 2008-07-14 17:43:46 +0000
@@ -69,7 +69,7 @@
nulls_needed = self.chunk_size - total_len % self.chunk_size
if nulls_needed:
self.bytes_list.append("\x00" * nulls_needed)
- return self.bytes_list, self.unused_bytes
+ return self.bytes_list, self.unused_bytes, nulls_needed
def _recompress_all_bytes_in(self, extra_bytes=None):
compressor = zlib.compressobj()
@@ -111,7 +111,7 @@
# Check quickly to see if this is likely to put us outside of our
# budget:
next_seen_size = self.seen_bytes + len(bytes)
- if (next_seen_size < 2 * capacity):
+ if (next_seen_size < 1.8 * capacity):
# No need, we assume this will "just fit"
out = self.compressor.compress(bytes)
self.bytes_in.append(bytes)
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-14 13:10:17 +0000
+++ b/tests/test_btree_index.py 2008-07-14 17:43:46 +0000
@@ -149,7 +149,7 @@
temp_file = builder.finish()
content = temp_file.read()
del temp_file
- self.assertEqual(4096, len(content))
+ self.assertEqual(158, len(content))
self.assertEqual(
"B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=5\n"
"row_lengths=1\nbloom_pages=0\n",
@@ -173,7 +173,7 @@
temp_file = builder.finish()
content = temp_file.read()
del temp_file
- self.assertEqual(4096, len(content))
+ self.assertEqual(264, len(content))
self.assertEqual(
"B+Tree Graph Index 1\nnode_ref_lists=2\nkey_elements=2\nlen=10\n"
"row_lengths=1\nbloom_pages=0\n",
@@ -205,7 +205,7 @@
temp_file = builder.finish()
content = temp_file.read()
del temp_file
- self.assertEqual(4096*4, len(content))
+ self.assertEqual(14742, len(content))
self.assertEqual(
"B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=800\n"
"row_lengths=1,2\nbloom_pages=1\n",
@@ -227,7 +227,7 @@
expected_root = (
"type=internal\n"
"offset=0\n"
- "505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505505\n"
+ "503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503503\n"
":bloom:\n"
+ bloom_bytes
)
@@ -237,12 +237,56 @@
leaf1_bytes = zlib.decompress(leaf1)
sorted_node_keys = sorted(node[0] for node in nodes)
node = btree_index._LeafNode(leaf1_bytes, 1, 0)
- self.assertEqual(451, len(node.keys))
- self.assertEqual(sorted_node_keys[:451], sorted(node.keys))
- leaf2_bytes = zlib.decompress(leaf2)
- node = btree_index._LeafNode(leaf2_bytes, 1, 0)
- self.assertEqual(800 - 451, len(node.keys))
- self.assertEqual(sorted_node_keys[451:], sorted(node.keys))
+ self.assertEqual(448, len(node.keys))
+ self.assertEqual(sorted_node_keys[:448], sorted(node.keys))
+ leaf2_bytes = zlib.decompress(leaf2)
+ node = btree_index._LeafNode(leaf2_bytes, 1, 0)
+ self.assertEqual(800 - 448, len(node.keys))
+ self.assertEqual(sorted_node_keys[448:], sorted(node.keys))
+
+ def test_last_page_rounded_1_layer(self):
+ builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+ nodes = self.make_nodes(10, 1, 0)
+ for node in nodes:
+ builder.add_node(*node)
+ # NamedTemporaryFile dies on builder.finish().read(). weird.
+ temp_file = builder.finish()
+ content = temp_file.read()
+ del temp_file
+ self.assertEqual(181, len(content))
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=10\n"
+ "row_lengths=1\nbloom_pages=0\n",
+ content[:88])
+ # Check the last page is well formed
+ leaf2 = content[88:]
+ leaf2_bytes = zlib.decompress(leaf2)
+ node = btree_index._LeafNode(leaf2_bytes, 1, 0)
+ self.assertEqual(10, len(node.keys))
+ sorted_node_keys = sorted(node[0] for node in nodes)
+ self.assertEqual(sorted_node_keys, sorted(node.keys))
+
+ def test_last_page_not_rounded_2_layer(self):
+ builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+ nodes = self.make_nodes(800, 1, 0)
+ for node in nodes:
+ builder.add_node(*node)
+ # NamedTemporaryFile dies on builder.finish().read(). weird.
+ temp_file = builder.finish()
+ content = temp_file.read()
+ del temp_file
+ self.assertEqual(14742, len(content))
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=800\n"
+ "row_lengths=1,2\nbloom_pages=1\n",
+ content[:91])
+ # Check the last page is well formed
+ leaf2 = content[12288:]
+ leaf2_bytes = zlib.decompress(leaf2)
+ node = btree_index._LeafNode(leaf2_bytes, 1, 0)
+ self.assertEqual(800 - 448, len(node.keys))
+ sorted_node_keys = sorted(node[0] for node in nodes)
+ self.assertEqual(sorted_node_keys[448:], sorted(node.keys))
def test_three_level_tree_details(self):
# The left most pointer in the second internal node in a row should
@@ -306,7 +350,7 @@
temp_file = builder.finish()
content = temp_file.read()
del temp_file
- self.assertEqual(4096*4, len(content))
+ self.assertEqual(14670, len(content))
self.assertEqual(
"B+Tree Graph Index 1\nnode_ref_lists=2\nkey_elements=2\nlen=400\n"
"row_lengths=1,2\nbloom_pages=1\n",
@@ -329,8 +373,8 @@
"type=internal\n"
"offset=0\n"
"1111111111111111111111111111111111111111\x00"
- "127127127127127127127127127127127127127127127127127127127"
- "127127127127127127127127127127127127127127127127127127127127127\n"
+ "126126126126126126126126126126126126126126126126126126126"
+ "126126126126126126126126126126126126126126126126126126126126126\n"
":bloom:\n"
+ bloom_bytes
)
@@ -589,9 +633,9 @@
self.assertEqual([], transport._activity)
self.assertEqual(70, index.key_count())
# The entire index should have been read, as it is one page long.
- self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
+ self.assertEqual([('readv', 'index', [(0, size)], False, None)],
transport._activity)
- self.assertEqual(4096, size)
+ self.assertEqual(1593, size)
def test_2_levels_key_count_2_2(self):
builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
@@ -600,7 +644,7 @@
builder.add_node(*node)
transport = get_transport('trace+' + self.get_url(''))
size = transport.put_file('index', builder.finish())
- self.assertEqual(4096*4, size)
+ self.assertEqual(14338, size)
index = btree_index.BTreeGraphIndex(transport, 'index', size)
del transport._activity[:]
self.assertEqual([], transport._activity)
@@ -621,9 +665,9 @@
self.assertEqual([], transport._activity)
index.validate()
# The entire index should have been read linearly.
- self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
+ self.assertEqual([('readv', 'index', [(0, size)], False, None)],
transport._activity)
- self.assertEqual(4096, size)
+ self.assertEqual(3846, size)
def test_validate_two_pages(self):
builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
@@ -633,14 +677,14 @@
transport = get_transport('trace+' + self.get_url(''))
size = transport.put_file('index', builder.finish())
# Root page, bloom page, 2 leaf pages
- self.assertEqual(4096*4, size)
+ self.assertEqual(14338, size)
index = btree_index.BTreeGraphIndex(transport, 'index', size)
del transport._activity[:]
self.assertEqual([], transport._activity)
index.validate()
# The entire index should have been read linearly.
self.assertEqual([('readv', 'index', [(0, 4096)], False, None),
- ('readv', 'index', [(8192, 4096), (12288, 4096)], False, None)],
+ ('readv', 'index', [(8192, 4096), (12288, 2050)], False, None)],
transport._activity)
# XXX: TODO: write some badly-ordered nodes, and some pointers-to-wrong
# node and make validate find them.
@@ -761,7 +805,7 @@
builder.add_node(*node)
transport = get_transport('trace+' + self.get_url(''))
size = transport.put_file('index', builder.finish())
- self.assertEqual(4317184, size, 'number of expected bytes in the'
+ self.assertEqual(4320613, size, 'number of expected bytes in the'
' output changed')
del builder
index = btree_index.BTreeGraphIndex(transport, 'index', size)
@@ -783,12 +827,13 @@
# The entire index should have been read
total_pages = sum(index._row_lengths) + index._num_bloom_pages
self.assertEqual(total_pages, index._row_offsets[-1])
- self.assertEqual(total_pages * btree_index._PAGE_SIZE, size)
+ self.assertEqual(4320613, size)
# The start of the leaves
first_byte = index._row_offsets[-2] * btree_index._PAGE_SIZE
readv_request = []
for offset in range(first_byte, size, 4096):
readv_request.append((offset, 4096))
+ readv_request[-1] = (readv_request[-1][0], 3429)
expected = [('readv', 'index', [(0, 4096)], False, None),
('readv', 'index', readv_request, False, None)]
if expected != transport._activity:
=== modified file 'tests/test_chunk_writer.py'
--- a/tests/test_chunk_writer.py 2008-07-02 06:08:10 +0000
+++ b/tests/test_chunk_writer.py 2008-07-14 17:43:46 +0000
@@ -32,18 +32,22 @@
def test_chunk_writer_empty(self):
writer = chunk_writer.ChunkWriter(4096)
- bytes_list, unused = writer.finish()
+ bytes_list, unused, padding = writer.finish()
node_bytes = self.check_chunk(bytes_list, 4096)
self.assertEqual("", node_bytes)
self.assertEqual(None, unused)
+ # Only a zlib header.
+ self.assertEqual(4088, padding)
def test_some_data(self):
writer = chunk_writer.ChunkWriter(4096)
writer.write("foo bar baz quux\n")
- bytes_list, unused = writer.finish()
+ bytes_list, unused, padding = writer.finish()
node_bytes = self.check_chunk(bytes_list, 4096)
self.assertEqual("foo bar baz quux\n", node_bytes)
self.assertEqual(None, unused)
+ # More than just the header..
+ self.assertEqual(4073, padding)
def test_too_much_data_does_not_exceed_size(self):
# Generate enough data to exceed 4K
@@ -57,7 +61,7 @@
for line in lines:
if writer.write(line):
break
- bytes_list, unused = writer.finish()
+ bytes_list, unused, _ = writer.finish()
node_bytes = self.check_chunk(bytes_list, 4096)
# the first 46 lines should have been added
expected_bytes = ''.join(lines[:46])
@@ -78,7 +82,7 @@
if writer.write(line):
break
self.assertFalse(writer.write_reserved("A"*256))
- bytes_list, unused = writer.finish()
+ bytes_list, unused, _ = writer.finish()
node_bytes = self.check_chunk(bytes_list, 4096)
# the first 44 lines should have been added
expected_bytes = ''.join(lines[:44]) + "A"*256
More information about the bazaar-commits
mailing list