Rev 2: Record the length of each row in the header. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Mon Jun 30 23:02:36 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 2
revision-id: robertc at robertcollins.net-20080630220232-977qnki90oa6f9ps
parent: robertc at robertcollins.net-20080624223756-4lopkobq9kk5u9cc
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2008-07-01 08:02:32 +1000
message:
Record the length of each row in the header.
modified:
DESIGN design-20080624222253-p0x5f92uyh5hw734-2
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-24 22:37:56 +0000
+++ b/DESIGN 2008-06-30 22:02:32 +0000
@@ -1,12 +1,49 @@
This document contains notes about the design for index2, an index which is API
compatible with bzr's GraphIndex, but faster and smaller.
+Disk layout
++++++++++++
+
+Each segment will be a small header (less than 100 bytes) describing the index
+attributes and format, followed by B+Tree nodes. Each node will be
+zlib compressed data padded with NULL bytes up to 4K in length, and aligned at
+4K boundaries. The first node is shorter than 4K by the size of the header.
+
Building
++++++++
Building will use a tempfile on disk to spill data to. When the finish() method
is called this will be converted into an optimised structure.
+serialisation logic:
+sort all nodes:
+maintain a stack of nodes: starts at height 1 - a leaf node
+as each bottom level node exceeds storage:
+ - name the node (e.g. layer 0, 1
+ - add its first and last nodes to the internal node above it; adding a new
+ node if there is none above it
+ - allocate
+
+
+final layout on disk:
+layer N node 0, node 1, ...
+layer N-1 node 0, ..
+layer 0 node 0, ...
+(this gives forward only IO and should provide good locality of reference for
+all-nodes traversal)
+node pointers: at 500 keys/node:
+ - 1 layer gives 500 in 1 node
+ - 2 layers gives 250K in 501 nodes
+ - 3 layers gives 125M in 250501 nodes
+ - 4 layers gives 62.5G in 125250501 nodes
+
+If we can store the numbers of nodes in each layer in the overall header, we
+can quickly calculate the location of any node. We define node addresses as
+layer, offset. layer 0 is the leaf nodes, and higher numbers refer to internal
+nodes up to the root. offset is the sequence number of the node within that
+layer. The physical location of any node in the first layer is then 4k * offset.
+For the second layer its 4K * second layer node count + 4K * offset, and so on.
+
Querying
++++++++
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-06-24 22:37:56 +0000
+++ b/btree_index.py 2008-06-30 22:02:32 +0000
@@ -17,5 +17,131 @@
"""B+Tree indices"""
+import tempfile
+import zlib
+
from bzrlib import index
+from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from bzrlib.plugins.index2 import errors
+
+
+_BTSIGNATURE = "B+Tree Graph Index 1\n"
+_OPTION_ROW_LENGTHS = "row_lengths="
+_LEAF_FLAG = "type=leaf\n"
+
+
+
+class BTreeBuilder(index.GraphIndexBuilder):
+ """A Builder for B+Tree based Graph indices.
+
+ The resulting graph has the structure:
+
+ _SIGNATURE OPTIONS NODES
+ _SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
+ OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
+ REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
+ KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
+ LENGTH := 'len=' DIGITS NEWLINE
+ ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
+ NODES := NODE_COMPRESSED*
+ NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
+ NODE_RAW := INTERNAL | LEAF
+ INTERNAL := INTERNAL_FLAG POINTERS
+ LEAF := LEAF_FLAG ROWS
+ KEY_ELEMENT := Not-whitespace-utf8
+ KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
+ ROWS := ROW*
+ ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
+ ABSENT := 'a'
+ REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
+ REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
+ REFERENCE := KEY
+ VALUE := no-newline-no-null-bytes
+ """
+
+ def finish(self):
+ working = tempfile.TemporaryFile()
+ self.position = 100 # reserved for index header
+ # forward sorted by key. In future we may consider topological sorting,
+ # at the cost of table scans for direct lookup, or a second index for
+ # direct lookup
+ nodes = sorted(self._nodes.items())
+ key_count = 0
+ # A stack with the number of nodes of each size. 0 is the root node
+ # and must always be 1 (if there are any nodes in the tree).
+ self.row_lengths = []
+ # A stack with the current open node for each row length
+ self.row_nodes = []
+ if len(nodes):
+ self.rows = 1
+ # write a leaf node, padded to 4096 (4K) bytes
+ if self.position:
+ # reserved space in first node
+ out_lines = ["\x00" * 100] # reserved
+ out_lines = []
+ self.page_count = 0
+ def finish_node():
+ # trailing data:
+ out_lines.append(compressor.flush())
+ working.writelines(out_lines)
+ self.position += sum(map(len, out_lines))
+ nulls_needed = 4096 - self.position % 4096
+ if nulls_needed:
+ working.write("\x00" * nulls_needed)
+ del out_lines[:]
+ self.position = 0
+ self.page_count += 1
+ self.row_lengths[-1] += 1
+
+ for key, (absent, references, value) in nodes:
+ if absent:
+ continue
+ if not out_lines:
+ if len(self.row_lengths) != self.rows:
+ # we have bumped up a row
+ for depth in range(self.rows - len(self.row_nodes) - 1):
+ # add internal node
+ assert False
+ self.row_lengths.insert(0, 0)
+ # Time for a new node
+ compressor = zlib.compressobj()
+ out_lines.append(compressor.compress(_LEAF_FLAG))
+
+ key_count += 1
+ flattened_references = []
+ for ref_list in references:
+ ref_keys = []
+ for reference in ref_list:
+ ref_keys.append('\x00'.join(reference))
+ flattened_references.append('\r'.join(ref_keys))
+ string_key = '\x00'.join(key)
+ line = ("%s\x00%s\x00%s\n" % (string_key,
+ '\t'.join(flattened_references), value))
+ out_lines.append(compressor.compress(line))
+ if False:
+ # new node time
+ pass
+ finish_node()
+ working.flush()
+ result = tempfile.TemporaryFile()
+ lines = [_BTSIGNATURE]
+ lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
+ lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
+ lines.append(_OPTION_LEN + str(key_count) + '\n')
+ lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, self.row_lengths)) + '\n')
+ result.writelines(lines)
+ reserved = 100 # reserved space
+ self.position = sum(map(len, lines))
+ working.seek(0)
+ # copy nodes to the finalised file.
+ node = working.read(4096)
+ while node:
+ node = node[:4096-self.position]
+ result.write(node)
+ result.write("\x00" * (reserved - self.position))
+ node = working.read(4096)
+ self.position = 0
+ reserved = 0
+ result.flush()
+ result.seek(0)
+ return result
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-06-24 22:37:56 +0000
+++ b/tests/test_btree_index.py 2008-06-30 22:02:32 +0000
@@ -17,8 +17,161 @@
"""Tests for btree indices."""
+import zlib
+
from bzrlib.plugins import index2
from bzrlib.plugins.index2 import errors, btree_index
from bzrlib.tests import TestCaseWithTransport
+class TestBTreeBuilder(TestCaseWithTransport):
+ # test names here are suffixed by the key length and reference list count
+ # that they test.
+
+ def make_nodes(self, count, key_elements, reference_lists):
+ """Generate count*key_elements sample nodes."""
+ keys = []
+ for prefix_pos in range(key_elements):
+ if key_elements - 1:
+ prefix = (str(prefix_pos) * 40,)
+ else:
+ prefix = ()
+ for pos in range(count):
+ key = prefix + (str(pos) * 40,)
+ value = "value:%s" % pos
+ if reference_lists:
+ # generate some references
+ refs = []
+ for list_pos in range(reference_lists):
+ # as many keys in each list as its index + the key depth
+ # mod 2 - this generates both 0 length lists and
+ # ones slightly longer than the number of lists.
+ # It alsu ensures we have non homogeneous lists.
+ refs.append([])
+ for ref_pos in range(list_pos + pos % 2):
+ if pos % 2:
+ # refer to a nearby key
+ refs[-1].append(prefix + ("ref" + str(pos - 1) * 40,))
+ else:
+ # serial of this ref in the ref list
+ refs[-1].append(prefix + ("ref" + str(ref_pos) * 40,))
+ else:
+ refs = ()
+ keys.append((key, value, refs))
+ return keys
+
+ def test_empty_1_0(self):
+ builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+ content = builder.finish().read()
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=0\n"
+ "row_lengths=\n",
+ content)
+
+ def test_empty_2_1(self):
+ builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=1)
+ content = builder.finish().read()
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=1\nkey_elements=2\nlen=0\n"
+ "row_lengths=\n",
+ content)
+
+ def test_root_leaf_1_0(self):
+ builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+ nodes = self.make_nodes(5, 1, 0)
+ for node in nodes:
+ builder.add_node(*node)
+ content = builder.finish().read()
+ self.assertEqual(4096, len(content))
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=5\n"
+ "row_lengths=1\n",
+ content[:73])
+ node_content = content[73:]
+ node_bytes = zlib.decompress(node_content)
+ expected_node = ("type=leaf\n"
+ "0000000000000000000000000000000000000000\x00\x00value:0\n"
+ "1111111111111111111111111111111111111111\x00\x00value:1\n"
+ "2222222222222222222222222222222222222222\x00\x00value:2\n"
+ "3333333333333333333333333333333333333333\x00\x00value:3\n"
+ "4444444444444444444444444444444444444444\x00\x00value:4\n")
+ self.assertEqual(expected_node, node_bytes)
+
+ def test_root_leaf_2_2(self):
+ builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+ nodes = self.make_nodes(5, 2, 2)
+ for node in nodes:
+ builder.add_node(*node)
+ content = builder.finish().read()
+ self.assertEqual(4096, len(content))
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=2\nkey_elements=2\nlen=10\n"
+ "row_lengths=1\n",
+ content[:74])
+ node_content = content[74:]
+ node_bytes = zlib.decompress(node_content)
+ expected_node = (
+ "type=leaf\n"
+ "0000000000000000000000000000000000000000\x000000000000000000000000000000000000000000\x00\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:0\n"
+ "0000000000000000000000000000000000000000\x001111111111111111111111111111111111111111\x000000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\r0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:1\n"
+ "0000000000000000000000000000000000000000\x002222222222222222222222222222222222222222\x00\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:2\n"
+ "0000000000000000000000000000000000000000\x003333333333333333333333333333333333333333\x000000000000000000000000000000000000000000\x00ref2222222222222222222222222222222222222222\t0000000000000000000000000000000000000000\x00ref2222222222222222222222222222222222222222\r0000000000000000000000000000000000000000\x00ref2222222222222222222222222222222222222222\x00value:3\n"
+ "0000000000000000000000000000000000000000\x004444444444444444444444444444444444444444\x00\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:4\n"
+ "1111111111111111111111111111111111111111\x000000000000000000000000000000000000000000\x00\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:0\n"
+ "1111111111111111111111111111111111111111\x001111111111111111111111111111111111111111\x001111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\r1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:1\n"
+ "1111111111111111111111111111111111111111\x002222222222222222222222222222222222222222\x00\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:2\n"
+ "1111111111111111111111111111111111111111\x003333333333333333333333333333333333333333\x001111111111111111111111111111111111111111\x00ref2222222222222222222222222222222222222222\t1111111111111111111111111111111111111111\x00ref2222222222222222222222222222222222222222\r1111111111111111111111111111111111111111\x00ref2222222222222222222222222222222222222222\x00value:3\n"
+ "1111111111111111111111111111111111111111\x004444444444444444444444444444444444444444\x00\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:4\n"
+ ""
+ )
+ self.assertEqual(expected_node, node_bytes)
+
+ def _test_2_leaves_1_0(self):
+ builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+ nodes = self.make_nodes(700, 1, 0)
+ for node in nodes:
+ builder.add_node(*node)
+ content = builder.finish().read()
+ self.assertEqual(4096, len(content))
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=700\n"
+ "row_lengths=1,2\n",
+ content[:61])
+ node_content = content[61:]
+ node_bytes = zlib.decompress(node_content)
+ expected_node = ("type=leaf\n"
+ "0000000000000000000000000000000000000000\x00\x00value:0\n"
+ "1111111111111111111111111111111111111111\x00\x00value:1\n"
+ "2222222222222222222222222222222222222222\x00\x00value:2\n"
+ "3333333333333333333333333333333333333333\x00\x00value:3\n"
+ "4444444444444444444444444444444444444444\x00\x00value:4\n")
+ self.assertEqual(expected_node, node_bytes)
+
+ def _test_2_leaves_2_2(self):
+ builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+ nodes = self.make_nodes(5, 2, 2)
+ for node in nodes:
+ builder.add_node(*node)
+ content = builder.finish().read()
+ self.assertEqual(4096, len(content))
+ self.assertEqual(
+ "B+Tree Graph Index 1\nnode_ref_lists=2\nkey_elements=2\nlen=10\n"
+ "row_lengths=1,2\n",
+ content[:60])
+ node_content = content[60:]
+ node_bytes = zlib.decompress(node_content)
+ expected_node = (
+ "type=leaf\n"
+ "0000000000000000000000000000000000000000\x000000000000000000000000000000000000000000\x00\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:0\n"
+ "0000000000000000000000000000000000000000\x001111111111111111111111111111111111111111\x000000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\r0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:1\n"
+ "0000000000000000000000000000000000000000\x002222222222222222222222222222222222222222\x00\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:2\n"
+ "0000000000000000000000000000000000000000\x003333333333333333333333333333333333333333\x000000000000000000000000000000000000000000\x00ref2222222222222222222222222222222222222222\t0000000000000000000000000000000000000000\x00ref2222222222222222222222222222222222222222\r0000000000000000000000000000000000000000\x00ref2222222222222222222222222222222222222222\x00value:3\n"
+ "0000000000000000000000000000000000000000\x004444444444444444444444444444444444444444\x00\t0000000000000000000000000000000000000000\x00ref0000000000000000000000000000000000000000\x00value:4\n"
+ "1111111111111111111111111111111111111111\x000000000000000000000000000000000000000000\x00\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:0\n"
+ "1111111111111111111111111111111111111111\x001111111111111111111111111111111111111111\x001111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\r1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:1\n"
+ "1111111111111111111111111111111111111111\x002222222222222222222222222222222222222222\x00\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:2\n"
+ "1111111111111111111111111111111111111111\x003333333333333333333333333333333333333333\x001111111111111111111111111111111111111111\x00ref2222222222222222222222222222222222222222\t1111111111111111111111111111111111111111\x00ref2222222222222222222222222222222222222222\r1111111111111111111111111111111111111111\x00ref2222222222222222222222222222222222222222\x00value:3\n"
+ "1111111111111111111111111111111111111111\x004444444444444444444444444444444444444444\x00\t1111111111111111111111111111111111111111\x00ref0000000000000000000000000000000000000000\x00value:4\n"
+ ""
+ )
+ self.assertEqual(expected_node, node_bytes)
More information about the bazaar-commits
mailing list