Rev 30: Refactor node writing to be a separate function from finish(), and pull up the in memory queries to be part of the builder. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Sun Jul 13 12:51:59 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 30
revision-id: robertc at robertcollins.net-20080713115155-k1nu89eo932mpp1l
parent: robertc at robertcollins.net-20080713072815-sot304u35kahnexg
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sun 2008-07-13 21:51:55 +1000
message:
Refactor node writing to be a separate function from finish(), and pull up the in memory queries to be part of the builder.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
repofmt.py repofmt.py-20080701113732-m1iu3n94ikbxdelb-1
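
For readers following the refactor, the new shape of the builder is easy to
sketch: add_node()/add_nodes() accumulate entries in memory, the query
methods (iter_entries and friends) now live on BTreeBuilder itself, and
finish() merely sorts the in-memory nodes and hands them to _write_nodes()
for serialisation. A minimal driver (illustrative only, not part of this
commit; it assumes the btree_index module from this branch):

  from btree_index import BTreeBuilder

  builder = BTreeBuilder(reference_lists=1)
  # add_node stores entries in memory; nothing is spooled to disk yet.
  builder.add_node(('rev-b',), 'value-b', ([('rev-a',)],))
  builder.add_node(('rev-a',), 'value-a', ([],))
  # The in-memory queries are now part of the builder itself:
  for index, key, value, refs in builder.iter_entries([('rev-b',)]):
      print key, value, refs
  # finish() sorts the in-memory nodes and delegates the B+Tree
  # serialisation to _write_nodes(), returning a temporary file.
  index_file = builder.finish()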
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-13 07:28:15 +0000
+++ b/btree_index.py 2008-07-13 11:51:55 +0000
@@ -178,146 +178,172 @@
VALUE := no-newline-no-null-bytes
"""
- def finish(self):
+ def add_nodes(self, nodes):
+ """Add nodes to the index.
+
+ :param nodes: An iterable of (key, value, node_refs) entries to add,
+ or (key, value) entries when the index has no reference lists.
+ """
+ if self.reference_lists:
+ for (key, value, node_refs) in nodes:
+ self.add_node(key, value, node_refs)
+ else:
+ for (key, value) in nodes:
+ self.add_node(key, value)
+
+ def _iter_mem_nodes(self):
+ """Iterate over the nodes held in memory."""
+ if self.reference_lists:
+ for key, (absent, references, value) in self._nodes.iteritems():
+ if not absent:
+ yield self, key, value, references
+ else:
+ for key, (absent, references, value) in self._nodes.iteritems():
+ if not absent:
+ yield self, key, value
+
+ def _write_nodes(self, node_iterator):
+ """Write node_iterator out as a B+Tree.
+
+ :param node_iterator: An iterator of sorted nodes. Each node should
+ match the output given by iter_all_entries.
+ :return: A file handle for a temporary file containing a B+Tree for
+ the nodes.
+ """
# The index rows - rows[0] is the root, rows[1] is the layer under it
# etc.
- self.rows = []
+ rows = []
# forward sorted by key. In future we may consider topological sorting,
# at the cost of table scans for direct lookup, or a second index for
# direct lookup
- nodes = sorted(self._nodes.items())
key_count = 0
# A stack with the number of nodes of each size. 0 is the root node
# and must always be 1 (if there are any nodes in the tree).
self.row_lengths = []
- # A stack with the current open node for each row length
- self.row_nodes = []
global_bloom = BloomSHA1(_GLOBAL_BLOOM_START_SIZE)
- if len(nodes):
- # Loop over all nodes adding them to the bottom row
- # (self.rows[-1]). When we finish a chunk in a row,
- # propogate the key that didn't fit (comes after the chunk) to the
- # row above, transitively.
- self.rows.append(_LeafBuilderRow())
- def add_key(string_key, key_line):
- """Add a key to the current chunk.
+ def add_key(string_key, key_line):
+ """Add a key to the current chunk.
- :param string_key: The key to add.
- :param key_line: The fully serialised key and value.
- """
- if self.rows[-1].writer is None:
- # opening a new leaf chunk;
- for pos, internal_row in enumerate(self.rows[:-1]):
- # flesh out any internal nodes that are needed to
- # preserve the high of the tree
- if internal_row.writer is None:
- if internal_row.bloom is None:
- create_bloom = False
- reserved_bytes = 0
- else:
- # This is a bloom row
- create_bloom = True
- # reserve 256 for the bloom + 10 for ':bloom:\n'
- reserved_bytes = _BLOOM_DISK_SIZE / 8 + 10
- length = _PAGE_SIZE
- if internal_row.nodes == 0:
- length -= _RESERVED_HEADER_BYTES # padded
- internal_row.writer = chunk_writer.ChunkWriter(
- length, reserved_bytes)
- internal_row.writer.write(_INTERNAL_FLAG)
- internal_row.writer.write(_INTERNAL_OFFSET +
- str(self.rows[pos + 1].nodes) + "\n")
- if create_bloom:
- # new bloom for the new node
- internal_row.bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
- else:
- internal_row.bloom = None
- # add a new leaf
- length = _PAGE_SIZE
- if self.rows[-1].nodes == 0:
- length -= _RESERVED_HEADER_BYTES # padded
- self.rows[-1].writer = chunk_writer.ChunkWriter(length)
- self.rows[-1].writer.write(_LEAF_FLAG)
- if self.rows[-1].writer.write(line):
- # this key did not fit in the node:
- self.rows[-1].finish_node()
- key_line = string_key + "\n"
- new_row = True
- for row in reversed(self.rows[:-1]):
- # Mark the start of the next node in the node above. If it
- # doesn't fit then propogate upwards until we find one that
- # it does fit into.
- if row.writer.write(key_line):
- row.finish_node()
+ :param string_key: The key to add.
+ :param key_line: The fully serialised key and value.
+ """
+ if rows[-1].writer is None:
+ # opening a new leaf chunk;
+ for pos, internal_row in enumerate(rows[:-1]):
+ # flesh out any internal nodes that are needed to
+ # preserve the height of the tree
+ if internal_row.writer is None:
+ if internal_row.bloom is None:
+ create_bloom = False
+ reserved_bytes = 0
else:
- # We've found a node that can handle the pointer.
- new_row = False
- break
- # If we reached the current root without being able to mark the
- # division point, then we need a new root:
- if new_row:
- # We need a new row
- if 'index' in debug.debug_flags:
- trace.mutter('Inserting new global row.')
- if len(self.rows) == 1:
- # We have only leaf nodes, so we are creating a row
- # where we will hold bloom filters
+ # This is a bloom row
+ create_bloom = True
# reserve 256 for the bloom + 10 for ':bloom:\n'
- new_row = _InternalBuilderRow(global_bloom)
reserved_bytes = _BLOOM_DISK_SIZE / 8 + 10
+ length = _PAGE_SIZE
+ if internal_row.nodes == 0:
+ length -= _RESERVED_HEADER_BYTES # padded
+ internal_row.writer = chunk_writer.ChunkWriter(
+ length, reserved_bytes)
+ internal_row.writer.write(_INTERNAL_FLAG)
+ internal_row.writer.write(_INTERNAL_OFFSET +
+ str(rows[pos + 1].nodes) + "\n")
+ if create_bloom:
+ # new bloom for the new node
+ internal_row.bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
else:
- new_row = _InternalBuilderRow(None)
- reserved_bytes = 0
- self.rows.insert(0, new_row)
- # This will be padded, hence the -100
- new_row.writer = chunk_writer.ChunkWriter(
- _PAGE_SIZE - _RESERVED_HEADER_BYTES,
- reserved_bytes)
- new_row.writer.write(_INTERNAL_FLAG)
- new_row.writer.write(_INTERNAL_OFFSET +
- str(self.rows[1].nodes - 1) + "\n")
- new_row.writer.write(key_line)
- add_key(string_key, key_line)
- else:
- # Only the lowest level of internal rows has a bloom
- for row in self.rows[:-1]:
- if row.bloom is not None:
- row.bloom.insert(string_key)
- # Keep the bloom fairly empty. It seems we need better than
- # 16 b/e to avoid accidentally filling at a lower size
- global_bloom.insert(string_key)
- if (global_bloom._num_entries * _GLOBAL_BLOOM_RESIZE_THRESHOLD
- > global_bloom._num_bits):
- global_bloom.resize(global_bloom._num_bits *
- _GLOBAL_BLOOM_GROW_FACTOR)
- if 'index' in debug.debug_flags:
- trace.mutter('resizing global bloom: %s',
- row.bloom)
-
- for key, (absent, references, value) in nodes:
- if absent:
- continue
- key_count += 1
- flattened_references = []
- for ref_list in references:
+ internal_row.bloom = None
+ # add a new leaf
+ length = _PAGE_SIZE
+ if rows[-1].nodes == 0:
+ length -= _RESERVED_HEADER_BYTES # padded
+ rows[-1].writer = chunk_writer.ChunkWriter(length)
+ rows[-1].writer.write(_LEAF_FLAG)
+ if rows[-1].writer.write(line):
+ # this key did not fit in the node:
+ rows[-1].finish_node()
+ key_line = string_key + "\n"
+ new_row = True
+ for row in reversed(rows[:-1]):
+ # Mark the start of the next node in the node above. If it
+ # doesn't fit then propagate upwards until we find one that
+ # it does fit into.
+ if row.writer.write(key_line):
+ row.finish_node()
+ else:
+ # We've found a node that can handle the pointer.
+ new_row = False
+ break
+ # If we reached the current root without being able to mark the
+ # division point, then we need a new root:
+ if new_row:
+ # We need a new row
+ if 'index' in debug.debug_flags:
+ trace.mutter('Inserting new global row.')
+ if len(rows) == 1:
+ # We have only leaf nodes, so we are creating a row
+ # where we will hold bloom filters
+ # reserve 256 for the bloom + 10 for ':bloom:\n'
+ new_row = _InternalBuilderRow(global_bloom)
+ reserved_bytes = _BLOOM_DISK_SIZE / 8 + 10
+ else:
+ new_row = _InternalBuilderRow(None)
+ reserved_bytes = 0
+ rows.insert(0, new_row)
+ # This will be padded, hence the -_RESERVED_HEADER_BYTES
+ new_row.writer = chunk_writer.ChunkWriter(
+ _PAGE_SIZE - _RESERVED_HEADER_BYTES,
+ reserved_bytes)
+ new_row.writer.write(_INTERNAL_FLAG)
+ new_row.writer.write(_INTERNAL_OFFSET +
+ str(rows[1].nodes - 1) + "\n")
+ new_row.writer.write(key_line)
+ add_key(string_key, key_line)
+ else:
+ # Only the lowest level of internal rows has a bloom
+ for row in rows[:-1]:
+ if row.bloom is not None:
+ row.bloom.insert(string_key)
+ # Keep the bloom fairly empty. It seems we need better than
+ # 16 bits per entry to avoid accidentally filling at a lower size
+ global_bloom.insert(string_key)
+ if (global_bloom._num_entries * _GLOBAL_BLOOM_RESIZE_THRESHOLD
+ > global_bloom._num_bits):
+ global_bloom.resize(global_bloom._num_bits *
+ _GLOBAL_BLOOM_GROW_FACTOR)
+ if 'index' in debug.debug_flags:
+ trace.mutter('resizing global bloom: %s',
+ global_bloom)
+ # Loop over all nodes adding them to the bottom row
+ # (rows[-1]). When we finish a chunk in a row,
+ # propagate the key that didn't fit (comes after the chunk) to the
+ # row above, transitively.
+ for node in node_iterator:
+ if key_count == 0:
+ # First key triggers the first row
+ rows.append(_LeafBuilderRow())
+ key_count += 1
+ flattened_references = []
+ if self.reference_lists:
+ for ref_list in node[3]:
ref_keys = []
for reference in ref_list:
ref_keys.append('\x00'.join(reference))
flattened_references.append('\r'.join(ref_keys))
- string_key = '\x00'.join(key)
- line = ("%s\x00%s\x00%s\n" % (string_key,
- '\t'.join(flattened_references), value))
- add_key(string_key, line)
- for row in reversed(self.rows):
- row.finish_node()
+ string_key = '\x00'.join(node[1])
+ line = ("%s\x00%s\x00%s\n" % (string_key,
+ '\t'.join(flattened_references), node[2]))
+ add_key(string_key, line)
+ for row in reversed(rows):
+ row.finish_node()
result = tempfile.TemporaryFile()
lines = [_BTSIGNATURE]
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
lines.append(_OPTION_LEN + str(key_count) + '\n')
- row_lengths = [row.nodes for row in self.rows]
+ row_lengths = [row.nodes for row in rows]
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
- if len(self.rows) > 1:
+ if len(rows) > 1:
# We only write a global bloom if there is an internal node
optimal_num_bits = max(_PAGE_SIZE*8, global_bloom._num_entries*9.6)
global_bloom.resize(optimal_num_bits)
@@ -337,7 +363,7 @@
" reserved space: %d > %d"
% (position, _RESERVED_HEADER_BYTES))
# write the rows out:
- for row in self.rows:
+ for row in rows:
reserved = _RESERVED_HEADER_BYTES # reserved space for first node
row.spool.flush()
row.spool.seek(0)
@@ -362,6 +388,127 @@
result.seek(0)
return result
+ def finish(self):
+ """Finalise the index.
+
+ :return: A file handle for a temporary file containing a B+Tree for
+ the nodes added to the index.
+ """
+ nodes = sorted(self._iter_mem_nodes())
+ return self._write_nodes(nodes)
+
+ def iter_all_entries(self):
+ """Iterate over all keys within the index
+
+ :return: An iterable of (index, key, value, reference_lists). There is no
+ defined order for the result iteration - it will be in the most
+ efficient order for the index (in this case dictionary hash order).
+ """
+ if 'evil' in debug.debug_flags:
+ trace.mutter_callsite(3,
+ "iter_all_entries scales with size of history.")
+ return self._iter_mem_nodes()
+
+ def iter_entries(self, keys):
+ """Iterate over keys within the index.
+
+ :param keys: An iterable providing the keys to be retrieved.
+ :return: An iterable of (index, key, value, reference_lists). There is no
+ defined order for the result iteration - it will be in the most
+ efficient order for the index (keys iteration order in this case).
+ """
+ keys = set(keys)
+ if self.reference_lists:
+ for key in keys.intersection(self._keys):
+ node = self._nodes[key]
+ if not node[0]:
+ yield self, key, node[2], node[1]
+ else:
+ for key in keys.intersection(self._keys):
+ node = self._nodes[key]
+ if not node[0]:
+ yield self, key, node[2]
+
+ def iter_entries_prefix(self, keys):
+ """Iterate over keys within the index using prefix matching.
+
+ Prefix matching is applied within the tuple of a key, not within
+ the bytestring of each key element; e.g. if you have the keys ('foo',
+ 'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
+ only the former key is returned.
+
+ :param keys: An iterable providing the key prefixes to be retrieved.
+ Each key prefix takes the form of a tuple the length of a key, but
+ with the last N elements 'None' rather than a regular bytestring.
+ The first element cannot be 'None'.
+ :return: An iterable as per iter_all_entries, but restricted to the
+ keys with a matching prefix to those supplied. No additional keys
+ will be returned, and every match that is in the index will be
+ returned.
+ """
+ # XXX: Too much duplication with the GraphIndex class; consider finding
+ # a good place to pull out the actual common logic.
+ keys = set(keys)
+ if not keys:
+ return
+ if self._key_length == 1:
+ for key in keys:
+ # sanity check
+ if key[0] is None:
+ raise errors.BadIndexKey(key)
+ if len(key) != self._key_length:
+ raise errors.BadIndexKey(key)
+ node = self._nodes[key]
+ if node[0]:
+ continue
+ if self.reference_lists:
+ yield self, key, node[2], node[1]
+ else:
+ yield self, key, node[2]
+ return
+ for key in keys:
+ # sanity check
+ if key[0] is None:
+ raise errors.BadIndexKey(key)
+ if len(key) != self._key_length:
+ raise errors.BadIndexKey(key)
+ # find what it refers to:
+ key_dict = self._nodes_by_key
+ elements = list(key)
+ # find the subdict to return
+ try:
+ while len(elements) and elements[0] is not None:
+ key_dict = key_dict[elements[0]]
+ elements.pop(0)
+ except KeyError:
+ # a non-existent lookup.
+ continue
+ if len(elements):
+ dicts = [key_dict]
+ while dicts:
+ key_dict = dicts.pop(-1)
+ # can't be empty or would not exist
+ item, value = key_dict.iteritems().next()
+ if type(value) == dict:
+ # push sub-dicts to process
+ dicts.extend(key_dict.itervalues())
+ else:
+ # yield entries
+ for value in key_dict.itervalues():
+ yield (self, ) + value
+ else:
+ yield (self, ) + key_dict
+
+ def key_count(self):
+ """Return an estimate of the number of keys in this index.
+
+ For BTreeBuilder the estimate is exact.
+ """
+ return len(self._keys)
+
+ def validate(self):
+ """In memory index's have no known corruption at the moment."""
+
class _LeafNode(object):
"""A leaf node for a serialised B+Tree index."""
=== modified file 'repofmt.py'
--- a/repofmt.py 2008-07-04 04:56:18 +0000
+++ b/repofmt.py 2008-07-13 11:51:55 +0000
@@ -58,144 +58,6 @@
Packer.open_pack = open_pack
-class InMemoryBTree(BTreeBuilder):
- """TODO - make InMemoryGraphIndex more reusable.
-
- (not that all-in-mem is a good idea).
- """
-
- def add_nodes(self, nodes):
- """Add nodes to the index.
-
- :param nodes: An iterable of (key, node_refs, value) entries to add.
- """
- if self.reference_lists:
- for (key, value, node_refs) in nodes:
- self.add_node(key, value, node_refs)
- else:
- for (key, value) in nodes:
- self.add_node(key, value)
-
- def iter_all_entries(self):
- """Iterate over all keys within the index
-
- :return: An iterable of (index, key, reference_lists, value). There is no
- defined order for the result iteration - it will be in the most
- efficient order for the index (in this case dictionary hash order).
- """
- if 'evil' in debug.debug_flags:
- trace.mutter_callsite(3,
- "iter_all_entries scales with size of history.")
- if self.reference_lists:
- for key, (absent, references, value) in self._nodes.iteritems():
- if not absent:
- yield self, key, value, references
- else:
- for key, (absent, references, value) in self._nodes.iteritems():
- if not absent:
- yield self, key, value
-
- def iter_entries(self, keys):
- """Iterate over keys within the index.
-
- :param keys: An iterable providing the keys to be retrieved.
- :return: An iterable of (index, key, value, reference_lists). There is no
- defined order for the result iteration - it will be in the most
- efficient order for the index (keys iteration order in this case).
- """
- keys = set(keys)
- if self.reference_lists:
- for key in keys.intersection(self._keys):
- node = self._nodes[key]
- if not node[0]:
- yield self, key, node[2], node[1]
- else:
- for key in keys.intersection(self._keys):
- node = self._nodes[key]
- if not node[0]:
- yield self, key, node[2]
-
- def iter_entries_prefix(self, keys):
- """Iterate over keys within the index using prefix matching.
-
- Prefix matching is applied within the tuple of a key, not to within
- the bytestring of each key element. e.g. if you have the keys ('foo',
- 'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
- only the former key is returned.
-
- :param keys: An iterable providing the key prefixes to be retrieved.
- Each key prefix takes the form of a tuple the length of a key, but
- with the last N elements 'None' rather than a regular bytestring.
- The first element cannot be 'None'.
- :return: An iterable as per iter_all_entries, but restricted to the
- keys with a matching prefix to those supplied. No additional keys
- will be returned, and every match that is in the index will be
- returned.
- """
- # XXX: To much duplication with the GraphIndex class; consider finding
- # a good place to pull out the actual common logic.
- keys = set(keys)
- if not keys:
- return
- if self._key_length == 1:
- for key in keys:
- # sanity check
- if key[0] is None:
- raise errors.BadIndexKey(key)
- if len(key) != self._key_length:
- raise errors.BadIndexKey(key)
- node = self._nodes[key]
- if node[0]:
- continue
- if self.reference_lists:
- yield self, key, node[2], node[1]
- else:
- yield self, key, node[2]
- return
- for key in keys:
- # sanity check
- if key[0] is None:
- raise errors.BadIndexKey(key)
- if len(key) != self._key_length:
- raise errors.BadIndexKey(key)
- # find what it refers to:
- key_dict = self._nodes_by_key
- elements = list(key)
- # find the subdict to return
- try:
- while len(elements) and elements[0] is not None:
- key_dict = key_dict[elements[0]]
- elements.pop(0)
- except KeyError:
- # a non-existant lookup.
- continue
- if len(elements):
- dicts = [key_dict]
- while dicts:
- key_dict = dicts.pop(-1)
- # can't be empty or would not exist
- item, value = key_dict.iteritems().next()
- if type(value) == dict:
- # push keys
- dicts.extend(key_dict.itervalues())
- else:
- # yield keys
- for value in key_dict.itervalues():
- yield (self, ) + value
- else:
- yield (self, ) + key_dict
-
- def key_count(self):
- """Return an estimate of the number of keys in this index.
-
- For InMemoryGraphIndex the estimate is exact.
- """
- return len(self._keys)
-
- def validate(self):
- """In memory index's have no known corruption at the moment."""
-
-
class BTreePack(NewPack):
def __init__(self, upload_transport, index_transport, pack_transport,
@@ -217,18 +79,18 @@
# passed in because the caller has them, so as to avoid object churn.
Pack.__init__(self,
# Revisions: parents list, no text compression.
- InMemoryBTree(reference_lists=1),
+ BTreeBuilder(reference_lists=1),
# Inventory: We want to map compression only, but currently the
# knit code hasn't been updated enough to understand that, so we
# have a regular 2-list index giving parents and compression
# source.
- InMemoryBTree(reference_lists=2),
+ BTreeBuilder(reference_lists=2),
# Texts: compression and per file graph, for all fileids - so two
# reference lists and two elements in the key tuple.
- InMemoryBTree(reference_lists=2, key_elements=2),
+ BTreeBuilder(reference_lists=2, key_elements=2),
# Signatures: Just blobs to store, no compression, no parents
# listing.
- InMemoryBTree(reference_lists=0),
+ BTreeBuilder(reference_lists=0),
)
# where should the new pack be opened
self.upload_transport = upload_transport
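
With the query methods pulled up into BTreeBuilder, the InMemoryBTree
shim above becomes redundant and the pack indices are constructed from
BTreeBuilder directly. For reference, add_nodes() accepts either tuple
shape depending on reference_lists (a sketch under the same assumptions
as above, not part of the commit):

  # Texts: two reference lists, two-element keys.
  text_index = BTreeBuilder(reference_lists=2, key_elements=2)
  text_index.add_nodes([
      (('file-id', 'rev-a'), 'text-value', ([], [])),
      ])
  # Signatures: no reference lists, so (key, value) pairs suffice.
  sig_index = BTreeBuilder(reference_lists=0)
  sig_index.add_nodes([
      (('rev-a',), 'sig-value'),
      ])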