Rev 3653: Refactor some code. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup
John Arbash Meinel
john at arbash-meinel.com
Thu Aug 28 21:05:55 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup
------------------------------------------------------------
revno: 3653
revision-id: john at arbash-meinel.com-20080828200552-sw5lzds9mmi3qnnb
parent: john at arbash-meinel.com-20080825190216-vdkyinz5p5e5s8kq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index_builder_cleanup
timestamp: Thu 2008-08-28 15:05:52 -0500
message:
Refactor some code.
Move the key, reference, value checking into a helper function.
This function also finds absent references, but that should have
minimal overhead either way.
Also use the _update_nodes_by_key functionality for both
indexes, as _nodes_by_key has the same structure in both.
Move _spill_mem_keys_to_disk into a separate function.
This is not necessary, but it makes add_node() easier to understand.
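
In rough terms, the shared validation helper described above follows the
pattern in the sketch below. This is a standalone, simplified illustration
(the class name, error types and regexes are placeholders), not the actual
bzrlib implementation:

# Standalone sketch of the shared validation helper pattern described above.
# Class and error types are simplified placeholders; not bzrlib code.
import re

_newline_null_re = re.compile(r'[\n\x00]')
_whitespace_re = re.compile(r'[\s\x00]')


class SketchIndexBuilder(object):

    def __init__(self, reference_lists=0, key_elements=1):
        self.reference_lists = reference_lists
        self._key_length = key_elements
        self._nodes = {}

    def _check_key(self, key):
        # Keys are non-empty tuples of the configured length whose elements
        # contain no whitespace or null bytes.
        if not isinstance(key, tuple) or len(key) != self._key_length:
            raise ValueError('bad key: %r' % (key,))
        for element in key:
            if not element or _whitespace_re.search(element):
                raise ValueError('bad key element: %r' % (element,))

    def _check_key_ref_value(self, key, references, value):
        """Validate key, references and value; also collect absent refs.

        Returns (node_refs, absent_references): node_refs is 'references'
        repacked as tuples, absent_references lists referenced keys that
        are not yet in self._nodes.
        """
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise ValueError('bad value: %r' % (value,))
        if len(references) != self.reference_lists:
            raise ValueError('wrong number of reference lists')
        node_refs = []
        absent_references = []
        for reference_list in references:
            for reference in reference_list:
                # Keys already present in self._nodes were checked when added.
                if reference not in self._nodes:
                    self._check_key(reference)
                    absent_references.append(reference)
            node_refs.append(tuple(reference_list))
        return tuple(node_refs), absent_references

With the checking factored out like this, the plain index builder records each
absent reference as an ('a', (), '') placeholder node, while the btree builder
simply ignores absent_references; those are the two add_node bodies in the
patch below.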
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2008-08-25 19:02:16 +0000
+++ b/bzrlib/btree_index.py 2008-08-28 20:05:52 +0000
@@ -37,7 +37,6 @@
trace,
)
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
-from bzrlib.osutils import basename, dirname
from bzrlib.transport import get_transport
@@ -161,38 +160,31 @@
:param value: The value to associate with the key. It may be any
bytes as long as it does not contain \0 or \n.
"""
- self._check_key(key)
- if index._newline_null_re.search(value) is not None:
- raise errors.BadIndexValue(value)
- if len(references) != self.reference_lists:
- raise errors.BadIndexValue(references)
- node_refs = []
- for reference_list in references:
- for reference in reference_list:
- # If reference is in self._nodes, then we have already checked
- # it
- if reference not in self._nodes:
- self._check_key(reference)
- node_refs.append(tuple(reference_list))
+ # we don't care about absent_references
+ node_refs, _ = self._check_key_ref_value(key, references, value)
if key in self._nodes:
raise errors.BadIndexDuplicateKey(key, self)
self._nodes[key] = (tuple(node_refs), value)
self._keys.add(key)
- if self._key_length > 1 and self._nodes_by_key is not None:
- key_dict = self._nodes_by_key
- if self.reference_lists:
- key_value = key, value, tuple(node_refs)
- else:
- key_value = key, value
- # possibly should do this on-demand, but it seems likely it is
- # always wanted
- # For a key of (foo, bar, baz) create
- # _nodes_by_key[foo][bar][baz] = key_value
- for subkey in key[:-1]:
- key_dict = key_dict.setdefault(subkey, {})
- key_dict[key[-1]] = key_value
+ if self._nodes_by_key is not None and self._key_length > 1:
+ self._update_nodes_by_key(key, value, node_refs)
if len(self._keys) < self._spill_at:
return
+ self._spill_mem_keys_to_disk()
+
+ def _spill_mem_keys_to_disk(self):
+ """Write the in memory keys down to disk to cap memory consumption.
+
+ If we already have some keys written to disk, we will combine them so
+ as to preserve the sorted order. The algorithm for combining uses
+ powers of two. So on the first spill, write all mem nodes into a
+ single index. On the second spill, combine the mem nodes with the nodes
+ on disk to create a 2x sized disk index and get rid of the first index.
+ On the third spill, create a single new disk index, which will contain
+ the mem nodes, and preserve the existing 2x sized index. On the fourth,
+ combine mem with the first and second indexes, creating a new one of
+ size 4x. On the fifth create a single new one, etc.
+ """
iterators_to_combine = [self._iter_mem_nodes()]
pos = -1
for pos, backing in enumerate(self._backing_indices):
@@ -203,9 +195,11 @@
backing_pos = pos + 1
new_backing_file, size = \
self._write_nodes(self._iter_smallest(iterators_to_combine))
- new_backing = BTreeGraphIndex(
- get_transport(dirname(new_backing_file.name)),
- basename(new_backing_file.name), size)
+ dir_path, base_name = osutils.split(new_backing_file.name)
+ # Note: The transport here isn't strictly needed, because we will use
+ # direct access to the new_backing._file object
+ new_backing = BTreeGraphIndex(get_transport(dir_path),
+ base_name, size)
# GC will clean up the file
new_backing._file = new_backing_file
if len(self._backing_indices) == backing_pos:
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2008-08-25 16:29:01 +0000
+++ b/bzrlib/index.py 2008-08-28 20:05:52 +0000
@@ -134,16 +134,22 @@
key_dict = key_dict.setdefault(subkey, {})
key_dict[key[-1]] = key_value
- def add_node(self, key, value, references=()):
- """Add a node to the index.
+ def _check_key_ref_value(self, key, references, value):
+ """Check that 'key' and 'references' are all valid.
- :param key: The key. keys are non-empty tuples containing
- as many whitespace-free utf8 bytestrings as the key length
- defined for this index.
- :param references: An iterable of iterables of keys. Each is a
- reference to another key.
- :param value: The value to associate with the key. It may be any
- bytes as long as it does not contain \0 or \n.
+ :param key: A key tuple. Must conform to the key interface (be a tuple,
+ be of the right length, not have any whitespace or nulls in any key
+ element.)
+ :param references: An iterable of reference lists. Something like
+ [[(ref, key)], [(ref, key), (other, key)]]
+ :param value: The value associated with this key. Must not contain
+ newlines or null characters.
+ :return: (node_refs, absent_references)
+ node_refs is basically a packed form of 'references' where all
+ iterables are tuples
+ absent_references reference keys that are not in self._nodes.
+ This may contain duplicates if the same key is
+ referenced in multiple lists.
"""
self._check_key(key)
if _newline_null_re.search(value) is not None:
@@ -151,18 +157,39 @@
if len(references) != self.reference_lists:
raise errors.BadIndexValue(references)
node_refs = []
+ absent_references = []
for reference_list in references:
for reference in reference_list:
- self._check_key(reference)
+ # If reference *is* in self._nodes, then we know it has already
+ # been checked.
if reference not in self._nodes:
- self._nodes[reference] = ('a', (), '')
+ self._check_key(reference)
+ absent_references.append(reference)
node_refs.append(tuple(reference_list))
- if key in self._nodes and self._nodes[key][0] == '':
+ return tuple(node_refs), absent_references
+
+ def add_node(self, key, value, references=()):
+ """Add a node to the index.
+
+ :param key: The key. keys are non-empty tuples containing
+ as many whitespace-free utf8 bytestrings as the key length
+ defined for this index.
+ :param references: An iterable of iterables of keys. Each is a
+ reference to another key.
+ :param value: The value to associate with the key. It may be any
+ bytes as long as it does not contain \0 or \n.
+ """
+ (node_refs,
+ absent_references) = self._check_key_ref_value(key, references, value)
+ if key in self._nodes and self._nodes[key][0] != 'a':
raise errors.BadIndexDuplicateKey(key, self)
- node_refs = tuple(node_refs)
+ for reference in absent_references:
+ # There may be duplicates, but I don't think it is worth worrying
+ # about
+ self._nodes[reference] = ('a', (), '')
self._nodes[key] = ('', node_refs, value)
self._keys.add(key)
- if self._key_length > 1 and self._nodes_by_key is not None:
+ if self._nodes_by_key is not None and self._key_length > 1:
self._update_nodes_by_key(key, value, node_refs)
def finish(self):
@@ -172,7 +199,7 @@
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
prefix_length = sum(len(x) for x in lines)
# references are byte offsets. To avoid having to do nasty
- # polynomial work to resolve offsets (references to later in the
+ # polynomial work to resolve offsets (references to later in the
# file cannot be determined until all the inbetween references have
# been calculated too) we pad the offsets with 0's to make them be
# of consistent length. Using binary offsets would break the trivial
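
The spill/combine behaviour documented in the new _spill_mem_keys_to_disk
docstring can be simulated with a few lines of standalone Python. This is
only an illustration of the powers-of-two pattern (it tracks index sizes,
not real index files), not bzrlib code:

# Simulation of the powers-of-two spill pattern described in the
# _spill_mem_keys_to_disk docstring above. Sizes are in units of one
# in-memory batch. Illustration only, not bzrlib code.

def spill(backing_sizes):
    """Combine the in-memory batch (size 1) with existing backing indices.

    backing_sizes[i] is the size of backing index i, or None for a slot
    that was emptied by an earlier combine.
    """
    combined = 1  # the in-memory nodes count as one batch
    pos = -1
    for pos, size in enumerate(backing_sizes):
        if size is None:
            # Stop at the first empty slot; larger indices beyond it are
            # preserved untouched.
            pos -= 1
            break
        combined += size
        backing_sizes[pos] = None  # folded into the new, larger index
    new_pos = pos + 1
    if new_pos == len(backing_sizes):
        backing_sizes.append(combined)
    else:
        backing_sizes[new_pos] = combined
    return backing_sizes


sizes = []
for spill_number in range(1, 6):
    spill(sizes)
    print('%d %r' % (spill_number, sizes))
# 1 [1]              first spill: one 1x index
# 2 [None, 2]        second: mem + 1x combined into a 2x index
# 3 [1, 2]           third: a fresh 1x index, the 2x index preserved
# 4 [None, None, 4]  fourth: mem + 1x + 2x combined into a 4x index
# 5 [1, None, 4]     fifth: a fresh 1x index again

The None slots stand for backing indices that have been folded into a larger
one; in the real code those are the files the "GC will clean up the file"
comment refers to.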