Rev 3645: Change the IndexBuilders to not generate the nodes_by_key unless needed. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup
John Arbash Meinel
john at arbash-meinel.com
Mon Aug 25 04:41:43 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup
------------------------------------------------------------
revno: 3645
revision-id: john at arbash-meinel.com-20080825034139-68nxqiqrmqi1l5f0
parent: pqm at pqm.ubuntu.com-20080822042630-on3dxyek4ezk0miu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index_builder_cleanup
timestamp: Sun 2008-08-24 22:41:39 -0500
message:
Change the IndexBuilders to not generate the nodes_by_key unless needed.
modified:
bzrlib/btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2008-08-22 02:18:27 +0000
+++ b/bzrlib/btree_index.py 2008-08-25 03:41:39 +0000
@@ -142,6 +142,8 @@
key_elements=key_elements)
self._spill_at = spill_at
self._backing_indices = []
+ # Indicate it hasn't been built yet
+ self._nodes_by_key = None
def add_node(self, key, value, references=()):
"""Add a node to the index.
@@ -157,7 +159,33 @@
:param value: The value to associate with the key. It may be any
bytes as long as it does not contain \0 or \n.
"""
- index.GraphIndexBuilder.add_node(self, key, value, references=references)
+ self._check_key(key)
+ if index._newline_null_re.search(value) is not None:
+ raise errors.BadIndexValue(value)
+ if len(references) != self.reference_lists:
+ raise errors.BadIndexValue(references)
+ node_refs = []
+ for reference_list in references:
+ for reference in reference_list:
+ self._check_key(reference)
+ node_refs.append(tuple(reference_list))
+ if key in self._nodes and self._nodes[key][0] == '':
+ raise errors.BadIndexDuplicateKey(key, self)
+ self._nodes[key] = (tuple(node_refs), value)
+ self._keys.add(key)
+ if self._key_length > 1 and self._nodes_by_key is not None:
+ key_dict = self._nodes_by_key
+ if self.reference_lists:
+ key_value = key, value, tuple(node_refs)
+ else:
+ key_value = key, value
+ # possibly should do this on-demand, but it seems likely it is
+ # always wanted
+ # For a key of (foo, bar, baz) create
+ # _nodes_by_key[foo][bar][baz] = key_value
+ for subkey in key[:-1]:
+ key_dict = key_dict.setdefault(subkey, {})
+ key_dict[key[-1]] = key_value
if len(self._keys) < self._spill_at:
return
iterators_to_combine = [iter(sorted(self._iter_mem_nodes()))]
@@ -182,7 +210,7 @@
self._backing_indices[pos] = None
self._keys = set()
self._nodes = {}
- self._nodes_by_key = {}
+ self._nodes_by_key = None
def add_nodes(self, nodes):
"""Add nodes to the index.
@@ -199,13 +227,11 @@
def _iter_mem_nodes(self):
"""Iterate over the nodes held in memory."""
if self.reference_lists:
- for key, (absent, references, value) in self._nodes.iteritems():
- if not absent:
- yield self, key, value, references
+ return ((self, key, value, references)
+ for key, (references, value) in self._nodes.iteritems())
else:
- for key, (absent, references, value) in self._nodes.iteritems():
- if not absent:
- yield self, key, value
+ return ((self, key, value)
+ for key, (references, value) in self._nodes.iteritems())
def _iter_smallest(self, iterators_to_combine):
if len(iterators_to_combine) == 1:
@@ -411,13 +437,11 @@
if self.reference_lists:
for key in keys.intersection(self._keys):
node = self._nodes[key]
- if not node[0]:
- yield self, key, node[2], node[1]
+ yield self, key, node[1], node[0]
else:
for key in keys.intersection(self._keys):
node = self._nodes[key]
- if not node[0]:
- yield self, key, node[2]
+ yield self, key, node[1]
keys.difference_update(self._keys)
for backing in self._backing_indices:
if backing is None:
@@ -454,6 +478,8 @@
if backing is None:
continue
for node in backing.iter_entries_prefix(keys):
+ # TODO: We should probably remove yielded keys from the keys
+ # list
yield (self,) + node[1:]
if self._key_length == 1:
for key in keys:
@@ -466,12 +492,10 @@
node = self._nodes[key]
except KeyError:
continue
- if node[0]:
- continue
if self.reference_lists:
- yield self, key, node[2], node[1]
+ yield self, key, node[1], node[0]
else:
- yield self, key, node[2]
+ yield self, key, node[1]
return
for key in keys:
# sanity check
@@ -480,7 +504,7 @@
if len(key) != self._key_length:
raise errors.BadIndexKey(key)
# find what it refers to:
- key_dict = self._nodes_by_key
+ key_dict = self._get_nodes_by_key()
elements = list(key)
# find the subdict to return
try:
@@ -506,6 +530,24 @@
else:
yield (self, ) + key_dict
+ def _get_nodes_by_key(self):
+ if self._nodes_by_key is None:
+ nodes_by_key = {}
+ if self.reference_lists:
+ for key, (references, value) in self._nodes.iteritems():
+ key_dict = nodes_by_key
+ for subkey in key[:-1]:
+ key_dict = key_dict.setdefault(subkey, {})
+ key_dict[key[-1]] = key, value, references
+ else:
+ for key, (references, value) in self._nodes.iteritems():
+ key_dict = nodes_by_key
+ for subkey in key[:-1]:
+ key_dict = key_dict.setdefault(subkey, {})
+ key_dict[key[-1]] = key, value
+ self._nodes_by_key = nodes_by_key
+ return self._nodes_by_key
+
def key_count(self):
"""Return an estimate of the number of keys in this index.
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2008-08-14 19:22:59 +0000
+++ b/bzrlib/index.py 2008-08-25 03:41:39 +0000
@@ -80,8 +80,9 @@
"""
self.reference_lists = reference_lists
self._keys = set()
+ # A dict of {key: (absent, ref_lists, value)}
self._nodes = {}
- self._nodes_by_key = {}
+ self._nodes_by_key = None
self._key_length = key_elements
def _check_key(self, key):
@@ -121,7 +122,7 @@
raise errors.BadIndexDuplicateKey(key, self)
self._nodes[key] = ('', tuple(node_refs), value)
self._keys.add(key)
- if self._key_length > 1:
+ if self._key_length > 1 and self._nodes_by_key is not None:
key_dict = self._nodes_by_key
if self.reference_lists:
key_value = key, value, tuple(node_refs)
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2008-08-22 02:18:27 +0000
+++ b/bzrlib/tests/test_btree_index.py 2008-08-25 03:41:39 +0000
@@ -349,23 +349,19 @@
# predictably.
self.assertEqual(1, len(builder._nodes))
self.assertEqual(1, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
builder.add_node(*nodes[1])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
self.assertEqual(1, len(builder._backing_indices))
self.assertEqual(2, builder._backing_indices[0].key_count())
# now back to memory
builder.add_node(*nodes[2])
self.assertEqual(1, len(builder._nodes))
self.assertEqual(1, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
# And spills to a second backing index combing all
builder.add_node(*nodes[3])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
self.assertEqual(2, len(builder._backing_indices))
self.assertEqual(None, builder._backing_indices[0])
self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -374,7 +370,6 @@
builder.add_node(*nodes[5])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
self.assertEqual(2, len(builder._backing_indices))
self.assertEqual(2, builder._backing_indices[0].key_count())
self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -438,24 +433,20 @@
# Test the parts of the index that take up memory are doing so
# predictably.
self.assertEqual(1, len(builder._keys))
- self.assertEqual(2, len(builder._nodes))
- self.assertNotEqual({}, builder._nodes_by_key)
+ self.assertEqual(1, len(builder._nodes))
builder.add_node(*nodes[1])
self.assertEqual(0, len(builder._keys))
self.assertEqual(0, len(builder._nodes))
- self.assertEqual({}, builder._nodes_by_key)
self.assertEqual(1, len(builder._backing_indices))
self.assertEqual(2, builder._backing_indices[0].key_count())
# now back to memory
builder.add_node(*nodes[2])
- self.assertEqual(2, len(builder._nodes))
+ self.assertEqual(1, len(builder._nodes))
self.assertEqual(1, len(builder._keys))
- self.assertNotEqual({}, builder._nodes_by_key)
# And spills to a second backing index combing all
builder.add_node(*nodes[3])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
self.assertEqual(2, len(builder._backing_indices))
self.assertEqual(None, builder._backing_indices[0])
self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -464,7 +455,6 @@
builder.add_node(*nodes[5])
self.assertEqual(0, len(builder._nodes))
self.assertEqual(0, len(builder._keys))
- self.assertEqual({}, builder._nodes_by_key)
self.assertEqual(2, len(builder._backing_indices))
self.assertEqual(2, builder._backing_indices[0].key_count())
self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -494,11 +484,12 @@
self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
list(builder.iter_all_entries()))
# Two nodes - one memory one disk
- self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
- set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
+ self.assertEqual(sorted(set([(builder,) + node for node in nodes[11:13]])),
+ sorted(set(builder.iter_entries([nodes[12][0], nodes[11][0]]))))
self.assertEqual(13, builder.key_count())
- self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
- set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
+ import pdb; pdb.set_trace()
+ self.assertEqual(sorted(set([(builder,) + node for node in nodes[11:13]])),
+ sorted(set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]]))))
builder.add_node(*nodes[13])
self.assertEqual(3, len(builder._backing_indices))
self.assertEqual(2, builder._backing_indices[0].key_count())
More information about the bazaar-commits
mailing list