Rev 3900: Update _iter_nodes so that it splits the key_filter into the ones that matched. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/iter_changes_fixes
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 23 22:51:25 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/iter_changes_fixes
------------------------------------------------------------
revno: 3900
revision-id: john at arbash-meinel.com-20090323225113-kvefl5nlj1uataj3
parent: john at arbash-meinel.com-20090323215037-oyxc3d940dw0rnv7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Mon 2009-03-23 17:51:13 -0500
message:
Update _iter_nodes so that it splits the key_filter into the ones that matched.
This should be a first step toward preventing the LC^2 performance we saw for 'ls -l'.
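To make the splitting concrete, here is a small standalone sketch of the
idea (the helper name and the toy search-prefix function below are
illustrative stand-ins, not bzrlib API; the real logic lives in
InternalNode._iter_nodes and goes through self._search_prefix_filter):

def split_key_filter(key_filter, child_prefixes, search_prefix):
    """Group filter keys by the child prefix they can live under.

    :param key_filter: iterable of key tuples, e.g. [('foo',)]
    :param child_prefixes: the keys of InternalNode._items, e.g. ['f', 'b']
    :param search_prefix: callable mapping a key to its serialised search
        prefix (a stand-in for _search_prefix_filter)
    :return: dict mapping child_prefix -> [keys routed to that child]
    """
    prefix_to_keys = {}
    length_filters = {}
    for key in key_filter:
        prefix = search_prefix(key)
        length_filters.setdefault(len(prefix), set()).add(prefix)
        prefix_to_keys.setdefault(prefix, []).append(key)
    result = {}
    for child_prefix in child_prefixes:
        node_key_filter = []
        for length, prefixes in length_filters.items():
            sub_prefix = child_prefix[:length]
            if sub_prefix in prefixes:
                node_key_filter.extend(prefix_to_keys[sub_prefix])
        if node_key_filter:
            result[child_prefix] = node_key_filter
    return result

With one-character child prefixes each key routes to a single child, and a
key that matches no child (('cat',) here) is dropped before any I/O happens:

split_key_filter([('foo',), ('bar',), ('cat',)], ['f', 'b'],
                 lambda key: key[0][:1])
# => {'f': [('foo',)], 'b': [('bar',)]}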
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-03-19 19:31:06 +0000
+++ b/bzrlib/chk_map.py 2009-03-23 22:51:13 +0000
@@ -919,8 +919,8 @@
search_key_func=search_key_func)
def iteritems(self, store, key_filter=None):
- for node in self._iter_nodes(store, key_filter=key_filter):
- for item in node.iteritems(store, key_filter=key_filter):
+ for node, node_filter in self._iter_nodes(store, key_filter=key_filter):
+ for item in node.iteritems(store, key_filter=node_filter):
yield item
def _iter_nodes(self, store, key_filter=None, batch_size=None):
@@ -935,30 +935,38 @@
:return: An iterable of nodes. This function does not have to be fully
consumed. (There will be no pending I/O when items are being returned.)
"""
+ # Map from chk key ('sha1:...',) to (prefix, key_filter)
+ # prefix is the key in self._items to use, key_filter is the key_filter
+ # entries that would match this node
keys = {}
if key_filter is None:
for prefix, node in self._items.iteritems():
if type(node) == tuple:
- keys[node] = prefix
+ keys[node] = (prefix, None)
else:
- yield node
+ yield node, None
else:
# XXX defaultdict ?
+ prefix_to_keys = {}
length_filters = {}
for key in key_filter:
search_key = self._search_prefix_filter(key)
length_filter = length_filters.setdefault(
len(search_key), set())
length_filter.add(search_key)
+ prefix_to_keys.setdefault(search_key, []).append(key)
length_filters = length_filters.items()
for prefix, node in self._items.iteritems():
+ node_key_filter = []
for length, length_filter in length_filters:
- if prefix[:length] in length_filter:
- if type(node) == tuple:
- keys[node] = prefix
- else:
- yield node
- break
+ sub_prefix = prefix[:length]
+ if sub_prefix in length_filter:
+ node_key_filter.extend(prefix_to_keys[sub_prefix])
+ if node_key_filter: # this key matched something, yield it
+ if type(node) == tuple:
+ keys[node] = (prefix, node_key_filter)
+ else:
+ yield node, node_key_filter
if keys:
# Look in the page cache for some more bytes
found_keys = set()
@@ -970,9 +978,10 @@
else:
node = _deserialise(bytes, key,
search_key_func=self._search_key_func)
- self._items[keys[key]] = node
+ prefix, node_key_filter = keys[key]
+ self._items[prefix] = node
found_keys.add(key)
- yield node
+ yield node, node_key_filter
for key in found_keys:
del keys[key]
if keys:
@@ -986,16 +995,17 @@
# We have to fully consume the stream so there is no pending
# I/O, so we buffer the nodes for now.
stream = store.get_record_stream(batch, 'unordered', True)
- nodes = []
+ node_and_filters = []
for record in stream:
bytes = record.get_bytes_as('fulltext')
node = _deserialise(bytes, record.key,
search_key_func=self._search_key_func)
- nodes.append(node)
- self._items[keys[record.key]] = node
+ prefix, node_key_filter = keys[record.key]
+ node_and_filters.append((node, node_key_filter))
+ self._items[prefix] = node
_page_cache.add(record.key, bytes)
- for node in nodes:
- yield node
+ for info in node_and_filters:
+ yield info
def map(self, store, key, value):
"""Map key to value."""
@@ -1018,7 +1028,8 @@
new_parent.add_node(self._search_prefix[:len(new_prefix)+1],
self)
return new_parent.map(store, key, value)
- children = list(self._iter_nodes(store, key_filter=[key]))
+ children = [node for node, _
+ in self._iter_nodes(store, key_filter=[key])]
if children:
child = children[0]
else:
@@ -1171,7 +1182,8 @@
"""Remove key from this node and it's children."""
if not len(self._items):
raise AssertionError("can't unmap in an empty InternalNode.")
- children = list(self._iter_nodes(store, key_filter=[key]))
+ children = [node for node, _
+ in self._iter_nodes(store, key_filter=[key])]
if children:
child = children[0]
else:
@@ -1235,7 +1247,7 @@
# b) With 16-way fan out, we can still do a single round trip
# c) With 255-way fan out, we don't want to read all 255 and destroy
# the page cache, just to determine that we really don't need it.
- for node in self._iter_nodes(store, batch_size=16):
+ for node, _ in self._iter_nodes(store, batch_size=16):
if isinstance(node, InternalNode):
# Without looking at any leaf nodes, we are sure
return self
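Past that point, _iter_nodes still resolves children that are plain chk-key
tuples: first from the page cache, then from the store in batches, fully
consuming each record stream before yielding so there is no pending I/O; the
change above just threads the per-node filter through those lookups.
Roughly, the batching pattern is this sketch (fetch_many is a stand-in for
store.get_record_stream() plus _deserialise(), not a real bzrlib call):

def iter_in_batches(pending, fetch_many, batch_size=16):
    """Yield (node, node_key_filter) pairs for unresolved children.

    :param pending: dict mapping chk key -> (prefix, node_key_filter),
        like the `keys` dict built up in _iter_nodes
    :param fetch_many: callable taking a list of chk keys and yielding
        (chk_key, node) pairs (assumed; stands in for the store access)
    """
    chk_keys = list(pending)
    for start in range(0, len(chk_keys), batch_size):
        batch = chk_keys[start:start + batch_size]
        # Fully consume the stream first, buffering the results...
        buffered = [(node, pending[chk_key][1])
                    for chk_key, node in fetch_many(batch)]
        # ...then hand out the pairs with their per-node filters.
        for pair in buffered:
            yield pair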
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2009-03-10 01:24:49 +0000
+++ b/bzrlib/tests/test_chk_map.py 2009-03-23 22:51:13 +0000
@@ -123,8 +123,10 @@
# Internal nodes must have identical references
self.assertEqual(sorted(node_one._items.keys()),
sorted(node_two._items.keys()))
- node_one_stack.extend(node_one._iter_nodes(map_one._store))
- node_two_stack.extend(node_two._iter_nodes(map_two._store))
+ node_one_stack.extend([n for n, _ in
+ node_one._iter_nodes(map_one._store)])
+ node_two_stack.extend([n for n, _ in
+ node_two._iter_nodes(map_two._store)])
else:
# Leaf nodes must have identical contents
self.assertEqual(node_one._items, node_two._items)
@@ -1506,6 +1508,58 @@
# def test_add_node_two_oversized_third_kept_minimum_fan(self):
# def test_add_node_one_oversized_second_splits_errors(self):
+ def test__iter_nodes_no_key_filter(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "bar")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "baz")
+ node.add_node("b", child)
+
+ for child, node_key_filter in node._iter_nodes(None, key_filter=None):
+ self.assertEqual(None, node_key_filter)
+
+ def test__iter_nodes_splits_key_filter(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "bar")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "baz")
+ node.add_node("b", child)
+
+ key_filter = (('foo',), ('bar',), ('cat',))
+ for child, node_key_filter in node._iter_nodes(None,
+ key_filter=key_filter):
+ # each child can match only one of the filter keys, so make sure
+ # it was properly filtered
+ self.assertEqual(1, len(node_key_filter))
+
+ def test__iter_nodes_with_multiple_matches(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "val")
+ child.map(None, ("fob",), "val")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "val")
+ child.map(None, ("baz",), "val")
+ node.add_node("b", child)
+
+ key_filter = (('foo',), ('fob',), ('bar',), ('baz',))
+ for child, node_key_filter in node._iter_nodes(None,
+ key_filter=key_filter):
+ # each child matches two of the filter keys, so make sure both
+ # were included.
+ self.assertEqual(2, len(node_key_filter))
+
def test_iteritems_empty_new(self):
node = InternalNode()
self.assertEqual([], sorted(node.iteritems(None)))
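Feeding the multi-match scenario from the last test into the earlier sketch
shows each child keeping both of its keys, which is what the
assertEqual(2, len(node_key_filter)) checks expect:

split_key_filter([('foo',), ('fob',), ('bar',), ('baz',)], ['f', 'b'],
                 lambda key: key[0][:1])
# => {'f': [('foo',), ('fob',)], 'b': [('bar',), ('baz',)]}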