Rev 3909: (just broken) trying to work out how to translate iter_changes in http://bzr.arbash-meinel.com/branches/bzr/brisbane/branch_performance
John Arbash Meinel
john at arbash-meinel.com
Wed Mar 25 23:01:23 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/branch_performance
------------------------------------------------------------
revno: 3909
revision-id: john at arbash-meinel.com-20090325225900-zsklkzcz36eanphn
parent: john at arbash-meinel.com-20090325220312-mbnryox5tj37i76w
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: branch_performance
timestamp: Wed 2009-03-25 17:59:00 -0500
message:
(just broken) trying to work out how to translate iter_changes
into a fetch-optimized iter_interesting_nodes... it will be tricky, but should
be interesting.
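
The patch below reads the interesting and uninteresting root nodes and then
keeps only the references and key:value items that appear on the interesting
side. As a rough illustration of that set-difference filtering, here is a
minimal sketch written against plain dicts rather than real CHK nodes; the
name filter_interesting and the node-map arguments are stand-ins, not bzrlib
API.

def filter_interesting(interesting_nodes, uninteresting_nodes):
    """Yield key:value items reachable only from the interesting side.

    Each argument maps a node key to a dict of that node's items, the way
    a LeafNode exposes its content.
    """
    uninteresting_items = set()
    for items in uninteresting_nodes.itervalues():
        uninteresting_items.update(items.iteritems())
    interesting_items = set()
    for items in interesting_nodes.itervalues():
        interesting_items.update(items.iteritems())
    # Anything present on both sides is unchanged and never needs to be
    # fetched; only the difference is yielded.
    for key, value in interesting_items - uninteresting_items:
        yield key, value
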
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-03-25 22:03:12 +0000
+++ b/bzrlib/chk_map.py 2009-03-25 22:59:00 +0000
@@ -1281,105 +1281,39 @@
return node
-def _find_children_info(store, interesting_keys, uninteresting_keys, pb):
- """Read the associated records, and determine what is interesting."""
- uninteresting_keys = set(uninteresting_keys)
- chks_to_read = uninteresting_keys.union(interesting_keys)
- next_uninteresting = set()
- next_interesting = set()
- uninteresting_items = set()
- interesting_items = set()
- interesting_records = []
- # records_read = set()
- for record in store.get_record_stream(chks_to_read, 'unordered', True):
- # records_read.add(record.key())
- if pb is not None:
- pb.tick()
+def _read_records(uninteresting_keys, interesting_keys, store):
+ """Read the bytes for the various keys. Also check the _page_cache."""
+ to_read = []
+ for key in uninteresting_keys:
+ try:
+ bytes = _page_cache[key]
+ uninteresting_bytes[key] = bytes
+ except KeyError:
+ to_read.append(key)
+ uninteresting_bytes[key] = None
+ for key in interesting_keys:
+ try:
+ bytes = _page_cache[key]
+ interesting_bytes[key] = bytes
+ except KeyError:
+ to_read.append(key)
+ interesting_bytes[key] = None
+ stream = store.get_record_stream(to_read, 'unordered', True)
+ for record in stream:
bytes = record.get_bytes_as('fulltext')
- # We don't care about search_key_func for this code, because we only
- # care about external references.
- node = _deserialise(bytes, record.key, search_key_func=None)
- if record.key in uninteresting_keys:
- if isinstance(node, InternalNode):
- next_uninteresting.update(node.refs())
- else:
- # We know we are at a LeafNode, so we can pass None for the
- # store
- uninteresting_items.update(node.iteritems(None))
+ if record.key in uninteresting_bytes:
+ uninteresting_bytes[record.key] = bytes
else:
- interesting_records.append(record)
- if isinstance(node, InternalNode):
- next_interesting.update(node.refs())
- else:
- interesting_items.update(node.iteritems(None))
- # TODO: Filter out records that have already been read, as node splitting
- # can cause us to reference the same nodes via shorter and longer
- # paths
- return (next_uninteresting, uninteresting_items,
- next_interesting, interesting_records, interesting_items)
-
-
-def _find_all_uninteresting(store, interesting_root_keys,
- uninteresting_root_keys, adapter, pb):
- """Determine the full set of uninteresting keys."""
- # What about duplicates between interesting_root_keys and
- # uninteresting_root_keys?
- if not uninteresting_root_keys:
- # Shortcut case. We know there is nothing uninteresting to filter out
- # So we just let the rest of the algorithm do the work
- # We know there is nothing uninteresting, and we didn't have to read
- # any interesting records yet.
- return (set(), set(), set(interesting_root_keys), [], set())
- all_uninteresting_chks = set(uninteresting_root_keys)
- all_uninteresting_items = set()
-
- # First step, find the direct children of both the interesting and
- # uninteresting set
- (uninteresting_keys, uninteresting_items,
- interesting_keys, interesting_records,
- interesting_items) = _find_children_info(store, interesting_root_keys,
- uninteresting_root_keys,
- pb=pb)
- all_uninteresting_chks.update(uninteresting_keys)
- all_uninteresting_items.update(uninteresting_items)
- del uninteresting_items
- # Note: Exact matches between interesting and uninteresting do not need
- # to be search further. Non-exact matches need to be searched in case
- # there is a future exact-match
- uninteresting_keys.difference_update(interesting_keys)
-
- # Second, find the full set of uninteresting bits reachable by the
- # uninteresting roots
- chks_to_read = uninteresting_keys
- while chks_to_read:
- next_chks = set()
- for record in store.get_record_stream(chks_to_read, 'unordered', False):
- # TODO: Handle 'absent'
- if pb is not None:
- pb.tick()
- try:
- bytes = record.get_bytes_as('fulltext')
- except errors.UnavailableRepresentation:
- bytes = adapter.get_bytes(record)
- # We don't care about search_key_func for this code, because we
- # only care about external references.
- node = _deserialise(bytes, record.key, search_key_func=None)
- if isinstance(node, InternalNode):
- # uninteresting_prefix_chks.update(node._items.iteritems())
- chks = node._items.values()
- # TODO: We remove the entries that are already in
- # uninteresting_chks ?
- next_chks.update(chks)
- all_uninteresting_chks.update(chks)
- else:
- all_uninteresting_items.update(node._items.iteritems())
- chks_to_read = next_chks
- return (all_uninteresting_chks, all_uninteresting_items,
- interesting_keys, interesting_records, interesting_items)
+ assert record.key in interesting_bytes
+ interesting_bytes[record.key] = bytes
+ # Do we want to do this? Or does it just punish the page cache?
+ _page_cache[record.key] = bytes
+ return uninteresting_bytes, interesting_bytes
def iter_interesting_nodes(store, interesting_root_keys,
- uninteresting_root_keys, pb=None):
+ uninteresting_root_keys,
+ search_key_func):
"""Given root keys, find interesting nodes.
Evaluate nodes referenced by interesting_root_keys. Ones that are also
@@ -1390,25 +1324,60 @@
:param uninteresting_root_keys: keys which should be filtered out of the
result set.
:return: Yield
- (interesting records, interesting chk's, interesting key:values)
+ (interesting sha1s, interesting key:values)
"""
- # TODO: consider that it may be more memory efficient to use the 20-byte
- # sha1 string, rather than tuples of hexidecimal sha1 strings.
- # TODO: Try to factor out a lot of the get_record_stream() calls into a
- # helper function similar to _read_bytes. This function should be
- # able to use nodes from the _page_cache as well as actually
- # requesting bytes from the store.
-
- # A way to adapt from the compressed texts back into fulltexts
- # In a way, this seems like a layering inversion to have CHKMap know the
- # details of versionedfile
- adapter_class = versionedfile.adapter_registry.get(
- ('knit-ft-gz', 'fulltext'))
- adapter = adapter_class(store)
-
- (all_uninteresting_chks, all_uninteresting_items, interesting_keys,
- interesting_records, interesting_items) = _find_all_uninteresting(store,
- interesting_root_keys, uninteresting_root_keys, adapter, pb)
+ if not uninteresting_root_keys:
+ # TODO: optimize the special case where there are no uninteresting
+ # nodes, this is 'initial branch' performance
+ pass
+ interesting_root_keys = set(interesting_root_keys)
+ uninteresting_root_keys = set(uninteresting_root_keys)
+ assert len(uninteresting_root_keys.intersection(interesting_root_keys)) == 0
+
+ uninteresting_bytes, interesting_bytes = \
+ _read_records(uninteresting_root_keys, interesting_root_keys, store)
+
+ # Note that the root nodes are definitely interesting...
+
+ # At this point, we know that all of the pages are at the same prefix
+ # depth, so we can do heavy filtering
+ uninteresting_pending = []
+ interesting_pending = []
+ known_uninteresting_prefix_keys = set()
+ known_uninteresting_items = set()
+ maybe_interesting_prefix_keys = set()
+ maybe_interesting_items = set()
+ for key, bytes in uninteresting_bytes.iteritems():
+ # search_key_func is passed through here so the deserialised nodes use
+ # the same search-key mapping as the leaf items handled below.
+ node = _deserialise(bytes, key, search_key_func=search_key_func)
+ if type(node) is InternalNode:
+ known_uninteresting_prefix_keys.update(node._items.items())
+ else:
+ # LeafNode
+ known_uninteresting_items.update(node._items.items())
+ for key, bytes in interesting_bytes.iteritems():
+ node = _deserialise(bytes, key, search_key_func=search_key_func)
+ if type(node) is InternalNode:
+ maybe_interesting_prefix_keys.update(node._items.items())
+ else:
+ # LeafNode
+ maybe_interesting_items.update(node._items.iteritems())
+ for prefix, child in (maybe_interesting_prefix_keys
+ - known_uninteresting_prefix_keys):
+ heapq.heappush(interesting_pending, (prefix, None, child))
+ for prefix, child in (known_uninteresting_prefix_keys
+ - maybe_interesting_prefix_keys):
+ heapq.heappush(uninteresting_pending, (prefix, None, child))
+ for key, value in (maybe_interesting_items - known_uninteresting_items):
+ prefix = search_key_func(key)
+ heapq.heappush(interesting_pending, (prefix, key, value))
+ for key, value in (known_uninteresting_items - maybe_interesting_items):
+ prefix = search_key_func(key)
+ heapq.heappush(uninteresting_pending, (prefix, key, value))
+
+ BORK - BORK
+
# Now that we know everything uninteresting, we can yield information from
# our first request
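
For reference, the read pattern _read_records is aiming at, check a
process-level page cache first and only stream the misses from the store,
can be sketched independently of the CHK node types. A minimal sketch,
assuming a dict-like page_cache and a fetch_fulltexts callable standing in
for a fulltext record stream; neither name is bzrlib API.

def read_with_cache(keys, page_cache, fetch_fulltexts):
    """Return a {key: bytes} dict for keys, preferring cached copies."""
    result = {}
    missing = []
    for key in keys:
        try:
            result[key] = page_cache[key]
        except KeyError:
            missing.append(key)
    # fetch_fulltexts is assumed to yield (key, bytes) pairs for the
    # missing keys, the way a fulltext record stream would.
    for key, bytes in fetch_fulltexts(missing):
        result[key] = bytes
        # Writing fetched pages back trades cache memory for fewer
        # round trips; the patch above leaves that question open.
        page_cache[key] = bytes
    return result

In the patch's terms this could be done once for the uninteresting keys and
once for the interesting ones, or combined into a single stream the way
_read_records does.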