Rev 3909: (just broken) trying to work out how to translate iter_changes in http://bzr.arbash-meinel.com/branches/bzr/brisbane/branch_performance

John Arbash Meinel john at arbash-meinel.com
Wed Mar 25 23:01:23 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/branch_performance

------------------------------------------------------------
revno: 3909
revision-id: john at arbash-meinel.com-20090325225900-zsklkzcz36eanphn
parent: john at arbash-meinel.com-20090325220312-mbnryox5tj37i76w
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: branch_performance
timestamp: Wed 2009-03-25 17:59:00 -0500
message:
  (just broken) trying to work out how to translate iter_changes
  into a fetch-optimized iter_interesting_nodes... it will be tricky, but should
  be interesting.
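
For readers following along: the target of the translation is the reworked
iter_interesting_nodes() signature in the patch below. Here is a rough sketch
of the intended caller; everything except that signature and its documented
yield shape is a hypothetical placeholder (in particular get_chk_root_keys,
which stands in for "map revisions to their CHK inventory root keys"), not
something that exists in the branch.

    from bzrlib import chk_map

    def stream_new_inventory_items(store, new_revision_ids,
                                   common_revision_ids, search_key_func,
                                   get_chk_root_keys):
        """Yield CHK node sha1s and key:value items introduced by the new
        revisions relative to the common ones."""
        interesting_roots = set(get_chk_root_keys(new_revision_ids))
        uninteresting_roots = set(get_chk_root_keys(common_revision_ids))
        # Roots shared by both sides contribute nothing new; drop them so
        # the disjointness assert in iter_interesting_nodes holds.
        interesting_roots.difference_update(uninteresting_roots)
        for sha1s, items in chk_map.iter_interesting_nodes(
                store, interesting_roots, uninteresting_roots,
                search_key_func):
            # sha1s: node keys whose bytes need to be transmitted;
            # items: (key, value) pairs absent under the uninteresting roots.
            yield sha1s, items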
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2009-03-25 22:03:12 +0000
+++ b/bzrlib/chk_map.py	2009-03-25 22:59:00 +0000
@@ -1281,105 +1281,41 @@
     return node
 
 
-def _find_children_info(store, interesting_keys, uninteresting_keys, pb):
-    """Read the associated records, and determine what is interesting."""
-    uninteresting_keys = set(uninteresting_keys)
-    chks_to_read = uninteresting_keys.union(interesting_keys)
-    next_uninteresting = set()
-    next_interesting = set()
-    uninteresting_items = set()
-    interesting_items = set()
-    interesting_records = []
-    # records_read = set()
-    for record in store.get_record_stream(chks_to_read, 'unordered', True):
-        # records_read.add(record.key())
-        if pb is not None:
-            pb.tick()
+def _read_records(uninteresting_keys, interesting_keys, store):
+    """Read the bytes for the various keys. Also check the _page_cache."""
+    uninteresting_bytes = {}
+    interesting_bytes = {}
+    to_read = []
+    for key in uninteresting_keys:
+        try:
+            bytes = _page_cache[key]
+            uninteresting_bytes[key] = bytes
+        except KeyError:
+            to_read.append(key)
+            uninteresting_bytes[key] = None
+    for key in interesting_keys:
+        try:
+            bytes = _page_cache[key]
+            interesting_bytes[key] = bytes
+        except KeyError:
+            to_read.append(key)
+            interesting_bytes[key] = None
+    stream = store.get_record_stream(to_read, 'unordered', True)
+    for record in stream:
         bytes = record.get_bytes_as('fulltext')
-        # We don't care about search_key_func for this code, because we only
-        # care about external references.
-        node = _deserialise(bytes, record.key, search_key_func=None)
-        if record.key in uninteresting_keys:
-            if isinstance(node, InternalNode):
-                next_uninteresting.update(node.refs())
-            else:
-                # We know we are at a LeafNode, so we can pass None for the
-                # store
-                uninteresting_items.update(node.iteritems(None))
+        if record.key in uninteresting_bytes:
+            uninteresting_bytes[record.key] = bytes
         else:
-            interesting_records.append(record)
-            if isinstance(node, InternalNode):
-                next_interesting.update(node.refs())
-            else:
-                interesting_items.update(node.iteritems(None))
-    # TODO: Filter out records that have already been read, as node splitting
-    #       can cause us to reference the same nodes via shorter and longer
-    #       paths
-    return (next_uninteresting, uninteresting_items,
-            next_interesting, interesting_records, interesting_items)
-
-
-def _find_all_uninteresting(store, interesting_root_keys,
-                            uninteresting_root_keys, adapter, pb):
-    """Determine the full set of uninteresting keys."""
-    # What about duplicates between interesting_root_keys and
-    # uninteresting_root_keys?
-    if not uninteresting_root_keys:
-        # Shortcut case. We know there is nothing uninteresting to filter out
-        # So we just let the rest of the algorithm do the work
-        # We know there is nothing uninteresting, and we didn't have to read
-        # any interesting records yet.
-        return (set(), set(), set(interesting_root_keys), [], set())
-    all_uninteresting_chks = set(uninteresting_root_keys)
-    all_uninteresting_items = set()
-
-    # First step, find the direct children of both the interesting and
-    # uninteresting set
-    (uninteresting_keys, uninteresting_items,
-     interesting_keys, interesting_records,
-     interesting_items) = _find_children_info(store, interesting_root_keys,
-                                              uninteresting_root_keys,
-                                              pb=pb)
-    all_uninteresting_chks.update(uninteresting_keys)
-    all_uninteresting_items.update(uninteresting_items)
-    del uninteresting_items
-    # Note: Exact matches between interesting and uninteresting do not need
-    #       to be search further. Non-exact matches need to be searched in case
-    #       there is a future exact-match
-    uninteresting_keys.difference_update(interesting_keys)
-
-    # Second, find the full set of uninteresting bits reachable by the
-    # uninteresting roots
-    chks_to_read = uninteresting_keys
-    while chks_to_read:
-        next_chks = set()
-        for record in store.get_record_stream(chks_to_read, 'unordered', False):
-            # TODO: Handle 'absent'
-            if pb is not None:
-                pb.tick()
-            try:
-                bytes = record.get_bytes_as('fulltext')
-            except errors.UnavailableRepresentation:
-                bytes = adapter.get_bytes(record)
-            # We don't care about search_key_func for this code, because we
-            # only care about external references.
-            node = _deserialise(bytes, record.key, search_key_func=None)
-            if isinstance(node, InternalNode):
-                # uninteresting_prefix_chks.update(node._items.iteritems())
-                chks = node._items.values()
-                # TODO: We remove the entries that are already in
-                #       uninteresting_chks ?
-                next_chks.update(chks)
-                all_uninteresting_chks.update(chks)
-            else:
-                all_uninteresting_items.update(node._items.iteritems())
-        chks_to_read = next_chks
-    return (all_uninteresting_chks, all_uninteresting_items,
-            interesting_keys, interesting_records, interesting_items)
+            assert record.key in interesting_bytes
+            interesting_bytes[record.key] = bytes
+        # Do we want to do this? or does it just punish the page cache?
+        _page_cache[record.key] = bytes
+    return uninteresting_bytes, interesting_bytes
 
 
 def iter_interesting_nodes(store, interesting_root_keys,
-                           uninteresting_root_keys, pb=None):
+                           uninteresting_root_keys,
+                           search_key_func):
     """Given root keys, find interesting nodes.
 
     Evaluate nodes referenced by interesting_root_keys. Ones that are also
@@ -1390,25 +1326,60 @@
     :param uninteresting_root_keys: keys which should be filtered out of the
         result set.
     :return: Yield
-        (interesting records, interesting chk's, interesting key:values)
+        (interesting sha1s, interesting key:values)
     """
-    # TODO: consider that it may be more memory efficient to use the 20-byte
-    #       sha1 string, rather than tuples of hexidecimal sha1 strings.
-    # TODO: Try to factor out a lot of the get_record_stream() calls into a
-    #       helper function similar to _read_bytes. This function should be
-    #       able to use nodes from the _page_cache as well as actually
-    #       requesting bytes from the store.
-
-    # A way to adapt from the compressed texts back into fulltexts
-    # In a way, this seems like a layering inversion to have CHKMap know the
-    # details of versionedfile
-    adapter_class = versionedfile.adapter_registry.get(
-        ('knit-ft-gz', 'fulltext'))
-    adapter = adapter_class(store)
-
-    (all_uninteresting_chks, all_uninteresting_items, interesting_keys,
-     interesting_records, interesting_items) = _find_all_uninteresting(store,
-        interesting_root_keys, uninteresting_root_keys, adapter, pb)
+    if not uninteresting_root_keys:
+        # TODO: optimize the special case where there are no uninteresting
+        #       nodes, this is 'initial branch' performance
+        pass
+    interesting_root_keys = set(interesting_root_keys)
+    uninteresting_root_keys = set(uninteresting_root_keys)
+    assert len(uninteresting_root_keys.intersection(interesting_root_keys)) == 0
+
+    uninteresting_bytes, interesting_bytes = \
+        _read_records(uninteresting_root_keys, interesting_root_keys, store)
+
+    # Note that the root nodes are definitely interesting...
+
+    # At this point, we know that all of the prefixes for all of the pages are
+    # at the same prefix, so we can do heavy filtering
+    uninteresting_pending = []
+    interesting_pending = []
+    known_uninteresting_prefix_keys = set()
+    known_uninteresting_items = set()
+    maybe_interesting_prefix_keys = set()
+    maybe_interesting_items = set()
+    for key, bytes in uninteresting_bytes.iteritems():
+        # search_key_func is passed through for completeness; this loop only
+        # needs the node's items and child references.
+        node = _deserialise(bytes, key, search_key_func=search_key_func)
+        if type(node) is InternalNode:
+            known_uninteresting_prefix_keys.update(node._items.items())
+        else:
+            # LeafNode
+            known_uninteresting_items.update(node._items.items())
+    for key, bytes in interesting_bytes.iteritems():
+        node = _deserialise(bytes, key, search_key_func=search_key_func)
+        if type(node) is InternalNode:
+            maybe_interesting_prefix_keys.update(node._items.items())
+        else:
+            # LeafNode
+            maybe_interesting_items.update(node._items.iteritems())
+    for prefix, child in (maybe_interesting_prefix_keys
+                          - known_uninteresting_prefix_keys):
+        heapq.heappush(interesting_pending, (prefix, None, child))
+    for prefix, child in (known_uninteresting_prefix_keys
+                          - maybe_interesting_prefix_keys):
+        heapq.heappush(uninteresting_pending, (prefix, None, child))
+    for key, value in (maybe_interesting_items - known_uninteresting_items):
+        prefix = search_key_func(key)
+        heapq.heappush(interesting_pending, (prefix, key, value))
+    for key, value in (known_uninteresting_items - maybe_interesting_items):
+        prefix = search_key_func(key)
+        heapq.heappush(uninteresting_pending, (prefix, key, value))
+
+    BORK - BORK
+
 
     # Now that we know everything uninteresting, we can yield information from
     # our first request
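
The part after the BORK marker is the actual walk and is not written yet. As
a hedged sketch (not necessarily what the branch will end up doing), the
prefix heaps make a sorted-merge set difference possible; the version below
only handles the easy case where both heaps already contain nothing but leaf
entries (key is never None). Lazily expanding internal-node references as the
prefixes demand is exactly the part the commit message calls tricky, and is
not attempted here.

    import heapq

    def _new_leaf_items(interesting_pending, uninteresting_pending):
        """Sketch: yield leaf items present only on the interesting side.

        Assumes both heaps hold only (search_prefix, key, value) leaf
        entries (key is never None), in the shape pushed above.
        """
        while interesting_pending:
            entry = heapq.heappop(interesting_pending)
            # Uninteresting entries that sort before this one cannot match
            # anything still left on the interesting heap; discard them.
            while uninteresting_pending and uninteresting_pending[0] < entry:
                heapq.heappop(uninteresting_pending)
            if uninteresting_pending and uninteresting_pending[0] == entry:
                # Identical (prefix, key, value) on both sides: not new.
                heapq.heappop(uninteresting_pending)
                continue
            yield entry[1], entry[2]

With internal-node references in the mix, the merge would also have to pause
and expand any internal node whose prefix still covers the current position
before the comparison at that prefix can be trusted, which is presumably
where the remaining work in this branch goes.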


