Rev 3777: Add in a shortcut when we haven't cached much yet. in http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/btree_buffer

John Arbash Meinel john at arbash-meinel.com
Tue Oct 14 22:35:47 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/btree_buffer

------------------------------------------------------------
revno: 3777
revision-id: john at arbash-meinel.com-20081014213527-4j9uc93aq1qmn43b
parent: john at arbash-meinel.com-20081014202843-775qw8g14vdozp7t
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: btree_buffer
timestamp: Tue 2008-10-14 16:35:27 -0500
message:
  Add in a shortcut when we haven't cached much yet.
  
  Document the current algorithm more completely, including the proper
  justification for the various steps.
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2008-10-14 20:27:22 +0000
+++ b/bzrlib/btree_index.py	2008-10-14 21:35:27 +0000
@@ -700,10 +700,6 @@
         # getting.
         cached_keys = self._get_cached_keys()
 
-        # if len(cached_keys) < 2:
-        #     # We haven't read enough to justify expansion
-        #     return node_indexes
-
         # If reading recommended_pages would read the rest of the index, just
         # do so.
         if num_pages - len(cached_keys) <= self._recommended_pages:
@@ -717,8 +713,6 @@
                 trace.mutter('  reading all unread pages: %s', expanded)
             return expanded
 
-        needed_nodes = self._recommended_pages - len(node_indexes)
-        final_nodes = set(node_indexes)
         if self._root_node is None:
             # ATM on the first read of the root node of a large index, we don't
             # bother pre-reading any other pages. This is because the
@@ -728,12 +722,24 @@
             # layer index is small
             final_nodes = node_indexes
         else:
+            tree_depth = len(self._row_lengths)
+            if len(cached_keys) < tree_depth and len(node_indexes) == 1:
+                # We haven't read enough to justify expansion
+                # If we are only going to read the root node, and 1 leaf node,
+                # then it isn't worth expanding our request. Once we've read at
+                # least 2 nodes, then we are probably doing a search, and we
+                # start expanding our requests.
+                if 'index' in debug.debug_flags:
+                    trace.mutter('  not expanding on first reads')
+                return node_indexes
+
             # Expand requests to neighbors until we have at least
             # recommended_pages to request. We only want to expand requests
             # within a given layer. We cheat a little bit and assume all
             # requests will be in the same layer. This is true given the
             # current design, but if it changes this algorithm may perform
             # oddly.
+            final_nodes = set(node_indexes)
             first = end = None
             new_tips = set(final_nodes)
             while len(final_nodes) < self._recommended_pages and new_tips:

=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2008-10-14 20:27:22 +0000
+++ b/bzrlib/tests/test_btree_index.py	2008-10-14 21:35:27 +0000
@@ -1032,16 +1032,18 @@
 
     def make_100_node_index(self):
         index = self.make_index(4096*100, 6)
+        # Consider we've already made a single request at the middle
         self.prepare_index(index, node_ref_lists=0, key_length=1,
                            key_count=1000, row_lengths=[1, 99],
-                           cached_keys=[0])
+                           cached_keys=[0, 50])
         return index
 
     def make_1000_node_index(self):
         index = self.make_index(4096*1000, 6)
+        # Pretend we've already made a single request in the middle
         self.prepare_index(index, node_ref_lists=0, key_length=1,
                            key_count=90000, row_lengths=[1, 9, 990],
-                           cached_keys=[0])
+                           cached_keys=[0, 5, 500])
         return index
 
     def assertNumPages(self, expected_pages, index, size):
@@ -1143,12 +1145,34 @@
     def test_stay_within_layer(self):
         index = self.make_1000_node_index()
         # When expanding a request, we won't read nodes from the next layer
-        self.assertExpandNodes([1, 2, 3, 4, 5, 6], index, [2])
-        self.assertExpandNodes([3, 4, 5, 6, 7, 8, 9], index, [6])
-        self.assertExpandNodes([4, 5, 6, 7, 8, 9], index, [9])
+        self.assertExpandNodes([1, 2, 3, 4], index, [2])
+        self.assertExpandNodes([6, 7, 8, 9], index, [6])
+        self.assertExpandNodes([6, 7, 8, 9], index, [9])
         self.assertExpandNodes([10, 11, 12, 13, 14, 15], index, [10])
         self.assertExpandNodes([10, 11, 12, 13, 14, 15, 16], index, [13])
 
         self.set_cached_keys(index, [0, 4, 12])
         self.assertExpandNodes([5, 6, 7, 8, 9], index, [7])
         self.assertExpandNodes([10, 11], index, [11])
+
+    def test_small_requests_unexpanded(self):
+        index = self.make_100_node_index()
+        self.set_cached_keys(index, [0])
+        self.assertExpandNodes([1], index, [1])
+        self.assertExpandNodes([50], index, [50])
+        # If we request more than one node, then we'll expand
+        self.assertExpandNodes([49, 50, 51, 59, 60, 61], index, [50, 60])
+
+        # The first pass does not expand
+        index = self.make_1000_node_index()
+        self.set_cached_keys(index, [0])
+        self.assertExpandNodes([1], index, [1])
+        self.set_cached_keys(index, [0, 1])
+        self.assertExpandNodes([100], index, [100])
+        self.set_cached_keys(index, [0, 1, 100])
+        # But after the first depth, we will expand
+        self.assertExpandNodes([2, 3, 4, 5, 6, 7], index, [2])
+        self.assertExpandNodes([2, 3, 4, 5, 6, 7], index, [4])
+        self.set_cached_keys(index, [0, 1, 2, 3, 4, 5, 6, 7, 100])
+        self.assertExpandNodes([102, 103, 104, 105, 106, 107, 108], index,
+                               [105])

=== modified file 'doc/developers/btree_index_prefetch.txt'
--- a/doc/developers/btree_index_prefetch.txt	2008-10-14 20:28:43 +0000
+++ b/doc/developers/btree_index_prefetch.txt	2008-10-14 21:35:27 +0000
@@ -240,30 +240,55 @@
   layer will often not have a full 100-way fan out. Probably not worthwhile
   very often, though.
 
+* Sometimes we will be making a very small request for a very small number of
+  keys; we don't really want to bloat tiny requests. Hopefully we can find a
+  decent heuristic to determine when we will be wanting extra nodes later,
+  versus when we expect to find all we want right now.
+
 
 Algorithm
 =========
 
 This is the basic outline of the algorithm.
 
-1. Expand requests by requesting neighboring pages. (So if we read page 10, we
-   can expand to also read page 9 and 11.)
-
-2. Only expand within a layer. The problem is that with a 100:1 fan-out, but
-   only a 10:1 expansion policy, it is unlikely that we will happen to read the
-   next layer pages that we are interested in. Note that this doesn't hold true
-   when a layer has "recently split", so we may want to revisit this.
-
-3. Part (2) hints that when reading the root node, we never request any other
-   nodes, as the root is always a layer-unto-itself. The only exception is when
-   all pages could fit in a single width request.
-
-4. When expanding, add nodes that are "next" to the node in question, which
-   have not been read yet. This also has another interesting property. When
-   searching in a given direction, on the first request, we will pre-read both
-   directions. Future requests will only pre-read in one direction, as the
-   other direction is cached.
-
+1. If we don't know the size of the index, don't expand as we don't know what
+   is available. (This only really applies to the pack-names file, which is
+   unlikely to ever become larger than 1 page anyway.)
+
+2. If a request is already wide enough to be greater than the number of
+   recommended pages, don't bother trying to expand. This only really happens
+   with LocalTransport which recommends a single page.
+
+3. Determine what pages have already been read (if any). If the pages left to
+   read can fit in a single request, just request them. This tends to happen on
+   medium sized indexes (ones with low hundreds of revisions), and near the end
+   when we've read most of the whole index already.
+
+4. If we haven't read the root node yet, and we can't fit the whole index into
+   a single request, only read the root node. We don't know where the layer
+   boundaries are anyway.
+
+5. If we haven't read "tree depth" pages yet, and are only requesting a single
+   new page, don't expand. This is meant to handle the 'lookup 1 item in the
+   index' case. In a large pack file, you'll read only a single page at each
+   layer and then be done. When spidering out in a search, this will cause us
+   to take a little bit longer to start expanding, but once we've started we'll
+   be expanding at full velocity. This could be improved by having indexes
+   inform each other that they have already entered the 'search' phase, or by
+   having a hint from above to indicate the same.
+
+   However, remember the 'bi-modal' distribution. Most indexes will either be
+   very small, or very large. So either we'll read the whole thing quickly, or
+   we'll end up spending a lot of time in the index, which makes a small number
+   of extra round trips to large indexes a small overhead. For 2-layer nodes,
+   this only 'wastes' one round trip.
+
+6. Now we are ready to expand the requests. Expand by looking for more pages
+   next to the ones requested that fit within the current layer. If you run
+   into a cached page, or a layer boundary, search further only in the opposite
+   direction. This gives us proper locality of reference, and also helps
+   because when a search goes in a single direction, we will continue to
+   prefetch pages in that direction.
 
 ..
    vim: ft=rst tw=79 ai



More information about the bazaar-commits mailing list