Rev 3801: Change how _check_remap works so it doesn't have to load all keys. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/chk_map

John Arbash Meinel john at arbash-meinel.com
Tue Dec 2 22:05:56 GMT 2008


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/chk_map

------------------------------------------------------------
revno: 3801
revision-id: john at arbash-meinel.com-20081202220545-sflppo9zeho3z3j3
parent: john at arbash-meinel.com-20081202205054-ajsix3e8w8jgnnod
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk_map
timestamp: Tue 2008-12-02 16:05:45 -0600
message:
  Change how _check_remap works so it doesn't have to load all keys.
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2008-12-02 20:50:54 +0000
+++ b/bzrlib/chk_map.py	2008-12-02 22:05:45 +0000
@@ -876,7 +876,13 @@
         if len(self._items) == 1:
             # this node is no longer needed:
             return self._items.values()[0]
-        # Logic for how unmap determines when it needs to rebuild
+        if isinstance(unmapped, InternalNode):
+            return self
+        return self._check_remap(store)
+
+    def _check_remap(self, store):
+        """Check if all keys contained by children fit in a single LeafNode."""
+        # Logic for how we determine when we need to rebuild
         # 1) Implicitly unmap() is removing a key which means that the child
         #    nodes are going to be shrinking by some extent.
         # 2) If all children are LeafNodes, it is possible that they could be
@@ -888,21 +894,7 @@
         #    have a chance of collapsing either.
         #    So a very cheap check is to just say if 'unmapped' is an
         #    InternalNode, we don't have to check further.
-        if isinstance(unmapped, InternalNode):
-            return self
-        return self._check_remap(store)
-
-    def _check_remap(self, store):
-        # Check and see if all keys contained by children could be mapped into
-        # a single LeafNode.
-
-        # TODO: If any child node is an InternalNode, we know we can stop. Is
-        #       it cheaper to load all children and see if one is an
-        #       InternalNode, or is it cheaper to start adding keys into a
-        #       LeafNode and stop when it gets full...
-        # TODO: Do a quick once-over the list of already-read children, and
-        #       check if any are an InternalNode. This will avoid any I/O some
-        #       of the time, and should be very cheap.
+
         # TODO: Another alternative is to check the total size of all known
         #       LeafNodes. If there is some formula we can use to determine the
         #       final size without actually having to read in any more
@@ -915,21 +907,57 @@
         new_leaf = LeafNode()
         new_leaf.set_maximum_size(self._maximum_size)
         new_leaf._key_width = self._key_width
-        for node in self._iter_nodes(store):
-            if isinstance(node, InternalNode):
-                # We won't be able to collapse
-                return self
-            for key, value in node._items.iteritems():
-                # TODO: it is expensive to have it actually do the split, when
-                #       all we care about is whether it *would* split. Refactor
-                #       the code a bit so we can just check without actually
-                #       having to do all the work
-                next_prefix, new_nodes = new_leaf.map(store, key, value)
-                if len(new_nodes) > 1:
-                    # These nodes cannot fit in a single leaf, so we are done
-                    return self
-                else:
-                    assert new_leaf is new_nodes[0][1]
+        keys = {}
+        # There is some overlap with _iter_nodes here, but not a lot, and it
+        # allows us to do quick evaluation without paging everything in
+        for prefix, node in self._items.iteritems():
+            if type(node) == tuple:
+                keys[node] = prefix
+            else:
+                if isinstance(node, InternalNode):
+                    # Without looking at any leaf nodes, we are sure
+                    return self
+                for key, value in node._items.iteritems():
+                    # TODO: it is expensive to have it actually do the split,
+                    #       when all we care about is whether it *would* split.
+                    #       Refactor the code a bit so we can just check
+                    #       without actually having to do all the work
+                    next_prefix, new_nodes = new_leaf.map(store, key, value)
+                    if len(new_nodes) > 1:
+                        # These nodes cannot fit in a single leaf, so we are
+                        # done
+                        return self
+                    else:
+                        assert new_leaf is new_nodes[0][1]
+        # So far, everything fits. Page in the rest of the nodes, and see if it
+        # holds true.
+        if keys:
+            stream = store.get_record_stream(keys, 'unordered', True)
+            nodes = []
+            # Fully consume the stream, even if we could determine that we
+            # don't need to continue. We requested the bytes, we may as well
+            # use them
+            for record in stream:
+                node = _deserialise(record.get_bytes_as('fulltext'), record.key)
+                self._items[keys[record.key]] = node
+                nodes.append(node)
+            for node in nodes:
+                if isinstance(node, InternalNode):
+                    # We know we won't fit
+                    return self
+                for key, value in node._items.iteritems():
+                    # TODO: it is expensive to have it actually do the split,
+                    #       when all we care about is whether it *would* split.
+                    #       Refactor the code a bit so we can just check
+                    #       without actually having to do all the work
+                    next_prefix, new_nodes = new_leaf.map(store, key, value)
+                    if len(new_nodes) > 1:
+                        # These nodes cannot fit in a single leaf, so we are
+                        # done
+                        return self
+                    else:
+                        assert new_leaf is new_nodes[0][1]
+
         # We have gone to every child, and everything fits in a single leaf
         # node, we no longer need this internal node
         return new_leaf

=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py	2008-12-02 20:50:54 +0000
+++ b/bzrlib/tests/test_chk_map.py	2008-12-02 22:05:45 +0000
@@ -368,7 +368,7 @@
                              chkmap._dump_tree())
         self.assertCanonicalForm(chkmap)
 
-    def test_stable_unmap_double_deep(self):
+    def test_unmap_double_deep(self):
         store = self.get_chk_bytes()
         chkmap = CHKMap(store, None)
         # Should fit 3 keys per LeafNode
@@ -424,6 +424,111 @@
                              "      ('abc',) 'v'\n",
                              chkmap._dump_tree())
 
+    def test_unmap_with_known_internal_node_doesnt_page(self):
+        store = self.get_chk_bytes()
+        chkmap = CHKMap(store, None)
+        # Should fit 3 keys per LeafNode
+        chkmap._root_node.set_maximum_size(30)
+        chkmap.map(('aaa',), 'v')
+        chkmap.map(('aab',), 'v')
+        chkmap.map(('aac',), 'v')
+        chkmap.map(('abc',), 'v')
+        chkmap.map(('acd',), 'v')
+        self.assertEqualDiff("'' InternalNode None\n"
+                             "  'aa' InternalNode None\n"
+                             "    'aaa' LeafNode None\n"
+                             "      ('aaa',) 'v'\n"
+                             "    'aab' LeafNode None\n"
+                             "      ('aab',) 'v'\n"
+                             "    'aac' LeafNode None\n"
+                             "      ('aac',) 'v'\n"
+                             "  'ab' LeafNode None\n"
+                             "      ('abc',) 'v'\n"
+                             "  'ac' LeafNode None\n"
+                             "      ('acd',) 'v'\n",
+                             chkmap._dump_tree())
+        # Save everything to the map, and start over
+        chkmap = CHKMap(store, chkmap._save())
+        # Mapping an 'aa' key loads the internal node, but should not map the
+        # 'ab' and 'ac' nodes
+        chkmap.map(('aad',), 'v')
+        self.assertIsInstance(chkmap._root_node._items['aa'], InternalNode)
+        self.assertIsInstance(chkmap._root_node._items['ab'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['ac'], tuple)
+        # Unmapping 'acd' can notice that 'aa' is an InternalNode and not have
+        # to map in 'ab'
+        chkmap.unmap(('acd',))
+        self.assertIsInstance(chkmap._root_node._items['aa'], InternalNode)
+        self.assertIsInstance(chkmap._root_node._items['ab'], tuple)
+
+    def test_unmap_without_fitting_doesnt_page_in(self):
+        store = self.get_chk_bytes()
+        chkmap = CHKMap(store, None)
+        # Should fit 2 keys per LeafNode
+        chkmap._root_node.set_maximum_size(20)
+        chkmap.map(('aaa',), 'v')
+        chkmap.map(('aab',), 'v')
+        self.assertEqualDiff("'' InternalNode None\n"
+                             "  'aaa' LeafNode None\n"
+                             "      ('aaa',) 'v'\n"
+                             "  'aab' LeafNode None\n"
+                             "      ('aab',) 'v'\n",
+                             chkmap._dump_tree())
+        # Save everything to the map, and start over
+        chkmap = CHKMap(store, chkmap._save())
+        chkmap.map(('aac',), 'v')
+        chkmap.map(('aad',), 'v')
+        chkmap.map(('aae',), 'v')
+        chkmap.map(('aaf',), 'v')
+        # At this point, the previous nodes should not be paged in, but the
+        # newly added nodes would be
+        self.assertIsInstance(chkmap._root_node._items['aaa'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aab'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aac'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aad'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aae'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aaf'], LeafNode)
+        # Now unmapping one of the new nodes will use only the already-paged-in
+        # nodes to determine that we don't need to do more.
+        chkmap.unmap(('aaf',))
+        self.assertIsInstance(chkmap._root_node._items['aaa'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aab'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aac'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aad'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aae'], LeafNode)
+
+    def test_unmap_pages_in_if_necessary(self):
+        store = self.get_chk_bytes()
+        chkmap = CHKMap(store, None)
+        # Should fit 2 keys per LeafNode
+        chkmap._root_node.set_maximum_size(20)
+        chkmap.map(('aaa',), 'v')
+        chkmap.map(('aab',), 'v')
+        chkmap.map(('aac',), 'v')
+        self.assertEqualDiff("'' InternalNode None\n"
+                             "  'aaa' LeafNode None\n"
+                             "      ('aaa',) 'v'\n"
+                             "  'aab' LeafNode None\n"
+                             "      ('aab',) 'v'\n"
+                             "  'aac' LeafNode None\n"
+                             "      ('aac',) 'v'\n",
+                             chkmap._dump_tree())
+        # Save everything to the map, and start over
+        chkmap = CHKMap(store, chkmap._save())
+        chkmap.map(('aad',), 'v')
+        # At this point, the previous nodes should not be paged in, but the
+        # newly added node would be
+        self.assertIsInstance(chkmap._root_node._items['aaa'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aab'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aac'], tuple)
+        self.assertIsInstance(chkmap._root_node._items['aad'], LeafNode)
+        # Unmapping the new node will check the existing nodes to see if they
+        # would fit, and find out that they do not
+        chkmap.unmap(('aad',))
+        self.assertIsInstance(chkmap._root_node._items['aaa'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aab'], LeafNode)
+        self.assertIsInstance(chkmap._root_node._items['aac'], LeafNode)
+
     def test_iter_changes_empty_ab(self):
         # Asking for changes between an empty dict to a dict with keys returns
         # all the keys.
@@ -678,7 +783,7 @@
             "      ('aab',) 'value2'",
             "  'b' LeafNode sha1:67f15d1dfa451d388ed08ff17b4f9578ba010d01",
             "      ('bbb',) 'value3'",
-            ]), chkmap._dump_tree())
+            ""]), chkmap._dump_tree())
 
     def test__dump_tree_in_progress(self):
         chkmap = self._get_map({("aaa",): "value1", ("aab",): "value2"},
@@ -695,7 +800,7 @@
             "      ('aab',) 'value2'",
             "  'b' LeafNode None",
             "      ('bbb',) 'value3'",
-            ]), chkmap._dump_tree())
+            ""]), chkmap._dump_tree())
 
 
 class TestLeafNode(TestCaseWithStore):
@@ -1187,7 +1292,7 @@
             "    'aab' LeafNode sha1:10567a3bfcc764fb8d8d9edaa28c0934ada366c5\n"
             "      ('aab',) 'new'\n"
             "  'c' LeafNode sha1:263208de2fce0a8f9db614c1ca39e8f6de8b3802\n"
-            "      ('c',) 'common'",
+            "      ('c',) 'common'\n",
             CHKMap(self.get_chk_bytes(), target)._dump_tree())
         # The key for the internal aa node
         aa_key = ('sha1:2ce01860338a614b93883a5bbeb89920137ac7ef',)



More information about the bazaar-commits mailing list