Rev 3794: At the end of unmap() see if children can be packed into a single Leaf. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/chk_map

John Arbash Meinel john at arbash-meinel.com
Tue Dec 2 18:39:13 GMT 2008


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/chk_map

------------------------------------------------------------
revno: 3794
revision-id: john at arbash-meinel.com-20081202183902-pzwr8vxzdubumrjy
parent: john at arbash-meinel.com-20081202163706-g5ibq2cot6x31l5d
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk_map
timestamp: Tue 2008-12-02 12:39:02 -0600
message:
  At the end of unmap() see if children can be packed into a single Leaf.
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2008-12-02 16:12:10 +0000
+++ b/bzrlib/chk_map.py	2008-12-02 18:39:02 +0000
@@ -358,7 +358,8 @@
     def unmap(self, key):
         """remove key from the map."""
         self._ensure_root()
-        self._root_node.unmap(self._store, key)
+        unmapped = self._root_node.unmap(self._store, key)
+        self._root_node = unmapped
 
     def _save(self):
         """Save the map completely.
@@ -845,26 +846,83 @@
         """Remove key from this node and it's children."""
         if not len(self._items):
             raise AssertionError("cant unmap in an empty InternalNode.")
-        serialised_key = self._serialised_key(key)
         children = self._iter_nodes(store, key_filter=[key])
-        serialised_key = self._serialised_key(key)
         if children:
             child = children[0]
         else:
             raise KeyError(key)
         self._len -= 1
         unmapped = child.unmap(store, key)
+        self._key = None
+        serialised_key = self._serialised_key(key)
         if len(unmapped) == 0:
             # All child nodes are gone, remove the child:
             del self._items[serialised_key]
+            unmapped = None
         else:
             # Stash the returned node
             self._items[serialised_key] = unmapped
         if len(self._items) == 1:
             # this node is no longer needed:
             return self._items.values()[0]
-        self._key = None
-        return self
+        # Logic for how unmap determines when it needs to rebuild
+        # 1) Implicitly unmap() is removing a key which means that the child
+        #    nodes are going to be shrinking by some extent.
+        # 2) If all children are LeafNodes, it is possible that they could be
+        #    combined into a single LeafNode, which can then completely replace
+        #    this internal node with a single LeafNode
+        # 3) If *one* child is an InternalNode, we assume it has already done
+        #    all the work to determine that its children cannot collapse, and
+        #    we can then assume that those nodes *plus* the current nodes don't
+        #    have a chance of collapsing either.
+        #    So a very cheap check is to just say if 'unmapped' is an
+        #    InternalNode, we don't have to check further.
+        if isinstance(unmapped, InternalNode):
+            return self
+        node = self._check_remap(store)
+        return self._check_remap(store)
+
+    def _check_remap(self, store):
+        # Check and see if all keys contained by children could be mapped into
+        # a single LeafNode.
+
+        # TODO: If any child node is an InternalNode, we know we can stop. Is
+        #       it cheaper to load all children and see if one is an
+        #       InternalNode, or is it cheaper to start adding keys into a
+        #       LeafNode and stop when it gets full...
+        # TODO: Do a quick once-over the list of already-read children, and
+        #       check if any are an InternalNode. This will avoid any I/O some
+        #       of the time, and should be very cheap.
+        # TODO: Another alternative is to check the total size of all known
+        #       LeafNodes. If there is some formula we can use to determine the
+        #       final size without actually having to read in any more
+        #       children, it would be nice to have. However, we have to be
+        #       careful with stuff like nodes that pull out the common prefix
+        #       of each key, as adding a new key can change the common prefix
+        #       and cause size changes greater than the length of one key.
+        #       So for now, we just add everything to a new Leaf until it
+        #       splits, as we know that will give the right answer
+        new_leaf = LeafNode()
+        new_leaf.set_maximum_size(self._maximum_size)
+        new_leaf._key_width = self._key_width
+        for node in self._iter_nodes(store):
+            if isinstance(node, InternalNode):
+                # We won't be able to collapse
+                return self
+            for key, value in node._items.iteritems():
+                # TODO: it is expensive to have it actually do the split, when
+                #       all we care about is whether it *would* split. Refactor
+                #       the code a bit so we can just check without actually
+                #       having to do all the work
+                next_prefix, new_nodes = new_leaf.map(store, key, value)
+                if len(new_nodes) > 1:
+                    # These nodes cannot fit in a single leaf, so we are done
+                    return self
+                else:
+                    assert new_leaf is new_nodes[0][1]
+        # We have gone to every child, and everything fits in a single leaf
+        # node, we no longer need this internal node
+        return new_leaf
 
 
 def _deserialise(bytes, key):

=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py	2008-12-02 16:37:06 +0000
+++ b/bzrlib/tests/test_chk_map.py	2008-12-02 18:39:02 +0000
@@ -213,6 +213,8 @@
                              "      ('aaa',) 'v'\n"
                              "      ('aab',) 'v'",
                              chkmap._dump_tree())
+        self.assertCanonicalForm(chkmap)
+
         # Creates a new internal node, and splits the others into leaves
         chkmap.map(('aac',), 'v')
         self.assertEqualDiff("'' InternalNode None\n"
@@ -223,6 +225,8 @@
                              "  'aac' LeafNode None\n"
                              "      ('aac',) 'v'",
                              chkmap._dump_tree())
+        self.assertCanonicalForm(chkmap)
+
         # Splits again, because it can't fit in the current structure
         chkmap.map(('bbb',), 'v')
         self.assertEqualDiff("'' InternalNode None\n"
@@ -236,6 +240,7 @@
                              "  'b' LeafNode None\n"
                              "      ('bbb',) 'v'",
                              chkmap._dump_tree())
+        self.assertCanonicalForm(chkmap)
 
     def test_deep_splitting(self):
         store = self.get_chk_bytes()
@@ -322,6 +327,37 @@
                              "      ('aaabadaa',) 'v'",
                              chkmap._dump_tree())
 
+    def test_stable_unmap(self):
+        store = self.get_chk_bytes()
+        chkmap = CHKMap(store, None)
+        # Should fit 2 keys per LeafNode
+        chkmap._root_node.set_maximum_size(30)
+        chkmap.map(('aaa',), 'v')
+        chkmap.map(('aab',), 'v')
+        self.assertEqualDiff("'' LeafNode None\n"
+                             "      ('aaa',) 'v'\n"
+                             "      ('aab',) 'v'",
+                             chkmap._dump_tree())
+        # Creates a new internal node, and splits the others into leaves
+        chkmap.map(('aac',), 'v')
+        self.assertEqualDiff("'' InternalNode None\n"
+                             "  'aaa' LeafNode None\n"
+                             "      ('aaa',) 'v'\n"
+                             "  'aab' LeafNode None\n"
+                             "      ('aab',) 'v'\n"
+                             "  'aac' LeafNode None\n"
+                             "      ('aac',) 'v'",
+                             chkmap._dump_tree())
+        self.assertCanonicalForm(chkmap)
+        # Now lets unmap one of the keys, and assert that we collapse the
+        # structures.
+        chkmap.unmap(('aac',))
+        self.assertEqualDiff("'' LeafNode None\n"
+                             "      ('aaa',) 'v'\n"
+                             "      ('aab',) 'v'",
+                             chkmap._dump_tree())
+        self.assertCanonicalForm(chkmap)
+
     def test_iter_changes_empty_ab(self):
         # Asking for changes between an empty dict to a dict with keys returns
         # all the keys.
@@ -929,10 +965,8 @@
         # children.
         result = node.unmap(chkmap._store, ('k23',))
         # check the pointed-at object within node - k2 should now point at the
-        # k22 leaf (which should not even have been paged in).
-        ptr = node._items['k2']
-        self.assertIsInstance(ptr, tuple)
-        child = _deserialise(self.read_bytes(chkmap._store, ptr), ptr)
+        # k22 leaf (which has been paged in to see if we can collapse the tree)
+        child = node._items['k2']
         self.assertIsInstance(child, LeafNode)
         self.assertEqual(1, len(child))
         self.assertEqual({('k22',): 'bar'},



More information about the bazaar-commits mailing list