Rev 11: Bring in the iter_entries changes, and clean them up slightly. in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 2 06:40:46 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 11
revision-id: john at arbash-meinel.com-20080702054002-tripfvw7uca0jyn2
parent: john at arbash-meinel.com-20080702053739-zxv81xne6i1pl1gx
parent: john at arbash-meinel.com-20080702045350-i5vjt3tqkyvsh3re
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 00:40:02 -0500
message:
  Bring in the iter_entries changes, and clean them up slightly.
modified:
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
  tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
    ------------------------------------------------------------
    revno: 7.1.8
    revision-id: john at arbash-meinel.com-20080702045350-i5vjt3tqkyvsh3re
    parent: john at arbash-meinel.com-20080702032221-9qn5l8n7owpg11u7
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: index2
    timestamp: Tue 2008-07-01 23:53:50 -0500
    message:
      Change the looping structure for iter_entries
    modified:
      btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
    ------------------------------------------------------------
    revno: 7.1.7
    revision-id: john at arbash-meinel.com-20080702032221-9qn5l8n7owpg11u7
    parent: john at arbash-meinel.com-20080701231851-rvuj7146p23sr25v
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: index2
    timestamp: Tue 2008-07-01 22:22:21 -0500
    message:
      Implement and test left-way bisecting, revert to the bisect_XX builtin for len(in_keys) == 1
    modified:
      btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
      tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
    ------------------------------------------------------------
    revno: 7.1.6
    revision-id: john at arbash-meinel.com-20080701231851-rvuj7146p23sr25v
    parent: john at arbash-meinel.com-20080701231134-73vr4qkr6dogf3h0
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: index2
    timestamp: Tue 2008-07-01 18:18:51 -0500
    message:
      Return the keys themselves along with the node we care about.
    modified:
      btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
      tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
    ------------------------------------------------------------
    revno: 7.1.5
    revision-id: john at arbash-meinel.com-20080701231134-73vr4qkr6dogf3h0
    parent: john at arbash-meinel.com-20080701192734-9mbeqa4zszat3z1f
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: index2
    timestamp: Tue 2008-07-01 18:11:34 -0500
    message:
      Write a helper function that can turn 2 lists of keys into a position list.
    modified:
      btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
      tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-01 22:44:10 +0000
+++ b/btree_index.py	2008-07-02 05:40:02 +0000
@@ -17,7 +17,7 @@
 
 """B+Tree indices"""
 
-from bisect import bisect_right
+from bisect import bisect_left, bisect_right
 import tempfile
 import zlib
 
@@ -325,6 +325,20 @@
             self._cache_nodes([node_index])
             return self._node_cache[node_index]
 
+    def _get_nodes(self, node_indexes):
+        """Get a bunch of nodes, from cache or disk."""
+        found = {}
+        needed = []
+        for idx in node_indexes:
+            try:
+                found[idx] = self._node_cache[idx]
+            except KeyError:
+                needed.append(idx)
+        self._cache_nodes(needed)
+        for idx in needed:
+            found[idx] = self._node_cache[idx]
+        return found
+
     def iter_all_entries(self):
         """Iterate over all keys within the index.
 
@@ -354,6 +368,135 @@
                 for key, (value, refs) in node.keys.items():
                     yield (self, key, value)
 
+    @staticmethod
+    def _multi_bisect_right(in_keys, fixed_keys):
+        """Find the positions where each 'in_key' would fit in fixed_keys.
+
+        This is equivalent to doing "bisect_right" on each in_key into
+        fixed_keys
+
+        :param in_keys: A sorted list of keys to match with fixed_keys
+        :param fixed_keys: A sorted list of keys to match against
+        :return: A list of integer positions.
+        """
+        if not in_keys:
+            return []
+        if not fixed_keys:
+            raise AssertionError('we must have entries in fixed_keys')
+
+        # TODO: Iterating both lists will generally take M + N steps
+        #       Bisecting each key will generally take M * log2 N steps.
+        #       If we had an efficient way to compare, we could pick the method
+        #       based on which has the fewer number of steps.
+        #       There is also the argument that bisect_right is a compiled
+        #       function, so there is even more to be gained.
+        # if (len(in_keys) + len(fixed_keys)
+        #     > len(in_keys) * math.log(len(fixed_keys), 2)):
+        if len(in_keys) == 1: # Bisect will always be faster for M = 1
+            return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
+        in_keys_iter = iter(in_keys)
+        fixed_keys_iter = enumerate(fixed_keys)
+        cur_in_key = in_keys_iter.next()
+        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+        class InputDone(Exception): pass
+        class FixedDone(Exception): pass
+
+        output = []
+        cur_out = []
+
+        try:
+            while True:
+                if cur_in_key < cur_fixed_key:
+                    cur_keys = []
+                    cur_out = (cur_fixed_offset, cur_keys)
+                    output.append(cur_out)
+                    while cur_in_key < cur_fixed_key:
+                        cur_keys.append(cur_in_key)
+                        try:
+                            cur_in_key = in_keys_iter.next()
+                        except StopIteration:
+                            raise InputDone
+                    # At this point cur_in_key must be >= cur_fixed_key
+                # step the cur_fixed_key until we pass the cur key, or walk off
+                # the end
+                while cur_in_key >= cur_fixed_key:
+                    try:
+                        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+                    except StopIteration:
+                        raise FixedDone
+        except InputDone:
+            # We consumed all of the input, nothing more to do
+            pass
+        except FixedDone:
+            # There was some input left, but we consumed all of fixed, so we
+            # have to add one more for the tail
+            cur_keys = [cur_in_key]
+            cur_keys.extend(in_keys_iter)
+            cur_out = (len(fixed_keys), cur_keys)
+            output.append(cur_out)
+        return output
+
+    @staticmethod
+    def _multi_bisect_left(in_keys, fixed_keys):
+        """Find the positions where each 'in_key' would fit in fixed_keys.
+
+        This is equivalent to doing "bisect_left" on each in_key into
+        fixed_keys
+
+        :param in_keys: A sorted list of keys to match with fixed_keys
+        :param fixed_keys: A sorted list of keys to match against
+        :return: A list of integer positions.
+        """
+        if not in_keys:
+            return []
+        if not fixed_keys:
+            raise AssertionError('we must have entries in fixed_keys')
+        if len(in_keys) == 1: # Bisect will always be faster for M = 1
+            return [(bisect_left(fixed_keys, in_keys[0]), in_keys)]
+        in_keys_iter = iter(in_keys)
+        fixed_keys_iter = enumerate(fixed_keys)
+        cur_in_key = in_keys_iter.next()
+        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+        class InputDone(Exception): pass
+        class FixedDone(Exception): pass
+
+        output = []
+        cur_out = []
+
+        try:
+            while True:
+                if cur_in_key <= cur_fixed_key:
+                    cur_keys = []
+                    cur_out = (cur_fixed_offset, cur_keys)
+                    output.append(cur_out)
+                    while cur_in_key <= cur_fixed_key:
+                        cur_keys.append(cur_in_key)
+                        try:
+                            cur_in_key = in_keys_iter.next()
+                        except StopIteration:
+                            raise InputDone
+                    # At this point cur_in_key must be >= cur_fixed_key
+                # step the cur_fixed_key until we pass the cur key, or walk off
+                # the end
+                while cur_in_key > cur_fixed_key:
+                    try:
+                        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+                    except StopIteration:
+                        raise FixedDone
+        except InputDone:
+            # We consumed all of the input, nothing more to do
+            pass
+        except FixedDone:
+            # There was some input left, but we consumed all of fixed, so we
+            # have to add one more for the tail
+            cur_keys = [cur_in_key]
+            cur_keys.extend(in_keys_iter)
+            cur_out = (len(fixed_keys), cur_keys)
+            output.append(cur_out)
+        return output
+
     def iter_entries(self, keys):
         """Iterate over keys within the index.
 
@@ -369,36 +512,34 @@
             self._cache_nodes([0])
         if not self._key_count:
             return
-        for key in keys:
-            # repeated single-bisection : 'make it work'
-            # FUTURE: look within the current node for the next key,
-            # consider sequential scans
-            # consider block reads
-            # gather stats on hit rate etc
-            # start at the root:
-            node = self._get_node(0)
-            # what row are we looking at
-            row = 0
-            # how many nodes have the previous rows eaten up
-            offset = self._row_lengths[row]
-            while type(node) != _LeafNode:
-                pos = bisect_right(node.keys, key)
-                # There are len(node.keys) pointers dividing len(node.keys) + 1
-                # child nodes. The first pointer is the first key in the second
-                # child node, and so on.
-                # pos indexes into the list of child nodes, any key equal to a
-                # key in node.keys indicates that the key is present to the right
-                # of that divider.
-                node_index = offset + node.offset + pos
-                row += 1
-                offset = offset + self._row_lengths[row]
-                node = self._get_node(node_index)
-            if key in node.keys:
-                value, refs = node.keys[key]
-                if self.node_ref_lists:
-                    yield (self, key, value, refs)
-                else:
-                    yield (self, key, value)
+        nodes_and_keys = [(0, keys)]
+        row_offset = 0
+        for row_length in self._row_lengths[:-1]:
+            node_indexes = [idx for idx, s_keys in nodes_and_keys]
+            nodes = self._get_nodes(node_indexes)
+
+            row_offset += row_length
+
+            next_nodes_and_keys = []
+            for node_index, sub_keys in nodes_and_keys:
+                node = nodes[node_index]
+                positions = self._multi_bisect_right(sub_keys, node.keys)
+                node_offset = row_offset + node.offset
+                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
+                                           for pos, s_keys in positions])
+            nodes_and_keys = next_nodes_and_keys
+        # We should now be at the _LeafNodes
+        node_indexes = [idx for idx, s_keys in nodes_and_keys]
+        nodes = self._get_nodes(node_indexes)
+        for node_index, sub_keys in nodes_and_keys:
+            node = nodes[node_index]
+            for key in sub_keys:
+                if key in node.keys:
+                    value, refs = node.keys[key]
+                    if self.node_ref_lists:
+                        yield (self, key, value, refs)
+                    else:
+                        yield (self, key, value)
 
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-02 05:29:41 +0000
+++ b/tests/test_btree_index.py	2008-07-02 05:40:02 +0000
@@ -19,6 +19,7 @@
 
 import zlib
 
+from bzrlib import tests
 from bzrlib.plugins import index2
 from bzrlib.plugins.index2 import errors, btree_index
 from bzrlib.tests import TestCaseWithTransport
@@ -518,3 +519,72 @@
             ('11', '44'): ('value:4', ((), (('11', 'ref00'),)))
             }, node.keys)
 
+
+class TestMultiBisectRight(tests.TestCase):
+
+    def assertMultiBisectRight(self, offsets, search_keys, fixed_keys):
+        self.assertEqual(offsets,
+                         btree_index.BTreeGraphIndex._multi_bisect_right(
+                            search_keys, fixed_keys))
+
+    def test_after(self):
+        self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a'])
+        self.assertMultiBisectRight([(3, ['e', 'f', 'g'])],
+                                    ['e', 'f', 'g'], ['a', 'b', 'c'])
+
+    def test_before(self):
+        self.assertMultiBisectRight([(0, ['a'])], ['a'], ['b'])
+        self.assertMultiBisectRight([(0, ['a', 'b', 'c', 'd'])],
+                                    ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
+
+    def test_exact(self):
+        self.assertMultiBisectRight([(1, ['a'])], ['a'], ['a'])
+        self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
+        self.assertMultiBisectRight([(1, ['a']), (3, ['c'])],
+                                    ['a', 'c'], ['a', 'b', 'c'])
+
+    def test_inbetween(self):
+        self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a', 'c'])
+        self.assertMultiBisectRight([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+                                    ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
+
+    def test_mixed(self):
+        self.assertMultiBisectRight([(0, ['a', 'b']), (2, ['d', 'e']),
+                                     (4, ['g', 'h'])],
+                                    ['a', 'b', 'd', 'e', 'g', 'h'],
+                                    ['c', 'd', 'f', 'g'])
+
+
+class TestMultiBisectLeft(tests.TestCase):
+
+    def assertMultiBisectLeft(self, offsets, search_keys, fixed_keys):
+        self.assertEqual(offsets,
+                         btree_index.BTreeGraphIndex._multi_bisect_left(
+                            search_keys, fixed_keys))
+
+    def test_after(self):
+        self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a'])
+        self.assertMultiBisectLeft([(3, ['e', 'f', 'g'])],
+                                    ['e', 'f', 'g'], ['a', 'b', 'c'])
+
+    def test_before(self):
+        self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['b'])
+        self.assertMultiBisectLeft([(0, ['a', 'b', 'c', 'd'])],
+                                    ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
+
+    def test_exact(self):
+        self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['a'])
+        self.assertMultiBisectLeft([(0, ['a']), (1, ['b'])], ['a', 'b'], ['a', 'b'])
+        self.assertMultiBisectLeft([(0, ['a']), (2, ['c'])],
+                                    ['a', 'c'], ['a', 'b', 'c'])
+
+    def test_inbetween(self):
+        self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a', 'c'])
+        self.assertMultiBisectLeft([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+                                    ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
+
+    def test_mixed(self):
+        self.assertMultiBisectLeft([(0, ['a', 'b']), (1, ['d']), (2, ['e']),
+                                     (3, ['g']), (4, ['h'])],
+                                    ['a', 'b', 'd', 'e', 'g', 'h'],
+                                    ['c', 'd', 'f', 'g'])



More information about the bazaar-commits mailing list