Rev 14: Implement and test left-way bisecting, revert to the bisect_XX builtin for len(in_keys) == 1 in http://bzr.arbash-meinel.com/plugins/index2

John Arbash Meinel john at arbash-meinel.com
Wed Jul 2 04:22:59 BST 2008


At http://bzr.arbash-meinel.com/plugins/index2

------------------------------------------------------------
revno: 14
revision-id: john at arbash-meinel.com-20080702032221-9qn5l8n7owpg11u7
parent: john at arbash-meinel.com-20080701231851-rvuj7146p23sr25v
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 22:22:21 -0500
message:
  Implement and test left-way bisecting, revert to the bisect_XX builtin for len(in_keys) == 1
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-01 23:18:51 +0000
+++ b/btree_index.py	2008-07-02 03:22:21 +0000
@@ -17,7 +17,7 @@
 
 """B+Tree indices"""
 
-from bisect import bisect_right
+from bisect import bisect_left, bisect_right
 import tempfile
 import zlib
 
@@ -355,7 +355,7 @@
                     yield (self, key, value)
 
     @staticmethod
-    def _positions_for_keys(in_keys, fixed_keys):
+    def _multi_bisect_right(in_keys, fixed_keys):
         """Find the positions where each 'in_key' would fit in fixed_keys.
 
         This is equivalent to doing "bisect_right" on each in_key into
@@ -369,6 +369,17 @@
             return []
         if not fixed_keys:
             raise AssertionError('we must have entries in fixed_keys')
+
+        # TODO: Iterating both lists will generally take M + N steps
+        #       Bisecting each key will generally take M * log2 N steps.
+        #       If we had an efficient way to compare, we could pick the method
+        #       based on which has the fewer number of steps.
+        #       There is also the argument that bisect_right is a compiled
+        #       function, so there is even more to be gained.
+        # if (len(in_keys) + len(fixed_keys)
+        #     > len(in_keys) * math.log(len(fixed_keys), 2)):
+        if len(in_keys) == 1: # Bisect will always be faster for M = 1
+            return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
         in_keys_iter = iter(in_keys)
         fixed_keys_iter = enumerate(fixed_keys)
         cur_in_key = in_keys_iter.next()
@@ -412,6 +423,66 @@
             output.append(cur_out)
         return output
 
+    @staticmethod
+    def _multi_bisect_left(in_keys, fixed_keys):
+        """Find the positions where each 'in_key' would fit in fixed_keys.
+
+        This is equivalent to doing "bisect_left" on each in_key into
+        fixed_keys
+
+        :param in_keys: A sorted list of keys to match with fixed_keys
+        :param fixed_keys: A sorted list of keys to match against
+        :return: A list of integer positions.
+        """
+        if not in_keys:
+            return []
+        if not fixed_keys:
+            raise AssertionError('we must have entries in fixed_keys')
+        if len(in_keys) == 1: # Bisect will always be faster for M = 1
+            return [(bisect_left(fixed_keys, in_keys[0]), in_keys)]
+        in_keys_iter = iter(in_keys)
+        fixed_keys_iter = enumerate(fixed_keys)
+        cur_in_key = in_keys_iter.next()
+        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+        class InputDone(Exception): pass
+        class FixedDone(Exception): pass
+
+        output = []
+        cur_out = []
+
+        try:
+            while True:
+                if cur_in_key <= cur_fixed_key:
+                    cur_keys = []
+                    cur_out = (cur_fixed_offset, cur_keys)
+                    output.append(cur_out)
+                    while cur_in_key <= cur_fixed_key:
+                        cur_keys.append(cur_in_key)
+                        try:
+                            cur_in_key = in_keys_iter.next()
+                        except StopIteration:
+                            raise InputDone
+                    # At this point cur_in_key must be >= cur_fixed_key
+                # step the cur_fixed_key until we pass the cur key, or walk off
+                # the end
+                while cur_in_key > cur_fixed_key:
+                    try:
+                        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+                    except StopIteration:
+                        raise FixedDone
+        except InputDone:
+            # We consumed all of the input, nothing more to do
+            pass
+        except FixedDone:
+            # There was some input left, but we consumed all of fixed, so we
+            # have to add one more for the tail
+            cur_keys = [cur_in_key]
+            cur_keys.extend(in_keys_iter)
+            cur_out = (len(fixed_keys), cur_keys)
+            output.append(cur_out)
+        return output
+
     def iter_entries(self, keys):
         """Iterate over keys within the index.
 

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-01 23:18:51 +0000
+++ b/tests/test_btree_index.py	2008-07-02 03:22:21 +0000
@@ -1118,36 +1118,71 @@
             }, node.keys)
 
 
-class TestPositionsForKeys(tests.TestCase):
+class TestMultiBisectRight(tests.TestCase):
 
-    def assertPositionsForKeys(self, offsets, search_keys, fixed_keys):
+    def assertMultiBisectRight(self, offsets, search_keys, fixed_keys):
         self.assertEqual(offsets,
-                         btree_index.BTreeGraphIndex._positions_for_keys(
+                         btree_index.BTreeGraphIndex._multi_bisect_right(
                             search_keys, fixed_keys))
 
     def test_after(self):
-        self.assertPositionsForKeys([(1, ['b'])], ['b'], ['a'])
-        self.assertPositionsForKeys([(3, ['e', 'f', 'g'])],
+        self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a'])
+        self.assertMultiBisectRight([(3, ['e', 'f', 'g'])],
                                     ['e', 'f', 'g'], ['a', 'b', 'c'])
 
     def test_before(self):
-        self.assertPositionsForKeys([(0, ['a'])], ['a'], ['b'])
-        self.assertPositionsForKeys([(0, ['a', 'b', 'c', 'd'])],
+        self.assertMultiBisectRight([(0, ['a'])], ['a'], ['b'])
+        self.assertMultiBisectRight([(0, ['a', 'b', 'c', 'd'])],
                                     ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
 
     def test_exact(self):
-        self.assertPositionsForKeys([(1, ['a'])], ['a'], ['a'])
-        self.assertPositionsForKeys([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
-        self.assertPositionsForKeys([(1, ['a']), (3, ['c'])],
+        self.assertMultiBisectRight([(1, ['a'])], ['a'], ['a'])
+        self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
+        self.assertMultiBisectRight([(1, ['a']), (3, ['c'])],
                                     ['a', 'c'], ['a', 'b', 'c'])
 
     def test_inbetween(self):
-        self.assertPositionsForKeys([(1, ['b'])], ['b'], ['a', 'c'])
-        self.assertPositionsForKeys([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+        self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a', 'c'])
+        self.assertMultiBisectRight([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
                                     ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
 
     def test_mixed(self):
-        self.assertPositionsForKeys([(0, ['a', 'b']), (2, ['d', 'e']),
+        self.assertMultiBisectRight([(0, ['a', 'b']), (2, ['d', 'e']),
                                      (4, ['g', 'h'])],
                                     ['a', 'b', 'd', 'e', 'g', 'h'],
                                     ['c', 'd', 'f', 'g'])
+
+
+class TestMultiBisectLeft(tests.TestCase):
+
+    def assertMultiBisectLeft(self, offsets, search_keys, fixed_keys):
+        self.assertEqual(offsets,
+                         btree_index.BTreeGraphIndex._multi_bisect_left(
+                            search_keys, fixed_keys))
+
+    def test_after(self):
+        self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a'])
+        self.assertMultiBisectLeft([(3, ['e', 'f', 'g'])],
+                                    ['e', 'f', 'g'], ['a', 'b', 'c'])
+
+    def test_before(self):
+        self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['b'])
+        self.assertMultiBisectLeft([(0, ['a', 'b', 'c', 'd'])],
+                                    ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
+
+    def test_exact(self):
+        self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['a'])
+        self.assertMultiBisectLeft([(0, ['a']), (1, ['b'])], ['a', 'b'], ['a', 'b'])
+        self.assertMultiBisectLeft([(0, ['a']), (2, ['c'])],
+                                    ['a', 'c'], ['a', 'b', 'c'])
+
+    def test_inbetween(self):
+        self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a', 'c'])
+        self.assertMultiBisectLeft([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+                                    ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
+
+    def test_mixed(self):
+        self.assertMultiBisectLeft([(0, ['a', 'b']), (1, ['d']), (2, ['e']),
+                                     (3, ['g']), (4, ['h'])],
+                                    ['a', 'b', 'd', 'e', 'g', 'h'],
+                                    ['c', 'd', 'f', 'g'])



More information about the bazaar-commits mailing list