Rev 14: Implement and test left-way bisecting, revert to the bisect_XX builtin for len(in_keys) == 1 in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 2 04:22:59 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 14
revision-id: john at arbash-meinel.com-20080702032221-9qn5l8n7owpg11u7
parent: john at arbash-meinel.com-20080701231851-rvuj7146p23sr25v
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 22:22:21 -0500
message:
Implement and test left-way bisecting, revert to the bisect_XX builtin for len(in_keys) == 1
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-01 23:18:51 +0000
+++ b/btree_index.py 2008-07-02 03:22:21 +0000
@@ -17,7 +17,7 @@
"""B+Tree indices"""
-from bisect import bisect_right
+from bisect import bisect_left, bisect_right
import tempfile
import zlib
@@ -355,7 +355,7 @@
yield (self, key, value)
@staticmethod
- def _positions_for_keys(in_keys, fixed_keys):
+ def _multi_bisect_right(in_keys, fixed_keys):
"""Find the positions where each 'in_key' would fit in fixed_keys.
This is equivalent to doing "bisect_right" on each in_key into
@@ -369,6 +369,17 @@
return []
if not fixed_keys:
raise AssertionError('we must have entries in fixed_keys')
+
+ # TODO: Iterating both lists will generally take M + N steps
+ # Bisecting each key will generally take M * log2 N steps.
+ # If we had an efficient way to compare, we could pick the method
+ # based on which has the fewer number of steps.
+ # There is also the argument that bisect_right is a compiled
+ # function, so there is even more to be gained.
+ # if (len(in_keys) + len(fixed_keys)
+ # > len(in_keys) * math.log(len(fixed_keys), 2)):
+ if len(in_keys) == 1: # Bisect will always be faster for M = 1
+ return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
in_keys_iter = iter(in_keys)
fixed_keys_iter = enumerate(fixed_keys)
cur_in_key = in_keys_iter.next()
@@ -412,6 +423,66 @@
output.append(cur_out)
return output
+ @staticmethod
+ def _multi_bisect_left(in_keys, fixed_keys):
+ """Find the positions where each 'in_key' would fit in fixed_keys.
+
+ This is equivalent to doing "bisect_left" on each in_key into
+ fixed_keys
+
+ :param in_keys: A sorted list of keys to match with fixed_keys
+ :param fixed_keys: A sorted list of keys to match against
+ :return: A list of integer positions.
+ """
+ if not in_keys:
+ return []
+ if not fixed_keys:
+ raise AssertionError('we must have entries in fixed_keys')
+ if len(in_keys) == 1: # Bisect will always be faster for M = 1
+ return [(bisect_left(fixed_keys, in_keys[0]), in_keys)]
+ in_keys_iter = iter(in_keys)
+ fixed_keys_iter = enumerate(fixed_keys)
+ cur_in_key = in_keys_iter.next()
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+ class InputDone(Exception): pass
+ class FixedDone(Exception): pass
+
+ output = []
+ cur_out = []
+
+ try:
+ while True:
+ if cur_in_key <= cur_fixed_key:
+ cur_keys = []
+ cur_out = (cur_fixed_offset, cur_keys)
+ output.append(cur_out)
+ while cur_in_key <= cur_fixed_key:
+ cur_keys.append(cur_in_key)
+ try:
+ cur_in_key = in_keys_iter.next()
+ except StopIteration:
+ raise InputDone
+ # At this point cur_in_key must be >= cur_fixed_key
+ # step the cur_fixed_key until we pass the cur key, or walk off
+ # the end
+ while cur_in_key > cur_fixed_key:
+ try:
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+ except StopIteration:
+ raise FixedDone
+ except InputDone:
+ # We consumed all of the input, nothing more to do
+ pass
+ except FixedDone:
+ # There was some input left, but we consumed all of fixed, so we
+ # have to add one more for the tail
+ cur_keys = [cur_in_key]
+ cur_keys.extend(in_keys_iter)
+ cur_out = (len(fixed_keys), cur_keys)
+ output.append(cur_out)
+ return output
+
def iter_entries(self, keys):
"""Iterate over keys within the index.
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-01 23:18:51 +0000
+++ b/tests/test_btree_index.py 2008-07-02 03:22:21 +0000
@@ -1118,36 +1118,71 @@
}, node.keys)
-class TestPositionsForKeys(tests.TestCase):
+class TestMultiBisectRight(tests.TestCase):
- def assertPositionsForKeys(self, offsets, search_keys, fixed_keys):
+ def assertMultiBisectRight(self, offsets, search_keys, fixed_keys):
self.assertEqual(offsets,
- btree_index.BTreeGraphIndex._positions_for_keys(
+ btree_index.BTreeGraphIndex._multi_bisect_right(
search_keys, fixed_keys))
def test_after(self):
- self.assertPositionsForKeys([(1, ['b'])], ['b'], ['a'])
- self.assertPositionsForKeys([(3, ['e', 'f', 'g'])],
+ self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a'])
+ self.assertMultiBisectRight([(3, ['e', 'f', 'g'])],
['e', 'f', 'g'], ['a', 'b', 'c'])
def test_before(self):
- self.assertPositionsForKeys([(0, ['a'])], ['a'], ['b'])
- self.assertPositionsForKeys([(0, ['a', 'b', 'c', 'd'])],
+ self.assertMultiBisectRight([(0, ['a'])], ['a'], ['b'])
+ self.assertMultiBisectRight([(0, ['a', 'b', 'c', 'd'])],
['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
def test_exact(self):
- self.assertPositionsForKeys([(1, ['a'])], ['a'], ['a'])
- self.assertPositionsForKeys([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
- self.assertPositionsForKeys([(1, ['a']), (3, ['c'])],
+ self.assertMultiBisectRight([(1, ['a'])], ['a'], ['a'])
+ self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
+ self.assertMultiBisectRight([(1, ['a']), (3, ['c'])],
['a', 'c'], ['a', 'b', 'c'])
def test_inbetween(self):
- self.assertPositionsForKeys([(1, ['b'])], ['b'], ['a', 'c'])
- self.assertPositionsForKeys([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+ self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a', 'c'])
+ self.assertMultiBisectRight([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
def test_mixed(self):
- self.assertPositionsForKeys([(0, ['a', 'b']), (2, ['d', 'e']),
+ self.assertMultiBisectRight([(0, ['a', 'b']), (2, ['d', 'e']),
(4, ['g', 'h'])],
['a', 'b', 'd', 'e', 'g', 'h'],
['c', 'd', 'f', 'g'])
+
+
+class TestMultiBisectLeft(tests.TestCase):
+
+ def assertMultiBisectLeft(self, offsets, search_keys, fixed_keys):
+ self.assertEqual(offsets,
+ btree_index.BTreeGraphIndex._multi_bisect_left(
+ search_keys, fixed_keys))
+
+ def test_after(self):
+ self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a'])
+ self.assertMultiBisectLeft([(3, ['e', 'f', 'g'])],
+ ['e', 'f', 'g'], ['a', 'b', 'c'])
+
+ def test_before(self):
+ self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['b'])
+ self.assertMultiBisectLeft([(0, ['a', 'b', 'c', 'd'])],
+ ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
+
+ def test_exact(self):
+ self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['a'])
+ self.assertMultiBisectLeft([(0, ['a']), (1, ['b'])], ['a', 'b'], ['a', 'b'])
+ self.assertMultiBisectLeft([(0, ['a']), (2, ['c'])],
+ ['a', 'c'], ['a', 'b', 'c'])
+
+ def test_inbetween(self):
+ self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a', 'c'])
+ self.assertMultiBisectLeft([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+ ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
+
+ def test_mixed(self):
+ self.assertMultiBisectLeft([(0, ['a', 'b']), (1, ['d']), (2, ['e']),
+ (3, ['g']), (4, ['h'])],
+ ['a', 'b', 'd', 'e', 'g', 'h'],
+ ['c', 'd', 'f', 'g'])
More information about the bazaar-commits
mailing list