Rev 11: Bring in the iter_entries changes, and clean them up slightly. in http://bzr.arbash-meinel.com/plugins/index2
John Arbash Meinel
john at arbash-meinel.com
Wed Jul 2 06:40:46 BST 2008
At http://bzr.arbash-meinel.com/plugins/index2
------------------------------------------------------------
revno: 11
revision-id: john at arbash-meinel.com-20080702054002-tripfvw7uca0jyn2
parent: john at arbash-meinel.com-20080702053739-zxv81xne6i1pl1gx
parent: john at arbash-meinel.com-20080702045350-i5vjt3tqkyvsh3re
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Wed 2008-07-02 00:40:02 -0500
message:
Bring in the iter_entries changes, and clean them up slightly.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
------------------------------------------------------------
revno: 7.1.8
revision-id: john at arbash-meinel.com-20080702045350-i5vjt3tqkyvsh3re
parent: john at arbash-meinel.com-20080702032221-9qn5l8n7owpg11u7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 23:53:50 -0500
message:
Change the looping structure for iter_entries
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
------------------------------------------------------------
revno: 7.1.7
revision-id: john at arbash-meinel.com-20080702032221-9qn5l8n7owpg11u7
parent: john at arbash-meinel.com-20080701231851-rvuj7146p23sr25v
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 22:22:21 -0500
message:
Implement and test left-way bisecting, revert to the bisect_XX builtin for len(in_keys) == 1
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
------------------------------------------------------------
revno: 7.1.6
revision-id: john at arbash-meinel.com-20080701231851-rvuj7146p23sr25v
parent: john at arbash-meinel.com-20080701231134-73vr4qkr6dogf3h0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 18:18:51 -0500
message:
Return the keys themselves along with the node we care about.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
------------------------------------------------------------
revno: 7.1.5
revision-id: john at arbash-meinel.com-20080701231134-73vr4qkr6dogf3h0
parent: john at arbash-meinel.com-20080701192734-9mbeqa4zszat3z1f
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index2
timestamp: Tue 2008-07-01 18:11:34 -0500
message:
Write a helper function that can turn 2 lists of keys into a position list.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
-------------- next part --------------
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-01 22:44:10 +0000
+++ b/btree_index.py 2008-07-02 05:40:02 +0000
@@ -17,7 +17,7 @@
"""B+Tree indices"""
-from bisect import bisect_right
+from bisect import bisect_left, bisect_right
import tempfile
import zlib
@@ -325,6 +325,20 @@
self._cache_nodes([node_index])
return self._node_cache[node_index]
+ def _get_nodes(self, node_indexes):
+ """Get a bunch of nodes, from cache or disk."""
+ found = {}
+ needed = []
+ for idx in node_indexes:
+ try:
+ found[idx] = self._node_cache[idx]
+ except KeyError:
+ needed.append(idx)
+ self._cache_nodes(needed)
+ for idx in needed:
+ found[idx] = self._node_cache[idx]
+ return found
+
def iter_all_entries(self):
"""Iterate over all keys within the index.
@@ -354,6 +368,135 @@
for key, (value, refs) in node.keys.items():
yield (self, key, value)
+ @staticmethod
+ def _multi_bisect_right(in_keys, fixed_keys):
+ """Find the positions where each 'in_key' would fit in fixed_keys.
+
+ This is equivalent to doing "bisect_right" on each in_key into
+ fixed_keys
+
+ :param in_keys: A sorted list of keys to match with fixed_keys
+ :param fixed_keys: A sorted list of keys to match against
+ :return: A list of integer positions.
+ """
+ if not in_keys:
+ return []
+ if not fixed_keys:
+ raise AssertionError('we must have entries in fixed_keys')
+
+ # TODO: Iterating both lists will generally take M + N steps
+ # Bisecting each key will generally take M * log2 N steps.
+ # If we had an efficient way to compare, we could pick the method
+ # based on which has the fewer number of steps.
+ # There is also the argument that bisect_right is a compiled
+ # function, so there is even more to be gained.
+ # if (len(in_keys) + len(fixed_keys)
+ # > len(in_keys) * math.log(len(fixed_keys), 2)):
+ if len(in_keys) == 1: # Bisect will always be faster for M = 1
+ return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
+ in_keys_iter = iter(in_keys)
+ fixed_keys_iter = enumerate(fixed_keys)
+ cur_in_key = in_keys_iter.next()
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+ class InputDone(Exception): pass
+ class FixedDone(Exception): pass
+
+ output = []
+ cur_out = []
+
+ try:
+ while True:
+ if cur_in_key < cur_fixed_key:
+ cur_keys = []
+ cur_out = (cur_fixed_offset, cur_keys)
+ output.append(cur_out)
+ while cur_in_key < cur_fixed_key:
+ cur_keys.append(cur_in_key)
+ try:
+ cur_in_key = in_keys_iter.next()
+ except StopIteration:
+ raise InputDone
+ # At this point cur_in_key must be >= cur_fixed_key
+ # step the cur_fixed_key until we pass the cur key, or walk off
+ # the end
+ while cur_in_key >= cur_fixed_key:
+ try:
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+ except StopIteration:
+ raise FixedDone
+ except InputDone:
+ # We consumed all of the input, nothing more to do
+ pass
+ except FixedDone:
+ # There was some input left, but we consumed all of fixed, so we
+ # have to add one more for the tail
+ cur_keys = [cur_in_key]
+ cur_keys.extend(in_keys_iter)
+ cur_out = (len(fixed_keys), cur_keys)
+ output.append(cur_out)
+ return output
+
+ @staticmethod
+ def _multi_bisect_left(in_keys, fixed_keys):
+ """Find the positions where each 'in_key' would fit in fixed_keys.
+
+ This is equivalent to doing "bisect_left" on each in_key into
+ fixed_keys
+
+ :param in_keys: A sorted list of keys to match with fixed_keys
+ :param fixed_keys: A sorted list of keys to match against
+ :return: A list of integer positions.
+ """
+ if not in_keys:
+ return []
+ if not fixed_keys:
+ raise AssertionError('we must have entries in fixed_keys')
+ if len(in_keys) == 1: # Bisect will always be faster for M = 1
+ return [(bisect_left(fixed_keys, in_keys[0]), in_keys)]
+ in_keys_iter = iter(in_keys)
+ fixed_keys_iter = enumerate(fixed_keys)
+ cur_in_key = in_keys_iter.next()
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+ class InputDone(Exception): pass
+ class FixedDone(Exception): pass
+
+ output = []
+ cur_out = []
+
+ try:
+ while True:
+ if cur_in_key <= cur_fixed_key:
+ cur_keys = []
+ cur_out = (cur_fixed_offset, cur_keys)
+ output.append(cur_out)
+ while cur_in_key <= cur_fixed_key:
+ cur_keys.append(cur_in_key)
+ try:
+ cur_in_key = in_keys_iter.next()
+ except StopIteration:
+ raise InputDone
+ # At this point cur_in_key must be >= cur_fixed_key
+ # step the cur_fixed_key until we pass the cur key, or walk off
+ # the end
+ while cur_in_key > cur_fixed_key:
+ try:
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+ except StopIteration:
+ raise FixedDone
+ except InputDone:
+ # We consumed all of the input, nothing more to do
+ pass
+ except FixedDone:
+ # There was some input left, but we consumed all of fixed, so we
+ # have to add one more for the tail
+ cur_keys = [cur_in_key]
+ cur_keys.extend(in_keys_iter)
+ cur_out = (len(fixed_keys), cur_keys)
+ output.append(cur_out)
+ return output
+
def iter_entries(self, keys):
"""Iterate over keys within the index.
@@ -369,36 +512,34 @@
self._cache_nodes([0])
if not self._key_count:
return
- for key in keys:
- # repeated single-bisection : 'make it work'
- # FUTURE: look within the current node for the next key,
- # consider sequential scans
- # consider block reads
- # gather stats on hit rate etc
- # start at the root:
- node = self._get_node(0)
- # what row are we looking at
- row = 0
- # how many nodes have the previous rows eaten up
- offset = self._row_lengths[row]
- while type(node) != _LeafNode:
- pos = bisect_right(node.keys, key)
- # There are len(node.keys) pointers dividing len(node.keys) + 1
- # child nodes. The first pointer is the first key in the second
- # child node, and so on.
- # pos indexes into the list of child nodes, any key equal to a
- # key in node.keys indicates that the key is present to the right
- # of that divider.
- node_index = offset + node.offset + pos
- row += 1
- offset = offset + self._row_lengths[row]
- node = self._get_node(node_index)
- if key in node.keys:
- value, refs = node.keys[key]
- if self.node_ref_lists:
- yield (self, key, value, refs)
- else:
- yield (self, key, value)
+ nodes_and_keys = [(0, keys)]
+ row_offset = 0
+ for row_length in self._row_lengths[:-1]:
+ node_indexes = [idx for idx, s_keys in nodes_and_keys]
+ nodes = self._get_nodes(node_indexes)
+
+ row_offset += row_length
+
+ next_nodes_and_keys = []
+ for node_index, sub_keys in nodes_and_keys:
+ node = nodes[node_index]
+ positions = self._multi_bisect_right(sub_keys, node.keys)
+ node_offset = row_offset + node.offset
+ next_nodes_and_keys.extend([(node_offset + pos, s_keys)
+ for pos, s_keys in positions])
+ nodes_and_keys = next_nodes_and_keys
+ # We should now be at the _LeafNodes
+ node_indexes = [idx for idx, s_keys in nodes_and_keys]
+ nodes = self._get_nodes(node_indexes)
+ for node_index, sub_keys in nodes_and_keys:
+ node = nodes[node_index]
+ for key in sub_keys:
+ if key in node.keys:
+ value, refs = node.keys[key]
+ if self.node_ref_lists:
+ yield (self, key, value, refs)
+ else:
+ yield (self, key, value)
def iter_entries_prefix(self, keys):
"""Iterate over keys within the index using prefix matching.
=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py 2008-07-02 05:29:41 +0000
+++ b/tests/test_btree_index.py 2008-07-02 05:40:02 +0000
@@ -19,6 +19,7 @@
import zlib
+from bzrlib import tests
from bzrlib.plugins import index2
from bzrlib.plugins.index2 import errors, btree_index
from bzrlib.tests import TestCaseWithTransport
@@ -518,3 +519,72 @@
('11', '44'): ('value:4', ((), (('11', 'ref00'),)))
}, node.keys)
+
+class TestMultiBisectRight(tests.TestCase):
+
+ def assertMultiBisectRight(self, offsets, search_keys, fixed_keys):
+ self.assertEqual(offsets,
+ btree_index.BTreeGraphIndex._multi_bisect_right(
+ search_keys, fixed_keys))
+
+ def test_after(self):
+ self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a'])
+ self.assertMultiBisectRight([(3, ['e', 'f', 'g'])],
+ ['e', 'f', 'g'], ['a', 'b', 'c'])
+
+ def test_before(self):
+ self.assertMultiBisectRight([(0, ['a'])], ['a'], ['b'])
+ self.assertMultiBisectRight([(0, ['a', 'b', 'c', 'd'])],
+ ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
+
+ def test_exact(self):
+ self.assertMultiBisectRight([(1, ['a'])], ['a'], ['a'])
+ self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
+ self.assertMultiBisectRight([(1, ['a']), (3, ['c'])],
+ ['a', 'c'], ['a', 'b', 'c'])
+
+ def test_inbetween(self):
+ self.assertMultiBisectRight([(1, ['b'])], ['b'], ['a', 'c'])
+ self.assertMultiBisectRight([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+ ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
+
+ def test_mixed(self):
+ self.assertMultiBisectRight([(0, ['a', 'b']), (2, ['d', 'e']),
+ (4, ['g', 'h'])],
+ ['a', 'b', 'd', 'e', 'g', 'h'],
+ ['c', 'd', 'f', 'g'])
+
+
+class TestMultiBisectLeft(tests.TestCase):
+
+ def assertMultiBisectLeft(self, offsets, search_keys, fixed_keys):
+ self.assertEqual(offsets,
+ btree_index.BTreeGraphIndex._multi_bisect_left(
+ search_keys, fixed_keys))
+
+ def test_after(self):
+ self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a'])
+ self.assertMultiBisectLeft([(3, ['e', 'f', 'g'])],
+ ['e', 'f', 'g'], ['a', 'b', 'c'])
+
+ def test_before(self):
+ self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['b'])
+ self.assertMultiBisectLeft([(0, ['a', 'b', 'c', 'd'])],
+ ['a', 'b', 'c', 'd'], ['e', 'f', 'g'])
+
+ def test_exact(self):
+ self.assertMultiBisectLeft([(0, ['a'])], ['a'], ['a'])
+ self.assertMultiBisectLeft([(0, ['a']), (1, ['b'])], ['a', 'b'], ['a', 'b'])
+ self.assertMultiBisectLeft([(0, ['a']), (2, ['c'])],
+ ['a', 'c'], ['a', 'b', 'c'])
+
+ def test_inbetween(self):
+ self.assertMultiBisectLeft([(1, ['b'])], ['b'], ['a', 'c'])
+ self.assertMultiBisectLeft([(1, ['b', 'c', 'd']), (2, ['f', 'g'])],
+ ['b', 'c', 'd', 'f', 'g'], ['a', 'e', 'h'])
+
+ def test_mixed(self):
+ self.assertMultiBisectLeft([(0, ['a', 'b']), (1, ['d']), (2, ['e']),
+ (3, ['g']), (4, ['h'])],
+ ['a', 'b', 'd', 'e', 'g', 'h'],
+ ['c', 'd', 'f', 'g'])
More information about the bazaar-commits
mailing list