Rev 4635: First cut at an implementation of multi_bisect_right in pyrex. in http://bazaar.launchpad.net/~jameinel/bzr/2.0b1-multibisect
John Arbash Meinel
john at arbash-meinel.com
Thu Aug 20 21:48:56 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.0b1-multibisect
------------------------------------------------------------
revno: 4635
revision-id: john at arbash-meinel.com-20090820204851-f320qk0or7revyvr
parent: pqm at pqm.ubuntu.com-20090820135727-pz56lf0l0tj6rbrn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.0b1-multibisect
timestamp: Thu 2009-08-20 15:48:51 -0500
message:
First cut at an implementation of multi_bisect_right in pyrex.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_py.py'
--- a/bzrlib/_btree_serializer_py.py 2009-03-23 14:59:43 +0000
+++ b/bzrlib/_btree_serializer_py.py 2009-08-20 20:48:51 +0000
@@ -17,6 +17,9 @@
"""B+Tree index parsing."""
+import bisect
+
+
def _parse_leaf_lines(bytes, key_length, ref_list_length):
lines = bytes.split('\n')
nodes = []
@@ -64,3 +67,85 @@
line = ("%s\x00%s\x00%s\n" % (string_key,
'\t'.join(flattened_references), node[2]))
return string_key, line
+
+
+def _multi_bisect_right(in_keys, fixed_keys):
+ """Find the positions where each 'in_key' would fit in fixed_keys.
+
+ This is equivalent to doing "bisect_right" on each in_key into
+ fixed_keys
+
+ :param in_keys: A sorted list of keys to match with fixed_keys
+ :param fixed_keys: A sorted list of keys to match against
+ :return: A list of (integer position, [key list]) tuples.
+ """
+ if not in_keys:
+ return []
+ if not fixed_keys:
+ # no pointers in the fixed_keys list, which means everything must
+ # fall to the left.
+ return [(0, in_keys)]
+
+ # TODO: Iterating both lists will generally take M + N steps
+ # Bisecting each key will generally take M * log2 N steps.
+ # If we had an efficient way to compare, we could pick the method
+ # based on which has the fewer number of steps.
+ # There is also the argument that bisect_right is a compiled
+ # function, so there is even more to be gained.
+ # iter_steps = len(in_keys) + len(fixed_keys)
+ # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
+ if len(in_keys) == 1: # Bisect will always be faster for M = 1
+ return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
+ # elif bisect_steps < iter_steps:
+ # offsets = {}
+ # for key in in_keys:
+ # offsets.setdefault(bisect_right(fixed_keys, key),
+ # []).append(key)
+ # return [(o, offsets[o]) for o in sorted(offsets)]
+ in_keys_iter = iter(in_keys)
+ fixed_keys_iter = enumerate(fixed_keys)
+ cur_in_key = in_keys_iter.next()
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+ class InputDone(Exception): pass
+ class FixedDone(Exception): pass
+
+ output = []
+
+ # TODO: Another possibility is that rather than iterating on each side,
+ # we could use a combination of bisecting and iterating. For
+ # example, while cur_in_key < fixed_key, bisect to find its
+ # point, then iterate all matching keys, then bisect (restricted
+ # to only the remainder) for the next one, etc.
+ try:
+ while True:
+ if cur_in_key < cur_fixed_key:
+ cur_keys = []
+ cur_out = (cur_fixed_offset, cur_keys)
+ output.append(cur_out)
+ while cur_in_key < cur_fixed_key:
+ cur_keys.append(cur_in_key)
+ try:
+ cur_in_key = in_keys_iter.next()
+ except StopIteration:
+ raise InputDone
+ # At this point cur_in_key must be >= cur_fixed_key
+ # step the cur_fixed_key until we pass the cur key, or walk off
+ # the end
+ while cur_in_key >= cur_fixed_key:
+ try:
+ cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+ except StopIteration:
+ raise FixedDone
+ except InputDone:
+ # We consumed all of the input, nothing more to do
+ pass
+ except FixedDone:
+ # There was some input left, but we consumed all of fixed, so we
+ # have to add one more for the tail
+ cur_keys = [cur_in_key]
+ cur_keys.extend(in_keys_iter)
+ cur_out = (len(fixed_keys), cur_keys)
+ output.append(cur_out)
+ return output
+
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx 2009-06-22 12:52:39 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx 2009-08-20 20:48:51 +0000
@@ -29,6 +29,10 @@
ctypedef struct PyObject:
pass
int PyList_Append(object lst, object item) except -1
+ int PyList_CheckExact(object)
+ PyObject *PyList_GET_ITEM(object, Py_ssize_t i)
+ PyObject *PyList_GetItem(object, Py_ssize_t i) except NULL
+ Py_ssize_t PyList_GET_SIZE(object)
char *PyString_AsString(object p) except NULL
object PyString_FromStringAndSize(char *, Py_ssize_t)
@@ -45,6 +49,15 @@
PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
void Py_DECREF_ptr "Py_DECREF" (PyObject *)
+ int PySequence_Check(object)
+ Py_ssize_t PySequence_Size(object)
+ # Do we want to use this, or can we assume we always have a List?
+ Py_ssize_t PySequence_GetItem(object, Py_ssize_t i)
+
+
+import bisect
+
+
cdef extern from "string.h":
void *memcpy(void *dest, void *src, size_t n)
void *memchr(void *s, int c, size_t n)
@@ -411,3 +424,92 @@
out = out + value_len
out[0] = c'\n'
return string_key, line
+
+
+cdef object bisect_right
+bisect_right = bisect.bisect_right
+
+
+def _multi_bisect_right(in_keys, fixed_keys):
+ """Find the positions where each 'in_key' would fit in fixed_keys.
+
+ This is equivalent to doing "bisect_right" on each in_key into
+ fixed_keys
+
+ :param in_keys: A sorted list of keys to match with fixed_keys
+ :param fixed_keys: A sorted list of keys to match against
+ :return: A list of (integer position, [key list]) tuples.
+ """
+ cdef PyObject *temp
+ cdef long in_pos, fixed_pos, in_length, fixed_length
+
+ if not PyList_CheckExact(in_keys):
+ raise TypeError('in_keys must be a list')
+ if not PyList_CheckExact(fixed_keys):
+ raise TypeError('fixed_keys must be a list')
+
+ in_length = PyList_GET_SIZE(in_keys)
+ if in_length == 0:
+ return []
+ fixed_length = PyList_GET_SIZE(fixed_keys)
+ if fixed_length == 0:
+ # no pointers in the fixed_keys list, which means everything must
+ # fall to the left.
+ return [(0, in_keys)]
+
+ # Find the possible location for the first entry using bisection
+ pos = bisect_right(fixed_keys, in_keys[0])
+ if PyList_GET_SIZE(in_keys) == 1:
+ # If there is only one key, then we are done.
+ return [(pos, in_keys)]
+ fixed_pos = pos
+ if fixed_pos >= fixed_length:
+ # All the keys occur after the last fixed_key
+ return [(fixed_length, in_keys)]
+
+ output = []
+
+ # TODO: Another possibility is that rather than iterating on each side,
+ # we could use a combination of bisecting and iterating. For
+ # example, while cur_in_key < fixed_key, bisect to find its
+ # point, then iterate all matching keys, then bisect (restricted
+ # to only the remainder) for the next one, etc.
+ in_pos = 0
+ temp = PyList_GetItem(in_keys, in_pos)
+ in_key = <object>temp
+ temp = PyList_GetItem(fixed_keys, fixed_pos)
+ fixed_key = <object>temp
+ while in_pos < in_length and fixed_pos < fixed_length:
+ if in_key < fixed_key:
+ cur_keys = []
+ PyList_Append(output, (fixed_pos, cur_keys))
+ while in_key < fixed_key:
+ PyList_Append(cur_keys, in_key)
+ in_pos = in_pos + 1
+ if in_pos >= in_length:
+ # We have handled all of the keys
+ return output
+ temp = PyList_GetItem(in_keys, in_pos)
+ in_key = <object>temp
+ # At this point cur_in_key must be >= cur_fixed_key
+ # step the cur_fixed_key until we pass the cur key, or walk off
+ # the end
+ while in_key >= fixed_key:
+ fixed_pos = fixed_pos + 1
+ if fixed_pos >= fixed_length:
+ break
+ temp = PyList_GetItem(fixed_keys, fixed_pos)
+ fixed_key = <object>temp
+
+ if in_pos < in_length:
+ # There was some input left, but we consumed all of fixed, so we
+ # have to add one more for the tail
+ cur_keys = [in_key]
+ in_pos = in_pos + 1
+ PyList_Append(output, (fixed_length, cur_keys))
+ while in_pos < in_length:
+ temp = PyList_GetItem(in_keys, in_pos)
+ in_key = <object>temp
+ PyList_Append(cur_keys, in_key)
+ in_pos = in_pos + 1
+ return output
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2009-08-17 22:11:06 +0000
+++ b/bzrlib/btree_index.py 2009-08-20 20:48:51 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2008 Canonical Ltd
+# Copyright (C) 2008, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -17,7 +17,7 @@
"""B+Tree indices"""
-from bisect import bisect_right
+import gc
import math
import tempfile
import zlib
@@ -963,88 +963,6 @@
for key, (value, refs) in sorted(node.keys.items()):
yield (self, key, value)
- @staticmethod
- def _multi_bisect_right(in_keys, fixed_keys):
- """Find the positions where each 'in_key' would fit in fixed_keys.
-
- This is equivalent to doing "bisect_right" on each in_key into
- fixed_keys
-
- :param in_keys: A sorted list of keys to match with fixed_keys
- :param fixed_keys: A sorted list of keys to match against
- :return: A list of (integer position, [key list]) tuples.
- """
- if not in_keys:
- return []
- if not fixed_keys:
- # no pointers in the fixed_keys list, which means everything must
- # fall to the left.
- return [(0, in_keys)]
-
- # TODO: Iterating both lists will generally take M + N steps
- # Bisecting each key will generally take M * log2 N steps.
- # If we had an efficient way to compare, we could pick the method
- # based on which has the fewer number of steps.
- # There is also the argument that bisect_right is a compiled
- # function, so there is even more to be gained.
- # iter_steps = len(in_keys) + len(fixed_keys)
- # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
- if len(in_keys) == 1: # Bisect will always be faster for M = 1
- return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
- # elif bisect_steps < iter_steps:
- # offsets = {}
- # for key in in_keys:
- # offsets.setdefault(bisect_right(fixed_keys, key),
- # []).append(key)
- # return [(o, offsets[o]) for o in sorted(offsets)]
- in_keys_iter = iter(in_keys)
- fixed_keys_iter = enumerate(fixed_keys)
- cur_in_key = in_keys_iter.next()
- cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
-
- class InputDone(Exception): pass
- class FixedDone(Exception): pass
-
- output = []
- cur_out = []
-
- # TODO: Another possibility is that rather than iterating on each side,
- # we could use a combination of bisecting and iterating. For
- # example, while cur_in_key < fixed_key, bisect to find its
- # point, then iterate all matching keys, then bisect (restricted
- # to only the remainder) for the next one, etc.
- try:
- while True:
- if cur_in_key < cur_fixed_key:
- cur_keys = []
- cur_out = (cur_fixed_offset, cur_keys)
- output.append(cur_out)
- while cur_in_key < cur_fixed_key:
- cur_keys.append(cur_in_key)
- try:
- cur_in_key = in_keys_iter.next()
- except StopIteration:
- raise InputDone
- # At this point cur_in_key must be >= cur_fixed_key
- # step the cur_fixed_key until we pass the cur key, or walk off
- # the end
- while cur_in_key >= cur_fixed_key:
- try:
- cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
- except StopIteration:
- raise FixedDone
- except InputDone:
- # We consumed all of the input, nothing more to do
- pass
- except FixedDone:
- # There was some input left, but we consumed all of fixed, so we
- # have to add one more for the tail
- cur_keys = [cur_in_key]
- cur_keys.extend(in_keys_iter)
- cur_out = (len(fixed_keys), cur_keys)
- output.append(cur_out)
- return output
-
def _walk_through_internal_nodes(self, keys):
"""Take the given set of keys, and find the corresponding LeafNodes.
@@ -1065,7 +983,8 @@
next_nodes_and_keys = []
for node_index, sub_keys in keys_at_index:
node = nodes[node_index]
- positions = self._multi_bisect_right(sub_keys, node.keys)
+ positions = _btree_serializer._multi_bisect_right(sub_keys,
+ node.keys)
node_offset = next_row_start + node.offset
next_nodes_and_keys.extend([(node_offset + pos, s_keys)
for pos, s_keys in positions])
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2009-08-13 19:56:26 +0000
+++ b/bzrlib/tests/test_btree_index.py 2009-08-20 20:48:51 +0000
@@ -38,7 +38,7 @@
def load_tests(standard_tests, module, loader):
# parameterise the TestBTreeNodes tests
node_tests, others = split_suite_by_condition(standard_tests,
- condition_isinstance(TestBTreeNodes))
+ condition_isinstance((TestBTreeNodes, TestMultiBisectRight)))
import bzrlib._btree_serializer_py as py_module
scenarios = [('python', {'parse_btree': py_module})]
if CompiledBtreeParserFeature.available():
@@ -1249,7 +1249,7 @@
def assertMultiBisectRight(self, offsets, search_keys, fixed_keys):
self.assertEqual(offsets,
- btree_index.BTreeGraphIndex._multi_bisect_right(
+ self.parse_btree._multi_bisect_right(
search_keys, fixed_keys))
def test_after(self):
@@ -1264,7 +1264,8 @@
def test_exact(self):
self.assertMultiBisectRight([(1, ['a'])], ['a'], ['a'])
- self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
+ self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'],
+ ['a', 'b'])
self.assertMultiBisectRight([(1, ['a']), (3, ['c'])],
['a', 'c'], ['a', 'b', 'c'])
More information about the bazaar-commits
mailing list