Rev 4635: First cut at an implementation of multi_bisect_right in pyrex. in http://bazaar.launchpad.net/~jameinel/bzr/2.0b1-multibisect

Thu Aug 20 21:48:56 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/2.0b1-multibisect

------------------------------------------------------------
revno: 4635
revision-id: john at arbash-meinel.com-20090820204851-f320qk0or7revyvr
parent: pqm at pqm.ubuntu.com-20090820135727-pz56lf0l0tj6rbrn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.0b1-multibisect
timestamp: Thu 2009-08-20 15:48:51 -0500
message:
  First cut at an implementation of multi_bisect_right in pyrex.
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_py.py'

--- a/bzrlib/_btree_serializer_py.py	2009-03-23 14:59:43 +0000
+++ b/bzrlib/_btree_serializer_py.py	2009-08-20 20:48:51 +0000
@@ -17,6 +17,9 @@
 
 """B+Tree index parsing."""
 
+import bisect
+
+
 def _parse_leaf_lines(bytes, key_length, ref_list_length):
     lines = bytes.split('\n')
     nodes = []
@@ -64,3 +67,85 @@
     line = ("%s\x00%s\x00%s\n" % (string_key,
         '\t'.join(flattened_references), node[2]))
     return string_key, line
+
+
+def _multi_bisect_right(in_keys, fixed_keys):
+    """Find the positions where each 'in_key' would fit in fixed_keys.
+
+    This is equivalent to doing "bisect_right" on each in_key into
+    fixed_keys
+
+    :param in_keys: A sorted list of keys to match with fixed_keys
+    :param fixed_keys: A sorted list of keys to match against
+    :return: A list of (integer position, [key list]) tuples.
+    """
+    if not in_keys:
+        return []
+    if not fixed_keys:
+        # no pointers in the fixed_keys list, which means everything must
+        # fall to the left.
+        return [(0, in_keys)]
+
+    # TODO: Iterating both lists will generally take M + N steps
+    #       Bisecting each key will generally take M * log2 N steps.
+    #       If we had an efficient way to compare, we could pick the method
+    #       based on which has the fewer number of steps.
+    #       There is also the argument that bisect_right is a compiled
+    #       function, so there is even more to be gained.
+    # iter_steps = len(in_keys) + len(fixed_keys)
+    # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
+    if len(in_keys) == 1: # Bisect will always be faster for M = 1
+        return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
+    # elif bisect_steps < iter_steps:
+    #     offsets = {}
+    #     for key in in_keys:
+    #         offsets.setdefault(bisect_right(fixed_keys, key),
+    #                            []).append(key)
+    #     return [(o, offsets[o]) for o in sorted(offsets)]
+    in_keys_iter = iter(in_keys)
+    fixed_keys_iter = enumerate(fixed_keys)
+    cur_in_key = in_keys_iter.next()
+    cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+
+    class InputDone(Exception): pass
+    class FixedDone(Exception): pass
+
+    output = []
+
+    # TODO: Another possibility is that rather than iterating on each side,
+    #       we could use a combination of bisecting and iterating. For
+    #       example, while cur_in_key < fixed_key, bisect to find its
+    #       point, then iterate all matching keys, then bisect (restricted
+    #       to only the remainder) for the next one, etc.
+    try:
+        while True:
+            if cur_in_key < cur_fixed_key:
+                cur_keys = []
+                cur_out = (cur_fixed_offset, cur_keys)
+                output.append(cur_out)
+                while cur_in_key < cur_fixed_key:
+                    cur_keys.append(cur_in_key)
+                    try:
+                        cur_in_key = in_keys_iter.next()
+                    except StopIteration:
+                        raise InputDone
+                # At this point cur_in_key must be >= cur_fixed_key
+            # step the cur_fixed_key until we pass the cur key, or walk off
+            # the end
+            while cur_in_key >= cur_fixed_key:
+                try:
+                    cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
+                except StopIteration:
+                    raise FixedDone
+    except InputDone:
+        # We consumed all of the input, nothing more to do
+        pass
+    except FixedDone:
+        # There was some input left, but we consumed all of fixed, so we
+        # have to add one more for the tail
+        cur_keys = [cur_in_key]
+        cur_keys.extend(in_keys_iter)
+        cur_out = (len(fixed_keys), cur_keys)
+        output.append(cur_out)
+    return output
+

=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx	2009-06-22 12:52:39 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx	2009-08-20 20:48:51 +0000
@@ -29,6 +29,10 @@
     ctypedef struct PyObject:
         pass
     int PyList_Append(object lst, object item) except -1
+    int PyList_CheckExact(object)
+    PyObject *PyList_GET_ITEM(object, Py_ssize_t i)
+    PyObject *PyList_GetItem(object, Py_ssize_t i) except NULL
+    Py_ssize_t PyList_GET_SIZE(object)
 
     char *PyString_AsString(object p) except NULL
     object PyString_FromStringAndSize(char *, Py_ssize_t)
@@ -45,6 +49,15 @@
     PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
     void Py_DECREF_ptr "Py_DECREF" (PyObject *)
 
+    int PySequence_Check(object)
+    Py_ssize_t PySequence_Size(object)
+    # Do we want to use this, or can we assume we always have a List?
+    Py_ssize_t PySequence_GetItem(object, Py_ssize_t i)
+
+
+import bisect
+
+
 cdef extern from "string.h":
     void *memcpy(void *dest, void *src, size_t n)
     void *memchr(void *s, int c, size_t n)
@@ -411,3 +424,92 @@
     out = out + value_len
     out[0] = c'\n'
     return string_key, line
+
+
+cdef object bisect_right
+bisect_right = bisect.bisect_right
+
+
+def _multi_bisect_right(in_keys, fixed_keys):
+    """Find the positions where each 'in_key' would fit in fixed_keys.
+
+    This is equivalent to doing "bisect_right" on each in_key into
+    fixed_keys
+
+    :param in_keys: A sorted list of keys to match with fixed_keys
+    :param fixed_keys: A sorted list of keys to match against
+    :return: A list of (integer position, [key list]) tuples.
+    """
+    cdef PyObject *temp
+    cdef long in_pos, fixed_pos, in_length, fixed_length
+
+    if not PyList_CheckExact(in_keys):
+        raise TypeError('in_keys must be a list')
+    if not PyList_CheckExact(fixed_keys):
+        raise TypeError('fixed_keys must be a list')
+
+    in_length = PyList_GET_SIZE(in_keys)
+    if in_length == 0:
+        return []
+    fixed_length = PyList_GET_SIZE(fixed_keys)
+    if fixed_length == 0:
+        # no pointers in the fixed_keys list, which means everything must
+        # fall to the left.
+        return [(0, in_keys)]
+
+    # Find the possible location for the first entry using bisection
+    pos = bisect_right(fixed_keys, in_keys[0])
+    if PyList_GET_SIZE(in_keys) == 1:
+        # If there is only one key, then we are done.
+        return [(pos, in_keys)]
+    fixed_pos = pos
+    if fixed_pos >= fixed_length:
+        # All the keys occur after the last fixed_key
+        return [(fixed_length, in_keys)]
+
+    output = []
+
+    # TODO: Another possibility is that rather than iterating on each side,
+    #       we could use a combination of bisecting and iterating. For
+    #       example, while cur_in_key < fixed_key, bisect to find its
+    #       point, then iterate all matching keys, then bisect (restricted
+    #       to only the remainder) for the next one, etc.
+    in_pos = 0
+    temp = PyList_GetItem(in_keys, in_pos)
+    in_key = <object>temp
+    temp = PyList_GetItem(fixed_keys, fixed_pos)
+    fixed_key = <object>temp
+    while in_pos < in_length and fixed_pos < fixed_length:
+        if in_key < fixed_key:
+            cur_keys = []
+            PyList_Append(output, (fixed_pos, cur_keys))
+            while in_key < fixed_key:
+                PyList_Append(cur_keys, in_key)
+                in_pos = in_pos + 1
+                if in_pos >= in_length:
+                    # We have handled all of the keys
+                    return output
+                temp = PyList_GetItem(in_keys, in_pos)
+                in_key = <object>temp
+        # At this point cur_in_key must be >= cur_fixed_key
+        # step the cur_fixed_key until we pass the cur key, or walk off
+        # the end
+        while in_key >= fixed_key:
+            fixed_pos = fixed_pos + 1
+            if fixed_pos >= fixed_length:
+                break
+            temp = PyList_GetItem(fixed_keys, fixed_pos)
+            fixed_key = <object>temp
+
+    if in_pos < in_length:
+        # There was some input left, but we consumed all of fixed, so we
+        # have to add one more for the tail
+        cur_keys = [in_key]
+        in_pos = in_pos + 1
+        PyList_Append(output, (fixed_length, cur_keys))
+        while in_pos < in_length:
+            temp = PyList_GetItem(in_keys, in_pos)
+            in_key = <object>temp
+            PyList_Append(cur_keys, in_key)
+            in_pos = in_pos + 1
+    return output

=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2009-08-17 22:11:06 +0000
+++ b/bzrlib/btree_index.py	2009-08-20 20:48:51 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2008 Canonical Ltd
+# Copyright (C) 2008, 2009 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -17,7 +17,7 @@
 
 """B+Tree indices"""
 
-from bisect import bisect_right
+import gc
 import math
 import tempfile
 import zlib
@@ -963,88 +963,6 @@
                 for key, (value, refs) in sorted(node.keys.items()):
                     yield (self, key, value)
 
-    @staticmethod
-    def _multi_bisect_right(in_keys, fixed_keys):
-        """Find the positions where each 'in_key' would fit in fixed_keys.
-
-        This is equivalent to doing "bisect_right" on each in_key into
-        fixed_keys
-
-        :param in_keys: A sorted list of keys to match with fixed_keys
-        :param fixed_keys: A sorted list of keys to match against
-        :return: A list of (integer position, [key list]) tuples.
-        """
-        if not in_keys:
-            return []
-        if not fixed_keys:
-            # no pointers in the fixed_keys list, which means everything must
-            # fall to the left.
-            return [(0, in_keys)]
-
-        # TODO: Iterating both lists will generally take M + N steps
-        #       Bisecting each key will generally take M * log2 N steps.
-        #       If we had an efficient way to compare, we could pick the method
-        #       based on which has the fewer number of steps.
-        #       There is also the argument that bisect_right is a compiled
-        #       function, so there is even more to be gained.
-        # iter_steps = len(in_keys) + len(fixed_keys)
-        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
-        if len(in_keys) == 1: # Bisect will always be faster for M = 1
-            return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
-        # elif bisect_steps < iter_steps:
-        #     offsets = {}
-        #     for key in in_keys:
-        #         offsets.setdefault(bisect_right(fixed_keys, key),
-        #                            []).append(key)
-        #     return [(o, offsets[o]) for o in sorted(offsets)]
-        in_keys_iter = iter(in_keys)
-        fixed_keys_iter = enumerate(fixed_keys)
-        cur_in_key = in_keys_iter.next()
-        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
-
-        class InputDone(Exception): pass
-        class FixedDone(Exception): pass
-
-        output = []
-        cur_out = []
-
-        # TODO: Another possibility is that rather than iterating on each side,
-        #       we could use a combination of bisecting and iterating. For
-        #       example, while cur_in_key < fixed_key, bisect to find its
-        #       point, then iterate all matching keys, then bisect (restricted
-        #       to only the remainder) for the next one, etc.
-        try:
-            while True:
-                if cur_in_key < cur_fixed_key:
-                    cur_keys = []
-                    cur_out = (cur_fixed_offset, cur_keys)
-                    output.append(cur_out)
-                    while cur_in_key < cur_fixed_key:
-                        cur_keys.append(cur_in_key)
-                        try:
-                            cur_in_key = in_keys_iter.next()
-                        except StopIteration:
-                            raise InputDone
-                    # At this point cur_in_key must be >= cur_fixed_key
-                # step the cur_fixed_key until we pass the cur key, or walk off
-                # the end
-                while cur_in_key >= cur_fixed_key:
-                    try:
-                        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
-                    except StopIteration:
-                        raise FixedDone
-        except InputDone:
-            # We consumed all of the input, nothing more to do
-            pass
-        except FixedDone:
-            # There was some input left, but we consumed all of fixed, so we
-            # have to add one more for the tail
-            cur_keys = [cur_in_key]
-            cur_keys.extend(in_keys_iter)
-            cur_out = (len(fixed_keys), cur_keys)
-            output.append(cur_out)
-        return output
-
     def _walk_through_internal_nodes(self, keys):
         """Take the given set of keys, and find the corresponding LeafNodes.
 
@@ -1065,7 +983,8 @@
             next_nodes_and_keys = []
             for node_index, sub_keys in keys_at_index:
                 node = nodes[node_index]
-                positions = self._multi_bisect_right(sub_keys, node.keys)
+                positions = _btree_serializer._multi_bisect_right(sub_keys,
+                                                                  node.keys)
                 node_offset = next_row_start + node.offset
                 next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                            for pos, s_keys in positions])

=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2009-08-13 19:56:26 +0000
+++ b/bzrlib/tests/test_btree_index.py	2009-08-20 20:48:51 +0000
@@ -38,7 +38,7 @@
 def load_tests(standard_tests, module, loader):
     # parameterise the TestBTreeNodes tests
     node_tests, others = split_suite_by_condition(standard_tests,
-        condition_isinstance(TestBTreeNodes))
+        condition_isinstance((TestBTreeNodes, TestMultiBisectRight)))
     import bzrlib._btree_serializer_py as py_module
     scenarios = [('python', {'parse_btree': py_module})]
     if CompiledBtreeParserFeature.available():
@@ -1249,7 +1249,7 @@
 
     def assertMultiBisectRight(self, offsets, search_keys, fixed_keys):
         self.assertEqual(offsets,
-                         btree_index.BTreeGraphIndex._multi_bisect_right(
+                         self.parse_btree._multi_bisect_right(
                             search_keys, fixed_keys))
 
     def test_after(self):
@@ -1264,7 +1264,8 @@
 
     def test_exact(self):
         self.assertMultiBisectRight([(1, ['a'])], ['a'], ['a'])
-        self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'], ['a', 'b'])
+        self.assertMultiBisectRight([(1, ['a']), (2, ['b'])], ['a', 'b'],
+                                    ['a', 'b'])
         self.assertMultiBisectRight([(1, ['a']), (3, ['c'])],
                                     ['a', 'c'], ['a', 'b', 'c'])