Rev 2893: * New module ``bzrlib.bisect_multi`` with generic multiple-bisection-at-once in http://people.ubuntu.com/~robertc/baz2.0/index
Robert Collins
robertc at robertcollins.net
Fri Oct 5 11:45:35 BST 2007
At http://people.ubuntu.com/~robertc/baz2.0/index
------------------------------------------------------------
revno: 2893
revision-id: robertc at robertcollins.net-20071005104511-e1uy11glm79wrjtb
parent: robertc at robertcollins.net-20071005045703-ndqi4np2zr4gu3jr
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Fri 2007-10-05 20:45:11 +1000
message:
* New module ``bzrlib.bisect_multi`` with generic multiple-bisection-at-once
logic, currently only available for byte-based lookup
(``bisect_multi_bytes``). (Robert Collins)
added:
bzrlib/bisect_multi.py bisect_multi.py-20071005104357-0vymd381la7ew4o1-1
bzrlib/tests/test_bisect_multi.py test_bisect_multi.py-20071005104357-0vymd381la7ew4o1-2
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/tests/__init__.py selftest.py-20050531073622-8d0e3c8845c97a64
=== added file 'bzrlib/bisect_multi.py'
--- a/bzrlib/bisect_multi.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/bisect_multi.py 2007-10-05 10:45:11 +0000
@@ -0,0 +1,64 @@
+# Copyright (C) 2007 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Bisection lookup multiple keys."""
+
+__all__ = [
+ 'bisect_multi_bytes',
+ ]
+
+
+def bisect_multi_bytes(content_lookup, size, keys):
+ """Perform bisection lookups for keys using byte based addressing.
+
+ The keys are looked up via the content_lookup routine. The content_lookup
+ routine gives bisect_multi_bytes information about where to keep looking up
+ to find the data for the key, and bisect_multi_bytes feeds this back into
+ the lookup function until the search is complete. The search is complete
+ when the list of keys which have returned something other than -1 or +1 is
+ empty. Keys which are not found are not returned to the caller.
+
+ :param content_lookup: A callable that takes a list of (offset, key) pairs
+ and returns a list of result tuples ((offset, key), result). Each
+ result can be one of:
+ -1: The key comes earlier in the content.
+ False: The key is not present in the content.
+ +1: The key comes later in the content.
+ Any other value: A final result to return to the caller.
+ :param size: The length of the content.
+ :param keys: The keys to bisect for.
+ :return: An iterator of the results.
+ """
+ # possibly make this a generator, but a list meets the contract for now.
+ result = []
+ delta = size // 2
+ search_keys = [(delta, key) for key in keys]
+ while search_keys:
+ search_results = content_lookup(search_keys)
+ if delta > 1:
+ delta = delta // 2
+ search_keys = []
+ for (location, key), status in search_results:
+ if status == -1:
+ search_keys.append((location - delta, key))
+ elif status == 1:
+ search_keys.append((location + delta, key))
+ elif status == False:
+ # not present, stop searching
+ continue
+ else:
+ result.append((key, status))
+ return result
=== added file 'bzrlib/tests/test_bisect_multi.py'
--- a/bzrlib/tests/test_bisect_multi.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_bisect_multi.py 2007-10-05 10:45:11 +0000
@@ -0,0 +1,344 @@
+# Copyright (C) 2007 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+"""Tests for bisect_multi."""
+
+from bzrlib import errors
+from bzrlib.bisect_multi import bisect_multi_bytes
+from bzrlib.tests import TestCase
+
+
+class TestBisectMultiBytes(TestCase):
+
+ def test_lookup_no_keys_no_calls(self):
+ calls = []
+ def missing_content(location_keys):
+ calls.append(location_keys)
+ return ((location_key, False) for location_key in location_keys)
+ self.assertEqual([],
+ list(bisect_multi_bytes(missing_content, 100, [])))
+ self.assertEqual([], calls)
+
+ def test_lookup_missing_key_no_content(self):
+ """Doing a lookup in a zero-length file still does a single request.
+
+ This makes sense because the bisector cannot tell how long content is
+ and its more flexible to only stop when the content object says 'False'
+ for a given location, key pair.
+ """
+ calls = []
+ def missing_content(location_keys):
+ calls.append(location_keys)
+ return ((location_key, False) for location_key in location_keys)
+ self.assertEqual([],
+ list(bisect_multi_bytes(missing_content, 0, ['foo', 'bar'])))
+ self.assertEqual([[(0, 'foo'), (0, 'bar')]], calls)
+
+ def test_lookup_missing_key_before_all_others(self):
+ calls = []
+ def missing_first_content(location_keys):
+ # returns -1 for all keys unless the byte offset is 0 when it
+ # returns False
+ calls.append(location_keys)
+ result = []
+ for location_key in location_keys:
+ if location_key[0] == 0:
+ result.append((location_key, False))
+ else:
+ result.append((location_key, -1))
+ return result
+ # given a 0 length file, this should terminate with one call.
+ self.assertEqual([],
+ list(bisect_multi_bytes(missing_first_content, 0, ['foo', 'bar'])))
+ self.assertEqual([[(0, 'foo'), (0, 'bar')]], calls)
+ del calls[:]
+ # given a 2 length file, this should make two calls - 1, 0.
+ self.assertEqual([],
+ list(bisect_multi_bytes(missing_first_content, 2, ['foo', 'bar'])))
+ self.assertEqual([
+ [(1, 'foo'), (1, 'bar')],
+ [(0, 'foo'), (0, 'bar')],
+ ], calls)
+ del calls[:]
+ # given a really long file - 200MB, this should make a series of calls with the
+ # gap between adjactent calls dropping by 50% each time. We choose a
+ # length which just under a power of two to generate a corner case in
+ # bisection - naively using power of two reduction in size can lead to
+ # a very long tail in the bisection process. The current users of
+ # the bisect_multi_bytes api are not expected to be concerned by this,
+ # as the delta gets down to 4K (the minimum we expect to read and
+ # parse) within 16 steps even on a 200MB index (which at 4 keys/K is
+ # 800 thousand keys, and log2 of 800000 is 19 - so we're doing log2
+ # steps in the worst case there.
+ self.assertEqual([],
+ list(bisect_multi_bytes(
+ missing_first_content,268435456 - 1 , ['foo', 'bar'])))
+ self.assertEqual([
+ [(134217727, 'foo'), (134217727, 'bar')],
+ [(67108864, 'foo'), (67108864, 'bar')],
+ [(33554433, 'foo'), (33554433, 'bar')],
+ [(16777218, 'foo'), (16777218, 'bar')],
+ [(8388611, 'foo'), (8388611, 'bar')],
+ [(4194308, 'foo'), (4194308, 'bar')],
+ [(2097157, 'foo'), (2097157, 'bar')],
+ [(1048582, 'foo'), (1048582, 'bar')],
+ [(524295, 'foo'), (524295, 'bar')],
+ [(262152, 'foo'), (262152, 'bar')],
+ [(131081, 'foo'), (131081, 'bar')],
+ [(65546, 'foo'), (65546, 'bar')],
+ [(32779, 'foo'), (32779, 'bar')],
+ [(16396, 'foo'), (16396, 'bar')],
+ [(8205, 'foo'), (8205, 'bar')],
+ [(4110, 'foo'), (4110, 'bar')],
+ [(2063, 'foo'), (2063, 'bar')],
+ [(1040, 'foo'), (1040, 'bar')],
+ [(529, 'foo'), (529, 'bar')],
+ [(274, 'foo'), (274, 'bar')],
+ [(147, 'foo'), (147, 'bar')],
+ [(84, 'foo'), (84, 'bar')],
+ [(53, 'foo'), (53, 'bar')],
+ [(38, 'foo'), (38, 'bar')],
+ [(31, 'foo'), (31, 'bar')],
+ [(28, 'foo'), (28, 'bar')],
+ [(27, 'foo'), (27, 'bar')],
+ [(26, 'foo'), (26, 'bar')],
+ [(25, 'foo'), (25, 'bar')],
+ [(24, 'foo'), (24, 'bar')],
+ [(23, 'foo'), (23, 'bar')],
+ [(22, 'foo'), (22, 'bar')],
+ [(21, 'foo'), (21, 'bar')],
+ [(20, 'foo'), (20, 'bar')],
+ [(19, 'foo'), (19, 'bar')],
+ [(18, 'foo'), (18, 'bar')],
+ [(17, 'foo'), (17, 'bar')],
+ [(16, 'foo'), (16, 'bar')],
+ [(15, 'foo'), (15, 'bar')],
+ [(14, 'foo'), (14, 'bar')],
+ [(13, 'foo'), (13, 'bar')],
+ [(12, 'foo'), (12, 'bar')],
+ [(11, 'foo'), (11, 'bar')],
+ [(10, 'foo'), (10, 'bar')],
+ [(9, 'foo'), (9, 'bar')],
+ [(8, 'foo'), (8, 'bar')],
+ [(7, 'foo'), (7, 'bar')],
+ [(6, 'foo'), (6, 'bar')],
+ [(5, 'foo'), (5, 'bar')],
+ [(4, 'foo'), (4, 'bar')],
+ [(3, 'foo'), (3, 'bar')],
+ [(2, 'foo'), (2, 'bar')],
+ [(1, 'foo'), (1, 'bar')],
+ [(0, 'foo'), (0, 'bar')],
+ ], calls)
+
+ def test_lookup_missing_key_after_all_others(self):
+ calls = []
+ end = None
+ def missing_last_content(location_keys):
+ # returns +1 for all keys unless the byte offset is 'end' when it
+ # returns False
+ calls.append(location_keys)
+ result = []
+ for location_key in location_keys:
+ if location_key[0] == end:
+ result.append((location_key, False))
+ else:
+ result.append((location_key, +1))
+ return result
+ # given a 0 length file, this should terminate with one call.
+ end = 0
+ self.assertEqual([],
+ list(bisect_multi_bytes(missing_last_content, 0, ['foo', 'bar'])))
+ self.assertEqual([[(0, 'foo'), (0, 'bar')]], calls)
+ del calls[:]
+ end = 2
+ # given a 3 length file, this should make two calls - 1, 2.
+ self.assertEqual([],
+ list(bisect_multi_bytes(missing_last_content, 3, ['foo', 'bar'])))
+ self.assertEqual([
+ [(1, 'foo'), (1, 'bar')],
+ [(2, 'foo'), (2, 'bar')],
+ ], calls)
+ del calls[:]
+ end = 268435456 - 2
+ # see the really-big lookup series in
+ # test_lookup_missing_key_before_all_others for details about this
+ # assertion.
+ self.assertEqual([],
+ list(bisect_multi_bytes(
+ missing_last_content,268435456 - 1 , ['foo', 'bar'])))
+ self.assertEqual([
+ [(134217727, 'foo'), (134217727, 'bar')],
+ [(201326590, 'foo'), (201326590, 'bar')],
+ [(234881021, 'foo'), (234881021, 'bar')],
+ [(251658236, 'foo'), (251658236, 'bar')],
+ [(260046843, 'foo'), (260046843, 'bar')],
+ [(264241146, 'foo'), (264241146, 'bar')],
+ [(266338297, 'foo'), (266338297, 'bar')],
+ [(267386872, 'foo'), (267386872, 'bar')],
+ [(267911159, 'foo'), (267911159, 'bar')],
+ [(268173302, 'foo'), (268173302, 'bar')],
+ [(268304373, 'foo'), (268304373, 'bar')],
+ [(268369908, 'foo'), (268369908, 'bar')],
+ [(268402675, 'foo'), (268402675, 'bar')],
+ [(268419058, 'foo'), (268419058, 'bar')],
+ [(268427249, 'foo'), (268427249, 'bar')],
+ [(268431344, 'foo'), (268431344, 'bar')],
+ [(268433391, 'foo'), (268433391, 'bar')],
+ [(268434414, 'foo'), (268434414, 'bar')],
+ [(268434925, 'foo'), (268434925, 'bar')],
+ [(268435180, 'foo'), (268435180, 'bar')],
+ [(268435307, 'foo'), (268435307, 'bar')],
+ [(268435370, 'foo'), (268435370, 'bar')],
+ [(268435401, 'foo'), (268435401, 'bar')],
+ [(268435416, 'foo'), (268435416, 'bar')],
+ [(268435423, 'foo'), (268435423, 'bar')],
+ [(268435426, 'foo'), (268435426, 'bar')],
+ [(268435427, 'foo'), (268435427, 'bar')],
+ [(268435428, 'foo'), (268435428, 'bar')],
+ [(268435429, 'foo'), (268435429, 'bar')],
+ [(268435430, 'foo'), (268435430, 'bar')],
+ [(268435431, 'foo'), (268435431, 'bar')],
+ [(268435432, 'foo'), (268435432, 'bar')],
+ [(268435433, 'foo'), (268435433, 'bar')],
+ [(268435434, 'foo'), (268435434, 'bar')],
+ [(268435435, 'foo'), (268435435, 'bar')],
+ [(268435436, 'foo'), (268435436, 'bar')],
+ [(268435437, 'foo'), (268435437, 'bar')],
+ [(268435438, 'foo'), (268435438, 'bar')],
+ [(268435439, 'foo'), (268435439, 'bar')],
+ [(268435440, 'foo'), (268435440, 'bar')],
+ [(268435441, 'foo'), (268435441, 'bar')],
+ [(268435442, 'foo'), (268435442, 'bar')],
+ [(268435443, 'foo'), (268435443, 'bar')],
+ [(268435444, 'foo'), (268435444, 'bar')],
+ [(268435445, 'foo'), (268435445, 'bar')],
+ [(268435446, 'foo'), (268435446, 'bar')],
+ [(268435447, 'foo'), (268435447, 'bar')],
+ [(268435448, 'foo'), (268435448, 'bar')],
+ [(268435449, 'foo'), (268435449, 'bar')],
+ [(268435450, 'foo'), (268435450, 'bar')],
+ [(268435451, 'foo'), (268435451, 'bar')],
+ [(268435452, 'foo'), (268435452, 'bar')],
+ [(268435453, 'foo'), (268435453, 'bar')],
+ [(268435454, 'foo'), (268435454, 'bar')]
+ ], calls)
+
+ def test_lookup_when_a_key_is_missing_continues(self):
+ calls = []
+ def missing_foo_otherwise_missing_first_content(location_keys):
+ # returns -1 for all keys unless the byte offset is 0 when it
+ # returns False
+ calls.append(location_keys)
+ result = []
+ for location_key in location_keys:
+ if location_key[1] == 'foo' or location_key[0] == 0:
+ result.append((location_key, False))
+ else:
+ result.append((location_key, -1))
+ return result
+ # given a 2 length file, this should terminate with two calls, one for
+ # both keys, and one for bar only.
+ self.assertEqual([],
+ list(bisect_multi_bytes(
+ missing_foo_otherwise_missing_first_content, 2,
+ ['foo', 'bar'])))
+ self.assertEqual([
+ [(1, 'foo'), (1, 'bar')],
+ [(0, 'bar')],
+ ], calls)
+
+ def test_found_keys_returned_other_searches_continue(self):
+ calls = []
+ def find_bar_at_1_foo_missing_at_0(location_keys):
+ calls.append(location_keys)
+ result = []
+ for location_key in location_keys:
+ if location_key == (1, 'bar'):
+ result.append((location_key, 'bar-result'))
+ elif location_key[0] == 0:
+ result.append((location_key, False))
+ else:
+ result.append((location_key, -1))
+ return result
+ # given a 4 length file, this should terminate with three calls, two for
+ # both keys, and one for foo only.
+ self.assertEqual([('bar', 'bar-result')],
+ list(bisect_multi_bytes(
+ find_bar_at_1_foo_missing_at_0, 4,
+ ['foo', 'bar'])))
+ self.assertEqual([
+ [(2, 'foo'), (2, 'bar')],
+ [(1, 'foo'), (1, 'bar')],
+ [(0, 'foo')],
+ ], calls)
+
+ def test_searches_different_keys_in_different_directions(self):
+ calls = []
+ def missing_bar_at_1_foo_at_3(location_keys):
+ calls.append(location_keys)
+ result = []
+ for location_key in location_keys:
+ if location_key[1] == 'bar':
+ if location_key[0] == 1:
+ result.append((location_key, False))
+ else:
+ # search down
+ result.append((location_key, -1))
+ elif location_key[1] == 'foo':
+ if location_key[0] == 3:
+ result.append((location_key, False))
+ else:
+ # search up
+ result.append((location_key, +1))
+ return result
+ # given a 4 length file, this should terminate with two calls.
+ self.assertEqual([],
+ list(bisect_multi_bytes(
+ missing_bar_at_1_foo_at_3, 4,
+ ['foo', 'bar'])))
+ self.assertEqual([
+ [(2, 'foo'), (2, 'bar')],
+ [(3, 'foo'), (1, 'bar')],
+ ], calls)
+
+ def test_change_direction_in_single_key_search(self):
+ # check that we can search down, up, down again -
+ # so length 8, goes 4, 6, 5
+ calls = []
+ def missing_at_5(location_keys):
+ calls.append(location_keys)
+ result = []
+ for location_key in location_keys:
+ if location_key[0] == 5:
+ result.append((location_key, False))
+ elif location_key[0] > 5:
+ # search down
+ result.append((location_key, -1))
+ else:
+ # search up
+ result.append((location_key, +1))
+ return result
+ # given a 8 length file, this should terminate with three calls.
+ self.assertEqual([],
+ list(bisect_multi_bytes(
+ missing_at_5, 8,
+ ['foo', 'bar'])))
+ self.assertEqual([
+ [(4, 'foo'), (4, 'bar')],
+ [(6, 'foo'), (6, 'bar')],
+ [(5, 'foo'), (5, 'bar')],
+ ], calls)
+
=== modified file 'NEWS'
--- a/NEWS 2007-10-05 04:47:47 +0000
+++ b/NEWS 2007-10-05 10:45:11 +0000
@@ -191,6 +191,10 @@
* New method on xml serialisers, write_inventory_to_lines, which matches the
API used by knits for adding content. (Robert Collins)
+ * New module ``bzrlib.bisect_multi`` with generic multiple-bisection-at-once
+ logic, currently only available for byte-based lookup
+ (``bisect_multi_bytes``). (Robert Collins)
+
* New helper ``bzrlib.tuned_gzip.bytes_to_gzip`` which takes a byte string
and returns a gzipped version of the same. This is used to avoid a bunch
of api friction during adding of knit hunks. (Robert Collins)
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2007-10-04 17:10:53 +0000
+++ b/bzrlib/tests/__init__.py 2007-10-05 10:45:11 +0000
@@ -2347,6 +2347,7 @@
'bzrlib.tests.test_api',
'bzrlib.tests.test_atomicfile',
'bzrlib.tests.test_bad_files',
+ 'bzrlib.tests.test_bisect_multi',
'bzrlib.tests.test_branch',
'bzrlib.tests.test_branchbuilder',
'bzrlib.tests.test_bugtracker',
More information about the bazaar-commits
mailing list