Rev 3879: (jam) Use I/O order for an 'unordered' get_record_stream request. in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Fri Dec 5 13:51:58 GMT 2008
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 3879
revision-id: pqm at pqm.ubuntu.com-20081205135154-uwqcpl3lruah9fo3
parent: pqm at pqm.ubuntu.com-20081202015700-3mc9dola31w7h5h4
parent: john at arbash-meinel.com-20081205131110-lslaoi77c0l6cepq
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2008-12-05 13:51:54 +0000
message:
(jam) Use I/O order for an 'unordered' get_record_stream request.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
bzrlib/tests/test_knit.py test_knit.py-20051212171302-95d4c00dd5f11f2b
------------------------------------------------------------
revno: 3878.1.3
revision-id: john at arbash-meinel.com-20081205131110-lslaoi77c0l6cepq
parent: john at arbash-meinel.com-20081204205915-0ubzh41iryx4f0t1
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: sorted_get_record_stream
timestamp: Fri 2008-12-05 07:11:10 -0600
message:
Add a comment about what data we are sorting by.
modified:
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
------------------------------------------------------------
revno: 3878.1.2
revision-id: john at arbash-meinel.com-20081204205915-0ubzh41iryx4f0t1
parent: john at arbash-meinel.com-20081204204153-xtyxecjor7itx91r
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: sorted_get_record_stream
timestamp: Thu 2008-12-04 14:59:15 -0600
message:
Move the sorting into each index, and customize it for Kndx access.
modified:
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
------------------------------------------------------------
revno: 3878.1.1
revision-id: john at arbash-meinel.com-20081204204153-xtyxecjor7itx91r
parent: pqm at pqm.ubuntu.com-20081202015700-3mc9dola31w7h5h4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: sorted_get_record_stream
timestamp: Thu 2008-12-04 14:41:53 -0600
message:
KVF.get_record_stream('unordered') now returns the records based on I/O ordering.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
bzrlib/tests/test_knit.py test_knit.py-20051212171302-95d4c00dd5f11f2b
=== modified file 'NEWS'
--- a/NEWS 2008-12-01 23:20:22 +0000
+++ b/NEWS 2008-12-04 20:41:53 +0000
@@ -36,6 +36,12 @@
INTERNALS:
+ * ``KnitVersionedFiles.get_record_stream()`` will now choose a
+ more optimal ordering when the keys are requested 'unordered'.
+ Previously the order was fully random, now the records should be
+ returned from each pack in turn, in forward I/O order.
+ (John Arbash Meinel)
+
bzr 1.10rc1 2008-11-28
----------------------
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py 2008-11-27 08:26:50 +0000
+++ b/bzrlib/knit.py 2008-12-05 13:11:10 +0000
@@ -1262,6 +1262,13 @@
for key in parent_map:
present_keys.append(key)
source_keys[-1][1].append(key)
+ # We have been requested to return these records in an order that
+ # suits us. So we ask the index to give us an optimally sorted
+ # order.
+ for source, sub_keys in source_keys:
+ if source is parent_maps[0]:
+ # Only sort the keys for this VF
+ self._index._sort_keys_by_io(sub_keys, positions)
absent_keys = keys - set(global_map)
for key in absent_keys:
yield AbsentContentFactory(key)
@@ -2121,6 +2128,26 @@
else:
self._mode = 'r'
+ def _sort_keys_by_io(self, keys, positions):
+ """Figure out an optimal order to read the records for the given keys.
+
+ Sort keys, grouped by index and sorted by position.
+
+ :param keys: A list of keys whose records we want to read. This will be
+ sorted 'in-place'.
+ :param positions: A dict, such as the one returned by
+ _get_components_positions()
+ :return: None
+ """
+ def get_sort_key(key):
+ index_memo = positions[key][1]
+ # Group by prefix and position. index_memo[0] is the key, so it is
+ # (file_id, revision_id) and we don't want to sort on revision_id,
+ # index_memo[1] is the position, and index_memo[2] is the size,
+ # which doesn't matter for the sort
+ return index_memo[0][:-1], index_memo[1]
+ return keys.sort(key=get_sort_key)
+
def _split_key(self, key):
"""Split key into a prefix and suffix."""
return key[:-1], key[-1]
@@ -2380,6 +2407,26 @@
bits = node[2][1:].split(' ')
return node[0], int(bits[0]), int(bits[1])
+ def _sort_keys_by_io(self, keys, positions):
+ """Figure out an optimal order to read the records for the given keys.
+
+ Sort keys, grouped by index and sorted by position.
+
+ :param keys: A list of keys whose records we want to read. This will be
+ sorted 'in-place'.
+ :param positions: A dict, such as the one returned by
+ _get_components_positions()
+ :return: None
+ """
+ def get_index_memo(key):
+ # index_memo is at offset [1]. It is made up of (GraphIndex,
+ # position, size). GI is an object, which will be unique for each
+ # pack file. This causes us to group by pack file, then sort by
+ # position. Size doesn't matter, but it isn't worth breaking up the
+ # tuple.
+ return positions[key][1]
+ return keys.sort(key=get_index_memo)
+
class _KnitKeyAccess(object):
"""Access to records in .knit files."""
=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py 2008-11-27 07:25:52 +0000
+++ b/bzrlib/tests/test_knit.py 2008-12-04 20:41:53 +0000
@@ -659,6 +659,44 @@
vf.iter_lines_added_or_present_in_keys, keys)
self.assertEqual([2, 1, 1], reload_counter)
+ def test_get_record_stream_yields_disk_sorted_order(self):
+ # if we get 'unordered' pick a semi-optimal order for reading. The
+ # order should be grouped by pack file, and then by position in file
+ repo = self.make_repository('test', format='pack-0.92')
+ repo.lock_write()
+ self.addCleanup(repo.unlock)
+ repo.start_write_group()
+ vf = repo.texts
+ vf.add_lines(('f-id', 'rev-5'), [('f-id', 'rev-4')], ['lines\n'])
+ vf.add_lines(('f-id', 'rev-1'), [], ['lines\n'])
+ vf.add_lines(('f-id', 'rev-2'), [('f-id', 'rev-1')], ['lines\n'])
+ repo.commit_write_group()
+ # We inserted them as rev-5, rev-1, rev-2, we should get them back in
+ # the same order
+ stream = vf.get_record_stream([('f-id', 'rev-1'), ('f-id', 'rev-5'),
+ ('f-id', 'rev-2')], 'unordered', False)
+ keys = [r.key for r in stream]
+ self.assertEqual([('f-id', 'rev-5'), ('f-id', 'rev-1'),
+ ('f-id', 'rev-2')], keys)
+ repo.start_write_group()
+ vf.add_lines(('f-id', 'rev-4'), [('f-id', 'rev-3')], ['lines\n'])
+ vf.add_lines(('f-id', 'rev-3'), [('f-id', 'rev-2')], ['lines\n'])
+ vf.add_lines(('f-id', 'rev-6'), [('f-id', 'rev-5')], ['lines\n'])
+ repo.commit_write_group()
+ # Request in random order, to make sure the output order isn't based on
+ # the request
+ request_keys = set(('f-id', 'rev-%d' % i) for i in range(1, 7))
+ stream = vf.get_record_stream(request_keys, 'unordered', False)
+ keys = [r.key for r in stream]
+ # We want to get the keys back in disk order, but it doesn't matter
+ # which pack we read from first. So this can come back in 2 orders
+ alt1 = [('f-id', 'rev-%d' % i) for i in [4, 3, 6, 5, 1, 2]]
+ alt2 = [('f-id', 'rev-%d' % i) for i in [5, 1, 2, 4, 3, 6]]
+ if keys != alt1 and keys != alt2:
+ self.fail('Returned key order did not match either expected order.'
+ ' expected %s or %s, not %s'
+ % (alt1, alt2, keys))
+
class LowLevelKnitDataTests(TestCase):
More information about the bazaar-commits
mailing list