Rev 3879: KVF.get_record_stream('unordered') now returns the records based on I/O ordering. in http://bzr.arbash-meinel.com/branches/bzr/1.11/sorted_get_record_stream

John Arbash Meinel john at arbash-meinel.com
Thu Dec 4 20:41:57 GMT 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.11/sorted_get_record_stream

------------------------------------------------------------
revno: 3879
revision-id: john at arbash-meinel.com-20081204204153-xtyxecjor7itx91r
parent: pqm at pqm.ubuntu.com-20081202015700-3mc9dola31w7h5h4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: sorted_get_record_stream
timestamp: Thu 2008-12-04 14:41:53 -0600
message:
  KVF.get_record_stream('unordered') now returns the records based on I/O ordering.
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS	2008-12-01 23:20:22 +0000
+++ b/NEWS	2008-12-04 20:41:53 +0000
@@ -36,6 +36,12 @@
 
   INTERNALS:
 
+    * ``KnitVersionedFiles.get_record_stream()`` will now choose a
+      more optimal ordering when the keys are requested 'unordered'.
+      Previously the order was fully random, now the records should be
+      returned from each pack in turn, in forward I/O order.
+      (John Arbash Meinel)
+
 
 bzr 1.10rc1 2008-11-28
 ----------------------

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2008-11-27 08:26:50 +0000
+++ b/bzrlib/knit.py	2008-12-04 20:41:53 +0000
@@ -1262,6 +1262,21 @@
                 for key in parent_map:
                     present_keys.append(key)
                     source_keys[-1][1].append(key)
+            # We have been requested to return these records in an order that
+            # suits us. So we sort by the index_memo. Generally each entry is
+            # (index, start, length). We want to group by index so that we read
+            # from one pack at a time, and then we group by start so that we
+            # read from the beginning of the file to the end.
+            # This is a little bit iffy, because index_memo is technically
+            # "opaque", but it works with current formats and would otherwise
+            # require passing our custom dict 'positions' back to self._index
+            # for it to figure out that we could sort based on index_memo.
+            def get_index_memo(key):
+                return positions[key][1]
+            for source, sub_keys in source_keys:
+                if source is parent_maps[0]:
+                    # We don't need to sort the keys for fallbacks
+                    sub_keys.sort(key=get_index_memo)
         absent_keys = keys - set(global_map)
         for key in absent_keys:
             yield AbsentContentFactory(key)

=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py	2008-11-27 07:25:52 +0000
+++ b/bzrlib/tests/test_knit.py	2008-12-04 20:41:53 +0000
@@ -659,6 +659,44 @@
             vf.iter_lines_added_or_present_in_keys, keys)
         self.assertEqual([2, 1, 1], reload_counter)
 
+    def test_get_record_stream_yields_disk_sorted_order(self):
+        # if we get 'unordered' pick a semi-optimal order for reading. The
+        # order should be grouped by pack file, and then by position in file
+        repo = self.make_repository('test', format='pack-0.92')
+        repo.lock_write()
+        self.addCleanup(repo.unlock)
+        repo.start_write_group()
+        vf = repo.texts
+        vf.add_lines(('f-id', 'rev-5'), [('f-id', 'rev-4')], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-1'), [], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-2'), [('f-id', 'rev-1')], ['lines\n'])
+        repo.commit_write_group()
+        # We inserted them as rev-5, rev-1, rev-2, we should get them back in
+        # the same order
+        stream = vf.get_record_stream([('f-id', 'rev-1'), ('f-id', 'rev-5'),
+                                       ('f-id', 'rev-2')], 'unordered', False)
+        keys = [r.key for r in stream]
+        self.assertEqual([('f-id', 'rev-5'), ('f-id', 'rev-1'),
+                          ('f-id', 'rev-2')], keys)
+        repo.start_write_group()
+        vf.add_lines(('f-id', 'rev-4'), [('f-id', 'rev-3')], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-3'), [('f-id', 'rev-2')], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-6'), [('f-id', 'rev-5')], ['lines\n'])
+        repo.commit_write_group()
+        # Request in random order, to make sure the output order isn't based on
+        # the request
+        request_keys = set(('f-id', 'rev-%d' % i) for i in range(1, 7))
+        stream = vf.get_record_stream(request_keys, 'unordered', False)
+        keys = [r.key for r in stream]
+        # We want to get the keys back in disk order, but it doesn't matter
+        # which pack we read from first. So this can come back in 2 orders
+        alt1 = [('f-id', 'rev-%d' % i) for i in [4, 3, 6, 5, 1, 2]]
+        alt2 = [('f-id', 'rev-%d' % i) for i in [5, 1, 2, 4, 3, 6]]
+        if keys != alt1 and keys != alt2:
+            self.fail('Returned key order did not match either expected order.'
+                      ' expected %s or %s, not %s'
+                      % (alt1, alt2, keys))
+
 
 class LowLevelKnitDataTests(TestCase):
 



More information about the bazaar-commits mailing list