Rev 3879: (jam) Use I/O order for an 'unordered' get_record_stream request. in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Fri Dec 5 13:51:58 GMT 2008

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 3879
revision-id: pqm at pqm.ubuntu.com-20081205135154-uwqcpl3lruah9fo3
parent: pqm at pqm.ubuntu.com-20081202015700-3mc9dola31w7h5h4
parent: john at arbash-meinel.com-20081205131110-lslaoi77c0l6cepq
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2008-12-05 13:51:54 +0000
message:
  (jam) Use I/O order for an 'unordered' get_record_stream request.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
  bzrlib/tests/test_knit.py      test_knit.py-20051212171302-95d4c00dd5f11f2b
    ------------------------------------------------------------
    revno: 3878.1.3
    revision-id: john at arbash-meinel.com-20081205131110-lslaoi77c0l6cepq
    parent: john at arbash-meinel.com-20081204205915-0ubzh41iryx4f0t1
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: sorted_get_record_stream
    timestamp: Fri 2008-12-05 07:11:10 -0600
    message:
      Add a comment about what data we are sorting by.
    modified:
      bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
    ------------------------------------------------------------
    revno: 3878.1.2
    revision-id: john at arbash-meinel.com-20081204205915-0ubzh41iryx4f0t1
    parent: john at arbash-meinel.com-20081204204153-xtyxecjor7itx91r
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: sorted_get_record_stream
    timestamp: Thu 2008-12-04 14:59:15 -0600
    message:
      Move the sorting into each index, and customize it for Kndx access.
    modified:
      bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
    ------------------------------------------------------------
    revno: 3878.1.1
    revision-id: john at arbash-meinel.com-20081204204153-xtyxecjor7itx91r
    parent: pqm at pqm.ubuntu.com-20081202015700-3mc9dola31w7h5h4
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: sorted_get_record_stream
    timestamp: Thu 2008-12-04 14:41:53 -0600
    message:
      KVF.get_record_stream('unordered') now returns the records based on I/O ordering.
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
      bzrlib/tests/test_knit.py      test_knit.py-20051212171302-95d4c00dd5f11f2b
=== modified file 'NEWS'

--- a/NEWS	2008-12-01 23:20:22 +0000
+++ b/NEWS	2008-12-04 20:41:53 +0000
@@ -36,6 +36,12 @@
 
   INTERNALS:
 
+    * ``KnitVersionedFiles.get_record_stream()`` will now chose a
+      more optimal ordering when the keys are requested 'unordered'.
+      Previously the order was fully random, now the records should be
+      returned from each pack in turn, in forward I/O order.
+      (John Arbash Meinel)
+
 
 bzr 1.10rc1 2008-11-28
 ----------------------

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2008-11-27 08:26:50 +0000
+++ b/bzrlib/knit.py	2008-12-05 13:11:10 +0000
@@ -1262,6 +1262,13 @@
                 for key in parent_map:
                     present_keys.append(key)
                     source_keys[-1][1].append(key)
+            # We have been requested to return these records in an order that
+            # suits us. So we ask the index to give us an optimally sorted
+            # order.
+            for source, sub_keys in source_keys:
+                if source is parent_maps[0]:
+                    # Only sort the keys for this VF
+                    self._index._sort_keys_by_io(sub_keys, positions)
         absent_keys = keys - set(global_map)
         for key in absent_keys:
             yield AbsentContentFactory(key)
@@ -2121,6 +2128,26 @@
         else:
             self._mode = 'r'
 
+    def _sort_keys_by_io(self, keys, positions):
+        """Figure out an optimal order to read the records for the given keys.
+
+        Sort keys, grouped by index and sorted by position.
+
+        :param keys: A list of keys whose records we want to read. This will be
+            sorted 'in-place'.
+        :param positions: A dict, such as the one returned by
+            _get_components_positions()
+        :return: None
+        """
+        def get_sort_key(key):
+            index_memo = positions[key][1]
+            # Group by prefix and position. index_memo[0] is the key, so it is
+            # (file_id, revision_id) and we don't want to sort on revision_id,
+            # index_memo[1] is the position, and index_memo[2] is the size,
+            # which doesn't matter for the sort
+            return index_memo[0][:-1], index_memo[1]
+        return keys.sort(key=get_sort_key)
+
     def _split_key(self, key):
         """Split key into a prefix and suffix."""
         return key[:-1], key[-1]
@@ -2380,6 +2407,26 @@
         bits = node[2][1:].split(' ')
         return node[0], int(bits[0]), int(bits[1])
 
+    def _sort_keys_by_io(self, keys, positions):
+        """Figure out an optimal order to read the records for the given keys.
+
+        Sort keys, grouped by index and sorted by position.
+
+        :param keys: A list of keys whose records we want to read. This will be
+            sorted 'in-place'.
+        :param positions: A dict, such as the one returned by
+            _get_components_positions()
+        :return: None
+        """
+        def get_index_memo(key):
+            # index_memo is at offset [1]. It is made up of (GraphIndex,
+            # position, size). GI is an object, which will be unique for each
+            # pack file. This causes us to group by pack file, then sort by
+            # position. Size doesn't matter, but it isn't worth breaking up the
+            # tuple.
+            return positions[key][1]
+        return keys.sort(key=get_index_memo)
+
 
 class _KnitKeyAccess(object):
     """Access to records in .knit files."""

=== modified file 'bzrlib/tests/test_knit.py'
--- a/bzrlib/tests/test_knit.py	2008-11-27 07:25:52 +0000
+++ b/bzrlib/tests/test_knit.py	2008-12-04 20:41:53 +0000
@@ -659,6 +659,44 @@
             vf.iter_lines_added_or_present_in_keys, keys)
         self.assertEqual([2, 1, 1], reload_counter)
 
+    def test_get_record_stream_yields_disk_sorted_order(self):
+        # if we get 'unordered' pick a semi-optimal order for reading. The
+        # order should be grouped by pack file, and then by position in file
+        repo = self.make_repository('test', format='pack-0.92')
+        repo.lock_write()
+        self.addCleanup(repo.unlock)
+        repo.start_write_group()
+        vf = repo.texts
+        vf.add_lines(('f-id', 'rev-5'), [('f-id', 'rev-4')], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-1'), [], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-2'), [('f-id', 'rev-1')], ['lines\n'])
+        repo.commit_write_group()
+        # We inserted them as rev-5, rev-1, rev-2, we should get them back in
+        # the same order
+        stream = vf.get_record_stream([('f-id', 'rev-1'), ('f-id', 'rev-5'),
+                                       ('f-id', 'rev-2')], 'unordered', False)
+        keys = [r.key for r in stream]
+        self.assertEqual([('f-id', 'rev-5'), ('f-id', 'rev-1'),
+                          ('f-id', 'rev-2')], keys)
+        repo.start_write_group()
+        vf.add_lines(('f-id', 'rev-4'), [('f-id', 'rev-3')], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-3'), [('f-id', 'rev-2')], ['lines\n'])
+        vf.add_lines(('f-id', 'rev-6'), [('f-id', 'rev-5')], ['lines\n'])
+        repo.commit_write_group()
+        # Request in random order, to make sure the output order isn't based on
+        # the request
+        request_keys = set(('f-id', 'rev-%d' % i) for i in range(1, 7))
+        stream = vf.get_record_stream(request_keys, 'unordered', False)
+        keys = [r.key for r in stream]
+        # We want to get the keys back in disk order, but it doesn't matter
+        # which pack we read from first. So this can come back in 2 orders
+        alt1 = [('f-id', 'rev-%d' % i) for i in [4, 3, 6, 5, 1, 2]]
+        alt2 = [('f-id', 'rev-%d' % i) for i in [5, 1, 2, 4, 3, 6]]
+        if keys != alt1 and keys != alt2:
+            self.fail('Returned key order did not match either expected order.'
+                      ' expected %s or %s, not %s'
+                      % (alt1, alt2, keys))
+
 
 class LowLevelKnitDataTests(TestCase):