Rev 3100: (Robert Collins) make 'bzr pack' start optimizing the order of revision texts in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm@pqm.ubuntu.com
Mon Dec 10 20:11:15 GMT 2007


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 3100
revision-id:pqm@pqm.ubuntu.com-20071210201101-pwhyse7wzn7pvyzx
parent: pqm@pqm.ubuntu.com-20071210120611-a3j02d26cbzvlyju
parent: john@arbash-meinel.com-20071210191200-36g8jyima1o3tlo2
committer: Canonical.com Patch Queue Manager <pqm@pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2007-12-10 20:11:01 +0000
message:
  (Robert Collins) make 'bzr pack' start optimizing the order of
  	revision texts
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
    ------------------------------------------------------------
    revno: 3070.1.3
    revision-id:john@arbash-meinel.com-20071210191200-36g8jyima1o3tlo2
    parent: john@arbash-meinel.com-20071210175440-3ywpk5riruelw2sv
    parent: pqm@pqm.ubuntu.com-20071210120611-a3j02d26cbzvlyju
    committer: John Arbash Meinel <john@arbash-meinel.com>
    branch nick: pack_optimizer
    timestamp: Mon 2007-12-10 13:12:00 -0600
    message:
      [merge] bzr.dev 3099
    removed:
      doc/en/user-reference/index.txt index.txt-20070830033353-ud9e03xsh24053oo-1
    added:
      doc/en/quick-reference/quick-start-summary.pdf quickstartsummary.pd-20071207005239-it6xmdjml8nxvzgt-1
      doc/en/quick-reference/quick-start-summary.png quickstartsummary.pn-20071203142852-hsiybkmh37q5owwe-1
    modified:
      Makefile                       Makefile-20050805140406-d96e3498bb61c5bb
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      README                         README-20050309040720-8f368abf9f346b9d
      bzrlib/branch.py               branch.py-20050309040759-e4baf4e0d046576e
      bzrlib/bugtracker.py           bugtracker.py-20070410073305-vu1vu1qosjurg8kb-1
      bzrlib/builtins.py             builtins.py-20050830033751-fc01482b9ca23183
      bzrlib/config.py               config.py-20051011043216-070c74f4e9e338e8
      bzrlib/debug.py                debug.py-20061102062349-vdhrw9qdpck8cl35-1
      bzrlib/diff.py                 diff.py-20050309040759-26944fbbf2ebbf36
      bzrlib/errors.py               errors.py-20050309040759-20512168c4e14fbd
      bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
      bzrlib/help_topics.py          help_topics.py-20060920210027-rnim90q9e0bwxvy4-1
      bzrlib/log.py                  log.py-20050505065812-c40ce11702fe5fb1
      bzrlib/merge.py                merge.py-20050513021216-953b65a438527106
      bzrlib/plugins/launchpad/lp_indirect.py lp_indirect.py-20070126012204-de5rugwlt22c7u7e-1
      bzrlib/plugins/launchpad/test_lp_indirect.py test_lp_indirect.py-20070126002743-oyle362tzv9cd8mi-1
      bzrlib/remote.py               remote.py-20060720103555-yeeg2x51vn0rbtdp-1
      bzrlib/switch.py               switch.py-20071116011000-v5lnw7d2wkng9eux-1
      bzrlib/tests/HttpServer.py     httpserver.py-20061012142527-m1yxdj1xazsf8d7s-1
      bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
      bzrlib/tests/blackbox/test_cat.py test_cat.py-20051201162916-f0937e4e19ea24b3
      bzrlib/tests/blackbox/test_exceptions.py test_exceptions.py-20060604211237-yi2cxg0ose3xk4id-1
      bzrlib/tests/blackbox/test_merge.py test_merge.py-20060323225809-9bc0459c19917f41
      bzrlib/tests/blackbox/test_push.py test_push.py-20060329002750-929af230d5d22663
      bzrlib/tests/branch_implementations/test_branch.py testbranch.py-20050711070244-121d632bc37d7253
      bzrlib/tests/branch_implementations/test_pull.py test_pull.py-20060410103942-83c35b26657414fc
      bzrlib/tests/repository_implementations/test_repository.py test_repository.py-20060131092128-ad07f494f5c9d26c
      bzrlib/tests/test_bugtracker.py test_bugtracker.py-20070410073305-vu1vu1qosjurg8kb-2
      bzrlib/tests/test_config.py    testconfig.py-20051011041908-742d0c15d8d8c8eb
      bzrlib/tests/test_diff.py      testdiff.py-20050727164403-d1a3496ebb12e339
      bzrlib/tests/test_errors.py    test_errors.py-20060210110251-41aba2deddf936a8
      bzrlib/tests/test_graph.py     test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
      bzrlib/tests/test_http.py      testhttp.py-20051018020158-b2eef6e867c514d9
      bzrlib/tests/test_http_response.py test_http_response.py-20060628233143-950b2a482a32505d
      bzrlib/tests/test_log.py       testlog.py-20050728115707-1a514809d7d49309
      bzrlib/tests/test_merge.py     testmerge.py-20050905070950-c1b5aa49ff911024
      bzrlib/tests/test_msgeditor.py test_msgeditor.py-20051202041359-920315ec6011ee51
      bzrlib/tests/test_switch.py    test_switch.py-20071116011000-v5lnw7d2wkng9eux-2
      bzrlib/tests/test_transport.py testtransport.py-20050718175618-e5cdb99f4555ddce
      bzrlib/tests/test_transport_implementations.py test_transport_implementations.py-20051227111451-f97c5c7d5c49fce7
      bzrlib/tests/test_versionedfile.py test_versionedfile.py-20060222045249-db45c9ed14a1c2e5
      bzrlib/trace.py                trace.py-20050309040759-c8ed824bdcd4748a
      bzrlib/transport/__init__.py   transport.py-20050711165921-4978aa7ce1285ad5
      bzrlib/transport/http/__init__.py http_transport.py-20050711212304-506c5fd1059ace96
      bzrlib/transport/http/_pycurl.py pycurlhttp.py-20060110060940-4e2a705911af77a6
      bzrlib/transport/http/_urllib.py _urlgrabber.py-20060113083826-0bbf7d992fbf090c
      bzrlib/transport/http/_urllib2_wrappers.py _urllib2_wrappers.py-20060913231729-ha9ugi48ktx481ao-1
      bzrlib/transport/http/response.py _response.py-20060613154423-a2ci7hd4iw5c7fnt-1
      bzrlib/tree.py                 tree.py-20050309040759-9d5f2496be663e77
      bzrlib/versionedfile.py        versionedfile.py-20060222045106-5039c71ee3b65490
      doc/developers/bundles.txt     bundles.txt-20070621030528-qkjnugd7iyud6ow3-1
      doc/developers/index.txt       index.txt-20070508041241-qznziunkg0nffhiw-1
      doc/developers/packrepo.txt    knitpack.txt-20071023074744-q0g5u8me8gk5zbtb-1
      doc/en/quick-reference/quick-start-summary.svg quickstartsummary.sv-20070813143223-5i7bgw7w8s7l3ae2-3
      doc/en/user-guide/bug_trackers.txt bug_trackers.txt-20070713223459-khxdlcudraii95uv-1
      doc/en/user-guide/core_concepts.txt core_concepts.txt-20071114035000-q36a9h57ps06uvnl-2
      doc/en/user-guide/introducing_bazaar.txt introducing_bazaar.t-20071114035000-q36a9h57ps06uvnl-5
      doc/en/user-guide/merging_changes.txt merging_changes.txt-20071122141511-0knao2lklsdsvb1q-3
      doc/en/user-guide/resolving_conflicts.txt resolving_conflicts.-20071122141511-0knao2lklsdsvb1q-5
      doc/en/user-guide/reusing_a_checkout.txt reusing_a_checkout.t-20071123055134-k5x4ekduci2lbn36-3
      doc/en/user-guide/undoing_mistakes.txt undoing_mistakes.txt-20071121092300-8fyacngt1w98e5mp-1
      doc/en/user-guide/working_offline_central.txt working_offline_cent-20071123055134-k5x4ekduci2lbn36-5
      doc/index.txt                  index.txt-20070813101924-07gd9i9d2jt124bf-1
    ------------------------------------------------------------
    revno: 3070.1.2
    revision-id:john@arbash-meinel.com-20071210175440-3ywpk5riruelw2sv
    parent: robertc@robertcollins.net-20071204021939-883w43jh8fuy1mzf
    committer: John Arbash Meinel <john@arbash-meinel.com>
    branch nick: pack_optimizer
    timestamp: Mon 2007-12-10 11:54:40 -0600
    message:
      Clean up OptimisingPacker code according to my review feedback
    modified:
      bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
    ------------------------------------------------------------
    revno: 3070.1.1
    revision-id:robertc@robertcollins.net-20071204021939-883w43jh8fuy1mzf
    parent: pqm@pqm.ubuntu.com-20071203210338-3w0ryakegm0xopp0
    committer: Robert Collins <robertc@robertcollins.net>
    branch nick: pack
    timestamp: Tue 2007-12-04 13:19:39 +1100
    message:
      * ``bzr pack`` now orders revision texts in topological order, with newest
        at the start of the file, promoting linear reads for ``bzr log`` and the
        like. This partially fixes #154129. (Robert Collins)
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
      bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
      bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
=== modified file 'NEWS'
--- a/NEWS	2007-12-10 11:10:06 +0000
+++ b/NEWS	2007-12-10 19:12:00 +0000
@@ -13,6 +13,10 @@
 
   IMPROVEMENTS:
 
+   * ``bzr pack`` now orders revision texts in topological order, with newest
+     at the start of the file, promoting linear reads for ``bzr log`` and the
+     like. This partially fixes #154129. (Robert Collins)
+
   BUGFIXES:
 
    * Fix Makefile rules for doc generation. (Ian Clatworthy, #175207)
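
The ordering change above is easiest to see in isolation. The sketch below is illustrative only and assumes a simple recursive depth-first sort; bzrlib itself uses ``bzrlib.tsort.topo_sort`` over the revision index, as the pack_repo.py diff further down shows. Reversing a topological sort puts the tip first, which is the on-disk layout the NEWS entry describes:

# A minimal, standalone sketch of the ordering above (illustrative only;
# bzrlib uses tsort.topo_sort). Reversing a topological sort puts the
# newest revision first, so tip-to-ancestor reads become linear.
def topo_sort(ancestors):
    """Return keys so that every known parent precedes its children."""
    order = []
    visited = set()
    def visit(key):
        if key in visited or key not in ancestors:
            return  # already emitted, or a ghost parent we cannot place
        visited.add(key)
        for parent in ancestors[key]:
            visit(parent)
        order.append(key)
    for key in ancestors:
        visit(key)
    return order

# Linear history rev1 <- rev2 <- rev3; rev3 is the branch tip.
ancestors = {'rev1': [], 'rev2': ['rev1'], 'rev3': ['rev2']}
print(list(reversed(topo_sort(ancestors))))  # ['rev3', 'rev2', 'rev1']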

=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2007-12-03 19:26:40 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2007-12-10 17:54:40 +0000
@@ -610,8 +610,9 @@
         revision_nodes = self._pack_collection._index_contents(revision_index_map, revision_keys)
         # copy revision keys and adjust values
         self.pb.update("Copying revision texts", 1)
-        list(self._copy_nodes_graph(revision_nodes, revision_index_map,
-            self.new_pack._writer, self.new_pack.revision_index))
+        total_items, readv_group_iter = self._revision_node_readv(revision_nodes)
+        list(self._copy_nodes_graph(revision_index_map, self.new_pack._writer,
+            self.new_pack.revision_index, readv_group_iter, total_items))
         if 'pack' in debug.debug_flags:
             mutter('%s: create_pack: revisions copied: %s%s %d items t+%6.3fs',
                 time.ctime(), self._pack_collection._upload_transport.base,
@@ -639,8 +640,10 @@
         # XXX: Should be a helper function to allow different inv representation
         # at this point.
         self.pb.update("Copying inventory texts", 2)
-        inv_lines = self._copy_nodes_graph(inv_nodes, inventory_index_map,
-            self.new_pack._writer, self.new_pack.inventory_index, output_lines=True)
+        total_items, readv_group_iter = self._least_readv_node_readv(inv_nodes)
+        inv_lines = self._copy_nodes_graph(inventory_index_map,
+            self.new_pack._writer, self.new_pack.inventory_index,
+            readv_group_iter, total_items, output_lines=True)
         if self.revision_ids:
             self._process_inventory_lines(inv_lines)
         else:
@@ -675,8 +678,9 @@
                     a_missing_key[0])
         # copy text keys and adjust values
         self.pb.update("Copying content texts", 3)
-        list(self._copy_nodes_graph(text_nodes, text_index_map,
-            self.new_pack._writer, self.new_pack.text_index))
+        total_items, readv_group_iter = self._least_readv_node_readv(text_nodes)
+        list(self._copy_nodes_graph(text_index_map, self.new_pack._writer,
+            self.new_pack.text_index, readv_group_iter, total_items))
         self._log_copied_texts()
 
     def _check_references(self):
@@ -787,8 +791,8 @@
                 pb.update("Copied record", record_index)
                 record_index += 1
 
-    def _copy_nodes_graph(self, nodes, index_map, writer, write_index,
-        output_lines=False):
+    def _copy_nodes_graph(self, index_map, writer, write_index,
+        readv_group_iter, total_items, output_lines=False):
         """Copy knit nodes between packs.
 
         :param output_lines: Return lines present in the copied data as
@@ -796,8 +800,8 @@
         """
         pb = ui.ui_factory.nested_progress_bar()
         try:
-            for result in self._do_copy_nodes_graph(nodes, index_map, writer,
-                write_index, output_lines, pb):
+            for result in self._do_copy_nodes_graph(index_map, writer,
+                write_index, output_lines, pb, readv_group_iter, total_items):
                 yield result
         except Exception:
             # Python 2.4 does not permit try:finally: in a generator.
@@ -806,42 +810,21 @@
         else:
             pb.finished()
 
-    def _do_copy_nodes_graph(self, nodes, index_map, writer, write_index,
-        output_lines, pb):
+    def _do_copy_nodes_graph(self, index_map, writer, write_index,
+        output_lines, pb, readv_group_iter, total_items):
         # for record verification
         knit_data = _KnitData(None)
         # for line extraction when requested (inventories only)
         if output_lines:
             factory = knit.KnitPlainFactory()
-        # plan a readv on each source pack:
-        # group by pack
-        nodes = sorted(nodes)
-        # how to map this into knit.py - or knit.py into this?
-        # we don't want the typical knit logic, we want grouping by pack
-        # at this point - perhaps a helper library for the following code 
-        # duplication points?
-        request_groups = {}
         record_index = 0
-        pb.update("Copied record", record_index, len(nodes))
-        for index, key, value, references in nodes:
-            if index not in request_groups:
-                request_groups[index] = []
-            request_groups[index].append((key, value, references))
-        for index, items in request_groups.iteritems():
-            pack_readv_requests = []
-            for key, value, references in items:
-                # ---- KnitGraphIndex.get_position
-                bits = value[1:].split(' ')
-                offset, length = int(bits[0]), int(bits[1])
-                pack_readv_requests.append((offset, length, (key, value[0], references)))
-            # linear scan up the pack
-            pack_readv_requests.sort()
+        pb.update("Copied record", record_index, total_items)
+        for index, readv_vector, node_vector in readv_group_iter:
             # copy the data
             transport, path = index_map[index]
-            reader = pack.make_readv_reader(transport, path,
-                [offset[0:2] for offset in pack_readv_requests])
-            for (names, read_func), (_1, _2, (key, eol_flag, references)) in \
-                izip(reader.iter_records(), pack_readv_requests):
+            reader = pack.make_readv_reader(transport, path, readv_vector)
+            for (names, read_func), (key, eol_flag, references) in \
+                izip(reader.iter_records(), node_vector):
                 raw_data = read_func(None)
                 version_id = key[-1]
                 if output_lines:
@@ -868,6 +851,43 @@
         return text_index_map, self._pack_collection._index_contents(text_index_map,
             self._text_filter)
 
+    def _least_readv_node_readv(self, nodes):
+        """Generate request groups for nodes using the least readv's.
+        
+        :param nodes: An iterable of graph index nodes.
+        :return: Total node count and an iterator of the data needed to perform
+            readvs to obtain the data for nodes. Each item yielded by the
+            iterator is a tuple with:
+            index, readv_vector, node_vector. readv_vector is a list ready to
+            hand to the transport readv method, and node_vector is a list of
+            (key, eol_flag, references) for the node retrieved by the
+            matching readv_vector.
+        """
+        # group by pack so we do one readv per pack
+        nodes = sorted(nodes)
+        total = len(nodes)
+        request_groups = {}
+        for index, key, value, references in nodes:
+            if index not in request_groups:
+                request_groups[index] = []
+            request_groups[index].append((key, value, references))
+        result = []
+        for index, items in request_groups.iteritems():
+            pack_readv_requests = []
+            for key, value, references in items:
+                # ---- KnitGraphIndex.get_position
+                bits = value[1:].split(' ')
+                offset, length = int(bits[0]), int(bits[1])
+                pack_readv_requests.append(
+                    ((offset, length), (key, value[0], references)))
+            # linear scan up the pack to maximise range combining.
+            pack_readv_requests.sort()
+            # split out the readv and the node data.
+            pack_readv = [readv for readv, node in pack_readv_requests]
+            node_vector = [node for readv, node in pack_readv_requests]
+            result.append((index, pack_readv, node_vector))
+        return total, result
+
     def _log_copied_texts(self):
         if 'pack' in debug.debug_flags:
             mutter('%s: create_pack: file texts copied: %s%s %d items t+%6.3fs',
@@ -886,6 +906,15 @@
             text_filter.extend([(fileid, file_revid) for file_revid in file_revids])
         self._text_filter = text_filter
 
+    def _revision_node_readv(self, revision_nodes):
+        """Return the total revisions and the readv's to issue.
+
+        :param revision_nodes: The revision index contents for the packs being
+            incorporated into the new pack.
+        :return: As per _least_readv_node_readv.
+        """
+        return self._least_readv_node_readv(revision_nodes)
+
     def _use_pack(self, new_pack):
         """Return True if new_pack should be used.
 
@@ -895,6 +924,40 @@
         return new_pack.data_inserted()
 
 
+class OptimisingPacker(Packer):
+    """A packer which spends more time to create better disk layouts."""
+
+    def _revision_node_readv(self, revision_nodes):
+        """Return the total revisions and the readv's to issue.
+
+        This sort places revisions in topological order with the ancestors
+        after the children.
+
+        :param revision_nodes: The revision index contents for the packs being
+            incorporated into the new pack.
+        :return: As per _least_readv_node_readv.
+        """
+        # build an ancestors dict
+        ancestors = {}
+        by_key = {}
+        for index, key, value, references in revision_nodes:
+            ancestors[key] = references[0]
+            by_key[key] = (index, value, references)
+        order = tsort.topo_sort(ancestors)
+        total = len(order)
+        # Single IO is pathological, but it will work as a starting point.
+        requests = []
+        for key in reversed(order):
+            index, value, references = by_key[key]
+            # ---- KnitGraphIndex.get_position
+            bits = value[1:].split(' ')
+            offset, length = int(bits[0]), int(bits[1])
+            requests.append(
+                (index, [(offset, length)], [(key, value[0], references)]))
+        # TODO: combine requests in the same index that are in ascending order.
+        return total, requests
+
+
 class ReconcilePacker(Packer):
     """A packer which regenerates indices etc as it copies.
     
@@ -968,8 +1031,9 @@
         del ideal_index
         del text_nodes
         # 3) bulk copy the ok data
-        list(self._copy_nodes_graph(ok_nodes, text_index_map,
-            self.new_pack._writer, self.new_pack.text_index))
+        total_items, readv_group_iter = self._least_readv_node_readv(ok_nodes)
+        list(self._copy_nodes_graph(text_index_map, self.new_pack._writer,
+            self.new_pack.text_index, readv_group_iter, total_items))
         # 4) adhoc copy all the other texts.
         # We have to topologically insert all texts otherwise we can fail to
         # reconcile when parts of a single delta chain are preserved intact,
@@ -1164,17 +1228,18 @@
         self._execute_pack_operations(pack_operations)
         return True
 
-    def _execute_pack_operations(self, pack_operations):
+    def _execute_pack_operations(self, pack_operations, _packer_class=Packer):
         """Execute a series of pack operations.
 
         :param pack_operations: A list of [revision_count, packs_to_combine].
+        :param _packer_class: The class of packer to use (default: Packer).
         :return: None.
         """
         for revision_count, packs in pack_operations:
             # we may have no-ops from the setup logic
             if len(packs) == 0:
                 continue
-            Packer(self, packs, '.autopack').pack()
+            _packer_class(self, packs, '.autopack').pack()
             for pack in packs:
                 self._remove_pack_from_memory(pack)
         # record the newly available packs and stop advertising the old
@@ -1197,6 +1262,9 @@
         self.ensure_loaded()
         total_packs = len(self._names)
         if total_packs < 2:
+            # This is arguably wrong because we might not be optimal, but for
+            # now let's leave it in. (e.g. reconcile -> one pack, but not
+            # optimal.)
             return
         total_revisions = self.revision_index.combined_index.key_count()
         # XXX: the following may want to be a class, to pack with a given
@@ -1208,10 +1276,9 @@
         pack_distribution = [1]
         pack_operations = [[0, []]]
         for pack in self.all_packs():
-            revision_count = pack.get_revision_count()
-            pack_operations[-1][0] += revision_count
+            pack_operations[-1][0] += pack.get_revision_count()
             pack_operations[-1][1].append(pack)
-        self._execute_pack_operations(pack_operations)
+        self._execute_pack_operations(pack_operations, OptimisingPacker)
 
     def plan_autopack_combinations(self, existing_packs, pack_distribution):
         """Plan a pack operation.

=== modified file 'bzrlib/tests/test_repository.py'
--- a/bzrlib/tests/test_repository.py	2007-11-30 15:59:02 +0000
+++ b/bzrlib/tests/test_repository.py	2007-12-04 02:19:39 +0000
@@ -929,6 +929,26 @@
         self.assertEqual(1, len(list(index.iter_all_entries())))
         self.assertEqual(2, len(tree.branch.repository.all_revision_ids()))
 
+    def test_pack_layout(self):
+        format = self.get_format()
+        tree = self.make_branch_and_tree('.', format=format)
+        trans = tree.branch.repository.bzrdir.get_repository_transport(None)
+        tree.commit('start', rev_id='1')
+        tree.commit('more work', rev_id='2')
+        tree.branch.repository.pack()
+        tree.lock_read()
+        self.addCleanup(tree.unlock)
+        pack = tree.branch.repository._pack_collection.get_pack_by_name(
+            tree.branch.repository._pack_collection.names()[0])
+        # revision access tends to be tip->ancestor, so ordering that way on 
+        # disk is a good idea.
+        for _1, key, val, refs in pack.revision_index.iter_all_entries():
+            if key == ('1',):
+                pos_1 = int(val[1:].split()[0])
+            else:
+                pos_2 = int(val[1:].split()[0])
+        self.assertTrue(pos_2 < pos_1)
+
     def test_pack_repositories_support_multiple_write_locks(self):
         format = self.get_format()
         self.make_repository('.', shared=True, format=format)
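
The test above pins the layout down: after ``bzr pack``, revision '2' (the tip) must sit at a lower byte offset than revision '1'. The switch that makes this happen is the new ``_packer_class`` parameter on ``_execute_pack_operations``: autopack keeps the cheap default ``Packer``, while an explicit pack passes ``OptimisingPacker``. A minimal sketch of that hook pattern, with invented print bodies standing in for the real copying work:

# Minimal sketch of the _packer_class hook from the pack_repo.py diff;
# the print statements are invented stand-ins for the real repacking.
class Packer(object):
    def __init__(self, collection, packs, suffix):
        self.packs = packs

    def pack(self):
        print('fast repack of %d packs' % len(self.packs))

class OptimisingPacker(Packer):
    def pack(self):
        print('topologically ordered repack of %d packs' % len(self.packs))

def execute_pack_operations(collection, pack_operations, _packer_class=Packer):
    for revision_count, packs in pack_operations:
        if len(packs) == 0:
            continue  # the planning stage can emit no-ops
        _packer_class(collection, packs, '.autopack').pack()

execute_pack_operations(None, [(2, ['p1', 'p2'])])                    # autopack
execute_pack_operations(None, [(2, ['p1', 'p2'])], OptimisingPacker)  # bzr pack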



