Rev 4647: Fix bug 402652 by recompressing all texts that are streamed - slightly slower at fetch, substantially faster and more compact at read. in http://bazaar.launchpad.net/~lifeless/bzr/bug-402652
Robert Collins
robertc at robertcollins.net
Tue Sep 1 07:10:51 BST 2009
At http://bazaar.launchpad.net/~lifeless/bzr/bug-402652
------------------------------------------------------------
revno: 4647
revision-id: robertc at robertcollins.net-20090901061024-qasufbfj7jse2eai
parent: pqm at pqm.ubuntu.com-20090830232250-2oqzti7o30pv7zc5
committer: Robert Collins <robertc at robertcollins.net>
branch nick: bug-402652
timestamp: Tue 2009-09-01 16:10:24 +1000
message:
Fix bug 402652 by recompressing all texts that are streamed - slightly slower at fetch, substantially faster and more compact at read.
=== modified file 'NEWS'
--- a/NEWS 2009-08-30 22:02:45 +0000
+++ b/NEWS 2009-09-01 06:10:24 +0000
@@ -20,6 +20,13 @@
   revisions that are in the fallback repository. (Regressed in 2.0rc1).
   (John Arbash Meinel, #419241)
 
+* Fetches from 2a to 2a are now again requested in 'groupcompress' order,
+  and all texts are recombined appropriately. This doesn't reuse existing
+  groups, which will be measurable in some specific circumstances - an
+  approximately 25% overhead. However, doing this ensures high performance
+  reads subsequent to the fetch operation, which is the most common
+  operation: write once read many. (Robert Collins, #402652)
+
 * Fix a segmentation fault when computing the ``merge_sort`` of a graph
   that has a ghost in the mainline ancestry.
   (John Arbash Meinel, #419241)
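
As a rough illustration of the trade-off this entry describes - a minimal
sketch, where fetch_and_recompress, source_vf, target_vf and keys are
hypothetical stand-ins for the repositories' VersionedFiles objects (e.g.
repo.texts), not bzrlib's actual fetch pipeline:

    def fetch_and_recompress(source_vf, target_vf, keys):
        # Ask the source for records sorted in 'groupcompress' order so
        # that related texts arrive adjacent and compress well together.
        stream = source_vf.get_record_stream(keys, 'groupcompress', True)
        # Inserting recompresses everything into fresh groups: the ~25%
        # fetch-time cost that buys compact, fast-to-read groups later.
        target_vf.insert_record_stream(stream)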
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-08-26 16:47:51 +0000
+++ b/bzrlib/groupcompress.py 2009-09-01 06:10:24 +0000
@@ -1516,7 +1516,8 @@
         # test_insert_record_stream_existing_keys fail for groupcompress and
         # groupcompress-nograph, this needs to be revisited while addressing
         # 'bzr branch' performance issues.
-        for _ in self._insert_record_stream(stream, random_id=False):
+        for _ in self._insert_record_stream(stream, random_id=False,
+            reuse_blocks=False):
             pass
 
     def _insert_record_stream(self, stream, random_id=False, nostore_sha=None,
@@ -1580,7 +1581,7 @@
                         ' but then inserted %r two times', record.key)
                     continue
                 inserted_keys.add(record.key)
-            if reuse_blocks:
+            if not inserted_keys and reuse_blocks:
                 # If the reuse_blocks flag is set, check to see if we can just
                 # copy a groupcompress block as-is.
                 if record.storage_kind == 'groupcompress-block':
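
Taken together, the two hunks above make recompression the default for this
code path: insert_record_stream() now calls _insert_record_stream() with
reuse_blocks=False. A hedged sketch of the resulting control flow (only the
shape of the guard, not the full method body):

    def _insert_record_stream(self, stream, random_id=False,
                              nostore_sha=None, reuse_blocks=True):
        inserted_keys = set()
        for record in stream:
            if random_id:
                if record.key in inserted_keys:
                    continue  # duplicate despite the random_id promise
                inserted_keys.add(record.key)
            # inserted_keys only grows on the random_id=True path, so for
            # random_id=False callers the new guard reduces to plain
            # 'if reuse_blocks:'; random_id=True streams never reuse blocks.
            if not inserted_keys and reuse_blocks:
                pass  # try to copy the source's compressed block as-is
            # ...otherwise fall through and recompress the record.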
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py 2009-08-24 19:34:13 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py 2009-09-01 06:10:24 +0000
@@ -932,7 +932,7 @@
         super(GroupCHKStreamSource, self).__init__(from_repository, to_format)
         self._revision_keys = None
         self._text_keys = None
-        # self._text_fetch_order = 'unordered'
+        self._text_fetch_order = 'groupcompress'
         self._chk_id_roots = None
         self._chk_p_id_roots = None
@@ -949,7 +949,7 @@
         p_id_roots_set = set()
         source_vf = self.from_repository.inventories
         stream = source_vf.get_record_stream(inventory_keys,
-            'unordered', True)
+            'groupcompress', True)
         for record in stream:
             if record.storage_kind == 'absent':
                 if allow_absent:
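
Both changes here steer the stream source towards 'groupcompress' ordering.
A hedged illustration of what that ordering means to any consumer of
get_record_stream (consume, vf, keys and handle are illustrative stand-ins,
not names from this diff):

    def consume(vf, keys, handle):
        # 'groupcompress' ordering delivers records grouped by compression
        # locality instead of the arbitrary delivery of 'unordered', so a
        # receiving compressor can build tight groups in a single pass.
        for record in vf.get_record_stream(keys, 'groupcompress', True):
            if record.storage_kind == 'absent':
                continue  # caller-specific; the code above raises or skips
            handle(record)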
=== modified file 'bzrlib/tests/test_repository.py'
--- a/bzrlib/tests/test_repository.py 2009-08-17 23:15:55 +0000
+++ b/bzrlib/tests/test_repository.py 2009-09-01 06:10:24 +0000
@@ -683,6 +683,27 @@
 class Test2a(TestCaseWithTransport):
 
+    def test_fetch_combines_groups(self):
+        builder = self.make_branch_builder('source', format='2a')
+        builder.start_series()
+        builder.build_snapshot('1', None, [
+            ('add', ('', 'root-id', 'directory', '')),
+            ('add', ('file', 'file-id', 'file', 'content\n'))])
+        builder.build_snapshot('2', ['1'], [
+            ('modify', ('file-id', 'content-2\n'))])
+        builder.finish_series()
+        source = builder.get_branch()
+        target = self.make_repository('target', format='2a')
+        target.fetch(source.repository)
+        target.lock_read()
+        details = target.texts._index.get_build_details(
+            [('file-id', '1',), ('file-id', '2',)])
+        file_1_details = details[('file-id', '1')]
+        file_2_details = details[('file-id', '2')]
+        # The index, and what to read off disk, should be the same for both
+        # versions of the file.
+        self.assertEqual(file_1_details[0][:3], file_2_details[0][:3])
+
     def test_format_pack_compresses_True(self):
         repo = self.make_repository('repo', format='2a')
         self.assertTrue(repo._format.pack_compresses)
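
To spell out what the assertion checks: get_build_details is assumed here to
map each key to (index_memo, compression_parent, parents, record_details),
with the first three index_memo fields identifying which compressed block a
text lives in. A hedged one-liner of that reading (in_same_group is a
hypothetical helper, not part of the test):

    def in_same_group(details_a, details_b):
        # Equal (index, group start, group end) prefixes mean both file
        # versions were packed into a single groupcompress block.
        return details_a[0][:3] == details_b[0][:3]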