Rev 4642: Switching from 'groupcompress' order to 'unordered' causes the fragmentation issue in http://bazaar.launchpad.net/~jameinel/bzr/2.0b1-402645-fragmentation

John Arbash Meinel john at arbash-meinel.com
Mon Aug 24 20:34:32 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/2.0b1-402645-fragmentation

------------------------------------------------------------
revno: 4642
revision-id: john at arbash-meinel.com-20090824193413-zlz0wc07x99gxs3b
parent: pqm at pqm.ubuntu.com-20090824182846-ac4l3skw47g0tzx0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.0b1-402645-fragmentation
timestamp: Mon 2009-08-24 14:34:13 -0500
message:
  Switching from 'groupcompress' order to 'unordered' causes the fragmentation issue
  to go away.
  
  That is probably worthwhile *today*, but the real fix is to repack on the fly
  once some figure of merit has been reached.
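
As a rough illustration of the "figure of merit" idea above, a repack-on-the-fly
heuristic could compare how many bytes of a block the current request actually
needs against the block's total size, and only recompress when the waste crosses
a threshold. The function below is a minimal sketch of that check; the name,
signature and 0.75 threshold are hypothetical, not bzrlib code.

    def should_rebuild_block(block_length, wanted_bytes, max_waste_ratio=0.75):
        """Sketch of a repack heuristic (hypothetical, not bzrlib API).

        Rebuild a compressed block when most of its bytes are unused by the
        current request.
        """
        if block_length <= 0:
            return False
        wasted = block_length - wanted_bytes
        return (float(wasted) / block_length) > max_waste_ratio

With these numbers, a 4MB block of which only 200KB is wanted would be rebuilt,
while a block that is mostly used would be streamed as-is.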
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-08-19 16:23:39 +0000
+++ b/bzrlib/groupcompress.py	2009-08-24 19:34:13 +0000
@@ -464,6 +464,8 @@
                                                self.storage_kind)
 
 
+_recent_blocks = set()
+
 class _LazyGroupContentManager(object):
     """This manages a group of _LazyGroupCompressFactory objects."""
 
@@ -531,8 +533,11 @@
         #       expand, since we do full compression again. Perhaps based on a
         #       request that ends up poorly ordered?
         delta = time.time() - tstart
+        if old_length in _recent_blocks:
+            import pdb; pdb.set_trace()
+        _recent_blocks.add(old_length)
         self._block = new_block
-        trace.mutter('creating new compressed block on-the-fly in %.3fs'
+        trace.note('creating new compressed block on-the-fly in %.3fs'
                      ' %d bytes => %d bytes', delta, old_length,
                      self._block._content_length)
 
@@ -1307,6 +1312,8 @@
         missing.difference_update(unadded_keys)
         (fallback_parent_map, key_to_source_map,
          source_result) = self._find_from_fallback(missing)
+        trace.note('getting record stream for %s keys, in %r order, from %s'
+                   % (len(keys), ordering, self._index))
         if ordering in ('topological', 'groupcompress'):
             # would be better to not globally sort initially but instead
             # start with one key, recurse to its oldest parent, then grab
@@ -1339,6 +1346,7 @@
         #       one-at-a-time.) This could be done at insert_record_stream()
         #       time, but it probably would decrease the number of
         #       bytes-on-the-wire for fetch.
+        recent_read_memos = set()
         for source, keys in source_keys:
             if source is self:
                 for key in keys:
@@ -1357,6 +1365,9 @@
                             # We are starting a new block. If we have a
                             # manager, we have found everything that fits for
                             # now, so yield records
+                            if read_memo in recent_read_memos:
+                                import pdb; pdb.set_trace()
+                            recent_read_memos.add(read_memo)
                             if manager is not None:
                                 for factory in manager.get_record_stream():
                                     yield factory
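
The '_recent_blocks' and 'recent_read_memos' sets added above, together with the
'pdb.set_trace()' calls, are throwaway instrumentation: they record an identifier
for each block as it is rebuilt (its old length) or read (its read_memo) and drop
into the debugger the first time the same identifier repeats, which is exactly
the fragmentation symptom being chased: the same group being fetched and
recompressed more than once. The same pattern in isolation looks roughly like
the sketch below (names are illustrative, not bzrlib API).

    import pdb

    _seen_block_ids = set()

    def note_block(block_id):
        # Debug-only helper: break into the debugger the moment a block is
        # handled a second time, i.e. the same group is being fetched or
        # recompressed more than once while streaming records.
        if block_id in _seen_block_ids:
            pdb.set_trace()
        _seen_block_ids.add(block_id)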

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-08-18 05:18:52 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-08-24 19:34:13 +0000
@@ -932,7 +932,7 @@
         super(GroupCHKStreamSource, self).__init__(from_repository, to_format)
         self._revision_keys = None
         self._text_keys = None
-        self._text_fetch_order = 'groupcompress'
+        # self._text_fetch_order = 'unordered'
         self._chk_id_roots = None
         self._chk_p_id_roots = None
 
@@ -949,7 +949,7 @@
             p_id_roots_set = set()
             source_vf = self.from_repository.inventories
             stream = source_vf.get_record_stream(inventory_keys,
-                                                 'groupcompress', True)
+                                                 'unordered', True)
             for record in stream:
                 if record.storage_kind == 'absent':
                     if allow_absent:
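
For reference, the second hunk above only swaps the ordering argument passed to
get_record_stream(), and the first drops the explicit 'groupcompress' text fetch
order. With 'unordered' the source is free to return records in whatever grouping
is cheapest (typically block by block), whereas 'groupcompress' and 'topological'
push the keys through a global re-sort first, as the comment in the
groupcompress.py hunk notes. A minimal usage sketch under that assumption, where
'repo' is taken to be an already-opened repository:

    def stream_inventories_unordered(repo, revision_ids):
        # Sketch only: 'repo' is assumed to be an already-opened repository
        # whose 'inventories' attribute is a VersionedFiles store.
        keys = [(rev_id,) for rev_id in revision_ids]
        stream = repo.inventories.get_record_stream(keys, 'unordered', True)
        for record in stream:
            if record.storage_kind == 'absent':
                continue
            yield record.key, record.get_bytes_as('fulltext')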


