Rev 51: Clustering chk pages properly makes a big difference. in http://bazaar.launchpad.net/%7Ejameinel/bzr-groupcompress/experimental

John Arbash Meinel john at arbash-meinel.com
Fri Feb 27 03:05:06 GMT 2009


At http://bazaar.launchpad.net/%7Ejameinel/bzr-groupcompress/experimental

------------------------------------------------------------
revno: 51
revision-id: john at arbash-meinel.com-20090227030449-r8cmknorcmlgaqtm
parent: john at arbash-meinel.com-20090226224152-z4jiazt0gp1vsylk
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: experimental
timestamp: Thu 2009-02-26 21:04:49 -0600
message:
  Clustering chk pages properly makes a big difference.
  
  By iterating root nodes in the same order as the referencing inventories,
  and then iterating the lower levels grouped by search prefix, we get
  roughly 2:1 compression versus not compressing at all, which is probably
  50% better than a random ordering.
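
Concretely, "the same order as the referencing inventories" just means the
id_to_entry and parent_id_basename_to_file_id root keys are collected in
first-seen order while the inventory stream is walked, instead of in an
unordered set. A rough standalone sketch of that idea (the helper name and
inputs are illustrative, not the bzrlib API):

    def roots_in_inventory_order(root_keys_from_inventories):
        """Collect root keys in first-seen order, skipping duplicates."""
        roots = []
        seen = set()
        for key in root_keys_from_inventories:
            if key not in seen:
                seen.add(key)
                roots.append(key)
        return roots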
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-02-26 22:09:34 +0000
+++ b/groupcompress.py	2009-02-27 03:04:49 +0000
@@ -579,10 +579,11 @@
             valid until the iterator is advanced.
         """
         # keys might be a generator
-        keys = set(keys)
+        orig_keys = list(keys)
+        keys = set(orig_keys)
         if not keys:
             return
-        if not self._index.has_graph:
+        if not self._index.has_graph and ordering == 'topological':
             # Cannot topological order when no graph has been stored.
             ordering = 'unordered'
         # Cheap: iterate
@@ -608,6 +609,8 @@
             #      balance that with the time it takes to extract ordering, by
             #      somehow grouping based on locations[key][0:3]
             present_keys = sort_gc_optimal(parent_map)
+        elif ordering == 'as-requested':
+            present_keys = [key for key in orig_keys if key in locations]
         else:
             # We want to yield the keys in a semi-optimal (read-wise) ordering.
             # Otherwise we thrash the _group_cache and destroy performance
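
The 'as-requested' ordering added above is intentionally trivial: it preserves
the caller's key order and simply drops keys that were not found. A minimal
illustration (standalone, not the actual VersionedFiles API):

    def order_as_requested(orig_keys, locations):
        """Return the keys present in locations, in the order they were requested."""
        return [key for key in orig_keys if key in locations]

    # order_as_requested([('b',), ('a',), ('c',)], {('a',): loc_a, ('b',): loc_b})
    # returns [('b',), ('a',)]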

=== modified file 'repofmt.py'
--- a/repofmt.py	2009-02-26 22:41:52 +0000
+++ b/repofmt.py	2009-02-27 03:04:49 +0000
@@ -244,17 +244,25 @@
 
     def _get_filtered_inv_stream(self, source_vf, keys):
         """Filter the texts of inventories, to find the chk pages."""
-        id_roots = set()
-        p_id_roots = set()
+        id_roots = []
+        p_id_roots = []
+        id_roots_set = set()
+        p_id_roots_set = set()
         def _filter_inv_stream(stream):
             for idx, record in enumerate(stream):
                 ### child_pb.update('fetch inv', idx, len(inv_keys_to_fetch))
                 bytes = record.get_bytes_as('fulltext')
                 chk_inv = inventory.CHKInventory.deserialise(None, bytes, record.key)
-                id_roots.add(chk_inv.id_to_entry.key())
+                key = chk_inv.id_to_entry.key()
+                if key not in id_roots_set:
+                    id_roots.append(key)
+                    id_roots_set.add(key)
                 p_id_map = chk_inv.parent_id_basename_to_file_id
                 if p_id_map is not None:
-                    p_id_roots.add(p_id_map.key())
+                    key = p_id_map.key()
+                    if key not in p_id_roots_set:
+                        p_id_roots_set.add(key)
+                        p_id_roots.append(key)
                 yield record
         stream = source_vf.get_record_stream(keys, 'gc-optimal', True)
         return _filter_inv_stream(stream), id_roots, p_id_roots
@@ -264,6 +272,11 @@
         # reference, and then stream things from p_id_roots and things they
         # reference, and then any remaining keys that we didn't get to.
 
+        # We also group referenced nodes together: if one root references a
+        # node with search prefix 'a' and another root also references a node
+        # with prefix 'a', we yield both of those before any of the nodes for
+        # prefix 'b'. This keeps 'similar' nodes together.
+
         # Note: We probably actually want multiple streams here, to help the
         #       client understand that the different levels won't compress well
         #       against each other
@@ -271,9 +284,10 @@
         def _get_referenced_stream(root_keys):
             cur_keys = root_keys
             while cur_keys:
+                keys_by_search_prefix = {}
                 remaining_keys.difference_update(cur_keys)
                 next_keys = set()
-                stream = source_vf.get_record_stream(cur_keys, 'unordered',
+                stream = source_vf.get_record_stream(cur_keys, 'as-requested',
                                                      True)
                 for record in stream:
                     bytes = record.get_bytes_as('fulltext')
@@ -281,9 +295,20 @@
                     # because we only care about external references.
                     node = chk_map._deserialise(bytes, record.key,
                                                 search_key_func=None)
-                    next_keys.update(node.refs())
+                    common_base = node._search_prefix
+                    if isinstance(node, chk_map.InternalNode):
+                        for prefix, value in node._items.iteritems():
+                            assert isinstance(value, tuple)
+                            if value not in next_keys:
+                                keys_by_search_prefix.setdefault(prefix,
+                                    []).append(value)
+                                next_keys.add(value)
                     yield record
-                cur_keys = next_keys.intersection(remaining_keys)
+                # Double check that we won't be emitting any keys twice
+                next_keys = next_keys.intersection(remaining_keys)
+                cur_keys = []
+                for prefix in sorted(keys_by_search_prefix):
+                    cur_keys.extend(keys_by_search_prefix[prefix])
         for record in _get_referenced_stream(id_roots):
             yield record
         for record in _get_referenced_stream(p_id_roots):
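
The traversal added in _get_referenced_stream is easier to see in isolation. A
simplified standalone version (read_page, deserialise and child_refs are
assumed helpers standing in for the real VersionedFiles and chk_map calls):

    def iter_chk_pages_grouped(root_keys, read_page, deserialise, child_refs):
        """Yield chk pages level by level, grouping each level by search prefix."""
        seen = set(root_keys)
        cur_keys = list(root_keys)
        while cur_keys:
            keys_by_prefix = {}
            for key in cur_keys:  # fetched in 'as-requested' order
                node = deserialise(read_page(key), key)
                yield key, node
                # Only internal nodes reference further pages; child_refs()
                # is assumed to return (search_prefix, child_key) pairs and
                # an empty iterable for leaf nodes.
                for prefix, child in child_refs(node):
                    if child not in seen:  # never queue a page twice
                        seen.add(child)
                        keys_by_prefix.setdefault(prefix, []).append(child)
            # Sorting by prefix keeps 'similar' pages adjacent in the stream,
            # which is what lets the group compressor find good delta matches.
            cur_keys = []
            for prefix in sorted(keys_by_prefix):
                cur_keys.extend(keys_by_prefix[prefix])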


