Rev 39: start experimenting with gc-optimal ordering. in http://bazaar.launchpad.net/%7Ejameinel/bzr-groupcompress/experimental

John Arbash Meinel john at arbash-meinel.com
Thu Feb 19 20:45:32 GMT 2009


At http://bazaar.launchpad.net/%7Ejameinel/bzr-groupcompress/experimental

------------------------------------------------------------
revno: 39
revision-id: john at arbash-meinel.com-20090219204500-1wb0k8f962lansy6
parent: john at arbash-meinel.com-20090219182442-waopk1hpwcidato4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: experimental
timestamp: Thu 2009-02-19 14:45:00 -0600
message:
  start experimenting with gc-optimal ordering.
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-02-19 18:24:42 +0000
+++ b/groupcompress.py	2009-02-19 20:45:00 +0000
@@ -75,6 +75,7 @@
             result.append((op, None, numbers[0], contents))
     return label, sha1, result
 
+
 def apply_delta(basis, delta):
     """Apply delta to this object to become new_version_id."""
     lines = []
@@ -97,6 +98,34 @@
         lines[-1] = lines[-1][:-1]
 
 
+
+def sort_gc_optimal(parent_map):
+    """Sort and group the keys in parent_map into gc-optimal order.
+
+    gc-optimal is defined (currently) as reverse-topological order, grouped by
+    the key prefix.
+
+    :return: A sorted-list of keys
+    """
+    # gc-optimal ordering is approximately reverse topological,
+    # properly grouped by file-id.
+    per_prefix_map = {'': []}
+    present_keys = []
+    for item in parent_map.iteritems():
+        key = item[0]
+        if isinstance(key, str) or len(key) == 1:
+            per_prefix_map[''].append(item)
+        else:
+            try:
+                per_prefix_map[key[0]].append(item)
+            except KeyError:
+                per_prefix_map[key[0]] = [item]
+
+    for prefix in sorted(per_prefix_map):
+        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
+    return present_keys
+
+
 class GroupCompressor(object):
     """Produce a serialised group of compressed texts.
     
@@ -145,8 +174,6 @@
         # insert new lines. To find reusable lines we traverse 
         locations = None
         max_pos = len(lines)
-        max_time = 0.0
-        max_info = None
         result_append = result.append
         while pos < max_pos:
             block, pos, locations = _get_longest_match(line_locations, pos,
@@ -481,13 +508,22 @@
                 parent_map[key] = self._unadded_refs[key]
             present_keys = topo_sort(parent_map)
             # Now group by source:
-        # TODO: Support a group-compress 'optimal' ordering
+        elif ordering == 'gc-optimal':
+            parent_map = dict((key, details[2]) for key, details in
+                locations.iteritems())
+            for key in local_keys:
+                parent_map[key] = self._unadded_refs[key]
+            # XXX: This only optimizes for the target ordering. We may need to
+            #      balance that with the time it takes to extract ordering, by
+            #      somehow grouping based on locations[key][0:3]
+            present_keys = sort_gc_optimal(parent_map)
         else:
             # We want to yield the keys in a semi-optimal (read-wise) ordering.
             # Otherwise we thrash the _group_cache and destroy performance
             def get_group(key):
-                # This is the group the bytes are stored in
-                return locations[key][0:3]
+                # This is the group the bytes are stored in, followed by the
+                # location in the group
+                return locations[key][0]
             present_keys = sorted(locations.iterkeys(), key=get_group)
             # We don't have an ordering for keys in the in-memory object, but
             # lets process the in-memory ones first.

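As a quick illustration of the new helper (not part of the patch): sort_gc_optimal() groups the keys by their prefix (the file-id in (file_id, revision_id) keys) and emits each group in reverse-topological order, newest text first. A minimal sketch with a made-up two-file history, assuming the branch is installed as the usual groupcompress plugin:

    # Import path is an assumption (branch installed as a bzr plugin).
    from bzrlib.plugins.groupcompress.groupcompress import sort_gc_optimal

    # Hypothetical history: two files, rev-b descends from rev-a in each.
    parent_map = {
        ('f1-id', 'rev-a'): (),
        ('f1-id', 'rev-b'): (('f1-id', 'rev-a'),),
        ('f2-id', 'rev-a'): (),
        ('f2-id', 'rev-b'): (('f2-id', 'rev-a'),),
    }
    print sort_gc_optimal(parent_map)
    # All f1-id keys come out together, newest first, then all f2-id keys:
    # [('f1-id', 'rev-b'), ('f1-id', 'rev-a'),
    #  ('f2-id', 'rev-b'), ('f2-id', 'rev-a')]
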
=== modified file 'repofmt.py'
--- a/repofmt.py	2009-02-18 20:40:46 +0000
+++ b/repofmt.py	2009-02-19 20:45:00 +0000
@@ -311,6 +311,7 @@
         #       in-a-row (and sharing strings). Topological is better for
         #       remote, because we access less data.
         self._fetch_order = 'topological'
+        self._fetch_gc_optimal = True
         self._fetch_uses_deltas = False
 
 
@@ -374,6 +375,7 @@
             self._reconcile_fixes_text_parents = True
             self._reconcile_backsup_inventory = False
             self._fetch_order = 'topological'
+            self._fetch_gc_optimal = True
             self._fetch_uses_deltas = False
 
 

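A hedged sketch of how a caller would request the new ordering, assuming 'vf' is a groupcompress-backed VersionedFiles object whose get_record_stream() contains the hunk above; the variable names here are illustrative, not part of the patch:

    # 'vf' is assumed to be a groupcompress VersionedFiles instance; the
    # 'gc-optimal' string matches the new elif branch in get_record_stream().
    keys = vf.keys()
    for record in vf.get_record_stream(keys, 'gc-optimal', True):
        if record.storage_kind == 'absent':
            continue
        # Records arrive grouped by file-id, newest first within each group,
        # which is the layout the compressor wants when rebuilding groups.
        text = record.get_bytes_as('fulltext')

The _fetch_gc_optimal flag added to the formats in repofmt.py looks like the hook for fetch code to opt into this ordering; nothing in this diff consumes it yet.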

