Rev 39: start experimenting with gc-optimal ordering. in http://bazaar.launchpad.net/%7Ejameinel/bzr-groupcompress/experimental
John Arbash Meinel
john at arbash-meinel.com
Thu Feb 19 20:45:32 GMT 2009
At http://bazaar.launchpad.net/%7Ejameinel/bzr-groupcompress/experimental
------------------------------------------------------------
revno: 39
revision-id: john at arbash-meinel.com-20090219204500-1wb0k8f962lansy6
parent: john at arbash-meinel.com-20090219182442-waopk1hpwcidato4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: experimental
timestamp: Thu 2009-02-19 14:45:00 -0600
message:
start experimenting with gc-optimal ordering.
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2009-02-19 18:24:42 +0000
+++ b/groupcompress.py 2009-02-19 20:45:00 +0000
@@ -75,6 +75,7 @@
result.append((op, None, numbers[0], contents))
return label, sha1, result
+
def apply_delta(basis, delta):
"""Apply delta to this object to become new_version_id."""
lines = []
@@ -97,6 +98,34 @@
lines[-1] = lines[-1][:-1]
+
+def sort_gc_optimal(parent_map):
+ """Sort and group the keys in parent_map into gc-optimal order.
+
+ gc-optimal is defined (currently) as reverse-topological order, grouped by
+ the key prefix.
+
+ :return: A sorted-list of keys
+ """
+ # gc-optimal ordering is approximately reverse topological,
+ # properly grouped by file-id.
+ per_prefix_map = {'': []}
+ present_keys = []
+ for item in parent_map.iteritems():
+ key = item[0]
+ if isinstance(key, str) or len(key) == 1:
+ per_prefix_map[''].append(item)
+ else:
+ try:
+ per_prefix_map[key[0]].append(item)
+ except KeyError:
+ per_prefix_map[key[0]] = [item]
+
+ for prefix in sorted(per_prefix_map):
+ present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
+ return present_keys
+
+
class GroupCompressor(object):
"""Produce a serialised group of compressed texts.
@@ -145,8 +174,6 @@
# insert new lines. To find reusable lines we traverse
locations = None
max_pos = len(lines)
- max_time = 0.0
- max_info = None
result_append = result.append
while pos < max_pos:
block, pos, locations = _get_longest_match(line_locations, pos,
@@ -481,13 +508,22 @@
parent_map[key] = self._unadded_refs[key]
present_keys = topo_sort(parent_map)
# Now group by source:
- # TODO: Support a group-compress 'optimal' ordering
+ elif ordering == 'gc-optimal':
+ parent_map = dict((key, details[2]) for key, details in
+ locations.iteritems())
+ for key in local_keys:
+ parent_map[key] = self._unadded_refs[key]
+ # XXX: This only optimizes for the target ordering. We may need to
+ # balance that with the time it takes to extract ordering, by
+ # somehow grouping based on locations[key][0:3]
+ present_keys = sort_gc_optimal(parent_map)
else:
# We want to yield the keys in a semi-optimal (read-wise) ordering.
# Otherwise we thrash the _group_cache and destroy performance
def get_group(key):
- # This is the group the bytes are stored in
- return locations[key][0:3]
+ # This is the group the bytes are stored in, followed by the
+ # location in the group
+ return locations[key][0]
present_keys = sorted(locations.iterkeys(), key=get_group)
# We don't have an ordering for keys in the in-memory object, but
# lets process the in-memory ones first.
=== modified file 'repofmt.py'
--- a/repofmt.py 2009-02-18 20:40:46 +0000
+++ b/repofmt.py 2009-02-19 20:45:00 +0000
@@ -311,6 +311,7 @@
# in-a-row (and sharing strings). Topological is better for
# remote, because we access less data.
self._fetch_order = 'topological'
+ self._fetch_gc_optimal = True
self._fetch_uses_deltas = False
@@ -374,6 +375,7 @@
self._reconcile_fixes_text_parents = True
self._reconcile_backsup_inventory = False
self._fetch_order = 'topological'
+ self._fetch_gc_optimal = True
self._fetch_uses_deltas = False
More information about the bazaar-commits
mailing list