Rev 54: Try even harder, now with even *more* streams. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 27 05:15:36 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk
------------------------------------------------------------
revno: 54
revision-id: john at arbash-meinel.com-20090227051520-3bqqcchl92qup96h
parent: john at arbash-meinel.com-20090227050931-bt0zwmqxrrheyosq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Thu 2009-02-26 23:15:20 -0600
message:
Try even harder, now with even *more* streams.
The compressed size drops by another 4x.
Turn the data for each *layer* into a different stream.
With this change, gc255 has the compressed inventory drop to 1.5MB,
which is finally *smaller* than the source 'knit' format.
-------------- next part --------------
=== modified file 'repofmt.py'
--- a/repofmt.py 2009-02-27 05:09:31 +0000
+++ b/repofmt.py 2009-02-27 05:15:20 +0000
@@ -290,31 +290,35 @@
next_keys = set()
stream = source_vf.get_record_stream(cur_keys, 'as-requested',
True)
- for record in stream:
- bytes = record.get_bytes_as('fulltext')
- # We don't care about search_key_func for this code,
- # because we only care about external references.
- node = chk_map._deserialise(bytes, record.key,
- search_key_func=None)
- common_base = node._search_prefix
- if isinstance(node, chk_map.InternalNode):
- for prefix, value in node._items.iteritems():
- assert isinstance(value, tuple)
- if value not in next_keys:
- keys_by_search_prefix.setdefault(prefix,
- []).append(value)
- next_keys.add(value)
- counter[0] += 1
- if pb is not None:
- pb.update('chk node', counter[0])
- yield record
+ def next_stream():
+ for record in stream:
+ bytes = record.get_bytes_as('fulltext')
+ # We don't care about search_key_func for this code,
+ # because we only care about external references.
+ node = chk_map._deserialise(bytes, record.key,
+ search_key_func=None)
+ common_base = node._search_prefix
+ if isinstance(node, chk_map.InternalNode):
+ for prefix, value in node._items.iteritems():
+ assert isinstance(value, tuple)
+ if value not in next_keys:
+ keys_by_search_prefix.setdefault(prefix,
+ []).append(value)
+ next_keys.add(value)
+ counter[0] += 1
+ if pb is not None:
+ pb.update('chk node', counter[0])
+ yield record
+ yield next_stream()
# Double check that we won't be emitting any keys twice
next_keys = next_keys.intersection(remaining_keys)
cur_keys = []
for prefix in sorted(keys_by_search_prefix):
cur_keys.extend(keys_by_search_prefix[prefix])
- yield _get_referenced_stream(id_roots)
- yield _get_referenced_stream(p_id_roots)
+ for stream in _get_referenced_stream(id_roots):
+ yield stream
+ for stream in _get_referenced_stream(p_id_roots):
+ yield stream
if remaining_keys:
trace.note('There were %d keys in the chk index, which'
' were not referenced from inventories',
More information about the bazaar-commits
mailing list