Rev 53: As expected, splitting things up into streams of streams in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 27 05:10:13 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk
------------------------------------------------------------
revno: 53
revision-id: john at arbash-meinel.com-20090227050931-bt0zwmqxrrheyosq
parent: john at arbash-meinel.com-20090227035733-h1gmn3ymofoxc7zm
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Thu 2009-02-26 23:09:31 -0600
message:
As expected, splitting things up into streams of streams
gives even better compression (down to 4.4MB for inventories).
The big win is probably that parent_id_basename content doesn't compress
well at all against id_to_entry content, so keeping the two in separate
streams avoids large offsets.
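To make the idea concrete, here is a rough sketch of the pattern (not the
plugin's code; the helper name and record shape are made up for illustration):
instead of yielding one flat record stream, the generator yields one
sub-stream per kind of content, so each group is compressed on its own and
deltas never have to reach across unrelated bytes.

    def streams_of_streams(records_by_kind):
        # Hypothetical helper, not bzrlib API: records_by_kind maps a content
        # kind (e.g. 'id_to_entry' vs 'parent_id_basename') to its records.
        # Yielding one generator per kind keeps each compression group
        # homogeneous, so delta/copy offsets stay small.
        for kind in sorted(records_by_kind):
            def one_kind(records=records_by_kind[kind]):
                for record in records:
                    yield record
            yield one_kind()

    # A consumer would insert each sub-stream as its own group, e.g.:
    #   for substream in streams_of_streams(records):
    #       target_vf.insert_record_stream(substream)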
-------------- next part --------------
=== modified file 'repofmt.py'
--- a/repofmt.py 2009-02-27 03:57:33 +0000
+++ b/repofmt.py 2009-02-27 05:09:31 +0000
@@ -281,6 +281,7 @@
         # client understand that the different levels won't compress well
         # against eachother
         remaining_keys = set(keys)
+        counter = [0]
         def _get_referenced_stream(root_keys):
             cur_keys = root_keys
             while cur_keys:
@@ -303,37 +304,24 @@
                                 keys_by_search_prefix.setdefault(prefix,
                                     []).append(value)
                                 next_keys.add(value)
+                    counter[0] += 1
+                    if pb is not None:
+                        pb.update('chk node', counter[0])
                     yield record
                 # Double check that we won't be emitting any keys twice
                 next_keys = next_keys.intersection(remaining_keys)
                 cur_keys = []
                 for prefix in sorted(keys_by_search_prefix):
                     cur_keys.extend(keys_by_search_prefix[prefix])
-        counter = 0
-        for record in _get_referenced_stream(id_roots):
-            # We don't know how many total
-            counter += 1
-            if pb is not None:
-                pb.update('chk node', counter)
-            yield record
-        for record in _get_referenced_stream(p_id_roots):
-            # We don't know how many total
-            counter += 1
-            if pb is not None:
-                pb.update('chk node', counter)
-            yield record
+        yield _get_referenced_stream(id_roots)
+        yield _get_referenced_stream(p_id_roots)
         if remaining_keys:
             trace.note('There were %d keys in the chk index, which'
                        ' were not referenced from inventories',
                        len(remaining_keys))
             stream = source_vf.get_record_stream(remaining_keys, 'unordered',
                 True)
-            for record in stream:
-                # We don't know how many total
-                counter += 1
-                if pb is not None:
-                    pb.update('chk node', counter)
-                yield record
+            yield stream
 
     def _execute_pack_operations(self, pack_operations, _packer_class=Packer,
                                  reload_func=None):
@@ -415,8 +403,12 @@
                 stream, id_roots, p_id_roots = self._get_filtered_inv_stream(
                     source_vf, keys)
             elif vf_name == 'chk_bytes':
-                stream = self._get_chk_stream(source_vf, keys,
-                                              id_roots, p_id_roots, pb=child_pb)
+                for stream in self._get_chk_stream(source_vf, keys,
+                                                   id_roots, p_id_roots,
+                                                   pb=child_pb):
+                    target_vf.insert_record_stream(stream)
+                # No more to copy
+                stream = []
             if stream is None:
                 def pb_stream():
                     substream = source_vf.get_record_stream(keys, 'gc-optimal', True)
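
The last hunk and the counter change are two small patterns worth calling out.
Below is a hedged, self-contained sketch with made-up names (get_streams,
one_group, key_groups); it is not the repofmt.py code, just the shape of it.
The one-element counter list acts as a mutable cell that the nested generator
can update (Python 2 has no 'nonlocal'), and the caller now loops over the
yielded generators, inserting each one as its own record stream instead of
consuming a single flat stream.

    def get_streams(key_groups, pb=None):
        # A one-element list is a mutable cell the nested generator can
        # update; Python 2 cannot rebind an outer local with 'nonlocal'.
        counter = [0]
        def one_group(keys):
            for key in keys:
                counter[0] += 1
                if pb is not None:
                    pb.update('chk node', counter[0])
                yield key
        for keys in key_groups:
            # Yield the generator itself, not its records, so each group
            # arrives at the caller as a separate stream.
            yield one_group(keys)

    # Caller, mirroring the new chk_bytes branch above:
    for stream in get_streams([['a', 'b'], ['c', 'd']]):
        for record in stream:   # stand-in for target_vf.insert_record_stream(stream)
            pass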