Rev 54: Try even harder, now with even *more* streams. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk

John Arbash Meinel john at arbash-meinel.com
Fri Feb 27 05:15:36 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/trunk

------------------------------------------------------------
revno: 54
revision-id: john at arbash-meinel.com-20090227051520-3bqqcchl92qup96h
parent: john at arbash-meinel.com-20090227050931-bt0zwmqxrrheyosq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Thu 2009-02-26 23:15:20 -0600
message:
  Try even harder, now with even *more* streams.
  The compressed size drops by another 4x.
  Turn the data for each *layer* into a different stream.
  With this change, the compressed inventory for gc255 drops to 1.5MB,
  which is finally *smaller* than the source 'knit' format.
-------------- next part --------------
=== modified file 'repofmt.py'
--- a/repofmt.py	2009-02-27 05:09:31 +0000
+++ b/repofmt.py	2009-02-27 05:15:20 +0000
@@ -290,31 +290,35 @@
                 next_keys = set()
                 stream = source_vf.get_record_stream(cur_keys, 'as-requested',
                                                      True)
-                for record in stream:
-                    bytes = record.get_bytes_as('fulltext')
-                    # We don't care about search_key_func for this code,
-                    # because we only care about external references.
-                    node = chk_map._deserialise(bytes, record.key,
-                                                search_key_func=None)
-                    common_base = node._search_prefix
-                    if isinstance(node, chk_map.InternalNode):
-                        for prefix, value in node._items.iteritems():
-                            assert isinstance(value, tuple)
-                            if value not in next_keys:
-                                keys_by_search_prefix.setdefault(prefix,
-                                    []).append(value)
-                                next_keys.add(value)
-                    counter[0] += 1
-                    if pb is not None:
-                        pb.update('chk node', counter[0])
-                    yield record
+                def next_stream():
+                    for record in stream:
+                        bytes = record.get_bytes_as('fulltext')
+                        # We don't care about search_key_func for this code,
+                        # because we only care about external references.
+                        node = chk_map._deserialise(bytes, record.key,
+                                                    search_key_func=None)
+                        common_base = node._search_prefix
+                        if isinstance(node, chk_map.InternalNode):
+                            for prefix, value in node._items.iteritems():
+                                assert isinstance(value, tuple)
+                                if value not in next_keys:
+                                    keys_by_search_prefix.setdefault(prefix,
+                                        []).append(value)
+                                    next_keys.add(value)
+                        counter[0] += 1
+                        if pb is not None:
+                            pb.update('chk node', counter[0])
+                        yield record
+                yield next_stream()
                 # Double check that we won't be emitting any keys twice
                 next_keys = next_keys.intersection(remaining_keys)
                 cur_keys = []
                 for prefix in sorted(keys_by_search_prefix):
                     cur_keys.extend(keys_by_search_prefix[prefix])
-        yield _get_referenced_stream(id_roots)
-        yield _get_referenced_stream(p_id_roots)
+        for stream in _get_referenced_stream(id_roots):
+            yield stream
+        for stream in _get_referenced_stream(p_id_roots):
+            yield stream
         if remaining_keys:
             trace.note('There were %d keys in the chk index, which'
                        ' were not referenced from inventories',



More information about the bazaar-commits mailing list