Rev 52: Hack together a way to insert the texts in reverse topological order. in http://bzr.arbash-meinel.com/plugins/groupcompress

John Arbash Meinel john at arbash-meinel.com
Fri Jul 25 20:43:38 BST 2008


At http://bzr.arbash-meinel.com/plugins/groupcompress

------------------------------------------------------------
revno: 52
revision-id: john at arbash-meinel.com-20080725194328-nhjq8m9b04tdcjo0
parent: john at arbash-meinel.com-20080725032223-uz2pbbqz5ez2t3mi
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress
timestamp: Fri 2008-07-25 14:43:28 -0500
message:
  Hack together a way to insert the texts in reverse topological order.
  
  This shows the benefits in compression size.
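
For anyone reading along, here is a minimal standalone sketch of the ordering trick (plain Python with invented revision keys, not bzrlib code; in the patch below, bzrlib's tsort.topo_sort does this job): topologically sort the parent map so that parents sort before their children, then walk the result in reverse so the newest text is inserted first.

from collections import deque

def topo_sort(parent_map):
    # parent_map maps key -> list of parent keys.  Every parent listed
    # must itself be a key in parent_map -- which is exactly why the
    # patch below builds a filtered 'minimal_parents' map first.
    # Returns the keys ordered so parents always precede their children
    # (Kahn's algorithm).
    children = dict((key, []) for key in parent_map)
    remaining = {}
    for key, parents in parent_map.items():
        remaining[key] = len(parents)
        for parent in parents:
            children[parent].append(key)
    ready = deque(key for key, count in remaining.items() if count == 0)
    order = []
    while ready:
        key = ready.popleft()
        order.append(key)
        for child in children[key]:
            remaining[child] -= 1
            if remaining[child] == 0:
                ready.append(child)
    if len(order) != len(parent_map):
        raise ValueError('cycle in parent map')
    return order

parent_map = {'rev-1': [], 'rev-2': ['rev-1'], 'rev-3': ['rev-2']}
print(list(reversed(topo_sort(parent_map))))
# -> ['rev-3', 'rev-2', 'rev-1'], i.e. newest first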
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2008-07-25 02:50:05 +0000
+++ b/groupcompress.py	2008-07-25 19:43:28 +0000
@@ -29,6 +29,7 @@
     graph as _mod_graph,
     pack,
     patiencediff,
+    tsort,
     )
 from bzrlib.graph import Graph
 from bzrlib.knit import _DirectPackAccess
@@ -252,9 +253,9 @@
         :param index_lines: A boolean flag for each line - when True, index
             that line.
         """
-        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
-        #                          if val and new_lines[idx] == '\n']
-        # if indexed_newlines:
+        # if len(index_lines) < 200:
+        #     for index, line in zip(index_lines, new_lines):
+        #         print index, line[:-1]
         #     import pdb; pdb.set_trace()
         endpoint = self.endpoint
         self.line_locations.extend_lines(new_lines, index_lines)
@@ -564,6 +565,10 @@
             for key, reads, refs in keys_to_add:
                 nodes.append((key, "%d %d %s" % (start, length, reads), refs))
             self._index.add_records(nodes, random_id=random_id)
+        # reverse the standard ordering of the stream
+        # INEFFICIENT and wasteful, but heck, I'm just prototyping
+        text_info = {}
+        text_parent_map = {}
         for record in stream:
             # Raise an error when a record is missing.
             if record.storage_kind == 'absent':
@@ -575,12 +580,32 @@
                 adapter = get_adapter(adapter_key)
                 bytes = adapter.get_bytes(record,
                     record.get_bytes_as(record.storage_kind))
-            found_sha1, end_point = self._compressor.compress(record.key,
-                split_lines(bytes), record.sha1)
-            self._unadded_refs[record.key] = record.parents
+            text_info[record.key] = (bytes, record.sha1)
+            text_parent_map[record.key] = record.parents
+
+        # filter out extra nodes so that topo_sort will work
+        # We supposedly requested them in topo sort order, so we *should* be
+        # able to just reverse it.
+        minimal_parents = {}
+        for key, parent_keys in text_parent_map.iteritems():
+            if parent_keys is None:
+                continue
+            minimal_parents[key] = [p for p in parent_keys if p in
+                                    text_parent_map]
+        # Iterate in reverse topological order (ie newest first)
+        for key in reversed(tsort.topo_sort(minimal_parents)):
+            bytes, sha1 = text_info.pop(key)
+            parents = text_parent_map[key]
+            if isinstance(bytes, str):
+                lines = split_lines(bytes)
+            else:
+                lines = bytes
+            found_sha1, end_point = self._compressor.compress(key,
+                lines, sha1)
+            self._unadded_refs[key] = parents
             yield found_sha1
-            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
-                (record.parents,)))
+            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
+                (parents,)))
             basis_end = end_point
             if basis_end > 1024 * 1024 * 20:
                 flush()
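
A concrete standalone illustration of the minimal_parents filtering in the last hunk (invented keys; 'rev-0' stands in for a parent outside this insert batch, e.g. a ghost or a text already in the repository):

text_parent_map = {
    'rev-1': ('rev-0',),   # 'rev-0' is not being inserted in this batch
    'rev-2': ('rev-1',),
    'rev-3': ('rev-2',),
}
minimal_parents = {}
for key, parent_keys in text_parent_map.items():
    if parent_keys is None:
        # parents can be None (no graph info); the prototype skips
        # such keys from the sort entirely
        continue
    minimal_parents[key] = [p for p in parent_keys
                            if p in text_parent_map]
# minimal_parents == {'rev-1': [], 'rev-2': ['rev-1'], 'rev-3': ['rev-2']}

As the comment in the hunk says, the filtering is there "so that topo_sort will work": the sort can only order keys it is actually given, so parents from outside the batch are stripped first, and reversing the sorted result still puts the newest text at the front (rev-3, rev-2, rev-1 in this example).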


