Rev 52: Hack together a way to insert the texts in reverse topological order. in http://bzr.arbash-meinel.com/plugins/groupcompress
John Arbash Meinel
john at arbash-meinel.com
Fri Jul 25 20:43:38 BST 2008
At http://bzr.arbash-meinel.com/plugins/groupcompress
------------------------------------------------------------
revno: 52
revision-id: john at arbash-meinel.com-20080725194328-nhjq8m9b04tdcjo0
parent: john at arbash-meinel.com-20080725032223-uz2pbbqz5ez2t3mi
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress
timestamp: Fri 2008-07-25 14:43:28 -0500
message:
Hack together a way to insert the texts in reverse topological order.
This shows the benefits in compression size.
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2008-07-25 02:50:05 +0000
+++ b/groupcompress.py 2008-07-25 19:43:28 +0000
@@ -29,6 +29,7 @@
graph as _mod_graph,
pack,
patiencediff,
+ tsort,
)
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
@@ -252,9 +253,9 @@
:param index_lines: A boolean flag for each line - when True, index
that line.
"""
- # indexed_newlines = [idx for idx, val in enumerate(index_lines)
- # if val and new_lines[idx] == '\n']
- # if indexed_newlines:
+ # if len(index_lines) < 200:
+ # for index, line in zip(index_lines, new_lines):
+ # print index, line[:-1]
# import pdb; pdb.set_trace()
endpoint = self.endpoint
self.line_locations.extend_lines(new_lines, index_lines)
@@ -564,6 +565,10 @@
for key, reads, refs in keys_to_add:
nodes.append((key, "%d %d %s" % (start, length, reads), refs))
self._index.add_records(nodes, random_id=random_id)
+ # reverse the standard ordering of the stream
+ # INEFFICIENT and wasteful, but heck, I'm just prototyping
+ text_info = {}
+ text_parent_map = {}
for record in stream:
# Raise an error when a record is missing.
if record.storage_kind == 'absent':
@@ -575,12 +580,32 @@
adapter = get_adapter(adapter_key)
bytes = adapter.get_bytes(record,
record.get_bytes_as(record.storage_kind))
- found_sha1, end_point = self._compressor.compress(record.key,
- split_lines(bytes), record.sha1)
- self._unadded_refs[record.key] = record.parents
+ text_info[record.key] = (bytes, record.sha1)
+ text_parent_map[record.key] = record.parents
+
+ # filter out extra nodes so that topo_sort will work
+ # We supposedly requested them in topo sort order, so we *should* be
+ # able to just reverse it.
+ minimal_parents = {}
+ for key, parent_keys in text_parent_map.iteritems():
+ if parent_keys is None:
+ continue
+ minimal_parents[key] = [p for p in parent_keys if p in
+ text_parent_map]
+ # Iterate in reverse topological order (ie newest first)
+ for key in reversed(tsort.topo_sort(minimal_parents)):
+ bytes, sha1 = text_info.pop(key)
+ parents = text_parent_map[key]
+ if isinstance(bytes, str):
+ lines = split_lines(bytes)
+ else:
+ lines = bytes
+ found_sha1, end_point = self._compressor.compress(key,
+ lines, sha1)
+ self._unadded_refs[key] = parents
yield found_sha1
- keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
- (record.parents,)))
+ keys_to_add.append((key, '%d %d' % (basis_end, end_point),
+ (parents,)))
basis_end = end_point
if basis_end > 1024 * 1024 * 20:
flush()
More information about the bazaar-commits
mailing list