Rev 5: We now implement multi-parent xdelta compression, in http://bzr.arbash-meinel.com/plugins/xdelta_repo
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 20 20:59:39 GMT 2009
At http://bzr.arbash-meinel.com/plugins/xdelta_repo
------------------------------------------------------------
revno: 5
revision-id: john at arbash-meinel.com-20090220205915-jwcsa57f0v9x7nnv
parent: john at arbash-meinel.com-20090220203305-np9cf3z9j94tjw2l
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: xdelta_repo
timestamp: Fri 2009-02-20 14:59:15 -0600
message:
We now implement multi-parent xdelta compression.
Without the _source_text_cache, the conversion becomes O(N^2) and
performance drops far past abysmal.
With the _source_text_cache, we can convert bzrtools_1.9_r599 in
5s and the compressed size is 1.7MB which is down from 2.9MB in
pack1.9 form.
-------------- next part --------------
=== modified file 'xdelta_repo.py'
--- a/xdelta_repo.py 2009-02-20 20:33:05 +0000
+++ b/xdelta_repo.py 2009-02-20 20:59:15 +0000
@@ -25,6 +25,7 @@
errors,
graph as _mod_graph,
knit,
+ lru_cache,
osutils,
pack,
repository,
@@ -157,12 +158,15 @@
key = entry[1]
if not self._parents:
parents = None
+ compression_parents = None
else:
parents = entry[3][0]
+ compression_parents = entry[3][1]
value = entry[2]
method = 'group'
result[key] = (self._node_to_position(entry),
- None, parents, (method, None))
+ compression_parents, parents,
+ (method, None))
return result
def get_parent_map(self, keys):
@@ -212,6 +216,7 @@
self._index = index
self._access = access
self._delta = delta
+ self._source_text_cache = lru_cache.LRUSizeCache(50*1024*1024)
def add_lines(self, key, parents, lines, parent_texts=None,
left_matching_blocks=None, nostore_sha=None, random_id=False,
@@ -390,11 +395,8 @@
# lines, sha1 = self._compressor.extract(key)
# parents = self._unadded_refs[key]
# else:
- index_memo, _, parents, (method, _) = locations[key]
- read_memo = index_memo[0:3]
- enc_data = self._access.get_raw_records([read_memo]).next()
- total_bytes_length = index_memo[3]
- bytes = xd3.decode_memory(enc_data, total_bytes_length, source=None)
+ bytes = self._get_source_text([key], locations)
+ parents = locations[key][2]
label, sha1, text_bytes = _parse(bytes)
if not bytes.startswith('xdelta3\n'):
raise errors.KnitCorrupt("invalid header bytes: %r"
@@ -428,6 +430,34 @@
for _ in self._insert_record_stream(stream):
pass
+ def _get_source_text(self, keys, build_details):
+ """Get the 'source' text used for this record."""
+ # TODO: Turn this into something that loops over build details, rather
+ # than recursing
+ source_texts = []
+ for key in keys:
+ try:
+ bytes = self._source_text_cache[key]
+ except KeyError:
+ pass
+ else:
+ source_texts.append(bytes)
+ continue
+ (index_memo, c_parents, parents, (method, _)) = build_details[key]
+ read_memo = index_memo[0:3]
+ enc_data = self._access.get_raw_records([read_memo]).next()
+ total_bytes_length = index_memo[3]
+ if not c_parents:
+ source = None
+ else:
+ parent_build_details = self._index.get_build_details(c_parents)
+ source = self._get_source_text(c_parents, parent_build_details)
+ bytes = xd3.decode_memory(enc_data, total_bytes_length,
+ source=source)
+ self._source_text_cache[key] = bytes
+ source_texts.append(bytes)
+ return ''.join(source_texts)
+
def _insert_record_stream(self, stream, random_id=False):
"""Internal core to insert a record stream into this container.
@@ -460,23 +490,28 @@
adapter = get_adapter(adapter_key)
bytes = adapter.get_bytes(record)
sha1 = osutils.sha_string(bytes)
+ # TODO: Add parents and compression parents
header = 'xdelta3\nlabel: %s\nsha1: %s\n' % (
'\x00'.join(record.key), sha1)
total_bytes = header + bytes
total_bytes_length = len(total_bytes)
+ if self._delta:
+ refs = (record.parents, record.parents)
+ source = self._get_source_text(record.parents,
+ self._index.get_build_details(record.parents))
+ else:
+ refs = (record.parents,)
+ source = None
# XXX: Eventually we'll want to include 'source'
- comp_bytes = xd3.encode_memory(total_bytes, source=None,
+ comp_bytes = xd3.encode_memory(total_bytes, source=source,
flags=xd3.SEC_DJW)
index, start, length = self._access.add_raw_records(
[(None, len(comp_bytes))], comp_bytes)[0]
# For now, we have no compression parents
value = "%d %d %d" % (start, length, total_bytes_length)
- if self._delta:
- refs = (record.parents, ())
- else:
- refs = (record.parents,)
nodes = [(record.key, value, refs)]
self._index.add_records(nodes)
+ self._source_text_cache[record.key] = total_bytes
yield sha1
def iter_lines_added_or_present_in_keys(self, keys, pb=None):
More information about the bazaar-commits mailing list.