Rev 100: Use the max_delta flag. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

John Arbash Meinel john at arbash-meinel.com
Tue Mar 3 22:03:10 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

------------------------------------------------------------
revno: 100
revision-id: john at arbash-meinel.com-20090303220215-1luhz4zfr9vrdmud
parent: john at arbash-meinel.com-20090303214221-ea1e84bkmi22yfgk
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Tue 2009-03-03 16:02:15 -0600
message:
  Use the max_delta flag.
  Prefer to extract and compress bytes rather than chunks/lines.
  This has a fairly positive impact on the 'bzr pack' times.
  We still do a ''.join([bytes]), but we know that a join over a
  single-element list doesn't have to do any memory copying.
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-03-03 20:35:26 +0000
+++ b/groupcompress.py	2009-03-03 22:02:15 +0000
@@ -150,12 +150,14 @@
         :return: The sha1 of lines, and the number of bytes accumulated in
             the group output so far.
         """
-        target_text = ''.join(chunks)
-        sha1 = sha_string(target_text)
+        # TODO: Change this to a bytes interface, since the output is now a
+        #       bytes interface anyway.
+        bytes = ''.join(chunks)
+        sha1 = sha_string(bytes)
         if key[-1] is None:
             key = key[:-1] + ('sha1:' + sha1,)
         label = '\x00'.join(key)
-        input_len = len(target_text)
+        input_len = len(bytes)
         # By having action/label/sha1/len, we can parse the group if the index
         # was ever destroyed, we have the key in 'label', we know the final
         # bytes are valid from sha1, and we know where to find the end of this
@@ -172,9 +174,9 @@
             raise AssertionError('_source_offset != endpoint'
                 ' somehow the DeltaIndex got out of sync with'
                 ' the output lines')
-        delta = self._delta_index.make_delta(target_text)
-        if (delta is None
-            or len(delta) > len(target_text) / 2):
+        max_delta_size = len(bytes) / 2
+        delta = self._delta_index.make_delta(bytes, max_delta_size)
+        if (delta is None):
             # We can't delta (perhaps source_text is empty)
             # so mark this as an insert
             if _NO_LABELS:
@@ -183,8 +185,8 @@
                 new_chunks.insert(0, 'fulltext\n')
                 new_chunks.append('len:%s\n' % (input_len,))
             unadded_bytes = sum(map(len, new_chunks))
-            self._delta_index.add_source(target_text, unadded_bytes)
-            new_chunks.append(target_text)
+            self._delta_index.add_source(bytes, unadded_bytes)
+            new_chunks.append(bytes)
         else:
             if _NO_LABELS:
                 new_chunks = ['d']
@@ -605,12 +607,11 @@
             if record.storage_kind == 'absent':
                 raise errors.RevisionNotPresent(record.key, self)
             try:
-                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
+                bytes = record.get_bytes_as('fulltext')
             except errors.UnavailableRepresentation:
                 adapter_key = record.storage_kind, 'fulltext'
                 adapter = get_adapter(adapter_key)
                 bytes = adapter.get_bytes(record)
-                lines = osutils.split_lines(bytes)
             soft = False
             if len(record.key) > 1:
                 prefix = record.key[0]
@@ -625,7 +626,7 @@
                         groups += 1
                 last_prefix = prefix
             found_sha1, end_point = self._compressor.compress(record.key,
-                lines, record.sha1, soft=soft)
+                [bytes], record.sha1, soft=soft)
             if record.key[-1] is None:
                 key = record.key[:-1] + ('sha1:' + found_sha1,)
             else:



More information about the bazaar-commits mailing list