Rev 3879: Change .compress() to return the start-point. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

John Arbash Meinel john at arbash-meinel.com
Thu Mar 19 23:16:02 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

------------------------------------------------------------
revno: 3879
revision-id: john at arbash-meinel.com-20090319225110-hfslu08ridcsc5xi
parent: john at arbash-meinel.com-20090319223546-odxel2nktm700d7e
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: hack3
timestamp: Thu 2009-03-19 17:51:10 -0500
message:
  Change .compress() to return the start-point.
  
  Now that compress() can return an entry pointing at an existing entry,
  we need to record that correctly in the index.
  It also helps with the 'null' entries.
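
For reference, the new contract can be sketched as follows. This is a
minimal, hypothetical illustration, not the real GroupCompressor; the
_SketchCompressor name and its bookkeeping are stand-ins. The point is
that .compress() now returns a five-tuple
(sha1, start_point, end_point, type, length) instead of the old
four-tuple (sha1, end_point, type, length):

    class _SketchCompressor(object):
        """Illustrative stand-in for GroupCompressor (hypothetical)."""

        def __init__(self):
            self.endpoint = 0  # current write offset into the output block

        def compress(self, key, bytes):
            if not bytes:
                # NULL entry: both offsets are zero, so the index can
                # record '0 0' without a caller-side special case.
                return None, 0, 0, 'fulltext', 0
            start = self.endpoint        # the new start-point return value
            self.endpoint += len(bytes)  # pretend we appended a fulltext
            return 'dummy-sha1', start, self.endpoint, 'fulltext', len(bytes)

A deduplicated entry can likewise return the start and end of the
existing copy rather than self.endpoint, which is what the 'sha1_dupe'
branch in the diff below does.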
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-19 22:35:46 +0000
+++ b/bzrlib/groupcompress.py	2009-03-19 22:51:10 +0000
@@ -339,8 +339,8 @@
         :param sha1: TODO (should we validate only when sha1 is supplied?)
         :return: The bytes for the content
         """
-        if start == end == 0:
-            return ''
+        if start == end == 0: # NULL entry
+            return None, ''
         # Make sure we have enough bytes for this record
         # TODO: if we didn't want to track the end of this entry, we could
         #       _ensure_content(start+enough_bytes_for_type_and_length), and
@@ -376,7 +376,8 @@
         else:
             if end != content_start + content_len:
                 raise ValueError('end != len according to field header'
-                    ' %s != %s' % (end, content_start + content_len))
+                    ' %s != %s, %s' % (end, content_start + content_len,
+                    (start, content_len)))
         entry = GroupCompressBlockEntry(key, type, sha1=None,
                                         start=start, length=end-start)
         content = self._content[content_start:end]
@@ -764,6 +765,7 @@
         self._delta_index = _groupcompress_pyx.DeltaIndex()
         self._block = GroupCompressBlock()
         self._entries_by_sha1 = {}
+        self._empty_entries = 0
         self._deduped_entries = 0
 
     def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
@@ -786,7 +788,11 @@
         :seealso VersionedFiles.add_lines:
         """
         if not bytes: # empty, like a dir entry, etc
-            return None, 0, 'fulltext', 0
+            self._empty_entries += 1
+            self._block.add_entry(key, type='empty',
+                                  sha1=None, start=0,
+                                  length=0)
+            return None, 0, 0, 'fulltext', 0
         sha1 = None
         # we assume someone knew what they were doing when they passed it in
         if expected_sha is not None:
@@ -822,7 +828,7 @@
             self._block.add_entry(key, type=type,
                                   sha1=sha1, start=start,
                                   length=length)
-            return sha1, self.endpoint, 'sha1_dupe', 0
+            return sha1, start, start+length, 'sha1_dupe', length
         max_delta_size = len(bytes) / 2
         delta = self._delta_index.make_delta(bytes, max_delta_size)
         if (delta is None):
@@ -846,6 +852,7 @@
                               start=self.endpoint, length=length)
         if self._check_for_dupes:
             self._entries_by_sha1[sha1] = (type, self.endpoint, length)
+        start = self.endpoint
         delta_start = (self.endpoint, len(self.lines))
         self.num_keys += 1
         self.output_chunks(new_chunks)
@@ -856,7 +863,7 @@
             raise AssertionError('the delta index is out of sync'
                 ' with the output lines %s != %s'
                 % (self._delta_index._source_offset, self.endpoint))
-        return sha1, self.endpoint, type, length
+        return sha1, start, self.endpoint, type, length
 
     def extract(self, key):
         """Extract a key previously added to the compressor.
@@ -1424,14 +1431,16 @@
         self._compressor = GroupCompressor(self._check_for_dupes)
         self._unadded_refs = {}
         keys_to_add = []
-        basis_end = 0
-        total_deduped = [0]
+        total_deduped = [0, 0]
         def flush():
-            if self._compressor._deduped_entries > 0:
+            if (self._compressor._deduped_entries > 0
+                or self._compressor._empty_entries > 0):
                 total_deduped[0] += self._compressor._deduped_entries
-                trace.note('Dedupped %d out of %d entries',
+                total_deduped[1] += self._compressor._empty_entries
+                trace.note('Deduped %d out of %d entries, %d empty',
                            self._compressor._deduped_entries,
-                           len(self._compressor._block._entries))
+                           len(self._compressor._block._entries),
+                           self._compressor._empty_entries)
             bytes = self._compressor.flush().to_bytes()
             index, start, length = self._access.add_raw_records(
                 [(None, len(bytes))], bytes)[0]
@@ -1491,7 +1500,7 @@
             if max_fulltext_len < len(bytes):
                 max_fulltext_len = len(bytes)
                 max_fulltext_prefix = prefix
-            (found_sha1, end_point, type,
+            (found_sha1, start_point, end_point, type,
              length) = self._compressor.compress(record.key,
                 bytes, record.sha1, soft=soft,
                 nostore_sha=nostore_sha)
@@ -1539,9 +1548,8 @@
             if start_new_block:
                 self._compressor.pop_last()
                 flush()
-                basis_end = 0
                 max_fulltext_len = len(bytes)
-                (found_sha1, end_point, type,
+                (found_sha1, start_point, end_point, type,
                  length) = self._compressor.compress(record.key,
                     bytes, record.sha1)
                 last_fulltext_len = length
@@ -1551,17 +1559,14 @@
                 key = record.key
             self._unadded_refs[key] = record.parents
             yield found_sha1
-            if length == 0:
-                keys_to_add.append((key, '0 0', (record.parents,)))
-            else:
-                keys_to_add.append((key, '%d %d' % (basis_end, end_point),
-                    (record.parents,)))
-            basis_end = end_point
+            keys_to_add.append((key, '%d %d' % (start_point, end_point),
+                (record.parents,)))
         if len(keys_to_add):
             flush()
         self._compressor = None
-        if total_deduped[0] > 0:
-            trace.note('Total deduped = %d\n', total_deduped[0])
+        if total_deduped[0] > 0 or total_deduped[1] > 0:
+            trace.note('Total deduped = %d, total empty = %d\n',
+                       total_deduped[0], total_deduped[1])
 
     def iter_lines_added_or_present_in_keys(self, keys, pb=None):
         """Iterate over the lines in the versioned files from keys.

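On the caller side, the effect on index entries can be sketched like
this, assuming the five-tuple contract above; keys_to_add and the
sample data are illustrative placeholders, not the real insert loop:

    compressor = _SketchCompressor()
    keys_to_add = []
    for key, parents, data in [(('a',), (), 'payload'), (('b',), (), '')]:
        (sha1, start_point, end_point,
         kind, length) = compressor.compress(key, data)
        # The index value is built directly from the returned offsets.
        # The old special case for length == 0 is gone: an empty entry
        # comes back as start_point == end_point == 0, which already
        # serializes as '0 0'.
        keys_to_add.append((key, '%d %d' % (start_point, end_point),
            (parents,)))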

