Rev 3879: Merge the dupes and empty record handling. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

John Arbash Meinel john at arbash-meinel.com
Fri Mar 20 17:12:33 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

------------------------------------------------------------
revno: 3879
revision-id: john at arbash-meinel.com-20090320170430-fvgnzc1amp7rmd11
parent: john at arbash-meinel.com-20090320154811-znms4757w29gmc4b
parent: john at arbash-meinel.com-20090319225110-hfslu08ridcsc5xi
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: hack3
timestamp: Fri 2009-03-20 12:04:30 -0500
message:
  Merge the dupes and empty record handling.
modified:
  bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
    ------------------------------------------------------------
    revno: 3877.1.2
    revision-id: john at arbash-meinel.com-20090319225110-hfslu08ridcsc5xi
    parent: john at arbash-meinel.com-20090319223546-odxel2nktm700d7e
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: hack3
    timestamp: Thu 2009-03-19 17:51:10 -0500
    message:
      Change .compress() to return the start-point.
      
      Now that compress() can return an entry that just points at an existing
      entry, we need to record that starting offset correctly in the index.
      It also helps for the 'null' (empty) entries.
    modified:
      bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
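To make the new return shape concrete, here is a minimal sketch of how a caller
unpacks the five-element tuple. The record data and key names are invented for
illustration (and it needs the patched bzrlib with its compiled extensions);
only the tuple shape and the '%d %d' index value mirror the diff further below.

    import hashlib
    from bzrlib.groupcompress import GroupCompressor

    text = b'the same text can appear under more than one key\n'
    records = [(('file-id', 'rev-1'), text, hashlib.sha1(text).hexdigest())]

    compressor = GroupCompressor()
    keys_to_add = []
    for key, text_bytes, expected_sha in records:
        (found_sha1, start_point, end_point,
         kind, length) = compressor.compress(key, text_bytes, expected_sha)
        # start_point is now reported explicitly, so dupe and empty entries can
        # record their real offsets instead of "wherever the last entry ended".
        keys_to_add.append((key, '%d %d' % (start_point, end_point), (None,)))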
    ------------------------------------------------------------
    revno: 3877.1.1
    revision-id: john at arbash-meinel.com-20090319223546-odxel2nktm700d7e
    parent: john at arbash-meinel.com-20090319194720-4esxj7gnrmfaykww
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: hack3
    timestamp: Thu 2009-03-19 17:35:46 -0500
    message:
      Some hacky code to check for:
      a) empty records, like directories and symlinks. This might decrease index
      pressure a tiny bit, as they can be recorded as (0, 0, 0, 0).
      b) duplicate entries in the same gc block, rather than inserting another
      copy of the same delta (or a delta to the identical text).

      We seem to get quite a few hits for both (a) and (b), so it is probably
      worth looking at them more closely.
    modified:
      bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
      bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
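A standalone sketch (plain Python, not the bzrlib implementation) of the two
shortcuts described above: an empty text is recorded as a zero-length entry
without touching the delta index, and a text whose sha1 was already seen in the
current block reuses the earlier entry's coordinates. The helper name and the
entries_by_sha1 dict are invented; the real logic lives in
GroupCompressor.compress() in the patch below.

    import hashlib

    def maybe_shortcut(key, text_bytes, entries_by_sha1, block_entries):
        # Return a (sha1, start, end, kind, length) tuple when compression can
        # be skipped for this text, or None to fall through to the delta path.
        if not text_bytes:
            # a) empty record (directory / symlink): zero-length entry
            block_entries[key] = ('empty', 0, 0)
            return None, 0, 0, 'fulltext', 0
        sha1 = hashlib.sha1(text_bytes).hexdigest()
        if sha1 in entries_by_sha1:
            # b) identical text already stored in this block: point at it
            kind, start, length = entries_by_sha1[sha1]
            block_entries[key] = (kind, start, length)
            return sha1, start, start + length, 'sha1_dupe', length
        return None

    entries_by_sha1 = {}
    block_entries = {}
    print(maybe_shortcut(('dir-id', 'rev-1'), b'', entries_by_sha1, block_entries))
    # pretend b'abc' was already stored at offset 10 with length 3
    entries_by_sha1[hashlib.sha1(b'abc').hexdigest()] = ('fulltext', 10, 3)
    print(maybe_shortcut(('file-id', 'rev-2'), b'abc', entries_by_sha1, block_entries))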
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-20 15:48:11 +0000
+++ b/bzrlib/groupcompress.py	2009-03-20 17:04:30 +0000
@@ -730,12 +730,13 @@
        left side.
     """
 
-    def __init__(self):
+    def __init__(self, check_for_dupes=False):
         """Create a GroupCompressor."""
         # Consider seeding the lines with some sort of GC Start flag, or
         # putting it as part of the output stream, rather than in the
         # compressed bytes.
         self.lines = []
+        self._check_for_dupes = check_for_dupes
         self.endpoint = 0
         self.input_bytes = 0
         self.num_keys = 0
@@ -743,6 +744,9 @@
         self._last = None
         self._delta_index = _groupcompress_pyx.DeltaIndex()
         self._block = GroupCompressBlock()
+        self._entries_by_sha1 = {}
+        self._empty_entries = 0
+        self._deduped_entries = 0
 
     def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
         """Compress lines with label key.
@@ -763,10 +767,18 @@
             the group output so far.
         :seealso VersionedFiles.add_lines:
         """
+        if not bytes: # empty, like a dir entry, etc
+            self._empty_entries += 1
+            self._block.add_entry(key, type='empty',
+                                  sha1=None, start=0,
+                                  length=0)
+            return None, 0, 0, 'fulltext', 0
         sha1 = None
         # we assume someone knew what they were doing when they passed it in
         if expected_sha is not None:
             sha1 = expected_sha
+        if self._check_for_dupes and sha1 is None:
+            sha1 = osutils.sha_string(bytes)
         if nostore_sha is not None:
             if sha1 is None:
                 sha1 = osutils.sha_string(bytes)
@@ -790,6 +802,13 @@
             raise AssertionError('_source_offset != endpoint'
                 ' somehow the DeltaIndex got out of sync with'
                 ' the output lines')
+        if self._check_for_dupes and sha1 in self._entries_by_sha1:
+            self._deduped_entries += 1
+            (type, start, length) = self._entries_by_sha1[sha1]
+            self._block.add_entry(key, type=type,
+                                  sha1=sha1, start=start,
+                                  length=length)
+            return sha1, start, start+length, 'sha1_dupe', length
         max_delta_size = len(bytes) / 2
         delta = self._delta_index.make_delta(bytes, max_delta_size)
         if (delta is None):
@@ -811,6 +830,9 @@
                 self._delta_index.add_delta_source(delta, len_mini_header)
         self._block.add_entry(key, type=type, sha1=sha1,
                               start=self.endpoint, length=length)
+        if self._check_for_dupes:
+            self._entries_by_sha1[sha1] = (type, self.endpoint, length)
+        start = self.endpoint
         delta_start = (self.endpoint, len(self.lines))
         self.num_keys += 1
         self.output_chunks(new_chunks)
@@ -821,7 +843,7 @@
             raise AssertionError('the delta index is out of sync'
                 'with the output lines %s != %s'
                 % (self._delta_index._source_offset, self.endpoint))
-        return sha1, self.endpoint, type, length
+        return sha1, start, self.endpoint, type, length
 
     def extract(self, key):
         """Extract a key previously added to the compressor.
@@ -940,7 +962,8 @@
 class GroupCompressVersionedFiles(VersionedFiles):
     """A group-compress based VersionedFiles implementation."""
 
-    def __init__(self, index, access, delta=True):
+    def __init__(self, index, access, delta=True,
+                 check_for_dupes=False):
         """Create a GroupCompressVersionedFiles object.
 
         :param index: The index object storing access and graph data.
@@ -951,6 +974,7 @@
         self._access = access
         self._delta = delta
         self._unadded_refs = {}
+        self._check_for_dupes = check_for_dupes
         self._group_cache = LRUSizeCache(max_size=50*1024*1024)
         self._fallback_vfs = []
 
@@ -1384,11 +1408,19 @@
                 return adapter
         # This will go up to fulltexts for gc to gc fetching, which isn't
         # ideal.
-        self._compressor = GroupCompressor()
+        self._compressor = GroupCompressor(self._check_for_dupes)
         self._unadded_refs = {}
         keys_to_add = []
-        basis_end = 0
+        total_deduped = [0, 0]
         def flush():
+            if (self._compressor._deduped_entries > 0
+                or self._compressor._empty_entries > 0):
+                total_deduped[0] += self._compressor._deduped_entries
+                total_deduped[1] += self._compressor._empty_entries
+                trace.note('Dedupped %d out of %d entries, %d empty',
+                           self._compressor._deduped_entries,
+                           len(self._compressor._block._entries),
+                           self._compressor._empty_entries)
             bytes = self._compressor.flush().to_bytes()
             index, start, length = self._access.add_raw_records(
                 [(None, len(bytes))], bytes)[0]
@@ -1398,7 +1430,7 @@
             self._index.add_records(nodes, random_id=random_id)
             self._unadded_refs = {}
             del keys_to_add[:]
-            self._compressor = GroupCompressor()
+            self._compressor = GroupCompressor(self._check_for_dupes)
 
         last_prefix = None
         last_fulltext_len = None
@@ -1448,7 +1480,7 @@
             if max_fulltext_len < len(bytes):
                 max_fulltext_len = len(bytes)
                 max_fulltext_prefix = prefix
-            (found_sha1, end_point, type,
+            (found_sha1, start_point, end_point, type,
              length) = self._compressor.compress(record.key,
                 bytes, record.sha1, soft=soft,
                 nostore_sha=nostore_sha)
@@ -1496,9 +1528,8 @@
             if start_new_block:
                 self._compressor.pop_last()
                 flush()
-                basis_end = 0
                 max_fulltext_len = len(bytes)
-                (found_sha1, end_point, type,
+                (found_sha1, start_point, end_point, type,
                  length) = self._compressor.compress(record.key,
                     bytes, record.sha1)
                 last_fulltext_len = length
@@ -1508,12 +1539,14 @@
                 key = record.key
             self._unadded_refs[key] = record.parents
             yield found_sha1
-            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
+            keys_to_add.append((key, '%d %d' % (start_point, end_point),
                 (record.parents,)))
-            basis_end = end_point
         if len(keys_to_add):
             flush()
         self._compressor = None
+        if total_deduped[0] > 0 or total_deduped[1] > 0:
+            trace.note('Total deduped = %d, total empty = %d\n',
+                       total_deduped[0], total_deduped[1])
 
     def iter_lines_added_or_present_in_keys(self, keys, pb=None):
         """Iterate over the lines in the versioned files from keys.

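One design note on the hunks above: total_deduped is a two-element list rather
than two plain counters because the nested flush() closure has to update totals
defined in the enclosing method, and the Python 2 that bzrlib targets has no
'nonlocal' statement. A tiny standalone illustration of the pattern (all names
invented):

    def insert_blocks(per_block_stats):
        total = [0, 0]  # [deduped entries, empty entries]

        def flush(deduped, empty):
            # the closure can mutate the list in place, but could not rebind
            # two plain integers without 'nonlocal' (Python 3 only)
            total[0] += deduped
            total[1] += empty

        for deduped, empty in per_block_stats:
            flush(deduped, empty)
        return tuple(total)

    print(insert_blocks([(3, 1), (0, 2)]))  # -> (3, 3)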
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-03-20 15:48:11 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-03-20 17:04:30 +0000
@@ -348,13 +348,17 @@
                 indices.append(sub_index)
             index = _mod_index.CombinedGraphIndex(indices)
             add_callback = None
+        check_for_dupes = False
+        if index_name == 'text_index':
+            check_for_dupes = True
         vf = GroupCompressVersionedFiles(
             _GCGraphIndex(index,
                           add_callback=add_callback,
                           parents=parents,
                           is_locked=self._pack_collection.repo.is_locked),
             access=access,
-            delta=delta)
+            delta=delta,
+            check_for_dupes=check_for_dupes)
         return vf
 
     def _build_vfs(self, index_name, parents, delta):
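And a hedged end-to-end sketch of what the repofmt change amounts to: only the
text index asks its GroupCompressVersionedFiles to do the extra sha1
bookkeeping, since file texts are where empty and duplicate entries are
expected. The helper below is illustrative only (the real index and access
objects are not constructed here); just the check_for_dupes keyword mirrors
the patch above.

    from bzrlib.groupcompress import GroupCompressVersionedFiles

    def build_vf_sketch(index_name, index, access, delta):
        # only file texts opt in; revisions, inventories and signatures are
        # not expected to contain empty or duplicate entries
        check_for_dupes = (index_name == 'text_index')
        return GroupCompressVersionedFiles(index, access, delta=delta,
                                           check_for_dupes=check_for_dupes)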
