Rev 24: Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group. in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

Robert Collins robertc at robertcollins.net
Tue Feb 3 01:27:02 GMT 2009


At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

------------------------------------------------------------
revno: 24
revision-id: robertc at robertcollins.net-20090203012650-ljj5dts8i69won65
parent: robertc at robertcollins.net-20090119054653-khm0iyeyfv47hzb3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2009-02-03 12:26:50 +1100
message:
  Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
=== modified file 'DESIGN'
--- a/DESIGN	2008-07-05 18:15:40 +0000
+++ b/DESIGN	2009-02-03 01:26:50 +0000
@@ -73,7 +73,7 @@
 once the codebase is up and running, we can tweak this to
 
 Very small objects - no delta? If they are combined with a larger zlib object
-why not?
+why not? (Answer: because zlib's window is really small)
 
 Other objects - group by fileid (gives related texts a chance, though using a
 file name would be better long term as e.g. COPYING and COPYING from different

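The DESIGN answer above is easy to check outside the plugin. Here is a
minimal sketch (illustrative only, not part of this patch, standard
library only): an incompressible 8KB block repeated back-to-back
compresses to roughly one copy plus a back-reference, but the same
block repeated after 40KB of other data falls outside deflate's 32KB
window, so zlib has to store the second copy nearly verbatim.

import os
import zlib

block = os.urandom(8 * 1024)    # incompressible on its own
filler = os.urandom(40 * 1024)  # pushes the repeat past the 32KB window

near = len(zlib.compress(block + block))
far = len(zlib.compress(block + filler + block))
print "near:", near  # roughly one block: second copy is a window match
print "far:", far    # roughly all three blocks: no match possible
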
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-01-19 05:46:53 +0000
+++ b/groupcompress.py	2009-02-03 01:26:50 +0000
@@ -40,6 +40,7 @@
     split_lines,
     )
 from bzrlib.btree_index import BTreeBuilder
+from bzrlib.lru_cache import LRUSizeCache
 from bzrlib.plugins.groupcompress import equivalence_table
 from bzrlib.tsort import topo_sort
 from bzrlib.versionedfile import (
@@ -315,6 +316,7 @@
         self._access = access
         self._delta = delta
         self._unadded_refs = {}
+        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
 
     def add_lines(self, key, parents, lines, parent_texts=None,
         left_matching_blocks=None, nostore_sha=None, random_id=False,
@@ -487,23 +489,30 @@
                 parents = self._unadded_refs[key]
             else:
                 index_memo, _, parents, (method, _) = locations[key]
-                # read the group
                 read_memo = index_memo[0:3]
-                zdata = self._access.get_raw_records([read_memo]).next()
-                # decompress - whole thing; this is a bug.
-                decomp = zlib.decompressobj()
-                plain = decomp.decompress(zdata, index_memo[4])
+                # get the group:
+                try:
+                    plain = self._group_cache[read_memo]
+                except KeyError:
+                    # read the group
+                    zdata = self._access.get_raw_records([read_memo]).next()
+                    # decompress - whole thing - this is not a bug, as it
+                    # permits caching. We might want to store the partially
+                    # decompressed group and decompressor object, so that recent
+                    # texts are not penalised by big groups.
+                    decomp = zlib.decompressobj()
+                    plain = decomp.decompress(zdata)
+                    self._group_cache[read_memo] = plain
                 # cheapo debugging:
                 # print len(zdata), len(plain)
-                # parse - requires split_lines, better to have byte offsets here.
+                # parse - requires split_lines, better to have byte offsets
+                # here (but not by much - we only split the region for the
+                # recipe, and we often want to end up with lines anyway).
                 delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                 label, sha1, delta = parse(delta_lines)
                 if label != key:
                     raise AssertionError("wrong key: %r, wanted %r" % (label, key))
-                basis = plain[:index_memo[3]]
-                # basis = StringIO(basis).readlines()
-                #basis = split_lines(plain[:last_end])
-                lines = apply_delta(basis, delta)
+                lines = apply_delta(plain, delta)
             bytes = ''.join(lines)
             yield FulltextContentFactory(key, parents, sha1, bytes)
             
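The read path now inflates each group at most once and serves later
texts from an in-memory cache bounded by total byte size, so a 50MB
budget of plain group texts stays hot however many groups that covers.
bzrlib's LRUSizeCache provides that structure; the standalone sketch
below shows the same pattern under stated assumptions: the names
SizeBoundedCache and get_group_plain are hypothetical, and `access` is
assumed to expose get_raw_records as in the patched code.

import zlib
from collections import OrderedDict

class SizeBoundedCache(object):
    """LRU mapping that evicts oldest entries once total bytes exceed max_size."""

    def __init__(self, max_size=50 * 1024 * 1024):
        self._max_size = max_size
        self._total = 0
        self._items = OrderedDict()

    def __getitem__(self, key):
        value = self._items.pop(key)   # raises KeyError if absent
        self._items[key] = value       # re-insert as most recently used
        return value

    def __setitem__(self, key, value):
        if key in self._items:
            self._total -= len(self._items.pop(key))
        self._items[key] = value
        self._total += len(value)
        while self._total > self._max_size and len(self._items) > 1:
            _, old = self._items.popitem(last=False)  # evict least recent
            self._total -= len(old)

def get_group_plain(cache, access, read_memo):
    # Decompress the whole group once; later keys in the same group
    # hit the cache instead of re-reading and re-inflating.
    try:
        return cache[read_memo]
    except KeyError:
        zdata = access.get_raw_records([read_memo]).next()
        plain = zlib.decompressobj().decompress(zdata)
        cache[read_memo] = plain
        return plain

Keying the cache on read_memo (the group's location, index_memo[0:3])
rather than on the text key means every text in a group shares one
cached plain block, which is where the 5x whole-group speedup over
knits comes from.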