Rev 4: Sort corpus by reverse size and allow a size cap. in http://people.ubuntu.com/~robertc/baz2.0/plugins/compressbench/trunk

Robert Collins robertc at robertcollins.net
Thu Jan 22 01:08:22 GMT 2009


At http://people.ubuntu.com/~robertc/baz2.0/plugins/compressbench/trunk

------------------------------------------------------------
revno: 4
revision-id: robertc at robertcollins.net-20090122010818-htr2chqbsa5l13y3
parent: robertc at robertcollins.net-20090120031851-18uikjx9ylg1xt6j
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Thu 2009-01-22 12:08:18 +1100
message:
  Sort corpus by reverse size and allow a size cap.
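
(For readers skimming the diff below: the heart of the change is a
largest-first sort of the corpus, a synthetic parent chain following that
order, and an optional byte cap. A minimal standalone sketch of the idea,
with illustrative names -- Factory stands in for DiskContentFactory and
order_and_cap is not part of the plugin; note the actual commit applies the
cap during extraction in make_corpus rather than after sorting:

    # Illustrative sketch only; Factory stands in for DiskContentFactory.
    class Factory(object):
        def __init__(self, key, length):
            self.key = key
            self.parents = None
            self._length = length

    def order_and_cap(factories, limit=None):
        # Largest texts first, so each text compresses against a bigger one.
        factories.sort(key=lambda f: f._length, reverse=True)
        total = 0
        parents = ()
        kept = []
        for factory in factories:
            factory.parents = parents    # chain: parent is the next-larger text
            parents = (factory.key,)
            kept.append(factory)
            total += factory._length
            if limit and total > limit:  # optional cap on total corpus size
                break
        return total, kept
)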
=== modified file 'bench.py'
--- a/bench.py	2009-01-20 03:18:51 +0000
+++ b/bench.py	2009-01-22 01:08:18 +0000
@@ -66,15 +66,17 @@
         no parent information, None (as opposed to () for an empty list of
         parents).
     :ivar _path: The path on disk of the file containing the content.
+    :ivar _length: The length of the content.
     """
 
-    def __init__(self, key, parents, sha1, path):
+    def __init__(self, key, parents, sha1, path, length):
         """Create a DiskContentFactory."""
         self.sha1 = sha1
         self.storage_kind = 'fulltext'
         self.key = key
         self.parents = parents
         self._path = path
+        self._length = length
 
     def get_bytes_as(self, storage_kind):
         # this probably needs to measure the time to read the content to
@@ -97,20 +99,28 @@
 
     takes_options = [
         ListOption('delta', type=str, help='Compressors to test. (gc, knit)'),
+        Option('limit', type=int, help='Cap the corpus at limit bytes'),
         ]
 
-    def run(self, delta=None):
+    def run(self, delta=None, limit=None):
         if not delta:
             delta = ['gc', 'knit']
         # extract source to full text files on disk with metadata held in memory
         source_repo = Repository.open('/home/robertc/source/baz')
         print "Extracting corpus to test with"
-        total_bytes, stream, cleanup = self.make_corpus(source_repo)
+        total_bytes, stream, cleanup = self.make_corpus(source_repo, limit)
         try:
             print "Corpus size", total_bytes, "bytes in", len(stream), "texts"
+            # Create an arbitrary compression order:
             # set arbitrary parents to match the compression order (avoids having
             # to change knits to handle it better during these benchmarks).
-            # compress the contents
+            # --- this sorts by largest->smallest text ---
+            print "Size sorted corpus"
+            stream.sort(key=lambda x:x._length, reverse=True)
+            parents = ()
+            for factory in stream:
+                factory.parents = parents
+                parents = (factory.key,)
             # Now for each compression type benchmark it:
             factories = {'knit':make_pack_factory, 'gc':make_gc_factory}
             for label in delta:
@@ -182,9 +192,10 @@
         sd = math.sqrt(sum((time-mean)**2 for time in per_k_times) / count)
         print "dev", sd
 
-    def make_corpus(self, repo):
+    def make_corpus(self, repo, limit):
         """Create a corpus to compress as a record_stream.
 
+        :param limit: Byte count at which to stop adding to the corpus.
         :return: total_bytes, a_stream, a_cleanup_callback
         """
         total_bytes = 0
@@ -201,11 +212,14 @@
                     'unordered', True):
                     bytes = record.get_bytes_as('fulltext')
                     t.put_bytes("%s" % serial, bytes)
-                    result.append(DiskContentFactory(
+                    factory = DiskContentFactory(
                         record.key, record.parents, record.sha1,
-                        "%s/%s" % (workingdir, serial)))
-                    total_bytes += len(bytes)
+                        "%s/%s" % (workingdir, serial), len(bytes))
+                    result.append(factory)
+                    total_bytes += factory._length
                     serial += 1
+                    if limit and total_bytes > limit:
+                        break
             finally:
                 repo.unlock()
         except:
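
With this revision the cap can be set from the command line. The --delta and
--limit flags follow from the ListOption('delta') and Option('limit')
declarations above; the command name is not shown in this diff, so the one
used here is only a placeholder:

    bzr compress-bench --delta gc --delta knit --limit 10000000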



