Rev 4: Sort corpus by reverse size and allow a size cap. in http://people.ubuntu.com/~robertc/baz2.0/plugins/compressbench/trunk
Robert Collins
robertc at robertcollins.net
Thu Jan 22 01:08:22 GMT 2009
At http://people.ubuntu.com/~robertc/baz2.0/plugins/compressbench/trunk
------------------------------------------------------------
revno: 4
revision-id: robertc at robertcollins.net-20090122010818-htr2chqbsa5l13y3
parent: robertc at robertcollins.net-20090120031851-18uikjx9ylg1xt6j
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Thu 2009-01-22 12:08:18 +1100
message:
Sort corpus by reverse size and allow a size cap.
=== modified file 'bench.py'
--- a/bench.py 2009-01-20 03:18:51 +0000
+++ b/bench.py 2009-01-22 01:08:18 +0000
@@ -66,15 +66,17 @@
         no parent information, None (as opposed to () for an empty list of
         parents).
     :ivar _path: The path on disk of the file containing the content.
+    :ivar _length: The length of the content.
     """
 
-    def __init__(self, key, parents, sha1, path):
+    def __init__(self, key, parents, sha1, path, length):
         """Create a DiskContentFactory."""
         self.sha1 = sha1
         self.storage_kind = 'fulltext'
         self.key = key
         self.parents = parents
         self._path = path
+        self._length = length
 
     def get_bytes_as(self, storage_kind):
         # this probably needs to measure the time to read the content to
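[A minimal standalone sketch of the factory shape this hunk extends: the attributes mirror the diff, while the body of get_bytes_as is an assumption, since the hunk shows only its first comment line.]

    # Sketch (assumed get_bytes_as body): a fulltext factory that
    # records its payload length up front and reads the file lazily.
    class DiskContentFactory(object):

        def __init__(self, key, parents, sha1, path, length):
            self.sha1 = sha1
            self.storage_kind = 'fulltext'
            self.key = key
            self.parents = parents
            self._path = path
            self._length = length  # lets the corpus be size-sorted cheaply

        def get_bytes_as(self, storage_kind):
            # Read the stored fulltext back from disk on demand.
            a_file = open(self._path, 'rb')
            try:
                return a_file.read()
            finally:
                a_file.close()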
@@ -97,20 +99,28 @@
     takes_options = [
         ListOption('delta', type=str, help='Compressors to test. (gc, knit)'),
+        Option('limit', type=int, help='Cap the corpus at limit bytes'),
         ]
 
-    def run(self, delta=None):
+    def run(self, delta=None, limit=None):
         if not delta:
             delta = ['gc', 'knit']
         # extract source to full text files on disk with metadata held in memory
         source_repo = Repository.open('/home/robertc/source/baz')
         print "Extracting corpus to test with"
-        total_bytes, stream, cleanup = self.make_corpus(source_repo)
+        total_bytes, stream, cleanup = self.make_corpus(source_repo, limit)
         try:
             print "Corpus size", total_bytes, "bytes in", len(stream), "texts"
+            # Create an arbitrary compression order:
             # set arbitrary parents to match the compression order (avoids having
             # to change knits to handle it better during these benchmarks).
-            # compress the contents
+            # --- this sorts by largest->smallest text ---
+            print "Size sorted corpus"
+            stream.sort(key=lambda x:x._length, reverse=True)
+            parents = ()
+            for factory in stream:
+                factory.parents = parents
+                parents = (factory.key,)
             # Now for each compression type benchmark it:
             factories = {'knit':make_pack_factory, 'gc':make_gc_factory}
             for label in delta:
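[The heart of this revision is the two added steps above: sort the factories largest-first on _length, then thread synthetic parent pointers through them so the compression order is carried as an artificial ancestry chain. The same pattern in isolation, with FakeFactory as a hypothetical stand-in for the real record factories:]

    # Sketch: reproduce the sort-and-chain step on dummy records.
    class FakeFactory(object):
        def __init__(self, key, length):
            self.key = key
            self._length = length
            self.parents = None

    stream = [FakeFactory(('small',), 10), FakeFactory(('large',), 300),
              FakeFactory(('medium',), 42)]
    stream.sort(key=lambda x: x._length, reverse=True)  # largest text first
    parents = ()
    for factory in stream:
        factory.parents = parents
        parents = (factory.key,)
    # The largest text ends up with no parents; each smaller text names
    # the one before it as its sole parent.
    print [(f.key, f.parents) for f in stream]
    # -> [(('large',), ()), (('medium',), (('large',),)), (('small',), (('medium',),))]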
@@ -182,9 +192,10 @@
             sd = math.sqrt(sum((time-mean)**2 for time in per_k_times) / count)
             print "dev", sd
 
-    def make_corpus(self, repo):
+    def make_corpus(self, repo, limit):
         """Create a corpus to compress as a record_stream.
 
+        :param limit: Byte count at which to stop adding to the corpus.
         :return: total_bytes, a_stream, a_cleanup_callback
         """
         total_bytes = 0
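[For the two timing lines at the top of this hunk: "dev" is the population standard deviation of the per-kilobyte times. A small sketch with made-up numbers:]

    import math

    per_k_times = [0.012, 0.015, 0.011, 0.014]  # hypothetical timings
    count = len(per_k_times)
    mean = sum(per_k_times) / count
    # Population standard deviation, exactly as computed above.
    sd = math.sqrt(sum((time - mean) ** 2 for time in per_k_times) / count)
    print "mean", mean, "dev", sd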
@@ -201,11 +212,14 @@
                     'unordered', True):
                     bytes = record.get_bytes_as('fulltext')
                     t.put_bytes("%s" % serial, bytes)
-                    result.append(DiskContentFactory(
+                    factory = DiskContentFactory(
                         record.key, record.parents, record.sha1,
-                        "%s/%s" % (workingdir, serial)))
-                    total_bytes += len(bytes)
+                        "%s/%s" % (workingdir, serial), len(bytes))
+                    result.append(factory)
+                    total_bytes += factory._length
                     serial += 1
+                    if limit and total_bytes > limit:
+                        break
             finally:
                 repo.unlock()
         except:
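[And the shape of the new cap in isolation: because the check runs after each append, the corpus can overshoot the limit by at most one text. Record sizes below are made up for illustration.]

    # Sketch: accumulate records until the running total passes the cap.
    def capped_total(record_sizes, limit=None):
        result = []
        total_bytes = 0
        for length in record_sizes:
            result.append(length)
            total_bytes += length
            if limit and total_bytes > limit:
                break  # stop after the text that crossed the cap
        return total_bytes, result

    print capped_total([400, 250, 100, 75], limit=600)
    # -> (650, [400, 250])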