Rev 5: Add git and beta dulwich compressor support, in http://people.ubuntu.com/~robertc/baz2.0/plugins/compressbench/trunk

Robert Collins robertc at robertcollins.net
Fri Jan 23 06:29:06 GMT 2009


At http://people.ubuntu.com/~robertc/baz2.0/plugins/compressbench/trunk

------------------------------------------------------------
revno: 5
revision-id: robertc at robertcollins.net-20090123062854-a2oezj8kln5m0aql
parent: robertc at robertcollins.net-20090122010818-htr2chqbsa5l13y3
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Fri 2009-01-23 17:28:54 +1100
message:
  Add git and beta dulwich compressor support.
=== modified file 'bench.py'
--- a/bench.py	2009-01-22 01:08:18 +0000
+++ b/bench.py	2009-01-23 06:28:54 +0000
@@ -94,11 +94,113 @@
             source.close()
 
 
+from subprocess import Popen, PIPE
+from bzrlib.btree_index import BTreeBuilder
+from bzrlib.versionedfile import FulltextContentFactory, VersionedFiles
+class GitVersionedFiles(VersionedFiles):
+
+    def __init__(self, transport):
+        self._index = BTreeBuilder()
+        self._transport = transport
+        self._path = transport.local_abspath('.')
+        self._run_git(['init'])
+        transport.put_bytes('content', '')
+        self._run_git(['add', 'content'])
+
+    def _run_git(self, params):
+        process = Popen(['git'] + params, cwd=self._path, stdin=PIPE,
+            stdout=PIPE, stderr=PIPE)
+        out, err = process.communicate()
+        rc = process.returncode
+        assert rc == 0, 'bad result %d %r %r' % (rc, out, err)
+        return out, err
+
+    def get_record_stream(self, keys, ordering, include_delta_closure):
+        for key in keys:
+            blob_id = self._index.iter_entries([key]).next()[2]
+            content, _ = self._run_git(['cat-file', '-p', blob_id])
+            yield FulltextContentFactory(key, (), None, content)
+
+    def insert_record_stream(self, stream):
+        for record in stream:
+            self._transport.put_bytes('content', record.get_bytes_as('fulltext'))
+            # Save the content
+            out,_ = self._run_git(['hash-object', 'content'])
+            blob_id = out[:-1]
+            # record a tree so pack will work as normal with ordering etc.
+            # NB: this probably hashes the file content twice (hash-object, then commit).
+            self._run_git(['commit', '-m', 'foo', 'content'])
+            self._index.add_node(record.key, blob_id)
+
+    def keys(self):
+        result = set()
+        for node in self._index.iter_all_entries():
+            result.add(node[1])
+        return result
+
+    def _pack(self):
+        self._run_git(['repack', '-a', '-d', '--depth=200', '--window=200'])
+
+    def _pack_name(self):
+        paths = self._transport.list_dir('.git/objects/pack')
+        for path in paths:
+            if path.endswith('.pack'):
+                return '.git/objects/pack/' + path
+
+    def size(self):
+        return self._transport.stat(self._pack_name()).st_size
+
+
+class GitWriterThunk(object):
+
+    def  __init__(self, vf):
+        self.vf = vf
+
+    def close(self):
+        pass
+
+    def end(self):
+        self.vf._pack()
+
+
+def make_git_factory(delta, graph, keylength):
+    """Make a VF factory based on invoking a git commit for each step."""
+    def factory(transport):
+        result = GitVersionedFiles(transport)
+        result.writer = GitWriterThunk(result)
+        result.stream = result.writer
+        return result
+    return factory
+
+
+def make_dulwich_factory(delta, graph, keylength):
+    """Make a dulwich-using git backed VF."""
+    from dulwich.pack import Pack
+    class DulwichVersionedFiles(GitVersionedFiles):
+        """A dulwich using VF implementation."""
+        def __init__(self, transport):
+            GitVersionedFiles.__init__(self, transport)
+            self._pack_obj = None
+        def get_record_stream(self, keys, ordering, include_delta_closure):
+            if self._pack_obj is None:
+                self._pack_obj = Pack(self._transport.local_abspath(self._pack_name()[:-5]))
+            for key in keys:
+                blob_id = self._index.iter_entries([key]).next()[2]
+                content = self._pack_obj.get_raw(blob_id)
+                yield FulltextContentFactory(key, (), None, content)
+
+    def factory(transport):
+        result = DulwichVersionedFiles(transport)
+        result.writer = GitWriterThunk(result)
+        result.stream = result.writer
+        return result
+    return factory
+
 class cmd_compressbench(Command):
     """Benchmark compression performance."""
 
     takes_options = [
-        ListOption('delta', type=str, help='Compressors to test. (gc, knit)'),
+        ListOption('delta', type=str, help='Compressors to test. (gc, knit, git, dulwich)'),
         Option('limit', type=int, help='Cap the corpus at limit bytes'),
         ]
 
@@ -122,14 +224,19 @@
                 factory.parents = parents
                 parents = (factory.key,)
             # Now for each compression type benchmark it:
-            factories = {'knit':make_pack_factory, 'gc':make_gc_factory}
+            factories = {'knit':make_pack_factory, 'gc':make_gc_factory,
+                'git':make_git_factory, 'dulwich':make_dulwich_factory}
             for label in delta:
                 factory = factories[label]
                 print "Testing", label, "compression"
                 vf, vf_t, vf_cleanup = self.test_compressor(stream, factory)
                 try:
                     # decompress
-                    print "Compressed size", vf_t.stat('newpack').st_size
+                    try:
+                        size = vf.size()
+                    except AttributeError:
+                        size = vf_t.stat('newpack').st_size
+                    print "Compressed size", size
                     self.test_decompress(vf)
                 finally:
                     vf_cleanup()




More information about the bazaar-commits mailing list