Rev 2752: Make hash based packing methods avoid roundtrips in http://sourcefrog.net/bzr/pack-hashes

Martin Pool mbp at sourcefrog.net
Thu Aug 30 09:34:38 BST 2007


At http://sourcefrog.net/bzr/pack-hashes

------------------------------------------------------------
revno: 2752
revision-id: mbp at sourcefrog.net-20070830083437-gtoy9f4awpnxiyx3
parent: mbp at sourcefrog.net-20070830043341-01q2hwpw6d78btta
committer: Martin Pool <mbp at sourcefrog.net>
branch nick: pack-hashes
timestamp: Thu 2007-08-30 18:34:37 +1000
message:
  Make hash based packing methods avoid roundtrips
modified:
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/tests/test_pack_repository.py test_pack_repository-20070828111851-nof5soh31tidz2dq-1
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2007-08-30 04:33:41 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2007-08-30 08:34:37 +0000
@@ -1185,15 +1185,20 @@
     they simulate - this just provides common delegated implementations.
     """
 
-    def _add_bytes_by_hash(self, new_hash, new_bytes):
-        """Add content to the repository indexed by hash.
+    def _add_bytes_by_hash_multi(self, new_hashes, new_texts):
+        """Add hash-indexed content to the repository.
         """
-        offset, length = self._open_pack_writer.add_bytes_record(
-            new_bytes, [(new_hash,)])
-        assert len(new_hash) == 40
-        key = (new_hash,)
-        value = "%d %d" % (offset, length)
-        self._hash_write_index.add_nodes([(key, value)])
+        # TODO: Could we benefit from the pack writer accepting several texts
+        # at a time so it can better buffer the writing?
+        index_updates = []
+        for h, t in zip(new_hashes, new_texts):
+            assert len(h) == 40, \
+                    "%r doesn't look like a sha1 hash" % h
+            offset, length = self._open_pack_writer.add_bytes_record(t, [])
+            key = (h, )
+            value = "%d %d" % (offset, length)
+            index_updates.append((key, value))
+        self._hash_write_index.add_nodes(index_updates)
 
     def _get_bytes_by_hash_multi(self, hashes):
         """Look up objects by hash.
@@ -1208,30 +1213,27 @@
         index_to_pack, indices = self._make_index_map(_HASH_INDEX_SUFFIX)
         combined = CombinedGraphIndex(indices)
         r = {}
-        # index keys are tuples; we just want a string
+        # index keys are tuples but our hashes are just strings
         search = [(key,) for key in hashes]
-        missing_keys = set(hashes)
+        grouped_by_index = {}
         for index, key, value in combined.iter_entries(search):
             # nb they might be returned out of order
             found_hash = key[0]
-            missing_keys.remove(found_hash)
             offset, length = map(int, value.split(' '))
+            from_this_index = grouped_by_index.setdefault(index, {})
+            from_this_index[(offset, length)] = found_hash
+        for index, from_this_index in grouped_by_index.items():
             transport, pack_name = index_to_pack[index]
-            # TODO: is there already some code that will group together all
-            # the retrievals from a pack and turn them into one readv per
-            # pack?
-            #
-            # TODO: do a readv, not individual reads
-            assert found_hash not in r
-            reader = make_readv_reader(transport, pack_name, [(offset, length)])
+            ranges = sorted(from_this_index.keys())
+            range_iter = iter(ranges)
+            reader = make_readv_reader(transport, pack_name, ranges)
             for record_reader in reader.iter_records():
-                found_names, read_fn = record_reader
+                this_range = range_iter.next()
+                found_hash = from_this_index[this_range]
+                read_fn = record_reader[1]
                 result = read_fn(None) # read all
                 assert osutils.sha_string(result) == found_hash
             r[found_hash] = result
-        if missing_keys:
-            raise errors.BzrError("keys %r were not found in %r" %
-                    (missing_keys, combined))
         return r
 
     def _get_bytes_by_hash(self, h):

=== modified file 'bzrlib/tests/test_pack_repository.py'
--- a/bzrlib/tests/test_pack_repository.py	2007-08-30 04:33:41 +0000
+++ b/bzrlib/tests/test_pack_repository.py	2007-08-30 08:34:37 +0000
@@ -45,11 +45,12 @@
         repo.lock_write()
         repo.start_write_group()
         # store some stuff
-        stuff = 'hello repo!'
-        stuff_hash = sha_string(stuff)
-        repo._add_bytes_by_hash(stuff_hash, stuff)
+        texts = ['hello repo!', 'binary\0\n\r\n\0']
+        hashes = map(sha_string, texts)
+        repo._add_bytes_by_hash_multi(hashes, texts)
         # finish storing
         repo.commit_write_group()
         # try to get it back
-        result = repo._get_bytes_by_hash(stuff_hash)
-        self.assertEquals(stuff, result)
+        for expected in texts:
+            result = repo._get_bytes_by_hash(sha_string(expected))
+            self.assertEquals(expected, result)




More information about the bazaar-commits mailing list