Rev 2752: Make hash based packing methods avoid roundtrips in http://sourcefrog.net/bzr/pack-hashes
Martin Pool
mbp at sourcefrog.net
Thu Aug 30 09:34:38 BST 2007
At http://sourcefrog.net/bzr/pack-hashes
------------------------------------------------------------
revno: 2752
revision-id: mbp at sourcefrog.net-20070830083437-gtoy9f4awpnxiyx3
parent: mbp at sourcefrog.net-20070830043341-01q2hwpw6d78btta
committer: Martin Pool <mbp at sourcefrog.net>
branch nick: pack-hashes
timestamp: Thu 2007-08-30 18:34:37 +1000
message:
Make hash based packing methods avoid roundtrips
modified:
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/tests/test_pack_repository.py test_pack_repository-20070828111851-nof5soh31tidz2dq-1
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2007-08-30 04:33:41 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2007-08-30 08:34:37 +0000
@@ -1185,15 +1185,20 @@
they simulate - this just provides common delegated implementations.
"""
- def _add_bytes_by_hash(self, new_hash, new_bytes):
- """Add content to the repository indexed by hash.
+ def _add_bytes_by_hash_multi(self, new_hashes, new_texts):
+ """Add hash-indexed content to the repository.
"""
- offset, length = self._open_pack_writer.add_bytes_record(
- new_bytes, [(new_hash,)])
- assert len(new_hash) == 40
- key = (new_hash,)
- value = "%d %d" % (offset, length)
- self._hash_write_index.add_nodes([(key, value)])
+ # TODO: Could we benefit from the pack writer accepting several texts
+ # at a time so it can better buffer the writing?
+ index_updates = []
+ for h, t in zip(new_hashes, new_texts):
+ assert len(h) == 40, \
+ "%r doesn't look like a sha1 hash" % h
+ offset, length = self._open_pack_writer.add_bytes_record(t, [])
+ key = (h, )
+ value = "%d %d" % (offset, length)
+ index_updates.append((key, value))
+ self._hash_write_index.add_nodes(index_updates)
def _get_bytes_by_hash_multi(self, hashes):
"""Look up objects by hash.
@@ -1208,30 +1213,27 @@
index_to_pack, indices = self._make_index_map(_HASH_INDEX_SUFFIX)
combined = CombinedGraphIndex(indices)
r = {}
- # index keys are tuples; we just want a string
+ # index keys are tuples but our hashes are just strings
search = [(key,) for key in hashes]
- missing_keys = set(hashes)
+ grouped_by_index = {}
for index, key, value in combined.iter_entries(search):
# nb they might be returned out of order
found_hash = key[0]
- missing_keys.remove(found_hash)
offset, length = map(int, value.split(' '))
+ from_this_index = grouped_by_index.setdefault(index, {})
+ from_this_index[(offset, length)] = found_hash
+ for index, from_this_index in grouped_by_index.items():
transport, pack_name = index_to_pack[index]
- # TODO: is there already some code that will group together all
- # the retrievals from a pack and turn them into one readv per
- # pack?
- #
- # TODO: do a readv, not individual reads
- assert found_hash not in r
- reader = make_readv_reader(transport, pack_name, [(offset, length)])
+ ranges = sorted(from_this_index.keys())
+ range_iter = iter(ranges)
+ reader = make_readv_reader(transport, pack_name, ranges)
for record_reader in reader.iter_records():
- found_names, read_fn = record_reader
+ this_range = range_iter.next()
+ found_hash = from_this_index[this_range]
+ read_fn = record_reader[1]
result = read_fn(None) # read all
assert osutils.sha_string(result) == found_hash
r[found_hash] = result
- if missing_keys:
- raise errors.BzrError("keys %r were not found in %r" %
- (missing_keys, combined))
return r
def _get_bytes_by_hash(self, h):
=== modified file 'bzrlib/tests/test_pack_repository.py'
--- a/bzrlib/tests/test_pack_repository.py 2007-08-30 04:33:41 +0000
+++ b/bzrlib/tests/test_pack_repository.py 2007-08-30 08:34:37 +0000
@@ -45,11 +45,12 @@
repo.lock_write()
repo.start_write_group()
# store some stuff
- stuff = 'hello repo!'
- stuff_hash = sha_string(stuff)
- repo._add_bytes_by_hash(stuff_hash, stuff)
+ texts = ['hello repo!', 'binary\0\n\r\n\0']
+ hashes = map(sha_string, texts)
+ repo._add_bytes_by_hash_multi(hashes, texts)
# finish storing
repo.commit_write_group()
# try to get it back
- result = repo._get_bytes_by_hash(stuff_hash)
- self.assertEquals(stuff, result)
+ for expected in texts:
+ result = repo._get_bytes_by_hash(sha_string(expected))
+ self.assertEquals(expected, result)
More information about the bazaar-commits mailing list.