Rev 3925: Get rid of the entries dict in GroupCompressBlock. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam
John Arbash Meinel
john at arbash-meinel.com
Fri Mar 27 22:05:54 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/vilajam
------------------------------------------------------------
revno: 3925
revision-id: john at arbash-meinel.com-20090327220537-loj7fdr9hi360qc3
parent: john at arbash-meinel.com-20090327214708-sy13r2m4cu0qn72k
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: vilajam
timestamp: Fri 2009-03-27 17:05:37 -0500
message:
Get rid of the entries dict in GroupCompressBlock.
We weren't making use of it, and updating it was pure overhead.
This simplifies the code a bit more.
The only differences now between the Python and Pyrex compressors
are the __init__ and _compress functions.
If it weren't for circular import issues, I would be tempted
to move them one step further, into the _groupcompress_* modules.
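
For reference, labels_deltas now stores a flat
(start_byte, start_chunk, end_byte, end_chunk) tuple per key, and
extract() decides fulltext-vs-delta from the first stored byte instead
of consulting a block entry. Roughly, the new extract path looks like
this (a simplified sketch, not the actual bzrlib code; base128 length
decoding and delta application are elided):

    import hashlib

    def extract_sketch(labels_deltas, chunks, key):
        # labels_deltas now maps key -> a flat 4-tuple instead of the
        # old nested ((start_byte, start_chunk), (end_byte, end_chunk)).
        (start_byte, start_chunk, end_byte, end_chunk) = labels_deltas[key]
        stored_bytes = ''.join(chunks[start_chunk:end_chunk])
        kind = stored_bytes[0]
        if kind == 'f':
            # Real records put a base128-encoded length between the 'f'
            # marker and the text; this sketch pretends the payload
            # starts right after the marker.
            bytes = stored_bytes[1:]
        elif kind == 'd':
            # Delta record: the real code applies the delta against
            # ''.join(chunks[:start_chunk]).
            raise NotImplementedError('delta decoding elided')
        else:
            raise ValueError('Unknown content kind %r' % (kind,))
        # The sha1 is measured from the reconstructed bytes rather than
        # read back from a stored entry.
        return bytes, hashlib.sha1(bytes).hexdigest()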
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-27 21:47:08 +0000
+++ b/bzrlib/groupcompress.py 2009-03-27 22:05:37 +0000
@@ -83,29 +83,6 @@
return present_keys
-class GroupCompressBlockEntry(object):
- """Track the information about a single object inside a GC group.
-
- This is generally just the dumb data structure.
- """
-
- def __init__(self, key, type, sha1, start, length):
- self.key = key
- self.type = type # delta, fulltext, external?
- self.sha1 = sha1 # Sha1 of content
- self.start = start # Byte offset to start of data
- self.length = length # Length of content
-
- def __repr__(self):
- return '%s(%s, %s, %s, %s, %s)' % (
- self.__class__.__name__,
- self.key, self.type, self.sha1, self.start, self.length
- )
-
- @property
- def end(self):
- return self.start + self.length
-
# The max zlib window size is 32kB, so if we set 'max_size' output of the
# decompressor to the requested bytes + 32kB, then we should guarantee
# num_bytes coming out.
@@ -123,7 +100,6 @@
def __init__(self):
# map by key? or just order in file?
- self._entries = {}
self._compressor_name = None
self._z_content = None
self._z_content_decompressor = None
@@ -279,22 +255,6 @@
bytes = apply_delta(self._content, content)
return bytes
- def add_entry(self, key, type, sha1, start, length):
- """Add new meta info about an entry.
-
- :param key: The key for the new content
- :param type: Whether this is a delta or fulltext entry (external?)
- :param sha1: sha1sum of the fulltext of this entry
- :param start: where the encoded bytes start
- :param length: total number of bytes in the encoded form
- :return: The entry?
- """
- entry = GroupCompressBlockEntry(key, type, sha1, start, length)
- if key in self._entries:
- raise ValueError('Duplicate key found: %s' % (key,))
- self._entries[key] = entry
- return entry
-
def set_content(self, content):
"""Set the content of this block."""
self._content_length = len(content)
@@ -643,9 +603,6 @@
if not bytes: # empty, like a dir entry, etc
if nostore_sha == _null_sha1:
raise errors.ExistingContent()
- self._block.add_entry(key, type='empty',
- sha1=None, start=0,
- length=0)
return _null_sha1, 0, 0, 'fulltext', 0
# we assume someone knew what they were doing when they passed it in
if expected_sha is not None:
@@ -688,42 +645,32 @@
:param key: The key to extract.
:return: An iterable over bytes and the sha1.
"""
- delta_details = self.labels_deltas[key]
- delta_chunks = self.chunks[delta_details[0][1]:delta_details[1][1]]
+ (start_byte, start_chunk, end_byte, end_chunk) = self.labels_deltas[key]
+ delta_chunks = self.chunks[start_chunk:end_chunk]
stored_bytes = ''.join(delta_chunks)
- # TODO: Fix this, we shouldn't really be peeking here
- entry = self._block._entries[key]
- if entry.type == 'fulltext':
- if stored_bytes[0] != 'f':
- raise ValueError('Index claimed fulltext, but stored bytes'
- ' indicate %s' % (stored_bytes[0],))
+ if stored_bytes[0] == 'f':
fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
- if fulltext_len + 1 + offset != len(stored_bytes):
+ data_len = fulltext_len + 1 + offset
+ if data_len != len(stored_bytes):
raise ValueError('Index claimed fulltext len, but stored bytes'
' claim %s != %s'
- % (len(stored_bytes),
- fulltext_len + 1 + offset))
+ % (len(stored_bytes), data_len))
bytes = stored_bytes[offset + 1:]
else:
- if entry.type != 'delta':
- raise ValueError('Unknown entry type: %s' % (entry.type,))
# XXX: This is inefficient at best
- source = ''.join(self.chunks)
+ source = ''.join(self.chunks[:start_chunk])
if stored_bytes[0] != 'd':
- raise ValueError('Entry type claims delta, bytes claim %s'
+ raise ValueError('Unknown content kind, bytes claim %s'
% (stored_bytes[0],))
delta_len, offset = decode_base128_int(stored_bytes[1:10])
- if delta_len + 1 + offset != len(stored_bytes):
+ data_len = delta_len + 1 + offset
+ if data_len != len(stored_bytes):
raise ValueError('Index claimed delta len, but stored bytes'
' claim %s != %s'
- % (len(stored_bytes),
- delta_len + 1 + offset))
+ % (len(stored_bytes), data_len))
bytes = apply_delta(source, stored_bytes[offset + 1:])
bytes_sha1 = osutils.sha_string(bytes)
- if entry.sha1 != bytes_sha1:
- raise ValueError('Recorded sha1 != measured %s != %s'
- % (entry.sha1, bytes_sha1))
- return bytes, entry.sha1
+ return bytes, bytes_sha1
def flush(self):
"""Finish this group, creating a formatted stream.
@@ -786,15 +733,14 @@
# Update the delta_length to include those two encoded integers
out_lines[1] = encode_base128_int(delta_length)
out_length = len(out_lines[3]) + 1 + delta_length
- self._block.add_entry(key, type=type, sha1=sha1,
- start=self.endpoint, length=out_length)
start = self.endpoint # Before insertion
- delta_start = (start, len(self._delta_index.lines))
+ chunk_start = len(self._delta_index.lines)
self._delta_index.extend_lines(out_lines, index_lines)
self.endpoint = self._delta_index.endpoint
self.input_bytes += bytes_length
- delta_end = (self.endpoint, len(self._delta_index.lines))
- self.labels_deltas[key] = (delta_start, delta_end)
+ chunk_end = len(self._delta_index.lines)
+ self.labels_deltas[key] = (start, chunk_start,
+ self.endpoint, chunk_end)
return sha1, start, self.endpoint, type, out_length
@@ -849,14 +795,15 @@
length = len(delta) + len_mini_header
new_chunks = ['d', enc_length, delta]
self._delta_index.add_delta_source(delta, len_mini_header)
- self._block.add_entry(key, type=type, sha1=sha1,
- start=self.endpoint, length=length)
- start = self.endpoint # Before insertion
- delta_start = (self.endpoint, len(self.chunks))
+ # Before insertion
+ start = self.endpoint
+ chunk_start = len(self.chunks)
+ # Now output these bytes
self._output_chunks(new_chunks)
self.input_bytes += input_len
- delta_end = (self.endpoint, len(self.chunks))
- self.labels_deltas[key] = (delta_start, delta_end)
+ chunk_end = len(self.chunks)
+ self.labels_deltas[key] = (start, chunk_start,
+ self.endpoint, chunk_end)
if not self._delta_index._source_offset == self.endpoint:
raise AssertionError('the delta index is out of sync'
'with the output lines %s != %s'
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-27 21:47:08 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-27 22:05:37 +0000
@@ -287,12 +287,13 @@
start = 0
for key in sorted(key_to_text):
compressor.compress(key, key_to_text[key], None)
+ locs = dict((key, (start, end)) for key, (start, _, end, _)
+ in compressor.labels_deltas.iteritems())
block = compressor.flush()
- entries = block._entries
+ raw_bytes = block.to_bytes()
# Go through from_bytes(to_bytes()) so that we start with a compressed
# content object
- return entries, groupcompress.GroupCompressBlock.from_bytes(
- block.to_bytes())
+ return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)
def test_from_empty_bytes(self):
self.assertRaises(ValueError,
@@ -302,7 +303,6 @@
block = groupcompress.GroupCompressBlock.from_bytes(
'gcb1z\n0\n0\n')
self.assertIsInstance(block, groupcompress.GroupCompressBlock)
- self.assertEqual({}, block._entries)
self.assertIs(None, block._content)
self.assertEqual('', block._z_content)
block._ensure_content()
@@ -329,22 +329,10 @@
self.assertEqual(z_content, block._z_content)
self.assertEqual(content, block._content)
- def test_add_entry(self):
- gcb = groupcompress.GroupCompressBlock()
- e = gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
- self.assertIsInstance(e, groupcompress.GroupCompressBlockEntry)
- self.assertEqual(('foo', 'bar'), e.key)
- self.assertEqual('fulltext', e.type)
- self.assertEqual('abcd'*10, e.sha1)
- self.assertEqual(0, e.start)
- self.assertEqual(100, e.length)
-
def test_to_bytes(self):
content = ('this is some content\n'
'this content will be compressed\n')
gcb = groupcompress.GroupCompressBlock()
- gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
- gcb.add_entry(('bing',), 'fulltext', 'abcd'*10, 100, 100)
gcb.set_content(content)
bytes = gcb.to_bytes()
self.assertEqual(gcb._z_content_length, len(gcb._z_content))
@@ -612,20 +600,21 @@
start = 0
for key in sorted(key_to_text):
compressor.compress(key, key_to_text[key], None)
+ locs = dict((key, (start, end)) for key, (start, _, end, _)
+ in compressor.labels_deltas.iteritems())
block = compressor.flush()
- entries = block._entries
raw_bytes = block.to_bytes()
- return entries, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)
+ return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)
- def add_key_to_manager(self, key, entries, block, manager):
- entry = entries[key]
- manager.add_factory(entry.key, (), entry.start, entry.end)
+ def add_key_to_manager(self, key, locations, block, manager):
+ start, end = locations[key]
+ manager.add_factory(key, (), start, end)
def test_get_fulltexts(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
- self.add_key_to_manager(('key1',), entries, block, manager)
- self.add_key_to_manager(('key2',), entries, block, manager)
+ self.add_key_to_manager(('key1',), locations, block, manager)
+ self.add_key_to_manager(('key2',), locations, block, manager)
result_order = []
for record in manager.get_record_stream():
result_order.append(record.key)
@@ -636,8 +625,8 @@
# If we build the manager in the opposite order, we should get them
# back in the opposite order
manager = groupcompress._LazyGroupContentManager(block)
- self.add_key_to_manager(('key2',), entries, block, manager)
- self.add_key_to_manager(('key1',), entries, block, manager)
+ self.add_key_to_manager(('key2',), locations, block, manager)
+ self.add_key_to_manager(('key1',), locations, block, manager)
result_order = []
for record in manager.get_record_stream():
result_order.append(record.key)
@@ -646,7 +635,7 @@
self.assertEqual([('key2',), ('key1',)], result_order)
def test__wire_bytes_no_keys(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
wire_bytes = manager._wire_bytes()
block_length = len(block.to_bytes())
@@ -665,10 +654,10 @@
wire_bytes)
def test__wire_bytes(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
- self.add_key_to_manager(('key1',), entries, block, manager)
- self.add_key_to_manager(('key4',), entries, block, manager)
+ self.add_key_to_manager(('key1',), locations, block, manager)
+ self.add_key_to_manager(('key4',), locations, block, manager)
block_bytes = block.to_bytes()
wire_bytes = manager._wire_bytes()
(storage_kind, z_header_len, header_len,
@@ -683,8 +672,8 @@
z_header = rest[:z_header_len]
header = zlib.decompress(z_header)
self.assertEqual(header_len, len(header))
- entry1 = entries[('key1',)]
- entry4 = entries[('key4',)]
+ entry1 = locations[('key1',)]
+ entry4 = locations[('key4',)]
self.assertEqualDiff('key1\n'
'\n' # no parents
'%d\n' # start offset
@@ -693,17 +682,17 @@
'\n'
'%d\n'
'%d\n'
- % (entry1.start, entry1.end,
- entry4.start, entry4.end),
+ % (entry1[0], entry1[1],
+ entry4[0], entry4[1]),
header)
z_block = rest[z_header_len:]
self.assertEqual(block_bytes, z_block)
def test_from_bytes(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
- self.add_key_to_manager(('key1',), entries, block, manager)
- self.add_key_to_manager(('key4',), entries, block, manager)
+ self.add_key_to_manager(('key1',), locations, block, manager)
+ self.add_key_to_manager(('key4',), locations, block, manager)
wire_bytes = manager._wire_bytes()
self.assertStartsWith(wire_bytes, 'groupcompress-block\n')
manager = groupcompress._LazyGroupContentManager.from_bytes(wire_bytes)
@@ -718,21 +707,21 @@
self.assertEqual([('key1',), ('key4',)], result_order)
def test__check_rebuild_no_changes(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
# Request all the keys, which ensures that we won't rebuild
- self.add_key_to_manager(('key1',), entries, block, manager)
- self.add_key_to_manager(('key2',), entries, block, manager)
- self.add_key_to_manager(('key3',), entries, block, manager)
- self.add_key_to_manager(('key4',), entries, block, manager)
+ self.add_key_to_manager(('key1',), locations, block, manager)
+ self.add_key_to_manager(('key2',), locations, block, manager)
+ self.add_key_to_manager(('key3',), locations, block, manager)
+ self.add_key_to_manager(('key4',), locations, block, manager)
manager._check_rebuild_block()
self.assertIs(block, manager._block)
def test__check_rebuild_only_one(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
# Request just the first key, which should trigger a 'strip' action
- self.add_key_to_manager(('key1',), entries, block, manager)
+ self.add_key_to_manager(('key1',), locations, block, manager)
manager._check_rebuild_block()
self.assertIsNot(block, manager._block)
self.assertTrue(block._content_length > manager._block._content_length)
@@ -744,10 +733,10 @@
record.get_bytes_as('fulltext'))
def test__check_rebuild_middle(self):
- entries, block = self.make_block(self._texts)
+ locations, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
# Request a small key in the middle should trigger a 'rebuild'
- self.add_key_to_manager(('key4',), entries, block, manager)
+ self.add_key_to_manager(('key4',), locations, block, manager)
manager._check_rebuild_block()
self.assertIsNot(block, manager._block)
self.assertTrue(block._content_length > manager._block._content_length)
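
Callers that still want per-key (start, end) byte offsets, as the
updated tests do, can derive them from labels_deltas before calling
flush():

    locs = dict((key, (start, end)) for key, (start, _, end, _)
                in compressor.labels_deltas.iteritems())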