Rev 3915: Change the delta byte stream to remove the 'source length' entry, in http://bzr.arbash-meinel.com/branches/bzr/brisbane/disk_format
John Arbash Meinel
john at arbash-meinel.com
Fri Mar 27 16:07:59 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/disk_format
------------------------------------------------------------
revno: 3915
revision-id: john at arbash-meinel.com-20090327160744-anntc1oyh859s9rm
parent: john at arbash-meinel.com-20090326201840-ddb2uqof335ysvnu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: disk_format
timestamp: Fri 2009-03-27 11:07:44 -0500
message:
Change the delta byte stream to remove the 'source length' entry.
This should generally remove ~3.5 bytes from delta records.
The groupcompress tests at least pass again.
-------------- next part --------------
=== modified file 'bzrlib/_groupcompress_pyx.pyx'
--- a/bzrlib/_groupcompress_pyx.pyx 2009-03-24 20:02:26 +0000
+++ b/bzrlib/_groupcompress_pyx.pyx 2009-03-27 16:07:44 +0000
@@ -251,15 +251,6 @@
data = <unsigned char *>delta
top = data + delta_size
- # make sure the orig file size matches what we expect
- # XXX: gcc warns because data isn't defined as 'const'
- size = get_delta_hdr_size(&data, top)
- if (size > source_size):
- # XXX: mismatched source size
- raise RuntimeError('source size %d < expected source size %d'
- % (source_size, size))
- source_size = size
-
# now the result size
size = get_delta_hdr_size(&data, top)
result = PyString_FromStringAndSize(NULL, size)
=== modified file 'bzrlib/delta.h'
--- a/bzrlib/delta.h 2009-03-26 16:22:58 +0000
+++ b/bzrlib/delta.h 2009-03-27 16:07:44 +0000
@@ -77,8 +77,10 @@
const void *buf, unsigned long bufsize,
unsigned long *delta_size, unsigned long max_delta_size);
-/* the smallest possible delta size is 4 bytes */
-#define DELTA_SIZE_MIN 4
+/* the smallest possible delta size is 3 bytes
+ * Target size, Copy command, Copy length
+ */
+#define DELTA_SIZE_MIN 3
/*
* This must be called twice on the delta data buffer, first to get the
=== modified file 'bzrlib/diff-delta.c'
--- a/bzrlib/diff-delta.c 2009-03-19 23:30:50 +0000
+++ b/bzrlib/diff-delta.c 2009-03-27 16:07:44 +0000
@@ -707,8 +707,6 @@
/* then populate the index for the new data */
prev_val = ~0;
data = buffer;
- /* source size */
- get_delta_hdr_size(&data, top);
/* target size */
get_delta_hdr_size(&data, top);
entry = entries; /* start at the first slot */
@@ -881,14 +879,7 @@
if (!out)
return NULL;
- /* store reference buffer size */
source_size = index->last_src->size + index->last_src->agg_offset;
- i = source_size;
- while (i >= 0x80) {
- out[outpos++] = i | 0x80;
- i >>= 7;
- }
- out[outpos++] = i;
/* store target buffer size */
i = trg_size;
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-26 20:18:40 +0000
+++ b/bzrlib/groupcompress.py 2009-03-27 16:07:44 +0000
@@ -1462,6 +1462,9 @@
value = "%d %d %d %d" % (block_start, block_length,
record._start, record._end)
nodes = [(record.key, value, (record.parents,))]
+ # TODO: Consider buffering up many nodes to be added, not
+ # sure how much overhead this has, but we're seeing
+ # ~23s / 120s in add_records calls
self._index.add_records(nodes, random_id=random_id)
continue
try:
=== modified file 'bzrlib/tests/test__groupcompress_pyx.py'
--- a/bzrlib/tests/test__groupcompress_pyx.py 2009-03-24 19:36:34 +0000
+++ b/bzrlib/tests/test__groupcompress_pyx.py 2009-03-27 16:07:44 +0000
@@ -123,40 +123,40 @@
def test_make_noop_delta(self):
ident_delta = self.make_delta(_text1, _text1)
- self.assertEqual('MM\x90M', ident_delta)
+ self.assertEqual('M\x90M', ident_delta)
ident_delta = self.make_delta(_text2, _text2)
- self.assertEqual('NN\x90N', ident_delta)
+ self.assertEqual('N\x90N', ident_delta)
ident_delta = self.make_delta(_text3, _text3)
- self.assertEqual('\x87\x01\x87\x01\x90\x87', ident_delta)
+ self.assertEqual('\x87\x01\x90\x87', ident_delta)
def test_make_delta(self):
delta = self.make_delta(_text1, _text2)
- self.assertEqual('MN\x90/\x1fdiffer from\nagainst other text\n', delta)
+ self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
delta = self.make_delta(_text2, _text1)
- self.assertEqual('NM\x90/\x1ebe matched\nagainst other text\n', delta)
+ self.assertEqual('M\x90/\x1ebe matched\nagainst other text\n', delta)
delta = self.make_delta(_text3, _text1)
- self.assertEqual('\x87\x01M\x90M', delta)
+ self.assertEqual('M\x90M', delta)
delta = self.make_delta(_text3, _text2)
- self.assertEqual('\x87\x01N\x90/\x1fdiffer from\nagainst other text\n',
+ self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n',
delta)
def test_apply_delta_is_typesafe(self):
- self.apply_delta(_text1, 'MM\x90M')
- self.assertRaises(TypeError,
- self.apply_delta, object(), 'MM\x90M')
- self.assertRaises(TypeError,
- self.apply_delta, unicode(_text1), 'MM\x90M')
- self.assertRaises(TypeError,
- self.apply_delta, _text1, u'MM\x90M')
+ self.apply_delta(_text1, 'M\x90M')
+ self.assertRaises(TypeError,
+ self.apply_delta, object(), 'M\x90M')
+ self.assertRaises(TypeError,
+ self.apply_delta, unicode(_text1), 'M\x90M')
+ self.assertRaises(TypeError,
+ self.apply_delta, _text1, u'M\x90M')
self.assertRaises(TypeError,
self.apply_delta, _text1, object())
def test_apply_delta(self):
target = self.apply_delta(_text1,
- 'MN\x90/\x1fdiffer from\nagainst other text\n')
+ 'N\x90/\x1fdiffer from\nagainst other text\n')
self.assertEqual(_text2, target)
target = self.apply_delta(_text2,
- 'NM\x90/\x1ebe matched\nagainst other text\n')
+ 'M\x90/\x1ebe matched\nagainst other text\n')
self.assertEqual(_text1, target)
@@ -169,7 +169,7 @@
def test_make_delta(self):
di = self._gc_module.DeltaIndex(_text1)
delta = di.make_delta(_text2)
- self.assertEqual('MN\x90/\x1fdiffer from\nagainst other text\n', delta)
+ self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
def test_delta_against_multiple_sources(self):
di = self._gc_module.DeltaIndex()
@@ -180,7 +180,7 @@
delta = di.make_delta(_third_text)
result = self._gc_module.apply_delta(_first_text + _second_text, delta)
self.assertEqualDiff(_third_text, result)
- self.assertEqual('\xac\x01\x85\x01\x90\x14\x0chas some in '
+ self.assertEqual('\x85\x01\x90\x14\x0chas some in '
'\x91v6\x03and\x91d"\x91:\n', delta)
def test_delta_with_offsets(self):
@@ -196,7 +196,7 @@
'12345' + _first_text + '1234567890' + _second_text, delta)
self.assertIsNot(None, result)
self.assertEqualDiff(_third_text, result)
- self.assertEqual('\xbb\x01\x85\x01\x91\x05\x14\x0chas some in '
+ self.assertEqual('\x85\x01\x91\x05\x14\x0chas some in '
'\x91\x856\x03and\x91s"\x91?\n', delta)
def test_delta_with_delta_bytes(self):
@@ -205,7 +205,7 @@
di.add_source(_first_text, 0)
self.assertEqual(len(_first_text), di._source_offset)
delta = di.make_delta(_second_text)
- self.assertEqual('Dh\tsome more\x91\x019'
+ self.assertEqual('h\tsome more\x91\x019'
'&previous text\nand has some extra text\n', delta)
di.add_delta_source(delta, 0)
source += delta
@@ -218,8 +218,8 @@
# Note that we don't match the 'common with the', because it isn't long
# enough to match in the original text, and those bytes are not present
# in the delta for the second text.
- self.assertEqual('z\x85\x01\x90\x14\x1chas some in common with the '
- '\x91T&\x03and\x91\x18,', second_delta)
+ self.assertEqual('\x85\x01\x90\x14\x1chas some in common with the '
+ '\x91S&\x03and\x91\x18,', second_delta)
# Add this delta, and create a new delta for the same text. We should
# find the remaining text, and only insert the short 'and' text.
di.add_delta_source(second_delta, 0)
@@ -227,14 +227,14 @@
third_delta = di.make_delta(_third_text)
result = self._gc_module.apply_delta(source, third_delta)
self.assertEqualDiff(_third_text, result)
- self.assertEqual('\xa6\x01\x85\x01\x90\x14\x91\x80\x1c'
- '\x91T&\x03and\x91\x18,', third_delta)
+ self.assertEqual('\x85\x01\x90\x14\x91\x7e\x1c'
+ '\x91S&\x03and\x91\x18,', third_delta)
# Now create a delta, which we know won't be able to be 'fit' into the
# existing index
fourth_delta = di.make_delta(_fourth_text)
self.assertEqual(_fourth_text,
self._gc_module.apply_delta(source, fourth_delta))
- self.assertEqual('\xa6\x01\x80\x01'
+ self.assertEqual('\x80\x01'
'\x7f123456789012345\nsame rabin hash\n'
'123456789012345\nsame rabin hash\n'
'123456789012345\nsame rabin hash\n'
@@ -246,4 +246,4 @@
fifth_delta = di.make_delta(_fourth_text)
self.assertEqual(_fourth_text,
self._gc_module.apply_delta(source, fifth_delta))
- self.assertEqual('\xac\x02\x80\x01\x91\xab\x7f\x01\n', fifth_delta)
+ self.assertEqual('\x80\x01\x91\xa7\x7f\x01\n', fifth_delta)
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-24 19:36:34 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-27 16:07:44 +0000
@@ -100,9 +100,9 @@
'different\n'), sha1_2)
expected_lines.extend([
# 'delta', delta length
- 'd\x10',
- # source and target length
- '\x36\x36',
+ 'd\x0f',
+ # target length
+ '\x36',
# copy the line common
'\x91\x0a\x2c', #copy, offset 0x0a, len 0x2c
# add the line different, and the trailing newline
@@ -130,9 +130,9 @@
sha1_3)
expected_lines.extend([
# 'delta', delta length
- 'd\x0c',
- # source and target length
- '\x67\x5f'
+ 'd\x0b',
+ # target length
+ '\x5f'
# insert new
'\x03new',
# Copy of first parent 'common' range
@@ -661,6 +661,7 @@
entries, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
self.add_key_to_manager(('key1',), entries, block, manager)
+ self.add_key_to_manager(('key3',), entries, block, manager)
self.add_key_to_manager(('key4',), entries, block, manager)
block_bytes = block.to_bytes()
wire_bytes = manager._wire_bytes()
@@ -670,23 +671,29 @@
header_len = int(header_len)
block_len = int(block_len)
self.assertEqual('groupcompress-block', storage_kind)
- self.assertEqual(33, z_header_len)
- self.assertEqual(25, header_len)
+ self.assertEqual(41, z_header_len)
+ self.assertEqual(39, header_len)
self.assertEqual(len(block_bytes), block_len)
z_header = rest[:z_header_len]
header = zlib.decompress(z_header)
self.assertEqual(header_len, len(header))
entry1 = entries[('key1',)]
+ entry3 = entries[('key3',)]
entry4 = entries[('key4',)]
self.assertEqualDiff('key1\n'
'\n' # no parents
'%d\n' # start offset
'%d\n' # end byte
+ 'key3\n'
+ '\n'
+ '%d\n'
+ '%d\n'
'key4\n'
'\n'
'%d\n'
'%d\n'
% (entry1.start, entry1.end,
+ entry3.start, entry3.end,
entry4.start, entry4.end),
header)
z_block = rest[z_header_len:]
@@ -696,19 +703,20 @@
entries, block = self.make_block(self._texts)
manager = groupcompress._LazyGroupContentManager(block)
self.add_key_to_manager(('key1',), entries, block, manager)
+ self.add_key_to_manager(('key3',), entries, block, manager)
self.add_key_to_manager(('key4',), entries, block, manager)
wire_bytes = manager._wire_bytes()
self.assertStartsWith(wire_bytes, 'groupcompress-block\n')
manager = groupcompress._LazyGroupContentManager.from_bytes(wire_bytes)
self.assertIsInstance(manager, groupcompress._LazyGroupContentManager)
- self.assertEqual(2, len(manager._factories))
+ self.assertEqual(3, len(manager._factories))
self.assertEqual(block._z_content, manager._block._z_content)
result_order = []
for record in manager.get_record_stream():
result_order.append(record.key)
text = self._texts[record.key]
self.assertEqual(text, record.get_bytes_as('fulltext'))
- self.assertEqual([('key1',), ('key4',)], result_order)
+ self.assertEqual([('key1',), ('key3',), ('key4',)], result_order)
def test__check_rebuild_no_changes(self):
entries, block = self.make_block(self._texts)
More information about the bazaar-commits mailing list