Rev 4784: Finish hookup up the various bits so that 'finish' actually does so. in http://bazaar.launchpad.net/~jameinel/bzr/chk-index
John Arbash Meinel
john at arbash-meinel.com
Wed Oct 28 02:27:16 GMT 2009
At http://bazaar.launchpad.net/~jameinel/bzr/chk-index
------------------------------------------------------------
revno: 4784
revision-id: john at arbash-meinel.com-20091028022709-zzd4zvfyoip1oisi
parent: john at arbash-meinel.com-20091028014525-fix7mtqbbr9ie7sy
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Tue 2009-10-27 21:27:09 -0500
message:
Finish hookup up the various bits so that 'finish' actually does so.
We now have a valid index. Though I wouldn't ever want to look
at it in binary form. I'm trying to think if we could insert
a few more formatting characters to have redundant info to
help with debug parsing...
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py 2009-10-28 01:45:25 +0000
+++ b/bzrlib/chk_index.py 2009-10-28 02:27:09 +0000
@@ -72,6 +72,7 @@
:ivar mini_index_entry_offset_bytes: In each mini index row, how many bytes
are used to encode the offset to the start-of-entries
:ivar num_entries: How many entries are stored in this index
+ :ivar entry_offset: The byte offset to the start of the entry fields
:ivar entry_hash_bytes: How many bytes are used to store the hash key
:ivar entry_group_offset_bytes: How many bytes are used to represent each
'group offset' in the actual entry line
@@ -108,6 +109,7 @@
# either make this variable-width or we could encode it in the group
# entry table. It would require a back-and-forth to decode a
# collection, though, so it isn't really worthwhile
+ self.entry_offset = None
self.entry_group_start_bytes = None
self.entry_group_length_bytes = None
self.entry_coder = None
@@ -173,6 +175,10 @@
header.entry_group_offset_bytes, header.entry_group_start_bytes,
header.entry_group_length_bytes,
)) = cls._read_line(bytes, pos, 'entry', 5)
+ # TODO: should we be writing out entry_offset
+ header.entry_offset = header.mini_index_offset + (
+ header.num_mini_index_entries *
+ header.mini_index_entry_offset_bytes)
return header
@classmethod
@@ -293,6 +299,8 @@
last_offset = entry_bytes + max_mini_index_entry_bytes
header.mini_index_entry_offset_bytes = find_bytes(last_offset)
header._build_mini_index_coder()
+ header.entry_offset = header.mini_index_offset + (
+ header.num_mini_index_entries * header.mini_index_coder.size)
return header
@@ -303,7 +311,7 @@
def _bit_key_to_offset_16(bit_key):
# We want the first 4 bits
c = ord(bit_key[0])
- return c & 0x0F
+ return (c & 0xF0) >> 4
def _bit_key_to_offset_256(bit_key):
@@ -311,9 +319,9 @@
return ord(bit_key[0])
-_struct_H = Struct('<H')
+_struct_H = Struct('>H')
def _bit_key_to_offset_4096(bit_key):
- return _struct_H.unpack(bit_key[:2])[0] & 0x0FFF
+ return (_struct_H.unpack(bit_key[:2])[0] & 0xFFF0) >> 4
def _bit_key_to_offset_65536(bit_key):
@@ -407,11 +415,11 @@
max_group_offset, max_group_length, len(self._nodes))
return header
- def _build_group_index(self, group_coder):
+ def _build_group_index(self, header):
# TODO: Should we consider 'wasting' a few bytes and inserting a
# start-of-records header here?
sorted_groups = self._sort_groups()
- pack = group_coder.pack
+ pack = header.group_coder.pack
bytes = [pack(*group_key) for group_key in sorted_groups]
return bytes
@@ -447,11 +455,25 @@
def _build_mini_index_and_entries(self, header):
# For now, we buffer everything in memory, we *could* use a temp file
# instead
- mini_index_bytes = []
+ bit_key_to_offset = self._get_bit_key_to_mini_index(
+ header.num_mini_index_entries)
+ null_entry = '\x00'*header.mini_index_entry_offset_bytes
+ mini_index_bytes = [null_entry] * header.num_mini_index_entries
entry_bytes = []
- last_mini_prefix = None
- for bit_key in sorted(self._nodes):
- entry_bytes.append(self._entry_to_bytes(bit_key, header.entry_coder))
+ last_mini_offset = 0
+ for idx, bit_key in enumerate(sorted(self._nodes)):
+ bytes = self._entry_to_bytes(bit_key, header.entry_coder)
+ entry_bytes.append(bytes)
+ mini_offset = bit_key_to_offset(bit_key)
+ if mini_offset != last_mini_offset:
+ if mini_offset <= last_mini_offset:
+ raise AssertionError('We should only be increasing'
+ ' in mini_offset, never decreasing')
+ assert mini_index_bytes[mini_offset] is null_entry
+ mini_bytes = header.mini_index_coder.pack(header.entry_offset
+ + (idx * header.entry_coder.size))
+ mini_index_bytes[mini_offset] = mini_bytes
+ last_mini_offset = mini_offset
return mini_index_bytes, entry_bytes
def finish(self):
@@ -460,8 +482,18 @@
:return: A file handle for a temporary file containing the nodes added
to the index.
"""
- header = self._get_header()
+ header = self._build_header()
+ output_bytes = []
header_bytes = header.serialize()
if len(header_bytes) != header.group_index_offset:
raise AssertionError('header_bytes is not the same length as'
' where the group_index_offset is expected to start.')
+ output_bytes.append(header_bytes)
+ output_bytes.extend(self._build_group_index(header))
+ assert sum(map(len, output_bytes)) == header.mini_index_offset
+ mini_index_bytes, entry_bytes = self._build_mini_index_and_entries(
+ header)
+ output_bytes.extend(mini_index_bytes)
+ assert sum(map(len, output_bytes)) == header.entry_offset
+ output_bytes.extend(entry_bytes)
+ return output_bytes
=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py 2009-10-28 01:45:25 +0000
+++ b/bzrlib/tests/test_chk_index.py 2009-10-28 02:27:09 +0000
@@ -98,7 +98,7 @@
self.assertEqual(4, h.group_index_start_bytes)
self.assertEqual(4, h.group_index_length_bytes)
self.assertEqual('<II', h.group_coder.format)
- bytes = builder._build_group_index(h.group_coder)
+ bytes = builder._build_group_index(h)
self.assertEqualDiff('\n'.join([
'0000000000000000',
'0000000000002000',
@@ -122,7 +122,7 @@
self.assertEqual(2, h.group_index_start_bytes)
self.assertEqual(2, h.group_index_length_bytes)
self.assertEqual('<HH', h.group_coder.format)
- bytes = builder._build_group_index(h.group_coder)
+ bytes = builder._build_group_index(h)
self.assertEqualDiff('\n'.join([
'00000000',
'00002909',
@@ -138,7 +138,7 @@
self.assertEqual(8, h.group_index_start_bytes)
self.assertEqual(4, h.group_index_length_bytes)
self.assertEqual('<QI', h.group_coder.format)
- bytes = builder._build_group_index(h.group_coder)
+ bytes = builder._build_group_index(h)
self.assertEqualDiff('\n'.join([
'000000000000000000000000',
'000000000000000000008025',
@@ -191,10 +191,10 @@
def test__get_bit_key_to_mini_index_16(self):
offset_func = chk_index.CHKIndexBuilder._get_bit_key_to_mini_index(16)
- # da39... maps to 'a'
- self.assertEqual(0xa, offset_func(self.bit_k1))
- self.assertEqual(0x6, offset_func(self.bit_k2))
- self.assertEqual(0x9, offset_func(self.bit_k3))
+ # da39... maps to 'd'
+ self.assertEqual(0xd, offset_func(self.bit_k1))
+ self.assertEqual(0x8, offset_func(self.bit_k2))
+ self.assertEqual(0xe, offset_func(self.bit_k3))
def test__get_bit_key_to_mini_index_256(self):
offset_func = chk_index.CHKIndexBuilder._get_bit_key_to_mini_index(256)
@@ -205,18 +205,18 @@
def test__get_bit_key_to_mini_index_4096(self):
offset_func = chk_index.CHKIndexBuilder._get_bit_key_to_mini_index(4096)
- # da39... maps to 0x9da'
- self.assertEqual(0x9da, offset_func(self.bit_k1))
- self.assertEqual(0x786, offset_func(self.bit_k2))
- self.assertEqual(0x7e9, offset_func(self.bit_k3))
+ # da39... maps to 0xda3
+ self.assertEqual(0xda3, offset_func(self.bit_k1))
+ self.assertEqual(0x86f, offset_func(self.bit_k2))
+ self.assertEqual(0xe9d, offset_func(self.bit_k3))
def test__get_bit_key_to_mini_index_65536(self):
offset_func = chk_index.CHKIndexBuilder._get_bit_key_to_mini_index(
65536)
- # da39... maps to 0x39da
- self.assertEqual(0x39da, offset_func(self.bit_k1))
- self.assertEqual(0xf786, offset_func(self.bit_k2))
- self.assertEqual(0xd7e9, offset_func(self.bit_k3))
+ # da39... maps to 0xda39
+ self.assertEqual(0xda39, offset_func(self.bit_k1))
+ self.assertEqual(0x86f7, offset_func(self.bit_k2))
+ self.assertEqual(0xe9d7, offset_func(self.bit_k3))
def test__build_mini_index_and_entries_16(self):
builder = chk_index.CHKIndexBuilder()
@@ -235,9 +235,25 @@
self.assertEqual('<H', h.mini_index_coder.format)
mini_index_bytes, entry_bytes = builder._build_mini_index_and_entries(
h)
- # Tiny indexes don't have a mini_index_bytes section
- self.assertEqual([], mini_index_bytes)
b2a_hex = binascii.b2a_hex
+ self.assertEqualDiff('\n'.join([
+ '0000',
+ '0000',
+ '0000',
+ '0000',
+ '0000',
+ '0000',
+ '0000',
+ '0000',
+ '8400',
+ '0000',
+ '0000',
+ '0000',
+ '0000',
+ 'a100',
+ 'be00',
+ '0000',
+ ]), '\n'.join([b2a_hex(b) for b in mini_index_bytes]))
# bit_k2 < bit_k1
self.assertEqualDiff('\n'.join([
#hash # Group # start # end/length?
@@ -245,6 +261,36 @@
b2a_hex(self.bit_k1) + '01' '00000000' '01000000',
b2a_hex(self.bit_k3) + '02' '32000000' 'e8030000',
]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
+ self.assertEqual(0x0084, h.entry_offset)
+ self.assertEqual(0x00a1, h.entry_offset + len(entry_bytes[0]))
+ self.assertEqual(0x00be, h.entry_offset + len(entry_bytes[0])
+ + len(entry_bytes[1]))
+
+ def test_build_tiny_index(self):
+ builder = chk_index.CHKIndexBuilder()
+ builder.add_node(self.k1, 0, 1000, 0, 1)
+ builder.add_node(self.k2, 1000, 20000, 0, 50)
+ builder.add_node(self.k3, 1000, 20000, 50, 1000)
+ bytes = ''.join(builder.finish())
+ self.assertEqualDiff("""Groupcompress CHK Index 1
+hash=sha1 20
+groups=120 2 2 3
+mini_index=132 0 1
+entry=3 20 1 4 4
+\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
+""", bytes[:120])
+ b2a_hex = binascii.b2a_hex
+ self.assertEqualDiff(
+ # First the group info
+ '0000' '0000' # NULL group, probably shouldn't be included
+ '0000' 'e803' # group 1 @ offset 0, length 1000
+ 'e803' '204e' # group 2 @ offet 1000, length 20000
+ '' # No mini index
+ # 3 entries follow
+ + b2a_hex(self.bit_k2) + '02' '00000000' '32000000'
+ + b2a_hex(self.bit_k1) + '01' '00000000' '01000000'
+ + b2a_hex(self.bit_k3) + '02' '32000000' 'e8030000'
+ , b2a_hex(bytes[120:]))
class TestCHKIndexHeader(tests.TestCase):
@@ -320,6 +366,7 @@
self.assertEqual(2, header.entry_group_offset_bytes)
self.assertEqual(4, header.entry_group_start_bytes)
self.assertEqual(4, header.entry_group_length_bytes)
+ self.assertEqual(7517+256*4, header.entry_offset)
def assertBadHeader(self, content):
self.assertRaises(ValueError,
@@ -482,6 +529,8 @@
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(1, header.entry_group_offset_bytes)
self.assertEqual(4, header.mini_index_entry_offset_bytes)
+ self.assertEqual(header.mini_index_offset + 256*4,
+ header.entry_offset)
def test_build_header_for_tiny(self):
header = chk_index.CHKIndexHeader.build_header_for(
@@ -500,6 +549,7 @@
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(1, header.entry_group_offset_bytes)
self.assertEqual(1, header.mini_index_entry_offset_bytes)
+ self.assertEqual(header.mini_index_offset, header.entry_offset)
def test_build_header_for_huge(self):
header = chk_index.CHKIndexHeader.build_header_for(
@@ -518,3 +568,5 @@
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(4, header.entry_group_offset_bytes)
self.assertEqual(4, header.mini_index_entry_offset_bytes)
+ self.assertEqual(header.mini_index_offset + 65536*4,
+ header.entry_offset)
More information about the bazaar-commits
mailing list