Rev 4791: Move the groups section after the mini-index. in http://bazaar.launchpad.net/~jameinel/bzr/chk-index
John Arbash Meinel
john at arbash-meinel.com
Wed Oct 28 16:24:51 GMT 2009
At http://bazaar.launchpad.net/~jameinel/bzr/chk-index
------------------------------------------------------------
revno: 4791
revision-id: john at arbash-meinel.com-20091028162442-w9nyvpz5f7kbremd
parent: john at arbash-meinel.com-20091028140503-z1it0dwzq2q14ssm
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Wed 2009-10-28 11:24:42 -0500
message:
Move the groups section after the mini-index.
Add a test that builds an index that contains a real mini-index, and assert that all
the offsets, and other data bits are correct.
Catch a small bug where the *first* mini-offset wasn't getting set correctly.
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py 2009-10-28 14:05:03 +0000
+++ b/bzrlib/chk_index.py 2009-10-28 16:24:42 +0000
@@ -32,6 +32,25 @@
# Further, this also doesn't assume the groups themselves have a data header.
# So we have to include all the relevant inner start + end value.
+# Some notes
+# The group index seems like using fixed-width fields could have quite a bit
+# of redundancy. Namely 'zlib.compress()' of a 32k group index collapses down
+# to around 22k.
+# Similarly, the mini-index has some redundancy. It compresses 16k => 13k.
+# Note that this portion of the index is 'tiny' in comparison with all of the
+# entry records (11MB). However, it is also a portion of the index that
+# we expect to be rather important.
+# For all of LP, this is <64kB. The idea, though, is to figure out what a
+# reasonable 'minimal remote read' would be. That said, we are getting away
+# from tuning dumb transport performance. So we could just tune the remote
+# code to read a 64kB page, and generally get the whole index and all groups
+# to start with.
+# If we are going to always read it, then compressing it makes sense, since
+# we can make a smaller read and decompress it in memory.
+# Though I'm thinking to make a 'local-optimized' version of the code which
+# uses mmap, and if we tuned that to *not* parse the index and groups code
+# all the time, then we would want fixed-size records.
+
_CHKSIGNATURE = "Groupcompress CHK Index 1\n"
Struct = getattr(struct, 'Struct', None)
@@ -176,9 +195,9 @@
header.entry_group_length_bytes,
)) = cls._read_line(bytes, pos, 'entry', 5)
# TODO: should we be writing out entry_offset
- header.entry_offset = header.mini_index_offset + (
- header.num_mini_index_entries *
- header.mini_index_entry_offset_bytes)
+ header.entry_offset = header.group_index_offset + (
+ header.num_groups * (header.group_index_start_bytes +
+ header.group_index_length_bytes))
return header
@classmethod
@@ -253,14 +272,12 @@
num_entries, max_inner_start, max_inner_length):
header = cls()
header.num_groups = num_groups
- header.group_index_offset = header._RESERVED_HEADER_BYTES
find_bytes = header.find_min_bytes_for
header.group_index_start_bytes = find_bytes(max_group_offset)
header.group_index_length_bytes = find_bytes(max_group_length)
header._build_group_coder()
group_index_bytes = header.num_groups * header.group_coder.size
- header.mini_index_offset = (header.group_index_offset
- + group_index_bytes)
+ header.mini_index_offset = header._RESERVED_HEADER_BYTES
header.num_entries = num_entries
header.entry_hash_bytes = 20
header.entry_group_offset_bytes = find_bytes(header.num_groups)
@@ -299,8 +316,10 @@
last_offset = entry_bytes + max_mini_index_entry_bytes
header.mini_index_entry_offset_bytes = find_bytes(last_offset)
header._build_mini_index_coder()
- header.entry_offset = header.mini_index_offset + (
+ header.group_index_offset = header.mini_index_offset + (
header.num_mini_index_entries * header.mini_index_coder.size)
+ header.entry_offset = header.group_index_offset + (
+ header.num_groups * header.group_coder.size)
return header
@@ -444,6 +463,7 @@
# Note: In testing with Launchpad, the fixed-width for groups and
# lengths was not a big win. Namely, there was a lot of wasted
# space, especially in the second field
+ # zlib.compress(g) went from 32k => 22k for launchpad.
return bytes
def _entry_to_bytes(self, bit_key, entry_coder):
@@ -462,21 +482,29 @@
bit_key_to_offset = _bit_key_to_offset[header.num_mini_index_entries]
null_entry = '\x00'*header.mini_index_entry_offset_bytes
mini_index_bytes = [null_entry] * header.num_mini_index_entries
+ have_mini_index = header.num_mini_index_entries > 0
entry_bytes = []
- last_mini_offset = 0
+ last_mini_offset = -1
for idx, bit_key in enumerate(sorted(self._nodes)):
+ # TODO: We could probably just use bit_key rather than passing it
+ # into 'entry_coder'. This would save us from copying those
+ # 20 bytes into the new string.
bytes = self._entry_to_bytes(bit_key, header.entry_coder)
entry_bytes.append(bytes)
- mini_offset = bit_key_to_offset(bit_key)
- if mini_offset != last_mini_offset:
- if mini_offset <= last_mini_offset:
- raise AssertionError('We should only be increasing'
- ' in mini_offset, never decreasing')
- assert mini_index_bytes[mini_offset] is null_entry
- mini_bytes = header.mini_index_coder.pack(header.entry_offset
- + (idx * header.entry_coder.size))
- mini_index_bytes[mini_offset] = mini_bytes
- last_mini_offset = mini_offset
+ if have_mini_index:
+ mini_offset = bit_key_to_offset(bit_key)
+ if mini_offset != last_mini_offset:
+ if mini_offset <= last_mini_offset:
+ raise AssertionError('We should only be increasing'
+ ' in mini_offset, never decreasing')
+ assert mini_index_bytes[mini_offset] is null_entry
+ # Note: Using fixed-size entries for mini-index is slightly
+ # inefficient. For my 4096 LP index, I can
+ # zlib.compress from 16k to 13k.
+ mini_bytes = header.mini_index_coder.pack(
+ header.entry_offset + (idx * header.entry_coder.size))
+ mini_index_bytes[mini_offset] = mini_bytes
+ last_mini_offset = mini_offset
return mini_index_bytes, entry_bytes
def finish(self):
@@ -488,15 +516,17 @@
header = self._build_header()
output_bytes = []
header_bytes = header.serialize()
- if len(header_bytes) != header.group_index_offset:
+ if len(header_bytes) != header.mini_index_offset:
raise AssertionError('header_bytes is not the same length as'
- ' where the group_index_offset is expected to start.')
+ ' where the mini_index_offset is expected to start.')
output_bytes.append(header_bytes)
- output_bytes.extend(self._build_group_index(header))
+ group_bytes = self._build_group_index(header)
assert sum(map(len, output_bytes)) == header.mini_index_offset
mini_index_bytes, entry_bytes = self._build_mini_index_and_entries(
header)
output_bytes.extend(mini_index_bytes)
+ assert sum(map(len, output_bytes)) == header.group_index_offset
+ output_bytes.extend(group_bytes)
assert sum(map(len, output_bytes)) == header.entry_offset
output_bytes.extend(entry_bytes)
return output_bytes
=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py 2009-10-28 14:01:34 +0000
+++ b/bzrlib/tests/test_chk_index.py 2009-10-28 16:24:42 +0000
@@ -18,10 +18,12 @@
"""Indices customized for groupcompress + CHK nodes."""
import binascii
+import struct
from bzrlib import (
chk_index,
errors,
+ osutils,
tests,
)
@@ -35,6 +37,35 @@
k3 = ('sha1:e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98',)
bit_k3 = binascii.a2b_hex(k3[0][5:])
+ def make_nodes(self, count, group_count):
+ # we need to make up some random groups, and populate some sha keys to
+ # point into those groups
+ s = osutils.sha()
+ # For now, we just use fixed-width groups
+ groups = []
+ group_start = 0
+ for i in xrange(group_count):
+ group_length = 12345
+ groups.append((group_start, group_length))
+ group_start += group_length
+ # And now we sequentially distribute the keys across the various groups
+ in_group_start = [0]*group_count
+ nodes = []
+ for i in xrange(count):
+ group_num = i % group_count
+ in_length = 200
+ s.update(str(i))
+ key = ('sha1:' + s.hexdigest(),)
+ group = groups[group_num]
+ nodes.append((key, group[0], group[1], in_group_start[group_num],
+ in_length))
+ in_group_start[group_num] += in_length
+ # TODO: We could change this around a bit, so that the group-length is
+ # actually determined by how many keys are in it, and their
+ # length. It requires an extra pass, one to determine total
+ # length, and another to actually build up the nodes.
+ return nodes
+
def test__ensure_group(self):
builder = chk_index.CHKIndexBuilder()
self.assertEqual({(0, 0): 0}, builder._groups)
@@ -268,13 +299,14 @@
builder.add_node(self.k2, 1000, 20000, 0, 1000)
builder.add_node(self.k3, 1000, 20000, 1000, 500)
bytes = ''.join(builder.finish())
- self.assertEqualDiff("""Groupcompress CHK Index 1
+ header = """Groupcompress CHK Index 1
hash=sha1 20
groups=120 2 2 3
-mini_index=132 0 1
+mini_index=120 0 1
entry=3 20 1 2 2
-\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
-""", bytes[:120])
+"""
+ self.assertEqualDiff(header, bytes[:len(header)])
+ self.assertEqual('\n'*(120-len(header)), bytes[len(header):120])
b2a_hex = binascii.b2a_hex
self.assertEqualDiff(
# First the group info
@@ -288,31 +320,92 @@
+ b2a_hex(self.bit_k3) + '02' '03e8' '01f4'
, b2a_hex(bytes[120:]))
+ def test_build_small_index(self):
+ builder = chk_index.CHKIndexBuilder()
+ nodes = self.make_nodes(400, 20)
+ for node in nodes:
+ builder.add_node(*node)
+ bytes = ''.join(builder.finish())
+ header = """Groupcompress CHK Index 1
+hash=sha1 20
+groups=152 4 2 21
+mini_index=120 16 2
+entry=400 20 1 2 1
+"""
+ self.assertEqualDiff(header, bytes[:len(header)])
+ self.assertEqual('\n'*(120-len(header)), bytes[len(header):120])
+ h = chk_index.CHKIndexHeader.deserialize(bytes[:120])
+ # Next comes the mini-index
+ mini_index_bytes = bytes[120:152]
+ # Each mini index entry is 2 bytes wide, representing an offset into
+ # the file
+ offsets = struct.unpack('>16H', mini_index_bytes)
+ self.assertEqualDiff('\n'.join([str(o) for o in [
+ 278, 734, 1382, 1958, 2582, 3038, 3854, 4454,
+ 5198, 5798, 6398, 7190, 7814, 8294, 8918, 9398]]),
+ '\n'.join([str(o) for o in offsets]))
+ # Next comes the group information, each entry in groups is a 4 byte
+ # pack offset, and a 2 byte group length
+ entry_offset = 152 + 21 * (4 + 2)
+ self.assertEqual(entry_offset, h.entry_offset)
+ group_bytes = bytes[152:entry_offset]
+ offsets = struct.unpack('>' + ('IH' * 21), group_bytes)
+ group_starts = range(0, 12345*20, 12345)
+ group_starts.insert(0, 0)
+ self.assertEqual(tuple(group_starts), offsets[0::2])
+ group_lengths = [12345] * 20
+ group_lengths.insert(0, 0)
+ self.assertEqual(tuple(group_lengths), offsets[1::2])
+ groups = {}
+ for i in range(len(group_starts)):
+ groups[(group_starts[i], group_lengths[i])] = i
+ # And now the actual entry content. Each line is a 20 byte hash,
+ # followed by a 1 byte group number, a 2 byte start-in-group position
+ # and a 1-byte length in group
+ entries = struct.unpack('>' + ('20sBHB' * 400), bytes[entry_offset:])
+ entries = [entries[i:i+4] for i in xrange(0, len(entries), 4)]
+ # The group locations are 'round-robin' across the 20 groups, however
+ # the entries are sorted by hash, so we have to sort the nodes
+ # appropriately
+ expected_entries = []
+ formatted_entries = []
+ nodes = sorted(nodes)
+ for idx, node in enumerate(nodes):
+ group_num = groups[(node[1], node[2])]
+ entry = entries[idx]
+ sha1_key = ('sha1:' + binascii.b2a_hex(entry[0]),)
+ self.assertEqual((node[0], group_num, node[3], node[4]),
+ (sha1_key,) + entry[1:])
+
class TestCHKIndexHeader(tests.TestCase):
def test_serialize(self):
header = chk_index.CHKIndexHeader()
header.num_groups = 541
- header.group_index_offset = 1025
- header.group_index_start_bytes = 4
- header.group_index_length_bytes = 4
- header.mini_index_offset = (header.group_index_offset +
- (header.num_groups * 12))
+ header.mini_index_offset = 120
+ header.mini_index_entry_offset_bytes = 4
header.num_mini_index_entries = 256
- header.mini_index_entry_offset_bytes = 4
+ header.group_index_offset = header.mini_index_offset + (
+ header.num_mini_index_entries *
+ header.mini_index_entry_offset_bytes)
+ header.group_index_start_bytes = 4
+ header.group_index_length_bytes = 4
header.num_entries = 12345
header.entry_hash_bytes = 20
header.entry_group_offset_bytes = 2
header.entry_group_start_bytes = 4
header.entry_group_length_bytes = 4
- self.assertEqualDiff("""Groupcompress CHK Index 1
+ bytes = header.serialize()
+ header_bytes = """Groupcompress CHK Index 1
hash=sha1 20
-groups=1025 4 4 541
-mini_index=7517 256 4
+groups=1144 4 4 541
+mini_index=120 256 4
entry=12345 20 2 4 4
-\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
-""", header.serialize())
+"""
+ self.assertEqualDiff(header_bytes, bytes[:len(header_bytes)])
+ self.assertEqual('\n'*(120-len(header_bytes)),
+ bytes[len(header_bytes):120])
# We pad with \n so that 'head foo.cix' will only show the nice header,
# and not all the crufty content. We *could* pad with \0 or something
# else, but this works just fine.
@@ -333,38 +426,41 @@
# an order-of-magnitude bigger, or 1billion entries before we
# would overflow. And our spec hints at <100M entries, estimating
# at more like 10-30M entries for the 1Mx1M repository.
- self.assertEqualDiff("""Groupcompress CHK Index 1
+ bytes = header.serialize()
+ header_bytes = """Groupcompress CHK Index 1
hash=sha1 20
-groups=120 8 4 100000000
-mini_index=1200000120 65536 4
+groups=262264 8 4 100000000
+mini_index=120 65536 4
entry=100000000 20 4 4 4
-
-""", header.serialize())
+"""
+ self.assertEqualDiff(header_bytes, bytes[:len(header_bytes)])
+ self.assertEqual('\n'*(120-len(header_bytes)),
+ bytes[len(header_bytes):120])
def test_deserialize(self):
content = """Groupcompress CHK Index 1
hash=sha1 20
-groups=1025 4 4 541
-mini_index=7517 256 4
+groups=1144 4 4 541
+mini_index=120 256 4
entry=12345 20 2 4 4
"""
header = chk_index.CHKIndexHeader.deserialize(content)
self.assertEqual(1, header.version)
self.assertEqual('sha1', header.hash_function)
self.assertEqual(20, header.num_hash_bytes)
+ self.assertEqual(120, header.mini_index_offset)
+ self.assertEqual(256, header.num_mini_index_entries)
+ self.assertEqual(4, header.mini_index_entry_offset_bytes)
self.assertEqual(541, header.num_groups)
- self.assertEqual(1025, header.group_index_offset)
+ self.assertEqual(1144, header.group_index_offset)
self.assertEqual(4, header.group_index_start_bytes)
self.assertEqual(4, header.group_index_length_bytes)
- self.assertEqual(7517, header.mini_index_offset)
- self.assertEqual(256, header.num_mini_index_entries)
- self.assertEqual(4, header.mini_index_entry_offset_bytes)
self.assertEqual(12345, header.num_entries)
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(2, header.entry_group_offset_bytes)
self.assertEqual(4, header.entry_group_start_bytes)
self.assertEqual(4, header.entry_group_length_bytes)
- self.assertEqual(7517+256*4, header.entry_offset)
+ self.assertEqual(1144+541*8, header.entry_offset)
def assertBadHeader(self, content):
self.assertRaises(ValueError,
@@ -517,19 +613,19 @@
num_entries=10*1000,
max_inner_start=4*1024*1024,
max_inner_length=65535)
+ self.assertEqual(header._RESERVED_HEADER_BYTES,
+ header.mini_index_offset)
+ self.assertEqual(256, header.num_mini_index_entries)
+ self.assertEqual(4, header.mini_index_entry_offset_bytes)
self.assertEqual(10, header.num_groups)
- self.assertEqual(header._RESERVED_HEADER_BYTES,
+ self.assertEqual(header._RESERVED_HEADER_BYTES + 256*4,
header.group_index_offset)
self.assertEqual(4, header.group_index_start_bytes)
self.assertEqual(4, header.group_index_length_bytes)
- self.assertEqual(header._RESERVED_HEADER_BYTES + (10*(4+4)),
- header.mini_index_offset)
- self.assertEqual(256, header.num_mini_index_entries)
self.assertEqual(10000, header.num_entries)
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(1, header.entry_group_offset_bytes)
- self.assertEqual(4, header.mini_index_entry_offset_bytes)
- self.assertEqual(header.mini_index_offset + 256*4,
+ self.assertEqual(header.group_index_offset + 10*(4+4),
header.entry_offset)
def test_build_header_for_tiny(self):
@@ -537,19 +633,20 @@
num_groups=3, max_group_offset=40000,
max_group_length=20000,
num_entries=5, max_inner_start=40000, max_inner_length=40000)
+ self.assertEqual(header._RESERVED_HEADER_BYTES,
+ header.mini_index_offset)
+ self.assertEqual(0, header.num_mini_index_entries)
self.assertEqual(3, header.num_groups)
+ self.assertEqual(1, header.mini_index_entry_offset_bytes)
self.assertEqual(header._RESERVED_HEADER_BYTES,
header.group_index_offset)
self.assertEqual(2, header.group_index_start_bytes)
self.assertEqual(2, header.group_index_length_bytes)
- self.assertEqual(header._RESERVED_HEADER_BYTES + (3*(2+2)),
- header.mini_index_offset)
- self.assertEqual(0, header.num_mini_index_entries)
self.assertEqual(5, header.num_entries)
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(1, header.entry_group_offset_bytes)
- self.assertEqual(1, header.mini_index_entry_offset_bytes)
- self.assertEqual(header.mini_index_offset, header.entry_offset)
+ self.assertEqual(header.group_index_offset + (3*(2+2)),
+ header.entry_offset)
def test_build_header_for_huge(self):
header = chk_index.CHKIndexHeader.build_header_for(
@@ -560,15 +657,15 @@
max_inner_length=600*1024*1024)
self.assertEqual(100000, header.num_groups)
self.assertEqual(header._RESERVED_HEADER_BYTES,
+ header.mini_index_offset)
+ self.assertEqual(65536, header.num_mini_index_entries)
+ self.assertEqual(4, header.mini_index_entry_offset_bytes)
+ self.assertEqual(header._RESERVED_HEADER_BYTES + 65536*4,
header.group_index_offset)
self.assertEqual(8, header.group_index_start_bytes)
self.assertEqual(4, header.group_index_length_bytes)
- self.assertEqual(header._RESERVED_HEADER_BYTES + (100000*(8+4)),
- header.mini_index_offset)
- self.assertEqual(65536, header.num_mini_index_entries)
self.assertEqual(60*1024*1024, header.num_entries)
self.assertEqual(20, header.entry_hash_bytes)
self.assertEqual(4, header.entry_group_offset_bytes)
- self.assertEqual(4, header.mini_index_entry_offset_bytes)
- self.assertEqual(header.mini_index_offset + 65536*4,
+ self.assertEqual(header.group_index_offset + 100000*(8+4),
header.entry_offset)
More information about the bazaar-commits
mailing list