Rev 4795: Initial work on the CHKIndex class itself. in http://bazaar.launchpad.net/~jameinel/bzr/chk-index
John Arbash Meinel
john at arbash-meinel.com
Wed Oct 28 18:07:33 GMT 2009
At http://bazaar.launchpad.net/~jameinel/bzr/chk-index
------------------------------------------------------------
revno: 4795
revision-id: john at arbash-meinel.com-20091028180725-z4890ne9xvf7vyys
parent: john at arbash-meinel.com-20091028170410-7s6zo4cwgcn65p8f
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Wed 2009-10-28 13:07:25 -0500
message:
Initial work on the CHKIndex class itself.
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py 2009-10-28 17:04:10 +0000
+++ b/bzrlib/chk_index.py 2009-10-28 18:07:25 +0000
@@ -544,4 +544,122 @@
output_bytes.extend(group_bytes)
assert sum(map(len, output_bytes)) == header.entry_offset
output_bytes.extend(entry_bytes)
- return output_bytes
+ return header.entry_offset, output_bytes
+
+
+#TODO: Create a global 'integer' cache. The way the index is set up, we will
+# have an integer reference to the group offset in every record line. So
+# for 400k keys, we have 400k references to 4k integers. Python internally
+# interns the ints up to 255 or so. But we will use a lot more than that
+# quite frequently. This can easily be shared between all chk indexes.
+
+# Note: I'm considering having 2 separate instances. One for local indexes,
+#       for which we could use mmap, and another for cases where we either don't
+# want to use mmap, or remote where we have to go via the transport
+# abstraction.
+class CHKIndex(object):
+ """Access to Groupcompress CHK nodes via approx GraphIndex interface.
+
+    This class is specialized to handle the specifics of CHK pages stored in
+ Groupcompress blocks. Note that this class is not a generic key=>value
+ mapping.
+ """
+
+ # By default we read the 'root' page in a 64kB read
+    _INITIAL_READ_SIZE = 64*1024
+
+ def __init__(self, transport, name, start_of_entries, size):
+ self._transport = transport
+ self._name = name
+ # Note: Even better than knowing the total-size of the index, would be
+ # knowing the size of the header + mini-index + groups. That way
+ # we could always ensure that in 1 round trip we would read all
+ # the indexing information.
+ # There are several layers of granularity, and all are a bit of a
+ # tradeoff. We could, for example, only read the mini-index, and
+ # not the groups. The mini-index is generally about half the size
+ # of the groups anyway. And while when looking up a key you don't
+ # need all of the index entries, the total number of entries is
+ # going to be fairly small. LP's 400k keys uses a mini-index of
+ # just 4k entries (16kB, possibly 12kB if we get 3-byte values.)
+ # If you include 'groups' then it increases to 16+4k*8=48k for
+ # mini-index + groups. Though that is probably ok as an initial
+ # read request.
+ self._start_of_entries = start_of_entries
+ self._size = size
+ self._header = None
+        # _groups will be a list of (offset, length) tuples, one per group
+ self._groups = None
+ # There are a couple possible internal structures.
+ # One is to have all entries in a global lookup dictionary.
+ # Another is to have dicts hanging off the associated mini-index
+ # location. Note, though, that tiny indexes have no 'mini-index'
+ # (everything falls into a single bucket.)
+ # It is probably more efficient to have one large dict than many small
+ # ones. Especially given that dicts < 50k entries have a resize-factor
+ # of 4:1, rather than 2:1.
+ self._mini_index = None
+
+ def _ensure_header(self):
+ if self._header is not None:
+ return
+ if self._start_of_entries is None:
+ bork
+ read_size = self._start_of_entries
+ if self._size is None:
+ bork
+ else:
+ pass
+ # TODO: If the index is small enough, just read the whole thing in
+ # one request
+ # if self._size < self._INITIAL_READ_SIZE:
+ # read_size = self._INITIAL_READ_SIZE
+ _, bytes = self._transport.readv(self._name, [(0, read_size)]).next()
+ header_bytes = bytes[:CHKIndexHeader._RESERVED_HEADER_BYTES]
+ self._header = CHKIndexHeader.deserialize(header_bytes)
+ # The above checks ensure that our first read always includes the mini
+ # index and groups.
+ self._parse_mini_index(bytes)
+ self._parse_groups(bytes)
+ # TODO: Once we change the code to also ensure that sometimes we will
+ # read the entries themselves, add code to parse them.
+
+ def _parse_mini_index(self, bytes):
+ """Parse the mini-index information from bytes.
+
+ :param bytes: Bytes of the index starting at the byte 0.
+ """
+ stuple = static_tuple.StaticTuple
+ if self._header.num_mini_index_entries == 0:
+ # (offset, loaded?)
+ self._mini_index = [stuple(self._header.entry_offset, False)]
+ return
+ bork
+ self._header._build_mini_index_coder()
+ start = self._header.mini_index_offset
+ end = self._header.group_index_offset
+ format = self._header.group_coder.format[1:]
+ format = '>' + (format * self._header.num_mini_index_entries)
+ as_ints = struct.unpack(format, bytes[start:end])
+ self._mini_index = [stuple(i, False) for i in as_ints]
+
+ def _parse_groups(self, bytes):
+ """Parse the group information from bytes.
+
+ :param bytes: These bytes should be offset from 0 as the start of the
+ index.
+ """
+ start = self._header.group_index_offset
+ end = self._header.entry_offset
+ self._header._build_group_coder()
+ format = self._header.group_coder.format[1:]
+ format = '>' + (format * self._header.num_groups)
+ as_ints = struct.unpack(format, bytes[start:end])
+ stuple = static_tuple.StaticTuple
+ self._groups = [stuple(as_ints[i], as_ints[i+1])
+ for i in xrange(0, len(as_ints), 2)]
+
+ def key_count(self):
+ """How many entries are in this index."""
+ self._ensure_header()
+ return self._header.num_entries
=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py 2009-10-28 17:04:10 +0000
+++ b/bzrlib/tests/test_chk_index.py 2009-10-28 18:07:25 +0000
@@ -28,43 +28,44 @@
)
+def make_chk_nodes(count, group_count):
+ # we need to make up some random groups, and populate some sha keys to
+ # point into those groups
+ s = osutils.sha()
+ # For now, we just use fixed-width groups
+ groups = []
+ group_start = 0
+ for i in xrange(group_count):
+ group_length = 12345
+ groups.append((group_start, group_length))
+ group_start += group_length
+ # And now we sequentially distribute the keys across the various groups
+ in_group_start = [0]*group_count
+ nodes = []
+ for i in xrange(count):
+ group_num = i % group_count
+ in_length = 200
+ s.update(str(i))
+ key = ('sha1:' + s.hexdigest(),)
+ group = groups[group_num]
+ nodes.append((key, group[0], group[1], in_group_start[group_num],
+ in_length))
+ in_group_start[group_num] += in_length
+ # TODO: We could change this around a bit, so that the group-length is
+ # actually determined by how many keys are in it, and their
+ # length. It requires an extra pass, one to determine total
+ # length, and another to actually build up the nodes.
+ return nodes
+
+k1 = ('sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709',)
+bit_k1 = binascii.a2b_hex(k1[0][5:])
+k2 = ('sha1:86f7e437faa5a7fce15d1ddcb9eaeaea377667b8',)
+bit_k2 = binascii.a2b_hex(k2[0][5:])
+k3 = ('sha1:e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98',)
+bit_k3 = binascii.a2b_hex(k3[0][5:])
+
class TestCHKIndexBuilder(tests.TestCase):
- k1 = ('sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709',)
- bit_k1 = binascii.a2b_hex(k1[0][5:])
- k2 = ('sha1:86f7e437faa5a7fce15d1ddcb9eaeaea377667b8',)
- bit_k2 = binascii.a2b_hex(k2[0][5:])
- k3 = ('sha1:e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98',)
- bit_k3 = binascii.a2b_hex(k3[0][5:])
-
- def make_nodes(self, count, group_count):
- # we need to make up some random groups, and populate some sha keys to
- # point into those groups
- s = osutils.sha()
- # For now, we just use fixed-width groups
- groups = []
- group_start = 0
- for i in xrange(group_count):
- group_length = 12345
- groups.append((group_start, group_length))
- group_start += group_length
- # And now we sequentially distribute the keys across the various groups
- in_group_start = [0]*group_count
- nodes = []
- for i in xrange(count):
- group_num = i % group_count
- in_length = 200
- s.update(str(i))
- key = ('sha1:' + s.hexdigest(),)
- group = groups[group_num]
- nodes.append((key, group[0], group[1], in_group_start[group_num],
- in_length))
- in_group_start[group_num] += in_length
- # TODO: We could change this around a bit, so that the group-length is
- # actually determined by how many keys are in it, and their
- # length. It requires an extra pass, one to determine total
- # length, and another to actually build up the nodes.
- return nodes
def test__ensure_group(self):
builder = chk_index.CHKIndexBuilder()
@@ -87,37 +88,37 @@
def test_add_node_bad_keys(self):
builder = chk_index.CHKIndexBuilder()
- builder.add_node(self.k1, 0, 10, 0, 1)
- builder.add_node(self.k2, 0, 10, 1, 2)
+ builder.add_node(k1, 0, 10, 0, 1)
+ builder.add_node(k2, 0, 10, 1, 2)
# Not a sha1 key
self.assertRaises(errors.BadIndexKey,
builder.add_node, ('file-id-foo',), 0, 10, 0, 1)
# multiple entries
self.assertRaises(errors.BadIndexKey,
- builder.add_node, self.k1 + self.k2, 0, 10, 0, 1)
+ builder.add_node, k1 + k2, 0, 10, 0, 1)
# incomplete 'sha1'
self.assertRaises(errors.BadIndexKey,
- builder.add_node, (self.k1[0][:39],), 0, 10, 0, 1)
+ builder.add_node, (k1[0][:39],), 0, 10, 0, 1)
# too long 'sha1'
self.assertRaises(errors.BadIndexKey,
- builder.add_node, (self.k1[0] + 'c',), 0, 10, 0, 1)
+ builder.add_node, (k1[0] + 'c',), 0, 10, 0, 1)
# invalid char
- kbad = (self.k1[0][:20] + 'x' + self.k1[0][21:],)
+ kbad = (k1[0][:20] + 'x' + k1[0][21:],)
self.assertRaises(errors.BadIndexKey,
builder.add_node, kbad, 0, 10, 0, 1)
# sha2 != sha1
- kbad = ('sha2:' + self.k1[0][5:],)
+ kbad = ('sha2:' + k1[0][5:],)
self.assertRaises(errors.BadIndexKey,
builder.add_node, kbad, 0, 10, 0, 1)
# key already exists
self.assertRaises(errors.BadIndexDuplicateKey,
- builder.add_node, self.k1, 0, 10, 0, 1)
+ builder.add_node, k1, 0, 10, 0, 1)
def test_key_count(self):
builder = chk_index.CHKIndexBuilder()
self.assertEqual(0, builder.key_count())
- builder.add_node(self.k1, 0, 10, 0, 1)
- builder.add_node(self.k2, 0, 10, 1, 2)
+ builder.add_node(k1, 0, 10, 0, 1)
+ builder.add_node(k2, 0, 10, 1, 2)
self.assertEqual(2, builder.key_count())
def test__build_group_bytes(self):
@@ -144,6 +145,7 @@
'01200000' '00200000',
]), '\n'.join([binascii.b2a_hex(b) for b in bytes]))
+ # TODO: Test the _empty case
def test__build_group_bytes_tiny(self):
builder = chk_index.CHKIndexBuilder()
builder._ensure_group((0, 2345))
@@ -179,8 +181,8 @@
def test__entry_to_bytes(self):
builder = chk_index.CHKIndexBuilder()
b2a_hex = binascii.b2a_hex
- builder.add_node(self.k1, 0, 1000, 0, 1)
- builder.add_node(self.k2, 1000, 20000, 0, 50)
+ builder.add_node(k1, 0, 1000, 0, 1)
+ builder.add_node(k2, 1000, 20000, 0, 50)
h = builder._build_header()
builder._sort_groups()
self.assertEqual(20, h.entry_hash_bytes)
@@ -190,15 +192,16 @@
self.assertEqual(1, h.entry_group_length_bytes)
self.assertEqual('>20sBBB', h.entry_coder.format)
# (sha_hash, group number, group offset, group length)
- self.assertEqual(b2a_hex(self.bit_k1) + '01' '00' '01',
- b2a_hex(builder._entry_to_bytes(self.bit_k1, h.entry_coder)))
- self.assertEqual(b2a_hex(self.bit_k2) + '02' '00' '32',
- b2a_hex(builder._entry_to_bytes(self.bit_k2, h.entry_coder)))
+ self.assertEqual(b2a_hex(bit_k1) + '01' '00' '01',
+ b2a_hex(builder._entry_to_bytes(bit_k1, h.entry_coder)))
+ self.assertEqual(b2a_hex(bit_k2) + '02' '00' '32',
+ b2a_hex(builder._entry_to_bytes(bit_k2, h.entry_coder)))
+ # TODO: Test the _empty case
def test__build_mini_index_and_entries_tiny(self):
builder = chk_index.CHKIndexBuilder()
- builder.add_node(self.k1, 0, 1000, 0, 1)
- builder.add_node(self.k2, 1000, 20000, 0, 50)
+ builder.add_node(k1, 0, 1000, 0, 1)
+ builder.add_node(k2, 1000, 20000, 0, 50)
h = builder._build_header()
builder._sort_groups()
self.assertEqual(0, h.num_mini_index_entries)
@@ -210,45 +213,45 @@
# bit_k2 < bit_k1
self.assertEqualDiff('\n'.join([
#hash # Group # start # end/length?
- b2a_hex(self.bit_k2) + '02' '00' '32',
- b2a_hex(self.bit_k1) + '01' '00' '01',
+ b2a_hex(bit_k2) + '02' '00' '32',
+ b2a_hex(bit_k1) + '01' '00' '01',
]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
def test__bit_key_to_offset_0(self):
- self.assertEqual(0, chk_index._bit_key_to_offset_0(self.bit_k1))
- self.assertEqual(0, chk_index._bit_key_to_offset_0(self.bit_k2))
- self.assertEqual(0, chk_index._bit_key_to_offset_0(self.bit_k3))
+ self.assertEqual(0, chk_index._bit_key_to_offset_0(bit_k1))
+ self.assertEqual(0, chk_index._bit_key_to_offset_0(bit_k2))
+ self.assertEqual(0, chk_index._bit_key_to_offset_0(bit_k3))
def test__bit_key_to_offset_16(self):
# da39... maps to 'd'
- self.assertEqual(0xd, chk_index._bit_key_to_offset_16(self.bit_k1))
- self.assertEqual(0x8, chk_index._bit_key_to_offset_16(self.bit_k2))
- self.assertEqual(0xe, chk_index._bit_key_to_offset_16(self.bit_k3))
+ self.assertEqual(0xd, chk_index._bit_key_to_offset_16(bit_k1))
+ self.assertEqual(0x8, chk_index._bit_key_to_offset_16(bit_k2))
+ self.assertEqual(0xe, chk_index._bit_key_to_offset_16(bit_k3))
def test__bit_key_to_offset_256(self):
# da39... maps to 'da'
- self.assertEqual(0xda, chk_index._bit_key_to_offset_256(self.bit_k1))
- self.assertEqual(0x86, chk_index._bit_key_to_offset_256(self.bit_k2))
- self.assertEqual(0xe9, chk_index._bit_key_to_offset_256(self.bit_k3))
+ self.assertEqual(0xda, chk_index._bit_key_to_offset_256(bit_k1))
+ self.assertEqual(0x86, chk_index._bit_key_to_offset_256(bit_k2))
+ self.assertEqual(0xe9, chk_index._bit_key_to_offset_256(bit_k3))
def test__bit_key_to_offset_4096(self):
# da39... maps to 0xda3
- self.assertEqual(0xda3, chk_index._bit_key_to_offset_4096(self.bit_k1))
- self.assertEqual(0x86f, chk_index._bit_key_to_offset_4096(self.bit_k2))
- self.assertEqual(0xe9d, chk_index._bit_key_to_offset_4096(self.bit_k3))
+ self.assertEqual(0xda3, chk_index._bit_key_to_offset_4096(bit_k1))
+ self.assertEqual(0x86f, chk_index._bit_key_to_offset_4096(bit_k2))
+ self.assertEqual(0xe9d, chk_index._bit_key_to_offset_4096(bit_k3))
def test__bit_key_to_offset_65536(self):
# da39... maps to 0xda39
offset_func = chk_index._bit_key_to_offset_65536
- self.assertEqual(0xda39, offset_func(self.bit_k1))
- self.assertEqual(0x86f7, offset_func(self.bit_k2))
- self.assertEqual(0xe9d7, offset_func(self.bit_k3))
+ self.assertEqual(0xda39, offset_func(bit_k1))
+ self.assertEqual(0x86f7, offset_func(bit_k2))
+ self.assertEqual(0xe9d7, offset_func(bit_k3))
def test__build_mini_index_and_entries_16(self):
builder = chk_index.CHKIndexBuilder()
- builder.add_node(self.k1, 0, 1000, 0, 1)
- builder.add_node(self.k2, 1000, 20000, 0, 1000)
- builder.add_node(self.k3, 1000, 20000, 1000, 500)
+ builder.add_node(k1, 0, 1000, 0, 1)
+ builder.add_node(k2, 1000, 20000, 0, 1000)
+ builder.add_node(k3, 1000, 20000, 1000, 500)
h = builder._build_header()
builder._sort_groups()
self.assertEqual(0, h.num_mini_index_entries)
@@ -284,21 +287,23 @@
self.assertEqual('>20sBHH', h.entry_coder.format)
self.assertEqualDiff('\n'.join([
#hash # Group # start # end/length?
- b2a_hex(self.bit_k2) + '02' '0000' '03e8',
- b2a_hex(self.bit_k1) + '01' '0000' '0001',
- b2a_hex(self.bit_k3) + '02' '03e8' '01f4',
+ b2a_hex(bit_k2) + '02' '0000' '03e8',
+ b2a_hex(bit_k1) + '01' '0000' '0001',
+ b2a_hex(bit_k3) + '02' '03e8' '01f4',
]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
self.assertEqual(0x0084, h.entry_offset)
self.assertEqual(0x009d, h.entry_offset + len(entry_bytes[0]))
self.assertEqual(0x00b6, h.entry_offset + len(entry_bytes[0])
+ len(entry_bytes[1]))
+ # TODO: Test the _empty case
def test_build_tiny_index(self):
builder = chk_index.CHKIndexBuilder()
- builder.add_node(self.k1, 0, 1000, 0, 1)
- builder.add_node(self.k2, 1000, 20000, 0, 1000)
- builder.add_node(self.k3, 1000, 20000, 1000, 500)
- bytes = ''.join(builder.finish())
+ builder.add_node(k1, 0, 1000, 0, 1)
+ builder.add_node(k2, 1000, 20000, 0, 1000)
+ builder.add_node(k3, 1000, 20000, 1000, 500)
+ entry_offset, bytes = builder.finish()
+ bytes = ''.join(bytes)
header = """Groupcompress CHK Index 1
hash=sha1 20
mini_index=120 0 1
@@ -307,6 +312,7 @@
"""
self.assertEqualDiff(header, bytes[:len(header)])
self.assertEqual('\n'*(120-len(header)), bytes[len(header):120])
+ self.assertEqual(120+3*(2+2), entry_offset)
b2a_hex = binascii.b2a_hex
self.assertEqualDiff(
# First the group info
@@ -315,17 +321,18 @@
'03e8' '4e20' # group 2 @ offet 1000, length 20000
'' # No mini index
# 3 entries follow
- + b2a_hex(self.bit_k2) + '02' '0000' '03e8'
- + b2a_hex(self.bit_k1) + '01' '0000' '0001'
- + b2a_hex(self.bit_k3) + '02' '03e8' '01f4'
+ + b2a_hex(bit_k2) + '02' '0000' '03e8'
+ + b2a_hex(bit_k1) + '01' '0000' '0001'
+ + b2a_hex(bit_k3) + '02' '03e8' '01f4'
, b2a_hex(bytes[120:]))
def test_build_small_index(self):
builder = chk_index.CHKIndexBuilder()
- nodes = self.make_nodes(400, 20)
+ nodes = make_chk_nodes(400, 20)
for node in nodes:
builder.add_node(*node)
- bytes = ''.join(builder.finish())
+ entry_offset, bytes = builder.finish()
+ bytes = ''.join(bytes)
header = """Groupcompress CHK Index 1
hash=sha1 20
mini_index=120 16 2
@@ -346,7 +353,7 @@
'\n'.join([str(o) for o in offsets]))
# Next comes the group information, each entry in groups is a 4 byte
# pack offset, and a 2 byte group length
- entry_offset = 152 + 21 * (4 + 2)
+ self.assertEqual(152 + 21 * (4 + 2), entry_offset)
self.assertEqual(entry_offset, h.entry_offset)
group_bytes = bytes[152:entry_offset]
offsets = struct.unpack('>' + ('IH' * 21), group_bytes)
@@ -628,6 +635,7 @@
self.assertEqual(header.group_index_offset + 10*(4+4),
header.entry_offset)
+ # TODO: Test the _empty case
def test_build_header_for_tiny(self):
header = chk_index.CHKIndexHeader.build_header_for(
num_groups=3, max_group_offset=40000,
@@ -669,3 +677,31 @@
self.assertEqual(4, header.entry_group_offset_bytes)
self.assertEqual(header.group_index_offset + 100000*(8+4),
header.entry_offset)
+
+
+class TestCHKIndex(tests.TestCaseWithMemoryTransport):
+
+ def make_index(self, nodes=[]):
+ builder = chk_index.CHKIndexBuilder()
+ for node in nodes:
+ builder.add_node(*node)
+ entry_offset, bytes = builder.finish()
+ trans = self.get_transport()
+ size = trans.put_bytes('index', ''.join(bytes))
+ return chk_index.CHKIndex(trans, 'index', entry_offset, size)
+
+ # TODO: Test the _empty case
+ def test__ensure_header_tiny(self):
+ index = self.make_index([(k1, 0, 1000, 0, 10),
+ (k2, 0, 1000, 10, 9),
+ (k3, 0, 1000, 19, 9),
+ ])
+ index._ensure_header()
+ self.assertIsNot(None, index._header)
+ self.assertEqual(3, index._header.num_entries)
+ self.assertEqual(2, index._header.num_groups)
+ self.assertIsNot(None, index._groups)
+ self.assertEqual([(0, 0), (0, 1000)], index._groups)
+ # TODO: the value of _mini_index when there isn't an index present is
+ # currently up for grabs.
+ self.assertEqual([(126, False)], index._mini_index)
More information about the bazaar-commits
mailing list