Rev 4795: Initial work on the CHKIndex class itself. in http://bazaar.launchpad.net/~jameinel/bzr/chk-index

John Arbash Meinel john at arbash-meinel.com
Wed Oct 28 18:07:33 GMT 2009


At http://bazaar.launchpad.net/~jameinel/bzr/chk-index

------------------------------------------------------------
revno: 4795
revision-id: john at arbash-meinel.com-20091028180725-z4890ne9xvf7vyys
parent: john at arbash-meinel.com-20091028170410-7s6zo4cwgcn65p8f
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Wed 2009-10-28 13:07:25 -0500
message:
  Initial work on the CHKIndex class itself.
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py	2009-10-28 17:04:10 +0000
+++ b/bzrlib/chk_index.py	2009-10-28 18:07:25 +0000
@@ -544,4 +544,122 @@
         output_bytes.extend(group_bytes)
         assert sum(map(len, output_bytes)) == header.entry_offset
         output_bytes.extend(entry_bytes)
-        return output_bytes
+        return header.entry_offset, output_bytes
+
+
+#TODO: Create a global 'integer' cache. The way the index is set up, we will
+#      have an integer reference to the group offset in every record line. So
+#      for 400k keys, we have 400k references to 4k integers. Python internally
+#      interns the ints up to 255 or so. But we will use a lot more than that
+#      quite frequently. This can easily be shared between all chk indexes.
+
+# Note: I'm considering having 2 separate instances. One for local indexes
+#       which we could use mmap, and another for cases where we either don't
+#       want to use mmap, or remote where we have to go via the transport
+#       abstraction.
+class CHKIndex(object):
+    """Access to Groupcompress CHK nodes via approx GraphIndex interface.
+
+    This class is specialized to handle the specifics of CHK pages stored in
+    Groupcompress blocks. Note that this class is not a generic key=>value
+    mapping.
+    """
+
+    # By default we read the 'root' page in a 64kB read
+    _INITIAL_READ_SIZE = 64*1024
+
+    def __init__(self, transport, name, start_of_entries, size):
+        self._transport = transport
+        self._name = name
+        # Note: Even better than knowing the total-size of the index, would be
+        #       knowing the size of the header + mini-index + groups. That way
+        #       we could always ensure that in 1 round trip we would read all
+        #       the indexing information.
+        #       There are several layers of granularity, and all are a bit of a
+        #       tradeoff. We could, for example, only read the mini-index, and
+        #       not the groups. The mini-index is generally about half the size
+        #       of the groups anyway. And while when looking up a key you don't
+        #       need all of the index entries, the total number of entries is
+        #       going to be fairly small. LP's 400k keys uses a mini-index of
+        #       just 4k entries (16kB, possibly 12kB if we get 3-byte values.)
+        #       If you include 'groups' then it increases to 16+4k*8=48k for
+        #       mini-index + groups. Though that is probably ok as an initial
+        #       read request.
+        self._start_of_entries = start_of_entries
+        self._size = size
+        self._header = None
+        # Groups will be a tuple mapping offset => group info
+        self._groups = None
+        # There are a couple possible internal structures.
+        # One is to have all entries in a global lookup dictionary.
+        # Another is to have dicts hanging off the associated mini-index
+        # location.  Note, though, that tiny indexes have no 'mini-index'
+        # (everything falls into a single bucket.)
+        # It is probably more efficient to have one large dict than many small
+        # ones. Especially given that dicts < 50k entries have a resize-factor
+        # of 4:1, rather than 2:1.
+        self._mini_index = None
+
+    def _ensure_header(self):
+        if self._header is not None:
+            return
+        if self._start_of_entries is None:
+            bork
+        read_size = self._start_of_entries
+        if self._size is None:
+            bork
+        else:
+            pass
+            # TODO: If the index is small enough, just read the whole thing in
+            #       one request
+            # if self._size < self._INITIAL_READ_SIZE:
+            #     read_size = self._INITIAL_READ_SIZE
+        _, bytes = self._transport.readv(self._name, [(0, read_size)]).next()
+        header_bytes = bytes[:CHKIndexHeader._RESERVED_HEADER_BYTES]
+        self._header = CHKIndexHeader.deserialize(header_bytes)
+        # The above checks ensure that our first read always includes the mini
+        # index and groups.
+        self._parse_mini_index(bytes)
+        self._parse_groups(bytes)
+        # TODO: Once we change the code to also ensure that sometimes we will
+        #       read the entries themselves, add code to parse them.
+
+    def _parse_mini_index(self, bytes):
+        """Parse the mini-index information from bytes.
+
+        :param bytes: Bytes of the index starting at the byte 0.
+        """
+        stuple = static_tuple.StaticTuple
+        if self._header.num_mini_index_entries == 0:
+            # (offset, loaded?)
+            self._mini_index = [stuple(self._header.entry_offset, False)]
+            return
+        bork
+        self._header._build_mini_index_coder()
+        start = self._header.mini_index_offset
+        end = self._header.group_index_offset
+        format = self._header.group_coder.format[1:]
+        format = '>' + (format * self._header.num_mini_index_entries)
+        as_ints = struct.unpack(format, bytes[start:end])
+        self._mini_index = [stuple(i, False) for i in as_ints]
+
+    def _parse_groups(self, bytes):
+        """Parse the group information from bytes.
+
+        :param bytes: These bytes should be offset from 0 as the start of the
+            index.
+        """
+        start = self._header.group_index_offset
+        end = self._header.entry_offset
+        self._header._build_group_coder()
+        format = self._header.group_coder.format[1:]
+        format = '>' + (format * self._header.num_groups)
+        as_ints = struct.unpack(format, bytes[start:end])
+        stuple = static_tuple.StaticTuple
+        self._groups = [stuple(as_ints[i], as_ints[i+1])
+                        for i in xrange(0, len(as_ints), 2)]
+
+    def key_count(self):
+        """How many entries are in this index."""
+        self._ensure_header()
+        return self._header.num_entries

=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py	2009-10-28 17:04:10 +0000
+++ b/bzrlib/tests/test_chk_index.py	2009-10-28 18:07:25 +0000
@@ -28,43 +28,44 @@
     )
 
 
+def make_chk_nodes(count, group_count):
+    # we need to make up some random groups, and populate some sha keys to
+    # point into those groups
+    s = osutils.sha()
+    # For now, we just use fixed-width groups
+    groups = []
+    group_start = 0
+    for i in xrange(group_count):
+        group_length = 12345
+        groups.append((group_start, group_length))
+        group_start += group_length
+    # And now we sequentially distribute the keys across the various groups
+    in_group_start = [0]*group_count
+    nodes = []
+    for i in xrange(count):
+        group_num = i % group_count
+        in_length = 200
+        s.update(str(i))
+        key = ('sha1:' + s.hexdigest(),)
+        group = groups[group_num]
+        nodes.append((key, group[0], group[1], in_group_start[group_num],
+                      in_length))
+        in_group_start[group_num] += in_length
+    # TODO: We could change this around a bit, so that the group-length is
+    #       actually determined by how many keys are in it, and their
+    #       length. It requires an extra pass, one to determine total
+    #       length, and another to actually build up the nodes.
+    return nodes
+
+k1 = ('sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709',)
+bit_k1 = binascii.a2b_hex(k1[0][5:])
+k2 = ('sha1:86f7e437faa5a7fce15d1ddcb9eaeaea377667b8',)
+bit_k2 = binascii.a2b_hex(k2[0][5:])
+k3 = ('sha1:e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98',)
+bit_k3 = binascii.a2b_hex(k3[0][5:])
+
 class TestCHKIndexBuilder(tests.TestCase):
 
-    k1 = ('sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709',)
-    bit_k1 = binascii.a2b_hex(k1[0][5:])
-    k2 = ('sha1:86f7e437faa5a7fce15d1ddcb9eaeaea377667b8',)
-    bit_k2 = binascii.a2b_hex(k2[0][5:])
-    k3 = ('sha1:e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98',)
-    bit_k3 = binascii.a2b_hex(k3[0][5:])
-
-    def make_nodes(self, count, group_count):
-        # we need to make up some random groups, and populate some sha keys to
-        # point into those groups
-        s = osutils.sha()
-        # For now, we just use fixed-width groups
-        groups = []
-        group_start = 0
-        for i in xrange(group_count):
-            group_length = 12345
-            groups.append((group_start, group_length))
-            group_start += group_length
-        # And now we sequentially distribute the keys across the various groups
-        in_group_start = [0]*group_count
-        nodes = []
-        for i in xrange(count):
-            group_num = i % group_count
-            in_length = 200
-            s.update(str(i))
-            key = ('sha1:' + s.hexdigest(),)
-            group = groups[group_num]
-            nodes.append((key, group[0], group[1], in_group_start[group_num],
-                          in_length))
-            in_group_start[group_num] += in_length
-        # TODO: We could change this around a bit, so that the group-length is
-        #       actually determined by how many keys are in it, and their
-        #       length. It requires an extra pass, one to determine total
-        #       length, and another to actually build up the nodes.
-        return nodes
 
     def test__ensure_group(self):
         builder = chk_index.CHKIndexBuilder()
@@ -87,37 +88,37 @@
 
     def test_add_node_bad_keys(self):   
         builder = chk_index.CHKIndexBuilder()
-        builder.add_node(self.k1, 0, 10, 0, 1)
-        builder.add_node(self.k2, 0, 10, 1, 2)
+        builder.add_node(k1, 0, 10, 0, 1)
+        builder.add_node(k2, 0, 10, 1, 2)
         # Not a sha1 key
         self.assertRaises(errors.BadIndexKey,
                           builder.add_node, ('file-id-foo',), 0, 10, 0, 1)
         # multiple entries
         self.assertRaises(errors.BadIndexKey,
-                          builder.add_node, self.k1 + self.k2, 0, 10, 0, 1)
+                          builder.add_node, k1 + k2, 0, 10, 0, 1)
         # incomplete 'sha1'
         self.assertRaises(errors.BadIndexKey,
-                          builder.add_node, (self.k1[0][:39],), 0, 10, 0, 1)
+                          builder.add_node, (k1[0][:39],), 0, 10, 0, 1)
         # too long 'sha1'
         self.assertRaises(errors.BadIndexKey,
-                          builder.add_node, (self.k1[0] + 'c',), 0, 10, 0, 1)
+                          builder.add_node, (k1[0] + 'c',), 0, 10, 0, 1)
         # invalid char
-        kbad = (self.k1[0][:20] + 'x' + self.k1[0][21:],)
+        kbad = (k1[0][:20] + 'x' + k1[0][21:],)
         self.assertRaises(errors.BadIndexKey,
                           builder.add_node, kbad, 0, 10, 0, 1)
         # sha2 != sha1
-        kbad = ('sha2:' + self.k1[0][5:],)
+        kbad = ('sha2:' + k1[0][5:],)
         self.assertRaises(errors.BadIndexKey,
                           builder.add_node, kbad, 0, 10, 0, 1)
         # key already exists
         self.assertRaises(errors.BadIndexDuplicateKey,
-                          builder.add_node, self.k1, 0, 10, 0, 1)
+                          builder.add_node, k1, 0, 10, 0, 1)
 
     def test_key_count(self):
         builder = chk_index.CHKIndexBuilder()
         self.assertEqual(0, builder.key_count())
-        builder.add_node(self.k1, 0, 10, 0, 1)
-        builder.add_node(self.k2, 0, 10, 1, 2)
+        builder.add_node(k1, 0, 10, 0, 1)
+        builder.add_node(k2, 0, 10, 1, 2)
         self.assertEqual(2, builder.key_count())
 
     def test__build_group_bytes(self):
@@ -144,6 +145,7 @@
             '01200000' '00200000',
             ]), '\n'.join([binascii.b2a_hex(b) for b in bytes]))
 
+    # TODO: Test the _empty case
     def test__build_group_bytes_tiny(self):
         builder = chk_index.CHKIndexBuilder()
         builder._ensure_group((0, 2345))
@@ -179,8 +181,8 @@
     def test__entry_to_bytes(self):
         builder = chk_index.CHKIndexBuilder()
         b2a_hex = binascii.b2a_hex
-        builder.add_node(self.k1, 0, 1000, 0, 1)
-        builder.add_node(self.k2, 1000, 20000, 0, 50)
+        builder.add_node(k1, 0, 1000, 0, 1)
+        builder.add_node(k2, 1000, 20000, 0, 50)
         h = builder._build_header()
         builder._sort_groups()
         self.assertEqual(20, h.entry_hash_bytes)
@@ -190,15 +192,16 @@
         self.assertEqual(1, h.entry_group_length_bytes)
         self.assertEqual('>20sBBB', h.entry_coder.format)
         # (sha_hash, group number, group offset, group length)
-        self.assertEqual(b2a_hex(self.bit_k1) + '01' '00' '01',
-            b2a_hex(builder._entry_to_bytes(self.bit_k1, h.entry_coder)))
-        self.assertEqual(b2a_hex(self.bit_k2) + '02' '00' '32',
-            b2a_hex(builder._entry_to_bytes(self.bit_k2, h.entry_coder)))
+        self.assertEqual(b2a_hex(bit_k1) + '01' '00' '01',
+            b2a_hex(builder._entry_to_bytes(bit_k1, h.entry_coder)))
+        self.assertEqual(b2a_hex(bit_k2) + '02' '00' '32',
+            b2a_hex(builder._entry_to_bytes(bit_k2, h.entry_coder)))
 
+    # TODO: Test the _empty case
     def test__build_mini_index_and_entries_tiny(self):
         builder = chk_index.CHKIndexBuilder()
-        builder.add_node(self.k1, 0, 1000, 0, 1)
-        builder.add_node(self.k2, 1000, 20000, 0, 50)
+        builder.add_node(k1, 0, 1000, 0, 1)
+        builder.add_node(k2, 1000, 20000, 0, 50)
         h = builder._build_header()
         builder._sort_groups()
         self.assertEqual(0, h.num_mini_index_entries)
@@ -210,45 +213,45 @@
         # bit_k2 < bit_k1
         self.assertEqualDiff('\n'.join([
             #hash # Group # start # end/length?
-            b2a_hex(self.bit_k2) + '02' '00' '32',
-            b2a_hex(self.bit_k1) + '01' '00' '01',
+            b2a_hex(bit_k2) + '02' '00' '32',
+            b2a_hex(bit_k1) + '01' '00' '01',
             ]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
 
     def test__bit_key_to_offset_0(self):
-        self.assertEqual(0, chk_index._bit_key_to_offset_0(self.bit_k1))
-        self.assertEqual(0, chk_index._bit_key_to_offset_0(self.bit_k2))
-        self.assertEqual(0, chk_index._bit_key_to_offset_0(self.bit_k3))
+        self.assertEqual(0, chk_index._bit_key_to_offset_0(bit_k1))
+        self.assertEqual(0, chk_index._bit_key_to_offset_0(bit_k2))
+        self.assertEqual(0, chk_index._bit_key_to_offset_0(bit_k3))
 
     def test__bit_key_to_offset_16(self):
         # da39... maps to 'd'
-        self.assertEqual(0xd, chk_index._bit_key_to_offset_16(self.bit_k1))
-        self.assertEqual(0x8, chk_index._bit_key_to_offset_16(self.bit_k2))
-        self.assertEqual(0xe, chk_index._bit_key_to_offset_16(self.bit_k3))
+        self.assertEqual(0xd, chk_index._bit_key_to_offset_16(bit_k1))
+        self.assertEqual(0x8, chk_index._bit_key_to_offset_16(bit_k2))
+        self.assertEqual(0xe, chk_index._bit_key_to_offset_16(bit_k3))
 
     def test__bit_key_to_offset_256(self):
         # da39... maps to 'da'
-        self.assertEqual(0xda, chk_index._bit_key_to_offset_256(self.bit_k1))
-        self.assertEqual(0x86, chk_index._bit_key_to_offset_256(self.bit_k2))
-        self.assertEqual(0xe9, chk_index._bit_key_to_offset_256(self.bit_k3))
+        self.assertEqual(0xda, chk_index._bit_key_to_offset_256(bit_k1))
+        self.assertEqual(0x86, chk_index._bit_key_to_offset_256(bit_k2))
+        self.assertEqual(0xe9, chk_index._bit_key_to_offset_256(bit_k3))
 
     def test__bit_key_to_offset_4096(self):
         # da39... maps to 0xda3
-        self.assertEqual(0xda3, chk_index._bit_key_to_offset_4096(self.bit_k1))
-        self.assertEqual(0x86f, chk_index._bit_key_to_offset_4096(self.bit_k2))
-        self.assertEqual(0xe9d, chk_index._bit_key_to_offset_4096(self.bit_k3))
+        self.assertEqual(0xda3, chk_index._bit_key_to_offset_4096(bit_k1))
+        self.assertEqual(0x86f, chk_index._bit_key_to_offset_4096(bit_k2))
+        self.assertEqual(0xe9d, chk_index._bit_key_to_offset_4096(bit_k3))
 
     def test__bit_key_to_offset_65536(self):
         # da39... maps to 0xda39
         offset_func = chk_index._bit_key_to_offset_65536
-        self.assertEqual(0xda39, offset_func(self.bit_k1))
-        self.assertEqual(0x86f7, offset_func(self.bit_k2))
-        self.assertEqual(0xe9d7, offset_func(self.bit_k3))
+        self.assertEqual(0xda39, offset_func(bit_k1))
+        self.assertEqual(0x86f7, offset_func(bit_k2))
+        self.assertEqual(0xe9d7, offset_func(bit_k3))
 
     def test__build_mini_index_and_entries_16(self):
         builder = chk_index.CHKIndexBuilder()
-        builder.add_node(self.k1, 0, 1000, 0, 1)
-        builder.add_node(self.k2, 1000, 20000, 0, 1000)
-        builder.add_node(self.k3, 1000, 20000, 1000, 500)
+        builder.add_node(k1, 0, 1000, 0, 1)
+        builder.add_node(k2, 1000, 20000, 0, 1000)
+        builder.add_node(k3, 1000, 20000, 1000, 500)
         h = builder._build_header()
         builder._sort_groups()
         self.assertEqual(0, h.num_mini_index_entries)
@@ -284,21 +287,23 @@
         self.assertEqual('>20sBHH', h.entry_coder.format)
         self.assertEqualDiff('\n'.join([
             #hash # Group # start # end/length?
-            b2a_hex(self.bit_k2) + '02' '0000' '03e8',
-            b2a_hex(self.bit_k1) + '01' '0000' '0001',
-            b2a_hex(self.bit_k3) + '02' '03e8' '01f4',
+            b2a_hex(bit_k2) + '02' '0000' '03e8',
+            b2a_hex(bit_k1) + '01' '0000' '0001',
+            b2a_hex(bit_k3) + '02' '03e8' '01f4',
             ]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
         self.assertEqual(0x0084, h.entry_offset)
         self.assertEqual(0x009d, h.entry_offset + len(entry_bytes[0]))
         self.assertEqual(0x00b6, h.entry_offset + len(entry_bytes[0])
                                  + len(entry_bytes[1]))
 
+    # TODO: Test the _empty case
     def test_build_tiny_index(self):
         builder = chk_index.CHKIndexBuilder()
-        builder.add_node(self.k1, 0, 1000, 0, 1)
-        builder.add_node(self.k2, 1000, 20000, 0, 1000)
-        builder.add_node(self.k3, 1000, 20000, 1000, 500)
-        bytes = ''.join(builder.finish())
+        builder.add_node(k1, 0, 1000, 0, 1)
+        builder.add_node(k2, 1000, 20000, 0, 1000)
+        builder.add_node(k3, 1000, 20000, 1000, 500)
+        entry_offset, bytes = builder.finish()
+        bytes = ''.join(bytes)
         header = """Groupcompress CHK Index 1
 hash=sha1 20
 mini_index=120 0 1
@@ -307,6 +312,7 @@
 """
         self.assertEqualDiff(header, bytes[:len(header)])
         self.assertEqual('\n'*(120-len(header)), bytes[len(header):120])
+        self.assertEqual(120+3*(2+2), entry_offset)
         b2a_hex = binascii.b2a_hex
         self.assertEqualDiff(
             # First the group info
@@ -315,17 +321,18 @@
             '03e8' '4e20' # group 2 @ offset 1000, length 20000
             '' # No mini index
             # 3 entries follow
-            + b2a_hex(self.bit_k2) + '02' '0000' '03e8'
-            + b2a_hex(self.bit_k1) + '01' '0000' '0001'
-            + b2a_hex(self.bit_k3) + '02' '03e8' '01f4'
+            + b2a_hex(bit_k2) + '02' '0000' '03e8'
+            + b2a_hex(bit_k1) + '01' '0000' '0001'
+            + b2a_hex(bit_k3) + '02' '03e8' '01f4'
             , b2a_hex(bytes[120:]))
 
     def test_build_small_index(self):
         builder = chk_index.CHKIndexBuilder()
-        nodes = self.make_nodes(400, 20)
+        nodes = make_chk_nodes(400, 20)
         for node in nodes:
             builder.add_node(*node)
-        bytes = ''.join(builder.finish())
+        entry_offset, bytes = builder.finish()
+        bytes = ''.join(bytes)
         header = """Groupcompress CHK Index 1
 hash=sha1 20
 mini_index=120 16 2
@@ -346,7 +353,7 @@
             '\n'.join([str(o) for o in offsets]))
         # Next comes the group information, each entry in groups is a 4 byte
         # pack offset, and a 2 byte group length
-        entry_offset = 152 + 21 * (4 + 2)
+        self.assertEqual(152 + 21 * (4 + 2), entry_offset)
         self.assertEqual(entry_offset, h.entry_offset)
         group_bytes = bytes[152:entry_offset]
         offsets = struct.unpack('>' + ('IH' * 21), group_bytes)
@@ -628,6 +635,7 @@
         self.assertEqual(header.group_index_offset + 10*(4+4),
                          header.entry_offset)
 
+    # TODO: Test the _empty case
     def test_build_header_for_tiny(self):
         header = chk_index.CHKIndexHeader.build_header_for(
             num_groups=3, max_group_offset=40000,
@@ -669,3 +677,31 @@
         self.assertEqual(4, header.entry_group_offset_bytes)
         self.assertEqual(header.group_index_offset + 100000*(8+4),
                          header.entry_offset)
+
+
+class TestCHKIndex(tests.TestCaseWithMemoryTransport):
+
+    def make_index(self, nodes=[]):
+        builder = chk_index.CHKIndexBuilder()
+        for node in nodes:
+            builder.add_node(*node)
+        entry_offset, bytes = builder.finish()
+        trans = self.get_transport()
+        size = trans.put_bytes('index', ''.join(bytes))
+        return chk_index.CHKIndex(trans, 'index', entry_offset, size)
+
+    # TODO: Test the _empty case
+    def test__ensure_header_tiny(self):
+        index = self.make_index([(k1, 0, 1000, 0, 10),
+                                 (k2, 0, 1000, 10, 9),
+                                 (k3, 0, 1000, 19, 9),
+                                ])
+        index._ensure_header()
+        self.assertIsNot(None, index._header)
+        self.assertEqual(3, index._header.num_entries)
+        self.assertEqual(2, index._header.num_groups)
+        self.assertIsNot(None, index._groups)
+        self.assertEqual([(0, 0), (0, 1000)], index._groups)
+        # TODO: the value of _mini_index when there isn't an index present is
+        #       currently up for grabs.
+        self.assertEqual([(126, False)], index._mini_index)



More information about the bazaar-commits mailing list