Rev 6: Implement BTreeGraphIndex reader logic, stage one - make it work. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

Robert Collins robertc at robertcollins.net
Tue Jul 1 10:51:17 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

------------------------------------------------------------
revno: 6
revision-id: robertc at robertcollins.net-20080701095116-ge9gvqt09zie8x42
parent: robertc at robertcollins.net-20080701023304-a01ibps9ycjkhzqj
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2008-07-01 19:51:16 +1000
message:
  Implement BTreeGraphIndex reader logic, stage one - make it work.
modified:
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
  tests/test_btree_index.py      test_index.py-20080624222253-p0x5f92uyh5hw734-13
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-01 02:33:04 +0000
+++ b/btree_index.py	2008-07-01 09:51:16 +0000
@@ -17,10 +17,12 @@
 
 """B+Tree indices"""
 
+from bisect import bisect_right
 import tempfile
 import zlib
 
-from bzrlib import index, osutils
+from bzrlib import debug, index, lru_cache, osutils
+from bzrlib import errors as bzrerrors
 from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
 from bzrlib.plugins.index2 import errors, chunk_writer
 
@@ -125,7 +127,7 @@
                                 length)
                             internal_row.writer.write(_INTERNAL_FLAG)
                             internal_row.writer.write(_INTERNAL_OFFSET +
-                                str(self.rows[pos + 1].nodes - 1) + "\n")
+                                str(self.rows[pos + 1].nodes) + "\n")
                     # add a new leaf
                     length = 4096
                     if self.rows[-1].nodes == 0:
@@ -138,9 +140,9 @@
                     key_line = string_key + "\n"
                     new_row = True
                     for row in reversed(self.rows[:-1]):
-                        # Mark the end of this node in the node above. If it
-                        # doesn't fit then flush the node out and mark the end
-                        # point of that internal node in the node above.
+                        # Mark the start of the next node in the node above. If it
+                        # doesn't fit then propogate upwards until we find one that
+                        # it does fit into.
                         if row.writer.write(key_line):
                             finish_node(row)
                         else:
@@ -203,3 +205,324 @@
         result.flush()
         result.seek(0)
         return result
+
+
+class _LeafNode(object):
+    """A leaf node for a serialised B+Tree index."""
+
+    def __init__(self, bytes, key_length, ref_list_length):
+        """Parse bytes to create a leaf node object."""
+        # splitlines mangles the \r delimiters.. don't use it.
+        self.keys = dict(self._parse_lines(bytes.split('\n'), key_length,
+            ref_list_length))
+
+    def _parse_lines(self, lines, key_length, ref_list_length):
+        nodes = []
+        for line in lines[1:]:
+            if line == '':
+                return nodes
+            elements = line.split('\0', key_length)
+            # keys are tuples
+            key = tuple(elements[:key_length])
+            line = elements[-1]
+            references, value = line.rsplit('\0', 1)
+            if ref_list_length:
+                ref_lists = []
+                for ref_string in references.split('\t'):
+                    ref_lists.append(tuple([
+                        tuple(ref.split('\0')) for ref in ref_string.split('\r') if ref
+                        ]))
+                ref_lists = tuple(ref_lists)
+                node_value = (value, ref_lists)
+            else:
+                node_value = (value, ())
+            nodes.append((key, node_value))
+        return nodes
+
+
+class _InternalNode(object):
+    """An internal node for a serialised B+Tree index."""
+
+    def __init__(self, bytes):
+        """Parse bytes to create an internal node object."""
+        # splitlines mangles the \r delimiters.. don't use it.
+        self.keys = self._parse_lines(bytes.split('\n'))
+
+    def _parse_lines(self, lines):
+        nodes = []
+        self.offset = int(lines[1][7:])
+        for line in lines[2:]:
+            if line == '':
+                return nodes
+            nodes.append(tuple(line.split('\0')))
+        return nodes
+
+
+class BTreeGraphIndex(object):
+    """Access to nodes via the standard GraphIndex interface for B+Tree's.
+    
+    Individual nodes are held in a LRU cache. This holds the root node in
+    memory except when very large walks are done.
+    """
+
+    def __init__(self, transport, name, size=None):
+        """Create a B+Tree index object on the index name.
+
+        :param transport: The transport to read data for the index from.
+        :param name: The file name of the index on transport.
+        :param size: Optional size of the index in bytes. This allows
+            compatibility with the GraphIndex API, as well as ensuring that
+            the initial read (to read the root node header) can be done
+            without over-reading even on empty indices, and on small indices
+            allows single-IO to read the entire index.
+        """
+        self._transport = transport
+        self._name = name
+        self._size = size
+        self._page_size = transport.recommended_page_size()
+        self._node_cache = lru_cache.LRUCache()
+        self._key_count = None
+
+    def __eq__(self, other):
+        """Equal when self and other were created with the same parameters."""
+        return (
+            type(self) == type(other) and
+            self._transport == other._transport and
+            self._name == other._name and
+            self._size == other._size)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def _cache_nodes(self, nodes):
+        """Read nodes and cache them in the lru.
+
+        The nodes list supplied is sorted and then read from disk, each node
+        being inserted it into the _node_cache.
+
+        Note: Asking for more nodes than the _node_cache can contain will
+        result in some of the results being immediately discarded, to prevent
+        this an assertion is raised if more nodes are asked for than are
+        cachable.
+        """
+        if len(nodes) > self._node_cache._max_cache:
+            raise AssertionError("Asked for more nodes than can cache.")
+        for node_pos, node in self._read_nodes(sorted(nodes)):
+            self._node_cache.add(node_pos, node)
+
+    def _get_node(self, node_index):
+        """Get a node, from cache or disk.
+
+        After getting it, the node will be cached.
+        """
+        try:
+            return self._node_cache[node_index]
+        except KeyError:
+            self._cache_nodes([node_index])
+            return self._node_cache[node_index]
+
+    def iter_all_entries(self):
+        """Iterate over all keys within the index.
+
+        :return: An iterable of (index, key, value) or (index, key, value, reference_lists).
+            The former tuple is used when there are no reference lists in the
+            index, making the API compatible with simple key:value index types.
+            There is no defined order for the result iteration - it will be in
+            the most efficient order for the index.
+        """
+        if 'evil' in debug.debug_flags:
+            trace.mutter_callsite(3,
+                "iter_all_entries scales with size of history.")
+        if self._key_count is None:
+            self._cache_nodes([0])
+        if not self._key_count:
+            return
+        internal_nodes = sum(self._row_lengths[:-1])
+        start_node = internal_nodes
+        end_node = start_node + self._row_lengths[-1]
+        needed_nodes = range(start_node, end_node)
+        if self.node_ref_lists:
+            for _, node in self._read_nodes(needed_nodes):
+                for key, (value, refs) in node.keys.items():
+                    yield (self, key, value, refs)
+        else:
+            for _, node in self._read_nodes(needed_nodes):
+                for key, (value, refs) in node.keys.items():
+                    yield (self, key, value)
+
+    def iter_entries(self, keys):
+        """Iterate over keys within the index.
+
+        :param keys: An iterable providing the keys to be retrieved.
+        :return: An iterable as per iter_all_entries, but restricted to the
+            keys supplied. No additional keys will be returned, and every
+            key supplied that is in the index will be returned.
+        """
+        keys = sorted(keys)
+        if not keys:
+            return
+        if self._key_count is None:
+            self._cache_nodes([0])
+        for key in keys:
+            # repeated single-bisection : 'make it work'
+            # FUTURE: look within the current node for the next key,
+            # consider sequential scans
+            # consider block reads
+            # gather stats on hit rate etc
+            # start at the root:
+            node = self._get_node(0)
+            # what row are we looking at
+            row = 0
+            # how many nodes have the previous rows eaten up
+            offset = self._row_lengths[row]
+            while type(node) != _LeafNode:
+                pos = bisect_right(node.keys, key)
+                # There are len(node.keys) pointers dividing len(node.keys) + 1
+                # child nodes. The first pointer is the first key in the second
+                # child node, and so on.
+                # pos indexes into the list of child nodes, any key equal to a
+                # key in node.keys indicates that the key is present to the right
+                # of that divider.
+                node_index = offset + node.offset + pos
+                row += 1
+                offset = offset + self._row_lengths[row]
+                node = self._get_node(node_index)
+            if key in node.keys:
+                value, refs = node.keys[key]
+                if self.node_ref_lists:
+                    yield (self, key, value, refs)
+                else:
+                    yield (self, key, value)
+
+    def iter_entries_prefix(self, keys):
+        """Iterate over keys within the index using prefix matching.
+
+        Prefix matching is applied within the tuple of a key, not to within
+        the bytestring of each key element. e.g. if you have the keys ('foo',
+        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
+        only the former key is returned.
+
+        WARNING: Note that this method currently causes a full index parse
+        unconditionally (which is reasonably appropriate as it is a means for
+        thunking many small indices into one larger one and still supplies
+        iter_all_entries at the thunk layer).
+
+        :param keys: An iterable providing the key prefixes to be retrieved.
+            Each key prefix takes the form of a tuple the length of a key, but
+            with the last N elements 'None' rather than a regular bytestring.
+            The first element cannot be 'None'.
+        :return: An iterable as per iter_all_entries, but restricted to the
+            keys with a matching prefix to those supplied. No additional keys
+            will be returned, and every match that is in the index will be
+            returned.
+        """
+        raise NotImplementedError(self.iter_entries_prefix)
+
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For BTreeGraphIndex the estimate is exact as it is contained in the
+        header.
+        """
+        if self._key_count is None:
+            self._cache_nodes([0])
+        return self._key_count
+
+    def _parse_header_from_bytes(self, bytes):
+        """Parse the header from a region of bytes.
+
+        :param bytes: The data to parse.
+        :return: An offset, data tuple such as readv yields, for the unparsed
+            data. (which may be of length 0).
+        """
+        signature = bytes[0:len(self._signature())]
+        if not signature == self._signature():
+            raise bzrerrors.BadIndexFormatSignature(self._name, GraphIndex)
+        lines = bytes[len(self._signature()):].splitlines()
+        options_line = lines[0]
+        if not options_line.startswith(_OPTION_NODE_REFS):
+            raise bzrerrors.BadIndexOptions(self)
+        try:
+            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
+        except ValueError:
+            raise bzrerrors.BadIndexOptions(self)
+        options_line = lines[1]
+        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
+            raise bzrerrors.BadIndexOptions(self)
+        try:
+            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
+        except ValueError:
+            raise bzrerrors.BadIndexOptions(self)
+        options_line = lines[2]
+        if not options_line.startswith(_OPTION_LEN):
+            raise bzrerrors.BadIndexOptions(self)
+        try:
+            self._key_count = int(options_line[len(_OPTION_LEN):])
+        except ValueError:
+            raise bzrerrors.BadIndexOptions(self)
+        options_line = lines[3]
+        if not options_line.startswith(_OPTION_ROW_LENGTHS):
+            raise bzrerrors.BadIndexOptions(self)
+        try:
+            self._row_lengths = map(int, [length for length in 
+                options_line[len(_OPTION_ROW_LENGTHS):].split(',')
+                if len(length)])
+        except ValueError:
+            raise bzrerrors.BadIndexOptions(self)
+        # calculate the bytes we have processed
+        header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
+        return header_end, bytes[header_end:]
+
+    def _read_nodes(self, nodes):
+        """Read some nodes from disk into the LRU cache.
+
+        This performs a readv to get the node data into memory, and parses each
+        node, the yields it to the caller. The nodes are requested in the
+        supplied order. If possible doing sort() on the list before requesting
+        a read may improve performance.
+        
+        :param nodes: The nodes to read. 0 - first node, 1 - second node etc.
+        :return: None
+        """
+        ranges = []
+        for index in nodes:
+            offset = index * 4096
+            size = 4096
+            if index == 0:
+                # Root node - special case
+                if self._size:
+                    size = min(4096, self._size)
+                else:
+                    stream = self._transport.get(self._name)
+                    start = stream.read(4096)
+                    # Avoid doing this again
+                    self._size = len(start)
+                    size = min(4096, self._size)
+            ranges.append((offset, size))
+        for offset, data in self._transport.readv(self._name, ranges):
+            if offset == 0:
+                # extract the header
+                offset, data = self._parse_header_from_bytes(data)
+                if len(data) == 0:
+                    continue
+            bytes = zlib.decompress(data)
+            if bytes.startswith(_LEAF_FLAG):
+                node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
+            elif bytes.startswith(_INTERNAL_FLAG):
+                node = _InternalNode(bytes)
+            else:
+                raise AssertionError("Unknown node type for %r" % bytes)
+            yield offset / 4096, node
+
+    def _signature(self):
+        """The file signature for this index type."""
+        return _BTSIGNATURE
+
+    def validate(self):
+        """Validate that everything in the index can be accessed."""
+        # just read and parse every node.
+        if self._key_count is None:
+            self._read_nodes([0]).next()
+        node_count = sum(self._row_lengths) - 1
+        for node in self._read_nodes(range(1, node_count + 1)):
+            pass

=== modified file 'tests/test_btree_index.py'
--- a/tests/test_btree_index.py	2008-07-01 02:33:04 +0000
+++ b/tests/test_btree_index.py	2008-07-01 09:51:16 +0000
@@ -22,9 +22,10 @@
 from bzrlib.plugins import index2
 from bzrlib.plugins.index2 import errors, btree_index
 from bzrlib.tests import TestCaseWithTransport
-
-
-class TestBTreeBuilder(TestCaseWithTransport):
+from bzrlib.transport import get_transport
+
+
+class BTreeTestCase(TestCaseWithTransport):
     # test names here are suffixed by the key length and reference list count
     # that they test.
 
@@ -55,11 +56,16 @@
                             else:
                                 # serial of this ref in the ref list
                                 refs[-1].append(prefix + ("ref" + str(ref_pos) * 40,))
+                        refs[-1] = tuple(refs[-1])
+                    refs = tuple(refs)
                 else:
                     refs = ()
                 keys.append((key, value, refs))
         return keys
 
+
+class TestBTreeBuilder(BTreeTestCase):
+
     def test_empty_1_0(self):
         builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
         content = builder.finish().read()
@@ -658,6 +664,34 @@
             )
         self.assertEqual(expected_leaf2, leaf2_bytes)
 
+    def test_second_internal_node_pointer(self):
+        # The left most pointer in the second internal node in a row should
+        # pointer to the second node that the internal node is for, _not_
+        # the first, otherwise the first node overlaps with the last node of
+        # the prior internal node on that row.
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        # 25K nodes is enough to create a two internal nodes on the second level
+        nodes = self.make_nodes(25000, 2, 2)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        del builder
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        # Seed the metadata, we're using internal calls now.
+        index.key_count()
+        internal_node1 = index._get_node(1)
+        internal_node2 = index._get_node(2)
+        # The left most node node2 points at should be one after the right most node pointed at by
+        # node1.
+        self.assertEqual(internal_node2.offset, 1 + len(internal_node1.keys))
+        # The left most key of the second node pointed at by internal_node2
+        # should be its first key. We can check this by looking for its first key
+        # in the second node it points at
+        pos = index._row_lengths[0] + index._row_lengths[1] + internal_node2.offset + 1
+        leaf = index._get_node(pos)
+        self.assertTrue(internal_node2.keys[0] in leaf.keys)
+
     def test_2_leaves_2_2(self):
         builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
         nodes = self.make_nodes(100, 2, 2)
@@ -680,3 +714,405 @@
             )
         self.assertEqual(expected_root, root_bytes)
         # We assume the other leaf nodes have been written correctly - layering FTW.
+
+
+class TestBTreeIndex(BTreeTestCase):
+
+    def test_trivial_constructor(self):
+        transport = get_transport('trace+' + self.get_url(''))
+        index = btree_index.BTreeGraphIndex(transport, 'index')
+        # Checks the page size at load, but that isn't logged yet.
+        self.assertEqual([], transport._activity)
+
+    def test_with_size_constructor(self):
+        transport = get_transport('trace+' + self.get_url(''))
+        index = btree_index.BTreeGraphIndex(transport, 'index', 1)
+        # Checks the page size at load, but that isn't logged yet.
+        self.assertEqual([], transport._activity)
+
+    def test_empty_key_count_no_size(self):
+        builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+        transport = get_transport('trace+' + self.get_url(''))
+        transport.put_file('index', builder.finish())
+        index = btree_index.BTreeGraphIndex(transport, 'index')
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        self.assertEqual(0, index.key_count())
+        # The entire index should have been requested (as we generally have the
+        # size available, and doing many small readvs is inappropriate).
+        # We can't tell how much was actually read here, but - check the code.
+        self.assertEqual([('get', 'index'),
+            ('readv', 'index', [(0, 72)], False, None)],
+            transport._activity)
+
+    def test_empty_key_count(self):
+        builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        self.assertEqual(0, index.key_count())
+        # The entire index should have been read, as 4K > size
+        self.assertEqual([('readv', 'index', [(0, 72)], False, None)],
+            transport._activity)
+        self.assertEqual(72, size)
+
+    def test_non_empty_key_count_2_2(self):
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        nodes = self.make_nodes(35, 2, 2)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        self.assertEqual(70, index.key_count())
+        # The entire index should have been read, as it is one page long.
+        self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
+            transport._activity)
+        self.assertEqual(4096, size)
+
+    def test_2_levels_key_count_2_2(self):
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        nodes = self.make_nodes(80, 2, 2)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        self.assertEqual(160, index.key_count())
+        # The entire index should not have been read.
+        self.assertEqual([('readv', 'index', [(0, 4096)], False, None)],
+            transport._activity)
+        self.assertEqual(4096*3, size)
+
+    def test_validate_trivial(self):
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        nodes = self.make_nodes(80, 2, 2)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        index.validate()
+        # The entire index should have been read linearly.
+        self.assertEqual([('readv', 'index', [(0, 4096)], False, None),
+            ('readv', 'index', [(4096, 4096), (8192, 4096)], False, None)],
+            transport._activity)
+        self.assertEqual(4096*3, size)
+        # XXX: TODO: write some badly-ordered nodes, and some pointers-to-wrong
+        # node and make validate find them.
+
+    def test_eq_ne(self):
+        # two indices are equal when constructed with the same parameters:
+        transport1 = get_transport('trace+' + self.get_url(''))
+        transport2 = get_transport(self.get_url(''))
+        self.assertTrue(
+            btree_index.BTreeGraphIndex(transport1, 'index') ==
+            btree_index.BTreeGraphIndex(transport1, 'index'))
+        self.assertTrue(
+            btree_index.BTreeGraphIndex(transport1, 'index', 20) ==
+            btree_index.BTreeGraphIndex(transport1, 'index', 20))
+        self.assertFalse(
+            btree_index.BTreeGraphIndex(transport1, 'index', 20) ==
+            btree_index.BTreeGraphIndex(transport2, 'index', 20))
+        self.assertFalse(
+            btree_index.BTreeGraphIndex(transport1, 'inde1', 20) ==
+            btree_index.BTreeGraphIndex(transport1, 'inde2', 20))
+        self.assertFalse(
+            btree_index.BTreeGraphIndex(transport1, 'index', 10) ==
+            btree_index.BTreeGraphIndex(transport1, 'index', 20))
+        self.assertFalse(
+            btree_index.BTreeGraphIndex(transport1, 'index') !=
+            btree_index.BTreeGraphIndex(transport1, 'index'))
+        self.assertFalse(
+            btree_index.BTreeGraphIndex(transport1, 'index', 20) !=
+            btree_index.BTreeGraphIndex(transport1, 'index', 20))
+        self.assertTrue(
+            btree_index.BTreeGraphIndex(transport1, 'index', 20) !=
+            btree_index.BTreeGraphIndex(transport2, 'index', 20))
+        self.assertTrue(
+            btree_index.BTreeGraphIndex(transport1, 'inde1', 20) !=
+            btree_index.BTreeGraphIndex(transport1, 'inde2', 20))
+        self.assertTrue(
+            btree_index.BTreeGraphIndex(transport1, 'index', 10) !=
+            btree_index.BTreeGraphIndex(transport1, 'index', 20))
+
+    def test_iter_all_entries_reads(self):
+        # iterating all entries reads the header, then does a linear
+        # read.
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        # 25K nodes is enough to create a three-level index.
+        nodes = self.make_nodes(25000, 2, 2)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        del builder
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        found_nodes = list(index.iter_all_entries())
+        bare_nodes = []
+        for node in found_nodes:
+            self.assertTrue(node[0] is index)
+            bare_nodes.append(node[1:])
+        # Should be as long as the nodes we supplied
+        self.assertEqual(50000, len(found_nodes))
+        # Should have the same content
+        self.assertEqual(set(nodes), set(bare_nodes))
+        # Should have done linear scan IO up the index, ignoring
+        # the internal nodes:
+        self.assertEqual([('readv', 'index', [(0, 4096)], False, None),
+             ('readv',  'index',
+             [(12288, 4096), (16384, 4096), (20480, 4096), (24576, 4096),
+             (28672, 4096), (32768, 4096), (36864, 4096), (40960, 4096),
+             (45056, 4096), (49152, 4096), (53248, 4096), (57344, 4096),
+             (61440, 4096), (65536, 4096), (69632, 4096), (73728, 4096),
+             (77824, 4096), (81920, 4096), (86016, 4096), (90112, 4096),
+             (94208, 4096), (98304, 4096), (102400, 4096), (106496, 4096),
+             (110592, 4096), (114688, 4096), (118784, 4096), (122880, 4096),
+             (126976, 4096), (131072, 4096), (135168, 4096), (139264, 4096),
+             (143360, 4096), (147456, 4096), (151552, 4096), (155648, 4096),
+             (159744, 4096), (163840, 4096), (167936, 4096), (172032, 4096),
+             (176128, 4096), (180224, 4096), (184320, 4096), (188416, 4096),
+             (192512, 4096), (196608, 4096), (200704, 4096), (204800, 4096),
+             (208896, 4096), (212992, 4096), (217088, 4096), (221184, 4096),
+             (225280, 4096), (229376, 4096), (233472, 4096), (237568, 4096),
+             (241664, 4096), (245760, 4096), (249856, 4096), (253952, 4096),
+             (258048, 4096), (262144, 4096), (266240, 4096), (270336, 4096),
+             (274432, 4096), (278528, 4096), (282624, 4096), (286720, 4096),
+             (290816, 4096), (294912, 4096), (299008, 4096), (303104, 4096),
+             (307200, 4096), (311296, 4096), (315392, 4096), (319488, 4096),
+             (323584, 4096), (327680, 4096), (331776, 4096), (335872, 4096),
+             (339968, 4096), (344064, 4096), (348160, 4096), (352256, 4096),
+             (356352, 4096), (360448, 4096), (364544, 4096), (368640, 4096),
+             (372736, 4096), (376832, 4096), (380928, 4096), (385024, 4096),
+             (389120, 4096), (393216, 4096), (397312, 4096), (401408, 4096),
+             (405504, 4096), (409600, 4096), (413696, 4096), (417792, 4096),
+             (421888, 4096), (425984, 4096), (430080, 4096), (434176, 4096),
+             (438272, 4096), (442368, 4096), (446464, 4096), (450560, 4096),
+             (454656, 4096), (458752, 4096), (462848, 4096), (466944, 4096),
+             (471040, 4096), (475136, 4096), (479232, 4096), (483328, 4096),
+             (487424, 4096), (491520, 4096), (495616, 4096), (499712, 4096),
+             (503808, 4096), (507904, 4096), (512000, 4096), (516096, 4096),
+             (520192, 4096), (524288, 4096), (528384, 4096), (532480, 4096),
+             (536576, 4096), (540672, 4096), (544768, 4096), (548864, 4096),
+             (552960, 4096), (557056, 4096), (561152, 4096), (565248, 4096),
+             (569344, 4096), (573440, 4096), (577536, 4096), (581632, 4096),
+             (585728, 4096), (589824, 4096), (593920, 4096), (598016, 4096),
+             (602112, 4096), (606208, 4096), (610304, 4096), (614400, 4096),
+             (618496, 4096), (622592, 4096), (626688, 4096), (630784, 4096),
+             (634880, 4096), (638976, 4096), (643072, 4096), (647168, 4096),
+             (651264, 4096), (655360, 4096), (659456, 4096), (663552, 4096),
+             (667648, 4096), (671744, 4096), (675840, 4096), (679936, 4096),
+             (684032, 4096), (688128, 4096), (692224, 4096), (696320, 4096),
+             (700416, 4096), (704512, 4096), (708608, 4096), (712704, 4096),
+             (716800, 4096), (720896, 4096), (724992, 4096), (729088, 4096),
+             (733184, 4096), (737280, 4096), (741376, 4096), (745472, 4096),
+             (749568, 4096), (753664, 4096), (757760, 4096), (761856, 4096),
+             (765952, 4096), (770048, 4096), (774144, 4096), (778240, 4096),
+             (782336, 4096), (786432, 4096), (790528, 4096), (794624, 4096),
+             (798720, 4096), (802816, 4096), (806912, 4096), (811008, 4096),
+             (815104, 4096), (819200, 4096), (823296, 4096), (827392, 4096),
+             (831488, 4096), (835584, 4096), (839680, 4096), (843776, 4096),
+             (847872, 4096), (851968, 4096), (856064, 4096), (860160, 4096),
+             (864256, 4096), (868352, 4096), (872448, 4096), (876544, 4096),
+             (880640, 4096), (884736, 4096), (888832, 4096), (892928, 4096),
+             (897024, 4096), (901120, 4096), (905216, 4096), (909312, 4096),
+             (913408, 4096), (917504, 4096), (921600, 4096), (925696, 4096),
+             (929792, 4096), (933888, 4096), (937984, 4096), (942080, 4096),
+             (946176, 4096), (950272, 4096), (954368, 4096), (958464, 4096),
+             (962560, 4096), (966656, 4096), (970752, 4096), (974848, 4096),
+             (978944, 4096), (983040, 4096), (987136, 4096), (991232, 4096),
+             (995328, 4096), (999424, 4096), (1003520, 4096), (1007616, 4096),
+             (1011712, 4096), (1015808, 4096), (1019904, 4096), (1024000,
+             4096), (1028096, 4096), (1032192, 4096), (1036288, 4096),
+             (1040384, 4096), (1044480, 4096), (1048576, 4096), (1052672,
+             4096), (1056768, 4096), (1060864, 4096), (1064960, 4096),
+             (1069056, 4096), (1073152, 4096), (1077248, 4096), (1081344,
+             4096), (1085440, 4096), (1089536, 4096), (1093632, 4096),
+             (1097728, 4096), (1101824, 4096), (1105920, 4096), (1110016,
+             4096), (1114112, 4096), (1118208, 4096), (1122304, 4096),
+             (1126400, 4096), (1130496, 4096), (1134592, 4096), (1138688,
+             4096), (1142784, 4096), (1146880, 4096), (1150976, 4096),
+             (1155072, 4096), (1159168, 4096), (1163264, 4096), (1167360,
+             4096), (1171456, 4096), (1175552, 4096), (1179648, 4096),
+             (1183744, 4096), (1187840, 4096), (1191936, 4096), (1196032,
+             4096), (1200128, 4096), (1204224, 4096), (1208320, 4096),
+             (1212416, 4096), (1216512, 4096), (1220608, 4096), (1224704,
+             4096), (1228800, 4096), (1232896, 4096), (1236992, 4096),
+             (1241088, 4096), (1245184, 4096), (1249280, 4096), (1253376,
+             4096), (1257472, 4096), (1261568, 4096), (1265664, 4096),
+             (1269760, 4096), (1273856, 4096), (1277952, 4096), (1282048,
+             4096), (1286144, 4096), (1290240, 4096), (1294336, 4096),
+             (1298432, 4096), (1302528, 4096), (1306624, 4096), (1310720,
+             4096), (1314816, 4096), (1318912, 4096), (1323008, 4096),
+             (1327104, 4096), (1331200, 4096), (1335296, 4096), (1339392,
+             4096), (1343488, 4096), (1347584, 4096), (1351680, 4096),
+             (1355776, 4096), (1359872, 4096), (1363968, 4096), (1368064,
+             4096), (1372160, 4096), (1376256, 4096), (1380352, 4096),
+             (1384448, 4096), (1388544, 4096), (1392640, 4096), (1396736,
+             4096), (1400832, 4096), (1404928, 4096), (1409024, 4096),
+             (1413120, 4096), (1417216, 4096), (1421312, 4096), (1425408,
+             4096), (1429504, 4096), (1433600, 4096), (1437696, 4096),
+             (1441792, 4096), (1445888, 4096), (1449984, 4096), (1454080,
+             4096), (1458176, 4096), (1462272, 4096), (1466368, 4096),
+             (1470464, 4096), (1474560, 4096), (1478656, 4096), (1482752,
+             4096), (1486848, 4096), (1490944, 4096), (1495040, 4096),
+             (1499136, 4096), (1503232, 4096), (1507328, 4096), (1511424,
+             4096), (1515520, 4096), (1519616, 4096), (1523712, 4096),
+             (1527808, 4096), (1531904, 4096), (1536000, 4096), (1540096,
+             4096), (1544192, 4096), (1548288, 4096), (1552384, 4096),
+             (1556480, 4096), (1560576, 4096), (1564672, 4096), (1568768,
+             4096), (1572864, 4096), (1576960, 4096), (1581056, 4096),
+             (1585152, 4096), (1589248, 4096), (1593344, 4096), (1597440,
+             4096), (1601536, 4096), (1605632, 4096), (1609728, 4096),
+             (1613824, 4096), (1617920, 4096), (1622016, 4096), (1626112,
+             4096), (1630208, 4096), (1634304, 4096), (1638400, 4096),
+             (1642496, 4096), (1646592, 4096), (1650688, 4096), (1654784,
+             4096), (1658880, 4096), (1662976, 4096), (1667072, 4096),
+             (1671168, 4096), (1675264, 4096), (1679360, 4096), (1683456,
+             4096), (1687552, 4096), (1691648, 4096), (1695744, 4096),
+             (1699840, 4096), (1703936, 4096), (1708032, 4096), (1712128,
+             4096), (1716224, 4096), (1720320, 4096), (1724416, 4096),
+             (1728512, 4096), (1732608, 4096), (1736704, 4096), (1740800,
+             4096), (1744896, 4096), (1748992, 4096), (1753088, 4096),
+             (1757184, 4096), (1761280, 4096), (1765376, 4096), (1769472,
+             4096), (1773568, 4096), (1777664, 4096), (1781760, 4096),
+             (1785856, 4096), (1789952, 4096), (1794048, 4096), (1798144,
+             4096), (1802240, 4096), (1806336, 4096), (1810432, 4096),
+             (1814528, 4096), (1818624, 4096), (1822720, 4096), (1826816,
+             4096), (1830912, 4096), (1835008, 4096), (1839104, 4096),
+             (1843200, 4096), (1847296, 4096), (1851392, 4096), (1855488,
+             4096), (1859584, 4096), (1863680, 4096), (1867776, 4096),
+             (1871872, 4096), (1875968, 4096), (1880064, 4096)], False, None)],
+            transport._activity)
+        # The entire index should have been read, as it is one page long.
+        self.assertEqual(1884160, size)
+        self.assertEqual(3, len(index._row_lengths))
+
+    def _test_iter_entries_references_resolved(self):
+        index = self.make_index(1, nodes=[
+            (('name', ), 'data', ([('ref', ), ('ref', )], )),
+            (('ref', ), 'refdata', ([], ))])
+        self.assertEqual(set([(index, ('name', ), 'data', ((('ref',),('ref',)),)),
+            (index, ('ref', ), 'refdata', ((), ))]),
+            set(index.iter_entries([('name',), ('ref',)])))
+
+    def test_iter_entries_references_2_refs_resolved(self):
+        # iterating some entries reads just the pages needed. For now, to 
+        # get it working and start measuring, only 4K pages are read.
+        builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
+        # 80 nodes is enough to create a two-level index.
+        nodes = self.make_nodes(80, 2, 2)
+        for node in nodes:
+            builder.add_node(*node)
+        transport = get_transport('trace+' + self.get_url(''))
+        size = transport.put_file('index', builder.finish())
+        del builder
+        index = btree_index.BTreeGraphIndex(transport, 'index', size)
+        del transport._activity[:]
+        self.assertEqual([], transport._activity)
+        # search for one key
+        found_nodes = list(index.iter_entries([nodes[30][0]]))
+        bare_nodes = []
+        for node in found_nodes:
+            self.assertTrue(node[0] is index)
+            bare_nodes.append(node[1:])
+        # Should be as long as the nodes we supplied
+        self.assertEqual(1, len(found_nodes))
+        # Should have the same content
+        self.assertEqual(nodes[30], bare_nodes[0])
+        # Should have read the root node, then one page:
+        self.assertEqual([('readv', 'index', [(0, 4096)], False, None),
+             ('readv',  'index', [(4096, 4096), ], False, None)],
+            transport._activity)
+
+
+class TestBTreeNodes(BTreeTestCase):
+
+    def test_LeafNode_1_0(self):
+        node_bytes = ("type=leaf\n"
+            "0000000000000000000000000000000000000000\x00\x00value:0\n"
+            "1111111111111111111111111111111111111111\x00\x00value:1\n"
+            "2222222222222222222222222222222222222222\x00\x00value:2\n"
+            "3333333333333333333333333333333333333333\x00\x00value:3\n"
+            "4444444444444444444444444444444444444444\x00\x00value:4\n")
+        node = btree_index._LeafNode(node_bytes, 1, 0)
+        # We do direct access, or don't care about order, to leaf nodes most of
+        # the time, so a dict is useful:
+        self.assertEqual({
+            ("0000000000000000000000000000000000000000",): ("value:0", ()),
+            ("1111111111111111111111111111111111111111",): ("value:1", ()),
+            ("2222222222222222222222222222222222222222",): ("value:2", ()),
+            ("3333333333333333333333333333333333333333",): ("value:3", ()),
+            ("4444444444444444444444444444444444444444",): ("value:4", ()),
+            }, node.keys)
+
+    def test_LeafNode_2_2(self):
+        node_bytes = ("type=leaf\n"
+            "00\x0000\x00\t00\x00ref00\x00value:0\n"
+            "00\x0011\x0000\x00ref00\t00\x00ref00\r01\x00ref01\x00value:1\n"
+            "11\x0033\x0011\x00ref22\t11\x00ref22\r11\x00ref22\x00value:3\n"
+            "11\x0044\x00\t11\x00ref00\x00value:4\n"
+            ""
+            )
+        node = btree_index._LeafNode(node_bytes, 2, 2)
+        # We do direct access, or don't care about order, to leaf nodes most of
+        # the time, so a dict is useful:
+        self.assertEqual({
+            ('00', '00'): ('value:0', ((), (('00', 'ref00'),))),
+            ('00', '11'): ('value:1',
+                ((('00', 'ref00'),), (('00', 'ref00'), ('01', 'ref01')))),
+            ('11', '33'): ('value:3',
+                ((('11', 'ref22'),), (('11', 'ref22'), ('11', 'ref22')))),
+            ('11', '44'): ('value:4', ((), (('11', 'ref00'),)))
+            }, node.keys)
+
+    def test_InternalNode_1(self):
+        node_bytes = ("type=internal\n"
+            "offset=1\n"
+            "0000000000000000000000000000000000000000\n"
+            "1111111111111111111111111111111111111111\n"
+            "2222222222222222222222222222222222222222\n"
+            "3333333333333333333333333333333333333333\n"
+            "4444444444444444444444444444444444444444\n")
+        node = btree_index._InternalNode(node_bytes)
+        # We want to bisect to find the right children from this node, so a
+        # vector is most useful.
+        self.assertEqual([
+            ("0000000000000000000000000000000000000000",),
+            ("1111111111111111111111111111111111111111",),
+            ("2222222222222222222222222222222222222222",),
+            ("3333333333333333333333333333333333333333",),
+            ("4444444444444444444444444444444444444444",),
+            ], node.keys)
+        self.assertEqual(1, node.offset)
+
+    def test_LeafNode_2_2(self):
+        node_bytes = ("type=leaf\n"
+            "00\x0000\x00\t00\x00ref00\x00value:0\n"
+            "00\x0011\x0000\x00ref00\t00\x00ref00\r01\x00ref01\x00value:1\n"
+            "11\x0033\x0011\x00ref22\t11\x00ref22\r11\x00ref22\x00value:3\n"
+            "11\x0044\x00\t11\x00ref00\x00value:4\n"
+            ""
+            )
+        node = btree_index._LeafNode(node_bytes, 2, 2)
+        # We do direct access, or don't care about order, to leaf nodes most of
+        # the time, so a dict is useful:
+        self.assertEqual({
+            ('00', '00'): ('value:0', ((), (('00', 'ref00'),))),
+            ('00', '11'): ('value:1',
+                ((('00', 'ref00'),), (('00', 'ref00'), ('01', 'ref01')))),
+            ('11', '33'): ('value:3',
+                ((('11', 'ref22'),), (('11', 'ref22'), ('11', 'ref22')))),
+            ('11', '44'): ('value:4', ((), (('11', 'ref00'),)))
+            }, node.keys)
+




More information about the bazaar-commits mailing list