Rev 30: Refactor node writing to be a separate function from finish(), and pull up the in memory queries to be part of the builder. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

Robert Collins robertc at robertcollins.net
Sun Jul 13 12:51:59 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

------------------------------------------------------------
revno: 30
revision-id: robertc at robertcollins.net-20080713115155-k1nu89eo932mpp1l
parent: robertc at robertcollins.net-20080713072815-sot304u35kahnexg
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sun 2008-07-13 21:51:55 +1000
message:
  Refactor node writing to be a separate function from finish(), and pull up the in memory queries to be part of the builder.
modified:
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
  repofmt.py                     repofmt.py-20080701113732-m1iu3n94ikbxdelb-1
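
The practical effect of pulling the in-memory queries up is that a plain
BTreeBuilder now answers the GraphIndex-style query methods
(iter_all_entries, iter_entries, iter_entries_prefix, key_count) while it
is still accumulating nodes, which is what lets repofmt.py drop its
InMemoryBTree subclass below. A minimal usage sketch of the resulting API
(the keys and values are illustrative, not taken from this revision):

    builder = BTreeBuilder(reference_lists=1)
    builder.add_nodes([
        (('rev-1',), 'value-1', ((),)),
        (('rev-2',), 'value-2', ((('rev-1',),),)),
        ])
    # The in-memory queries work before finish() is called:
    for index, key, value, refs in builder.iter_entries([('rev-2',)]):
        print key, value, refs
    # finish() sorts the in-memory nodes and spools them to a temporary
    # file as a B+Tree via the new _write_nodes().
    tree_file = builder.finish()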
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-13 07:28:15 +0000
+++ b/btree_index.py	2008-07-13 11:51:55 +0000
@@ -178,146 +178,172 @@
     VALUE          := no-newline-no-null-bytes
     """
 
-    def finish(self):
+    def add_nodes(self, nodes):
+        """Add nodes to the index.
+
+        :param nodes: An iterable of (key, value) or (key, value, node_refs)
+            entries to add, depending on whether reference lists are in use.
+        """
+        if self.reference_lists:
+            for (key, value, node_refs) in nodes:
+                self.add_node(key, value, node_refs)
+        else:
+            for (key, value) in nodes:
+                self.add_node(key, value)
+
+    def _iter_mem_nodes(self):
+        """Iterate over the nodes held in memory."""
+        if self.reference_lists:
+            for key, (absent, references, value) in self._nodes.iteritems():
+                if not absent:
+                    yield self, key, value, references
+        else:
+            for key, (absent, references, value) in self._nodes.iteritems():
+                if not absent:
+                    yield self, key, value
+
+    def _write_nodes(self, node_iterator):
+        """Write node_iterator out as a B+Tree.
+
+        :param node_iterator: An iterator of sorted nodes. Each node should
+            match the output given by iter_all_entries.
+        :return: A file handle for a temporary file containing a B+Tree for
+            the nodes.
+        """
         # The index rows - rows[0] is the root, rows[1] is the layer under it
         # etc.
-        self.rows = []
+        rows = []
         # forward sorted by key. In future we may consider topological sorting,
         # at the cost of table scans for direct lookup, or a second index for
         # direct lookup
-        nodes = sorted(self._nodes.items())
         key_count = 0
         # A stack with the number of nodes of each size. 0 is the root node
         # and must always be 1 (if there are any nodes in the tree).
         self.row_lengths = []
-        # A stack with the current open node for each row length
-        self.row_nodes = []
         global_bloom = BloomSHA1(_GLOBAL_BLOOM_START_SIZE)
-        if len(nodes):
-            # Loop over all nodes adding them to the bottom row
-            # (self.rows[-1]). When we finish a chunk in a row,
-            # propogate the key that didn't fit (comes after the chunk) to the
-            # row above, transitively.
-            self.rows.append(_LeafBuilderRow())
-            def add_key(string_key, key_line):
-                """Add a key to the current chunk.
+        def add_key(string_key, key_line):
+            """Add a key to the current chunk.
 
-                :param string_key: The key to add.
-                :param key_line: The fully serialised key and value.
-                """
-                if self.rows[-1].writer is None:
-                    # opening a new leaf chunk;
-                    for pos, internal_row in enumerate(self.rows[:-1]):
-                        # flesh out any internal nodes that are needed to
-                        # preserve the high of the tree
-                        if internal_row.writer is None:
-                            if internal_row.bloom is None:
-                                create_bloom = False
-                                reserved_bytes = 0
-                            else:
-                                # This is a bloom row
-                                create_bloom = True
-                                # reserve 256 for the bloom + 10 for ':bloom:\n'
-                                reserved_bytes = _BLOOM_DISK_SIZE / 8 + 10
-                            length = _PAGE_SIZE
-                            if internal_row.nodes == 0:
-                                length -= _RESERVED_HEADER_BYTES # padded
-                            internal_row.writer = chunk_writer.ChunkWriter(
-                                length, reserved_bytes)
-                            internal_row.writer.write(_INTERNAL_FLAG)
-                            internal_row.writer.write(_INTERNAL_OFFSET +
-                                str(self.rows[pos + 1].nodes) + "\n")
-                            if create_bloom:
-                                # new bloom for the new node
-                                internal_row.bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
-                            else:
-                                internal_row.bloom = None
-                    # add a new leaf
-                    length = _PAGE_SIZE
-                    if self.rows[-1].nodes == 0:
-                        length -= _RESERVED_HEADER_BYTES # padded
-                    self.rows[-1].writer = chunk_writer.ChunkWriter(length)
-                    self.rows[-1].writer.write(_LEAF_FLAG)
-                if self.rows[-1].writer.write(line):
-                    # this key did not fit in the node:
-                    self.rows[-1].finish_node()
-                    key_line = string_key + "\n"
-                    new_row = True
-                    for row in reversed(self.rows[:-1]):
-                        # Mark the start of the next node in the node above. If it
-                        # doesn't fit then propogate upwards until we find one that
-                        # it does fit into.
-                        if row.writer.write(key_line):
-                            row.finish_node()
+            :param string_key: The key to add.
+            :param key_line: The fully serialised key and value.
+            """
+            if rows[-1].writer is None:
+                # opening a new leaf chunk;
+                for pos, internal_row in enumerate(rows[:-1]):
+                    # flesh out any internal nodes that are needed to
+                    # preserve the height of the tree
+                    if internal_row.writer is None:
+                        if internal_row.bloom is None:
+                            create_bloom = False
+                            reserved_bytes = 0
                         else:
-                            # We've found a node that can handle the pointer.
-                            new_row = False
-                            break
-                    # If we reached the current root without being able to mark the
-                    # division point, then we need a new root:
-                    if new_row:
-                        # We need a new row
-                        if 'index' in debug.debug_flags:
-                            trace.mutter('Inserting new global row.')
-                        if len(self.rows) == 1:
-                            # We have only leaf nodes, so we are creating a row
-                            # where we will hold bloom filters
+                            # This is a bloom row
+                            create_bloom = True
                             # reserve 256 for the bloom + 10 for ':bloom:\n'
-                            new_row = _InternalBuilderRow(global_bloom)
                             reserved_bytes = _BLOOM_DISK_SIZE / 8 + 10
+                        length = _PAGE_SIZE
+                        if internal_row.nodes == 0:
+                            length -= _RESERVED_HEADER_BYTES # padded
+                        internal_row.writer = chunk_writer.ChunkWriter(
+                            length, reserved_bytes)
+                        internal_row.writer.write(_INTERNAL_FLAG)
+                        internal_row.writer.write(_INTERNAL_OFFSET +
+                            str(rows[pos + 1].nodes) + "\n")
+                        if create_bloom:
+                            # new bloom for the new node
+                            internal_row.bloom = BloomSHA1(_BLOOM_BUILD_SIZE)
                         else:
-                            new_row = _InternalBuilderRow(None)
-                            reserved_bytes = 0
-                        self.rows.insert(0, new_row)
-                        # This will be padded, hence the -100
-                        new_row.writer = chunk_writer.ChunkWriter(
-                            _PAGE_SIZE - _RESERVED_HEADER_BYTES,
-                            reserved_bytes)
-                        new_row.writer.write(_INTERNAL_FLAG)
-                        new_row.writer.write(_INTERNAL_OFFSET +
-                            str(self.rows[1].nodes - 1) + "\n")
-                        new_row.writer.write(key_line)
-                    add_key(string_key, key_line)
-                else:
-                    # Only the lowest level of internal rows has a bloom
-                    for row in self.rows[:-1]:
-                        if row.bloom is not None:
-                            row.bloom.insert(string_key)
-                    # Keep the bloom fairly empty. It seems we need better than
-                    # 16 b/e to avoid accidentally filling at a lower size
-                    global_bloom.insert(string_key)
-                    if (global_bloom._num_entries * _GLOBAL_BLOOM_RESIZE_THRESHOLD
-                        > global_bloom._num_bits):
-                        global_bloom.resize(global_bloom._num_bits *
-                                            _GLOBAL_BLOOM_GROW_FACTOR)
-                        if 'index' in debug.debug_flags:
-                            trace.mutter('resizing global bloom: %s',
-                                         row.bloom)
-
-            for key, (absent, references, value) in nodes:
-                if absent:
-                    continue
-                key_count += 1
-                flattened_references = []
-                for ref_list in references:
+                            internal_row.bloom = None
+                # add a new leaf
+                length = _PAGE_SIZE
+                if rows[-1].nodes == 0:
+                    length -= _RESERVED_HEADER_BYTES # padded
+                rows[-1].writer = chunk_writer.ChunkWriter(length)
+                rows[-1].writer.write(_LEAF_FLAG)
+            if rows[-1].writer.write(key_line):
+                # this key did not fit in the node:
+                rows[-1].finish_node()
+                separator_line = string_key + "\n"
+                new_row = True
+                for row in reversed(rows[:-1]):
+                    # Mark the start of the next node in the node above. If it
+                    # doesn't fit then propagate upwards until we find one that
+                    # it does fit into.
+                    if row.writer.write(separator_line):
+                        row.finish_node()
+                    else:
+                        # We've found a node that can handle the pointer.
+                        new_row = False
+                        break
+                # If we reached the current root without being able to mark the
+                # division point, then we need a new root:
+                if new_row:
+                    # We need a new row
+                    if 'index' in debug.debug_flags:
+                        trace.mutter('Inserting new global row.')
+                    if len(rows) == 1:
+                        # We have only leaf nodes, so we are creating a row
+                        # where we will hold bloom filters
+                        # reserve 256 for the bloom + 10 for ':bloom:\n'
+                        new_row = _InternalBuilderRow(global_bloom)
+                        reserved_bytes = _BLOOM_DISK_SIZE / 8 + 10
+                    else:
+                        new_row = _InternalBuilderRow(None)
+                        reserved_bytes = 0
+                    rows.insert(0, new_row)
+                    # This node will be padded, hence the _RESERVED_HEADER_BYTES
+                    new_row.writer = chunk_writer.ChunkWriter(
+                        _PAGE_SIZE - _RESERVED_HEADER_BYTES,
+                        reserved_bytes)
+                    new_row.writer.write(_INTERNAL_FLAG)
+                    new_row.writer.write(_INTERNAL_OFFSET +
+                        str(rows[1].nodes - 1) + "\n")
+                    new_row.writer.write(separator_line)
+                add_key(string_key, key_line)
+            else:
+                # Only the lowest level of internal rows has a bloom
+                for row in rows[:-1]:
+                    if row.bloom is not None:
+                        row.bloom.insert(string_key)
+                # Keep the bloom fairly empty. It seems we need better than
+                # 16 bits/entry to avoid accidentally filling at a smaller size
+                global_bloom.insert(string_key)
+                if (global_bloom._num_entries * _GLOBAL_BLOOM_RESIZE_THRESHOLD
+                    > global_bloom._num_bits):
+                    global_bloom.resize(global_bloom._num_bits *
+                                        _GLOBAL_BLOOM_GROW_FACTOR)
+                    if 'index' in debug.debug_flags:
+                        trace.mutter('resizing global bloom: %s',
+                                     global_bloom)
+        # Loop over all nodes adding them to the bottom row
+        # (rows[-1]). When we finish a chunk in a row,
+        # propagate the key that didn't fit (comes after the chunk) to the
+        # row above, transitively.
+        for node in node_iterator:
+            if key_count == 0:
+                # First key triggers the first row
+                rows.append(_LeafBuilderRow())
+            key_count += 1
+            flattened_references = []
+            if self.reference_lists:
+                for ref_list in node[3]:
                     ref_keys = []
                     for reference in ref_list:
                         ref_keys.append('\x00'.join(reference))
                     flattened_references.append('\r'.join(ref_keys))
-                string_key = '\x00'.join(key)
-                line = ("%s\x00%s\x00%s\n" % (string_key,
-                    '\t'.join(flattened_references), value))
-                add_key(string_key, line)
-            for row in reversed(self.rows):
-                row.finish_node()
+            string_key = '\x00'.join(node[1])
+            line = ("%s\x00%s\x00%s\n" % (string_key,
+                '\t'.join(flattened_references), node[2]))
+            add_key(string_key, line)
+        for row in reversed(rows):
+            row.finish_node()
         result = tempfile.TemporaryFile()
         lines = [_BTSIGNATURE]
         lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
         lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
         lines.append(_OPTION_LEN + str(key_count) + '\n')
-        row_lengths = [row.nodes for row in self.rows]
+        row_lengths = [row.nodes for row in rows]
         lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
-        if len(self.rows) > 1:
+        if len(rows) > 1:
             # We only write a global bloom if there is an internal node
             optimal_num_bits = max(_PAGE_SIZE*8, global_bloom._num_entries*9.6)
             global_bloom.resize(optimal_num_bits)
@@ -337,7 +363,7 @@
                                  " reserved space: %d > %d"
                                  % (position, _RESERVED_HEADER_BYTES))
         # write the rows out:
-        for row in self.rows:
+        for row in rows:
             reserved = _RESERVED_HEADER_BYTES # reserved space for first node
             row.spool.flush()
             row.spool.seek(0)
@@ -362,6 +388,127 @@
         result.seek(0)
         return result
 
+    def finish(self):
+        """Finalise the index.
+
+        :return: A file handle for a temporary file containing the nodes added
+            to the index.
+        """
+        nodes = sorted(self._iter_mem_nodes())
+        return self._write_nodes(nodes)
+
+    def iter_all_entries(self):
+        """Iterate over all keys within the index
+
+        :return: An iterable of (index, key, reference_lists, value). There is no
+            defined order for the result iteration - it will be in the most
+            efficient order for the index (in this case dictionary hash order).
+        """
+        if 'evil' in debug.debug_flags:
+            trace.mutter_callsite(3,
+                "iter_all_entries scales with size of history.")
+        return self._iter_mem_nodes()
+
+    def iter_entries(self, keys):
+        """Iterate over keys within the index.
+
+        :param keys: An iterable providing the keys to be retrieved.
+        :return: An iterable of (index, key, value, reference_lists). There is no
+            defined order for the result iteration - it will be in the most
+            efficient order for the index (keys iteration order in this case).
+        """
+        keys = set(keys)
+        if self.reference_lists:
+            for key in keys.intersection(self._keys):
+                node = self._nodes[key]
+                if not node[0]:
+                    yield self, key, node[2], node[1]
+        else:
+            for key in keys.intersection(self._keys):
+                node = self._nodes[key]
+                if not node[0]:
+                    yield self, key, node[2]
+
+    def iter_entries_prefix(self, keys):
+        """Iterate over keys within the index using prefix matching.
+
+        Prefix matching is applied within the tuple of a key, not within
+        the bytestring of each key element. E.g. if you have the keys ('foo',
+        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
+        only the former key is returned.
+
+        :param keys: An iterable providing the key prefixes to be retrieved.
+            Each key prefix takes the form of a tuple the length of a key, but
+            with the last N elements 'None' rather than a regular bytestring.
+            The first element cannot be 'None'.
+        :return: An iterable as per iter_all_entries, but restricted to the
+            keys with a matching prefix to those supplied. No additional keys
+            will be returned, and every match that is in the index will be
+            returned.
+        """
+        # XXX: Too much duplication with the GraphIndex class; consider finding
+        # a good place to pull out the actual common logic.
+        keys = set(keys)
+        if not keys:
+            return
+        if self._key_length == 1:
+            for key in keys:
+                # sanity check
+                if key[0] is None:
+                    raise errors.BadIndexKey(key)
+                if len(key) != self._key_length:
+                    raise errors.BadIndexKey(key)
+                node = self._nodes.get(key)
+                if node is None or node[0]:
+                    continue
+                if self.reference_lists:
+                    yield self, key, node[2], node[1]
+                else:
+                    yield self, key, node[2]
+            return
+        for key in keys:
+            # sanity check
+            if key[0] is None:
+                raise errors.BadIndexKey(key)
+            if len(key) != self._key_length:
+                raise errors.BadIndexKey(key)
+            # find what it refers to:
+            key_dict = self._nodes_by_key
+            elements = list(key)
+            # find the subdict to return
+            try:
+                while len(elements) and elements[0] is not None:
+                    key_dict = key_dict[elements[0]]
+                    elements.pop(0)
+            except KeyError:
+                # a non-existent lookup.
+                continue
+            if len(elements):
+                dicts = [key_dict]
+                while dicts:
+                    key_dict = dicts.pop(-1)
+                    # can't be empty or would not exist
+                    item, value = key_dict.iteritems().next()
+                    if type(value) == dict:
+                        # push the nested sub-dicts onto the stack
+                        dicts.extend(key_dict.itervalues())
+                    else:
+                        # yield the leaf entries
+                        for value in key_dict.itervalues():
+                            yield (self, ) + value
+            else:
+                yield (self, ) + key_dict
+
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For this in-memory builder the estimate is exact.
+        """
+        return len(self._keys)
+
+    def validate(self):
+        """In memory index's have no known corruption at the moment."""
+
 
 class _LeafNode(object):
     """A leaf node for a serialised B+Tree index."""

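For the prefix semantics documented in iter_entries_prefix above, a short
illustration (hypothetical keys; assumes two-element keys and the default
of no reference lists):

    builder = BTreeBuilder(key_elements=2)
    builder.add_node(('foo', 'bar'), 'v1')
    builder.add_node(('foobar', 'gam'), 'v2')
    # Matches only ('foo', 'bar'): the None wildcard applies per tuple
    # element, never inside an element's bytestring.
    hits = list(builder.iter_entries_prefix([('foo', None)]))
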
=== modified file 'repofmt.py'
--- a/repofmt.py	2008-07-04 04:56:18 +0000
+++ b/repofmt.py	2008-07-13 11:51:55 +0000
@@ -58,144 +58,6 @@
 Packer.open_pack = open_pack
 
 
-class InMemoryBTree(BTreeBuilder):
-    """TODO - make InMemoryGraphIndex more reusable.
-    
-    (not that all-in-mem is a good idea).
-    """
-
-    def add_nodes(self, nodes):
-        """Add nodes to the index.
-
-        :param nodes: An iterable of (key, node_refs, value) entries to add.
-        """
-        if self.reference_lists:
-            for (key, value, node_refs) in nodes:
-                self.add_node(key, value, node_refs)
-        else:
-            for (key, value) in nodes:
-                self.add_node(key, value)
-
-    def iter_all_entries(self):
-        """Iterate over all keys within the index
-
-        :return: An iterable of (index, key, reference_lists, value). There is no
-            defined order for the result iteration - it will be in the most
-            efficient order for the index (in this case dictionary hash order).
-        """
-        if 'evil' in debug.debug_flags:
-            trace.mutter_callsite(3,
-                "iter_all_entries scales with size of history.")
-        if self.reference_lists:
-            for key, (absent, references, value) in self._nodes.iteritems():
-                if not absent:
-                    yield self, key, value, references
-        else:
-            for key, (absent, references, value) in self._nodes.iteritems():
-                if not absent:
-                    yield self, key, value
-
-    def iter_entries(self, keys):
-        """Iterate over keys within the index.
-
-        :param keys: An iterable providing the keys to be retrieved.
-        :return: An iterable of (index, key, value, reference_lists). There is no
-            defined order for the result iteration - it will be in the most
-            efficient order for the index (keys iteration order in this case).
-        """
-        keys = set(keys)
-        if self.reference_lists:
-            for key in keys.intersection(self._keys):
-                node = self._nodes[key]
-                if not node[0]:
-                    yield self, key, node[2], node[1]
-        else:
-            for key in keys.intersection(self._keys):
-                node = self._nodes[key]
-                if not node[0]:
-                    yield self, key, node[2]
-
-    def iter_entries_prefix(self, keys):
-        """Iterate over keys within the index using prefix matching.
-
-        Prefix matching is applied within the tuple of a key, not to within
-        the bytestring of each key element. e.g. if you have the keys ('foo',
-        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
-        only the former key is returned.
-
-        :param keys: An iterable providing the key prefixes to be retrieved.
-            Each key prefix takes the form of a tuple the length of a key, but
-            with the last N elements 'None' rather than a regular bytestring.
-            The first element cannot be 'None'.
-        :return: An iterable as per iter_all_entries, but restricted to the
-            keys with a matching prefix to those supplied. No additional keys
-            will be returned, and every match that is in the index will be
-            returned.
-        """
-        # XXX: To much duplication with the GraphIndex class; consider finding
-        # a good place to pull out the actual common logic.
-        keys = set(keys)
-        if not keys:
-            return
-        if self._key_length == 1:
-            for key in keys:
-                # sanity check
-                if key[0] is None:
-                    raise errors.BadIndexKey(key)
-                if len(key) != self._key_length:
-                    raise errors.BadIndexKey(key)
-                node = self._nodes[key]
-                if node[0]:
-                    continue 
-                if self.reference_lists:
-                    yield self, key, node[2], node[1]
-                else:
-                    yield self, key, node[2]
-            return
-        for key in keys:
-            # sanity check
-            if key[0] is None:
-                raise errors.BadIndexKey(key)
-            if len(key) != self._key_length:
-                raise errors.BadIndexKey(key)
-            # find what it refers to:
-            key_dict = self._nodes_by_key
-            elements = list(key)
-            # find the subdict to return
-            try:
-                while len(elements) and elements[0] is not None:
-                    key_dict = key_dict[elements[0]]
-                    elements.pop(0)
-            except KeyError:
-                # a non-existant lookup.
-                continue
-            if len(elements):
-                dicts = [key_dict]
-                while dicts:
-                    key_dict = dicts.pop(-1)
-                    # can't be empty or would not exist
-                    item, value = key_dict.iteritems().next()
-                    if type(value) == dict:
-                        # push keys 
-                        dicts.extend(key_dict.itervalues())
-                    else:
-                        # yield keys
-                        for value in key_dict.itervalues():
-                            yield (self, ) + value
-            else:
-                yield (self, ) + key_dict
-
-    def key_count(self):
-        """Return an estimate of the number of keys in this index.
-        
-        For InMemoryGraphIndex the estimate is exact.
-        """
-        return len(self._keys)
-
-    def validate(self):
-        """In memory index's have no known corruption at the moment."""
-
-
 class BTreePack(NewPack):
 
     def __init__(self, upload_transport, index_transport, pack_transport,
@@ -217,18 +79,18 @@
         # passed in because the caller has them, so as to avoid object churn.
         Pack.__init__(self,
             # Revisions: parents list, no text compression.
-            InMemoryBTree(reference_lists=1),
+            BTreeBuilder(reference_lists=1),
             # Inventory: We want to map compression only, but currently the
             # knit code hasn't been updated enough to understand that, so we
             # have a regular 2-list index giving parents and compression
             # source.
-            InMemoryBTree(reference_lists=2),
+            BTreeBuilder(reference_lists=2),
             # Texts: compression and per file graph, for all fileids - so two
             # reference lists and two elements in the key tuple.
-            InMemoryBTree(reference_lists=2, key_elements=2),
+            BTreeBuilder(reference_lists=2, key_elements=2),
             # Signatures: Just blobs to store, no compression, no parents
             # listing.
-            InMemoryBTree(reference_lists=0),
+            BTreeBuilder(reference_lists=0),
             )
         # where should the new pack be opened
         self.upload_transport = upload_transport

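Because _write_nodes() now takes an iterator rather than reading
self._nodes directly, serialisation is no longer tied to the builder's
in-memory store: any key-sorted stream matching the iter_all_entries
tuple shape can be spooled out. A sketch of the kind of reuse this
permits (iter_merged is a hypothetical helper, not part of this
revision; it assumes unique keys across builders and that every builder
shares the target's configuration):

    def iter_merged(builders):
        """Yield several builders' nodes as one key-sorted stream."""
        entries = []
        for builder in builders:
            entries.extend(builder._iter_mem_nodes())
        # Sort on the key element of each (index, key, ...) tuple.
        entries.sort(key=lambda entry: entry[1])
        return iter(entries)

    # combined = target._write_nodes(iter_merged([builder1, builder2]))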


