Rev 3653: Refactor some code. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup

Thu Aug 28 21:05:55 BST 2008

At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup

------------------------------------------------------------
revno: 3653
revision-id: john at arbash-meinel.com-20080828200552-sw5lzds9mmi3qnnb
parent: john at arbash-meinel.com-20080825190216-vdkyinz5p5e5s8kq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index_builder_cleanup
timestamp: Thu 2008-08-28 15:05:52 -0500
message:
  Refactor some code.
  Move the key, reference, value checking into a helper function.
  This func also finds absent references, but that should have
  minimal overhead either way.
  Also use the _update_nodes_by_key functionality for both
  indexes, as _nodes_by_key has the same signature.
  Move _spill_mem_keys_to_disk into a separate function
  not necessary, but it makes add_node() easier to understand.
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'

--- a/bzrlib/btree_index.py	2008-08-25 19:02:16 +0000
+++ b/bzrlib/btree_index.py	2008-08-28 20:05:52 +0000
@@ -37,7 +37,6 @@
     trace,
     )
 from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
-from bzrlib.osutils import basename, dirname
 from bzrlib.transport import get_transport
 
 
@@ -161,38 +160,31 @@
         :param value: The value to associate with the key. It may be any
             bytes as long as it does not contain \0 or \n.
         """
-        self._check_key(key)
-        if index._newline_null_re.search(value) is not None:
-            raise errors.BadIndexValue(value)
-        if len(references) != self.reference_lists:
-            raise errors.BadIndexValue(references)
-        node_refs = []
-        for reference_list in references:
-            for reference in reference_list:
-                # If reference is in self._nodes, then we have already checked
-                # it
-                if reference not in self._nodes:
-                    self._check_key(reference)
-            node_refs.append(tuple(reference_list))
+        # we don't care about absent_references
+        node_refs, _ = self._check_key_ref_value(key, references, value)
         if key in self._nodes:
             raise errors.BadIndexDuplicateKey(key, self)
         self._nodes[key] = (tuple(node_refs), value)
         self._keys.add(key)
-        if self._key_length > 1 and self._nodes_by_key is not None:
-            key_dict = self._nodes_by_key
-            if self.reference_lists:
-                key_value = key, value, tuple(node_refs)
-            else:
-                key_value = key, value
-            # possibly should do this on-demand, but it seems likely it is 
-            # always wanted
-            # For a key of (foo, bar, baz) create
-            # _nodes_by_key[foo][bar][baz] = key_value
-            for subkey in key[:-1]:
-                key_dict = key_dict.setdefault(subkey, {})
-            key_dict[key[-1]] = key_value
+        if self._nodes_by_key is not None and self._key_length > 1:
+            self._update_nodes_by_key(key, value, node_refs)
         if len(self._keys) < self._spill_at:
             return
+        self._spill_mem_keys_to_disk()
+
+    def _spill_mem_keys_to_disk(self):
+        """Write the in memory keys down to disk to cap memory consumption.
+
+        If we already have some keys written to disk, we will combine them so
+        as to preserve the sorted order.  The algorithm for combining uses
+        powers of two.  So on the first spill, write all mem nodes into a
+        single index. On the second spill, combine the mem nodes with the nodes
+        on disk to create a 2x sized disk index and get rid of the first index.
+        On the third spill, create a single new disk index, which will contain
+        the mem nodes, and preserve the existing 2x sized index.  On the fourth,
+        combine mem with the first and second indexes, creating a new one of
+        size 4x. On the fifth create a single new one, etc.
+        """
         iterators_to_combine = [self._iter_mem_nodes()]
         pos = -1
         for pos, backing in enumerate(self._backing_indices):
@@ -203,9 +195,11 @@
         backing_pos = pos + 1
         new_backing_file, size = \
             self._write_nodes(self._iter_smallest(iterators_to_combine))
-        new_backing = BTreeGraphIndex(
-            get_transport(dirname(new_backing_file.name)),
-            basename(new_backing_file.name), size)
+        dir_path, base_name = osutils.split(new_backing_file.name)
+        # Note: The transport here isn't strictly needed, because we will use
+        #       direct access to the new_backing._file object
+        new_backing = BTreeGraphIndex(get_transport(dir_path),
+                                      base_name, size)
         # GC will clean up the file
         new_backing._file = new_backing_file
         if len(self._backing_indices) == backing_pos:

=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2008-08-25 16:29:01 +0000
+++ b/bzrlib/index.py	2008-08-28 20:05:52 +0000
@@ -134,16 +134,22 @@
             key_dict = key_dict.setdefault(subkey, {})
         key_dict[key[-1]] = key_value
 
-    def add_node(self, key, value, references=()):
-        """Add a node to the index.
+    def _check_key_ref_value(self, key, references, value):
+        """Check that 'key' and 'references' are all valid.
 
-        :param key: The key. keys are non-empty tuples containing
-            as many whitespace-free utf8 bytestrings as the key length
-            defined for this index.
-        :param references: An iterable of iterables of keys. Each is a
-            reference to another key.
-        :param value: The value to associate with the key. It may be any
-            bytes as long as it does not contain \0 or \n.
+        :param key: A key tuple. Must conform to the key interface (be a tuple,
+            be of the right length, not have any whitespace or nulls in any key
+            element.)
+        :param references: An iterable of reference lists. Something like
+            [[(ref, key)], [(ref, key), (other, key)]]
+        :param value: The value associate with this key. Must not contain
+            newlines or null characters.
+        :return: (node_refs, absent_references)
+            node_refs   basically a packed form of 'references' where all
+                        iterables are tuples
+            absent_references   reference keys that are not in self._nodes.
+                                This may contain duplicates if the same key is
+                                referenced in multiple lists.
         """
         self._check_key(key)
         if _newline_null_re.search(value) is not None:
@@ -151,18 +157,39 @@
         if len(references) != self.reference_lists:
             raise errors.BadIndexValue(references)
         node_refs = []
+        absent_references = []
         for reference_list in references:
             for reference in reference_list:
-                self._check_key(reference)
+                # If reference *is* in self._nodes, then we know it has already
+                # been checked.
                 if reference not in self._nodes:
-                    self._nodes[reference] = ('a', (), '')
+                    self._check_key(reference)
+                    absent_references.append(reference)
             node_refs.append(tuple(reference_list))
-        if key in self._nodes and self._nodes[key][0] == '':
+        return tuple(node_refs), absent_references
+
+    def add_node(self, key, value, references=()):
+        """Add a node to the index.
+
+        :param key: The key. keys are non-empty tuples containing
+            as many whitespace-free utf8 bytestrings as the key length
+            defined for this index.
+        :param references: An iterable of iterables of keys. Each is a
+            reference to another key.
+        :param value: The value to associate with the key. It may be any
+            bytes as long as it does not contain \0 or \n.
+        """
+        (node_refs,
+         absent_references) = self._check_key_ref_value(key, references, value)
+        if key in self._nodes and self._nodes[key][0] != 'a':
             raise errors.BadIndexDuplicateKey(key, self)
-        node_refs = tuple(node_refs)
+        for reference in absent_references:
+            # There may be duplicates, but I don't think it is worth worrying
+            # about
+            self._nodes[reference] = ('a', (), '')
         self._nodes[key] = ('', node_refs, value)
         self._keys.add(key)
-        if self._key_length > 1 and self._nodes_by_key is not None:
+        if self._nodes_by_key is not None and self._key_length > 1:
             self._update_nodes_by_key(key, value, node_refs)
 
     def finish(self):
@@ -172,7 +199,7 @@
         lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
         prefix_length = sum(len(x) for x in lines)
         # references are byte offsets. To avoid having to do nasty
-        # polynomial work to resolve offsets (references to later in the 
+        # polynomial work to resolve offsets (references to later in the
         # file cannot be determined until all the inbetween references have
         # been calculated too) we pad the offsets with 0's to make them be
         # of consistent length. Using binary offsets would break the trivial