Rev 3645: Change the IndexBuilders to not generate the nodes_by_key unless needed. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup

John Arbash Meinel john at arbash-meinel.com
Mon Aug 25 04:41:43 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/index_builder_cleanup

------------------------------------------------------------
revno: 3645
revision-id: john at arbash-meinel.com-20080825034139-68nxqiqrmqi1l5f0
parent: pqm at pqm.ubuntu.com-20080822042630-on3dxyek4ezk0miu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: index_builder_cleanup
timestamp: Sun 2008-08-24 22:41:39 -0500
message:
  Change the IndexBuilders to not generate the nodes_by_key unless needed.
modified:
  bzrlib/btree_index.py          index.py-20080624222253-p0x5f92uyh5hw734-7
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_btree_index.py test_index.py-20080624222253-p0x5f92uyh5hw734-13
-------------- next part --------------
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py	2008-08-22 02:18:27 +0000
+++ b/bzrlib/btree_index.py	2008-08-25 03:41:39 +0000
@@ -142,6 +142,8 @@
             key_elements=key_elements)
         self._spill_at = spill_at
         self._backing_indices = []
+        # Indicate it hasn't been built yet
+        self._nodes_by_key = None
 
     def add_node(self, key, value, references=()):
         """Add a node to the index.
@@ -157,7 +159,33 @@
         :param value: The value to associate with the key. It may be any
             bytes as long as it does not contain \0 or \n.
         """
-        index.GraphIndexBuilder.add_node(self, key, value, references=references)
+        self._check_key(key)
+        if index._newline_null_re.search(value) is not None:
+            raise errors.BadIndexValue(value)
+        if len(references) != self.reference_lists:
+            raise errors.BadIndexValue(references)
+        node_refs = []
+        for reference_list in references:
+            for reference in reference_list:
+                self._check_key(reference)
+            node_refs.append(tuple(reference_list))
+        if key in self._nodes and self._nodes[key][0] == '':
+            raise errors.BadIndexDuplicateKey(key, self)
+        self._nodes[key] = (tuple(node_refs), value)
+        self._keys.add(key)
+        if self._key_length > 1 and self._nodes_by_key is not None:
+            key_dict = self._nodes_by_key
+            if self.reference_lists:
+                key_value = key, value, tuple(node_refs)
+            else:
+                key_value = key, value
+            # possibly should do this on-demand, but it seems likely it is 
+            # always wanted
+            # For a key of (foo, bar, baz) create
+            # _nodes_by_key[foo][bar][baz] = key_value
+            for subkey in key[:-1]:
+                key_dict = key_dict.setdefault(subkey, {})
+            key_dict[key[-1]] = key_value
         if len(self._keys) < self._spill_at:
             return
         iterators_to_combine = [iter(sorted(self._iter_mem_nodes()))]
@@ -182,7 +210,7 @@
             self._backing_indices[pos] = None
         self._keys = set()
         self._nodes = {}
-        self._nodes_by_key = {}
+        self._nodes_by_key = None
 
     def add_nodes(self, nodes):
         """Add nodes to the index.
@@ -199,13 +227,11 @@
     def _iter_mem_nodes(self):
         """Iterate over the nodes held in memory."""
         if self.reference_lists:
-            for key, (absent, references, value) in self._nodes.iteritems():
-                if not absent:
-                    yield self, key, value, references
+            return ((self, key, value, references)
+                    for key, (references, value) in self._nodes.iteritems())
         else:
-            for key, (absent, references, value) in self._nodes.iteritems():
-                if not absent:
-                    yield self, key, value
+            return ((self, key, value)
+                    for key, (references, value) in self._nodes.iteritems())
 
     def _iter_smallest(self, iterators_to_combine):
         if len(iterators_to_combine) == 1:
@@ -411,13 +437,11 @@
         if self.reference_lists:
             for key in keys.intersection(self._keys):
                 node = self._nodes[key]
-                if not node[0]:
-                    yield self, key, node[2], node[1]
+                yield self, key, node[1], node[0]
         else:
             for key in keys.intersection(self._keys):
                 node = self._nodes[key]
-                if not node[0]:
-                    yield self, key, node[2]
+                yield self, key, node[1]
         keys.difference_update(self._keys)
         for backing in self._backing_indices:
             if backing is None:
@@ -454,6 +478,8 @@
             if backing is None:
                 continue
             for node in backing.iter_entries_prefix(keys):
+                # TODO: We should probably remove yielded keys from the keys
+                #       list
                 yield (self,) + node[1:]
         if self._key_length == 1:
             for key in keys:
@@ -466,12 +492,10 @@
                     node = self._nodes[key]
                 except KeyError:
                     continue
-                if node[0]:
-                    continue
                 if self.reference_lists:
-                    yield self, key, node[2], node[1]
+                    yield self, key, node[1], node[0]
                 else:
-                    yield self, key, node[2]
+                    yield self, key, node[1]
             return
         for key in keys:
             # sanity check
@@ -480,7 +504,7 @@
             if len(key) != self._key_length:
                 raise errors.BadIndexKey(key)
             # find what it refers to:
-            key_dict = self._nodes_by_key
+            key_dict = self._get_nodes_by_key()
             elements = list(key)
             # find the subdict to return
             try:
@@ -506,6 +530,24 @@
             else:
                 yield (self, ) + key_dict
 
+    def _get_nodes_by_key(self):
+        if self._nodes_by_key is None:
+            nodes_by_key = {}
+            if self.reference_lists:
+                for key, (references, value) in self._nodes.iteritems():
+                    key_dict = nodes_by_key
+                    for subkey in key[:-1]:
+                        key_dict = key_dict.setdefault(subkey, {})
+                    key_dict[key[-1]] = key, value, references
+            else:
+                for key, (references, value) in self._nodes.iteritems():
+                    key_dict = nodes_by_key
+                    for subkey in key[:-1]:
+                        key_dict = key_dict.setdefault(subkey, {})
+                    key_dict[key[-1]] = key, value
+            self._nodes_by_key = nodes_by_key
+        return self._nodes_by_key
+
     def key_count(self):
         """Return an estimate of the number of keys in this index.
 

=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2008-08-14 19:22:59 +0000
+++ b/bzrlib/index.py	2008-08-25 03:41:39 +0000
@@ -80,8 +80,9 @@
         """
         self.reference_lists = reference_lists
         self._keys = set()
+        # A dict of {key: (absent, ref_lists, value)}
         self._nodes = {}
-        self._nodes_by_key = {}
+        self._nodes_by_key = None
         self._key_length = key_elements
 
     def _check_key(self, key):
@@ -121,7 +122,7 @@
             raise errors.BadIndexDuplicateKey(key, self)
         self._nodes[key] = ('', tuple(node_refs), value)
         self._keys.add(key)
-        if self._key_length > 1:
+        if self._key_length > 1 and self._nodes_by_key is not None:
             key_dict = self._nodes_by_key
             if self.reference_lists:
                 key_value = key, value, tuple(node_refs)

=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py	2008-08-22 02:18:27 +0000
+++ b/bzrlib/tests/test_btree_index.py	2008-08-25 03:41:39 +0000
@@ -349,23 +349,19 @@
         # predictably.
         self.assertEqual(1, len(builder._nodes))
         self.assertEqual(1, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         builder.add_node(*nodes[1])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         self.assertEqual(1, len(builder._backing_indices))
         self.assertEqual(2, builder._backing_indices[0].key_count())
         # now back to memory
         builder.add_node(*nodes[2])
         self.assertEqual(1, len(builder._nodes))
         self.assertEqual(1, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         # And spills to a second backing index combining all
         builder.add_node(*nodes[3])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         self.assertEqual(2, len(builder._backing_indices))
         self.assertEqual(None, builder._backing_indices[0])
         self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -374,7 +370,6 @@
         builder.add_node(*nodes[5])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         self.assertEqual(2, len(builder._backing_indices))
         self.assertEqual(2, builder._backing_indices[0].key_count())
         self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -438,24 +433,20 @@
         # Test the parts of the index that take up memory are doing so
         # predictably.
         self.assertEqual(1, len(builder._keys))
-        self.assertEqual(2, len(builder._nodes))
-        self.assertNotEqual({}, builder._nodes_by_key)
+        self.assertEqual(1, len(builder._nodes))
         builder.add_node(*nodes[1])
         self.assertEqual(0, len(builder._keys))
         self.assertEqual(0, len(builder._nodes))
-        self.assertEqual({}, builder._nodes_by_key)
         self.assertEqual(1, len(builder._backing_indices))
         self.assertEqual(2, builder._backing_indices[0].key_count())
         # now back to memory
         builder.add_node(*nodes[2])
-        self.assertEqual(2, len(builder._nodes))
+        self.assertEqual(1, len(builder._nodes))
         self.assertEqual(1, len(builder._keys))
-        self.assertNotEqual({}, builder._nodes_by_key)
         # And spills to a second backing index combining all
         builder.add_node(*nodes[3])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         self.assertEqual(2, len(builder._backing_indices))
         self.assertEqual(None, builder._backing_indices[0])
         self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -464,7 +455,6 @@
         builder.add_node(*nodes[5])
         self.assertEqual(0, len(builder._nodes))
         self.assertEqual(0, len(builder._keys))
-        self.assertEqual({}, builder._nodes_by_key)
         self.assertEqual(2, len(builder._backing_indices))
         self.assertEqual(2, builder._backing_indices[0].key_count())
         self.assertEqual(4, builder._backing_indices[1].key_count())
@@ -494,11 +484,12 @@
         self.assertEqual([(builder,) + node for node in sorted(nodes[:13])],
             list(builder.iter_all_entries()))
         # Two nodes - one memory one disk
-        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
-            set(builder.iter_entries([nodes[12][0], nodes[11][0]])))
+        self.assertEqual(sorted(set([(builder,) + node for node in nodes[11:13]])),
+            sorted(set(builder.iter_entries([nodes[12][0], nodes[11][0]]))))
         self.assertEqual(13, builder.key_count())
-        self.assertEqual(set([(builder,) + node for node in nodes[11:13]]),
-            set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]])))
+        import pdb; pdb.set_trace()
+        self.assertEqual(sorted(set([(builder,) + node for node in nodes[11:13]])),
+            sorted(set(builder.iter_entries_prefix([nodes[12][0], nodes[11][0]]))))
         builder.add_node(*nodes[13])
         self.assertEqual(3, len(builder._backing_indices))
         self.assertEqual(2, builder._backing_indices[0].key_count())



More information about the bazaar-commits mailing list