Rev 2640: Add a key_count method to GraphIndex and friends, allowing optimisation of length calculations by the index. in http://people.ubuntu.com/~robertc/baz2.0/index

Robert Collins robertc at robertcollins.net
Tue Aug 7 06:44:33 BST 2007


At http://people.ubuntu.com/~robertc/baz2.0/index

------------------------------------------------------------
revno: 2640
revision-id: robertc at robertcollins.net-20070807054430-65ige2n2869mp7t4
parent: robertc at robertcollins.net-20070803070942-1lrseqr6ligvrtrk
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Tue 2007-08-07 15:44:30 +1000
message:
  Add a key_count method to GraphIndex and friends, allowing optimisation of length calculations by the index.
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2007-08-03 07:09:42 +0000
+++ b/bzrlib/index.py	2007-08-07 05:44:30 +0000
@@ -34,6 +34,7 @@
 from bzrlib import debug, errors
 
 _OPTION_KEY_ELEMENTS = "key_elements="
+_OPTION_LEN = "len="
 _OPTION_NODE_REFS = "node_ref_lists="
 _SIGNATURE = "Bazaar Graph Index 1\n"
 
@@ -69,6 +70,7 @@
         :param key_elements: The number of bytestrings in each key.
         """
         self.reference_lists = reference_lists
+        self._keys = set()
         self._nodes = {}
         self._nodes_by_key = {}
         self._key_length = key_elements
@@ -109,6 +111,7 @@
         if key in self._nodes and self._nodes[key][0] == '':
             raise errors.BadIndexDuplicateKey(key, self)
         self._nodes[key] = ('', tuple(node_refs), value)
+        self._keys.add(key)
         if self._key_length > 1:
             key_dict = self._nodes_by_key
             if self.reference_lists:
@@ -127,6 +130,7 @@
         lines = [_SIGNATURE]
         lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
         lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
+        lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
         prefix_length = sum(len(x) for x in lines)
         # references are byte offsets. To avoid having to do nasty
         # polynomial work to resolve offsets (references to later in the 
@@ -236,6 +240,7 @@
         self._transport = transport
         self._name = name
         self._nodes = None
+        self._key_count = None
         self._keys_by_offset = None
         self._nodes_by_key = None
 
@@ -302,6 +307,7 @@
                 for subkey in key[:-1]:
                     key_dict = key_dict.setdefault(subkey, {})
                 key_dict[key[-1]] = key_value
+        # cache the keys for quick set intersections
         self._keys = set(self._nodes)
         if trailers != 1:
             # there must be one line - the empty trailer line.
@@ -343,6 +349,13 @@
             self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
         except ValueError:
             raise errors.BadIndexOptions(self)
+        options_line = stream.readline()
+        if not options_line.startswith(_OPTION_LEN):
+            raise errors.BadIndexOptions(self)
+        try:
+            self._key_count = int(options_line[len(_OPTION_LEN):-1])
+        except ValueError:
+            raise errors.BadIndexOptions(self)
 
     def iter_entries(self, keys):
         """Iterate over keys within the index.
@@ -438,6 +451,16 @@
                 # the last thing looked up was a terminal element
                 yield (self, ) + key_dict
 
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For GraphIndex the estimate is exact.
+        """
+        if self._key_count is None:
+            # really this should just read the prefix
+            self._buffer_all()
+        return self._key_count
+
     def _signature(self):
         """The file signature for this index type."""
         return _SIGNATURE
@@ -544,6 +567,16 @@
                 seen_keys.add(node[1])
                 yield node
 
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For CombinedGraphIndex this is approximated by the sum of the keys of
+        the child indices. As child indices may have duplicate keys this can
+        have a maximum error of the number of child indices * largest number of
+        keys in any index.
+        """
+        return sum((index.key_count() for index in self._indices), 0)
+
     def validate(self):
         """Validate that everything in the index can be accessed."""
         for index in self._indices:
@@ -596,12 +629,12 @@
         """
         keys = set(keys)
         if self.reference_lists:
-            for key in keys.intersection(self._nodes):
+            for key in keys.intersection(self._keys):
                 node = self._nodes[key]
                 if not node[0]:
                     yield self, key, node[2], node[1]
         else:
-            for key in keys.intersection(self._nodes):
+            for key in keys.intersection(self._keys):
                 node = self._nodes[key]
                 if not node[0]:
                     yield self, key, node[2]
@@ -676,6 +709,13 @@
             else:
                 yield (self, ) + key_dict
 
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For InMemoryGraphIndex the estimate is exact.
+        """
+        return len(self._keys)
+
     def validate(self):
         """In memory index's have no known corruption at the moment."""
 
@@ -791,6 +831,14 @@
         return self._strip_prefix(self.adapted.iter_entries_prefix(
             self.prefix_key + key for key in keys))
 
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For GraphIndexPrefixAdapter this is relatively expensive - key
+        iteration with the prefix is done.
+        """
+        return len(list(self.iter_all_entries()))
+
     def validate(self):
         """Call the adapted's validate."""
         self.adapted.validate()

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2007-08-01 07:53:14 +0000
+++ b/bzrlib/tests/test_index.py	2007-08-07 05:44:30 +0000
@@ -27,32 +27,41 @@
         builder = GraphIndexBuilder()
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n\n", contents)
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=0\n\n",
+            contents)
 
     def test_build_index_empty_two_element_keys(self):
         builder = GraphIndexBuilder(key_elements=2)
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\n\n", contents)
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\nlen=0\n\n",
+            contents)
 
     def test_build_index_one_reference_list_empty(self):
         builder = GraphIndexBuilder(reference_lists=1)
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n\n", contents)
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=0\n\n",
+            contents)
 
     def test_build_index_two_reference_list_empty(self):
         builder = GraphIndexBuilder(reference_lists=2)
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n\n", contents)
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=0\n\n",
+            contents)
 
     def test_build_index_one_node_no_refs(self):
         builder = GraphIndexBuilder()
         builder.add_node(('akey', ), 'data')
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=1\n"
             "akey\x00\x00\x00data\n\n", contents)
 
     def test_build_index_one_node_no_refs_accepts_empty_reflist(self):
@@ -60,7 +69,8 @@
         builder.add_node(('akey', ), 'data', ())
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=1\n"
             "akey\x00\x00\x00data\n\n", contents)
 
     def test_build_index_one_node_2_element_keys(self):
@@ -71,7 +81,8 @@
         builder.add_node(('akey', 'secondpart'), 'data')
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\nlen=1\n"
             "akey\x00secondpart\x00\x00\x00data\n\n", contents)
 
     def test_add_node_empty_value(self):
@@ -79,7 +90,8 @@
         builder.add_node(('akey', ), '')
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=1\n"
             "akey\x00\x00\x00\n\n", contents)
 
     def test_build_index_nodes_sorted(self):
@@ -93,7 +105,8 @@
         builder.add_node(('2001', ), 'data')
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=3\n"
             "2000\x00\x00\x00data\n"
             "2001\x00\x00\x00data\n"
             "2002\x00\x00\x00data\n"
@@ -116,7 +129,8 @@
         builder.add_node(('2001', '2001'), 'data')
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\nlen=9\n"
             "2000\x002000\x00\x00\x00data\n"
             "2000\x002001\x00\x00\x00data\n"
             "2000\x002002\x00\x00\x00data\n"
@@ -133,7 +147,8 @@
         builder.add_node(('key', ), 'data', ([], ))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=1\n"
             "key\x00\x00\x00data\n"
             "\n", contents)
 
@@ -142,7 +157,8 @@
         builder.add_node(('key', 'key2'), 'data', ([], ))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=2\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=2\nlen=1\n"
             "key\x00key2\x00\x00\x00data\n"
             "\n", contents)
 
@@ -151,7 +167,8 @@
         builder.add_node(('key', ), 'data', ([], []))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=1\n"
             "key\x00\x00\t\x00data\n"
             "\n", contents)
 
@@ -161,8 +178,9 @@
         builder.add_node(('key', ), 'data', ([('reference', )], ))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
-            "key\x00\x0066\x00data\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=2\n"
+            "key\x00\x0072\x00data\n"
             "reference\x00\x00\x00data\n"
             "\n", contents)
 
@@ -173,8 +191,9 @@
         builder.add_node(('key', ), 'data', ([('reference', ), ('reference2', )], ))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
-            "key\x00\x00071\r088\x00data\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=3\n"
+            "key\x00\x00077\r094\x00data\n"
             "reference\x00\x00\x00data\n"
             "reference2\x00\x00\x00data\n"
             "\n", contents)
@@ -185,9 +204,10 @@
         builder.add_node(('rey', ), 'data', ([('keference', )], [('keference', )]))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=2\n"
             "keference\x00\x00\t\x00data\n"
-            "rey\x00\x0053\t53\x00data\n"
+            "rey\x00\x0059\t59\x00data\n"
             "\n", contents)
 
     def test_add_node_referencing_missing_key_makes_absent(self):
@@ -195,10 +215,11 @@
         builder.add_node(('rey', ), 'data', ([('beference', ), ('aeference2', )], ))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=1\n"
             "aeference2\x00a\x00\x00\n"
             "beference\x00a\x00\x00\n"
-            "rey\x00\x0068\r53\x00data\n"
+            "rey\x00\x00074\r059\x00data\n"
             "\n", contents)
 
     def test_node_references_three_digits(self):
@@ -208,11 +229,12 @@
         builder.add_node(('2-key', ), '', (references, ))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=1\n"
             "0\x00a\x00\x00\n"
             "1\x00a\x00\x00\n"
             "2\x00a\x00\x00\n"
-            "2-key\x00\x00145\r139\r133\r127\r121\r115\r065\r059\r053\x00\n"
+            "2-key\x00\x00151\r145\r139\r133\r127\r121\r071\r065\r059\x00\n"
             "3\x00a\x00\x00\n"
             "4\x00a\x00\x00\n"
             "5\x00a\x00\x00\n"
@@ -228,9 +250,10 @@
         builder.add_node(('parent', ), '', ([('aail', ), ('zther', )], []))
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n"
+        self.assertEqual(
+            "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=1\n"
             "aail\x00a\x00\x00\n"
-            "parent\x00\x0053\r78\t\x00\n"
+            "parent\x00\x0059\r84\t\x00\n"
             "zther\x00a\x00\x00\n"
             "\n", contents)
 
@@ -455,6 +478,19 @@
             (index, ('name', 'fin2'), 'beta', ((), ))]),
             set(index.iter_entries_prefix([('name', None)])))
 
+    def test_key_count_empty(self):
+        index = self.make_index()
+        self.assertEqual(0, index.key_count())
+
+    def test_key_count_one(self):
+        index = self.make_index(nodes=[(('name', ), '', ())])
+        self.assertEqual(1, index.key_count())
+
+    def test_key_count_two(self):
+        index = self.make_index(nodes=[
+            (('name', ), '', ()), (('foo', ), '', ())])
+        self.assertEqual(2, index.key_count())
+
     def test_validate_bad_index_errors(self):
         trans = self.get_transport()
         trans.put_bytes('name', "not an index\n")
@@ -624,6 +660,20 @@
         self.assertEqual([(index1, ('key', ), '')],
             list(index.iter_entries([('key', )])))
 
+    def test_key_count_empty(self):
+        index1 = self.make_index('1', nodes=[])
+        index2 = self.make_index('2', nodes=[])
+        index = CombinedGraphIndex([index1, index2])
+        self.assertEqual(0, index.key_count())
+
+    def test_key_count_sums_index_keys(self):
+        index1 = self.make_index('1', nodes=[
+            (('1',), '', ()),
+            (('2',), '', ())])
+        index2 = self.make_index('2', nodes=[(('1',), '', ())])
+        index = CombinedGraphIndex([index1, index2])
+        self.assertEqual(3, index.key_count())
+
     def test_validate_bad_child_index_errors(self):
         trans = self.get_transport()
         trans.put_bytes('name', "not an index\n")
@@ -745,6 +795,18 @@
         index = self.make_index()
         self.assertEqual([], list(index.iter_entries(['a'])))
 
+    def test_key_count_empty(self):
+        index = self.make_index()
+        self.assertEqual(0, index.key_count())
+
+    def test_key_count_one(self):
+        index = self.make_index(nodes=[(('name', ), '')])
+        self.assertEqual(1, index.key_count())
+
+    def test_key_count_two(self):
+        index = self.make_index(nodes=[(('name', ), ''), (('foo', ), '')])
+        self.assertEqual(2, index.key_count())
+
     def test_validate_empty(self):
         index = self.make_index()
         index.validate()
@@ -833,6 +895,18 @@
             (index, ('prefix2', 'key2', ), 'data2', ((('prefix2', 'key1', ),),))]),
             set(adapter.iter_entries_prefix([('prefix2', None)])))
 
+    def test_key_count_no_matching_keys(self):
+        index, adapter = self.make_index(nodes=[
+            (('notprefix', 'key1'), 'data', ((), ))])
+        self.assertEqual(0, adapter.key_count())
+
+    def test_key_count_some_keys(self):
+        index, adapter = self.make_index(nodes=[
+            (('notprefix', 'key1'), 'data', ((), )),
+            (('prefix', 'key1'), 'data1', ((), )),
+            (('prefix', 'key2'), 'data2', ((('prefix', 'key1'),),))])
+        self.assertEqual(2, adapter.key_count())
+
     def test_validate(self):
         index, adapter = self.make_index()
         calls = []



More information about the bazaar-commits mailing list