Rev 2640: Add a key_count method to GraphIndex and friends, allowing optimisation of length calculations by the index. in http://people.ubuntu.com/~robertc/baz2.0/index
Robert Collins
robertc at robertcollins.net
Tue Aug 7 06:44:33 BST 2007
At http://people.ubuntu.com/~robertc/baz2.0/index
------------------------------------------------------------
revno: 2640
revision-id: robertc at robertcollins.net-20070807054430-65ige2n2869mp7t4
parent: robertc at robertcollins.net-20070803070942-1lrseqr6ligvrtrk
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Tue 2007-08-07 15:44:30 +1000
message:
Add a key_count method to GraphIndex and friends, allowing optimisation of length calculations by the index.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2007-08-03 07:09:42 +0000
+++ b/bzrlib/index.py 2007-08-07 05:44:30 +0000
@@ -34,6 +34,7 @@
from bzrlib import debug, errors
_OPTION_KEY_ELEMENTS = "key_elements="
+_OPTION_LEN = "len="
_OPTION_NODE_REFS = "node_ref_lists="
_SIGNATURE = "Bazaar Graph Index 1\n"
@@ -69,6 +70,7 @@
:param key_elements: The number of bytestrings in each key.
"""
self.reference_lists = reference_lists
+ self._keys = set()
self._nodes = {}
self._nodes_by_key = {}
self._key_length = key_elements
@@ -109,6 +111,7 @@
if key in self._nodes and self._nodes[key][0] == '':
raise errors.BadIndexDuplicateKey(key, self)
self._nodes[key] = ('', tuple(node_refs), value)
+ self._keys.add(key)
if self._key_length > 1:
key_dict = self._nodes_by_key
if self.reference_lists:
@@ -127,6 +130,7 @@
lines = [_SIGNATURE]
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
+ lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
prefix_length = sum(len(x) for x in lines)
# references are byte offsets. To avoid having to do nasty
# polynomial work to resolve offsets (references to later in the
@@ -236,6 +240,7 @@
self._transport = transport
self._name = name
self._nodes = None
+ self._key_count = None
self._keys_by_offset = None
self._nodes_by_key = None
@@ -302,6 +307,7 @@
for subkey in key[:-1]:
key_dict = key_dict.setdefault(subkey, {})
key_dict[key[-1]] = key_value
+ # cache the keys for quick set intersections
self._keys = set(self._nodes)
if trailers != 1:
# there must be one line - the empty trailer line.
@@ -343,6 +349,13 @@
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
except ValueError:
raise errors.BadIndexOptions(self)
+ options_line = stream.readline()
+ if not options_line.startswith(_OPTION_LEN):
+ raise errors.BadIndexOptions(self)
+ try:
+ self._key_count = int(options_line[len(_OPTION_LEN):-1])
+ except ValueError:
+ raise errors.BadIndexOptions(self)
def iter_entries(self, keys):
"""Iterate over keys within the index.
@@ -438,6 +451,16 @@
# the last thing looked up was a terminal element
yield (self, ) + key_dict
+ def key_count(self):
+ """Return an estimate of the number of keys in this index.
+
+ For GraphIndex the estimate is exact.
+ """
+ if self._key_count is None:
+ # really this should just read the prefix
+ self._buffer_all()
+ return self._key_count
+
def _signature(self):
"""The file signature for this index type."""
return _SIGNATURE
@@ -544,6 +567,16 @@
seen_keys.add(node[1])
yield node
+ def key_count(self):
+ """Return an estimate of the number of keys in this index.
+
+ For CombinedGraphIndex this is approximated by the sum of the keys of
+ the child indices. As child indices may have duplicate keys this can
+ have a maximum error of the number of child indices * largest number of
+ keys in any index.
+ """
+ return sum((index.key_count() for index in self._indices), 0)
+
def validate(self):
"""Validate that everything in the index can be accessed."""
for index in self._indices:
@@ -596,12 +629,12 @@
"""
keys = set(keys)
if self.reference_lists:
- for key in keys.intersection(self._nodes):
+ for key in keys.intersection(self._keys):
node = self._nodes[key]
if not node[0]:
yield self, key, node[2], node[1]
else:
- for key in keys.intersection(self._nodes):
+ for key in keys.intersection(self._keys):
node = self._nodes[key]
if not node[0]:
yield self, key, node[2]
@@ -676,6 +709,13 @@
else:
yield (self, ) + key_dict
+ def key_count(self):
+ """Return an estimate of the number of keys in this index.
+
+ For InMemoryGraphIndex the estimate is exact.
+ """
+ return len(self._keys)
+
def validate(self):
"""In memory index's have no known corruption at the moment."""
@@ -791,6 +831,14 @@
return self._strip_prefix(self.adapted.iter_entries_prefix(
self.prefix_key + key for key in keys))
+ def key_count(self):
+ """Return an estimate of the number of keys in this index.
+
+ For GraphIndexPrefixAdapter this is relatively expensive - key
+ iteration with the prefix is done.
+ """
+ return len(list(self.iter_all_entries()))
+
def validate(self):
"""Call the adapted's validate."""
self.adapted.validate()
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2007-08-01 07:53:14 +0000
+++ b/bzrlib/tests/test_index.py 2007-08-07 05:44:30 +0000
@@ -27,32 +27,41 @@
builder = GraphIndexBuilder()
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n\n", contents)
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=0\n\n",
+ contents)
def test_build_index_empty_two_element_keys(self):
builder = GraphIndexBuilder(key_elements=2)
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\n\n", contents)
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\nlen=0\n\n",
+ contents)
def test_build_index_one_reference_list_empty(self):
builder = GraphIndexBuilder(reference_lists=1)
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n\n", contents)
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=0\n\n",
+ contents)
def test_build_index_two_reference_list_empty(self):
builder = GraphIndexBuilder(reference_lists=2)
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n\n", contents)
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=0\n\n",
+ contents)
def test_build_index_one_node_no_refs(self):
builder = GraphIndexBuilder()
builder.add_node(('akey', ), 'data')
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=1\n"
"akey\x00\x00\x00data\n\n", contents)
def test_build_index_one_node_no_refs_accepts_empty_reflist(self):
@@ -60,7 +69,8 @@
builder.add_node(('akey', ), 'data', ())
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=1\n"
"akey\x00\x00\x00data\n\n", contents)
def test_build_index_one_node_2_element_keys(self):
@@ -71,7 +81,8 @@
builder.add_node(('akey', 'secondpart'), 'data')
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\nlen=1\n"
"akey\x00secondpart\x00\x00\x00data\n\n", contents)
def test_add_node_empty_value(self):
@@ -79,7 +90,8 @@
builder.add_node(('akey', ), '')
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=1\n"
"akey\x00\x00\x00\n\n", contents)
def test_build_index_nodes_sorted(self):
@@ -93,7 +105,8 @@
builder.add_node(('2001', ), 'data')
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=3\n"
"2000\x00\x00\x00data\n"
"2001\x00\x00\x00data\n"
"2002\x00\x00\x00data\n"
@@ -116,7 +129,8 @@
builder.add_node(('2001', '2001'), 'data')
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=2\nlen=9\n"
"2000\x002000\x00\x00\x00data\n"
"2000\x002001\x00\x00\x00data\n"
"2000\x002002\x00\x00\x00data\n"
@@ -133,7 +147,8 @@
builder.add_node(('key', ), 'data', ([], ))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=1\n"
"key\x00\x00\x00data\n"
"\n", contents)
@@ -142,7 +157,8 @@
builder.add_node(('key', 'key2'), 'data', ([], ))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=2\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=2\nlen=1\n"
"key\x00key2\x00\x00\x00data\n"
"\n", contents)
@@ -151,7 +167,8 @@
builder.add_node(('key', ), 'data', ([], []))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=1\n"
"key\x00\x00\t\x00data\n"
"\n", contents)
@@ -161,8 +178,9 @@
builder.add_node(('key', ), 'data', ([('reference', )], ))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
- "key\x00\x0066\x00data\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=2\n"
+ "key\x00\x0072\x00data\n"
"reference\x00\x00\x00data\n"
"\n", contents)
@@ -173,8 +191,9 @@
builder.add_node(('key', ), 'data', ([('reference', ), ('reference2', )], ))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
- "key\x00\x00071\r088\x00data\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=3\n"
+ "key\x00\x00077\r094\x00data\n"
"reference\x00\x00\x00data\n"
"reference2\x00\x00\x00data\n"
"\n", contents)
@@ -185,9 +204,10 @@
builder.add_node(('rey', ), 'data', ([('keference', )], [('keference', )]))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=2\n"
"keference\x00\x00\t\x00data\n"
- "rey\x00\x0053\t53\x00data\n"
+ "rey\x00\x0059\t59\x00data\n"
"\n", contents)
def test_add_node_referencing_missing_key_makes_absent(self):
@@ -195,10 +215,11 @@
builder.add_node(('rey', ), 'data', ([('beference', ), ('aeference2', )], ))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=1\n"
"aeference2\x00a\x00\x00\n"
"beference\x00a\x00\x00\n"
- "rey\x00\x0068\r53\x00data\n"
+ "rey\x00\x00074\r059\x00data\n"
"\n", contents)
def test_node_references_three_digits(self):
@@ -208,11 +229,12 @@
builder.add_node(('2-key', ), '', (references, ))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=1\nkey_elements=1\nlen=1\n"
"0\x00a\x00\x00\n"
"1\x00a\x00\x00\n"
"2\x00a\x00\x00\n"
- "2-key\x00\x00145\r139\r133\r127\r121\r115\r065\r059\r053\x00\n"
+ "2-key\x00\x00151\r145\r139\r133\r127\r121\r071\r065\r059\x00\n"
"3\x00a\x00\x00\n"
"4\x00a\x00\x00\n"
"5\x00a\x00\x00\n"
@@ -228,9 +250,10 @@
builder.add_node(('parent', ), '', ([('aail', ), ('zther', )], []))
stream = builder.finish()
contents = stream.read()
- self.assertEqual("Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\n"
+ self.assertEqual(
+ "Bazaar Graph Index 1\nnode_ref_lists=2\nkey_elements=1\nlen=1\n"
"aail\x00a\x00\x00\n"
- "parent\x00\x0053\r78\t\x00\n"
+ "parent\x00\x0059\r84\t\x00\n"
"zther\x00a\x00\x00\n"
"\n", contents)
@@ -455,6 +478,19 @@
(index, ('name', 'fin2'), 'beta', ((), ))]),
set(index.iter_entries_prefix([('name', None)])))
+ def test_key_count_empty(self):
+ index = self.make_index()
+ self.assertEqual(0, index.key_count())
+
+ def test_key_count_one(self):
+ index = self.make_index(nodes=[(('name', ), '', ())])
+ self.assertEqual(1, index.key_count())
+
+ def test_key_count_two(self):
+ index = self.make_index(nodes=[
+ (('name', ), '', ()), (('foo', ), '', ())])
+ self.assertEqual(2, index.key_count())
+
def test_validate_bad_index_errors(self):
trans = self.get_transport()
trans.put_bytes('name', "not an index\n")
@@ -624,6 +660,20 @@
self.assertEqual([(index1, ('key', ), '')],
list(index.iter_entries([('key', )])))
+ def test_key_count_empty(self):
+ index1 = self.make_index('1', nodes=[])
+ index2 = self.make_index('2', nodes=[])
+ index = CombinedGraphIndex([index1, index2])
+ self.assertEqual(0, index.key_count())
+
+ def test_key_count_sums_index_keys(self):
+ index1 = self.make_index('1', nodes=[
+ (('1',), '', ()),
+ (('2',), '', ())])
+ index2 = self.make_index('2', nodes=[(('1',), '', ())])
+ index = CombinedGraphIndex([index1, index2])
+ self.assertEqual(3, index.key_count())
+
def test_validate_bad_child_index_errors(self):
trans = self.get_transport()
trans.put_bytes('name', "not an index\n")
@@ -745,6 +795,18 @@
index = self.make_index()
self.assertEqual([], list(index.iter_entries(['a'])))
+ def test_key_count_empty(self):
+ index = self.make_index()
+ self.assertEqual(0, index.key_count())
+
+ def test_key_count_one(self):
+ index = self.make_index(nodes=[(('name', ), '')])
+ self.assertEqual(1, index.key_count())
+
+ def test_key_count_two(self):
+ index = self.make_index(nodes=[(('name', ), ''), (('foo', ), '')])
+ self.assertEqual(2, index.key_count())
+
def test_validate_empty(self):
index = self.make_index()
index.validate()
@@ -833,6 +895,18 @@
(index, ('prefix2', 'key2', ), 'data2', ((('prefix2', 'key1', ),),))]),
set(adapter.iter_entries_prefix([('prefix2', None)])))
+ def test_key_count_no_matching_keys(self):
+ index, adapter = self.make_index(nodes=[
+ (('notprefix', 'key1'), 'data', ((), ))])
+ self.assertEqual(0, adapter.key_count())
+
+ def test_key_count_some_keys(self):
+ index, adapter = self.make_index(nodes=[
+ (('notprefix', 'key1'), 'data', ((), )),
+ (('prefix', 'key1'), 'data1', ((), )),
+ (('prefix', 'key2'), 'data2', ((('prefix', 'key1'),),))])
+ self.assertEqual(2, adapter.key_count())
+
def test_validate(self):
index, adapter = self.make_index()
calls = []
More information about the bazaar-commits
mailing list