Rev 3807: Change the nomenclature, in http://bzr.arbash-meinel.com/branches/bzr/brisbane/prefix
John Arbash Meinel
john at arbash-meinel.com
Fri Dec 12 23:38:36 GMT 2008
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/prefix
------------------------------------------------------------
revno: 3807
revision-id: john at arbash-meinel.com-20081212233830-bibp9nqi2pgpvt53
parent: john at arbash-meinel.com-20081212223534-y2h7xxe1yghd7l10
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: prefix
timestamp: Fri 2008-12-12 17:38:30 -0600
message:
Change the nomenclature.
_lookup_key is going to be the key used by internal nodes.
_serialised_key is the bytes that would be written to the final
page on disk.
Of course, we will actually be pulling out the common bytes
in the final form.
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2008-12-12 22:35:34 +0000
+++ b/bzrlib/chk_map.py 2008-12-12 23:38:30 +0000
@@ -397,8 +397,8 @@
self._size = 0
# The pointers/values this node has - meaning defined by child classes.
self._items = {}
- # The common serialised prefix
- self._serialised_prefix = None
+ # The common lookup prefix
+ self._lookup_prefix = None
def __repr__(self):
items_str = sorted(self._items)
@@ -406,7 +406,7 @@
items_str = items_str[16] + '...]'
return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % (
self.__class__.__name__, self._key, self._len, self._size,
- self._maximum_size, self._serialised_prefix, items_str)
+ self._maximum_size, self._lookup_prefix, items_str)
def key(self):
return self._key
@@ -427,18 +427,48 @@
"""
self._maximum_size = new_size
- @staticmethod
- def common_prefix(key1, key2):
- """Given 2 keys, return the longest prefix common to both."""
+ @classmethod
+ def common_prefix(cls, prefix, key):
+ """Given 2 strings, return the longest prefix common to both.
+
+ :param prefix: This has been the common prefix for other keys, so it is
+ more likely to be the common prefix in this case as well.
+ :param key: Another string to compare to
+ """
+ if key.startswith(prefix):
+ return prefix
# Is there a better way to do this?
- for pos, (left, right) in enumerate(zip(key1, key2)):
+ for pos, (left, right) in enumerate(zip(prefix, key)):
if left != right:
pos -= 1
break
- common = key1[:pos+1]
- assert key2.startswith(common)
+ assert pos <= len(prefix)
+ assert pos <= len(key)
+ common = prefix[:pos+1]
+ assert key.startswith(common)
return common
+ @classmethod
+ def common_prefix_for_keys(cls, keys):
+ """Given a list of keys, find their common prefix.
+
+ :param keys: An iterable of strings.
+ :return: The longest common prefix of all keys.
+ """
+ common_prefix = None
+ for key in keys:
+ if common_prefix is None:
+ common_prefix = key
+ continue
+ common_prefix = cls.common_prefix(common_prefix, key)
+ if not common_prefix:
+ # if common_prefix is the empty string, then we know it won't
+ # change further
+ return ''
+ if common_prefix is None:
+ return ''
+ return common_prefix
+
class LeafNode(Node):
"""A node containing actual key:value pairs.
@@ -451,7 +481,8 @@
def __init__(self):
Node.__init__(self)
# All of the keys in this leaf node share this common prefix
- self._common_prefix = None
+ self._common_serialised_prefix = None
+ self._serialise_key = '\x00'.join
def _current_size(self):
"""Answer the current serialised size of this node.
@@ -491,7 +522,8 @@
result._key = key
result._key_width = width
result._size = len(bytes)
- result._serialised_prefix = result.unique_serialised_prefix()
+ result._compute_lookup_prefix()
+ result._compute_serialised_prefix()
return result
def iteritems(self, store, key_filter=None):
@@ -521,7 +553,7 @@
def _key_value_len(self, key, value):
# TODO: Should probably be done without actually joining the key, but
# then that can be done via the C extension
- return 2 + len('\x00'.join(key)) + len(value)
+ return 2 + len(self._serialise_key(key)) + len(value)
def _map_no_split(self, key, value):
"""Map a key to a value.
@@ -534,12 +566,18 @@
self._items[key] = value
self._size += self._key_value_len(key, value)
self._len += 1
- serialised_key = self._serialised_key(key)
- if self._serialised_prefix is None:
- self._serialised_prefix = serialised_key
- elif not serialised_key.startswith(self._serialised_prefix):
- self._serialised_prefix = self.common_prefix(
- self._serialised_prefix, serialised_key)
+ serialised_key = self._serialise_key(key)
+ if self._common_serialised_prefix is None:
+ self._common_serialised_prefix = serialised_key
+ else:
+ self._common_serialised_prefix = self.common_prefix(
+ self._common_serialised_prefix, serialised_key)
+ lookup_key = self._lookup_key(key)
+ if self._lookup_prefix is None:
+ self._lookup_prefix = lookup_key
+ else:
+ self._lookup_prefix = self.common_prefix(
+ self._lookup_prefix, lookup_key)
if (self._len > 1
and self._maximum_size
and self._current_size() > self._maximum_size):
@@ -552,14 +590,14 @@
Split this node into multiple LeafNodes, return it up the stack so that
the next layer creates a new InternalNode and references the new nodes.
- :return: (common_serialized_prefix, [(node_serialized_prefix, node)])
+ :return: (common_serialised_prefix, [(node_serialised_prefix, node)])
"""
- common_prefix = self._serialised_prefix
+ common_prefix = self._lookup_prefix
split_at = len(common_prefix) + 1
result = {}
for key, value in self._items.iteritems():
- serialised_key = self._serialised_key(key)
- prefix = serialised_key[:split_at]
+ lookup_key = self._lookup_key(key)
+ prefix = lookup_key[:split_at]
# TODO: Generally only 1 key can be exactly the right length,
# which means we can only have 1 key in the node pointed
# at by the 'prefix\0' key. We might want to consider
@@ -589,7 +627,7 @@
if self._map_no_split(key, value):
return self._split(store)
else:
- return self._serialised_prefix, [("", self)]
+ return self._lookup_prefix, [("", self)]
def serialise(self, store):
"""Serialise the tree to store.
@@ -602,39 +640,39 @@
lines.append("%d\n" % self._key_width)
lines.append("%d\n" % self._len)
for key, value in sorted(self._items.items()):
- lines.append("%s\x00%s\n" % ('\x00'.join(key), value))
+ lines.append("%s\x00%s\n" % (self._serialise_key(key), value))
sha1, _, _ = store.add_lines((None,), (), lines)
self._key = ("sha1:" + sha1,)
_page_cache.add(self._key, ''.join(lines))
return [self._key]
- def _serialised_key(self, key):
- """Return the serialised key for key in this node."""
+ def _lookup_key(self, key):
+ """Return the lookup key for a key in this node."""
return '\x00'.join(key)
def refs(self):
"""Return the references to other CHK's held by this node."""
return []
- def unique_serialised_prefix(self):
- """Return the unique key prefix for this node.
+ def _compute_lookup_prefix(self):
+ """Determine the common lookup prefix for all keys in this node.
+
+ :return: A bytestring of the longest lookup key prefix that is
+ unique within this node.
+ """
+ lookup_keys = [self._lookup_key(key) for key in self._items]
+ self._lookup_prefix = self.common_prefix_for_keys(lookup_keys)
+ return self._lookup_prefix
+
+ def _compute_serialised_prefix(self):
+ """Determine the common prefix for serialised keys in this node.
:return: A bytestring of the longest serialised key prefix that is
unique within this node.
"""
- # may want to cache this eventually :- but wait for enough
- # functionality to profile.
- keys = list(self._items.keys())
- if not keys:
- return ""
- current_prefix = self._serialised_key(keys.pop(-1))
- while current_prefix and keys:
- next_key = self._serialised_key(keys.pop(-1))
- if next_key.startswith(current_prefix):
- # Don't need to do anything, the prefix already fits
- continue
- current_prefix = self.common_prefix(current_prefix, next_key)
- return current_prefix
+ serialised_keys = [self._serialise_key(key) for key in self._items]
+ self._common_serialised_prefix = self.common_prefix_for_keys(
+ serialised_keys)
def unmap(self, store, key):
"""Unmap key from the node."""
@@ -643,17 +681,18 @@
del self._items[key]
self._key = None
# Recompute from scratch
- self._serialised_prefix = self.unique_serialised_prefix()
+ self._compute_lookup_prefix()
+ self._compute_serialised_prefix()
return self
class InternalNode(Node):
"""A node that contains references to other nodes.
- An InternalNode is responsible for mapping serialised key prefixes to child
- nodes. It is greedy - it will defer splitting itself as long as possible.
+ An InternalNode is responsible for mapping lookup key prefixes to child
+ nodes.
- :ivar _items: serialized_key => node dictionary. node may be a tuple,
+ :ivar _items: serialised_key => node dictionary. node may be a tuple,
LeafNode or InternalNode.
"""
@@ -663,7 +702,7 @@
# self._size = 12
# How many octets key prefixes within this node are.
self._node_width = 0
- self._serialised_prefix = prefix
+ self._lookup_prefix = prefix
def __repr__(self):
items_str = sorted(self._items)
@@ -671,21 +710,21 @@
items_str = items_str[16] + '...]'
return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % (
self.__class__.__name__, self._key, self._len, self._size,
- self._maximum_size, self._serialised_prefix, items_str)
+ self._maximum_size, self._lookup_prefix, items_str)
def add_node(self, prefix, node):
"""Add a child node with prefix prefix, and node node.
- :param prefix: The serialised key prefix for node.
+ :param prefix: The lookup key prefix for node.
:param node: The node being added.
"""
- assert self._serialised_prefix is not None
- assert prefix.startswith(self._serialised_prefix)
- assert len(prefix) == len(self._serialised_prefix) + 1
+ assert self._lookup_prefix is not None
+ assert prefix.startswith(self._lookup_prefix)
+ assert len(prefix) == len(self._lookup_prefix) + 1
self._len += len(node)
if not len(self._items):
self._node_width = len(prefix)
- assert self._node_width == len(self._serialised_prefix) + 1
+ assert self._node_width == len(self._lookup_prefix) + 1
self._items[prefix] = node
self._key = None
@@ -720,7 +759,7 @@
result._key_width = width
result._size = len(bytes)
result._node_width = len(prefix)
- result._serialised_prefix = result.unique_serialised_prefix()
+ result._compute_lookup_prefix()
return result
def iteritems(self, store, key_filter=None):
@@ -748,10 +787,10 @@
# XXX defaultdict ?
length_filters = {}
for key in key_filter:
- serialised_key = self._serialised_prefix_filter(key)
- length_filter = length_filters.setdefault(len(serialised_key),
+ lookup_key = self._lookup_prefix_filter(key)
+ length_filter = length_filters.setdefault(len(lookup_key),
set())
- length_filter.add(serialised_key)
+ length_filter.add(lookup_key)
length_filters = length_filters.items()
for prefix, node in self._items.iteritems():
for length, length_filter in length_filters:
@@ -791,18 +830,18 @@
"""Map key to value."""
if not len(self._items):
raise AssertionError("cant map in an empty InternalNode.")
- serialised_key = self._serialised_key(key)
- assert self._node_width == len(self._serialised_prefix) + 1
- if not serialised_key.startswith(self._serialised_prefix):
+ lookup_key = self._lookup_key(key)
+ assert self._node_width == len(self._lookup_prefix) + 1
+ if not lookup_key.startswith(self._lookup_prefix):
# This key doesn't fit in this index, so we need to split at the
# point where it would fit, insert self into that internal node,
# and then map this key into that node.
- new_prefix = self.common_prefix(self._serialised_prefix,
- serialised_key)
+ new_prefix = self.common_prefix(self._lookup_prefix,
+ lookup_key)
new_parent = InternalNode(new_prefix)
new_parent.set_maximum_size(self._maximum_size)
new_parent._key_width = self._key_width
- new_parent.add_node(self._serialised_prefix[:len(new_prefix)+1],
+ new_parent.add_node(self._lookup_prefix[:len(new_prefix)+1],
self)
return new_parent.map(store, key, value)
children = self._iter_nodes(store, key_filter=[key])
@@ -810,7 +849,7 @@
child = children[0]
else:
# new child needed:
- child = self._new_child(serialised_key, LeafNode)
+ child = self._new_child(lookup_key, LeafNode)
old_len = len(child)
if isinstance(child, LeafNode):
old_size = child._current_size()
@@ -821,7 +860,7 @@
# child may have shrunk, or might be a new node
child = node_details[0][1]
self._len = self._len - old_len + len(child)
- self._items[serialised_key] = child
+ self._items[lookup_key] = child
self._key = None
new_node = self
if (isinstance(child, LeafNode)
@@ -831,26 +870,26 @@
# at this level. Or the LeafNode has shrunk in size, so we need
# to check that as well.
new_node = self._check_remap(store)
- assert new_node._serialised_prefix is not None
- return new_node._serialised_prefix, [('', new_node)]
+ assert new_node._lookup_prefix is not None
+ return new_node._lookup_prefix, [('', new_node)]
# child has overflown - create a new intermediate node.
# XXX: This is where we might want to try and expand our depth
# to refer to more bytes of every child (which would give us
# multiple pointers to child nodes, but less intermediate nodes)
- child = self._new_child(serialised_key, InternalNode)
- child._serialised_prefix = prefix
+ child = self._new_child(lookup_key, InternalNode)
+ child._lookup_prefix = prefix
for split, node in node_details:
child.add_node(split, node)
self._len = self._len - old_len + len(child)
self._key = None
- return self._serialised_prefix, [("", self)]
+ return self._lookup_prefix, [("", self)]
- def _new_child(self, serialised_key, klass):
+ def _new_child(self, lookup_key, klass):
"""Create a new child node of type klass."""
child = klass()
child.set_maximum_size(self._maximum_size)
child._key_width = self._key_width
- self._items[serialised_key] = child
+ self._items[lookup_key] = child
return child
def serialise(self, store):
@@ -883,14 +922,16 @@
_page_cache.add(self._key, ''.join(lines))
yield self._key
- def _serialised_key(self, key):
+ def _lookup_key(self, key):
"""Return the serialised key for key in this node."""
+ # lookup keys are fixed width. All will be self._node_width wide, so we
+ # pad as necessary.
return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width]
- def _serialised_prefix_filter(self, key):
+ def _lookup_prefix_filter(self, key):
"""Serialise key for use as a prefix filter in iteritems."""
if len(key) == self._key_width:
- return self._serialised_key(key)
+ return self._lookup_key(key)
return '\x00'.join(key)[:self._node_width]
def _split(self, offset):
@@ -920,27 +961,14 @@
refs.append(value.key())
return refs
- def unique_serialised_prefix(self, extra_key=None):
+ def _compute_lookup_prefix(self, extra_key=None):
"""Return the unique key prefix for this node.
- :return: A bytestring of the longest serialised key prefix that is
+ :return: A bytestring of the longest lookup key prefix that is
unique within this node.
"""
- # may want to cache this eventually :- but wait for enough
- # functionality to profile.
- keys = list(self._items.keys())
- if extra_key is not None:
- keys.append(extra_key)
- if not keys:
- return ""
- current_prefix = keys.pop(-1)
- while current_prefix and keys:
- next_key = keys.pop(-1)
- if next_key.startswith(current_prefix):
- # Don't need to do anything, the prefix already fits
- continue
- current_prefix = self.common_prefix(current_prefix, next_key)
- return current_prefix
+ self._lookup_prefix = self.common_prefix_for_keys(self._items)
+ return self._lookup_prefix
def unmap(self, store, key):
"""Remove key from this node and it's children."""
@@ -954,14 +982,14 @@
self._len -= 1
unmapped = child.unmap(store, key)
self._key = None
- serialised_key = self._serialised_key(key)
+ lookup_key = self._lookup_key(key)
if len(unmapped) == 0:
# All child nodes are gone, remove the child:
- del self._items[serialised_key]
+ del self._items[lookup_key]
unmapped = None
else:
# Stash the returned node
- self._items[serialised_key] = unmapped
+ self._items[lookup_key] = unmapped
if len(self._items) == 1:
# this node is no longer needed:
return self._items.values()[0]
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2008-12-11 22:07:00 +0000
+++ b/bzrlib/tests/test_chk_map.py 2008-12-12 23:38:30 +0000
@@ -82,8 +82,8 @@
if node_one.__class__ != node_two.__class__:
self.assertEqualDiff(map_one._dump_tree(),
map_two._dump_tree())
- self.assertEqual(node_one._serialised_prefix,
- node_two._serialised_prefix)
+ self.assertEqual(node_one._lookup_prefix,
+ node_two._lookup_prefix)
if isinstance(node_one, InternalNode):
# Internal nodes must have identical references
self.assertEqual(sorted(node_one._items.keys()),
@@ -757,7 +757,7 @@
self.assertEqual(1, chkmap._root_node._key_width)
# There should be two child nodes, and prefix of 2(bytes):
self.assertEqual(2, len(chkmap._root_node._items))
- self.assertEqual("k", chkmap._root_node.unique_serialised_prefix())
+ self.assertEqual("k", chkmap._root_node._compute_lookup_prefix())
# The actual nodes pointed at will change as serialisers change; so
# here we test that the key prefix is correct; then load the nodes and
# check they have the right pointed at key; whether they have the
@@ -999,12 +999,12 @@
def test_unique_serialised_prefix_empty_new(self):
node = LeafNode()
- self.assertEqual("", node.unique_serialised_prefix())
+ self.assertEqual("", node._compute_lookup_prefix())
def test_unique_serialised_prefix_one_item_new(self):
node = LeafNode()
node.map(None, ("foo bar", "baz"), "baz quux")
- self.assertEqual("foo bar\x00baz", node.unique_serialised_prefix())
+ self.assertEqual("foo bar\x00baz", node._compute_lookup_prefix())
def test_unmap_missing(self):
node = LeafNode()
More information about the bazaar-commits
mailing list