Rev 3767: CHKMap.iter_changes in http://people.ubuntu.com/~robertc/baz2.0/repository
Robert Collins
robertc at robertcollins.net
Thu Nov 13 02:28:59 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/repository
------------------------------------------------------------
revno: 3767
revision-id: robertc at robertcollins.net-20081113022855-8fb2mhizoehchwf1
parent: robertc at robertcollins.net-20081112221550-3z6l9ta8uvp1esa4
committer: Robert Collins <robertc at robertcollins.net>
branch nick: repository
timestamp: Thu 2008-11-13 13:28:55 +1100
message:
CHKMap.iter_changes
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2008-11-12 22:15:50 +0000
+++ b/bzrlib/chk_map.py 2008-11-13 02:28:55 +0000
@@ -37,6 +37,7 @@
"""
+import heapq
import osutils
@@ -78,9 +79,23 @@
"""Ensure that the root node is an object not a key."""
if type(self._root_node) == tuple:
# Demand-load the root
- bytes = self._read_bytes(self._root_node)
- root_key = self._root_node
- self._root_node = _deserialise(bytes, root_key)
+ self._root_node = self._get_node(self._root_node)
+
+ def _get_node(self, node):
+ """Get a node.
+
+ Note that this does not update the _items dict in objects containing a
+ reference to this node. As such it does not prevent subsequent IO from
+ being performed.
+
+ :param node: A tuple key or node object.
+ :return: A node object.
+ """
+ if type(node) == tuple:
+ bytes = self._read_bytes(node)
+ return _deserialise(bytes, node)
+ else:
+ return node
def _read_bytes(self, key):
stream = self._store.get_record_stream([key], 'unordered', True)
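
For illustration, the demand-load idiom behind _ensure_root/_get_node boils down to the following minimal sketch, assuming a plain dict-backed store and a toy deserialiser (illustrative stand-ins only, not the bzrlib get_record_stream/_deserialise interfaces):

    # Minimal sketch of the demand-load idiom: keys are read and inflated on
    # first use, already-loaded node objects pass straight through.
    def get_node(store, deserialise, node_or_key):
        """Return a node object, reading from the store only when given a key."""
        if type(node_or_key) == tuple:
            # Still a key tuple: fetch the serialised bytes and inflate them.
            return deserialise(store[node_or_key], node_or_key)
        # Already a node object: no IO needed.
        return node_or_key

    store = {('sha1:1234',): "serialised leaf bytes"}
    deserialise = lambda data, key: {'key': key, 'data': data}
    node = get_node(store, deserialise, ('sha1:1234',))
    assert node == get_node(store, deserialise, node)  # objects pass through

As the docstring above notes, the caller is responsible for caching the returned object back into its own _items if it wants to avoid repeated reads.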
@@ -113,20 +128,156 @@
is None for keys only in self; new_value is None for keys only in
basis.
"""
- # Cheap but let tests pass
- new = dict(self.iteritems())
- old = dict(basis.iteritems())
- new_keys = set(new)
- old_keys = set(old)
- result = []
- for key in new_keys - old_keys:
- result.append((key, None, new[key]))
- for key in old_keys - new_keys:
- result.append((key, old[key], None))
- for key in old_keys.intersection(new_keys):
- if new[key] != old[key]:
- result.append((key, old[key], new[key]))
- return result
+ # Overview:
+ # Read both trees in lexicographic, highest-first order.
+ # Any identical nodes we skip
+ # Any unique prefixes we output immediately.
+ # Values in a leaf node are treated as single-value nodes in the tree,
+ # so they need no special-casing. We know to output them
+ # because their value is a string, not a key (tuple) or node.
+ #
+ # Corner cases to beware of when considering this function:
+ # *) common references are at different heights.
+ # Consider two trees:
+ # {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b':'bar'}}
+ # {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'}, 'ab':LeafNode={'ab':'bar'}},
+ # 'b': LeafNode={'b':'bar'}}
+ # The node with aaa/aab will only be encountered in the second tree
+ # after reading the 'a' subtree, but it is encountered in the first
+ # tree immediately. Variations on this can also occur with internal nodes.
+ # We want to cut the entire pending subtree when we realise we have a common node.
+ # For this we carry the path of keys down to each node and check that the
+ # entire path is clean as we process each item.
+ if self._node_key(self._root_node) == self._node_key(basis._root_node):
+ return
+ self._ensure_root()
+ basis._ensure_root()
+ excluded_keys = set()
+ self_node = self._root_node
+ basis_node = basis._root_node
+ # A heap; each element is (prefix, node (tuple/node object/string),
+ # key_path (nested (key, parent) tuples, tail-sharing down the tree)).
+ self_pending = []
+ basis_pending = []
+ def process_node(prefix, node, path, a_map, pending):
+ # take a node and expand it
+ node = a_map._get_node(node)
+ if type(node) == LeafNode:
+ path = (node._key, path)
+ for key, value in node._items.items():
+ heapq.heappush(pending, ('\x00'.join(key), value, path))
+ else:
+ # type(node) == InternalNode
+ path = (node._key, path)
+ for prefix, child in node._items.items():
+ heapq.heappush(pending, (prefix, child, path))
+ process_node(None, self_node, None, self, self_pending)
+ process_node(None, basis_node, None, basis, basis_pending)
+ self_seen = set()
+ basis_seen = set()
+ excluded_keys = set()
+ def check_excluded(key_path):
+ # Note that this is O(N^2); it depends on us trimming trees
+ # aggressively to avoid becoming slow.
+ # A better implementation would probably keep a reverse map
+ # back to the children of a node, and jump straight to it when
+ # a common node is detected, then proceed to remove the already
+ # pending children. bzrlib.graph has a searcher module with a
+ # similar problem.
+ while key_path is not None:
+ key, key_path = key_path
+ if key in excluded_keys:
+ return True
+ return False
+
+ while self_pending or basis_pending:
+ if not self_pending:
+ # self is exhausted: output remainder of basis
+ for prefix, node, path in basis_pending:
+ if check_excluded(path):
+ continue
+ node = basis._get_node(node)
+ if type(node) == str:
+ # a value
+ yield (tuple(prefix.split('\x00')), node, None)
+ else:
+ # subtree - fastpath the entire thing.
+ for key, value in node.iteritems(basis._store):
+ yield (key, value, None)
+ return
+ elif not basis_pending:
+ # basis is exhausted: output remainder of self.
+ for prefix, node, path in self_pending:
+ if check_excluded(path):
+ continue
+ node = self._get_node(node)
+ excluded_keys.add(node._key)
+ if type(node) == str:
+ # a value
+ yield (tuple(prefix.split('\x00')), None, node)
+ else:
+ # subtree - fastpath the entire thing.
+ for key, value in node.iteritems(self._store):
+ yield (key, None, value)
+ return
+ else:
+ # XXX: future optimisation - yield the smaller items
+ # immediately rather than pushing everything on/off the
+ # heaps. Applies to both internal nodes and leafnodes.
+ if self_pending[0][0] < basis_pending[0][0]:
+ # expand self
+ prefix, node, path = heapq.heappop(self_pending)
+ if check_excluded(path):
+ continue
+ if type(node) == str:
+ # a value
+ yield (tuple(prefix.split('\x00')), None, node)
+ else:
+ process_node(prefix, node, path, self, self_pending)
+ continue
+ elif self_pending[0][0] > basis_pending[0][0]:
+ # expand basis
+ prefix, node, path = heapq.heappop(basis_pending)
+ if check_excluded(path):
+ continue
+ if type(node) == str:
+ # a value
+ yield (tuple(prefix.split('\x00')), node, None)
+ else:
+ process_node(prefix, node, path, basis, basis_pending)
+ continue
+ else:
+ # common prefix: possibly expand both
+ if type(self_pending[0][1]) != str:
+ # process next self
+ read_self = True
+ else:
+ read_self = False
+ if type(basis_pending[0][1]) != str:
+ # process next basis
+ read_basis = True
+ else:
+ read_basis = False
+ if not read_self and not read_basis:
+ # compare a common value
+ self_details = heapq.heappop(self_pending)
+ basis_details = heapq.heappop(basis_pending)
+ if self_details[1] != basis_details[1]:
+ yield (tuple(self_details[0].split('\x00')),
+ basis_details[1], self_details[1])
+ continue
+ # At least one side wasn't a string, we need to expand it
+ # before we can continue
+ if read_self:
+ prefix, node, path = heapq.heappop(self_pending)
+ if check_excluded(path):
+ continue
+ process_node(prefix, node, path, self, self_pending)
+ if read_basis:
+ prefix, node, path = heapq.heappop(basis_pending)
+ if check_excluded(path):
+ continue
+ process_node(prefix, node, path, basis, basis_pending)
def iteritems(self, key_filter=None):
"""Iterate over the entire CHKMap's contents."""
@@ -158,31 +309,18 @@
for split, node in node_details:
self._root_node.add_node(split, node)
- def _map(self, key, value):
- """Map key to value."""
- # Ne
- self._ensure_root()
- # Store the value
- bytes = ValueNode(value).serialise()
- # Experimental code to probe for keys rather than just adding; its not
- # clear if it is an improvement.
- #chk = ("sha1:%s" % osutils.sha_string(bytes),)
- #if not self._store.get_parent_map([key]):
- sha1, _, _ = self._store.add_lines((None,), (), osutils.split_lines(bytes))
- chk = ("sha1:" + sha1,)
- # And link into the root
- self._root_node.add_child(key, chk)
+ def _node_key(self, node):
+ """Get the key for a node whether its a tuple o r node."""
+ if type(node) == tuple:
+ return node
+ else:
+ return node._key
def unmap(self, key):
"""remove key from the map."""
self._ensure_root()
self._root_node.unmap(self._store, key)
- def _unmap(self, key):
- """remove key from the map."""
- self._ensure_root()
- self._root_node.remove_child(key)
-
def _save(self):
"""Save the map completely.
@@ -442,11 +580,11 @@
:return: An iterable of nodes.
"""
nodes = []
- keys = set()
+ keys = {}
if key_filter is None:
- for node in self._items.itervalues():
+ for prefix, node in self._items.iteritems():
if type(node) == tuple:
- keys.add(node)
+ keys[node] = prefix
else:
nodes.append(node)
else:
@@ -455,7 +593,7 @@
for prefix, node in self._items.iteritems():
if prefix in serialised_filter:
if type(node) == tuple:
- keys.add(node)
+ keys[node] = prefix
else:
nodes.append(node)
if keys:
@@ -464,6 +602,7 @@
for record in stream:
node = _deserialise(record.get_bytes_as('fulltext'), record.key)
nodes.append(node)
+ self._items[keys[record.key]] = node
return nodes
def map(self, store, key, value):
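
The switch just above from a set of keys to a prefix-keyed dict exists so that freshly deserialised children can be cached back into self._items. A sketch of that bookkeeping, assuming a hypothetical batch-read helper (read_records below is a stand-in, not a bzrlib call):

    # Sketch of the prefix -> key bookkeeping: remember which prefix each
    # unread key hangs from, then cache the deserialised node back into the
    # parent's items dict so later traversals do no further IO.
    def load_children(items, read_records):
        keys = {}                        # node key -> prefix it belongs to
        for prefix, child in items.items():
            if type(child) == tuple:     # an unread reference
                keys[child] = prefix
        for key, node in read_records(list(keys)):
            items[keys[key]] = node      # cache in place
        return items

    items = {'aa': ('sha1:aa-key',), 'ab': 'already-loaded-node'}
    read_records = lambda ks: [(k, 'node for %r' % (k,)) for k in ks]
    load_children(items, read_records)
    assert type(items['aa']) != tuple    # now an object, not a key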
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2008-11-12 22:15:50 +0000
+++ b/bzrlib/tests/test_chk_map.py 2008-11-13 02:28:55 +0000
@@ -111,12 +111,126 @@
def test_iter_changes_empty_ab(self):
# Asking for changes from an empty dict to a dict with keys returns
# all the keys.
- basis = self._get_map({})
+ basis = self._get_map({}, maximum_size=10)
target = self._get_map(
{('a',): 'content here', ('b',): 'more content'},
- chk_bytes=basis._store)
+ chk_bytes=basis._store, maximum_size=10)
self.assertEqual([(('a',), None, 'content here'),
- (('b',), None, 'more content')], list(target.iter_changes(basis)))
+ (('b',), None, 'more content')],
+ sorted(list(target.iter_changes(basis))))
+
+ def test_iter_changes_ab_empty(self):
+ # Asking for changes from a dict with keys to an empty dict returns
+ # all the keys.
+ basis = self._get_map({('a',): 'content here', ('b',): 'more content'},
+ maximum_size=10)
+ target = self._get_map({}, chk_bytes=basis._store, maximum_size=10)
+ self.assertEqual([(('a',), 'content here', None),
+ (('b',), 'more content', None)],
+ sorted(list(target.iter_changes(basis))))
+
+ def test_iter_changes_empty_empty_is_empty(self):
+ basis = self._get_map({}, maximum_size=10)
+ target = self._get_map({}, chk_bytes=basis._store, maximum_size=10)
+ self.assertEqual([], sorted(list(target.iter_changes(basis))))
+
+ def test_iter_changes_ab_ab_is_empty(self):
+ basis = self._get_map({('a',): 'content here', ('b',): 'more content'},
+ maximum_size=10)
+ target = self._get_map(
+ {('a',): 'content here', ('b',): 'more content'},
+ chk_bytes=basis._store, maximum_size=10)
+ self.assertEqual([], sorted(list(target.iter_changes(basis))))
+
+ def test_iter_changes_ab_ab_nodes_not_loaded(self):
+ basis = self._get_map({('a',): 'content here', ('b',): 'more content'},
+ maximum_size=10)
+ target = self._get_map(
+ {('a',): 'content here', ('b',): 'more content'},
+ chk_bytes=basis._store, maximum_size=10)
+ list(target.iter_changes(basis))
+ self.assertIsInstance(target._root_node, tuple)
+ self.assertIsInstance(basis._root_node, tuple)
+
+ def test_iter_changes_ab_ab_changed_values_shown(self):
+ basis = self._get_map({('a',): 'content here', ('b',): 'more content'},
+ maximum_size=10)
+ target = self._get_map(
+ {('a',): 'content here', ('b',): 'different content'},
+ chk_bytes=basis._store, maximum_size=10)
+ result = sorted(list(target.iter_changes(basis)))
+ self.assertEqual([(('b',), 'more content', 'different content')],
+ result)
+
+ def test_iter_changes_mixed_node_length(self):
+ # When one side has different node lengths than the other, common
+ # but different keys still need to be shown, and new-and-old included
+ # appropriately.
+ # aaa - common unaltered
+ # aab - common altered
+ # b - basis only
+ # at - target only
+ # we expect:
+ # aaa not to be loaded (later test)
+ # aab, b, at to be returned.
+ # basis splits at bytes 0, 1, 2; aaa is common, b is basis only
+ basis_dict = {('aaa',): 'foo bar',
+ ('aab',): 'common altered a', ('b',): 'foo bar b'}
+ # target splits at byte 1,2, at is target only
+ target_dict = {('aaa',): 'foo bar',
+ ('aab',): 'common altered b', ('at',): 'foo bar t'}
+ changes = [
+ (('aab',), 'common altered a', 'common altered b'),
+ (('at',), None, 'foo bar t'),
+ (('b',), 'foo bar b', None),
+ ]
+ basis = self._get_map(basis_dict, maximum_size=10)
+ target = self._get_map(target_dict, maximum_size=10,
+ chk_bytes=basis._store)
+ self.assertEqual(changes, sorted(list(target.iter_changes(basis))))
+
+ def test_iter_changes_common_pages_not_loaded(self):
+ # aaa - common unaltered
+ # aab - common altered
+ # b - basis only
+ # at - target only
+ # we expect:
+ # aaa not to be loaded
+ # aaa not to be in result.
+ basis_dict = {('aaa',): 'foo bar',
+ ('aab',): 'common altered a', ('b',): 'foo bar b'}
+ # target splits at byte 1, at is target only
+ target_dict = {('aaa',): 'foo bar',
+ ('aab',): 'common altered b', ('at',): 'foo bar t'}
+ basis = self._get_map(basis_dict, maximum_size=10)
+ target = self._get_map(target_dict, maximum_size=10,
+ chk_bytes=basis._store)
+ result = sorted(list(target.iter_changes(basis)))
+ for change in result:
+ if change[0] == ('aaa',):
+ self.fail("Found unexpected change: %s" % change)
+ return
+ # check both target and basis did not load the aaa pointer
+ self.assertIsInstance(target._root_node._items['aa']._items['aaa'],
+ tuple)
+ self.assertIsInstance(basis._root_node._items['a']._items['aaa'],
+ tuple)
+
+ def test_iter_changes_unchanged_keys_in_multi_key_leafs_ignored(self):
+ # Within a leaf there are no hashes to exclude keys, so make sure
+ # multi-value leaf nodes are handled well.
+ basis_dict = {('aaa',): 'foo bar',
+ ('aab',): 'common altered a', ('b',): 'foo bar b'}
+ target_dict = {('aaa',): 'foo bar',
+ ('aab',): 'common altered b', ('at',): 'foo bar t'}
+ changes = [
+ (('aab',), 'common altered a', 'common altered b'),
+ (('at',), None, 'foo bar t'),
+ (('b',), 'foo bar b', None),
+ ]
+ basis = self._get_map(basis_dict)
+ target = self._get_map(target_dict, chk_bytes=basis._store)
+ self.assertEqual(changes, sorted(list(target.iter_changes(basis))))
def test_iteritems_empty(self):
chk_bytes = self.get_chk_bytes()
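
One detail of iter_changes that these tests exercise indirectly is the exclusion machinery: each pending entry carries its path as nested (key, parent_path) pairs, so sibling entries share tails and check_excluded can drop a whole pending subtree once any ancestor key has been marked as excluded. A standalone sketch of that structure, with made-up keys:

    # Sketch of the tail-sharing paths walked by check_excluded: a path is a
    # chain of (key, parent_path) pairs ending in None, so siblings share tails.
    def is_excluded(path, excluded_keys):
        """Return True if any ancestor key on `path` has been excluded."""
        while path is not None:
            key, path = path
            if key in excluded_keys:
                return True
        return False

    root_path = (('sha1:root',), None)
    leaf_path = (('sha1:leaf-aa',), root_path)   # shares its tail with siblings
    excluded = {('sha1:leaf-aa',)}
    print(is_excluded((('sha1:value',), leaf_path), excluded))  # True: ancestor cut
    print(is_excluded(root_path, excluded))                     # False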