Rev 3779: Start working on an iter_interesting_nodes, which can find nodes to transmit in http://bzr.arbash-meinel.com/branches/bzr/brisbane/fetch

John Arbash Meinel john at arbash-meinel.com
Fri Nov 14 08:44:51 GMT 2008


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/fetch

------------------------------------------------------------
revno: 3779
revision-id: john at arbash-meinel.com-20081114084445-a0en1xevr1oef4xj
parent: robertc at robertcollins.net-20081114021953-ckqpcsakzrk1ns1l
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: fetch
timestamp: Fri 2008-11-14 02:44:45 -0600
message:
  Start working on an iter_interesting_nodes, which can find nodes to transmit
  in 'parallel'. Having some small problems when looking at multiple dicts.
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2008-11-14 02:13:12 +0000
+++ b/bzrlib/chk_map.py	2008-11-14 08:44:45 +0000
@@ -767,3 +767,72 @@
         return InternalNode.deserialise(bytes, key)
     else:
         raise AssertionError("Unknown node type.")
+
+
+def iter_interesting_nodes(store, interesting_root_keys,
+                           uninteresting_root_keys):
+    """Given root keys, find interesting nodes.
+
+    Evaluate nodes referenced by interesting_root_keys. Ones that are also
+    referenced from uninteresting_root_keys are not considered interesting.
+
+    :param interesting_root_keys: keys which should be part of the
+        "interesting" nodes (which will be yielded)
+    :param uninteresting_root_keys: keys which should be filtered out of the
+        result set.
+    :return: Yields tuples of
+        (interesting records, interesting chk's, interesting key:values)
+    """
+    uninteresting_keys = set(uninteresting_root_keys)
+    interesting_keys = set(interesting_root_keys)
+    # What about duplicates with uninteresting_root_keys?
+    interesting_chks = set(interesting_keys)
+    # TODO: consider that it may be more memory efficient to use the 20-byte
+    #       sha1 string, rather than tuples of hexadecimal sha1 strings.
+    uninteresting_chks = set(uninteresting_keys)
+    uninteresting_key_values = set()
+
+    # XXX: First attempt, UGLY, UGLY, UGLY
+    # First, find the full set of uninteresting bits reachable by the
+    # uninteresting roots
+    chks_to_read = uninteresting_keys
+    while chks_to_read:
+        next_chks = set()
+        for record in store.get_record_stream(chks_to_read, 'unordered', True):
+            # TODO: Handle 'absent'
+            node = _deserialise(record.get_bytes_as('fulltext'), record.key)
+            if isinstance(node, InternalNode):
+                # uninteresting_prefix_chks.update(node._items.iteritems())
+                chks = node._items.values()
+                # TODO: We remove the entries that are already in
+                #       uninteresting_chks ?
+                next_chks.update(chks)
+                uninteresting_chks.update(chks)
+            else:
+                uninteresting_key_values.update(node._items.iteritems())
+        chks_to_read = next_chks
+
+    # Is it possible that we would need to filter out the references we know to
+    # be uninteresting, e.g.: interesting_keys.difference(uninteresting_chks)
+    chks_to_read = interesting_keys
+    while chks_to_read:
+        next_chks = set()
+        records = {}
+        interesting_items = []
+        interesting_chks = set()
+        for record in store.get_record_stream(chks_to_read, 'unordered', True):
+            records[record.key] = record
+            # TODO: Handle 'absent'
+            node = _deserialise(record.get_bytes_as('fulltext'), record.key)
+            if isinstance(node, InternalNode):
+                chks = [chk for chk in node._items.itervalues()
+                             if chk not in uninteresting_chks]
+                next_chks.update(chks)
+                # These are now uninteresting everywhere else
+                uninteresting_chks.update(chks)
+            else:
+                interesting_items = [item for item in node._items.iteritems()
+                                     if item not in uninteresting_key_values]
+                uninteresting_key_values.update(interesting_items)
+        yield records, chks_to_read, interesting_items
+        chks_to_read = next_chks

=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py	2008-11-13 03:20:56 +0000
+++ b/bzrlib/tests/test_chk_map.py	2008-11-14 08:44:45 +0000
@@ -16,6 +16,9 @@
 
 """Tests for maps built on a CHK versionedfiles facility."""
 
+from itertools import izip
+
+from bzrlib import chk_map
 from bzrlib.chk_map import (
     CHKMap,
     InternalNode,
@@ -738,3 +741,115 @@
 # BA
 # AB - split, but we want to end up with AB, BA, in one node, with 
 # 1-4K get0
+
+
+class TestIterInterestingNodes(TestCaseWithStore):
+
+    def get_chk_bytes(self):
+        if getattr(self, '_chk_bytes', None) is None:
+            self._chk_bytes = super(TestIterInterestingNodes,
+                                    self).get_chk_bytes()
+        return self._chk_bytes
+
+    def get_map_key(self, a_dict):
+        c_map = self._get_map(a_dict, maximum_size=10,
+                              chk_bytes=self.get_chk_bytes())
+        return c_map.key()
+
+    def assertIterInteresting(self, expected, interesting_keys,
+                              uninteresting_keys):
+        """Check the result of iter_interesting_nodes.
+
+        :param expected: A list of (record_keys, interesting_chk_pages,
+                                    interesting key value pairs)
+        """
+        store = self.get_chk_bytes()
+        iter_nodes = chk_map.iter_interesting_nodes(store, interesting_keys,
+                                                    uninteresting_keys)
+        for count, (exp, act) in enumerate(izip(expected, iter_nodes)):
+            exp_record_keys, exp_chks, exp_items = exp
+            records, chks, items = act
+            exp_tuple = (sorted(exp_record_keys), sorted(exp_chks), items)
+            act_tuple = (sorted(records.keys()), sorted(chks), items)
+            self.assertEqual(exp_tuple, act_tuple)
+        self.assertEqual(len(expected), count + 1)
+
+    def test_none_to_one_key(self):
+        basis = self.get_map_key({})
+        target = self.get_map_key({('a',): 'content'})
+        self.assertIterInteresting(
+            [([target], [target], [(('a',), 'content')])],
+            [target], [basis])
+
+    def test_one_to_none_key(self):
+        basis = self.get_map_key({('a',): 'content'})
+        target = self.get_map_key({})
+        self.assertIterInteresting(
+            [([target], [target], [])],
+            [target], [basis])
+
+    def test_common_pages(self):
+        basis = self.get_map_key({('a',): 'content',
+                                  ('b',): 'content',
+                                  ('c',): 'content',
+                                 })
+        target = self.get_map_key({('a',): 'content',
+                                   ('b',): 'other content',
+                                   ('c',): 'content',
+                                  })
+        # Is there a way to get this more directly?
+        b_key = ('sha1:1d7a45ded01ab77c069350c0e290ae34db5b549b',)
+        # This should return the root node, and the node for the 'b' key
+        self.assertIterInteresting(
+            [([target], [target], []),
+             ([b_key], [b_key], [(('b',), 'other content')])],
+            [target], [basis])
+
+    def test_common_sub_page(self):
+        basis = self.get_map_key({('aaa',): 'common',
+                                  ('c',): 'common',
+                                 })
+        target = self.get_map_key({('aaa',): 'common',
+                                   ('aab',): 'new',
+                                   ('c',): 'common',
+                                  })
+        # The key for the internal aa node
+        aa_key = ('sha1:2ce01860338a614b93883a5bbeb89920137ac7ef',)
+        # The key for the leaf aab node
+        aab_key = ('sha1:10567a3bfcc764fb8d8d9edaa28c0934ada366c5',)
+        self.assertIterInteresting(
+            [([target], [target], []),
+             ([aa_key], [aa_key], []),
+             ([aab_key], [aab_key], [(('aab',), 'new')])],
+            [target], [basis])
+
+    def test_multiple_maps(self):
+        basis1 = self.get_map_key({('aaa',): 'common',
+                                   ('aab',): 'basis1',
+                                  })
+        basis2 = self.get_map_key({('bbb',): 'common',
+                                   ('bbc',): 'basis2',
+                                  })
+        target1 = self.get_map_key({('aaa',): 'common',
+                                    ('aac',): 'target1',
+                                    ('bbb',): 'common',
+                                   })
+        target2 = self.get_map_key({('aaa',): 'common',
+                                    ('bba',): 'target2',
+                                    ('bbb',): 'common',
+                                   })
+        # The key for the target1 internal aa node
+        aa_key = ('sha1:4c6b1e3e6ecb68fe039d2b00c9091bc037ebf203',)
+        # The key for the leaf aac node
+        aac_key = ('sha1:8089f6b4f3bd2a058c41be199ef5af0c5b9a0c4f',)
+        # The key for the target2 internal bb node
+        bb_key = ('sha1:5ce6a69a21060222bb0a5b48fdbfcca586cc9183',)
+        # The key for the leaf bba node
+        bba_key = ()
+        import pdb; pdb.set_trace()
+        self.assertIterInteresting(
+            [([target1, target2], [target1, target2], []),
+             ([aa_key, bb_key], [aa_key, bb_key], []),
+             ([aac_key, bba_key], [aac_key, bba_key],
+              [(('aac',), 'target1'), (('bba',), 'target2')]),
+            ], [target1, target2], [basis1, basis2])



More information about the bazaar-commits mailing list