Rev 3212: Change the smart server get_parent_map method to take a graph search describing the parents already received, so they can be excluded from the reply. This prevents history shortcuts from causing huge numbers of duplicates. in http://people.ubuntu.com/~robertc/baz2.0/get_parents.duplicates

Robert Collins robertc at robertcollins.net
Sun Feb 3 22:55:13 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/get_parents.duplicates

------------------------------------------------------------
revno: 3212
revision-id:robertc at robertcollins.net-20080203225508-0rogbg0ggonuqfhp
parent: pqm at pqm.ubuntu.com-20080201053934-q32y2nk5vvo13c6v
committer: Robert Collins <robertc at robertcollins.net>
branch nick: get_parents.duplicates
timestamp: Mon 2008-02-04 09:55:08 +1100
message:
  Change the smart server get_parent_map method to take a graph search describing the parents already received, so they can be excluded from the reply. This prevents history shortcuts from causing huge numbers of duplicates.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
  bzrlib/remote.py               remote.py-20060720103555-yeeg2x51vn0rbtdp-1
  bzrlib/smart/protocol.py       protocol.py-20061108035435-ot0lstk2590yqhzr-1
  bzrlib/smart/repository.py     repository.py-20061128022038-vr5wy5bubyb8xttk-1
  bzrlib/tests/repository_implementations/test_repository.py test_repository.py-20060131092128-ad07f494f5c9d26c
  bzrlib/tests/test_remote.py    test_remote.py-20060720103555-yeeg2x51vn0rbtdp-2
  bzrlib/tests/test_smart.py     test_smart.py-20061122024551-ol0l0o0oofsu9b3t-2
=== modified file 'NEWS'
--- a/NEWS	2008-02-01 05:39:34 +0000
+++ b/NEWS	2008-02-03 22:55:08 +0000
@@ -29,6 +29,11 @@
     * ``merge --preview`` produces a diff of the changes merge would make,
       but does not actually perform the merge.  (Aaron Bentley)
 
+    * New smart method ``Repository.get_parent_map`` for getting revision
+      parent data. This returns additional parent information topologically
+      adjacent to the requested data to reduce round trip latency impacts.
+      (Robert Collins)
+
     * New smart method, ``Repository.stream_revisions_chunked``, for fetching
       revision data that streams revision data via a chunked encoding.  This
       avoids buffering large amounts of revision data on the server and on the

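For readers following along, the wire format this verb introduces is small: the request body is a serialised search recipe (space-separated start keys, space-separated stop keys, and a revision count, joined by newlines), and the response body is a gzipped, newline-separated list of 'revision_id parent1 parent2 ...' lines. A standalone sketch of that encoding, using illustrative helper names rather than bzrlib API (Python 2, to match bzrlib of this era):

    import gzip
    from StringIO import StringIO

    def encode_recipe(start_keys, stop_keys, count):
        # Request body: start keys, stop keys, revision count.
        return '\n'.join(
            (' '.join(start_keys), ' '.join(stop_keys), str(count)))

    def decode_parent_map(body_bytes):
        # Response body: gzipped lines of 'revision_id parent1 parent2 ...'.
        text = gzip.GzipFile(mode='rb', fileobj=StringIO(body_bytes)).read()
        result = {}
        for line in text.split('\n'):
            if not line:
                continue
            parts = line.split(' ')
            result[parts[0]] = tuple(parts[1:])
        return result

The authoritative details are in the bzrlib/remote.py and bzrlib/smart/repository.py hunks below.
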
=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2008-01-29 06:56:51 +0000
+++ b/bzrlib/graph.py	2008-02-03 22:55:08 +0000
@@ -514,7 +514,7 @@
             # exclude keys for them. However, while we could have a second
             # look-ahead result buffer and shuffle things around, this method
             # is typically only called once per search - when memoising the
-            # results of the search.
+            # results of the search. 
             found, ghosts, next, parents = self._do_query(self._next_query)
             # pretend we didn't query: perhaps we should tweak _do_query to be
             # entirely stateless?

=== modified file 'bzrlib/remote.py'
--- a/bzrlib/remote.py	2008-01-29 03:22:52 +0000
+++ b/bzrlib/remote.py	2008-02-03 22:55:08 +0000
@@ -42,6 +42,7 @@
     )
 from bzrlib.revision import NULL_REVISION
 from bzrlib.trace import mutter, note
+from bzrlib.tuned_gzip import GzipFile
 
 # Note: RemoteBzrDirFormat is in bzrdir.py
 
@@ -268,6 +269,8 @@
         self._leave_lock = False
         # A cache of looked up revision parent data; reset at unlock time.
         self._parents_map = None
+        if 'hpss' in debug.debug_flags:
+            self._requested_parents = None
         # For tests:
         # These depend on the actual remote format, so force them off for
         # maximum compatibility. XXX: In future these should depend on the
@@ -472,6 +475,8 @@
             self._lock_mode = 'r'
             self._lock_count = 1
             self._parents_map = {}
+            if 'hpss' in debug.debug_flags:
+                self._requested_parents = set()
             if self._real_repository is not None:
                 self._real_repository.lock_read()
         else:
@@ -510,6 +515,8 @@
             self._lock_mode = 'w'
             self._lock_count = 1
             self._parents_map = {}
+            if 'hpss' in debug.debug_flags:
+                self._requested_parents = set()
         elif self._lock_mode == 'r':
             raise errors.ReadOnlyError(self)
         else:
@@ -570,6 +577,8 @@
         if self._lock_count > 0:
             return
         self._parents_map = None
+        if 'hpss' in debug.debug_flags:
+            self._requested_parents = None
         old_mode = self._lock_mode
         self._lock_mode = None
         try:
@@ -772,7 +781,12 @@
                         len(set(self._parents_map).intersection(parent_map)),
                         len(parent_map))
             self._parents_map.update(parent_map)
-        return dict((k, ancestry[k]) for k in keys if k in ancestry)
+        present_keys = [k for k in keys if k in ancestry]
+        if 'hpss' in debug.debug_flags:
+            self._requested_parents.update(present_keys)
+            mutter('Current RemoteRepository graph hit rate: %d%%',
+                100.0 * len(self._requested_parents) / len(self._parents_map))
+        return dict((k, ancestry[k]) for k in present_keys)
 
     def _response_is_unknown_method(self, response, verb):
         """Return True if response is an unknonwn method response to verb.
@@ -802,12 +816,33 @@
                 return found_parents
         else:
             found_parents = {}
+        # TODO(Needs analysis): We could assume that the keys being requested
+        # from get_parent_map are in a breadth first search, so typically they
+        # will all be depth N from some common parent, and we don't have to
+        # have the server iterate from the root parent, but rather from the
+        # keys we're searching; and just tell the server the keyspace we
+        # already have; but this may be more traffic again.
+
+        # Transform self._parents_map into a search request recipe.
+        # TODO: Manage this incrementally to avoid covering the same path
+        # repeatedly. (The server will have to on each request, but the less
+        # work done the better).
+        start_set = set(self._parents_map)
+        result_parents = set()
+        for parents in self._parents_map.itervalues():
+            result_parents.update(parents)
+        stop_keys = result_parents.difference(start_set)
+        included_keys = start_set.intersection(result_parents)
+        start_set.difference_update(included_keys)
+        recipe = (start_set, stop_keys, len(self._parents_map))
+        body = self._serialise_search_recipe(recipe)
         path = self.bzrdir._path_for_remote_call(self._client)
         for key in keys:
             assert type(key) is str
         verb = 'Repository.get_parent_map'
-        response = self._client.call_expecting_body(
-            verb, path, *keys)
+        args = (path,) + tuple(keys)
+        response = self._client.call_with_body_bytes_expecting_body(
+            verb, args, self._serialise_search_recipe(recipe))
         if self._response_is_unknown_method(response, verb):
             # Server that does not support this method, get the whole graph.
             response = self._client.call_expecting_body(
@@ -819,7 +854,8 @@
             reponse[1].cancel_read_body()
             raise errors.UnexpectedSmartServerResponse(response[0])
         if response[0][0] == 'ok':
-            coded = response[1].read_body_bytes()
+            coded = GzipFile(mode='rb',
+                fileobj=StringIO(response[1].read_body_bytes())).read()
             if coded == '':
                 # no revisions found
                 return {}
@@ -972,11 +1008,7 @@
     def get_data_stream_for_search(self, search):
         REQUEST_NAME = 'Repository.stream_revisions_chunked'
         path = self.bzrdir._path_for_remote_call(self._client)
-        recipe = search.get_recipe()
-        start_keys = ' '.join(recipe[0])
-        stop_keys = ' '.join(recipe[1])
-        count = str(recipe[2])
-        body = '\n'.join((start_keys, stop_keys, count))
+        body = self._serialise_search_recipe(search.get_recipe())
         response, protocol = self._client.call_with_body_bytes_expecting_body(
             REQUEST_NAME, (path,), body)
 
@@ -1037,6 +1069,17 @@
     def _make_parents_provider(self):
         return self
 
+    def _serialise_search_recipe(self, recipe):
+        """Serialise a graph search recipe.
+
+        :param recipe: A search recipe (start, stop, count).
+        :return: Serialised bytes.
+        """
+        start_keys = ' '.join(recipe[0])
+        stop_keys = ' '.join(recipe[1])
+        count = str(recipe[2])
+        return '\n'.join((start_keys, stop_keys, count))
+
 
 class RemoteBranchLockableFiles(LockableFiles):
     """A 'LockableFiles' implementation that talks to a smart server.

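Note how RemoteRepository._get_parent_map turns its cached parent map into a recipe: every cached key that does not appear as someone's parent becomes a start key, every parent that is not itself cached becomes a stop key, and the count is the number of cached keys. A rough standalone restatement of that construction (the function name is illustrative only; parent_map stands in for self._parents_map):

    def recipe_from_parent_map(parent_map):
        # parent_map is a plain {revision_id: (parent_id, ...)} dict.
        start_set = set(parent_map)
        result_parents = set()
        for parents in parent_map.itervalues():
            result_parents.update(parents)
        # Parents we have not looked up yet bound the search.
        stop_keys = result_parents.difference(start_set)
        # Keys that are also someone's parent are interior, not start points.
        included_keys = start_set.intersection(result_parents)
        start_set.difference_update(included_keys)
        return (start_set, stop_keys, len(parent_map))

For a linear history A <- B <- C with C and B already cached, this gives (set(['C']), set(['A']), 2): start at C, stop at A, two revisions already covered.
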
=== modified file 'bzrlib/smart/protocol.py'
--- a/bzrlib/smart/protocol.py	2008-01-21 23:04:54 +0000
+++ b/bzrlib/smart/protocol.py	2008-02-03 22:55:08 +0000
@@ -488,6 +488,8 @@
                 mutter('                  (to %s)', self._request._medium._path)
             mutter('              %d bytes', len(body))
             self._request_start_time = time.time()
+            if 'hpssdetail' in debug.debug_flags:
+                mutter('hpss body content: %s', body)
         self._write_args(args)
         bytes = self._encode_bulk_data(body)
         self._request.accept_bytes(bytes)

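The protocol change above only dumps full body content when the finer-grained 'hpssdetail' debug flag is set, so ordinary hpss traces stay small. The gating pattern, shown standalone (debug_flags and log here are stand-ins, not bzrlib API):

    debug_flags = set(['hpss'])

    def log(message, *args):
        print message % args

    def log_request_body(args, body):
        if 'hpss' in debug_flags:
            log('hpss body: %d bytes', len(body))
            if 'hpssdetail' in debug_flags:
                # Full body dumps are opt-in; they can be very large.
                log('hpss body content: %s', body)
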
=== modified file 'bzrlib/smart/repository.py'
--- a/bzrlib/smart/repository.py	2008-01-17 07:47:52 +0000
+++ b/bzrlib/smart/repository.py	2008-02-03 22:55:08 +0000
@@ -31,6 +31,7 @@
     SuccessfulSmartServerResponse,
     )
 from bzrlib import revision as _mod_revision
+from bzrlib.tuned_gzip import GzipFile, bytes_to_gzip
 
 
 class SmartServerRepositoryRequest(SmartServerRequest):
@@ -58,29 +59,76 @@
         # is expected)
         return None
 
+    def recreate_search(self, repository, recipe_bytes):
+        lines = recipe_bytes.split('\n')
+        start_keys = set(lines[0].split(' '))
+        exclude_keys = set(lines[1].split(' '))
+        revision_count = int(lines[2])
+        repository.lock_read()
+        try:
+            search = repository.get_graph()._make_breadth_first_searcher(
+                start_keys)
+            while True:
+                try:
+                    next_revs = search.next()
+                except StopIteration:
+                    break
+                search.stop_searching_any(exclude_keys.intersection(next_revs))
+            search_result = search.get_result()
+            if search_result.get_recipe()[2] != revision_count:
+                # We got back a different amount of data than expected; this
+                # is reported as NoSuchRevision because fewer revisions
+                # indicate missing revisions, and more should never happen as
+                # the excludes list accounts for ghosts and ensures that
+                # ghost-filling races are not a problem.
+                return (None, FailedSmartServerResponse(('NoSuchRevision',)))
+            return (search, None)
+        finally:
+            repository.unlock()
+
 
 class SmartServerRepositoryGetParentMap(SmartServerRepositoryRequest):
+    """Bzr 1.2+ - get parent data for revisions during a graph search."""
     
     def do_repository_request(self, repository, *revision_ids):
-        repository.lock_read()
-        try:
-            return self._do_repository_request(repository, revision_ids)
-        finally:
-            repository.unlock()
-
-    def _do_repository_request(self, repository, revision_ids):
         """Get parent details for some revisions.
         
         All the parents for revision_ids are returned. Additionally up to 64KB
         of additional parent data found by performing a breadth first search
-        from revision_ids is returned.
+        from revision_ids is returned. The verb takes a body containing the
+        current search state, see do_body for details.
 
         :param repository: The repository to query in.
         :param revision_ids: The utf8 encoded revision_id to answer for.
+        """
+        self._revision_ids = revision_ids
+        return None # Signal that we want a body.
+
+    def do_body(self, body_bytes):
+        """Process the current search state and perform the parent lookup.
+
         :return: A smart server response where the body contains an utf8
-            encoded flattened list of the parents of the revisions, (the same
-            format as Repository.get_revision_graph).
+            encoded flattened list of the parents of the revisions (the same
+            format as Repository.get_revision_graph) which has been gzipped.
         """
+        repository = self._repository
+        repository.lock_read()
+        try:
+            return self._do_repository_request(body_bytes)
+        finally:
+            repository.unlock()
+
+    def _do_repository_request(self, body_bytes):
+        repository = self._repository
+        revision_ids = set(self._revision_ids)
+        search, error = self.recreate_search(repository, body_bytes)
+        if error is not None:
+            return error
+        # TODO: it might be nice to start up the search again, but that's
+        # not written or tested yet.
+        client_seen_revs = set(search.get_result().get_keys())
+        # Always include the requested ids.
+        client_seen_revs.difference_update(revision_ids)
         lines = []
         repo_graph = repository.get_graph()
         result = {}
@@ -96,25 +144,32 @@
                 # adjust for the wire
                 if parents == (_mod_revision.NULL_REVISION,):
                     parents = ()
-                # add parents to the result
-                result[revision_id] = parents
                 # prepare the next query
                 next_revs.update(parents)
-                # Approximate the serialized cost of this revision_id.
-                size_so_far += 2 + len(revision_id) + sum(map(len, parents))
-                # get all the directly asked for parents, and then flesh out to
-                # 64K or so.
-                if first_loop_done and size_so_far > 65000:
-                    next_revs = set()
-                    break
+                if revision_id not in client_seen_revs:
+                    # Client does not have this revision, give it to it.
+                    # add parents to the result
+                    result[revision_id] = parents
+                    # Approximate the serialized cost of this revision_id.
+                    size_so_far += 2 + len(revision_id) + sum(map(len, parents))
+            # get all the directly asked for parents, and then flesh out to
+            # 64K (compressed) or so. We do one level of depth at a time to
+            # stay in sync with the client. The 185000 magic number is based
+            # on the compression ratio estimated from bzr.dev itself.
+            if first_loop_done and size_so_far > 185000:
+                next_revs = set()
+                break
             # don't query things we've already queried
             next_revs.difference_update(queried_revs)
             first_loop_done = True
 
-        for revision, parents in result.items():
+        # Sorting trivially puts lexicographically similar revision ids
+        # together. Compression FTW.
+        for revision, parents in sorted(result.items()):
             lines.append(' '.join((revision, ) + tuple(parents)))
 
-        return SuccessfulSmartServerResponse(('ok', ), '\n'.join(lines))
+        return SuccessfulSmartServerResponse(
+            ('ok', ), bytes_to_gzip('\n'.join(lines)))
 
 
 class SmartServerRepositoryGetRevisionGraph(SmartServerRepositoryRequest):
@@ -342,31 +397,15 @@
     """Bzr 1.1+ streaming pull."""
 
     def do_body(self, body_bytes):
-        lines = body_bytes.split('\n')
-        start_keys = set(lines[0].split(' '))
-        exclude_keys = set(lines[1].split(' '))
-        revision_count = int(lines[2])
         repository = self._repository
         repository.lock_read()
         try:
-            search = repository.get_graph()._make_breadth_first_searcher(
-                start_keys)
-            while True:
-                try:
-                    next_revs = search.next()
-                except StopIteration:
-                    break
-                search.stop_searching_any(exclude_keys.intersection(next_revs))
-            search_result = search.get_result()
-            if search_result.get_recipe()[2] != revision_count:
-                # we got back a different amount of data than expected, this
-                # gets reported as NoSuchRevision, because less revisions
-                # indicates missing revisions, and more should never happen as
-                # the excludes list considers ghosts and ensures that ghost
-                # filling races are not a problem.
-                return FailedSmartServerResponse(('NoSuchRevision',))
-            stream = repository.get_data_stream_for_search(search_result)
+            search, error = self.recreate_search(repository, body_bytes)
+            if error is not None:
+                return error
+            stream = repository.get_data_stream_for_search(search.get_result())
         except Exception:
+            # On non-error, unlocking is done by the body stream handler.
             repository.unlock()
             raise
         return SuccessfulSmartServerResponse(('ok',),

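On the server side, recreate_search replays a breadth-first walk from the recipe's start keys, pruning at the exclude keys, and then checks that the number of revisions covered matches the count the client sent; a mismatch means the repository lacks history the client believes is common, and is reported as NoSuchRevision. A simplified, bzrlib-free sketch of that replay over a plain parent dict (the real code uses repository.get_graph()._make_breadth_first_searcher; replay_search is an illustrative name):

    def replay_search(parent_map, start_keys, exclude_keys, revision_count):
        seen = set()
        pending = set(start_keys)
        while pending:
            # Never walk into revisions the client says it already has.
            pending.difference_update(exclude_keys)
            pending.difference_update(seen)
            seen.update(pending)
            next_pending = set()
            for key in pending:
                next_pending.update(parent_map.get(key, ()))
            pending = next_pending
        if len(seen) != revision_count:
            # Fewer revisions than claimed indicates missing history.
            return None
        return seen

The resulting set plays the role of client_seen_revs in do_body above: parents of those revisions are still followed, but their parent lists are not re-sent to the client.
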
=== modified file 'bzrlib/tests/repository_implementations/test_repository.py'
--- a/bzrlib/tests/repository_implementations/test_repository.py	2008-01-30 04:34:52 +0000
+++ b/bzrlib/tests/repository_implementations/test_repository.py	2008-02-03 22:55:08 +0000
@@ -604,6 +604,18 @@
         for value in parents.values():
             self.assertIsInstance(value, tuple)
 
+    def test_get_parent_map_corner_cases(self):
+        """get_parent_map(keys) must work on a repo's graph."""
+        repo = self.make_repository('.')
+        repo.lock_read()
+        self.addCleanup(repo.unlock)
+        graph = repo.get_graph()
+        # With no search, no error
+        self.assertEqual({}, graph.get_parent_map([], None))
+        # And with a search, no error
+        search = graph._make_breadth_first_searcher([])
+        self.assertEqual({}, graph.get_parent_map([], search))
+
     def test_implements_revision_graph_can_have_wrong_parents(self):
         """All repositories should implement
         revision_graph_can_have_wrong_parents, so that check and reconcile can

=== modified file 'bzrlib/tests/test_remote.py'
--- a/bzrlib/tests/test_remote.py	2008-01-22 00:27:42 +0000
+++ b/bzrlib/tests/test_remote.py	2008-02-03 22:55:08 +0000
@@ -46,6 +46,7 @@
 from bzrlib.smart.client import _SmartClient
 from bzrlib.transport.memory import MemoryTransport
 from bzrlib.transport.remote import RemoteTransport
+from bzrlib.tuned_gzip import bytes_to_gzip
 
 
 class BasicRemoteObjectTests(tests.TestCaseWithTransport):
@@ -608,7 +609,7 @@
         r1 = u'\u0e33'.encode('utf8')
         r2 = u'\u0dab'.encode('utf8')
         lines = [' '.join([r2, r1]), r1]
-        encoded_body = '\n'.join(lines)
+        encoded_body = bytes_to_gzip('\n'.join(lines))
         responses = [(('ok', ), encoded_body), (('ok', ), encoded_body)]
 
         transport_path = 'quack'
@@ -624,8 +625,8 @@
         parents = graph.get_parent_map([r1])
         self.assertEqual({r1: (NULL_REVISION,)}, parents)
         self.assertEqual(
-            [('call_expecting_body', 'Repository.get_parent_map',
-             ('quack/', r2))],
+            [('call_with_body_bytes_expecting_body',
+              'Repository.get_parent_map', ('quack/', r2), '\n\n0')],
             client._calls)
         repo.unlock()
         # now we call again, and it should use the second response.
@@ -634,10 +635,10 @@
         parents = graph.get_parent_map([r1])
         self.assertEqual({r1: (NULL_REVISION,)}, parents)
         self.assertEqual(
-            [('call_expecting_body', 'Repository.get_parent_map',
-              ('quack/', r2)),
-             ('call_expecting_body', 'Repository.get_parent_map',
-              ('quack/', r1))
+            [('call_with_body_bytes_expecting_body',
+              'Repository.get_parent_map', ('quack/', r2), '\n\n0'),
+             ('call_with_body_bytes_expecting_body',
+              'Repository.get_parent_map', ('quack/', r1), '\n\n0'),
             ],
             client._calls)
         repo.unlock()

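The '\n\n0' request bodies expected above are simply the serialised form of an empty search recipe: no start keys, no stop keys, and zero revisions already held, which is what a freshly locked RemoteRepository with an empty _parents_map sends. Using the same joining rule as _serialise_search_recipe:

    empty_recipe = (set(), set(), 0)
    body = '\n'.join((' '.join(empty_recipe[0]),
                      ' '.join(empty_recipe[1]),
                      str(empty_recipe[2])))
    assert body == '\n\n0'
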
=== modified file 'bzrlib/tests/test_smart.py'
--- a/bzrlib/tests/test_smart.py	2008-01-17 07:49:09 +0000
+++ b/bzrlib/tests/test_smart.py	2008-02-03 22:55:08 +0000
@@ -37,6 +37,7 @@
 import bzrlib.smart.bzrdir
 import bzrlib.smart.branch
 import bzrlib.smart.repository
+from bzrlib.tuned_gzip import bytes_to_gzip
 from bzrlib.util import bencode
 
 
@@ -529,6 +530,22 @@
             request.execute, backing.local_abspath('subdir'))
 
 
+class TestSmartServerRepositoryGetParentMap(tests.TestCaseWithTransport):
+
+    def test_trivial_gzipped(self):
+        # This tests that the wire encoding is actually gzipped
+        backing = self.get_transport()
+        request = smart.repository.SmartServerRepositoryGetParentMap(backing)
+        tree = self.make_branch_and_memory_tree('.')
+
+        self.assertEqual(None,
+            request.execute(backing.local_abspath(''), 'missing-id'))
+        # Note that it returns a body (of '' gzipped).
+        self.assertEqual(
+            SuccessfulSmartServerResponse(('ok', ), bytes_to_gzip('')),
+            request.do_body('\n\n0\n'))
+
+
 class TestSmartServerRepositoryGetRevisionGraph(tests.TestCaseWithTransport):
 
     def test_none_argument(self):


