ACK/Cmnt: [PATCH 1/2] drm/i915: Fix ref->mutex deadlock in i915_active_wait()

Andrea Righi andrea.righi at canonical.com
Thu Apr 9 16:19:23 UTC 2020


On Tue, Apr 07, 2020 at 03:27:39PM -0700, Sultan Alsawaf wrote:
> From: Sultan Alsawaf <sultan at kerneltoast.com>
> 
> The following deadlock exists in i915_active_wait() due to a double lock
> on ref->mutex (call chain listed in order from top to bottom):
>  i915_active_wait();
>  mutex_lock_interruptible(&ref->mutex); <-- ref->mutex first acquired
>  i915_active_request_retire();
>  node_retire();
>  active_retire();
>  mutex_lock_nested(&ref->mutex, SINGLE_DEPTH_NESTING); <-- DEADLOCK
> 
> Fix the deadlock by skipping the second ref->mutex lock when
> active_retire() is called through i915_active_request_retire().
> 
> Note that this bug only affects 5.4 and has since been fixed in 5.5.
> Normally, a backport of the fix from 5.5 would be in order, but the
> patch set that fixes this deadlock involves massive changes that are
> neither feasible nor desirable for backporting [1][2][3]. Therefore,
> this small patch was made to address the deadlock specifically for 5.4.
> 
> [1] 274cbf20fd10 ("drm/i915: Push the i915_active.retire into a worker")
> [2] 093b92287363 ("drm/i915: Split i915_active.mutex into an irq-safe spinlock for the rbtree")
> [3] 750bde2fd4ff ("drm/i915: Serialise with remote retirement")
> 
> Fixes: 12c255b5dad1 ("drm/i915: Provide an i915_active.acquire callback")
> Cc: <stable at vger.kernel.org> # 5.4.x
> Signed-off-by: Sultan Alsawaf <sultan at kerneltoast.com>
> Signed-off-by: Sultan Alsawaf <sultan.alsawaf at canonical.com>
> ---

The patch makes sense, even if I'm feeling a bit nervous that it is not
upstream and that upstream is fixing this deadlock in another way...

BTW, is 750bde2fd4ff really needed? Looking at those commits it seems
that only 274cbf20fd10 and 093b92287363 should be enough to prevent the
deadlock.

I've tried to backport these two and it seems a doable backport (I
have the patches against focal, but I have only build-tested them; I
haven't run-tested them).

However, if you confirm that your patch has been tested, I think it can
be an easier fix and a reasonable compromise. Under this assumption:

Acked-by: Andrea Righi <andrea.righi at canonical.com>

>  drivers/gpu/drm/i915/i915_active.c | 29 +++++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_active.h |  4 ++--
>  2 files changed, 21 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index 48e16ad93bbd..51dc8753b527 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -121,7 +121,7 @@ static inline void debug_active_assert(struct i915_active *ref) { }
>  #endif
>  
>  static void
> -__active_retire(struct i915_active *ref)
> +__active_retire(struct i915_active *ref, bool lock)
>  {
>  	struct active_node *it, *n;
>  	struct rb_root root;
> @@ -138,7 +138,8 @@ __active_retire(struct i915_active *ref)
>  		retire = true;
>  	}
>  
> -	mutex_unlock(&ref->mutex);
> +	if (likely(lock))
> +		mutex_unlock(&ref->mutex);
>  	if (!retire)
>  		return;
>  
> @@ -153,21 +154,28 @@ __active_retire(struct i915_active *ref)
>  }
>  
>  static void
> -active_retire(struct i915_active *ref)
> +active_retire(struct i915_active *ref, bool lock)
>  {
>  	GEM_BUG_ON(!atomic_read(&ref->count));
>  	if (atomic_add_unless(&ref->count, -1, 1))
>  		return;
>  
>  	/* One active may be flushed from inside the acquire of another */
> -	mutex_lock_nested(&ref->mutex, SINGLE_DEPTH_NESTING);
> -	__active_retire(ref);
> +	if (likely(lock))
> +		mutex_lock_nested(&ref->mutex, SINGLE_DEPTH_NESTING);
> +	__active_retire(ref, lock);
>  }
>  
>  static void
>  node_retire(struct i915_active_request *base, struct i915_request *rq)
>  {
> -	active_retire(node_from_active(base)->ref);
> +	active_retire(node_from_active(base)->ref, true);
> +}
> +
> +static void
> +node_retire_nolock(struct i915_active_request *base, struct i915_request *rq)
> +{
> +	active_retire(node_from_active(base)->ref, false);
>  }
>  
>  static struct i915_active_request *
> @@ -364,7 +372,7 @@ int i915_active_acquire(struct i915_active *ref)
>  void i915_active_release(struct i915_active *ref)
>  {
>  	debug_active_assert(ref);
> -	active_retire(ref);
> +	active_retire(ref, true);
>  }
>  
>  static void __active_ungrab(struct i915_active *ref)
> @@ -391,7 +399,7 @@ void i915_active_ungrab(struct i915_active *ref)
>  {
>  	GEM_BUG_ON(!test_bit(I915_ACTIVE_GRAB_BIT, &ref->flags));
>  
> -	active_retire(ref);
> +	active_retire(ref, true);
>  	__active_ungrab(ref);
>  }
>  
> @@ -421,12 +429,13 @@ int i915_active_wait(struct i915_active *ref)
>  			break;
>  		}
>  
> -		err = i915_active_request_retire(&it->base, BKL(ref));
> +		err = i915_active_request_retire(&it->base, BKL(ref),
> +						 node_retire_nolock);
>  		if (err)
>  			break;
>  	}
>  
> -	__active_retire(ref);
> +	__active_retire(ref, true);
>  	if (err)
>  		return err;
>  
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index f95058f99057..0ad7ef60d15f 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -309,7 +309,7 @@ i915_active_request_isset(const struct i915_active_request *active)
>   */
>  static inline int __must_check
>  i915_active_request_retire(struct i915_active_request *active,
> -			   struct mutex *mutex)
> +			   struct mutex *mutex, i915_active_retire_fn retire)
>  {
>  	struct i915_request *request;
>  	long ret;
> @@ -327,7 +327,7 @@ i915_active_request_retire(struct i915_active_request *active,
>  	list_del_init(&active->link);
>  	RCU_INIT_POINTER(active->request, NULL);
>  
> -	active->retire(active, request);
> +	retire(active, request);
>  
>  	return 0;
>  }
> -- 
> 2.20.1
> 
> 
> -- 
> kernel-team mailing list
> kernel-team at lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team



More information about the kernel-team mailing list