NACK: [SRU][J][PATCH 1/1] cgroup: Reorganize css_set_lock and kernfs path processing

Juerg Haefliger juerg.haefliger at canonical.com
Thu Jan 9 11:18:52 UTC 2025


On Mon,  6 Jan 2025 17:15:40 -0500
Philip Cox <philip.cox at canonical.com> wrote:

> From: Michal Koutný <mkoutny at suse.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/2089318
> 
> The commit 74e4b956eb1c incorrectly wrapped kernfs_walk_and_get
> (might_sleep) under css_set_lock (spinlock). css_set_lock is needed by
> __cset_cgroup_from_root to ensure stable cset->cgrp_links but not for
> kernfs_walk_and_get.
> 
> We only need to make sure that the returned root_cgrp won't be freed
> under us. This is given in the case of global root because it is static
> (cgrp_dfl_root.cgrp). When the root_cgrp is lower in the hierarchy, it
> is pinned by cgroup_ns->root_cset (and `current` task cannot switch
> namespace asynchronously so ns_proxy pins cgroup_ns).
> 
> Note this reasoning won't hold for root cgroups in v1 hierarchies,
> therefore create a special-cased helper function just for the default
> hierarchy.
> 
> Fixes: 74e4b956eb1c ("cgroup: Honor caller's cgroup NS when resolving path")
> Reported-by: Dan Carpenter <dan.carpenter at oracle.com>
> Signed-off-by: Michal Koutný <mkoutny at suse.com>
> Signed-off-by: Tejun Heo <tj at kernel.org>
> (back ported from commit 46307fd6e27a3f678a1678b02e667678c22aa8cc)

Nit: this should be
(backported from commit 46307fd6e27a3f678a1678b02e667678c22aa8cc)


> [philcox: context changes in __cset_cgroup_from_root(),
> cset_cgroup_from_root(), cgroup_get_from_id()]

It's more than just context changes. See below.


> Signed-off-by: Philip Cox <philip.cox at canonical.com>
> ---
>  kernel/cgroup/cgroup.c | 86 ++++++++++++++++++++++++++++++------------
>  1 file changed, 62 insertions(+), 24 deletions(-)
> 
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 999fef6d1228..308cdefc1b70 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1379,49 +1379,50 @@ static void cgroup_destroy_root(struct cgroup_root *root)
>  }
>  
>  /*
> - * look up cgroup associated with current task's cgroup namespace on the
> - * specified hierarchy
> + * Returned cgroup is without refcount but it's valid as long as cset pins it.
>   */
> -static struct cgroup *
> -current_cgns_cgroup_from_root(struct cgroup_root *root)
> +static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
> +					    struct cgroup_root *root)
>  {
> -	struct cgroup *res = NULL;
> -	struct css_set *cset;
> +	struct cgroup *res_cgroup = NULL;
>  
> -	lockdep_assert_held(&css_set_lock);
> -
> -	rcu_read_lock();
> -
> -	cset = current->nsproxy->cgroup_ns->root_cset;
>  	if (cset == &init_css_set) {
> -		res = &root->cgrp;
> +		res_cgroup = &root->cgrp;
>  	} else if (root == &cgrp_dfl_root) {
> -		res = cset->dfl_cgrp;
> +		res_cgroup = cset->dfl_cgrp;
>  	} else {
>  		struct cgrp_cset_link *link;
> +		lockdep_assert_held(&css_set_lock);
>  
>  		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
>  			struct cgroup *c = link->cgrp;
>  
>  			if (c->root == root) {
> -				res = c;
> +				res_cgroup = c;
>  				break;
>  			}
>  		}
>  	}
> -	rcu_read_unlock();
>  
> -	return res;
> +	BUG_ON(!res_cgroup);
> +	return res_cgroup;
>  }
>  
> -/* look up cgroup associated with given css_set on the specified hierarchy */
> -static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
> -					    struct cgroup_root *root)
> +/*
> + * look up cgroup associated with current task's cgroup namespace on the
> + * specified hierarchy
> + */
> +static struct cgroup *
> +current_cgns_cgroup_from_root(struct cgroup_root *root)
>  {
>  	struct cgroup *res = NULL;
> +	struct css_set *cset;
>  
>  	lockdep_assert_held(&css_set_lock);
>  
> +	rcu_read_lock();
> +
> +	cset = current->nsproxy->cgroup_ns->root_cset;
>  	if (cset == &init_css_set) {
>  		res = &root->cgrp;
>  	} else if (root == &cgrp_dfl_root) {
> @@ -1438,11 +1439,40 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
>  			}
>  		}
>  	}
> +	rcu_read_unlock();
>  
> -	BUG_ON(!res);
>  	return res;
>  }
>  
> +/*
> + * Look up cgroup associated with current task's cgroup namespace on the default
> + * hierarchy.
> + *
> + * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
> + * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
> + *   pointers.
> + * - css_set_lock is not needed because we just read cset->dfl_cgrp.
> + * - As a bonus returned cgrp is pinned with the current because it cannot
> + *   switch cgroup_ns asynchronously.
> + */
> +static struct cgroup *current_cgns_cgroup_dfl(void)
> +{
> +	struct css_set *cset;
> +
> +	cset = current->nsproxy->cgroup_ns->root_cset;
> +	return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
> +}
> +
> +/* look up cgroup associated with given css_set on the specified hierarchy */
> +static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
> +					    struct cgroup_root *root)
> +{
> +	lockdep_assert_held(&cgroup_mutex);
> +	lockdep_assert_held(&css_set_lock);
> +
> +	return __cset_cgroup_from_root(cset, root);
> +}
> +
>  /*
>   * Return the cgroup for "task" from the given hierarchy. Must be
>   * called with css_set_lock held to prevent task's groups from being modified.
> @@ -6034,7 +6064,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
>  struct cgroup *cgroup_get_from_id(u64 id)
>  {
>  	struct kernfs_node *kn;
> -	struct cgroup *cgrp = NULL;
> +	struct cgroup *cgrp, *root_cgrp;
>  
>  	kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
>  	if (!kn)
> @@ -6053,6 +6083,16 @@ struct cgroup *cgroup_get_from_id(u64 id)
>  put:
>  	kernfs_put(kn);
>  out:
> +
> +	if (!cgrp)
> +		return ERR_PTR(-ENOENT);
> +
> +	root_cgrp = current_cgns_cgroup_dfl();
> +	if (!cgroup_is_descendant(cgrp, root_cgrp)) {
> +		cgroup_put(cgrp);
> +		return ERR_PTR(-ENOENT);
> +	}
> +

This change is primarily from commit
534dee941056 ("cgroup: cgroup: Honor caller's cgroup NS when resolving cgroup id")

You probably also want
7e1eb5437d3c ("cgroup: Make cgroup_get_from_id() prettier")
and possibly other prerequisite commits.

You should cherry-pick/backport the prerequisite commits rather than squashing changes from other commits into this patch.

...Juerg

>  	return cgrp;
>  }
>  EXPORT_SYMBOL_GPL(cgroup_get_from_id);
> @@ -6617,10 +6657,8 @@ struct cgroup *cgroup_get_from_path(const char *path)
>  	struct cgroup *cgrp = ERR_PTR(-ENOENT);
>  	struct cgroup *root_cgrp;
>  
> -	spin_lock_irq(&css_set_lock);
> -	root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
> +	root_cgrp = current_cgns_cgroup_dfl();
>  	kn = kernfs_walk_and_get(root_cgrp->kn, path);
> -	spin_unlock_irq(&css_set_lock);
>  	if (!kn)
>  		goto out;
>  

-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 833 bytes
Desc: OpenPGP digital signature
URL: <https://lists.ubuntu.com/archives/kernel-team/attachments/20250109/cde45c91/attachment-0001.sig>


More information about the kernel-team mailing list