[Xenial][SRU][CVE-2018-20784][PATCH 6/6] sched/fair: Fix O(nr_cgroups) in the load balancing path

Thu Oct 17 17:51:33 UTC 2019

From: Vincent Guittot <vincent.guittot at linaro.org>

CVE-2018-20784

This re-applies the commit reverted here:

  commit c40f7d74c741 ("sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c")

I.e. now that cfs_rq can be safely removed/added in the list, we can re-apply:

 commit a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")

Signed-off-by: Vincent Guittot <vincent.guittot at linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
Cc: Linus Torvalds <torvalds at linux-foundation.org>
Cc: Peter Zijlstra <peterz at infradead.org>
Cc: Thomas Gleixner <tglx at linutronix.de>
Cc: sargun at sargun.me
Cc: tj at kernel.org
Cc: xiexiuqi at huawei.com
Cc: xiezhipeng1 at huawei.com
Link: https://lkml.kernel.org/r/1549469662-13614-3-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo at kernel.org>
(backported from commit 039ae8bcf7a5f4476f4487e6bf816885fb3fb617)
[ Connor Kuehl: In 'cfs_rq_is_decayed' the data member
  'runnable_load_sum' belongs to struct cfs_rq and not sched_avg, so
  update that. Some instances of 'for_each_leaf_cfs_rq' required manual
  updating to the new 'for_each_leaf_cfs_rq_safe' and the last hunk for
  'update_blocked_averages' required manual placement. ]
Signed-off-by: Connor Kuehl <connor.kuehl at canonical.com>
---
 kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8bc9a5e24380..d16eec4b8d76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -376,9 +376,10 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 	WARN_ON_ONCE((rq->tmp_alone_branch != &rq->leaf_cfs_rq_list));
 }
 
-/* Iterate through all cfs_rq's on a runqueue in bottom-up order */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -476,8 +477,8 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq)	\
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -4203,9 +4204,9 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
 
 		raw_spin_lock(&cfs_b->lock);
@@ -4218,7 +4219,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq, *pos;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -6132,10 +6133,27 @@ static void attach_tasks(struct lb_env *env)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
@@ -6145,9 +6163,15 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -8554,10 +8578,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
-- 
2.17.1