Revert "FROMLIST: sched: Consolidate cpufreq updates"

This partially reverts commit b329d0289f.
The cfs_rq->decayed addition is kept to preserve the KMI until the next
KMI update.

Reason for revert: This change causes major perf issues.

Bug: 343389622
Change-Id: I496da55f509a3c1a7ba8f869c8db43d9509fc547
Signed-off-by: William McVicker <willmcvicker@google.com>
[cmllamas: kept cfs_rq->decayed addition to preserve the KMI]
Signed-off-by: Carlos Llamas <cmllamas@google.com>
Author:    William McVicker, 2024-06-01 00:54:05 +00:00
Committer: Treehugger Robot
Parent:    ddc5d87954
Commit:    eab28aad52
7 changed files with 81 additions and 151 deletions
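KMI here is Android's frozen Kernel Module Interface, which is why the otherwise unused cfs_rq->decayed field has to stay: dropping the member would change the layout of struct cfs_rq. For reviewers skimming the hunks below, the consolidation being reverted funneled every cpufreq request through a single decision point at context switch, while the revert restores the per-class cpufreq_update_util() kicks in fair.c, rt.c and deadline.c. A stand-alone, simplified model of the removed context-switch decision table (illustrative only, not the kernel code; the enum and helper names are invented for this sketch):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT       (1U << 0)
#define SCHED_CPUFREQ_FORCE_UPDATE (1U << 1)

enum policy { P_NORMAL, P_BATCH, P_FIFO, P_RR, P_DEADLINE };

/* Would a cpufreq update be sent at context switch, and with which flags? */
static bool ctx_switch_update(enum policy pol, bool in_iowait,
                              bool root_cfs_decayed, unsigned int *flags)
{
        *flags = 0;
        switch (pol) {
        case P_NORMAL:
        case P_BATCH:
                if (in_iowait) {
                        *flags = SCHED_CPUFREQ_IOWAIT | SCHED_CPUFREQ_FORCE_UPDATE;
                        return true;
                }
                return root_cfs_decayed;  /* only if the root cfs_rq signal decayed */
        case P_FIFO:
        case P_RR:
        case P_DEADLINE:
                *flags = SCHED_CPUFREQ_FORCE_UPDATE;  /* RT/DL always force an update */
                return true;
        }
        return false;
}

int main(void)
{
        unsigned int flags;

        printf("fair, not decayed: %d\n", ctx_switch_update(P_NORMAL, false, false, &flags));
        printf("fair, in iowait:   %d\n", ctx_switch_update(P_NORMAL, true, false, &flags));
        printf("rt:                %d\n", ctx_switch_update(P_FIFO, false, false, &flags));
        return 0;
}

The real helper also ignored the sugov kthread and skipped back-to-back RT switches whose effective uclamp minimum did not change, as the removed core.c hunk below shows.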

include/linux/sched/cpufreq.h

@@ -8,8 +8,7 @@
* Interface between cpufreq drivers and the scheduler:
*/
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_FORCE_UPDATE (1U << 1) /* ignore transition_delay_us */
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy;

kernel/sched/core.c

@@ -166,9 +166,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
static __always_inline void
update_cpufreq_ctx_switch(struct rq *rq, struct task_struct *prev);
#ifdef CONFIG_SCHED_CORE
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
@@ -1985,7 +1982,7 @@ static bool uclamp_reset(const struct sched_attr *attr,
return false;
}
static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
@@ -2007,6 +2004,7 @@ static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
value = uclamp_none(clamp_id);
uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -2025,13 +2023,6 @@ static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
attr->sched_util_max, true);
trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
}
/*
* Updating uclamp values has impact on freq, ensure it is taken into
* account.
*/
if (task_current(rq, p))
update_cpufreq_ctx_switch(rq, NULL);
}
static void uclamp_fork(struct task_struct *p)
@@ -2106,7 +2097,7 @@ static inline int uclamp_validate(struct task_struct *p,
{
return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
@@ -2270,13 +2261,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
/*
* Changing policies could imply requiring to send cpufreq
* update.
*/
if (task_current(rq, p))
update_cpufreq_ctx_switch(rq, NULL);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -5231,68 +5215,6 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea
#endif
static __always_inline void
update_cpufreq_ctx_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_CPU_FREQ
/*
* RT and DL should always send a freq update. But we can do some
* simple checks to avoid it when we know it's not necessary.
*
* iowait_boost will always trigger a freq update too.
*
* Fair tasks will only trigger an update if the root cfs_rq has
* decayed.
*
* Everything else should do nothing.
*/
switch (current->policy) {
case SCHED_NORMAL:
case SCHED_BATCH:
if (unlikely(current->in_iowait)) {
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT | SCHED_CPUFREQ_FORCE_UPDATE);
return;
}
#ifdef CONFIG_SMP
if (unlikely(rq->cfs.decayed)) {
rq->cfs.decayed = false;
cpufreq_update_util(rq, 0);
return;
}
#endif
return;
case SCHED_FIFO:
case SCHED_RR:
if (prev && rt_policy(prev->policy)) {
#ifdef CONFIG_UCLAMP_TASK
unsigned long curr_uclamp_min = uclamp_eff_value(current, UCLAMP_MIN);
unsigned long prev_uclamp_min = uclamp_eff_value(prev, UCLAMP_MIN);
if (curr_uclamp_min == prev_uclamp_min)
#endif
return;
}
#ifdef CONFIG_SMP
/* Stopper task masquerades as RT */
if (unlikely(current->sched_class == &stop_sched_class))
return;
#endif
cpufreq_update_util(rq, SCHED_CPUFREQ_FORCE_UPDATE);
return;
case SCHED_DEADLINE:
if (current->dl.flags & SCHED_FLAG_SUGOV) {
/* Ignore sugov kthreads, they're responding to our requests */
return;
}
cpufreq_update_util(rq, SCHED_CPUFREQ_FORCE_UPDATE);
return;
default:
return;
}
#endif
}
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -5310,7 +5232,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
static inline void finish_lock_switch(struct rq *rq)
{
/*
* If we are tracking spinlock dependencies then we have to
@@ -5319,11 +5241,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
/*
* Request freq update after __balance_callbacks to take into account
* any changes to rq.
*/
update_cpufreq_ctx_switch(rq, prev);
raw_spin_rq_unlock_irq(rq);
}
@@ -5442,7 +5359,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
perf_event_task_sched_in(prev, current);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq, prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
/*
@@ -8027,7 +7944,7 @@ change:
__setscheduler_prio(p, newprio);
trace_android_rvh_setscheduler(p);
}
__setscheduler_uclamp(rq, p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
/*

kernel/sched/cpufreq_schedutil.c

@@ -61,8 +61,7 @@ static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
/************************ Governor internals ***********************/
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int flags)
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
@@ -90,16 +89,13 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
return true;
}
if (unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
return true;
delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns;
}
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq, unsigned int flags)
unsigned int next_freq)
{
if (sg_policy->need_freq_update)
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
@@ -107,9 +103,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
return false;
sg_policy->next_freq = next_freq;
if (!unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
sg_policy->last_freq_update_time = time;
sg_policy->last_freq_update_time = time;
return true;
}
@@ -221,10 +215,9 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
bool forced_update = flags & SCHED_CPUFREQ_FORCE_UPDATE;
/* Reset boost if the CPU appears to have been idle enough */
if (sg_cpu->iowait_boost && !forced_update &&
if (sg_cpu->iowait_boost &&
sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
return;
@@ -267,35 +260,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* being more conservative on tasks that do sporadic IO operations.
*/
static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
unsigned long max_cap, unsigned int flags)
unsigned long max_cap)
{
bool forced_update = flags & SCHED_CPUFREQ_FORCE_UPDATE;
s64 delta_ns = time - sg_cpu->last_update;
unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
return;
if (forced_update)
goto apply_boost;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
return;
if (!sg_cpu->iowait_boost_pending) {
/*
* This logic relied on PELT signal decays happening once every
* 1ms. But due to changes to how updates are done now, we can
* end up with more request coming up leading to iowait boost
* to be prematurely reduced. Make the assumption explicit
* until we improve the iowait boost logic to be better in
* general as it is due for an overhaul.
*/
if (delta_ns <= NSEC_PER_MSEC)
goto apply_boost;
/*
* No boost pending; reduce the boost value.
*/
@@ -306,7 +283,6 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
}
}
apply_boost:
sg_cpu->iowait_boost_pending = false;
/*
@@ -351,11 +327,11 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
ignore_dl_rate_limit(sg_cpu);
if (!sugov_should_update_freq(sg_cpu->sg_policy, time, flags))
if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
sugov_get_util(sg_cpu);
sugov_iowait_apply(sg_cpu, time, max_cap, flags);
sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -390,7 +366,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
sg_policy->cached_raw_freq = cached_freq;
}
if (!sugov_update_next_freq(sg_policy, time, next_f, flags))
if (!sugov_update_next_freq(sg_policy, time, next_f))
return;
/*
@@ -442,12 +418,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
map_util_perf(sg_cpu->util), max_cap);
if (!unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
sg_cpu->sg_policy->last_freq_update_time = time;
sg_cpu->sg_policy->last_freq_update_time = time;
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
@@ -460,7 +434,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
sugov_get_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time, max_cap, flags);
sugov_iowait_apply(j_sg_cpu, time, max_cap);
util = max(j_sg_cpu->util, util);
}
@@ -482,10 +456,10 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time, flags)) {
next_f = sugov_next_freq_shared(sg_cpu, time, flags);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
if (!sugov_update_next_freq(sg_policy, time, next_f, flags))
if (!sugov_update_next_freq(sg_policy, time, next_f))
goto unlock;
if (sg_policy->policy->fast_switch_enabled)

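Net effect on schedutil: with SCHED_CPUFREQ_FORCE_UPDATE gone, every request is again gated on freq_update_delay_ns and each accepted request refreshes last_freq_update_time. A compilable sketch of the restored rate-limit check (the struct below only mimics sugov_policy, it is not the kernel type):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sg_policy_model {
        uint64_t last_freq_update_time;   /* ns */
        uint64_t freq_update_delay_ns;    /* derived from the driver's transition delay */
        bool     limits_changed;          /* policy min/max changed */
};

/* Mirrors the shape of sugov_should_update_freq() after the revert. */
static bool should_update_freq(struct sg_policy_model *sg, uint64_t now)
{
        if (sg->limits_changed) {
                sg->limits_changed = false;
                return true;
        }
        /* No force-update escape hatch any more: strict rate limiting. */
        return now - sg->last_freq_update_time >= sg->freq_update_delay_ns;
}

int main(void)
{
        struct sg_policy_model sg = { .last_freq_update_time = 1000000,
                                      .freq_update_delay_ns  = 500000 };

        printf("%d\n", should_update_freq(&sg, 1200000));  /* 0: still inside the delay window */
        printf("%d\n", should_update_freq(&sg, 1600000));  /* 1: delay has elapsed */
        return 0;
}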
kernel/sched/deadline.c

@@ -239,6 +239,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline
@@ -251,6 +253,8 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline

kernel/sched/fair.c

@@ -3950,6 +3950,29 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
* a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
* number include things like RT tasks.
*
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
}
#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
@@ -4567,6 +4590,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -4595,6 +4620,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -4610,7 +4637,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now = cfs_rq_clock_pelt(cfs_rq);
unsigned long prev_util_avg = cfs_rq->avg.util_avg;
int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
@@ -4619,8 +4646,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
cfs_rq->decayed |= update_cfs_rq_load_avg(now, cfs_rq);
cfs_rq->decayed |= propagate_entity_load_avg(se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
@@ -4641,19 +4668,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
} else if (cfs_rq->decayed && (flags & UPDATE_TG)) {
update_tg_load_avg(cfs_rq);
}
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
/*
* This field is used to indicate whether a trigger of cpufreq update
* is required. When the CPU is saturated, other load signals could
* still be changing, but util_avg would have settled down, so ensure
* that we don't trigger unnecessary updates as from fair policy point
* of view, nothing has changed to cause a cpufreq update.
*/
if (cfs_rq->decayed && prev_util_avg == cfs_rq->avg.util_avg)
cfs_rq->decayed = false;
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq);
}
}
/*
@@ -5035,6 +5055,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
cfs_rq_util_change(cfs_rq, 0);
}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -6621,6 +6642,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
*/
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -9230,6 +9259,10 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
unsigned long thermal_pressure;
bool decayed;
/*
* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
* DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
@@ -12605,7 +12638,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
cpufreq_update_util(rq, 0);
task_tick_core(rq, curr);
}
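The fair.c hunks above return to the pre-consolidation flow: cpufreq is kicked from the PELT update path itself, only when the root cfs_rq's averages have actually decayed, plus an explicit IOWAIT kick when an iowait task is enqueued. A reduced model of that call chain (illustrative; the *_model types and cpufreq_kick() are invented for this sketch):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT (1U << 0)

struct cfs_rq_model;
struct rq_model     { struct cfs_rq_model *root_cfs; };
struct cfs_rq_model { struct rq_model *rq; bool is_root; };

static void cpufreq_kick(struct rq_model *rq, unsigned int flags)
{
        (void)rq;
        printf("cpufreq update, flags=%#x\n", flags);  /* stand-in for cpufreq_update_util() */
}

static void cfs_rq_util_change(struct cfs_rq_model *cfs_rq, unsigned int flags)
{
        if (cfs_rq->is_root)            /* only the root cfs_rq drives frequency */
                cpufreq_kick(cfs_rq->rq, flags);
}

static void update_load_avg(struct cfs_rq_model *cfs_rq, bool decayed)
{
        if (decayed)                    /* PELT averages moved since the last call */
                cfs_rq_util_change(cfs_rq, 0);
}

int main(void)
{
        struct rq_model rq;
        struct cfs_rq_model root = { .rq = &rq, .is_root = true };

        rq.root_cfs = &root;
        cpufreq_kick(&rq, SCHED_CPUFREQ_IOWAIT);  /* enqueue of a task that was in iowait */
        update_load_avg(&root, true);             /* ordinary decay-driven update */
        return 0;
}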

kernel/sched/rt.c

@@ -606,8 +606,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se)
if (!rt_se) {
dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se, 0);
}
@@ -1130,6 +1133,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1;
}
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
}
#if defined CONFIG_SMP

kernel/sched/sched.h

@@ -622,9 +622,7 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
/*
* Store whether last update_load_avg() has decayed
*/
/* Unused, only kept here to preserve the KMI after revert. */
bool decayed;
#ifdef CONFIG_FAIR_GROUP_SCHED