Revert "FROMLIST: sched: Consolidate cpufreq updates"

This partially reverts commit b329d0289f.
The cfs_rq->decayed addition is kept to preserve the KMI until the next
KMI update.

Reason for revert: This change causes major perf issues.

Bug: 343389622
Change-Id: I496da55f509a3c1a7ba8f869c8db43d9509fc547
Signed-off-by: William McVicker <willmcvicker@google.com>
[cmllamas: kept cfs_rq->decayed addition to preserve the KMI]
Signed-off-by: Carlos Llamas <cmllamas@google.com>
Author:    William McVicker, 2024-06-01 00:54:05 +00:00
Committer: Treehugger Robot
Parent:    ddc5d87954
Commit:    eab28aad52
7 changed files with 81 additions and 151 deletions
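KMI here is Android's frozen Kernel Module Interface, which is why the otherwise unused cfs_rq->decayed field has to stay: dropping the member would change the layout of struct cfs_rq. For reviewers skimming the hunks below, the consolidation being reverted funneled every cpufreq request through a single decision point at context switch, while the revert restores the per-class cpufreq_update_util() kicks in fair.c, rt.c and deadline.c. A stand-alone, simplified model of the removed context-switch decision table (illustrative only, not the kernel code; the enum and helper names are invented for this sketch):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT       (1U << 0)
#define SCHED_CPUFREQ_FORCE_UPDATE (1U << 1)

enum policy { P_NORMAL, P_BATCH, P_FIFO, P_RR, P_DEADLINE };

/* Would a cpufreq update be sent at context switch, and with which flags? */
static bool ctx_switch_update(enum policy pol, bool in_iowait,
                              bool root_cfs_decayed, unsigned int *flags)
{
        *flags = 0;
        switch (pol) {
        case P_NORMAL:
        case P_BATCH:
                if (in_iowait) {
                        *flags = SCHED_CPUFREQ_IOWAIT | SCHED_CPUFREQ_FORCE_UPDATE;
                        return true;
                }
                return root_cfs_decayed;  /* only if the root cfs_rq signal decayed */
        case P_FIFO:
        case P_RR:
        case P_DEADLINE:
                *flags = SCHED_CPUFREQ_FORCE_UPDATE;  /* RT/DL always force an update */
                return true;
        }
        return false;
}

int main(void)
{
        unsigned int flags;

        printf("fair, not decayed: %d\n", ctx_switch_update(P_NORMAL, false, false, &flags));
        printf("fair, in iowait:   %d\n", ctx_switch_update(P_NORMAL, true, false, &flags));
        printf("rt:                %d\n", ctx_switch_update(P_FIFO, false, false, &flags));
        return 0;
}

The real helper also ignored the sugov kthread and skipped back-to-back RT switches whose effective uclamp minimum did not change, as the removed core.c hunk below shows.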

include/linux/sched/cpufreq.h

@@ -8,8 +8,7 @@
* Interface between cpufreq drivers and the scheduler:
*/
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_FORCE_UPDATE (1U << 1) /* ignore transition_delay_us */
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy;

kernel/sched/core.c

@@ -166,9 +166,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
static __always_inline void
update_cpufreq_ctx_switch(struct rq *rq, struct task_struct *prev);
#ifdef CONFIG_SCHED_CORE
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
@@ -1985,7 +1982,7 @@ static bool uclamp_reset(const struct sched_attr *attr,
return false;
}
static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
@@ -2007,6 +2004,7 @@ static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
value = uclamp_none(clamp_id);
uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -2025,13 +2023,6 @@ static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
attr->sched_util_max, true);
trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
}
/*
* Updating uclamp values has impact on freq, ensure it is taken into
* account.
*/
if (task_current(rq, p))
update_cpufreq_ctx_switch(rq, NULL);
}
static void uclamp_fork(struct task_struct *p)
@@ -2106,7 +2097,7 @@ static inline int uclamp_validate(struct task_struct *p,
{
return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
@@ -2270,13 +2261,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
/*
* Changing policies could imply requiring to send cpufreq
* update.
*/
if (task_current(rq, p))
update_cpufreq_ctx_switch(rq, NULL);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -5231,68 +5215,6 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea
#endif
static __always_inline void
update_cpufreq_ctx_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_CPU_FREQ
/*
* RT and DL should always send a freq update. But we can do some
* simple checks to avoid it when we know it's not necessary.
*
* iowait_boost will always trigger a freq update too.
*
* Fair tasks will only trigger an update if the root cfs_rq has
* decayed.
*
* Everything else should do nothing.
*/
switch (current->policy) {
case SCHED_NORMAL:
case SCHED_BATCH:
if (unlikely(current->in_iowait)) {
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT | SCHED_CPUFREQ_FORCE_UPDATE);
return;
}
#ifdef CONFIG_SMP
if (unlikely(rq->cfs.decayed)) {
rq->cfs.decayed = false;
cpufreq_update_util(rq, 0);
return;
}
#endif
return;
case SCHED_FIFO:
case SCHED_RR:
if (prev && rt_policy(prev->policy)) {
#ifdef CONFIG_UCLAMP_TASK
unsigned long curr_uclamp_min = uclamp_eff_value(current, UCLAMP_MIN);
unsigned long prev_uclamp_min = uclamp_eff_value(prev, UCLAMP_MIN);
if (curr_uclamp_min == prev_uclamp_min)
#endif
return;
}
#ifdef CONFIG_SMP
/* Stopper task masquerades as RT */
if (unlikely(current->sched_class == &stop_sched_class))
return;
#endif
cpufreq_update_util(rq, SCHED_CPUFREQ_FORCE_UPDATE);
return;
case SCHED_DEADLINE:
if (current->dl.flags & SCHED_FLAG_SUGOV) {
/* Ignore sugov kthreads, they're responding to our requests */
return;
}
cpufreq_update_util(rq, SCHED_CPUFREQ_FORCE_UPDATE);
return;
default:
return;
}
#endif
}
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -5310,7 +5232,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
static inline void finish_lock_switch(struct rq *rq)
{
/*
* If we are tracking spinlock dependencies then we have to
@@ -5319,11 +5241,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
/*
* Request freq update after __balance_callbacks to take into account
* any changes to rq.
*/
update_cpufreq_ctx_switch(rq, prev);
raw_spin_rq_unlock_irq(rq);
}
@@ -5442,7 +5359,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
perf_event_task_sched_in(prev, current);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq, prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
/*
@@ -8027,7 +7944,7 @@ change:
__setscheduler_prio(p, newprio);
trace_android_rvh_setscheduler(p);
}
__setscheduler_uclamp(rq, p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
/*

kernel/sched/cpufreq_schedutil.c

@@ -61,8 +61,7 @@ static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
/************************ Governor internals ***********************/
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int flags)
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
@@ -90,16 +89,13 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
return true;
}
if (unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
return true;
delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns;
}
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq, unsigned int flags)
unsigned int next_freq)
{
if (sg_policy->need_freq_update)
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
@@ -107,9 +103,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
return false;
sg_policy->next_freq = next_freq;
if (!unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
sg_policy->last_freq_update_time = time;
sg_policy->last_freq_update_time = time;
return true;
}
@@ -221,10 +215,9 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
bool forced_update = flags & SCHED_CPUFREQ_FORCE_UPDATE;
/* Reset boost if the CPU appears to have been idle enough */
if (sg_cpu->iowait_boost && !forced_update &&
if (sg_cpu->iowait_boost &&
sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
return;
@@ -267,35 +260,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* being more conservative on tasks that do sporadic IO operations.
*/
static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
unsigned long max_cap, unsigned int flags)
unsigned long max_cap)
{
bool forced_update = flags & SCHED_CPUFREQ_FORCE_UPDATE;
s64 delta_ns = time - sg_cpu->last_update;
unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
return;
if (forced_update)
goto apply_boost;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
return;
if (!sg_cpu->iowait_boost_pending) {
/*
* This logic relied on PELT signal decays happening once every
* 1ms. But due to changes to how updates are done now, we can
* end up with more request coming up leading to iowait boost
* to be prematurely reduced. Make the assumption explicit
* until we improve the iowait boost logic to be better in
* general as it is due for an overhaul.
*/
if (delta_ns <= NSEC_PER_MSEC)
goto apply_boost;
/*
* No boost pending; reduce the boost value.
*/
@@ -306,7 +283,6 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
}
}
apply_boost:
sg_cpu->iowait_boost_pending = false;
/*
@@ -351,11 +327,11 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
ignore_dl_rate_limit(sg_cpu);
if (!sugov_should_update_freq(sg_cpu->sg_policy, time, flags))
if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
sugov_get_util(sg_cpu);
sugov_iowait_apply(sg_cpu, time, max_cap, flags);
sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -390,7 +366,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
sg_policy->cached_raw_freq = cached_freq;
}
if (!sugov_update_next_freq(sg_policy, time, next_f, flags))
if (!sugov_update_next_freq(sg_policy, time, next_f))
return;
/*
@@ -442,12 +418,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
map_util_perf(sg_cpu->util), max_cap);
if (!unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
sg_cpu->sg_policy->last_freq_update_time = time;
sg_cpu->sg_policy->last_freq_update_time = time;
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
@@ -460,7 +434,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
sugov_get_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time, max_cap, flags);
sugov_iowait_apply(j_sg_cpu, time, max_cap);
util = max(j_sg_cpu->util, util);
}
@@ -482,10 +456,10 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time, flags)) {
next_f = sugov_next_freq_shared(sg_cpu, time, flags);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
if (!sugov_update_next_freq(sg_policy, time, next_f, flags))
if (!sugov_update_next_freq(sg_policy, time, next_f))
goto unlock;
if (sg_policy->policy->fast_switch_enabled)

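Net effect on schedutil: with SCHED_CPUFREQ_FORCE_UPDATE gone, every request is again gated on freq_update_delay_ns and each accepted request refreshes last_freq_update_time. A compilable sketch of the restored rate-limit check (the struct below only mimics sugov_policy, it is not the kernel type):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sg_policy_model {
        uint64_t last_freq_update_time;   /* ns */
        uint64_t freq_update_delay_ns;    /* derived from the driver's transition delay */
        bool     limits_changed;          /* policy min/max changed */
};

/* Mirrors the shape of sugov_should_update_freq() after the revert. */
static bool should_update_freq(struct sg_policy_model *sg, uint64_t now)
{
        if (sg->limits_changed) {
                sg->limits_changed = false;
                return true;
        }
        /* No force-update escape hatch any more: strict rate limiting. */
        return now - sg->last_freq_update_time >= sg->freq_update_delay_ns;
}

int main(void)
{
        struct sg_policy_model sg = { .last_freq_update_time = 1000000,
                                      .freq_update_delay_ns  = 500000 };

        printf("%d\n", should_update_freq(&sg, 1200000));  /* 0: still inside the delay window */
        printf("%d\n", should_update_freq(&sg, 1600000));  /* 1: delay has elapsed */
        return 0;
}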
kernel/sched/deadline.c

@@ -239,6 +239,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline
@@ -251,6 +253,8 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline

kernel/sched/fair.c

@@ -3950,6 +3950,29 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
* a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
* number include things like RT tasks.
*
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
}
#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
@@ -4567,6 +4590,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -4595,6 +4620,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -4610,7 +4637,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now = cfs_rq_clock_pelt(cfs_rq);
unsigned long prev_util_avg = cfs_rq->avg.util_avg;
int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
@@ -4619,8 +4646,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
cfs_rq->decayed |= update_cfs_rq_load_avg(now, cfs_rq);
cfs_rq->decayed |= propagate_entity_load_avg(se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
@@ -4641,19 +4668,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
} else if (cfs_rq->decayed && (flags & UPDATE_TG)) {
update_tg_load_avg(cfs_rq);
}
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
/*
* This field is used to indicate whether a trigger of cpufreq update
* is required. When the CPU is saturated, other load signals could
* still be changing, but util_avg would have settled down, so ensure
* that we don't trigger unnecessary updates as from fair policy point
* of view, nothing has changed to cause a cpufreq update.
*/
if (cfs_rq->decayed && prev_util_avg == cfs_rq->avg.util_avg)
cfs_rq->decayed = false;
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq);
}
}
/*
@@ -5035,6 +5055,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
cfs_rq_util_change(cfs_rq, 0);
}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -6621,6 +6642,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
*/
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -9230,6 +9259,10 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
unsigned long thermal_pressure;
bool decayed;
/*
* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
* DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
@@ -12605,7 +12638,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
cpufreq_update_util(rq, 0);
task_tick_core(rq, curr);
}
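The fair.c hunks above return to the pre-consolidation flow: cpufreq is kicked from the PELT update path itself, only when the root cfs_rq's averages have actually decayed, plus an explicit IOWAIT kick when an iowait task is enqueued. A reduced model of that call chain (illustrative; the *_model types and cpufreq_kick() are invented for this sketch):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT (1U << 0)

struct cfs_rq_model;
struct rq_model     { struct cfs_rq_model *root_cfs; };
struct cfs_rq_model { struct rq_model *rq; bool is_root; };

static void cpufreq_kick(struct rq_model *rq, unsigned int flags)
{
        (void)rq;
        printf("cpufreq update, flags=%#x\n", flags);  /* stand-in for cpufreq_update_util() */
}

static void cfs_rq_util_change(struct cfs_rq_model *cfs_rq, unsigned int flags)
{
        if (cfs_rq->is_root)            /* only the root cfs_rq drives frequency */
                cpufreq_kick(cfs_rq->rq, flags);
}

static void update_load_avg(struct cfs_rq_model *cfs_rq, bool decayed)
{
        if (decayed)                    /* PELT averages moved since the last call */
                cfs_rq_util_change(cfs_rq, 0);
}

int main(void)
{
        struct rq_model rq;
        struct cfs_rq_model root = { .rq = &rq, .is_root = true };

        rq.root_cfs = &root;
        cpufreq_kick(&rq, SCHED_CPUFREQ_IOWAIT);  /* enqueue of a task that was in iowait */
        update_load_avg(&root, true);             /* ordinary decay-driven update */
        return 0;
}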

kernel/sched/rt.c

@@ -606,8 +606,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se)
if (!rt_se) {
dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se, 0);
}
@@ -1130,6 +1133,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1;
}
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
}
#if defined CONFIG_SMP

kernel/sched/sched.h

@@ -622,9 +622,7 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
/*
* Store whether last update_load_avg() has decayed
*/
/* Unused, only kept here to preserve the KMI after revert. */
bool decayed;
#ifdef CONFIG_FAIR_GROUP_SCHED