From eab28aad5261542a27b79a7f91ae442a49bdbbf6 Mon Sep 17 00:00:00 2001
From: William McVicker
Date: Sat, 1 Jun 2024 00:54:05 +0000
Subject: [PATCH] Revert "FROMLIST: sched: Consolidate cpufreq updates"

This partially reverts commit b329d0289ff347c66ad7ec60995ef79679d0e924.
The cfs_rq->decayed addition is kept to preserve the KMI until the next
KMI update.

Reason for revert: This change causes major perf issues.

Bug: 343389622
Change-Id: I496da55f509a3c1a7ba8f869c8db43d9509fc547
Signed-off-by: William McVicker
[cmllamas: kept cfs_rq->decayed addition to preserve the KMI]
Signed-off-by: Carlos Llamas
---
 include/linux/sched/cpufreq.h    |  3 +-
 kernel/sched/core.c              | 95 ++------------------------------
 kernel/sched/cpufreq_schedutil.c | 54 +++++-------------------
 kernel/sched/deadline.c          |  4 ++
 kernel/sched/fair.c              | 64 +++++++++++++++------
 kernel/sched/rt.c                |  8 ++-
 kernel/sched/sched.h             |  4 +-
 7 files changed, 81 insertions(+), 151 deletions(-)

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 2d0a45aba16f..bdd31ab93bc5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -8,8 +8,7 @@
  * Interface between cpufreq drivers and the scheduler:
  */

-#define SCHED_CPUFREQ_IOWAIT	(1U << 0)
-#define SCHED_CPUFREQ_FORCE_UPDATE	(1U << 1) /* ignore transition_delay_us */
+#define SCHED_CPUFREQ_IOWAIT	(1U << 0)

 #ifdef CONFIG_CPU_FREQ
 struct cpufreq_policy;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 74015af12b65..495ee0223833 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -166,9 +166,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

 __read_mostly int scheduler_running;

-static __always_inline void
-update_cpufreq_ctx_switch(struct rq *rq, struct task_struct *prev);
-
 #ifdef CONFIG_SCHED_CORE

 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
@@ -1985,7 +1982,7 @@ static bool uclamp_reset(const struct sched_attr *attr,
 	return false;
 }

-static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
+static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr)
 {
 	enum uclamp_id clamp_id;
@@ -2007,6 +2004,7 @@ static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
 			value = uclamp_none(clamp_id);

 		uclamp_se_set(uc_se, value, false);
+
 	}

 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -2025,13 +2023,6 @@ static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
 			      attr->sched_util_max, true);
 		trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
 	}
-
-	/*
-	 * Updating uclamp values has impact on freq, ensure it is taken into
-	 * account.
-	 */
-	if (task_current(rq, p))
-		update_cpufreq_ctx_switch(rq, NULL);
 }

 static void uclamp_fork(struct task_struct *p)
@@ -2106,7 +2097,7 @@ static inline int uclamp_validate(struct task_struct *p,
 {
 	return -EOPNOTSUPP;
 }
-static void __setscheduler_uclamp(struct rq *rq, struct task_struct *p,
+static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr) { }
 static inline void uclamp_fork(struct task_struct *p) { }
 static inline void uclamp_post_fork(struct task_struct *p) { }
@@ -2270,13 +2261,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 			prev_class->switched_from(rq, p);

 		p->sched_class->switched_to(rq, p);
-
-		/*
-		 * Changing policies could imply requiring to send cpufreq
-		 * update.
-		 */
-		if (task_current(rq, p))
-			update_cpufreq_ctx_switch(rq, NULL);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
@@ -5231,68 +5215,6 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea

 #endif

-static __always_inline void
-update_cpufreq_ctx_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_CPU_FREQ
-	/*
-	 * RT and DL should always send a freq update. But we can do some
-	 * simple checks to avoid it when we know it's not necessary.
-	 *
-	 * iowait_boost will always trigger a freq update too.
-	 *
-	 * Fair tasks will only trigger an update if the root cfs_rq has
-	 * decayed.
-	 *
-	 * Everything else should do nothing.
-	 */
-	switch (current->policy) {
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-		if (unlikely(current->in_iowait)) {
-			cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT | SCHED_CPUFREQ_FORCE_UPDATE);
-			return;
-		}
-
-#ifdef CONFIG_SMP
-		if (unlikely(rq->cfs.decayed)) {
-			rq->cfs.decayed = false;
-			cpufreq_update_util(rq, 0);
-			return;
-		}
-#endif
-		return;
-	case SCHED_FIFO:
-	case SCHED_RR:
-		if (prev && rt_policy(prev->policy)) {
-#ifdef CONFIG_UCLAMP_TASK
-			unsigned long curr_uclamp_min = uclamp_eff_value(current, UCLAMP_MIN);
-			unsigned long prev_uclamp_min = uclamp_eff_value(prev, UCLAMP_MIN);
-
-			if (curr_uclamp_min == prev_uclamp_min)
-#endif
-				return;
-		}
-#ifdef CONFIG_SMP
-		/* Stopper task masquerades as RT */
-		if (unlikely(current->sched_class == &stop_sched_class))
-			return;
-#endif
-		cpufreq_update_util(rq, SCHED_CPUFREQ_FORCE_UPDATE);
-		return;
-	case SCHED_DEADLINE:
-		if (current->dl.flags & SCHED_FLAG_SUGOV) {
-			/* Ignore sugov kthreads, they're responding to our requests */
-			return;
-		}
-		cpufreq_update_util(rq, SCHED_CPUFREQ_FORCE_UPDATE);
-		return;
-	default:
-		return;
-	}
-#endif
-}
-
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -5310,7 +5232,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
 #endif
 }

-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+static inline void finish_lock_switch(struct rq *rq)
 {
 	/*
 	 * If we are tracking spinlock dependencies then we have to
@@ -5319,11 +5241,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
 	__balance_callbacks(rq);
-	/*
-	 * Request freq update after __balance_callbacks to take into account
-	 * any changes to rq.
-	 */
-	update_cpufreq_ctx_switch(rq, prev);
 	raw_spin_rq_unlock_irq(rq);
 }

@@ -5442,7 +5359,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	perf_event_task_sched_in(prev, current);
 	finish_task(prev);
 	tick_nohz_task_switch();
-	finish_lock_switch(rq, prev);
+	finish_lock_switch(rq);
 	finish_arch_post_lock_switch();
 	kcov_finish_switch(current);
 	/*
@@ -8027,7 +7944,7 @@ change:
 		__setscheduler_prio(p, newprio);
 		trace_android_rvh_setscheduler(p);
 	}
-	__setscheduler_uclamp(rq, p, attr);
+	__setscheduler_uclamp(p, attr);

 	if (queued) {
 		/*
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index b8084433577e..09d5326a2f85 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -61,8 +61,7 @@ static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

 /************************ Governor internals ***********************/

-static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
-				     unsigned int flags)
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 {
 	s64 delta_ns;

@@ -90,16 +89,13 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
 		return true;
 	}

-	if (unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
-		return true;
-
 	delta_ns = time - sg_policy->last_freq_update_time;

 	return delta_ns >= sg_policy->freq_update_delay_ns;
 }

 static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
-				   unsigned int next_freq, unsigned int flags)
+				   unsigned int next_freq)
 {
 	if (sg_policy->need_freq_update)
 		sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
@@ -107,9 +103,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
 		return false;

 	sg_policy->next_freq = next_freq;
-
-	if (!unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
-		sg_policy->last_freq_update_time = time;
+	sg_policy->last_freq_update_time = time;

 	return true;
 }
@@ -221,10 +215,9 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 			       unsigned int flags)
 {
 	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
-	bool forced_update = flags & SCHED_CPUFREQ_FORCE_UPDATE;

 	/* Reset boost if the CPU appears to have been idle enough */
-	if (sg_cpu->iowait_boost && !forced_update &&
+	if (sg_cpu->iowait_boost &&
 	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
 		return;

@@ -267,35 +260,19 @@
  * being more conservative on tasks which does sporadic IO operations.
  */
 static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
-			       unsigned long max_cap, unsigned int flags)
+			       unsigned long max_cap)
 {
-	bool forced_update = flags & SCHED_CPUFREQ_FORCE_UPDATE;
-	s64 delta_ns = time - sg_cpu->last_update;
 	unsigned long boost;

 	/* No boost currently required */
 	if (!sg_cpu->iowait_boost)
 		return;

-	if (forced_update)
-		goto apply_boost;
-
 	/* Reset boost if the CPU appears to have been idle enough */
 	if (sugov_iowait_reset(sg_cpu, time, false))
 		return;

 	if (!sg_cpu->iowait_boost_pending) {
-		/*
-		 * This logic relied on PELT signal decays happening once every
-		 * 1ms. But due to changes to how updates are done now, we can
-		 * end up with more request coming up leading to iowait boost
-		 * to be prematurely reduced. Make the assumption explicit
-		 * until we improve the iowait boost logic to be better in
-		 * general as it is due for an overhaul.
-		 */
-		if (delta_ns <= NSEC_PER_MSEC)
-			goto apply_boost;
-
 		/*
 		 * No boost pending; reduce the boost value.
 		 */
@@ -306,7 +283,6 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
 		}
 	}

-apply_boost:
 	sg_cpu->iowait_boost_pending = false;

 	/*
@@ -351,11 +327,11 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,

 	ignore_dl_rate_limit(sg_cpu);

-	if (!sugov_should_update_freq(sg_cpu->sg_policy, time, flags))
+	if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
 		return false;

 	sugov_get_util(sg_cpu);
-	sugov_iowait_apply(sg_cpu, time, max_cap, flags);
+	sugov_iowait_apply(sg_cpu, time, max_cap);

 	return true;
 }
@@ -390,7 +366,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 		sg_policy->cached_raw_freq = cached_freq;
 	}

-	if (!sugov_update_next_freq(sg_policy, time, next_f, flags))
+	if (!sugov_update_next_freq(sg_policy, time, next_f))
 		return;

 	/*
@@ -442,12 +418,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
 				   map_util_perf(sg_cpu->util), max_cap);

-	if (!unlikely(flags & SCHED_CPUFREQ_FORCE_UPDATE))
-		sg_cpu->sg_policy->last_freq_update_time = time;
+	sg_cpu->sg_policy->last_freq_update_time = time;
 }

-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
-					   unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 {
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	struct cpufreq_policy *policy = sg_policy->policy;
@@ -460,7 +434,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time,
 		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);

 		sugov_get_util(j_sg_cpu);
-		sugov_iowait_apply(j_sg_cpu, time, max_cap, flags);
+		sugov_iowait_apply(j_sg_cpu, time, max_cap);

 		util = max(j_sg_cpu->util, util);
 	}
@@ -482,10 +456,10 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)

 	ignore_dl_rate_limit(sg_cpu);

-	if (sugov_should_update_freq(sg_policy, time, flags)) {
-		next_f = sugov_next_freq_shared(sg_cpu, time, flags);
+	if (sugov_should_update_freq(sg_policy, time)) {
+		next_f = sugov_next_freq_shared(sg_cpu, time);

-		if (!sugov_update_next_freq(sg_policy, time, next_f, flags))
+		if (!sugov_update_next_freq(sg_policy, time, next_f))
 			goto unlock;

 		if (sg_policy->policy->fast_switch_enabled)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3dbdb05f32b1..d78f2e8769fb 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -239,6 +239,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	dl_rq->running_bw += dl_bw;
 	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
 	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
 }

 static inline
@@ -251,6 +253,8 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
 	if (dl_rq->running_bw > old)
 		dl_rq->running_bw = 0;
+	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
 }

 static inline
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20461354a8e6..c44306ae9008 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3950,6 +3950,29 @@ static inline void update_cfs_group(struct sched_entity *se)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */

+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
+{
+	struct rq *rq = rq_of(cfs_rq);
+
+	if (&rq->cfs == cfs_rq) {
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util_cfs().
+		 */
+		cpufreq_update_util(rq, flags);
+	}
+}
+
 #ifdef CONFIG_SMP
 static inline bool load_avg_is_decayed(struct sched_avg *sa)
 {
@@ -4567,6 +4590,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

 	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

+	cfs_rq_util_change(cfs_rq, 0);
+
 	trace_pelt_cfs_tp(cfs_rq);
 }

@@ -4595,6 +4620,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

 	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

+	cfs_rq_util_change(cfs_rq, 0);
+
 	trace_pelt_cfs_tp(cfs_rq);
 }

@@ -4610,7 +4637,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	u64 now = cfs_rq_clock_pelt(cfs_rq);
-	unsigned long prev_util_avg = cfs_rq->avg.util_avg;
+	int decayed;

 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
@@ -4619,8 +4646,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
 		__update_load_avg_se(now, cfs_rq, se);

-	cfs_rq->decayed |= update_cfs_rq_load_avg(now, cfs_rq);
-	cfs_rq->decayed |= propagate_entity_load_avg(se);
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
+	decayed |= propagate_entity_load_avg(se);

 	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {

@@ -4641,19 +4668,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 		 */
 		detach_entity_load_avg(cfs_rq, se);
 		update_tg_load_avg(cfs_rq);
-	} else if (cfs_rq->decayed && (flags & UPDATE_TG)) {
-		update_tg_load_avg(cfs_rq);
-	}
+	} else if (decayed) {
+		cfs_rq_util_change(cfs_rq, 0);

-	/*
-	 * This field is used to indicate whether a trigger of cpufreq update
-	 * is required. When the CPU is saturated, other load signals could
-	 * still be changing, but util_avg would have settled down, so ensure
-	 * that we don't trigger unnecessary updates as from fair policy point
-	 * of view, nothing has changed to cause a cpufreq update.
-	 */
-	if (cfs_rq->decayed && prev_util_avg == cfs_rq->avg.util_avg)
-		cfs_rq->decayed = false;
+		if (flags & UPDATE_TG)
+			update_tg_load_avg(cfs_rq);
+	}
 }

 /*
@@ -5035,6 +5055,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)

 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
+	cfs_rq_util_change(cfs_rq, 0);
 }

 static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -6621,6 +6642,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	util_est_enqueue(&rq->cfs, p);

+	/*
+	 * If in_iowait is set, the code below may not trigger any cpufreq
+	 * utilization updates, so do it here explicitly with the IOWAIT flag
+	 * passed.
+	 */
+	if (p->in_iowait)
+		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -9230,6 +9259,10 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
 	unsigned long thermal_pressure;
 	bool decayed;

+	/*
+	 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+	 * DL and IRQ signals have been updated before updating CFS.
+	 */
 	curr_class = rq->curr->sched_class;

 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
@@ -12605,7 +12638,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

 	update_misfit_status(curr, rq);
 	update_overutilized_status(task_rq(curr));
-	cpufreq_update_util(rq, 0);

 	task_tick_core(rq, curr);
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8a1dc19972c..75022f6e9796 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -606,8 +606,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)

 	rt_se = rt_rq->tg->rt_se[cpu];

-	if (!rt_se)
+	if (!rt_se) {
 		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
+		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+	}
 	else if (on_rt_rq(rt_se))
 		dequeue_rt_entity(rt_se, 0);
 }
@@ -1130,6 +1133,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 		add_nr_running(rq, rt_rq->rt_nr_running);
 		rt_rq->rt_queued = 1;
 	}
+
+	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq, 0);
 }

 #if defined CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 454469eab06d..00c1743e405c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -622,9 +622,7 @@ struct cfs_rq {
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */

-	/*
-	 * Store whether last update_load_avg() has decayed
-	 */
+	/* Unused, only kept here to preserve the KMI after revert. */
 	bool decayed;

 #ifdef CONFIG_FAIR_GROUP_SCHED