linux-yocto/kernel/sched/syscalls.c
Felix Moessbauer ed4fb6d7ef hrtimer: Use and report correct timerslack values for realtime tasks
The timerslack_ns setting is used to specify how much the hardware
timers should be delayed, to potentially dispatch multiple timers in a
single interrupt. This is a performance optimization. Timers of
realtime tasks (having a realtime scheduling policy) should not be
delayed.

This logic was inconsistently applied to the hrtimers, leading to delays
of realtime tasks which used timed waits for events (e.g. condition
variables). Due to the downstream override of the slack for rt tasks,
the procfs reported incorrect (non-zero) timerslack_ns values.

This is changed by setting the timer_slack_ns task attribute to 0 for
all tasks with a rt policy. By that, downstream users do not need to
specially handle rt tasks (w.r.t. the slack), and the procfs entry
shows the correct value of "0". Setting non-zero slack values (either
via procfs or PR_SET_TIMERSLACK) on tasks with a rt policy is ignored,
as stated in "man 2 PR_SET_TIMERSLACK":

  Timer slack is not applied to threads that are scheduled under a
  real-time scheduling policy (see sched_setscheduler(2)).

The special handling of timerslack on rt tasks in downstream users
is removed as well.

Signed-off-by: Felix Moessbauer <felix.moessbauer@siemens.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240814121032.368444-2-felix.moessbauer@siemens.com
2024-08-23 20:13:02 +02:00

1708 lines
41 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/syscalls.c
*
* Core kernel scheduler syscalls related code
*
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
#include <linux/sched.h>
#include <linux/cpuset.h>
#include <linux/sched/debug.h>
#include <uapi/linux/sched/types.h>
#include "sched.h"
#include "autogroup.h"
/*
 * Map a (policy, RT priority, nice) triple onto the kernel priority
 * scale: deadline policies get MAX_DL_PRIO - 1, realtime policies count
 * down from MAX_RT_PRIO - 1 by @rt_prio, and every other policy derives
 * its priority from @nice.
 */
static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	if (dl_policy(policy))
		return MAX_DL_PRIO - 1;

	if (rt_policy(policy))
		return MAX_RT_PRIO - 1 - rt_prio;

	return NICE_TO_PRIO(nice);
}
/*
 * Compute the expected normal priority of @p, i.e. the priority that
 * follows from its policy, RT priority and nice level alone, without
 * taking RT-inheritance into account.
 */
static inline int normal_prio(struct task_struct *p)
{
	int nice = PRIO_TO_NICE(p->static_prio);

	return __normal_prio(p->policy, p->rt_priority, nice);
}
/*
 * Refresh p->normal_prio and return the priority the scheduler should
 * use for @p. A task whose current priority lies in the RT range (it is
 * an RT task or it was boosted to RT priority) keeps that priority; any
 * other task gets its freshly computed normal priority.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);

	/* RT, or boosted to RT: leave the current priority unchanged. */
	if (rt_prio(p->prio))
		return p->prio;

	return p->normal_prio;
}
/*
 * set_user_nice - change the nice value of a task.
 * @p: task to renice
 * @nice: requested nice value
 *
 * Does nothing when @nice is outside [MIN_NICE, MAX_NICE] or already
 * equals the task's nice value.
 */
void set_user_nice(struct task_struct *p, long nice)
{
	bool was_queued, was_running;
	struct rq *rq;
	int prev_prio;

	if (nice < MIN_NICE || nice > MAX_NICE || task_nice(p) == nice)
		return;

	/*
	 * When called from sys_setpriority() the task might be in the
	 * middle of scheduling on another CPU, so work under its rq lock.
	 */
	CLASS(task_rq_lock, rq_guard)(p);
	rq = rq_guard.rq;

	update_rq_clock(rq);

	/*
	 * Deadline and realtime tasks get their priority through
	 * sched_setscheduler(). The nice value may still be set for them,
	 * but only static_prio is recorded here and nothing else changes.
	 */
	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		return;
	}

	was_queued = task_on_rq_queued(p);
	was_running = task_current(rq, p);

	if (was_queued)
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	if (was_running)
		put_prev_task(rq, p);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p, true);
	prev_prio = p->prio;
	p->prio = effective_prio(p);

	if (was_queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (was_running)
		set_next_task(rq, p);

	/*
	 * Let the scheduling class react to the priority change; it
	 * reschedules the CPU if the task raised its priority, or is
	 * running and lowered it.
	 */
	p->sched_class->prio_changed(rq, p, prev_prio);
}
EXPORT_SYMBOL(set_user_nice);
/*
 * is_nice_reduction - check whether a nice value is within RLIMIT_NICE
 * @p: task
 * @nice: nice value
 *
 * Like can_nice(), but without the capability check.
 */
static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
	/* Nice values [19,-20] correspond to rlimit-style values [1,40]. */
	const int wanted = nice_to_rlimit(nice);

	return wanted <= task_rlimit(p, RLIMIT_NICE);
}
/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 *
 * Allowed when RLIMIT_NICE permits @nice; otherwise the caller needs
 * CAP_SYS_NICE. The capability is only tested when the rlimit check fails.
 */
int can_nice(const struct task_struct *p, const int nice)
{
	if (is_nice_reduction(p, nice))
		return 1;

	return capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long new_nice, err;

	/*
	 * A concurrent setpriority may change our priority at the same
	 * moment. That is fine: conceptually one of the two calls happens
	 * first and there is a single winner.
	 */
	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
	new_nice = task_nice(current) + increment;
	new_nice = clamp_val(new_nice, MIN_NICE, MAX_NICE);

	/* Only a negative increment needs the can_nice() permission check. */
	if (increment < 0 && !can_nice(current, new_nice))
		return -EPERM;

	err = security_task_setnice(current, new_nice);
	if (!err)
		set_user_nice(current, new_nice);

	return err;
}
#endif
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 *
 * sched policy           return value    kernel prio     user prio/nice
 *
 * normal, batch, idle    [0 ... 39]      [100 ... 139]   0/[-20 ... 19]
 * fifo, rr               [-2 ... -100]   [98 ... 0]      [1 ... 99]
 * deadline               -101            -1              0
 */
int task_prio(const struct task_struct *p)
{
	int user_visible = p->prio - MAX_RT_PRIO;

	return user_visible;
}
/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * A CPU counts as idle when its runqueue is running the idle task, has
 * no runnable tasks and (on SMP) has no wakeups pending.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
int idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (rq->curr != rq->idle || rq->nr_running)
		return 0;

#ifdef CONFIG_SMP
	if (rq->ttwu_pending)
		return 0;
#endif

	return 1;
}
/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle and its vCPU is not reported
 * as preempted. 0 otherwise.
 */
int available_idle_cpu(int cpu)
{
	return idle_cpu(cpu) && !vcpu_is_preempted(cpu);
}
/**
 * idle_task - return the idle task for a given CPU.
 * @cpu: the processor in question.
 *
 * Reads the ->idle pointer of the CPU's runqueue.
 *
 * Return: The idle task for the CPU @cpu.
 */
struct task_struct *idle_task(int cpu)
{
return cpu_rq(cpu)->idle;
}
#ifdef CONFIG_SCHED_CORE
/*
 * sched_core_idle_cpu - idle check that is aware of core scheduling.
 * @cpu: the processor in question.
 *
 * When core scheduling is enabled for the CPU's runqueue and the idle
 * task is current, the CPU is reported idle without the further checks
 * idle_cpu() performs. In every other case the result of idle_cpu() is
 * returned.
 */
int sched_core_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (!sched_core_enabled(rq) || rq->curr != rq->idle)
		return idle_cpu(cpu);

	return 1;
}
#endif
#ifdef CONFIG_SMP
/*
 * Compute an effective utilization for the given CPU, to be used for
 * frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks these metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * The cfs, rt and dl utilization numbers use the same metric and
 * synchronized windows, so they are directly comparable. They are running
 * times measured with rq->clock_task, which excludes things like IRQ and
 * steal time; those are accrued in the IRQ utilization instead.
 *
 * The DL bandwidth, on the other hand, is not measured: it is computed
 * from the task model parameters and gives the minimal utilization
 * required to meet deadlines.
 *
 * @min and @max are optional outputs (either may be NULL) receiving the
 * minimum performance requirement and the maximum hint respectively.
 */
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
				 unsigned long *min,
				 unsigned long *max)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long capacity = arch_scale_cpu_capacity(cpu);
	unsigned long irq_util = cpu_util_irq(rq);
	unsigned long total;

	/*
	 * Bail out early if IRQ/steal time alone saturates the CPU. This
	 * can happen because of inaccuracies in how these are tracked --
	 * see update_irq_load_avg().
	 */
	if (unlikely(irq_util >= capacity)) {
		if (min)
			*min = capacity;
		if (max)
			*max = capacity;
		return capacity;
	}

	if (min) {
		/*
		 * The minimum utilization is the larger of:
		 * - the DL bandwidth needed, plus the IRQ pressure that
		 *   steals time from the deadline task;
		 * - the minimum performance requirement for CFS and/or RT.
		 */
		*min = max(irq_util + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));

		/*
		 * Without uclamp, a runnable RT task must be able to run at
		 * maximum compute capacity.
		 */
		if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
			*min = max(*min, capacity);
	}

	/*
	 * Time spent on RT/DL tasks shows up as 'lost' time to CFS tasks,
	 * and all three use the same metric (PELT windows are synchronized),
	 * so they can simply be added to get the CPU's actual utilization.
	 */
	total = util_cfs + cpu_util_rt(rq) + cpu_util_dl(rq);

	/*
	 * The maximum hint is a soft bandwidth requirement, which can be
	 * lower than the actual utilization because of uclamp_max
	 * requirements.
	 */
	if (max)
		*max = min(capacity, uclamp_rq_get(rq, UCLAMP_MAX));

	if (total >= capacity)
		return capacity;

	/*
	 * There is still idle time; refine the number using the IRQ metric.
	 * IRQ/steal time is hidden from the task clock, so the task numbers
	 * have to be scaled:
	 *
	 *              max - irq
	 *   U' = irq + --------- * U
	 *                 max
	 */
	total = scale_irq_capacity(total, irq_util, capacity);
	total += irq_util;

	return min(capacity, total);
}
/*
 * Effective utilization of @cpu based on its CFS utilization; the
 * min/max outputs of effective_cpu_util() are not requested.
 */
unsigned long sched_cpu_util(int cpu)
{
	unsigned long util_cfs = cpu_util_cfs(cpu);

	return effective_cpu_util(cpu, util_cfs, NULL, NULL);
}
#endif /* CONFIG_SMP */
/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question; 0 selects the calling task.
 *
 * Return: The task of @pid, if found. %NULL otherwise.
 */
static struct task_struct *find_process_by_pid(pid_t pid)
{
	if (!pid)
		return current;

	return find_task_by_vpid(pid);
}
/*
 * Look up the task for @pid under RCU and, when one is found, take a
 * reference on it with get_task_struct() before returning it.
 */
static struct task_struct *find_get_task(pid_t pid)
{
	struct task_struct *task;

	guard(rcu)();

	task = find_process_by_pid(pid);
	if (likely(task))
		get_task_struct(task);

	return task;
}
/*
 * CLASS(find_get_task, p)(pid) wrapper around find_get_task(): the
 * initializer looks up the task for @pid, and the cleanup expression
 * calls put_task_struct() on it when the lookup was non-NULL.
 */
DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
find_get_task(pid), pid_t pid)
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
*/
#define SETPARAM_POLICY -1
/*
 * Store the scheduling parameters from @attr in @p: policy, deadline
 * parameters or static priority, timer slack, RT priority, normal
 * priority and load weight.
 */
static void __setscheduler_params(struct task_struct *p,
				  const struct sched_attr *attr)
{
	int new_policy = attr->sched_policy;

	/* SETPARAM_POLICY means: keep the policy the task already has. */
	if (new_policy == SETPARAM_POLICY)
		new_policy = p->policy;

	p->policy = new_policy;

	if (dl_policy(new_policy))
		__setparam_dl(p, attr);
	else if (fair_policy(new_policy))
		p->static_prio = NICE_TO_PRIO(attr->sched_nice);

	/*
	 * rt-policy tasks do not have a timerslack. When the task is not
	 * realtime and its slack is zero, restore the default slack (this
	 * is the case after switching back from an rt policy).
	 */
	if (task_is_realtime(p))
		p->timer_slack_ns = 0;
	else if (!p->timer_slack_ns)
		p->timer_slack_ns = p->default_timer_slack_ns;

	/*
	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
	 * !rt_policy. Always setting this ensures that things like
	 * getparam()/getattr() don't report silly values for !rt tasks.
	 */
	p->rt_priority = attr->sched_priority;
	p->normal_prio = normal_prio(p);
	set_load_weight(p, true);
}
/*
 * Check whether the current process's effective UID matches either the
 * effective UID or the UID of the target process @p.
 */
static bool check_same_owner(struct task_struct *p)
{
	const struct cred *mine = current_cred();
	const struct cred *theirs;

	guard(rcu)();

	theirs = __task_cred(p);

	return uid_eq(mine->euid, theirs->euid) ||
	       uid_eq(mine->euid, theirs->uid);
}
#ifdef CONFIG_UCLAMP_TASK
/*
 * Validate the utilization clamp values requested in @attr against the
 * values @p currently has.
 *
 * Return: 0 if the request is valid, -EINVAL otherwise.
 */
static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	int req_min = p->uclamp_req[UCLAMP_MIN].value;
	int req_max = p->uclamp_req[UCLAMP_MAX].value;

	/*
	 * The "+ 1" on both sides of the range checks lets a requested
	 * value of -1 pass, while anything above SCHED_CAPACITY_SCALE is
	 * rejected.
	 */
	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
		req_min = attr->sched_util_min;

		if (req_min + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
		req_max = attr->sched_util_max;

		if (req_max + 1 > SCHED_CAPACITY_SCALE + 1)
			return -EINVAL;
	}

	/* With both values set (not -1), min must not exceed max. */
	if (req_min != -1 && req_max != -1 && req_min > req_max)
		return -EINVAL;

	/*
	 * We have valid uclamp attributes; make sure uclamp is enabled.
	 *
	 * We need to do that here, because enabling static branches is a
	 * blocking operation which obviously cannot be done while holding
	 * scheduler locks.
	 */
	static_branch_enable(&sched_uclamp_used);

	return 0;
}
/*
 * Decide whether the clamp @clamp_id, whose requested value is held in
 * @uc_se, has to be reset to its default for the request in @attr.
 */
static bool uclamp_reset(const struct sched_attr *attr,
			 enum uclamp_id clamp_id,
			 struct uclamp_se *uc_se)
{
	/* Reset on sched class change for a non user-defined clamp value. */
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
	    !uc_se->user_defined)
		return true;

	/* Reset on sched_util_{min,max} == -1. */
	switch (clamp_id) {
	case UCLAMP_MIN:
		return (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) &&
		       attr->sched_util_min == -1;
	case UCLAMP_MAX:
		return (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) &&
		       attr->sched_util_max == -1;
	default:
		return false;
	}
}
/*
 * Update the requested utilization clamps of @p: first reset every clamp
 * uclamp_reset() selects to its default, then store the user-provided
 * values from @attr, if any.
 */
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		struct uclamp_se *req = &p->uclamp_req[clamp_id];
		unsigned int value;

		if (!uclamp_reset(attr, clamp_id, req))
			continue;

		/*
		 * RT tasks by default have a 100% boost value that could be
		 * modified at runtime.
		 */
		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
			value = sysctl_sched_uclamp_util_min_rt_default;
		else
			value = uclamp_none(clamp_id);

		uclamp_se_set(req, value, false);
	}

	/* Nothing more to do unless the caller supplied clamp values. */
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
		return;

	/* A value of -1 was handled by the reset pass above. */
	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) &&
	    attr->sched_util_min != -1)
		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
			      attr->sched_util_min, true);

	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) &&
	    attr->sched_util_max != -1)
		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
			      attr->sched_util_max, true);
}
#else /* !CONFIG_UCLAMP_TASK: */
/*
 * Stub for builds without CONFIG_UCLAMP_TASK: any request carrying
 * utilization clamp values is rejected with -EOPNOTSUPP.
 */
static inline int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
return -EOPNOTSUPP;
}
/* Stub for builds without CONFIG_UCLAMP_TASK: there is nothing to update. */
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
#endif
/*
 * Permission check for a scheduling change requested by a user.
 *
 * Allow unprivileged RT tasks to decrease priority.
 * Only issue a capable test if needed and only once to avoid an audit
 * event on permitted non-privileged operations.
 *
 * Return: 0 if the change is permitted, -EPERM otherwise.
 */
static int user_check_sched_setscheduler(struct task_struct *p,
					 const struct sched_attr *attr,
					 int policy, int reset_on_fork)
{
	/* Lowering the nice value beyond what RLIMIT_NICE allows: */
	if (fair_policy(policy) &&
	    attr->sched_nice < task_nice(p) &&
	    !is_nice_reduction(p, attr->sched_nice))
		goto req_priv;

	if (rt_policy(policy)) {
		unsigned long rtprio_limit = task_rlimit(p, RLIMIT_RTPRIO);

		/* Can't set/change the rt policy: */
		if (!rtprio_limit && policy != p->policy)
			goto req_priv;

		/* Can't increase priority: */
		if (attr->sched_priority > p->rt_priority &&
		    attr->sched_priority > rtprio_limit)
			goto req_priv;
	}

	/*
	 * Can't set/change SCHED_DEADLINE policy at all for now
	 * (safest behavior); in the future we would like to allow
	 * unprivileged DL tasks to increase their relative deadline
	 * or reduce their runtime (both ways reducing utilization)
	 */
	if (dl_policy(policy))
		goto req_priv;

	/*
	 * Treat SCHED_IDLE as nice 20. Only allow a switch to
	 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
	 */
	if (task_has_idle_policy(p) && !idle_policy(policy) &&
	    !is_nice_reduction(p, task_nice(p)))
		goto req_priv;

	/* Can't change other user's priorities: */
	if (!check_same_owner(p))
		goto req_priv;

	/* Normal users shall not reset the sched_reset_on_fork flag: */
	if (p->sched_reset_on_fork && !reset_on_fork)
		goto req_priv;

	return 0;

req_priv:
	return capable(CAP_SYS_NICE) ? 0 : -EPERM;
}
/*
 * __sched_setscheduler - validate and apply a scheduling attribute change.
 * @p:    task to modify
 * @attr: requested policy, priority/nice, deadline and uclamp parameters;
 *        a negative sched_policy keeps the task's current policy
 * @user: apply the permission and security checks of a user request
 * @pi:   take priority boosting into account when computing the new
 *        priority and adjust the PI chain afterwards
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int __sched_setscheduler(struct task_struct *p,
			 const struct sched_attr *attr,
			 bool user, bool pi)
{
	int oldpolicy = -1, policy = attr->sched_policy;
	int retval, oldprio, newprio, queued, running;
	const struct sched_class *prev_class;
	struct balance_callback *head;
	struct rq_flags rf;
	int reset_on_fork;
	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq *rq;
	bool cpuset_locked = false;

	/* The pi code expects interrupts enabled */
	BUG_ON(pi && in_interrupt());
recheck:
	/* Double check policy once rq lock held: */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

		if (!valid_policy(policy))
			return -EINVAL;
	}

	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
		return -EINVAL;

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (attr->sched_priority > MAX_RT_PRIO-1)
		return -EINVAL;
	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
	    (rt_policy(policy) != (attr->sched_priority != 0)))
		return -EINVAL;

	if (user) {
		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
		if (retval)
			return retval;

		if (attr->sched_flags & SCHED_FLAG_SUGOV)
			return -EINVAL;

		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/* Update task specific "requested" clamps */
	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
		retval = uclamp_validate(p, attr);
		if (retval)
			return retval;
	}

	/*
	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
	 * information.
	 */
	if (dl_policy(policy) || dl_policy(p->policy)) {
		cpuset_locked = true;
		cpuset_lock();
	}

	/*
	 * Make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/*
	 * Changing the policy of the stop threads is a very bad idea:
	 */
	if (p == rq->stop) {
		retval = -EINVAL;
		goto unlock;
	}

	/*
	 * If not changing anything there's no need to proceed further,
	 * but store a possible modification of reset_on_fork.
	 */
	if (unlikely(policy == p->policy)) {
		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
			goto change;
		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
			goto change;
		if (dl_policy(policy) && dl_param_changed(p, attr))
			goto change;
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
			goto change;

		p->sched_reset_on_fork = reset_on_fork;
		retval = 0;
		goto unlock;
	}
change:

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Do not allow real-time tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			retval = -EPERM;
			goto unlock;
		}
#endif
#ifdef CONFIG_SMP
		if (dl_bandwidth_enabled() && dl_policy(policy) &&
				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
			cpumask_t *span = rq->rd->span;

			/*
			 * Don't allow tasks with an affinity mask smaller than
			 * the entire root_domain to become SCHED_DEADLINE. We
			 * will also fail if there's no bandwidth available.
			 */
			if (!cpumask_subset(span, p->cpus_ptr) ||
			    rq->rd->dl_bw.bw == 0) {
				retval = -EPERM;
				goto unlock;
			}
		}
#endif
	}

	/* Re-check policy now with rq lock held: */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &rf);
		if (cpuset_locked) {
			cpuset_unlock();
			/*
			 * The next pass only takes the cpuset lock again if a
			 * deadline policy is still involved, so forget that
			 * it was held; otherwise a later unlock would release
			 * a lock that is not held.
			 */
			cpuset_locked = false;
		}
		goto recheck;
	}

	/*
	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
	 * is available.
	 */
	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
		retval = -EBUSY;
		goto unlock;
	}

	p->sched_reset_on_fork = reset_on_fork;
	oldprio = p->prio;

	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
	if (pi) {
		/*
		 * Take priority boosted tasks into account. If the new
		 * effective priority is unchanged, we just store the new
		 * normal parameters and do not touch the scheduler class and
		 * the runqueue. This will be done when the task deboost
		 * itself.
		 */
		newprio = rt_effective_prio(p, newprio);
		if (newprio == oldprio)
			queue_flags &= ~DEQUEUE_MOVE;
	}

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);

	prev_class = p->sched_class;

	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
		__setscheduler_params(p, attr);
		__setscheduler_prio(p, newprio);
	}
	__setscheduler_uclamp(p, attr);

	if (queued) {
		/*
		 * Enqueue at the head when the kernel priority value went
		 * up, i.e. the task's priority was lowered from the user
		 * space point of view; otherwise enqueue at the tail.
		 */
		if (oldprio < p->prio)
			queue_flags |= ENQUEUE_HEAD;

		enqueue_task(rq, p, queue_flags);
	}
	if (running)
		set_next_task(rq, p);

	check_class_changed(rq, p, prev_class, oldprio);

	/* Avoid rq from going away on us: */
	preempt_disable();
	head = splice_balance_callbacks(rq);
	task_rq_unlock(rq, p, &rf);

	/*
	 * The cpuset lock is taken regardless of @pi, so it has to be
	 * released regardless of @pi too.
	 */
	if (cpuset_locked)
		cpuset_unlock();

	if (pi)
		rt_mutex_adjust_pi(p);

	/* Run balance callbacks after we've adjusted the PI chain: */
	balance_callbacks(rq, head);
	preempt_enable();

	return 0;

unlock:
	task_rq_unlock(rq, p, &rf);
	if (cpuset_locked)
		cpuset_unlock();
	return retval;
}
/*
 * Build a sched_attr from the legacy (policy, sched_param) pair and hand
 * it to __sched_setscheduler(). The nice value is taken from the task's
 * current static priority.
 */
static int _sched_setscheduler(struct task_struct *p, int policy,
			       const struct sched_param *param, bool check)
{
	struct sched_attr attr = {
		.sched_priority = param->sched_priority,
		.sched_nice	= PRIO_TO_NICE(p->static_prio),
	};

	/*
	 * Fixup the legacy SCHED_RESET_ON_FORK hack: the flag may be OR-ed
	 * into the policy value and is moved into sched_flags here.
	 */
	if (policy != SETPARAM_POLICY && (policy & SCHED_RESET_ON_FORK)) {
		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		policy &= ~SCHED_RESET_ON_FORK;
	}
	attr.sched_policy = policy;

	return __sched_setscheduler(p, &attr, check, true);
}
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Use sched_set_fifo(), read its comment.
 *
 * Calls _sched_setscheduler() with checking enabled.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may be already dead.
 */
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
return _sched_setscheduler(p, policy, param, true);
}
/*
 * sched_setattr - apply the scheduling attributes in @attr to @p.
 *
 * Calls __sched_setscheduler() with the user checks and PI handling
 * enabled, and returns its result.
 */
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
/*
 * sched_setattr_nocheck - like sched_setattr(), but calls
 * __sched_setscheduler() with the user checks disabled.
 */
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission. For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Calls _sched_setscheduler() with checking disabled.
 *
 * Return: 0 on success. An error code otherwise.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
return _sched_setscheduler(p, policy, param, false);
}
/*
* SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
* incapable of resource management, which is the one thing an OS really should
* be doing.
*
* This is of course the reason it is limited to privileged users only.
*
* Worse still; it is fundamentally impossible to compose static priority
* workloads. You cannot take two correctly working static prio workloads
* and smash them together and still expect them to work.
*
* For this reason 'all' FIFO tasks the kernel creates are basically at:
*
* MAX_RT_PRIO / 2
*
* The administrator _MUST_ configure the system, the kernel simply doesn't
* know enough information to make a sensible choice.
*/
/*
 * Switch @p to SCHED_FIFO at priority MAX_RT_PRIO / 2, without
 * permission checks. Warns once if the policy change fails.
 */
void sched_set_fifo(struct task_struct *p)
{
struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);
/*
 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
 *
 * Switches @p to SCHED_FIFO at priority 1, without permission checks.
 * Warns once if the policy change fails.
 */
void sched_set_fifo_low(struct task_struct *p)
{
struct sched_param sp = { .sched_priority = 1 };
WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);
/*
 * Switch @p to SCHED_NORMAL with the given @nice value, without
 * permission checks. Warns once if the policy change fails.
 */
void sched_set_normal(struct task_struct *p, int nice)
{
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
.sched_nice = nice,
};
WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);
/*
 * Common backend of the sched_setscheduler() and sched_setparam()
 * syscalls: copy the parameters from user space, look up the task for
 * @pid and apply the change with sched_setscheduler().
 *
 * Return: 0 on success; -EINVAL for a NULL @param or negative @pid,
 * -EFAULT if the copy from user space fails, -ESRCH if no task is found,
 * or the error returned by sched_setscheduler().
 */
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param kparam;

	if (pid < 0 || !param)
		return -EINVAL;

	if (copy_from_user(&kparam, param, sizeof(kparam)))
		return -EFAULT;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	return sched_setscheduler(p, policy, &kparam);
}
/*
 * Copy a sched_attr from user space into @attr, honouring the size the
 * caller announces in uattr->size.
 *
 * Mimics kernel/events/core.c perf_copy_attr().
 *
 * Return: 0 on success, -E2BIG for an unacceptable size (the kernel's
 * sizeof(*attr) is then written back to uattr->size), or another negative
 * error code.
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 usize;
	int err;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	err = get_user(usize, &uattr->size);
	if (err)
		return err;

	/* ABI compatibility quirk: a size of 0 means the VER0 layout. */
	if (!usize)
		usize = SCHED_ATTR_SIZE_VER0;

	if (usize < SCHED_ATTR_SIZE_VER0 || usize > PAGE_SIZE)
		goto err_size;

	err = copy_struct_from_user(attr, sizeof(*attr), uattr, usize);
	if (err == -E2BIG)
		goto err_size;
	if (err)
		return err;

	/* The util clamp fields are not part of a structure below VER1. */
	if (usize < SCHED_ATTR_SIZE_VER1 &&
	    (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))
		return -EINVAL;

	/*
	 * XXX: Do we want to be lenient like existing syscalls; or do we want
	 * to be strict and return an error on out-of-bounds values?
	 */
	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

	return 0;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	return -E2BIG;
}
/*
 * Fill @attr with the current parameters of @p that match its policy:
 * the deadline parameters, the RT priority, or the nice value.
 */
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
	if (task_has_dl_policy(p)) {
		__getparam_dl(p, attr);
		return;
	}

	if (task_has_rt_policy(p)) {
		attr->sched_priority = p->rt_priority;
		return;
	}

	attr->sched_nice = task_nice(p);
}
/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * A negative @policy is rejected with -EINVAL.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
	return policy < 0 ? -EINVAL : do_sched_setscheduler(pid, policy, param);
}
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Passes SETPARAM_POLICY as the policy, which tells the functions it
 * calls not to change the policy.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension; must be 0.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
			       unsigned int, flags)
{
	struct sched_attr attr;
	int err;

	if (flags || pid < 0 || !uattr)
		return -EINVAL;

	err = sched_copy_attr(uattr, &attr);
	if (err)
		return err;

	if ((int)attr.sched_policy < 0)
		return -EINVAL;

	/* SCHED_FLAG_KEEP_POLICY: leave the task's policy as it is. */
	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
		attr.sched_policy = SETPARAM_POLICY;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	/* SCHED_FLAG_KEEP_PARAMS: start from the task's current parameters. */
	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
		get_params(p, &attr);

	return sched_setattr(p, &attr);
}
/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread, with SCHED_RESET_ON_FORK
 * OR-ed in when the task has sched_reset_on_fork set. Otherwise, a
 * negative error code.
 */
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
	struct task_struct *p;
	int ret;

	if (pid < 0)
		return -EINVAL;

	guard(rcu)();

	p = find_process_by_pid(pid);
	if (!p)
		return -ESRCH;

	ret = security_task_getscheduler(p);
	if (ret)
		return ret;

	ret = p->policy;
	if (p->sched_reset_on_fork)
		ret |= SCHED_RESET_ON_FORK;

	return ret;
}
/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * A task without an RT policy reports a priority of 0.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
 */
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
	struct sched_param kparam = { .sched_priority = 0 };
	struct task_struct *p;
	int err;

	if (pid < 0 || !param)
		return -EINVAL;

	scoped_guard (rcu) {
		p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		err = security_task_getscheduler(p);
		if (err)
			return err;

		if (task_has_rt_policy(p))
			kparam.sched_priority = p->rt_priority;
	}

	/*
	 * This one might sleep, so it is done only after the RCU scope
	 * above has ended.
	 */
	if (copy_to_user(param, &kparam, sizeof(*param)))
		return -EFAULT;

	return 0;
}
/*
 * Copy the kernel size attribute structure (which might be larger
 * than what user-space knows about) to user-space.
 *
 * All cases are valid: the user-space buffer can be larger or smaller
 * than the kernel-space buffer. Usually both have the same size.
 *
 * Return: 0 on success, -EFAULT if user memory cannot be written.
 */
static int
sched_attr_copy_to_user(struct sched_attr __user *uattr,
			struct sched_attr *kattr,
			unsigned int usize)
{
	unsigned int ksize = sizeof(*kattr);

	if (!access_ok(uattr, usize))
		return -EFAULT;

	/*
	 * sched_getattr() ABI forwards and backwards compatibility:
	 *
	 * usize == ksize: copy everything to user-space, all is good.
	 *
	 * usize < ksize: copy only as much as user-space has room for and
	 * skip the rest; this keeps ABI compatibility as well.
	 *
	 * usize > ksize: user-space uses a newer version of the ABI with
	 * parts the kernel doesn't know about. Just ignore them - tooling
	 * can detect the kernel's knowledge of attributes from attr->size,
	 * which is set to ksize in this case.
	 */
	kattr->size = min(usize, ksize);

	return copy_to_user(uattr, kattr, kattr->size) ? -EFAULT : 0;
}
/**
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @usize: sizeof(attr) for fwd/bwd comp.
 * @flags: for future extension; must be 0.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
		unsigned int, usize, unsigned int, flags)
{
	struct sched_attr kattr = { };
	struct task_struct *p;
	int err;

	if (flags || pid < 0 || !uattr ||
	    usize < SCHED_ATTR_SIZE_VER0 || usize > PAGE_SIZE)
		return -EINVAL;

	scoped_guard (rcu) {
		p = find_process_by_pid(pid);
		if (!p)
			return -ESRCH;

		err = security_task_getscheduler(p);
		if (err)
			return err;

		kattr.sched_policy = p->policy;
		if (p->sched_reset_on_fork)
			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		get_params(p, &kattr);
		kattr.sched_flags &= SCHED_FLAG_ALL;

#ifdef CONFIG_UCLAMP_TASK
		/*
		 * A concurrent updater may race with these reads. That is
		 * fine: either the old or the new value is read correctly,
		 * and it does not matter who wins as long as no garbage is
		 * returned.
		 */
		kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
		kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
	}

	return sched_attr_copy_to_user(uattr, &kattr, usize);
}
#ifdef CONFIG_SMP
/*
 * Check whether the affinity mask @mask is acceptable for @p with
 * respect to SCHED_DEADLINE admission control.
 *
 * Return: 0 if the mask is acceptable, -EBUSY otherwise.
 */
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
	/*
	 * Affinity changes only matter for deadline tasks while admission
	 * control is enabled.
	 */
	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
		return 0;

	/*
	 * Bandwidth control happens on a root_domain basis, so with the
	 * admission test enabled only -deadline tasks that are allowed to
	 * run on all the CPUs of their root_domain are admitted.
	 */
	guard(rcu)();

	return cpumask_subset(task_rq(p)->rd->span, mask) ? 0 : -EBUSY;
}
#endif /* CONFIG_SMP */
/*
 * __sched_setaffinity - apply a new CPU affinity mask to a task.
 * @p: the task to update
 * @ctx: affinity context; ctx->new_mask is replaced by a temporary mask
 *       that is freed before returning, and SCA_CHECK is set in ctx->flags
 *
 * The requested mask is intersected with the CPUs the task's cpuset
 * allows before it is applied. If the cpuset changed while the mask was
 * being applied, the task is reset to the cpuset's mask and -EINVAL is
 * returned.
 *
 * Return: 0 on success, -ENOMEM if a cpumask cannot be allocated, or the
 * error from dl_task_check_affinity() / __set_cpus_allowed_ptr().
 */
int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
int retval;
cpumask_var_t cpus_allowed, new_mask;
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
return -ENOMEM;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_free_cpus_allowed;
}
/* Restrict the request to the CPUs the cpuset allows. */
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
ctx->new_mask = new_mask;
ctx->flags |= SCA_CHECK;
retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
retval = __set_cpus_allowed_ptr(p, ctx);
if (retval)
goto out_free_new_mask;
/* Re-read the cpuset mask to detect a concurrent cpuset update. */
cpuset_cpus_allowed(p, cpus_allowed);
if (!cpumask_subset(new_mask, cpus_allowed)) {
/*
 * We must have raced with a concurrent cpuset update.
 * Just reset the cpumask to the cpuset's cpus_allowed.
 */
cpumask_copy(new_mask, cpus_allowed);
/*
 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
 * will restore the previous user_cpus_ptr value.
 *
 * In the unlikely event a previous user_cpus_ptr exists,
 * we need to further restrict the mask to what is allowed
 * by that old user_cpus_ptr.
 */
if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
bool empty = !cpumask_and(new_mask, new_mask,
ctx->user_mask);
/* An empty intersection falls back to the cpuset's mask. */
if (WARN_ON_ONCE(empty))
cpumask_copy(new_mask, cpus_allowed);
}
/* The result of this second call is not checked; -EINVAL is returned. */
__set_cpus_allowed_ptr(p, ctx);
retval = -EINVAL;
}
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
return retval;
}
/**
 * sched_setaffinity - set the CPU affinity of the task identified by @pid
 * @pid: pid of the task
 * @in_mask: requested CPU mask
 *
 * Return: 0 on success. -ESRCH if no task matches @pid, -EINVAL if the task
 * has PF_NO_SETAFFINITY set, -EPERM if the caller neither passes
 * check_same_owner() nor holds CAP_SYS_NICE in the task's user namespace,
 * -ENOMEM if the user mask cannot be allocated on SMP, otherwise the error
 * from security_task_setscheduler() or __sched_setaffinity().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	struct affinity_context ac;
	struct cpumask *user_mask;
	int err;

	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	if (p->flags & PF_NO_SETAFFINITY)
		return -EINVAL;

	if (!check_same_owner(p)) {
		/* The task's credentials are only inspected under RCU. */
		guard(rcu)();
		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
			return -EPERM;
	}

	err = security_task_setscheduler(p);
	if (err)
		return err;

	/*
	 * On non-SMP configs user_cpus_ptr/user_mask is unused and
	 * alloc_user_cpus_ptr() returns NULL, so a NULL result is only an
	 * allocation failure when CONFIG_SMP is enabled.
	 */
	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
	if (!user_mask && IS_ENABLED(CONFIG_SMP))
		return -ENOMEM;
	if (user_mask)
		cpumask_copy(user_mask, in_mask);

	ac = (struct affinity_context){
		.new_mask  = in_mask,
		.user_mask = user_mask,
		.flags     = SCA_USER,
	};

	err = __sched_setaffinity(p, &ac);
	/*
	 * Free whatever the context holds after the call rather than the
	 * local pointer.
	 * NOTE(review): presumably the callee may swap ac.user_mask - confirm.
	 */
	kfree(ac.user_mask);

	return err;
}
/*
 * get_user_cpu_mask - copy a CPU mask of @len bytes in from user space
 * @user_mask_ptr: user-space source buffer
 * @len: number of bytes supplied by the caller
 * @new_mask: kernel mask to fill
 *
 * At most cpumask_size() bytes are copied. If the caller supplies fewer,
 * @new_mask is cleared first so the bytes that are not copied read as zero.
 *
 * Return: 0 on success, -EFAULT if the user buffer cannot be read.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	const unsigned int mask_bytes = cpumask_size();

	if (len > mask_bytes)
		len = mask_bytes;
	else if (len < mask_bytes)
		cpumask_clear(new_mask);

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}
/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by @user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. -ENOMEM if the temporary mask cannot be allocated,
 * otherwise the error from get_user_cpu_mask() or sched_setaffinity().
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_var_t requested;
	int err;

	if (!alloc_cpumask_var(&requested, GFP_KERNEL))
		return -ENOMEM;

	/* Only attempt the change when the mask was read successfully. */
	err = get_user_cpu_mask(user_mask_ptr, len, requested);
	if (!err)
		err = sched_setaffinity(pid, requested);

	free_cpumask_var(requested);
	return err;
}
/**
 * sched_getaffinity - read the CPU affinity of the task identified by @pid
 * @pid: pid of the task
 * @mask: output; receives the task's cpus_mask ANDed with cpu_active_mask
 *
 * Return: 0 on success, -ESRCH if no task matches @pid, otherwise the error
 * from security_task_getscheduler().
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	int err;

	/* RCU is held for the whole function, covering every use of @p. */
	guard(rcu)();

	p = find_process_by_pid(pid);
	if (!p)
		return -ESRCH;

	err = security_task_getscheduler(p);
	if (err)
		return err;

	/* p->cpus_mask is read with the task's pi_lock held. */
	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
		cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
	}

	return 0;
}
/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by @user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * @len must be a multiple of sizeof(unsigned long) and large enough to hold
 * one bit per possible CPU id (nr_cpu_ids).
 *
 * Return: size of CPU mask copied to @user_mask_ptr on success. -EINVAL if
 * @len is too small or misaligned, -ENOMEM if the temporary mask cannot be
 * allocated, -EFAULT if the user buffer cannot be written, otherwise the
 * error from sched_getaffinity().
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_var_t mask;

	/*
	 * Widen before multiplying: @len is a user-controlled 32-bit value,
	 * and len * BITS_PER_BYTE evaluated in 32 bits wraps for
	 * len >= 2^29, which could reject a very large buffer as too small.
	 */
	if ((unsigned long long)len * BITS_PER_BYTE < nr_cpu_ids)
		return -EINVAL;
	if (len & (sizeof(unsigned long)-1))
		return -EINVAL;

	/* Zeroed so that bits not set by sched_getaffinity() read as 0. */
	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = sched_getaffinity(pid, mask);
	if (ret == 0) {
		/* Never copy out more than the kernel mask actually holds. */
		unsigned int retlen = min(len, cpumask_size());

		if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
			ret = -EFAULT;
		else
			ret = retlen;
	}
	free_cpumask_var(mask);

	return ret;
}
/*
 * do_sched_yield - common implementation behind sys_sched_yield() and yield()
 *
 * Bumps the runqueue's yield statistic, invokes the current task's
 * sched_class->yield_task() hook and then calls schedule().
 */
static void do_sched_yield(void)
{
struct rq_flags rf;
struct rq *rq;
/* NOTE(review): presumably returns this CPU's rq locked with IRQs off - confirm. */
rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
/*
 * Preemption is disabled across the unlock and re-enabled with the
 * no_resched variant; the reschedule is done by the explicit
 * schedule() call below.
 */
preempt_disable();
rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
}
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * All of the work is done by do_sched_yield(); this syscall cannot fail.
 *
 * Return: 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
do_sched_yield();
return 0;
}
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run. If removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 *	while (!event)
 *		yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
void __sched yield(void)
{
/*
 * Force the task state to TASK_RUNNING before yielding.
 * NOTE(review): presumably so a caller that had set a sleeping state is
 * not put to sleep by the schedule() in do_sched_yield() - confirm.
 */
set_current_state(TASK_RUNNING);
do_sched_yield();
}
EXPORT_SYMBOL(yield);
/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
int yielded = 0;
/* Everything up to the final schedule() runs inside the irqsave guard. */
scoped_guard (irqsave) {
rq = this_rq();
again:
/* Sampled without the rq locks held; re-validated once they are taken. */
p_rq = task_rq(p);
/*
 * If we're the only runnable task on the rq and target rq also
 * has only one task, there's absolutely no point in yielding.
 */
if (rq->nr_running == 1 && p_rq->nr_running == 1)
return -ESRCH;
guard(double_rq_lock)(rq, p_rq);
/*
 * @p moved to a different rq before both locks were taken: start
 * over with the new rq. The backward jump leaves this guard's scope.
 */
if (task_rq(p) != p_rq)
goto again;
/* The caller's scheduling class must implement yield_to_task(). */
if (!curr->sched_class->yield_to_task)
return 0;
/* Caller and target must belong to the same scheduling class. */
if (curr->sched_class != p->sched_class)
return 0;
/* Nothing to do if @p is already on a CPU or is not in the running state. */
if (task_on_cpu(p_rq, p) || !task_is_running(p))
return 0;
yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
/*
 * Make p's CPU reschedule; pick_next_entity
 * takes care of fairness.
 */
if (preempt && rq != p_rq)
resched_curr(p_rq);
}
}
/* Only give up the CPU, after the guarded section, if the yield took effect. */
if (yielded)
schedule();
return yielded;
}
EXPORT_SYMBOL_GPL(yield_to);
/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: the highest rt_priority usable with @policy: MAX_RT_PRIO-1 for
 * SCHED_FIFO and SCHED_RR, 0 for SCHED_DEADLINE, SCHED_NORMAL, SCHED_BATCH
 * and SCHED_IDLE. -EINVAL for any other @policy.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return MAX_RT_PRIO - 1;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		return 0;
	default:
		return -EINVAL;
	}
}
/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: the lowest rt_priority usable with @policy: 1 for SCHED_FIFO and
 * SCHED_RR, 0 for SCHED_DEADLINE, SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE.
 * -EINVAL for any other @policy.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return 1;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		return 0;
	default:
		return -EINVAL;
	}
}
/*
 * sched_rr_get_interval - look up the time-slice of the task @pid
 * @pid: pid of the task
 * @t: output; the time-slice converted from jiffies to a timespec64
 *
 * The slice is obtained from the task's sched_class->get_rr_interval() hook.
 * If the class has no such hook, the slice stays 0.
 *
 * Return: 0 on success, -EINVAL for a negative @pid, -ESRCH if no task
 * matches @pid, otherwise the error from security_task_getscheduler().
 */
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
unsigned int time_slice = 0;
int retval;
if (pid < 0)
return -EINVAL;
/* The task is only dereferenced inside the RCU guard. */
scoped_guard (rcu) {
struct task_struct *p = find_process_by_pid(pid);
if (!p)
return -ESRCH;
retval = security_task_getscheduler(p);
if (retval)
return retval;
/* The hook is called with the task's rq locked; the guard exposes it as scope.rq. */
scoped_guard (task_rq_lock, p) {
struct rq *rq = scope.rq;
if (p->sched_class->get_rr_interval)
time_slice = p->sched_class->get_rr_interval(rq, p);
}
}
/* time_slice is treated as a jiffies count; convert it after both guards have ended. */
jiffies_to_timespec64(time_slice, t);
return 0;
}
/**
 * sys_sched_rr_get_interval - return the default time-slice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the time-slice value.
 *
 * This syscall writes the default time-slice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the time-slice is in @interval. Otherwise,
 * the error from sched_rr_get_interval() or put_timespec64().
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	struct timespec64 slice;
	int err;

	err = sched_rr_get_interval(pid, &slice);
	if (err)
		return err;

	/* Nothing is written to user space unless the lookup succeeded. */
	return put_timespec64(&slice, interval);
}
#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * sys_sched_rr_get_interval_time32 - sched_rr_get_interval() variant that
 * reports the time-slice through a struct old_timespec32 user buffer.
 * @pid: pid of the process.
 * @interval: userspace pointer to the time-slice value.
 *
 * Return: 0 on success, otherwise the error from sched_rr_get_interval()
 * or put_old_timespec32().
 */
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 slice;
	int err;

	err = sched_rr_get_interval(pid, &slice);
	if (err)
		return err;

	/* Nothing is written to user space unless the lookup succeeded. */
	return put_old_timespec32(&slice, interval);
}
#endif