mirror of
git://git.yoctoproject.org/linux-yocto.git
synced 2025-10-23 07:23:12 +02:00

-----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZ4pR0wAKCRCRxhvAZXjc ojb2AQD5QfpTEX/ju1TkenTvoNl+JfnIjaVSY40Lm9DWYzmCMAEAuRvf5WRIV713 00/RVOrUvsLobzhmnk0yw53EQ5A+pA0= =2NDA -----END PGP SIGNATURE----- Merge tag 'kernel-6.14-rc1.pid' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs Pull pid_max namespacing update from Christian Brauner: "The pid_max sysctl is a global value. For a long time the default value has been 65535 and during the pidfd discussions Linus proposed to bump pid_max by default. Based on this discussion systemd started bumping pid_max to 2^22. So all new systems now run with a very high pid_max limit with some distros having also backported that change. The decision to bump pid_max is obviously correct. It just doesn't make a lot of sense nowadays to enforce such a low pid number. There's sufficient tooling to make selecting specific processes without typing really large pid numbers available. In any case, there are workloads that have expectations about how large pid numbers they accept. Either for historical reasons or architectural reasons. One concrete example is the 32-bit version of Android's bionic libc which requires pid numbers less than 65536. There are workloads where it is run in a 32-bit container on a 64-bit kernel. If the host has a pid_max value greater than 65535 the libc will abort thread creation because of size assumptions of pthread_mutex_t. That's a fairly specific use-case; however, in general, specific workloads that are moved into containers running on a host with a new kernel and a new systemd can run into issues with large pid_max values. Obviously making assumptions about the size of the allocated pid is suboptimal but we have userspace that does it. Of course, giving containers the ability to restrict the number of processes in their respective pid namespace independent of the global limit through pid_max is something desirable in itself and comes in handy in general. 
Independent of motivating use-cases the existence of pid namespaces makes this also a good semantic extension and there have been prior proposals pushing in a similar direction. The trick here is to minimize the risk of regressions which I think is doable. The fact that pid namespaces are hierarchical will help us here. What we mostly care about is that when the host sets a low pid_max limit, say (crazy number) 100 that no descendant pid namespace can allocate a higher pid number in its namespace. Since pid allocation is hierarchical this can be ensured by checking each pid allocation against the pid namespace's pid_max limit. This means if the allocation in the descendant pid namespace succeeds, the ancestor pid namespace can reject it. If the ancestor pid namespace has a higher limit than the descendant pid namespace the descendant pid namespace will reject the pid allocation. The ancestor pid namespace will obviously not care about this. All in all this means pid_max continues to enforce a system wide limit on the number of processes but allows pid namespaces sufficient leeway in handling workloads with assumptions about pid values and allows containers to restrict the number of processes in a pid namespace through the pid_max interface" * tag 'kernel-6.14-rc1.pid' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: tests/pid_namespace: add pid_max tests pid: allow pid_max to be set per pid namespace
334 lines
9.4 KiB
C
334 lines
9.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_PID_H
|
|
#define _LINUX_PID_H
|
|
|
|
#include <linux/pid_types.h>
|
|
#include <linux/rculist.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/wait.h>
|
|
|
|
/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */
|
|
|
|
|
|
/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in a particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

/* NOTE(review): not used in this header; presumably read by the pid allocator — confirm. */
#define RESERVED_PIDS		300

struct upid {
	int nr;				/* numeric id as seen in @ns */
	struct pid_namespace *ns;	/* namespace in which @nr is valid */
};
|
|
|
|
struct pid
|
|
{
|
|
refcount_t count;
|
|
unsigned int level;
|
|
spinlock_t lock;
|
|
struct dentry *stashed;
|
|
u64 ino;
|
|
struct rb_node pidfs_node;
|
|
/* lists of tasks that use this pid */
|
|
struct hlist_head tasks[PIDTYPE_MAX];
|
|
struct hlist_head inodes;
|
|
/* wait queue for pidfd notifications */
|
|
wait_queue_head_t wait_pidfd;
|
|
struct rcu_head rcu;
|
|
struct upid numbers[];
|
|
};
|
|
|
|
/* Objects defined outside this header. */
extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;

/* Only pointers to struct file are used below, so a forward declaration suffices. */
struct file;

/*
 * pidfd helpers; only declared here.
 * NOTE(review): reference-counting and error-return conventions are not
 * visible from this header — check the definitions before relying on them.
 */
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);
|
|
|
|
static inline struct pid *get_pid(struct pid *pid)
|
|
{
|
|
if (pid)
|
|
refcount_inc(&pid->count);
|
|
return pid;
|
|
}
|
|
|
|
/* NOTE(review): presumably drops a reference taken with get_pid() — defined elsewhere, confirm. */
void put_pid(struct pid *pid);
/* NOTE(review): locking and lifetime rules for the returned task are not visible here — confirm. */
struct task_struct *pid_task(struct pid *pid, enum pid_type type);
|
|
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
|
|
{
|
|
return !hlist_empty(&pid->tasks[type]);
|
|
}
|
|
/*
 * Conversions between a task and its struct pid; only declared here.
 * NOTE(review): the get_ prefix suggests the result is returned with a
 * reference held — confirm against the definitions.
 */
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type);

struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
|
|
|
|
/*
 * these helpers must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type);
void detach_pid(struct task_struct *task, enum pid_type type);
void change_pid(struct task_struct *task, enum pid_type type,
		struct pid *pid);
void exchange_tids(struct task_struct *task, struct task_struct *old);
void transfer_pid(struct task_struct *old, struct task_struct *new,
		  enum pid_type type);
|
|
|
|
/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
struct pid *find_vpid(int nr);
|
|
|
|
/*
 * Lookup a PID in the hash table, and return with its count elevated.
 */
struct pid *find_get_pid(int nr);
/* NOTE(review): "ge" presumably means the first pid with id >= @nr in @ns — confirm. */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns);
|
|
|
|
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
|
size_t set_tid_size);
|
|
extern void free_pid(struct pid *pid);
|
|
extern void disable_pid_allocation(struct pid_namespace *ns);
|
|
|
|
/*
|
|
* ns_of_pid() returns the pid namespace in which the specified pid was
|
|
* allocated.
|
|
*
|
|
* NOTE:
|
|
* ns_of_pid() is expected to be called for a process (task) that has
|
|
* an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid
|
|
* is expected to be non-NULL. If @pid is NULL, caller should handle
|
|
* the resulting NULL pid-ns.
|
|
*/
|
|
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
|
|
{
|
|
struct pid_namespace *ns = NULL;
|
|
if (pid)
|
|
ns = pid->numbers[pid->level].ns;
|
|
return ns;
|
|
}
|
|
|
|
/*
|
|
* is_child_reaper returns true if the pid is the init process
|
|
* of the current namespace. As this one could be checked before
|
|
* pid_ns->child_reaper is assigned in copy_process, we check
|
|
* with the pid number.
|
|
*/
|
|
static inline bool is_child_reaper(struct pid *pid)
|
|
{
|
|
return pid->numbers[pid->level].nr == 1;
|
|
}
|
|
|
|
/*
|
|
* the helpers to get the pid's id seen from different namespaces
|
|
*
|
|
* pid_nr() : global id, i.e. the id seen from the init namespace;
|
|
* pid_vnr() : virtual id, i.e. the id seen from the pid namespace of
|
|
* current.
|
|
* pid_nr_ns() : id seen from the ns specified.
|
|
*
|
|
* see also task_xid_nr() etc in include/linux/sched.h
|
|
*/
|
|
|
|
static inline pid_t pid_nr(struct pid *pid)
|
|
{
|
|
pid_t nr = 0;
|
|
if (pid)
|
|
nr = pid->numbers[0].nr;
|
|
return nr;
|
|
}
|
|
|
|
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
|
|
pid_t pid_vnr(struct pid *pid);
|
|
|
|
/*
 * Open a loop over every task on @pid->tasks[@type], with @task as the
 * cursor.  A NULL @pid yields no iterations.  The expansion leaves a
 * "do {" and the loop body brace open; both are closed by the matching
 * while_each_pid_task().
 */
#define do_each_pid_task(pid, type, task)				\
	do {								\
		if ((pid) != NULL)					\
			hlist_for_each_entry_rcu((task),		\
				&(pid)->tasks[type], pid_links[type]) {
|
|
|
|
/*
|
|
* Both old and new leaders may be attached to
|
|
* the same pid in the middle of de_thread().
|
|
*/
|
|
#define while_each_pid_task(pid, type, task) \
|
|
if (type == PIDTYPE_PID) \
|
|
break; \
|
|
} \
|
|
} while (0)
|
|
|
|
/*
 * Like do_each_pid_task(), but additionally walks every thread of each
 * task found via for_each_thread().  The outer cursor is saved in tg___
 * because the inner loop reuses @task as its own cursor.  Must be closed
 * with while_each_pid_thread().
 */
#define do_each_pid_thread(pid, type, task)				\
	do_each_pid_task(pid, type, task) {				\
		struct task_struct *tg___ = (task);			\
		for_each_thread(tg___, task) {
|
|
|
|
/*
 * Close a do_each_pid_thread() loop: end the per-thread loop, put the
 * saved outer cursor (tg___) back into @task, then close the enclosing
 * do_each_pid_task() iteration.
 */
#define while_each_pid_thread(pid, type, task)				\
		}							\
		(task) = tg___;						\
	} while_each_pid_task(pid, type, task)
|
|
|
|
static inline struct pid *task_pid(struct task_struct *task)
|
|
{
|
|
return task->thread_pid;
|
|
}
|
|
|
|
/*
|
|
* the helpers to get the task's different pids as they are seen
|
|
* from various namespaces
|
|
*
|
|
* task_xid_nr() : global id, i.e. the id seen from the init namespace;
|
|
* task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
|
|
* current.
|
|
* task_xid_nr_ns() : id seen from the ns specified;
|
|
*
|
|
* see also pid_nr() etc in include/linux/pid.h
|
|
*/
|
|
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
|
|
|
|
static inline pid_t task_pid_nr(struct task_struct *tsk)
|
|
{
|
|
return tsk->pid;
|
|
}
|
|
|
|
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
|
|
}
|
|
|
|
static inline pid_t task_pid_vnr(struct task_struct *tsk)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
|
|
}
|
|
|
|
|
|
static inline pid_t task_tgid_nr(struct task_struct *tsk)
|
|
{
|
|
return tsk->tgid;
|
|
}
|
|
|
|
/**
|
|
* pid_alive - check that a task structure is not stale
|
|
* @p: Task structure to be checked.
|
|
*
|
|
* Test if a process is not yet dead (at most zombie state)
|
|
* If pid_alive fails, then pointers within the task structure
|
|
* can be stale and must not be dereferenced.
|
|
*
|
|
* Return: 1 if the process is alive. 0 otherwise.
|
|
*/
|
|
static inline int pid_alive(const struct task_struct *p)
|
|
{
|
|
return p->thread_pid != NULL;
|
|
}
|
|
|
|
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
|
|
}
|
|
|
|
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
|
|
}
|
|
|
|
|
|
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
|
|
}
|
|
|
|
static inline pid_t task_session_vnr(struct task_struct *tsk)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
|
|
}
|
|
|
|
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
|
|
}
|
|
|
|
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
|
|
{
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
|
|
}
|
|
|
|
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
|
|
{
|
|
pid_t pid = 0;
|
|
|
|
rcu_read_lock();
|
|
if (pid_alive(tsk))
|
|
pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
|
|
rcu_read_unlock();
|
|
|
|
return pid;
|
|
}
|
|
|
|
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
|
|
{
|
|
return task_ppid_nr_ns(tsk, &init_pid_ns);
|
|
}
|
|
|
|
/* Obsolete, do not use: */
|
|
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
|
|
{
|
|
return task_pgrp_nr_ns(tsk, &init_pid_ns);
|
|
}
|
|
|
|
/**
 * is_global_init - check if a task structure is init.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 * Since init is free to have sub-threads, the thread-group id (not the
 * per-thread pid) is compared against 1.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
	return task_tgid_nr(tsk) == 1;
}
|
|
|
|
#endif /* _LINUX_PID_H */
|