
* cpuset now supports the isolated cpus.partition type, which enables dynamic
CPU isolation.
* pids.peak added to remember the max number of pids used.
* Holes in cgroup namespace plugged.
* Internal cleanups.
Note that for-6.1-fixes was pulled into for-6.1 twice. Both were for
follow-up cleanups and each merge commit has details.
Also, 8a693f7766
("cgroup: Remove CFTYPE_PRESSURE") removes the flag used
by PSI changes in the tip tree and the merged result won't compile due to
the missing flag. Simply removing the struct init lines specifying the flag
is the correct resolution. linux-next already contains the correct fix:
https://lkml.kernel.org/r/20220912161812.072aaa3b@canb.auug.org.au
-----BEGIN PGP SIGNATURE-----
iIQEABYIACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCYzsl7w4cdGpAa2VybmVs
Lm9yZwAKCRCxYfJx3gVYGYsxAP4kad4YPw+CueLyyEMiYgBHouqDt8cG0+FJWK3X
svTC7wD/eCLfxZM8TjjSrMmvaMrml586mr3NoQaFeW0x3twptQQ=
=LERu
-----END PGP SIGNATURE-----
Merge tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- cpuset now supports the isolated cpus.partition type, which enables
dynamic CPU isolation
- pids.peak added to remember the max number of pids used
- holes in cgroup namespace plugged
- internal cleanups
* tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (25 commits)
cgroup: use strscpy() is more robust and safer
iocost_monitor: reorder BlkgIterator
cgroup: simplify code in cgroup_apply_control
cgroup: Make cgroup_get_from_id() prettier
cgroup/cpuset: remove unreachable code
cgroup: Remove CFTYPE_PRESSURE
cgroup: Improve cftype add/rm error handling
kselftest/cgroup: Add cpuset v2 partition root state test
cgroup/cpuset: Update description of cpuset.cpus.partition in cgroup-v2.rst
cgroup/cpuset: Make partition invalid if cpumask change violates exclusivity rule
cgroup/cpuset: Relocate a code block in validate_change()
cgroup/cpuset: Show invalid partition reason string
cgroup/cpuset: Add a new isolated cpus.partition type
cgroup/cpuset: Relax constraints to partition & cpus changes
cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective
cgroup/cpuset: Miscellaneous cleanups & add helper functions
cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset
cgroup: add pids.peak interface for pids controller
cgroup: Remove data-race around cgrp_dfl_visible
cgroup: Fix build failure when CONFIG_SHRINKER_DEBUG
...
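
Both of the first two items surface as plain files in cgroupfs. Below is a minimal sketch of driving them from C, assuming cgroup2 is mounted at /sys/fs/cgroup and a cgroup named "demo" already exists with the cpuset and pids controllers enabled; the path and the "demo" name are illustrative only, and writing "isolated" additionally requires the exclusivity rules described in cgroup-v2.rst to be satisfied.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n;
        int fd;

        /* switch the cpuset partition to the new "isolated" type */
        fd = open("/sys/fs/cgroup/demo/cpuset.cpus.partition", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "isolated", strlen("isolated")) < 0)
                        perror("cpuset.cpus.partition");
                close(fd);
        }

        /* read the high-water mark of pids used in the cgroup */
        fd = open("/sys/fs/cgroup/demo/pids.peak", O_RDONLY);
        if (fd >= 0) {
                n = read(fd, buf, sizeof(buf) - 1);
                if (n > 0) {
                        buf[n] = '\0';
                        printf("pids.peak: %s", buf);
                }
                close(fd);
        }
        return 0;
}
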
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS 32 // max events per cgroup: arbitrary

// NOTE: many of map and global data will be modified before loading
//       from the userspace (perf tool) using the skeleton helpers.

// single set of global perf events to measure
struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(int));
        __uint(max_entries, 1);
} events SEC(".maps");
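// installed and resized from userspace: one perf event fd per (event, cpu);
// bperf_cgroup_count() below reads slot (event index * num_cpus + cpu)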

// from cgroup id to event index
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u32));
        __uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
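// indexed as (cgroup index * num_events + event index); userspace sums the
// per-cpu values to produce the final per-cgroup counts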

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

int enabled = 0;
int use_cgroup_v2 = 0;
int perf_subsys_id = -1;

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
        struct task_struct *p = (void *)bpf_get_current_task();
        struct cgroup *cgrp;
        register int i = 0;
        __u32 *elem;
        int level;
        int cnt;

        if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
                perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                                     perf_event_cgrp_id);
#else
                perf_subsys_id = perf_event_cgrp_id;
#endif
        }
        cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
        level = BPF_CORE_READ(cgrp, level);

        for (cnt = 0; i < MAX_LEVELS; i++) {
                __u64 cgrp_id;

                if (i > level)
                        break;

                // convert cgroup-id to a map index
                cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)
                        continue;

                cgrps[cnt++] = *elem;
                if (cnt == size)
                        break;
        }

        return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
        register int i = 0;
        __u32 *elem;
        int cnt;

        for (cnt = 0; i < MAX_LEVELS; i++) {
                __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

                if (cgrp_id == 0)
                        break;

                // convert cgroup-id to a map index
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)
                        continue;

                cgrps[cnt++] = *elem;
                if (cnt == size)
                        break;
        }

        return cnt;
}

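/*
 * Read the current value of each configured event on this CPU, compute the
 * delta against the per-cpu snapshot in prev_readings, and add that delta to
 * the per-cpu totals of every tracked ancestor cgroup of the current task.
 * Deltas are accumulated into cgrp_readings only while 'enabled' is set; the
 * fresh reading then replaces the snapshot.
 */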
static int bperf_cgroup_count(void)
{
        register __u32 idx = 0; // to have it in a register to pass BPF verifier
        register int c = 0;
        struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
        __u32 cpu = bpf_get_smp_processor_id();
        __u32 cgrp_idx[MAX_LEVELS];
        int cgrp_cnt;
        __u32 key, cgrp;
        long err;

        if (use_cgroup_v2)
                cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
        else
                cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

        for ( ; idx < MAX_EVENTS; idx++) {
                if (idx == num_events)
                        break;

                // XXX: do not pass idx directly (for verifier)
                key = idx;
                // this is per-cpu array for diff
                prev_val = bpf_map_lookup_elem(&prev_readings, &key);
                if (!prev_val) {
                        val.counter = val.enabled = val.running = 0;
                        bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

                        prev_val = bpf_map_lookup_elem(&prev_readings, &key);
                        if (!prev_val)
                                continue;
                }

                // read from global perf_event array
                key = idx * num_cpus + cpu;
                err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
                if (err)
                        continue;

                if (enabled) {
                        delta.counter = val.counter - prev_val->counter;
                        delta.enabled = val.enabled - prev_val->enabled;
                        delta.running = val.running - prev_val->running;

                        for (c = 0; c < MAX_LEVELS; c++) {
                                if (c == cgrp_cnt)
                                        break;

                                cgrp = cgrp_idx[c];

                                // aggregate the result by cgroup
                                key = cgrp * num_events + idx;
                                cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
                                if (cgrp_val) {
                                        cgrp_val->counter += delta.counter;
                                        cgrp_val->enabled += delta.enabled;
                                        cgrp_val->running += delta.running;
                                } else {
                                        bpf_map_update_elem(&cgrp_readings, &key,
                                                            &delta, BPF_ANY);
                                }
                        }
                }

                *prev_val = val;
        }
        return 0;
}

// This will be attached to cgroup-switches event for each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
        return bperf_cgroup_count();
}

SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
        return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
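
For orientation, here is a rough sketch of the user-space side that could drive this program through a generated libbpf skeleton. The skeleton header name (bperf_cgroup.skel.h), the bperf_cgroup_bpf__* names, the map sizing, and the use of PERF_COUNT_SW_CGROUP_SWITCHES as the trigger event are assumptions for illustration; the perf tool's real driver differs and also installs the perf counter fds and cgroup-id mappings that are only hinted at here.

// minimal sketch, not the perf tool's actual code; skeleton names are assumed
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include "bperf_cgroup.skel.h"          /* assumed skeleton header name */

static int open_cgroup_switch_event(int cpu)
{
        struct perf_event_attr attr = {
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_CGROUP_SWITCHES, /* assumed trigger */
                .size = sizeof(attr),
        };

        return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

int main(void)
{
        int ncpus = libbpf_num_possible_cpus();
        int nevents = 1;                /* counters per cgroup */
        int ncgroups = 1;               /* cgroups being monitored */
        struct bperf_cgroup_bpf *skel;
        int cpu;

        if (ncpus <= 0)
                return 1;

        skel = bperf_cgroup_bpf__open();
        if (!skel)
                return 1;

        /* the "modified before loading" part: globals and map sizes */
        skel->rodata->num_events = nevents;
        skel->rodata->num_cpus = ncpus;
        bpf_map__set_max_entries(skel->maps.events, nevents * ncpus);
        bpf_map__set_max_entries(skel->maps.cgrp_idx, ncgroups);
        bpf_map__set_max_entries(skel->maps.prev_readings, nevents);
        bpf_map__set_max_entries(skel->maps.cgrp_readings, ncgroups * nevents);

        if (bperf_cgroup_bpf__load(skel))
                goto out;

        /* ... install perf counter fds into skel->maps.events and
         * cgroup-id -> index entries into skel->maps.cgrp_idx here ... */

        /* attach the counting program to a cgroup-switch event on each cpu */
        for (cpu = 0; cpu < ncpus; cpu++) {
                int fd = open_cgroup_switch_event(cpu);

                if (fd < 0 ||
                    !bpf_program__attach_perf_event(skel->progs.on_cgrp_switch, fd))
                        goto out;
        }

        skel->bss->enabled = 1;         /* start accumulating deltas */
        sleep(1);
        skel->bss->enabled = 0;

        /* cgrp_readings is per-cpu: sum across cpus for cgroup 0, event 0 */
        {
                struct bpf_perf_event_value values[ncpus];
                unsigned long long counter = 0;
                __u32 key = 0;

                if (!bpf_map__lookup_elem(skel->maps.cgrp_readings, &key,
                                          sizeof(key), values, sizeof(values), 0)) {
                        for (cpu = 0; cpu < ncpus; cpu++)
                                counter += values[cpu].counter;
                        printf("counter: %llu\n", counter);
                }
        }
out:
        bperf_cgroup_bpf__destroy(skel);
        return 0;
}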