linux-yocto/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
Linus Torvalds adf4bfc4a9 cgroup changes for v6.1-rc1.
* cpuset now support isolated cpus.partition type, which will enable dynamic
   CPU isolation.
 * pids.peak added to remember the max number of pids used.
 * Holes in cgroup namespace plugged.
 * Internal cleanups.
 
 Note that for-6.1-fixes was pulled into for-6.1 twice. Both were for
 follow-up cleanups and each merge commit has details.
 
 Also, 8a693f7766 ("cgroup: Remove CFTYPE_PRESSURE") removes the flag used
 by PSI changes in the tip tree and the merged result won't compile due to
 the missing flag. Simply removing the struct init lines specifying the flag
 is the correct resolution. linux-next already contains the correct fix:
 
  https://lkml.kernel.org/r/20220912161812.072aaa3b@canb.auug.org.au
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYIACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCYzsl7w4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGYsxAP4kad4YPw+CueLyyEMiYgBHouqDt8cG0+FJWK3X
 svTC7wD/eCLfxZM8TjjSrMmvaMrml586mr3NoQaFeW0x3twptQQ=
 =LERu
 -----END PGP SIGNATURE-----

Merge tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - cpuset now supports the isolated cpus.partition type, which will enable
   dynamic CPU isolation

 - pids.peak added to remember the max number of pids used

 - holes in cgroup namespace plugged

 - internal cleanups

* tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (25 commits)
  cgroup: use strscpy() is more robust and safer
  iocost_monitor: reorder BlkgIterator
  cgroup: simplify code in cgroup_apply_control
  cgroup: Make cgroup_get_from_id() prettier
  cgroup/cpuset: remove unreachable code
  cgroup: Remove CFTYPE_PRESSURE
  cgroup: Improve cftype add/rm error handling
  kselftest/cgroup: Add cpuset v2 partition root state test
  cgroup/cpuset: Update description of cpuset.cpus.partition in cgroup-v2.rst
  cgroup/cpuset: Make partition invalid if cpumask change violates exclusivity rule
  cgroup/cpuset: Relocate a code block in validate_change()
  cgroup/cpuset: Show invalid partition reason string
  cgroup/cpuset: Add a new isolated cpus.partition type
  cgroup/cpuset: Relax constraints to partition & cpus changes
  cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective
  cgroup/cpuset: Miscellaneous cleanups & add helper functions
  cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset
  cgroup: add pids.peak interface for pids controller
  cgroup: Remove data-race around cgrp_dfl_visible
  cgroup: Fix build failure when CONFIG_SHRINKER_DEBUG
  ...
2022-10-10 11:12:25 -07:00

201 lines
4.7 KiB
C

// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS 32 // max events per cgroup: arbitrary
// NOTE: many of the maps and global variables below are modified from
// userspace (the perf tool) via the skeleton helpers before loading.
// Single set of global perf events to measure.
// bperf_cgroup_count() reads slot (event index * num_cpus + cpu) of this map.
// NOTE(review): max_entries is declared as 1 here; presumably userspace
// resizes the map before load (see the file-level NOTE) -- confirm against
// the perf tool loader.
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(int));
__uint(max_entries, 1);
} events SEC(".maps");
// Map from cgroup id (__u64 key) to an index (__u32 value).
// The get_cgroup_v1_idx()/get_cgroup_v2_idx() helpers look up each ancestor
// cgroup id here; ids that are not present are skipped.  The index found is
// later used by bperf_cgroup_count() to select a slot in cgrp_readings.
// NOTE(review): max_entries is declared as 1; presumably adjusted from
// userspace before load -- confirm.
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u64));
__uint(value_size, sizeof(__u32));
__uint(max_entries, 1);
} cgrp_idx SEC(".maps");
// Per-cpu event snapshots used to calculate deltas.
// Keyed by event index; bperf_cgroup_count() stores the latest value read for
// each event here and subtracts it from the next reading.
// NOTE(review): no max_entries is declared in this definition; presumably it
// is set from userspace before load -- confirm.
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");
// Aggregated event values for each cgroup (per-cpu); read from user-space.
// bperf_cgroup_count() addresses this map with the key
// (cgroup index * num_events + event index) and accumulates deltas into it.
// NOTE(review): no max_entries is declared in this definition; presumably it
// is set from userspace before load -- confirm.
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
// Number of events processed per invocation; also the row stride used when
// computing keys into cgrp_readings.
const volatile __u32 num_events = 1;
// Stride used when computing keys into the events map (idx * num_cpus + cpu).
const volatile __u32 num_cpus = 1;
// When zero, bperf_cgroup_count() still refreshes prev_readings but does not
// accumulate deltas into cgrp_readings.
int enabled = 0;
// Non-zero selects get_cgroup_v2_idx(); zero selects get_cgroup_v1_idx().
int use_cgroup_v2 = 0;
// perf_event cgroup subsystem id; -1 means "not resolved yet" and is replaced
// on first use inside get_cgroup_v1_idx().
int perf_subsys_id = -1;
// Collect cgrp_idx map indices for the current task's perf_event cgroup
// (reached through task->cgroups->subsys[perf_subsys_id]) and its ancestors.
//
// @cgrps: output array receiving the indices that were found
// @size:  the walk stops as soon as this many indices have been stored
//
// Ancestor levels 0..level of that cgroup are visited, never more than
// MAX_LEVELS of them.  Levels whose cgroup id is not present in the cgrp_idx
// map are skipped.
//
// Returns the number of indices stored in @cgrps.
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *task = (void *)bpf_get_current_task();
	struct cgroup *perf_cgrp;
	register int depth = 0;
	__u32 *slot;
	int leaf_level;
	int found = 0;

	// resolve the perf_event subsystem id lazily, while it is still unset
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	perf_cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	leaf_level = BPF_CORE_READ(perf_cgrp, level);

	for ( ; depth < MAX_LEVELS; depth++) {
		__u64 ancestor_id;

		// do not look past the cgroup's own level
		if (depth > leaf_level)
			break;

		// translate the ancestor's cgroup id into a map index
		ancestor_id = BPF_CORE_READ(perf_cgrp, ancestors[depth], kn, id);
		slot = bpf_map_lookup_elem(&cgrp_idx, &ancestor_id);
		if (slot) {
			cgrps[found++] = *slot;
			if (found == size)
				break;
		}
	}

	return found;
}
// Collect cgrp_idx map indices for the current task's cgroup ancestors, using
// bpf_get_current_ancestor_cgroup_id() for levels 0, 1, 2, ...
//
// @cgrps: output array receiving the indices that were found
// @size:  the walk stops as soon as this many indices have been stored
//
// At most MAX_LEVELS levels are queried; the walk also ends at the first
// level for which the helper returns a zero id.  Ids that are not present in
// the cgrp_idx map are skipped.
//
// Returns the number of indices stored in @cgrps.
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int depth = 0;
	__u32 *slot;
	int found = 0;

	for ( ; depth < MAX_LEVELS; depth++) {
		__u64 ancestor_id = bpf_get_current_ancestor_cgroup_id(depth);

		// a zero id ends the walk
		if (ancestor_id == 0)
			break;

		// translate the ancestor's cgroup id into a map index
		slot = bpf_map_lookup_elem(&cgrp_idx, &ancestor_id);
		if (slot) {
			cgrps[found++] = *slot;
			if (found == size)
				break;
		}
	}

	return found;
}
static int bperf_cgroup_count(void)
{
register __u32 idx = 0; // to have it in a register to pass BPF verifier
register int c = 0;
struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
__u32 cpu = bpf_get_smp_processor_id();
__u32 cgrp_idx[MAX_LEVELS];
int cgrp_cnt;
__u32 key, cgrp;
long err;
if (use_cgroup_v2)
cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
else
cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
for ( ; idx < MAX_EVENTS; idx++) {
if (idx == num_events)
break;
// XXX: do not pass idx directly (for verifier)
key = idx;
// this is per-cpu array for diff
prev_val = bpf_map_lookup_elem(&prev_readings, &key);
if (!prev_val) {
val.counter = val.enabled = val.running = 0;
bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
prev_val = bpf_map_lookup_elem(&prev_readings, &key);
if (!prev_val)
continue;
}
// read from global perf_event array
key = idx * num_cpus + cpu;
err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
if (err)
continue;
if (enabled) {
delta.counter = val.counter - prev_val->counter;
delta.enabled = val.enabled - prev_val->enabled;
delta.running = val.running - prev_val->running;
for (c = 0; c < MAX_LEVELS; c++) {
if (c == cgrp_cnt)
break;
cgrp = cgrp_idx[c];
// aggregate the result by cgroup
key = cgrp * num_events + idx;
cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
if (cgrp_val) {
cgrp_val->counter += delta.counter;
cgrp_val->enabled += delta.enabled;
cgrp_val->running += delta.running;
} else {
bpf_map_update_elem(&cgrp_readings, &key,
&delta, BPF_ANY);
}
}
}
*prev_val = val;
}
return 0;
}
// Program placed in the "perf_event" section; it will be attached to the
// cgroup-switches event on each CPU.
// It takes a new reading via bperf_cgroup_count() and returns that function's
// result.
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
return bperf_cgroup_count();
}
// Program placed in the "raw_tp/sched_switch" section.  Performs the same
// reading as on_cgrp_switch by calling bperf_cgroup_count() and returning its
// result.
// NOTE(review): the name suggests userspace runs this on demand to force a
// fresh reading -- confirm against the perf tool loader.
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
return bperf_cgroup_count();
}
// License string for this BPF object, placed in the "license" section.
char LICENSE[] SEC("license") = "Dual BSD/GPL";