
Merge tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Borislav Petkov:

 - Avoid a crash on a heterogeneous machine where not all cores support
   the same hw events features

 - Avoid a deadlock when throttling events

 - Document the perf event states more

 - Make sure a number of perf paths switching off or rescheduling events
   call perf_cgroup_event_disable()

 - Make sure perf does task sampling before its userspace mapping is
   torn down, and not after

* tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/intel: Fix crash in icl_update_topdown_event()
  perf: Fix the throttle error of some clock events
  perf: Add comment to enum perf_event_state
  perf/core: Fix WARN in perf_cgroup_switch()
  perf: Fix dangling cgroup pointer in cpuctx
  perf: Fix cgroup state vs ERROR
  perf: Fix sample vs do_exit()
Linus Torvalds 2025-06-22 10:11:45 -07:00
commit 17ef32ae66
4 changed files with 124 additions and 53 deletions

arch/x86/events/intel/core.c

@@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event)
* If the PEBS counters snapshotting is enabled,
* the topdown event is available in PEBS records.
*/
if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
static_call(intel_pmu_update_topdown_event)(event, NULL);
else
intel_pmu_drain_pebs_buffer();

include/linux/perf_event.h

@@ -635,8 +635,46 @@ struct perf_addr_filter_range {
unsigned long size;
};
/**
* enum perf_event_state - the states of an event:
/*
* The normal states are:
*
* ACTIVE --.
* ^ |
* | |
* sched_{in,out}() |
* | |
* v |
* ,---> INACTIVE --+ <-.
* | | |
* | {dis,en}able()
* sched_in() | |
* | OFF <--' --+
* | |
* `---> ERROR ------'
*
* That is:
*
* sched_in: INACTIVE -> {ACTIVE,ERROR}
* sched_out: ACTIVE -> INACTIVE
* disable: {ACTIVE,INACTIVE} -> OFF
* enable: {OFF,ERROR} -> INACTIVE
*
* Where {OFF,ERROR} are disabled states.
*
* Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
* defunct events:
*
* - EXIT means the task that the event was assigned to died, but child events
* still live, and further children can still be created. But the event
* itself will never be active again. It can only transition to
* {REVOKED,DEAD};
*
* - REVOKED means the PMU the event was associated with is gone; all
* functionality is stopped but the event is still alive. Can only
* transition to DEAD;
*
* - DEAD means the event really is dying, tearing down state and freeing bits.
*
*/
enum perf_event_state {
PERF_EVENT_STATE_DEAD = -5,
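
For context, here is the full enum this comment documents. Only the DEAD entry is visible in the hunk above; the remaining entries and values below are recalled from include/linux/perf_event.h rather than taken from this diff:

/* Recalled for reference; only PERF_EVENT_STATE_DEAD appears in the hunk above. */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD           = -5,
        PERF_EVENT_STATE_REVOKED        = -4,   /* pmu gone, must not touch */
        PERF_EVENT_STATE_EXIT           = -3,   /* task died, still inherited */
        PERF_EVENT_STATE_ERROR          = -2,   /* scheduling error, can enable */
        PERF_EVENT_STATE_OFF            = -1,
        PERF_EVENT_STATE_INACTIVE       =  0,
        PERF_EVENT_STATE_ACTIVE         =  1,
};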

kernel/events/core.c

@@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
__perf_ctx_unlock(&cpuctx->ctx);
}
typedef struct {
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
} class_perf_ctx_lock_t;
static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
static inline class_perf_ctx_lock_t
class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
@@ -944,7 +957,13 @@ static void perf_cgroup_switch(struct task_struct *task)
if (READ_ONCE(cpuctx->cgrp) == cgrp)
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
/*
* Re-check, could've raced vs perf_remove_from_context().
*/
if (READ_ONCE(cpuctx->cgrp) == NULL)
return;
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task)
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
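
The two hunks above convert perf_cgroup_switch() to the kernel's scope-based cleanup ("guard") pattern from <linux/cleanup.h>, so the new early return after the cgrp re-check drops the context locks automatically; presumably because perf_ctx_lock() takes two arguments rather than a single lock pointer, the patch hand-rolls the class_perf_ctx_lock_t constructor/destructor pair instead of using a ready-made guard. A minimal sketch of the same idea with a plain mutex (demo_* names are illustrative, not perf code):

/* Sketch of the guard() cleanup pattern; demo_* names are illustrative only. */
#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);
static int demo_value;

static int demo_update(int val)
{
        guard(mutex)(&demo_lock);       /* released on every return path */

        if (val < 0)
                return -EINVAL;         /* early return still drops the lock */

        demo_value = val;
        return 0;
}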
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
del_event_from_groups(event, ctx);
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
* returned on read(). The only way to get out
* of error state is by explicit re-enabling
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}
ctx->generation++;
event->pmu_ctx->nr_events--;
}
@@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
struct perf_event_context *ctx);
static void __event_disable(struct perf_event *event,
struct perf_event_context *ctx,
enum perf_event_state state);
static void perf_put_aux_event(struct perf_event *event)
{
@@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
@@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
&event->pmu_ctx->flexible_active;
}
/*
* Events that have PERF_EV_CAP_SIBLING require being part of a group and
* cannot exist on their own, schedule them out and move them into the ERROR
* state. Also see _perf_event_enable(), it will not be able to recover
* this ERROR state.
*/
static inline void perf_remove_sibling_event(struct perf_event *event)
{
event_sched_out(event, event->ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
@@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
/*
* Events that have PERF_EV_CAP_SIBLING require being part of
* a group and cannot exist on their own, schedule them out
* and move them into the ERROR state. Also see
* _perf_event_enable(), it will not be able to recover this
* ERROR state.
*/
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
perf_remove_sibling_event(sibling);
__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event,
state = PERF_EVENT_STATE_EXIT;
if (flags & DETACH_REVOKE)
state = PERF_EVENT_STATE_REVOKED;
if (flags & DETACH_DEAD) {
event->pending_disable = 1;
if (flags & DETACH_DEAD)
state = PERF_EVENT_STATE_DEAD;
}
event_sched_out(event, ctx);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, min(event->state, state));
if (flags & DETACH_GROUP)
@@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
event_function_call(event, __perf_remove_from_context, (void *)flags);
}
static void __event_disable(struct perf_event *event,
struct perf_event_context *ctx,
enum perf_event_state state)
{
event_sched_out(event, ctx);
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, state);
}
/*
* Cross CPU call to disable a performance event
*/
@@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event,
perf_pmu_disable(event->pmu_ctx->pmu);
ctx_time_update_event(ctx, event);
/*
* When disabling a group leader, the whole group becomes ineligible
* to run, so schedule out the full group.
*/
if (event == event->group_leader)
group_sched_out(event, ctx);
else
event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
/*
* But only mark the leader OFF; the siblings will remain
* INACTIVE.
*/
__event_disable(event, ctx, PERF_EVENT_STATE_OFF);
perf_pmu_enable(event->pmu_ctx->pmu);
}
@@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
event->pmu->stop(event, 0);
event->hw.interrupts = MAX_INTERRUPTS;
event->pmu->stop(event, 0);
if (event == event->group_leader)
perf_log_throttle(event, 0);
}
@@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
/* No mm, no stack, no dump. */
if (!current->mm)
return 0;
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
if (!current->mm)
user = false;
if (!kernel && !user)
return &__empty_callchain;
@@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (is_sampling_event(event)) {
/*
* The throttle can be triggered in the hrtimer handler.
* The HRTIMER_NORESTART should be used to stop the timer,
* rather than hrtimer_cancel(). See perf_swevent_hrtimer()
*/
if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
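
The comment added above is the core of the throttle deadlock fix: when the throttle triggers inside the hrtimer handler itself, the timer has to be stopped by returning HRTIMER_NORESTART from that handler rather than by hrtimer_cancel(), which waits for the running callback and so can never make progress from within it. A stand-alone sketch of that pattern (demo_* names are illustrative, not the perf code):

/* Sketch only; demo_* is not perf code. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static bool demo_throttled;

static enum hrtimer_restart demo_hrtimer_fn(struct hrtimer *timer)
{
        /* Stop from inside the handler by simply not rearming the timer. */
        if (demo_throttled)
                return HRTIMER_NORESTART;

        /* Normal periodic path: push the expiry forward and keep running. */
        hrtimer_forward_now(timer, ms_to_ktime(1));
        return HRTIMER_RESTART;
}

This is also why the cancel path above now skips the hrtimer bookkeeping when hwc->interrupts == MAX_INTERRUPTS: a throttled event's timer is already being stopped from its own handler.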
@@ -11804,7 +11834,8 @@ static void cpu_clock_event_stop(struct perf_event *event, int flags)
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
cpu_clock_event_update(event);
if (flags & PERF_EF_UPDATE)
cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -11882,7 +11913,8 @@ static void task_clock_event_stop(struct perf_event *event, int flags)
static void task_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
task_clock_event_update(event, event->ctx->time);
if (flags & PERF_EF_UPDATE)
task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
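
Both clock-event stop() hunks above now honour the PERF_EF_UPDATE flag, which is the pmu::stop() convention of folding the count back into the event only when the caller asks for it. A rough sketch of that convention (demo_* names are stand-ins, not the actual cpu/task clock code):

/* Sketch of the PERF_EF_UPDATE convention; demo_* names are stand-ins. */
#include <linux/perf_event.h>

static void demo_fold_count(struct perf_event *event)
{
        /* stand-in for cpu_clock_event_update()/task_clock_event_update() */
        local64_add(1, &event->count);
}

static void demo_pmu_stop(struct perf_event *event, int flags)
{
        /* stop counting first ... */
        event->hw.state |= PERF_HES_STOPPED;

        /* ... and only sync the count when explicitly requested */
        if (flags & PERF_EF_UPDATE)
                demo_fold_count(event);
}

The throttle path above calls ->stop(event, 0), i.e. without PERF_EF_UPDATE, so the clock events no longer update their count from that path.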

kernel/exit.c

@@ -940,6 +940,15 @@ void __noreturn do_exit(long code)
taskstats_exit(tsk, group_dead);
trace_sched_process_exit(tsk, group_dead);
/*
* Since sampling can touch ->mm, make sure to stop everything before we
* tear it down.
*
* Also flushes inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*/
perf_event_exit_task(tsk);
exit_mm();
if (group_dead)
@@ -955,14 +964,6 @@ void __noreturn do_exit(long code)
exit_task_work(tsk);
exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);