linux-yocto/kernel/watchdog_perf.c
Li Huafei 864750968d watchdog/hardlockup/perf: Fix perf_event memory leak
[ Upstream commit d6834d9c990333bfa433bc1816e2417f268eebbe ]

During stress-testing, we found a kmemleak report for perf_event:

  unreferenced object 0xff110001410a33e0 (size 1328):
    comm "kworker/4:11", pid 288, jiffies 4294916004
    hex dump (first 32 bytes):
      b8 be c2 3b 02 00 11 ff 22 01 00 00 00 00 ad de  ...;....".......
      f0 33 0a 41 01 00 11 ff f0 33 0a 41 01 00 11 ff  .3.A.....3.A....
    backtrace (crc 24eb7b3a):
      [<00000000e211b653>] kmem_cache_alloc_node_noprof+0x269/0x2e0
      [<000000009d0985fa>] perf_event_alloc+0x5f/0xcf0
      [<00000000084ad4a2>] perf_event_create_kernel_counter+0x38/0x1b0
      [<00000000fde96401>] hardlockup_detector_event_create+0x50/0xe0
      [<0000000051183158>] watchdog_hardlockup_enable+0x17/0x70
      [<00000000ac89727f>] softlockup_start_fn+0x15/0x40
      ...

Our stress test includes CPU online and offline cycles, and updating the
watchdog configuration.

After reading the code, I found that there may be a race between cleaning up
perf_event after updating watchdog and disabling event when the CPU goes offline:

  CPU0                          CPU1                           CPU2
  (update watchdog)                                            (hotplug offline CPU1)

  ...                                                          _cpu_down(CPU1)
  cpus_read_lock()                                             // waiting for cpu lock
    softlockup_start_all
      smp_call_on_cpu(CPU1)
                                softlockup_start_fn
                                ...
                                  watchdog_hardlockup_enable(CPU1)
                                    perf create E1
                                    watchdog_ev[CPU1] = E1
  cpus_read_unlock()
                                                               cpus_write_lock()
                                                               cpuhp_kick_ap_work(CPU1)
                                cpuhp_thread_fun
                                ...
                                  watchdog_hardlockup_disable(CPU1)
                                    watchdog_ev[CPU1] = NULL
                                    dead_event[CPU1] = E1
  __lockup_detector_cleanup
    for each dead_events_mask
      release each dead_event
      /*
       * CPU1 has not been added to
       * dead_events_mask, then E1
       * will not be released
       */
                                    CPU1 -> dead_events_mask
    cpumask_clear(&dead_events_mask)
    // dead_events_mask is cleared, E1 is leaked

In this case, the leaked perf_event E1 matches the perf_event leak
reported by kmemleak. Due to the low probability of problem recurrence
(only reported once), I added some hack delays in the code:

  static void __lockup_detector_reconfigure(void)
  {
    ...
          watchdog_hardlockup_start();
          cpus_read_unlock();
  +       mdelay(100);
          /*
           * Must be called outside the cpus locked section to prevent
           * recursive locking in the perf code.
    ...
  }

  void watchdog_hardlockup_disable(unsigned int cpu)
  {
    ...
                  perf_event_disable(event);
                  this_cpu_write(watchdog_ev, NULL);
                  this_cpu_write(dead_event, event);
  +               mdelay(100);
                  cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
                  atomic_dec(&watchdog_cpus);
    ...
  }

  void hardlockup_detector_perf_cleanup(void)
  {
    ...
                          perf_event_release_kernel(event);
                  per_cpu(dead_event, cpu) = NULL;
          }
  +       mdelay(100);
          cpumask_clear(&dead_events_mask);
  }

Then, simultaneously performing CPU on/off and switching watchdog, it is
almost certain to reproduce this leak.

The problem here is that releasing perf_event is not within the CPU
hotplug read-write lock. Commit:

  941154bd69 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")

introduced deferred release to solve the deadlock caused by calling
get_online_cpus() when releasing perf_event. Later, commit:

  efe951d3de ("perf/x86: Fix perf,x86,cpuhp deadlock")

removed the get_online_cpus() call on the perf_event release path to solve
another deadlock problem.

Therefore, it is now possible to move the release of perf_event back
into the CPU hotplug read-write lock, and release the event immediately
after disabling it.

Fixes: 941154bd69 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241021193004.308303-1-lihuafei1@huawei.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
2025-04-10 14:39:11 +02:00

284 lines
7.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Detect hard lockups on a system using perf
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
* Note: Most of this code is borrowed heavily from the original softlockup
* detector, so thanks to Ingo for the initial implementation.
* Some chunks also taken from the old x86-specific nmi watchdog code, thanks
* to those contributors as well.
*/
#define pr_fmt(fmt) "NMI watchdog: " fmt
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/sched/debug.h>
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
static atomic_t watchdog_cpus = ATOMIC_INIT(0);
#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
static DEFINE_PER_CPU(ktime_t, last_timestamp);
static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
void watchdog_update_hrtimer_threshold(u64 period)
{
/*
* The hrtimer runs with a period of (watchdog_threshold * 2) / 5
*
* So it runs effectively with 2.5 times the rate of the NMI
* watchdog. That means the hrtimer should fire 2-3 times before
* the NMI watchdog expires. The NMI watchdog on x86 is based on
* unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
* might run way faster than expected and the NMI fires in a
* smaller period than the one deduced from the nominal CPU
* frequency. Depending on the Turbo-Mode factor this might be fast
* enough to get the NMI period smaller than the hrtimer watchdog
* period and trigger false positives.
*
* The sample threshold is used to check in the NMI handler whether
* the minimum time between two NMI samples has elapsed. That
* prevents false positives.
*
* Set this to 4/5 of the actual watchdog threshold period so the
* hrtimer is guaranteed to fire at least once within the real
* watchdog threshold.
*/
watchdog_hrtimer_sample_threshold = period * 2;
}
static bool watchdog_check_timestamp(void)
{
ktime_t delta, now = ktime_get_mono_fast_ns();
delta = now - __this_cpu_read(last_timestamp);
if (delta < watchdog_hrtimer_sample_threshold) {
/*
* If ktime is jiffies based, a stalled timer would prevent
* jiffies from being incremented and the filter would look
* at a stale timestamp and never trigger.
*/
if (__this_cpu_inc_return(nmi_rearmed) < 10)
return false;
}
__this_cpu_write(nmi_rearmed, 0);
__this_cpu_write(last_timestamp, now);
return true;
}
static void watchdog_init_timestamp(void)
{
__this_cpu_write(nmi_rearmed, 0);
__this_cpu_write(last_timestamp, ktime_get_mono_fast_ns());
}
#else
static inline bool watchdog_check_timestamp(void) { return true; }
static inline void watchdog_init_timestamp(void) { }
#endif
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
};
static struct perf_event_attr fallback_wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
};
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
if (!watchdog_check_timestamp())
return;
watchdog_hardlockup_check(smp_processor_id(), regs);
}
static int hardlockup_detector_event_create(void)
{
unsigned int cpu;
struct perf_event_attr *wd_attr;
struct perf_event *evt;
/*
* Preemption is not disabled because memory will be allocated.
* Ensure CPU-locality by calling this in per-CPU kthread.
*/
WARN_ON(!is_percpu_thread());
cpu = raw_smp_processor_id();
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
wd_attr = &fallback_wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback, NULL);
}
if (IS_ERR(evt)) {
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
PTR_ERR(evt));
return PTR_ERR(evt);
}
this_cpu_write(watchdog_ev, evt);
return 0;
}
/**
* watchdog_hardlockup_enable - Enable the local event
* @cpu: The CPU to enable hard lockup on.
*/
void watchdog_hardlockup_enable(unsigned int cpu)
{
WARN_ON_ONCE(cpu != smp_processor_id());
if (hardlockup_detector_event_create())
return;
/* use original value for check */
if (!atomic_fetch_inc(&watchdog_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
watchdog_init_timestamp();
perf_event_enable(this_cpu_read(watchdog_ev));
}
/**
* watchdog_hardlockup_disable - Disable the local event
* @cpu: The CPU to enable hard lockup on.
*/
void watchdog_hardlockup_disable(unsigned int cpu)
{
struct perf_event *event = this_cpu_read(watchdog_ev);
WARN_ON_ONCE(cpu != smp_processor_id());
if (event) {
perf_event_disable(event);
perf_event_release_kernel(event);
this_cpu_write(watchdog_ev, NULL);
atomic_dec(&watchdog_cpus);
}
}
/**
* hardlockup_detector_perf_stop - Globally stop watchdog events
*
* Special interface for x86 to handle the perf HT bug.
*/
void __init hardlockup_detector_perf_stop(void)
{
int cpu;
lockdep_assert_cpus_held();
for_each_online_cpu(cpu) {
struct perf_event *event = per_cpu(watchdog_ev, cpu);
if (event)
perf_event_disable(event);
}
}
/**
* hardlockup_detector_perf_restart - Globally restart watchdog events
*
* Special interface for x86 to handle the perf HT bug.
*/
void __init hardlockup_detector_perf_restart(void)
{
int cpu;
lockdep_assert_cpus_held();
if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
return;
for_each_online_cpu(cpu) {
struct perf_event *event = per_cpu(watchdog_ev, cpu);
if (event)
perf_event_enable(event);
}
}
bool __weak __init arch_perf_nmi_is_available(void)
{
return true;
}
/**
* watchdog_hardlockup_probe - Probe whether NMI event is available at all
*/
int __init watchdog_hardlockup_probe(void)
{
int ret;
if (!arch_perf_nmi_is_available())
return -ENODEV;
ret = hardlockup_detector_event_create();
if (ret) {
pr_info("Perf NMI watchdog permanently disabled\n");
} else {
perf_event_release_kernel(this_cpu_read(watchdog_ev));
this_cpu_write(watchdog_ev, NULL);
}
return ret;
}
/**
* hardlockup_config_perf_event - Overwrite config of wd_hw_attr.
* @str: number which identifies the raw perf event to use
*/
void __init hardlockup_config_perf_event(const char *str)
{
u64 config;
char buf[24];
char *comma = strchr(str, ',');
if (!comma) {
if (kstrtoull(str, 16, &config))
return;
} else {
unsigned int len = comma - str;
if (len >= sizeof(buf))
return;
if (strscpy(buf, str, sizeof(buf)) < 0)
return;
buf[len] = 0;
if (kstrtoull(buf, 16, &config))
return;
}
wd_hw_attr.type = PERF_TYPE_RAW;
wd_hw_attr.config = config;
}