perf mem/c2c amd: Add ldlat support

'perf mem/c2c' uses IBS Op PMU on AMD platforms.

IBS Op PMU on Zen5 uarch has added support for Load Latency filtering.

Implement 'perf mem/c2c' --ldlat using IBS Op Load Latency filtering
capability.

Some subtle differences between AMD and other arch:

o --ldlat is disabled by default on AMD

o Supported values are 128 to 2048.

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Joe Mario <jmario@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20250429035938.1301-4-ravi.bangoria@amd.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Ravi Bangoria 2025-04-29 03:59:37 +00:00 committed by Arnaldo Carvalho de Melo
parent fc481adc97
commit fa1332a801
8 changed files with 83 additions and 10 deletions

View File

@ -54,8 +54,15 @@ RECORD OPTIONS
-l::
--ldlat::
Configure mem-loads latency. Supported on Intel and Arm64 processors
only. Ignored on other archs.
Configure mem-loads latency. Supported on Intel, Arm64 and some AMD
processors. Ignored on other archs.
On supported AMD processors:
- /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
- Supported latency values are 128 to 2048 (both inclusive).
- Latency values that are multiples of 128 incur slightly less profiling
overhead than other values.
- Load latency filtering is disabled by default.
-k::
--all-kernel::

View File

@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
Due to the statistical nature of SPE sampling, not every memory operation will
be sampled.
On AMD this uses the IBS Op PMU to sample load-store operations.
COMMON OPTIONS
--------------
-f::
@ -67,8 +69,15 @@ RECORD OPTIONS
Configure all used events to run in user space.
--ldlat <n>::
Specify desired latency for loads event. Supported on Intel and Arm64
processors only. Ignored on other archs.
Specify desired latency for loads event. Supported on Intel, Arm64 and
some AMD processors. Ignored on other archs.
On supported AMD processors:
- /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
- Supported latency values are 128 to 2048 (both inclusive).
- Latency values that are multiples of 128 incur slightly less profiling
overhead than other values.
- Load latency filtering is disabled by default.
REPORT OPTIONS
--------------

View File

@ -26,3 +26,9 @@ struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
E(NULL, NULL, NULL, false, 0),
E("mem-ldst", "%s//", NULL, false, 0),
};
/*
 * Memory event table whose only populated slot is "mem-ldst", with an
 * event string that carries an "ldlat=%u" term. perf_pmu__arch_init()
 * installs this table for the AMD "ibs_op" PMU when the PMU's "ldlat"
 * capability reads "1"; otherwise perf_mem_events_amd is used.
 *
 * NOTE(review): the fourth E() argument is true here but false in
 * perf_mem_events_amd - presumably it marks the event as accepting a
 * load-latency value; confirm against the E() macro definition.
 * NOTE(review): slot order presumably follows the PERF_MEM_EVENTS__*
 * enum - confirm against mem-events.h.
 */
struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX] = {
E(NULL, NULL, NULL, false, 0),
E(NULL, NULL, NULL, false, 0),
E("mem-ldst", "%s/ldlat=%u/", NULL, true, 0),
};

View File

@ -6,5 +6,6 @@ extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
extern struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX];
#endif /* _X86_MEM_EVENTS_H */

View File

@ -18,8 +18,10 @@
#include "mem-events.h"
#include "util/env.h"
void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
void perf_pmu__arch_init(struct perf_pmu *pmu)
{
struct perf_pmu_caps *ldlat_cap;
#ifdef HAVE_AUXTRACE_SUPPORT
if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
pmu->auxtrace = true;
@ -33,8 +35,20 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
#endif
if (x86__is_amd_cpu()) {
if (!strcmp(pmu->name, "ibs_op"))
pmu->mem_events = perf_mem_events_amd;
if (strcmp(pmu->name, "ibs_op"))
return;
pmu->mem_events = perf_mem_events_amd;
if (!perf_pmu__caps_parse(pmu))
return;
ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
return;
perf_mem_events__loads_ldlat = 0;
pmu->mem_events = perf_mem_events_amd_ldlat;
} else if (pmu->is_core) {
if (perf_pmu__have_event(pmu, "mem-loads-aux"))
pmu->mem_events = perf_mem_events_intel_aux;

View File

@ -54,11 +54,34 @@ trap cleanup_files exit term int
echo "Recording workload..."
# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't support
# user/kernel filtering and per-process monitoring, spin program on
# specific CPU and test in per-CPU mode.
is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
if (($is_amd >= 1)); then
mem_events="$(perf mem record -v -e list 2>&1)"
if ! [[ "$mem_events" =~ ^mem\-ldst.*ibs_op/(.*)/.*available ]]; then
echo "ERROR: mem-ldst event is not matching"
exit 1
fi
# --ldlat on AMD:
# o Zen4 and earlier uarchs do not support ldlat
# o Even on supported platforms, it's disabled (--ldlat=0) by default.
ldlat=${BASH_REMATCH[1]}
if [[ -n $ldlat ]]; then
if ! [[ "$ldlat" =~ ldlat=0 ]]; then
echo "ERROR: ldlat not initialized to 0?"
exit 1
fi
mem_events="$(perf mem record -v --ldlat=150 -e list 2>&1)"
if ! [[ "$mem_events" =~ ^mem-ldst.*ibs_op/ldlat=150/.*available ]]; then
echo "ERROR: --ldlat not honored?"
exit 1
fi
fi
# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't
# support user/kernel filtering and per-process monitoring on older
# kernels, spin program on specific CPU and test in per-CPU mode.
perf mem record -vvv -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM 2>"${ERR_FILE}"
else
perf mem record -vvv --all-user -o ${PERF_DATA} -- $TEST_PROGRAM 2>"${ERR_FILE}"

View File

@ -2259,6 +2259,17 @@ static void perf_pmu__del_caps(struct perf_pmu *pmu)
}
}
/*
 * Find a capability entry of @pmu by name.
 *
 * Walks the pmu->caps list and returns the first entry whose name compares
 * equal to @name, or NULL when no entry matches.
 */
struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name)
{
	struct perf_pmu_caps *match = NULL;
	struct perf_pmu_caps *entry;

	list_for_each_entry(entry, &pmu->caps, list) {
		if (strcmp(entry->name, name) == 0) {
			match = entry;
			break;
		}
	}

	return match;
}
/*
* Reading/parsing the given pmu capabilities, which should be located at:
* /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes.

View File

@ -277,6 +277,8 @@ bool pmu_uncore_identifier_match(const char *compat, const char *id);
int perf_pmu__convert_scale(const char *scale, char **end, double *sval);
struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name);
int perf_pmu__caps_parse(struct perf_pmu *pmu);
void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,