mirror of
git://git.yoctoproject.org/linux-yocto.git
synced 2025-12-19 09:08:10 +01:00
perf mem/c2c amd: Add ldlat support
'perf mem/c2c' uses IBS Op PMU on AMD platforms. IBS Op PMU on Zen5 uarch has added support for Load Latency filtering. Implement 'perf mem/c2c' --ldlat using IBS Op Load Latency filtering capability. Some subtle differences between AMD and other arch: o --ldlat is disabled by default on AMD o Supported values are 128 to 2048. Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/r/20250429035938.1301-4-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
fc481adc97
commit
fa1332a801
|
|
@ -54,8 +54,15 @@ RECORD OPTIONS
|
||||||
|
|
||||||
-l::
|
-l::
|
||||||
--ldlat::
|
--ldlat::
|
||||||
Configure mem-loads latency. Supported on Intel and Arm64 processors
|
Configure mem-loads latency. Supported on Intel, Arm64 and some AMD
|
||||||
only. Ignored on other archs.
|
processors. Ignored on other archs.
|
||||||
|
|
||||||
|
On supported AMD processors:
|
||||||
|
- /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
|
||||||
|
- Supported latency values are 128 to 2048 (both inclusive).
|
||||||
|
- Latency value which is a multiple of 128 incurs a little less profiling
|
||||||
|
overhead compared to other values.
|
||||||
|
- Load latency filtering is disabled by default.
|
||||||
|
|
||||||
-k::
|
-k::
|
||||||
--all-kernel::
|
--all-kernel::
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
|
||||||
Due to the statistical nature of SPE sampling, not every memory operation will
|
Due to the statistical nature of SPE sampling, not every memory operation will
|
||||||
be sampled.
|
be sampled.
|
||||||
|
|
||||||
|
On AMD this uses the IBS Op PMU to sample load-store operations.
|
||||||
|
|
||||||
COMMON OPTIONS
|
COMMON OPTIONS
|
||||||
--------------
|
--------------
|
||||||
-f::
|
-f::
|
||||||
|
|
@ -67,8 +69,15 @@ RECORD OPTIONS
|
||||||
Configure all used events to run in user space.
|
Configure all used events to run in user space.
|
||||||
|
|
||||||
--ldlat <n>::
|
--ldlat <n>::
|
||||||
Specify desired latency for loads event. Supported on Intel and Arm64
|
Specify desired latency for loads event. Supported on Intel, Arm64 and
|
||||||
processors only. Ignored on other archs.
|
some AMD processors. Ignored on other archs.
|
||||||
|
|
||||||
|
On supported AMD processors:
|
||||||
|
- /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
|
||||||
|
- Supported latency values are 128 to 2048 (both inclusive).
|
||||||
|
- Latency value which is a multiple of 128 incurs a little less profiling
|
||||||
|
overhead compared to other values.
|
||||||
|
- Load latency filtering is disabled by default.
|
||||||
|
|
||||||
REPORT OPTIONS
|
REPORT OPTIONS
|
||||||
--------------
|
--------------
|
||||||
|
|
|
||||||
|
|
@ -26,3 +26,9 @@ struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
|
||||||
E(NULL, NULL, NULL, false, 0),
|
E(NULL, NULL, NULL, false, 0),
|
||||||
E("mem-ldst", "%s//", NULL, false, 0),
|
E("mem-ldst", "%s//", NULL, false, 0),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX] = {
|
||||||
|
E(NULL, NULL, NULL, false, 0),
|
||||||
|
E(NULL, NULL, NULL, false, 0),
|
||||||
|
E("mem-ldst", "%s/ldlat=%u/", NULL, true, 0),
|
||||||
|
};
|
||||||
|
|
|
||||||
|
|
@ -6,5 +6,6 @@ extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
|
||||||
extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
|
extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
|
||||||
|
|
||||||
extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
|
extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
|
||||||
|
extern struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX];
|
||||||
|
|
||||||
#endif /* _X86_MEM_EVENTS_H */
|
#endif /* _X86_MEM_EVENTS_H */
|
||||||
|
|
|
||||||
|
|
@ -18,8 +18,10 @@
|
||||||
#include "mem-events.h"
|
#include "mem-events.h"
|
||||||
#include "util/env.h"
|
#include "util/env.h"
|
||||||
|
|
||||||
void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
|
void perf_pmu__arch_init(struct perf_pmu *pmu)
|
||||||
{
|
{
|
||||||
|
struct perf_pmu_caps *ldlat_cap;
|
||||||
|
|
||||||
#ifdef HAVE_AUXTRACE_SUPPORT
|
#ifdef HAVE_AUXTRACE_SUPPORT
|
||||||
if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
|
if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
|
||||||
pmu->auxtrace = true;
|
pmu->auxtrace = true;
|
||||||
|
|
@ -33,8 +35,20 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (x86__is_amd_cpu()) {
|
if (x86__is_amd_cpu()) {
|
||||||
if (!strcmp(pmu->name, "ibs_op"))
|
if (strcmp(pmu->name, "ibs_op"))
|
||||||
pmu->mem_events = perf_mem_events_amd;
|
return;
|
||||||
|
|
||||||
|
pmu->mem_events = perf_mem_events_amd;
|
||||||
|
|
||||||
|
if (!perf_pmu__caps_parse(pmu))
|
||||||
|
return;
|
||||||
|
|
||||||
|
ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
|
||||||
|
if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
|
||||||
|
return;
|
||||||
|
|
||||||
|
perf_mem_events__loads_ldlat = 0;
|
||||||
|
pmu->mem_events = perf_mem_events_amd_ldlat;
|
||||||
} else if (pmu->is_core) {
|
} else if (pmu->is_core) {
|
||||||
if (perf_pmu__have_event(pmu, "mem-loads-aux"))
|
if (perf_pmu__have_event(pmu, "mem-loads-aux"))
|
||||||
pmu->mem_events = perf_mem_events_intel_aux;
|
pmu->mem_events = perf_mem_events_intel_aux;
|
||||||
|
|
|
||||||
|
|
@ -54,11 +54,34 @@ trap cleanup_files exit term int
|
||||||
|
|
||||||
echo "Recording workload..."
|
echo "Recording workload..."
|
||||||
|
|
||||||
# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't support
|
|
||||||
# user/kernel filtering and per-process monitoring, spin program on
|
|
||||||
# specific CPU and test in per-CPU mode.
|
|
||||||
is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
|
is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
|
||||||
if (($is_amd >= 1)); then
|
if (($is_amd >= 1)); then
|
||||||
|
mem_events="$(perf mem record -v -e list 2>&1)"
|
||||||
|
if ! [[ "$mem_events" =~ ^mem\-ldst.*ibs_op/(.*)/.*available ]]; then
|
||||||
|
echo "ERROR: mem-ldst event is not matching"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --ldlat on AMD:
|
||||||
|
# o Zen4 and earlier uarchs do not support ldlat
|
||||||
|
# o Even on supported platforms, it's disabled (--ldlat=0) by default.
|
||||||
|
ldlat=${BASH_REMATCH[1]}
|
||||||
|
if [[ -n $ldlat ]]; then
|
||||||
|
if ! [[ "$ldlat" =~ ldlat=0 ]]; then
|
||||||
|
echo "ERROR: ldlat not initialized to 0?"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
mem_events="$(perf mem record -v --ldlat=150 -e list 2>&1)"
|
||||||
|
if ! [[ "$mem_events" =~ ^mem-ldst.*ibs_op/ldlat=150/.*available ]]; then
|
||||||
|
echo "ERROR: --ldlat not honored?"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't
|
||||||
|
# support user/kernel filtering and per-process monitoring on older
|
||||||
|
# kernels, spin program on specific CPU and test in per-CPU mode.
|
||||||
perf mem record -vvv -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM 2>"${ERR_FILE}"
|
perf mem record -vvv -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM 2>"${ERR_FILE}"
|
||||||
else
|
else
|
||||||
perf mem record -vvv --all-user -o ${PERF_DATA} -- $TEST_PROGRAM 2>"${ERR_FILE}"
|
perf mem record -vvv --all-user -o ${PERF_DATA} -- $TEST_PROGRAM 2>"${ERR_FILE}"
|
||||||
|
|
|
||||||
|
|
@ -2259,6 +2259,17 @@ static void perf_pmu__del_caps(struct perf_pmu *pmu)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name)
|
||||||
|
{
|
||||||
|
struct perf_pmu_caps *caps;
|
||||||
|
|
||||||
|
list_for_each_entry(caps, &pmu->caps, list) {
|
||||||
|
if (!strcmp(caps->name, name))
|
||||||
|
return caps;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reading/parsing the given pmu capabilities, which should be located at:
|
* Reading/parsing the given pmu capabilities, which should be located at:
|
||||||
* /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes.
|
* /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes.
|
||||||
|
|
|
||||||
|
|
@ -277,6 +277,8 @@ bool pmu_uncore_identifier_match(const char *compat, const char *id);
|
||||||
|
|
||||||
int perf_pmu__convert_scale(const char *scale, char **end, double *sval);
|
int perf_pmu__convert_scale(const char *scale, char **end, double *sval);
|
||||||
|
|
||||||
|
struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name);
|
||||||
|
|
||||||
int perf_pmu__caps_parse(struct perf_pmu *pmu);
|
int perf_pmu__caps_parse(struct perf_pmu *pmu);
|
||||||
|
|
||||||
void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
|
void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user