Merge branch 'pm-cpuidle'

Merge cpuidle updates for 6.16-rc1:

 - Optimize bucket assignment when next_timer_ns equals KTIME_MAX in the
   menu cpuidle governor (Zhongqiu Han).

 - Convert the cpuidle PSCI driver to a faux device one (Sudeep Holla).

 - Add C1 demotion on/off sysfs knob to the intel_idle driver (Artem
   Bityutskiy).

 - Fix typos in two comments in the teo cpuidle governor (Atul Kumar
   Pant).

* pm-cpuidle:
  cpuidle: psci: Avoid initializing faux device if no DT idle states are present
  Documentation: ABI: testing: document the new cpuidle sysfs file
  Documentation: admin-guide: pm: Document intel_idle C1 demotion
  intel_idle: Add C1 demotion on/off sysfs knob
  cpuidle: psci: Transition to the faux device interface
  cpuidle: menu: Optimize bucket assignment when next_timer_ns equals KTIME_MAX
  cpuidle: teo: Fix typos in two comments
This commit is contained in:
Rafael J. Wysocki 2025-05-26 21:18:34 +02:00
commit af86d7e88e
6 changed files with 159 additions and 20 deletions

View File

@ -111,6 +111,7 @@ What: /sys/devices/system/cpu/cpuidle/available_governors
/sys/devices/system/cpu/cpuidle/current_driver
/sys/devices/system/cpu/cpuidle/current_governor
/sys/devices/system/cpu/cpuidle/current_governor_ro
/sys/devices/system/cpu/cpuidle/intel_c1_demotion
Date: September 2007
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Discover cpuidle policy and mechanism
@ -132,7 +133,11 @@ Description: Discover cpuidle policy and mechanism
current_governor_ro: (RO) displays current idle policy.
See Documentation/admin-guide/pm/cpuidle.rst and
intel_c1_demotion: (RW) enables/disables the C1 demotion
feature on Intel CPUs.
See Documentation/admin-guide/pm/cpuidle.rst,
Documentation/admin-guide/pm/intel_idle.rst, and
Documentation/driver-api/pm/cpuidle.rst for more information.

View File

@ -38,6 +38,27 @@ instruction at all.
only way to pass early-configuration-time parameters to it is via the kernel
command line.
Sysfs Interface
===============
The ``intel_idle`` driver exposes the following ``sysfs`` attributes in
``/sys/devices/system/cpu/cpuidle/``:
``intel_c1_demotion``
Enable or disable C1 demotion for all CPUs in the system. This file is
only exposed on platforms that support the C1 demotion feature and where
it was tested. Value 0 means that C1 demotion is disabled, value 1 means
that it is enabled. Write 0 or 1 to disable or enable C1 demotion for
all CPUs.
The C1 demotion feature involves the platform firmware demoting deep
C-state requests from the OS (e.g., C6 requests) to C1. The idea is that
firmware monitors CPU wake-up rate, and if it is higher than a
platform-specific threshold, the firmware demotes deep C-state requests
to C1. For example, Linux requests C6, but firmware noticed too many
wake-ups per second, and it keeps the CPU in C1. When the CPU stays in
C1 long enough, the platform promotes it back to C6. This may improve
some workloads' performance, but it may also increase power consumption.
.. _intel-idle-enumeration-of-states:

View File

@ -16,7 +16,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/device/faux.h>
#include <linux/psci.h>
#include <linux/pm_domain.h>
#include <linux/pm_runtime.h>
@ -407,14 +407,14 @@ deinit:
* to register cpuidle driver then rollback to cancel all CPUs
* registration.
*/
static int psci_cpuidle_probe(struct platform_device *pdev)
static int psci_cpuidle_probe(struct faux_device *fdev)
{
int cpu, ret;
struct cpuidle_driver *drv;
struct cpuidle_device *dev;
for_each_present_cpu(cpu) {
ret = psci_idle_init_cpu(&pdev->dev, cpu);
ret = psci_idle_init_cpu(&fdev->dev, cpu);
if (ret)
goto out_fail;
}
@ -434,26 +434,37 @@ out_fail:
return ret;
}
static struct platform_driver psci_cpuidle_driver = {
static struct faux_device_ops psci_cpuidle_ops = {
.probe = psci_cpuidle_probe,
.driver = {
.name = "psci-cpuidle",
},
};
/*
 * dt_idle_state_present - check whether DT describes a PSCI idle state
 *
 * Looks up the device node of the first possible CPU and its idle state node
 * at index 0, and reports whether that state node matches
 * psci_idle_state_match.
 *
 * Both node pointers carry __free(device_node) and are therefore initialized
 * right at their declaration: declaring them first and assigning later would
 * let the early "no CPU node" return run the cleanup handler on a
 * still-uninitialized state_node pointer.
 *
 * Return: true if a matching idle state node is found, false otherwise.
 */
static bool __init dt_idle_state_present(void)
{
	struct device_node *cpu_node __free(device_node) =
			of_cpu_device_node_get(cpumask_first(cpu_possible_mask));
	if (!cpu_node)
		return false;

	struct device_node *state_node __free(device_node) =
			of_get_cpu_state_node(cpu_node, 0);
	if (!state_node)
		return false;

	return !!of_match_node(psci_idle_state_match, state_node);
}
static int __init psci_idle_init(void)
{
struct platform_device *pdev;
int ret;
struct faux_device *fdev;
ret = platform_driver_register(&psci_cpuidle_driver);
if (ret)
return ret;
if (!dt_idle_state_present())
return 0;
pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0);
if (IS_ERR(pdev)) {
platform_driver_unregister(&psci_cpuidle_driver);
return PTR_ERR(pdev);
fdev = faux_device_create("psci-cpuidle", NULL, &psci_cpuidle_ops);
if (!fdev) {
pr_err("Failed to create psci-cpuidle device\n");
return -ENODEV;
}
return 0;

View File

@ -255,7 +255,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
data->next_timer_ns = KTIME_MAX;
delta_tick = TICK_NSEC / 2;
data->bucket = which_bucket(KTIME_MAX);
data->bucket = BUCKETS - 1;
}
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||

View File

@ -19,7 +19,7 @@
*
* Of course, non-timer wakeup sources are more important in some use cases,
* but even then it is generally unnecessary to consider idle duration values
* greater than the time time till the next timer event, referred as the sleep
* greater than the time till the next timer event, referred to as the sleep
* length in what follows, because the closest timer will ultimately wake up the
* CPU anyway unless it is woken up earlier.
*
@ -311,7 +311,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
struct cpuidle_state *s = &drv->states[i];
/*
* Update the sums of idle state mertics for all of the states
* Update the sums of idle state metrics for all of the states
* shallower than the current one.
*/
intercept_sum += prev_bin->intercepts;

View File

@ -48,9 +48,11 @@
#include <trace/events/power.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/moduleparam.h>
#include <linux/sysfs.h>
#include <asm/cpuid.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
@ -92,9 +94,15 @@ struct idle_cpu {
*/
unsigned long auto_demotion_disable_flags;
bool disable_promotion_to_c1e;
bool c1_demotion_supported;
bool use_acpi;
};
static bool c1_demotion_supported;
static DEFINE_MUTEX(c1_demotion_mutex);
static struct device *sysfs_root __initdata;
static const struct idle_cpu *icpu __initdata;
static struct cpuidle_state *cpuidle_state_table __initdata;
@ -1549,18 +1557,21 @@ static const struct idle_cpu idle_cpu_gmt __initconst = {
/*
 * Uses the spr_cstates table. c1_demotion_supported is set here;
 * intel_idle_init() copies icpu->c1_demotion_supported into the file-scope
 * flag that intel_idle_sysfs_init() checks before adding the
 * intel_c1_demotion attribute.
 */
static const struct idle_cpu idle_cpu_spr __initconst = {
.state_table = spr_cstates,
.disable_promotion_to_c1e = true,
.c1_demotion_supported = true,
.use_acpi = true,
};
/*
 * Uses the gnr_cstates table. c1_demotion_supported is set here;
 * intel_idle_init() copies icpu->c1_demotion_supported into the file-scope
 * flag that intel_idle_sysfs_init() checks before adding the
 * intel_c1_demotion attribute.
 */
static const struct idle_cpu idle_cpu_gnr __initconst = {
.state_table = gnr_cstates,
.disable_promotion_to_c1e = true,
.c1_demotion_supported = true,
.use_acpi = true,
};
/*
 * Uses the gnrd_cstates table. c1_demotion_supported is set here;
 * intel_idle_init() copies icpu->c1_demotion_supported into the file-scope
 * flag that intel_idle_sysfs_init() checks before adding the
 * intel_c1_demotion attribute.
 */
static const struct idle_cpu idle_cpu_gnrd __initconst = {
.state_table = gnrd_cstates,
.disable_promotion_to_c1e = true,
.c1_demotion_supported = true,
.use_acpi = true,
};
@ -1599,12 +1610,14 @@ static const struct idle_cpu idle_cpu_snr __initconst = {
/*
 * Uses the grr_cstates table. c1_demotion_supported is set here;
 * intel_idle_init() copies icpu->c1_demotion_supported into the file-scope
 * flag that intel_idle_sysfs_init() checks before adding the
 * intel_c1_demotion attribute.
 */
static const struct idle_cpu idle_cpu_grr __initconst = {
.state_table = grr_cstates,
.disable_promotion_to_c1e = true,
.c1_demotion_supported = true,
.use_acpi = true,
};
/*
 * Uses the srf_cstates table. c1_demotion_supported is set here;
 * intel_idle_init() copies icpu->c1_demotion_supported into the file-scope
 * flag that intel_idle_sysfs_init() checks before adding the
 * intel_c1_demotion attribute.
 */
static const struct idle_cpu idle_cpu_srf __initconst = {
.state_table = srf_cstates,
.disable_promotion_to_c1e = true,
.c1_demotion_supported = true,
.use_acpi = true,
};
@ -2324,6 +2337,88 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
}
/*
 * Set or clear the C1 demotion bits in MSR_PKG_CST_CONFIG_CONTROL.
 *
 * @enable: treated as a boolean; non-NULL sets the bits, NULL clears them.
 *
 * Used as the callback passed to on_each_cpu() by the sysfs store handler.
 * C1 undemotion is always switched together with C1 demotion, as that is the
 * most sensible configuration in general.
 */
static void intel_c1_demotion_toggle(void *enable)
{
	unsigned long long cst_cfg;

	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, cst_cfg);

	if (!enable)
		cst_cfg &= ~(NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE);
	else
		cst_cfg |= NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE;

	wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, cst_cfg);
}
/*
 * sysfs write handler for the intel_c1_demotion attribute.
 *
 * Parses @buf as a boolean with kstrtobool() and, on success, runs
 * intel_c1_demotion_toggle() via on_each_cpu() with the wait argument set,
 * holding c1_demotion_mutex for the duration of the update.
 *
 * Return: @count on success, or the kstrtobool() error code if @buf could
 * not be parsed.
 */
static ssize_t intel_c1_demotion_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	bool want_enabled;
	int ret = kstrtobool(buf, &want_enabled);

	if (ret)
		return ret;

	mutex_lock(&c1_demotion_mutex);
	/* The boolean travels to the callback encoded in the pointer value */
	on_each_cpu(intel_c1_demotion_toggle, (void *)want_enabled, 1);
	mutex_unlock(&c1_demotion_mutex);

	return count;
}
/*
 * sysfs read handler for the intel_c1_demotion attribute.
 *
 * Emits "1" if NHM_C1_AUTO_DEMOTE is set in MSR_PKG_CST_CONFIG_CONTROL and
 * "0" otherwise, followed by a newline.
 *
 * Return: the value returned by sysfs_emit().
 */
static ssize_t intel_c1_demotion_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
{
	unsigned long long cst_cfg;
	int demotion_on;

	/*
	 * Only one CPU's MSR is read; the value is assumed to be identical on
	 * all CPUs, since any other configuration would be a BIOS bug.
	 */
	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, cst_cfg);
	demotion_on = !!(cst_cfg & NHM_C1_AUTO_DEMOTE);

	return sysfs_emit(buf, "%d\n", demotion_on);
}
static DEVICE_ATTR_RW(intel_c1_demotion);
/*
 * Add the intel_c1_demotion attribute to the "cpuidle" group of the CPU
 * subsystem root device.
 *
 * Does nothing when c1_demotion_supported is false or when the CPU subsystem
 * has no root device. On success the reference obtained from
 * bus_get_dev_root() is kept in sysfs_root so that intel_idle_sysfs_uninit()
 * can remove the file and drop it later.
 *
 * Return: 0 on success or when there is nothing to do, otherwise the error
 * returned by sysfs_add_file_to_group().
 */
static int __init intel_idle_sysfs_init(void)
{
	int err;

	if (!c1_demotion_supported)
		return 0;

	sysfs_root = bus_get_dev_root(&cpu_subsys);
	if (!sysfs_root)
		return 0;

	err = sysfs_add_file_to_group(&sysfs_root->kobj,
				      &dev_attr_intel_c1_demotion.attr,
				      "cpuidle");
	if (err) {
		put_device(sysfs_root);
		/*
		 * The caller only warns on failure and keeps going, so
		 * intel_idle_sysfs_uninit() may still run on a later error
		 * path. Clear the pointer so it does not remove a file that
		 * was never added or drop the reference a second time.
		 */
		sysfs_root = NULL;
		return err;
	}

	return 0;
}
/*
 * Undo intel_idle_sysfs_init(): remove the intel_c1_demotion attribute from
 * the "cpuidle" group and drop the reference held in sysfs_root. Does nothing
 * when sysfs_root is NULL.
 */
static void __init intel_idle_sysfs_uninit(void)
{
	if (sysfs_root) {
		sysfs_remove_file_from_group(&sysfs_root->kobj,
					     &dev_attr_intel_c1_demotion.attr,
					     "cpuidle");
		put_device(sysfs_root);
	}
}
static int __init intel_idle_init(void)
{
const struct x86_cpu_id *id;
@ -2374,6 +2469,8 @@ static int __init intel_idle_init(void)
auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
if (icpu->disable_promotion_to_c1e)
c1e_promotion = C1E_PROMOTION_DISABLE;
if (icpu->c1_demotion_supported)
c1_demotion_supported = true;
if (icpu->use_acpi || force_use_acpi)
intel_idle_acpi_cst_extract();
} else if (!intel_idle_acpi_cst_extract()) {
@ -2387,6 +2484,10 @@ static int __init intel_idle_init(void)
if (!intel_idle_cpuidle_devices)
return -ENOMEM;
retval = intel_idle_sysfs_init();
if (retval)
pr_warn("failed to initialized sysfs");
intel_idle_cpuidle_driver_init(&intel_idle_driver);
retval = cpuidle_register_driver(&intel_idle_driver);
@ -2411,6 +2512,7 @@ hp_setup_fail:
intel_idle_cpuidle_devices_uninit();
cpuidle_unregister_driver(&intel_idle_driver);
init_driver_fail:
intel_idle_sysfs_uninit();
free_percpu(intel_idle_cpuidle_devices);
return retval;