This is the 5.13.2 stable release

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAmDu/dcACgkQONu9yGCS
 aT6I3A//bwToUSzDCnrOaLEAT2PCzf3zu9+bjKz/dVPoOcTXWcDm1JrFD4hdxCsX
 tFakW8xkj+A9h9xGSVFq9Kk5+UjMEBZr/XxDhzOUn27QsSbsocPjYmf5EvehYSm8
 Hc3a0vvbKtepu4KfMtUUwaveCcTjxEnh7IcCfdJ5FY4CjBL6jwlnyNa/i0Rqeqp8
 SssfGM6w3p9eCG2OjhwMwUZsWGDCnl9ypwSYlBU3xVg/7+L1iF6JT3ZAMaOCY5VX
 wtkQsL++nXbAa8RE+awJYLNhOjhMo16nvEZMqa/Pc2fyCzlsBG+IX8R0GNd+3vUW
 QlDMZjTzkxEkAuiqCesCpW0qpZK5q/tYAzeZoKQ6YaFf0RWSwBKoK0jn2Pchnmp8
 ZhPjThmU36U72NcJaItVGHbQ5MpBFC5qq7xk1NRE41AfG6fm13VvkazhLZW8E9if
 5KuPqxEFeTS7JVqF1TDDHPVZCx3b4R2aOSjeoJ1dtni+BcalO3Jh/aPbRc29RhpY
 m3B/2y3/kSgDkDi5He3CxH3idt7viSC46SX7SY6PuISaZUzI8yttt02QjnC1xOqE
 BMKsPthVAd/+0ZG77BS3kDJe6picD4fvNvY27j77+kP2mcKLlBaIRQ7OOCTY2AgC
 EDFZLoA92RyCCHYNEYQA9yVc3yG4mfPHol1SRZ2MB6I0LiNKSiA=
 =VM7e
 -----END PGP SIGNATURE-----

Merge tag 'v5.13.2' into v5.13/standard/base

This is the 5.13.2 stable release

# gpg: Signature made Wed 14 Jul 2021 11:08:07 AM EDT
# gpg:                using RSA key 647F28654894E3BD457199BE38DBBDC86092693E
# gpg: Can't check signature: No public key
Bruce Ashfield 2021-07-14 15:45:04 -04:00
commit fbed80d1ef
836 changed files with 7551 additions and 3964 deletions

View File

@ -49,8 +49,30 @@ Description:
modification of EVM-protected metadata and
disable all further modification of policy
Note that once a key has been loaded, it will no longer be
possible to enable metadata modification.
Echoing a value is additive, the new value is added to the
existing initialization flags.
For example, after::
echo 2 ><securityfs>/evm
another echo can be performed::
echo 1 ><securityfs>/evm
and the resulting value will be 3.
Note that once an HMAC key has been loaded, it will no longer
be possible to enable metadata modification. Signaling that an
HMAC key has been loaded will clear the corresponding flag.
For example, if the current value is 6 (2 and 4 set)::
echo 1 ><securityfs>/evm
will set the new value to 3 (4 cleared).
Loading an HMAC key is the only way to disable metadata
modification.
Until key loading has been signaled EVM can not create
or validate the 'security.evm' xattr, but returns
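As an illustration of the additive flag behaviour documented above, the two echo commands can equally be issued from a small C program. This is only a sketch; it assumes securityfs is mounted at its usual location, /sys/kernel/security, which the excerpt itself does not state::

	/* Sketch only: mirrors "echo 2" followed by "echo 1" into <securityfs>/evm,
	 * leaving the resulting EVM flags value at 3 as described above.
	 * The /sys/kernel/security mount point is an assumption. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int evm_write(const char *val)
	{
		int fd = open("/sys/kernel/security/evm", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, val, strlen(val)) != (ssize_t)strlen(val)) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		if (evm_write("2"))	/* echo 2 ><securityfs>/evm */
			perror("write 2");
		if (evm_write("1"))	/* echo 1 ><securityfs>/evm; value is now 3 */
			perror("write 1");
		return 0;
	}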

View File

@ -39,9 +39,11 @@ KernelVersion: v5.9
Contact: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, nvdimm@lists.linux.dev,
Description:
(RO) Report various performance stats related to papr-scm NVDIMM
device. Each stat is reported on a new line with each line
composed of a stat-identifier followed by it value. Below are
currently known dimm performance stats which are reported:
device. This attribute is only available for NVDIMM devices
that support reporting NVDIMM performance stats. Each stat is
reported on a new line with each line composed of a
stat-identifier followed by it value. Below are currently known
dimm performance stats which are reported:
* "CtlResCt" : Controller Reset Count
* "CtlResTm" : Controller Reset Elapsed Time

View File

@ -581,6 +581,12 @@
loops can be debugged more effectively on production
systems.
clocksource.max_cswd_read_retries= [KNL]
Number of clocksource_watchdog() retries due to
external delays before the clock will be marked
unstable. Defaults to three retries, that is,
four attempts to read the clock under test.
clearcpuid=BITNUM[,BITNUM...] [X86]
Disable CPUID feature X for the kernel. See
arch/x86/include/asm/cpufeatures.h for the valid bit

View File

@ -38,6 +38,7 @@ Sysfs entries
fan[1-12]_input RO fan tachometer speed in RPM
fan[1-12]_fault RO fan experienced fault
fan[1-6]_target RW desired fan speed in RPM
pwm[1-6]_enable RW regulator mode, 0=disabled, 1=manual mode, 2=rpm mode
pwm[1-6] RW fan target duty cycle (0-255)
pwm[1-6]_enable RW regulator mode, 0=disabled (duty cycle=0%), 1=manual mode, 2=rpm mode
pwm[1-6] RW read: current pwm duty cycle,
write: target pwm duty cycle (0-255)
================== === =======================================================
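The read/write asymmetry noted above for pwm[1-6] (reads return the current duty cycle, writes set the target) can be seen with a short user-space sketch. The hwmon0 instance path and channel number are placeholders chosen for illustration::

	/* Sketch: put channel 1 into manual mode, request a duty cycle, then read
	 * back the current duty cycle. The hwmon0 path is an assumption. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int put(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);
		int ret = -1;

		if (fd < 0)
			return -1;
		if (write(fd, val, strlen(val)) == (ssize_t)strlen(val))
			ret = 0;
		close(fd);
		return ret;
	}

	int main(void)
	{
		char buf[16];
		ssize_t n;
		int fd;

		put("/sys/class/hwmon/hwmon0/pwm1_enable", "1");	/* 1 = manual mode */
		put("/sys/class/hwmon/hwmon0/pwm1", "128");	/* write: target duty cycle */

		fd = open("/sys/class/hwmon/hwmon0/pwm1", O_RDONLY);
		if (fd < 0)
			return 1;
		n = read(fd, buf, sizeof(buf) - 1);	/* read: current duty cycle */
		if (n > 0) {
			buf[n] = '\0';
			printf("pwm1 now reads %s", buf);
		}
		close(fd);
		return 0;
	}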

View File

@ -3053,7 +3053,7 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
:stub-columns: 0
:widths: 1 1 2
* - ``V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT``
* - ``V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED``
- 0x00000001
-
* - ``V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT``
@ -3277,6 +3277,9 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
* - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED``
- 0x00000100
-
* - ``V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT``
- 0x00000200
-
.. raw:: latex

View File

@ -259,6 +259,18 @@ and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a response, indicating what should be
returned to userspace. The ``id`` member of ``struct seccomp_notif_resp`` should
be the same ``id`` as in ``struct seccomp_notif``.
Userspace can also add file descriptors to the notifying process via
``ioctl(SECCOMP_IOCTL_NOTIF_ADDFD)``. The ``id`` member of
``struct seccomp_notif_addfd`` should be the same ``id`` as in
``struct seccomp_notif``. The ``newfd_flags`` flag may be used to set flags
like O_EXEC on the file descriptor in the notifying process. If the supervisor
wants to inject the file descriptor with a specific number, the
``SECCOMP_ADDFD_FLAG_SETFD`` flag can be used, and set the ``newfd`` member to
the specific number to use. If that file descriptor is already open in the
notifying process it will be replaced. The supervisor can also add an FD, and
respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
value will be the injected file descriptor number.
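A supervisor-side sketch of the fd-injection path described above follows. It assumes notify_fd is the seccomp notification fd, req is an already-received struct seccomp_notif, local_fd is the descriptor to inject, and that the UAPI header exposes SECCOMP_ADDFD_FLAG_SEND as documented::

	/* Sketch of injecting a file descriptor into the notifying process and
	 * answering the notification atomically, per the text above. notify_fd,
	 * req and local_fd are assumed to have been obtained earlier. */
	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/seccomp.h>

	static int inject_fd(int notify_fd, const struct seccomp_notif *req, int local_fd)
	{
		struct seccomp_notif_addfd addfd;

		memset(&addfd, 0, sizeof(addfd));
		addfd.id = req->id;		/* same id as in struct seccomp_notif */
		addfd.srcfd = local_fd;		/* supervisor-side fd to duplicate */
		addfd.newfd = 0;		/* only used with SECCOMP_ADDFD_FLAG_SETFD */
		addfd.newfd_flags = O_CLOEXEC;	/* flags for the fd in the target */
		addfd.flags = SECCOMP_ADDFD_FLAG_SEND;	/* add fd and reply atomically */

		/* On success the return value is the injected fd number in the target */
		return ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	}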
It is worth noting that ``struct seccomp_data`` contains the values of register
arguments to the syscall, but does not contain pointers to memory. The task's
memory is accessible to suitably privileged traces via ``ptrace()`` or

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 5
PATCHLEVEL = 13
SUBLEVEL = 1
SUBLEVEL = 2
EXTRAVERSION =
NAME = Opossums on Parade
@ -1039,7 +1039,7 @@ LDFLAGS_vmlinux += $(call ld-option, -X,)
endif
ifeq ($(CONFIG_RELR),y)
LDFLAGS_vmlinux += --pack-dyn-relocs=relr
LDFLAGS_vmlinux += --pack-dyn-relocs=relr --use-android-relr-tags
endif
# We never want expected sections to be placed heuristically by the

View File

@ -166,7 +166,6 @@ smp_callin(void)
DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
cpuid, current, current->active_mm));
preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

View File

@ -189,7 +189,6 @@ void start_kernel_secondary(void)
pr_info("## CPU%u LIVE ##: Executing Code...\n", cpu);
local_irq_enable();
preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

View File

@ -787,7 +787,7 @@
0xffffffff 0x3ffcfe7c 0x1c010101 /* pioA */
0x7fffffff 0xfffccc3a 0x3f00cc3a /* pioB */
0xffffffff 0x3ff83fff 0xff00ffff /* pioC */
0x0003ff00 0x8002a800 0x00000000 /* pioD */
0xb003ff00 0x8002a800 0x00000000 /* pioD */
0xffffffff 0x7fffffff 0x76fff1bf /* pioE */
>;

View File

@ -4,6 +4,7 @@
*/
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/leds/common.h>
#include "ste-href-family-pinctrl.dtsi"
/ {
@ -64,17 +65,20 @@
reg = <0>;
led-cur = /bits/ 8 <0x2f>;
max-cur = /bits/ 8 <0x5f>;
color = <LED_COLOR_ID_BLUE>;
linux,default-trigger = "heartbeat";
};
chan@1 {
reg = <1>;
led-cur = /bits/ 8 <0x2f>;
max-cur = /bits/ 8 <0x5f>;
color = <LED_COLOR_ID_BLUE>;
};
chan@2 {
reg = <2>;
led-cur = /bits/ 8 <0x2f>;
max-cur = /bits/ 8 <0x5f>;
color = <LED_COLOR_ID_BLUE>;
};
};
lp5521@34 {
@ -88,16 +92,19 @@
reg = <0>;
led-cur = /bits/ 8 <0x2f>;
max-cur = /bits/ 8 <0x5f>;
color = <LED_COLOR_ID_BLUE>;
};
chan@1 {
reg = <1>;
led-cur = /bits/ 8 <0x2f>;
max-cur = /bits/ 8 <0x5f>;
color = <LED_COLOR_ID_BLUE>;
};
chan@2 {
reg = <2>;
led-cur = /bits/ 8 <0x2f>;
max-cur = /bits/ 8 <0x5f>;
color = <LED_COLOR_ID_BLUE>;
};
};
bh1780@29 {

View File

@ -773,10 +773,10 @@ static inline void armv7pmu_write_counter(struct perf_event *event, u64 value)
pr_err("CPU%u writing wrong counter %d\n",
smp_processor_id(), idx);
} else if (idx == ARMV7_IDX_CYCLE_COUNTER) {
asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (value));
asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" ((u32)value));
} else {
armv7_pmnc_select_counter(idx);
asm volatile("mcr p15, 0, %0, c9, c13, 2" : : "r" (value));
asm volatile("mcr p15, 0, %0, c9, c13, 2" : : "r" ((u32)value));
}
}

View File

@ -432,7 +432,6 @@ asmlinkage void secondary_start_kernel(void)
#endif
pr_debug("CPU%u: Booted secondary processor\n", cpu);
preempt_disable();
trace_hardirqs_off();
/*

View File

@ -134,7 +134,7 @@
uart0: serial@12000 {
compatible = "marvell,armada-3700-uart";
reg = <0x12000 0x200>;
reg = <0x12000 0x18>;
clocks = <&xtalclk>;
interrupts =
<GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,

View File

@ -46,6 +46,7 @@
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4)
#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)

View File

@ -177,9 +177,9 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
return;
if (mm == &init_mm)
ttbr = __pa_symbol(reserved_pg_dir);
ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
else
ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48;
WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
}

View File

@ -23,7 +23,7 @@ static inline void preempt_count_set(u64 pc)
} while (0)
#define init_idle_preempt_count(p, cpu) do { \
task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)
static inline void set_preempt_need_resched(void)

View File

@ -14,6 +14,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong
CFLAGS_syscall.o += -fno-stack-protector
# It's not safe to invoke KCOV when portions of the kernel environment aren't
# available or are out-of-sync with HW state. Since `noinstr` doesn't always
# inhibit KCOV instrumentation, disable it for the entire compilation unit.
KCOV_INSTRUMENT_entry.o := n
# Object file lists.
obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
entry-common.o entry-fpsimd.o process.o ptrace.o \

View File

@ -312,7 +312,7 @@ static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK;
return snprintf(page, PAGE_SIZE, "0x%08x\n", slots);
return sysfs_emit(page, "0x%08x\n", slots);
}
static DEVICE_ATTR_RO(slots);

View File

@ -381,7 +381,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
* faults in case uaccess_enable() is inadvertently called by the init
* thread.
*/
init_task.thread_info.ttbr0 = __pa_symbol(reserved_pg_dir);
init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
#endif
if (boot_args[1] || boot_args[2] || boot_args[3]) {

View File

@ -224,7 +224,6 @@ asmlinkage notrace void secondary_start_kernel(void)
init_gic_priority_masking();
rcu_cpu_starting(cpu);
preempt_disable();
trace_hardirqs_off();
/*

View File

@ -689,6 +689,10 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
vgic_v4_load(vcpu);
preempt_enable();
}
if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
kvm_pmu_handle_pmcr(vcpu,
__vcpu_sys_reg(vcpu, PMCR_EL0));
}
}

View File

@ -578,6 +578,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_counter_value(vcpu, i, 0);
}
@ -850,6 +851,9 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
/* One-off reload of the PMU on first run */
kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
return 0;
}

View File

@ -281,7 +281,6 @@ void csky_start_secondary(void)
pr_info("CPU%u Online: %s...\n", cpu, __func__);
local_irq_enable();
preempt_disable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

View File

@ -12,15 +12,17 @@ SYSCALL_DEFINE3(cacheflush,
int, cache)
{
switch (cache) {
case ICACHE:
case BCACHE:
flush_icache_mm_range(current->mm,
(unsigned long)addr,
(unsigned long)addr + bytes);
fallthrough;
case DCACHE:
dcache_wb_range((unsigned long)addr,
(unsigned long)addr + bytes);
if (cache != BCACHE)
break;
fallthrough;
case ICACHE:
flush_icache_mm_range(current->mm,
(unsigned long)addr,
(unsigned long)addr + bytes);
break;
default:
return -EINVAL;

View File

@ -343,7 +343,7 @@ init_record_index_pools(void)
/* - 2 - */
sect_min_size = sal_log_sect_min_sizes[0];
for (i = 1; i < sizeof sal_log_sect_min_sizes/sizeof(size_t); i++)
for (i = 1; i < ARRAY_SIZE(sal_log_sect_min_sizes); i++)
if (sect_min_size > sal_log_sect_min_sizes[i])
sect_min_size = sal_log_sect_min_sizes[i];

View File

@ -441,7 +441,6 @@ start_secondary (void *unused)
#endif
efi_map_pal_code();
cpu_init();
preempt_disable();
smp_callin();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);

View File

@ -25,6 +25,9 @@ config ATARI
this kernel on an Atari, say Y here and browse the material
available in <file:Documentation/m68k>; otherwise say N.
config ATARI_KBD_CORE
bool
config MAC
bool "Macintosh support"
depends on MMU

View File

@ -36,7 +36,7 @@ extern pte_t *pkmap_page_table;
* easily, subsequent pte tables have to be allocated in one physical
* chunk of RAM.
*/
#ifdef CONFIG_PHYS_ADDR_T_64BIT
#if defined(CONFIG_PHYS_ADDR_T_64BIT) || defined(CONFIG_MIPS_HUGE_TLB_SUPPORT)
#define LAST_PKMAP 512
#else
#define LAST_PKMAP 1024

View File

@ -348,7 +348,6 @@ asmlinkage void start_secondary(void)
*/
calibrate_delay();
preempt_disable();
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;

View File

@ -145,8 +145,6 @@ asmlinkage __init void secondary_start_kernel(void)
set_cpu_online(cpu, true);
local_irq_enable();
preempt_disable();
/*
* OK, it's off to the idle thread for us
*/

View File

@ -302,7 +302,6 @@ void __init smp_callin(unsigned long pdce_proc)
#endif
smp_cpu_init(slave_id);
preempt_disable();
flush_cache_all_local(); /* start with known state */
flush_tlb_all_local(NULL);

View File

@ -98,6 +98,36 @@ static inline int cpu_last_thread_sibling(int cpu)
return cpu | (threads_per_core - 1);
}
/*
* tlb_thread_siblings are siblings which share a TLB. This is not
* architected, is not something a hypervisor could emulate and a future
* CPU may change behaviour even in compat mode, so this should only be
* used on PowerNV, and only with care.
*/
static inline int cpu_first_tlb_thread_sibling(int cpu)
{
if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
return cpu & ~0x6; /* Big Core */
else
return cpu_first_thread_sibling(cpu);
}
static inline int cpu_last_tlb_thread_sibling(int cpu)
{
if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
return cpu | 0x6; /* Big Core */
else
return cpu_last_thread_sibling(cpu);
}
static inline int cpu_tlb_thread_sibling_step(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
return 2; /* Big Core */
else
return 1;
}
static inline u32 get_tensr(void)
{
#ifdef CONFIG_BOOKE

View File

@ -186,6 +186,7 @@ struct interrupt_nmi_state {
u8 irq_soft_mask;
u8 irq_happened;
u8 ftrace_enabled;
u64 softe;
#endif
};
@ -211,6 +212,7 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
#ifdef CONFIG_PPC64
state->irq_soft_mask = local_paca->irq_soft_mask;
state->irq_happened = local_paca->irq_happened;
state->softe = regs->softe;
/*
* Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
@ -263,6 +265,7 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
/* Check we didn't change the pending interrupt mask. */
WARN_ON_ONCE((state->irq_happened | PACA_IRQ_HARD_DIS) != local_paca->irq_happened);
regs->softe = state->softe;
local_paca->irq_happened = state->irq_happened;
local_paca->irq_soft_mask = state->irq_soft_mask;
#endif

View File

@ -16,10 +16,10 @@ static inline bool is_kvm_guest(void)
return static_branch_unlikely(&kvm_guest);
}
bool check_kvm_guest(void);
int check_kvm_guest(void);
#else
static inline bool is_kvm_guest(void) { return false; }
static inline bool check_kvm_guest(void) { return false; }
static inline int check_kvm_guest(void) { return 0; }
#endif
#endif /* _ASM_POWERPC_KVM_GUEST_H_ */

View File

@ -23,18 +23,20 @@ EXPORT_SYMBOL_GPL(powerpc_firmware_features);
#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_KVM_GUEST)
DEFINE_STATIC_KEY_FALSE(kvm_guest);
bool check_kvm_guest(void)
int __init check_kvm_guest(void)
{
struct device_node *hyper_node;
hyper_node = of_find_node_by_path("/hypervisor");
if (!hyper_node)
return false;
return 0;
if (!of_device_is_compatible(hyper_node, "linux,kvm"))
return false;
return 0;
static_branch_enable(&kvm_guest);
return true;
return 0;
}
core_initcall(check_kvm_guest); // before kvm_guest_init()
#endif

View File

@ -481,12 +481,11 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
return -1;
}
static int mce_handle_ierror(struct pt_regs *regs,
static int mce_handle_ierror(struct pt_regs *regs, unsigned long srr1,
const struct mce_ierror_table table[],
struct mce_error_info *mce_err, uint64_t *addr,
uint64_t *phys_addr)
{
uint64_t srr1 = regs->msr;
int handled = 0;
int i;
@ -695,19 +694,19 @@ static long mce_handle_ue_error(struct pt_regs *regs,
}
static long mce_handle_error(struct pt_regs *regs,
unsigned long srr1,
const struct mce_derror_table dtable[],
const struct mce_ierror_table itable[])
{
struct mce_error_info mce_err = { 0 };
uint64_t addr, phys_addr = ULONG_MAX;
uint64_t srr1 = regs->msr;
long handled;
if (SRR1_MC_LOADSTORE(srr1))
handled = mce_handle_derror(regs, dtable, &mce_err, &addr,
&phys_addr);
else
handled = mce_handle_ierror(regs, itable, &mce_err, &addr,
handled = mce_handle_ierror(regs, srr1, itable, &mce_err, &addr,
&phys_addr);
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
@ -723,16 +722,20 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs)
/* P7 DD1 leaves top bits of DSISR undefined */
regs->dsisr &= 0x0000ffff;
return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
return mce_handle_error(regs, regs->msr,
mce_p7_derror_table, mce_p7_ierror_table);
}
long __machine_check_early_realmode_p8(struct pt_regs *regs)
{
return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
return mce_handle_error(regs, regs->msr,
mce_p8_derror_table, mce_p8_ierror_table);
}
long __machine_check_early_realmode_p9(struct pt_regs *regs)
{
unsigned long srr1 = regs->msr;
/*
* On POWER9 DD2.1 and below, it's possible to get a machine check
* caused by a paste instruction where only DSISR bit 25 is set. This
@ -746,10 +749,39 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs)
if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000)
return 1;
return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
/*
* Async machine check due to bad real address from store or foreign
* link time out comes with the load/store bit (PPC bit 42) set in
* SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're
* directed to the ierror table so it will find the cause (which
* describes it correctly as a store error).
*/
if (SRR1_MC_LOADSTORE(srr1) &&
((srr1 & 0x081c0000) == 0x08140000 ||
(srr1 & 0x081c0000) == 0x08180000)) {
srr1 &= ~PPC_BIT(42);
}
return mce_handle_error(regs, srr1,
mce_p9_derror_table, mce_p9_ierror_table);
}
long __machine_check_early_realmode_p10(struct pt_regs *regs)
{
return mce_handle_error(regs, mce_p10_derror_table, mce_p10_ierror_table);
unsigned long srr1 = regs->msr;
/*
* Async machine check due to bad real address from store comes with
* the load/store bit (PPC bit 42) set in SRR1, but the cause comes in
* SRR1 not DSISR. Clear bit 42 so we're directed to the ierror table
* so it will find the cause (which describes it correctly as a store
* error).
*/
if (SRR1_MC_LOADSTORE(srr1) &&
(srr1 & 0x081c0000) == 0x08140000) {
srr1 &= ~PPC_BIT(42);
}
return mce_handle_error(regs, srr1,
mce_p10_derror_table, mce_p10_ierror_table);
}

View File

@ -1213,6 +1213,19 @@ struct task_struct *__switch_to(struct task_struct *prev,
__flush_tlb_pending(batch);
batch->active = 0;
}
/*
* On POWER9 the copy-paste buffer can only paste into
* foreign real addresses, so unprivileged processes can not
* see the data or use it in any way unless they have
* foreign real mappings. If the new process has the foreign
* real address mappings, we must issue a cp_abort to clear
* any state and prevent snooping, corruption or a covert
* channel. ISA v3.1 supports paste into local memory.
*/
if (new->mm && (cpu_has_feature(CPU_FTR_ARCH_31) ||
atomic_read(&new->mm->context.vas_windows)))
asm volatile(PPC_CP_ABORT);
#endif /* CONFIG_PPC_BOOK3S_64 */
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
@ -1261,30 +1274,33 @@ struct task_struct *__switch_to(struct task_struct *prev,
#endif
last = _switch(old_thread, new_thread);
/*
* Nothing after _switch will be run for newly created tasks,
* because they switch directly to ret_from_fork/ret_from_kernel_thread
* etc. Code added here should have a comment explaining why that is
* okay.
*/
#ifdef CONFIG_PPC_BOOK3S_64
/*
* This applies to a process that was context switched while inside
* arch_enter_lazy_mmu_mode(), to re-activate the batch that was
* deactivated above, before _switch(). This will never be the case
* for new tasks.
*/
if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
if (current->thread.regs) {
/*
* Math facilities are masked out of the child MSR in copy_thread.
* A new task does not need to restore_math because it will
* demand fault them.
*/
if (current->thread.regs)
restore_math(current->thread.regs);
/*
* On POWER9 the copy-paste buffer can only paste into
* foreign real addresses, so unprivileged processes can not
* see the data or use it in any way unless they have
* foreign real mappings. If the new process has the foreign
* real address mappings, we must issue a cp_abort to clear
* any state and prevent snooping, corruption or a covert
* channel. ISA v3.1 supports paste into local memory.
*/
if (current->mm &&
(cpu_has_feature(CPU_FTR_ARCH_31) ||
atomic_read(&current->mm->context.vas_windows)))
asm volatile(PPC_CP_ABORT);
}
#endif /* CONFIG_PPC_BOOK3S_64 */
return last;

View File

@ -619,6 +619,8 @@ static void nmi_stop_this_cpu(struct pt_regs *regs)
/*
* IRQs are already hard disabled by the smp_handle_nmi_ipi.
*/
set_cpu_online(smp_processor_id(), false);
spin_begin();
while (1)
spin_cpu_relax();
@ -634,6 +636,15 @@ void smp_send_stop(void)
static void stop_this_cpu(void *dummy)
{
hard_irq_disable();
/*
* Offlining CPUs in stop_this_cpu can result in scheduler warnings,
* (see commit de6e5d38417e), but printk_safe_flush_on_panic() wants
* to know other CPUs are offline before it breaks locks to flush
* printk buffers, in case we panic()ed while holding the lock.
*/
set_cpu_online(smp_processor_id(), false);
spin_begin();
while (1)
spin_cpu_relax();
@ -1547,7 +1558,6 @@ void start_secondary(void *unused)
smp_store_cpu_info(cpu);
set_dec(tb_ticks_per_jiffy);
rcu_cpu_starting(cpu);
preempt_disable();
cpu_callin_map[cpu] = 1;
if (smp_ops->setup_cpu)

View File

@ -172,17 +172,31 @@ static void handle_backtrace_ipi(struct pt_regs *regs)
static void raise_backtrace_ipi(cpumask_t *mask)
{
struct paca_struct *p;
unsigned int cpu;
u64 delay_us;
for_each_cpu(cpu, mask) {
if (cpu == smp_processor_id())
if (cpu == smp_processor_id()) {
handle_backtrace_ipi(NULL);
else
smp_send_safe_nmi_ipi(cpu, handle_backtrace_ipi, 5 * USEC_PER_SEC);
}
continue;
}
for_each_cpu(cpu, mask) {
struct paca_struct *p = paca_ptrs[cpu];
delay_us = 5 * USEC_PER_SEC;
if (smp_send_safe_nmi_ipi(cpu, handle_backtrace_ipi, delay_us)) {
// Now wait up to 5s for the other CPU to do its backtrace
while (cpumask_test_cpu(cpu, mask) && delay_us) {
udelay(1);
delay_us--;
}
// Other CPU cleared itself from the mask
if (delay_us)
continue;
}
p = paca_ptrs[cpu];
cpumask_clear_cpu(cpu, mask);

View File

@ -2657,7 +2657,7 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
cpumask_t *cpu_in_guest;
int i;
cpu = cpu_first_thread_sibling(cpu);
cpu = cpu_first_tlb_thread_sibling(cpu);
if (nested) {
cpumask_set_cpu(cpu, &nested->need_tlb_flush);
cpu_in_guest = &nested->cpu_in_guest;
@ -2671,9 +2671,10 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
* the other side is the first smp_mb() in kvmppc_run_core().
*/
smp_mb();
for (i = 0; i < threads_per_core; ++i)
if (cpumask_test_cpu(cpu + i, cpu_in_guest))
smp_call_function_single(cpu + i, do_nothing, NULL, 1);
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
i += cpu_tlb_thread_sibling_step())
if (cpumask_test_cpu(i, cpu_in_guest))
smp_call_function_single(i, do_nothing, NULL, 1);
}
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
@ -2704,8 +2705,8 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
*/
if (prev_cpu != pcpu) {
if (prev_cpu >= 0 &&
cpu_first_thread_sibling(prev_cpu) !=
cpu_first_thread_sibling(pcpu))
cpu_first_tlb_thread_sibling(prev_cpu) !=
cpu_first_tlb_thread_sibling(pcpu))
radix_flush_cpu(kvm, prev_cpu, vcpu);
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;

View File

@ -800,7 +800,7 @@ void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
* Thus we make all 4 threads use the same bit.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
pcpu = cpu_first_thread_sibling(pcpu);
pcpu = cpu_first_tlb_thread_sibling(pcpu);
if (nested)
need_tlb_flush = &nested->need_tlb_flush;

View File

@ -53,7 +53,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
hr->dawrx1 = vcpu->arch.dawrx1;
}
static void byteswap_pt_regs(struct pt_regs *regs)
/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */
static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs)
{
unsigned long *addr = (unsigned long *) regs;

View File

@ -56,7 +56,7 @@ static int global_invalidates(struct kvm *kvm)
* so use the bit for the first thread to represent the core.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
cpu = cpu_first_thread_sibling(cpu);
cpu = cpu_first_tlb_thread_sibling(cpu);
cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
}

View File

@ -1522,8 +1522,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
}
EXPORT_SYMBOL_GPL(hash_page);
DECLARE_INTERRUPT_HANDLER_RET(__do_hash_fault);
DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
DECLARE_INTERRUPT_HANDLER(__do_hash_fault);
DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
{
unsigned long ea = regs->dar;
unsigned long dsisr = regs->dsisr;
@ -1533,6 +1533,11 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
unsigned int region_id;
long err;
if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
hash__do_page_fault(regs);
return;
}
region_id = get_region_id(ea);
if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
mm = &init_mm;
@ -1571,9 +1576,10 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
bad_page_fault(regs, SIGBUS);
}
err = 0;
}
return err;
} else if (err) {
hash__do_page_fault(regs);
}
}
/*
@ -1582,13 +1588,6 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
*/
DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
{
unsigned long dsisr = regs->dsisr;
if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
hash__do_page_fault(regs);
return 0;
}
/*
* If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
* don't call hash_page, just fail the fault. This is required to
@ -1607,8 +1606,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
return 0;
}
if (__do_hash_fault(regs))
hash__do_page_fault(regs);
__do_hash_fault(regs);
return 0;
}

View File

@ -78,9 +78,6 @@ static inline int smp_startup_cpu(unsigned int lcpu)
pcpu = get_hard_smp_processor_id(lcpu);
/* Fixup atomic count: it exited inside IRQ handler. */
task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0;
/*
* If the RTAS start-cpu token does not exist then presume the
* cpu is already spinning.

View File

@ -18,6 +18,7 @@
#include <asm/plpar_wrappers.h>
#include <asm/papr_pdsm.h>
#include <asm/mce.h>
#include <asm/unaligned.h>
#define BIND_ANY_ADDR (~0ul)
@ -900,6 +901,20 @@ static ssize_t flags_show(struct device *dev,
}
DEVICE_ATTR_RO(flags);
static umode_t papr_nd_attribute_visible(struct kobject *kobj,
struct attribute *attr, int n)
{
struct device *dev = kobj_to_dev(kobj);
struct nvdimm *nvdimm = to_nvdimm(dev);
struct papr_scm_priv *p = nvdimm_provider_data(nvdimm);
/* For if perf-stats not available remove perf_stats sysfs */
if (attr == &dev_attr_perf_stats.attr && p->stat_buffer_len == 0)
return 0;
return attr->mode;
}
/* papr_scm specific dimm attributes */
static struct attribute *papr_nd_attributes[] = {
&dev_attr_flags.attr,
@ -909,6 +924,7 @@ static struct attribute *papr_nd_attributes[] = {
static struct attribute_group papr_nd_attribute_group = {
.name = "papr",
.is_visible = papr_nd_attribute_visible,
.attrs = papr_nd_attributes,
};
@ -924,7 +940,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
struct nd_region_desc ndr_desc;
unsigned long dimm_flags;
int target_nid, online_nid;
ssize_t stat_size;
p->bus_desc.ndctl = papr_scm_ndctl;
p->bus_desc.module = THIS_MODULE;
@ -1009,16 +1024,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
list_add_tail(&p->region_list, &papr_nd_regions);
mutex_unlock(&papr_ndr_lock);
/* Try retriving the stat buffer and see if its supported */
stat_size = drc_pmem_query_stats(p, NULL, 0);
if (stat_size > 0) {
p->stat_buffer_len = stat_size;
dev_dbg(&p->pdev->dev, "Max perf-stat size %lu-bytes\n",
p->stat_buffer_len);
} else {
dev_info(&p->pdev->dev, "Dimm performance stats unavailable\n");
}
return 0;
err: nvdimm_bus_unregister(p->bus);
@ -1094,8 +1099,10 @@ static int papr_scm_probe(struct platform_device *pdev)
u32 drc_index, metadata_size;
u64 blocks, block_size;
struct papr_scm_priv *p;
u8 uuid_raw[UUID_SIZE];
const char *uuid_str;
u64 uuid[2];
ssize_t stat_size;
uuid_t uuid;
int rc;
/* check we have all the required DT properties */
@ -1138,16 +1145,23 @@ static int papr_scm_probe(struct platform_device *pdev)
p->hcall_flush_required = of_property_read_bool(dn, "ibm,hcall-flush-required");
/* We just need to ensure that set cookies are unique across */
uuid_parse(uuid_str, (uuid_t *) uuid);
uuid_parse(uuid_str, &uuid);
/*
* cookie1 and cookie2 are not really little endian
* we store a little endian representation of the
* uuid str so that we can compare this with the label
* area cookie irrespective of the endian config with which
* the kernel is built.
* The cookie1 and cookie2 are not really little endian.
* We store a raw buffer representation of the
* uuid string so that we can compare this with the label
* area cookie irrespective of the endian configuration
* with which the kernel is built.
*
* Historically we stored the cookie in the below format.
* for a uuid string 72511b67-0b3b-42fd-8d1d-5be3cae8bcaa
* cookie1 was 0xfd423b0b671b5172
* cookie2 was 0xaabce8cae35b1d8d
*/
p->nd_set.cookie1 = cpu_to_le64(uuid[0]);
p->nd_set.cookie2 = cpu_to_le64(uuid[1]);
export_uuid(uuid_raw, &uuid);
p->nd_set.cookie1 = get_unaligned_le64(&uuid_raw[0]);
p->nd_set.cookie2 = get_unaligned_le64(&uuid_raw[8]);
/* might be zero */
p->metadata_size = metadata_size;
@ -1172,6 +1186,14 @@ static int papr_scm_probe(struct platform_device *pdev)
p->res.name = pdev->name;
p->res.flags = IORESOURCE_MEM;
/* Try retrieving the stat buffer and see if its supported */
stat_size = drc_pmem_query_stats(p, NULL, 0);
if (stat_size > 0) {
p->stat_buffer_len = stat_size;
dev_dbg(&p->pdev->dev, "Max perf-stat size %lu-bytes\n",
p->stat_buffer_len);
}
rc = papr_scm_nvdimm_init(p);
if (rc)
goto err2;

View File

@ -105,9 +105,6 @@ static inline int smp_startup_cpu(unsigned int lcpu)
return 1;
}
/* Fixup atomic count: it exited inside IRQ handler. */
task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0;
/*
* If the RTAS start-cpu token does not exist then presume the
* cpu is already spinning.
@ -211,7 +208,9 @@ static __init void pSeries_smp_probe(void)
if (!cpu_has_feature(CPU_FTR_SMT))
return;
if (check_kvm_guest()) {
check_kvm_guest();
if (is_kvm_guest()) {
/*
* KVM emulates doorbells by disabling FSCR[MSGP] so msgsndp
* faults to the hypervisor which then reads the instruction

View File

@ -180,7 +180,6 @@ asmlinkage __visible void smp_callin(void)
* Disable preemption before enabling interrupts, so we don't try to
* schedule a CPU that hasn't actually started yet.
*/
preempt_disable();
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

View File

@ -164,6 +164,7 @@ config S390
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_VDSO
select HAVE_IOREMAP_PROT if PCI
select HAVE_IRQ_EXIT_ON_IRQ_STACK
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
@ -853,7 +854,7 @@ config CMM_IUCV
config APPLDATA_BASE
def_bool n
prompt "Linux - VM Monitor Stream, base infrastructure"
depends on PROC_FS
depends on PROC_SYSCTL
help
This provides a kernel interface for creating and updating z/VM APPLDATA
monitor records. The monitor records are updated at certain time

View File

@ -36,6 +36,7 @@ void uv_query_info(void)
uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
uv_info.uv_feature_indications = uvcb.uv_feature_indications;
}
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST

View File

@ -344,8 +344,6 @@ static inline int is_module_addr(void *addr)
#define PTRS_PER_P4D _CRST_ENTRIES
#define PTRS_PER_PGD _CRST_ENTRIES
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
/*
* Segment table and region3 table entry encoding
* (R = read-only, I = invalid, y = young bit):
@ -865,6 +863,25 @@ static inline int pte_unused(pte_t pte)
return pte_val(pte) & _PAGE_UNUSED;
}
/*
* Extract the pgprot value from the given pte while at the same time making it
* usable for kernel address space mappings where fault driven dirty and
* young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID
* must not be set.
*/
static inline pgprot_t pte_pgprot(pte_t pte)
{
unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK;
if (pte_write(pte))
pte_flags |= pgprot_val(PAGE_KERNEL);
else
pte_flags |= pgprot_val(PAGE_KERNEL_RO);
pte_flags |= pte_val(pte) & mio_wb_bit_mask;
return __pgprot(pte_flags);
}
/*
* pgd/pmd/pte modification functions
*/

View File

@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc)
old, new) != old);
}
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
S390_lowcore.preempt_count = PREEMPT_ENABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
{
__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
@ -88,12 +82,6 @@ static inline void preempt_count_set(int pc)
S390_lowcore.preempt_count = pc;
}
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
S390_lowcore.preempt_count = PREEMPT_ENABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
{
}
@ -130,6 +118,10 @@ static inline bool should_resched(int preempt_offset)
#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
#define init_task_preempt_count(p) do { } while (0)
/* Deferred to CPU bringup time */
#define init_idle_preempt_count(p, cpu) do { } while (0)
#ifdef CONFIG_PREEMPTION
extern void preempt_schedule(void);
#define __preempt_schedule() preempt_schedule()

View File

@ -73,6 +73,10 @@ enum uv_cmds_inst {
BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
};
enum uv_feat_ind {
BIT_UV_FEAT_MISC = 0,
};
struct uv_cb_header {
u16 len;
u16 cmd; /* Command Code */
@ -97,7 +101,8 @@ struct uv_cb_qui {
u64 max_guest_stor_addr;
u8 reserved88[158 - 136];
u16 max_guest_cpu_id;
u8 reserveda0[200 - 160];
u64 uv_feature_indications;
u8 reserveda0[200 - 168];
} __packed __aligned(8);
/* Initialize Ultravisor */
@ -274,6 +279,7 @@ struct uv_info {
unsigned long max_sec_stor_addr;
unsigned int max_num_sec_conf;
unsigned short max_guest_cpu_id;
unsigned long uv_feature_indications;
};
extern struct uv_info uv_info;

View File

@ -466,6 +466,7 @@ static void __init setup_lowcore_dat_off(void)
lc->br_r1_trampoline = 0x07f1; /* br %r1 */
lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
lc->preempt_count = PREEMPT_DISABLED;
set_prefix((u32)(unsigned long) lc);
lowcore_ptr[0] = lc;

View File

@ -219,6 +219,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
lc->br_r1_trampoline = 0x07f1; /* br %r1 */
lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
lc->preempt_count = PREEMPT_DISABLED;
if (nmi_alloc_per_cpu(lc))
goto out_stack;
lowcore_ptr[cpu] = lc;
@ -878,7 +879,6 @@ static void smp_init_secondary(void)
restore_access_regs(S390_lowcore.access_regs_save_area);
cpu_init();
rcu_cpu_starting(cpu);
preempt_disable();
init_cpu_timer();
vtime_init();
vdso_getcpu_init();

View File

@ -364,6 +364,15 @@ static ssize_t uv_query_facilities(struct kobject *kobj,
static struct kobj_attribute uv_query_facilities_attr =
__ATTR(facilities, 0444, uv_query_facilities, NULL);
static ssize_t uv_query_feature_indications(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
}
static struct kobj_attribute uv_query_feature_indications_attr =
__ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
@ -396,6 +405,7 @@ static struct kobj_attribute uv_query_max_guest_addr_attr =
static struct attribute *uv_query_attrs[] = {
&uv_query_facilities_attr.attr,
&uv_query_feature_indications_attr.attr,
&uv_query_max_guest_cpus_attr.attr,
&uv_query_max_guest_vms_attr.attr,
&uv_query_max_guest_addr_attr.attr,

View File

@ -329,31 +329,31 @@ static void allow_cpu_feat(unsigned long nr)
static inline int plo_test_bit(unsigned char nr)
{
register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
unsigned long function = (unsigned long)nr | 0x100;
int cc;
asm volatile(
" lgr 0,%[function]\n"
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc)
: "d" (r0)
: "cc");
: [function] "d" (function)
: "cc", "0");
return cc == 0;
}
static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
{
register unsigned long r0 asm("0") = 0; /* query function */
register unsigned long r1 asm("1") = (unsigned long) query;
asm volatile(
/* Parameter regs are ignored */
" lghi 0,0\n"
" lgr 1,%[query]\n"
/* Parameter registers are ignored */
" .insn rrf,%[opc] << 16,2,4,6,0\n"
:
: "d" (r0), "a" (r1), [opc] "i" (opcode)
: "cc", "memory");
: [query] "d" ((unsigned long)query), [opc] "i" (opcode)
: "cc", "memory", "0", "1");
}
#define INSN_SORTL 0xb938

View File

@ -792,6 +792,32 @@ void do_secure_storage_access(struct pt_regs *regs)
struct page *page;
int rc;
/*
* bit 61 tells us if the address is valid, if it's not we
* have a major problem and should stop the kernel or send a
* SIGSEGV to the process. Unfortunately bit 61 is not
* reliable without the misc UV feature so we need to check
* for that as well.
*/
if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
!test_bit_inv(61, &regs->int_parm_long)) {
/*
* When this happens, userspace did something that it
* was not supposed to do, e.g. branching into secure
* memory. Trigger a segmentation fault.
*/
if (user_mode(regs)) {
send_sig(SIGSEGV, current, 0);
return;
}
/*
* The kernel should never run into this case and we
* have no way out of this situation.
*/
panic("Unexpected PGM 0x3d with TEID bit 61=0");
}
switch (get_fault_type(regs)) {
case USER_FAULT:
mm = current->mm;

View File

@ -186,8 +186,6 @@ asmlinkage void start_secondary(void)
per_cpu_trap_init();
preempt_disable();
notify_cpu_starting(cpu);
local_irq_enable();

View File

@ -348,7 +348,6 @@ static void sparc_start_secondary(void *arg)
*/
arch_cpu_pre_starting(arg);
preempt_disable();
cpu = smp_processor_id();
notify_cpu_starting(cpu);

View File

@ -138,9 +138,6 @@ void smp_callin(void)
set_cpu_online(cpuid, true);
/* idle thread is expected to have preempt disabled */
preempt_disable();
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);

View File

@ -1500,7 +1500,7 @@ static int __init curve25519_mod_init(void)
static void __exit curve25519_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
(boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
static_branch_likely(&curve25519_use_bmi2_adx))
crypto_unregister_kpp(&curve25519_alg);
}

View File

@ -506,7 +506,7 @@ SYM_CODE_START(\asmsym)
movq %rsp, %rdi /* pt_regs pointer */
call \cfunc
call kernel_\cfunc
/*
* No need to switch back to the IST stack. The current stack is either
@ -517,7 +517,7 @@ SYM_CODE_START(\asmsym)
/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
idtentry_body safe_stack_\cfunc, has_error_code=1
idtentry_body user_\cfunc, has_error_code=1
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)

View File

@ -1626,6 +1626,8 @@ static void x86_pmu_del(struct perf_event *event, int flags)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto do_del;
__set_bit(event->hw.idx, cpuc->dirty);
/*
* Not a TXN, therefore cleanup properly.
*/
@ -2474,6 +2476,31 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}
void perf_clear_dirty_counters(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i;
/* Don't need to clear the assigned counter. */
for (i = 0; i < cpuc->n_events; i++)
__clear_bit(cpuc->assign[i], cpuc->dirty);
if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
return;
for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
/* Metrics and fake events don't have corresponding HW counters. */
if (is_metric_idx(i) || (i == INTEL_PMC_IDX_FIXED_VLBR))
continue;
else if (i >= INTEL_PMC_IDX_FIXED)
wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
else
wrmsrl(x86_pmu_event_addr(i), 0);
}
bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
}
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
@ -2497,7 +2524,6 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;

View File

@ -280,6 +280,8 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
EVENT_EXTRA_END
};
@ -4030,8 +4032,10 @@ spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
* The :ppp indicates the Precise Distribution (PDist) facility, which
* is only supported on the GP counter 0. If a :ppp event which is not
* available on the GP counter 0, error out.
* Exception: Instruction PDIR is only available on the fixed counter 0.
*/
if (event->attr.precise_ip == 3) {
if ((event->attr.precise_ip == 3) &&
!constraint_match(&fixed0_constraint, event->hw.config)) {
if (c->idxmsk64 & BIT_ULL(0))
return &counter0_constraint;
@ -6157,8 +6161,13 @@ __init int intel_pmu_init(void)
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
pmu->name = "cpu_core";
pmu->cpu_type = hybrid_big;
pmu->num_counters = x86_pmu.num_counters + 2;
pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
pmu->num_counters = x86_pmu.num_counters + 2;
pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
} else {
pmu->num_counters = x86_pmu.num_counters;
pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
}
pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
pmu->unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,

View File

@ -229,6 +229,7 @@ struct cpu_hw_events {
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */

View File

@ -312,8 +312,8 @@ static __always_inline void __##func(struct pt_regs *regs)
*/
#define DECLARE_IDTENTRY_VC(vector, func) \
DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
__visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \
__visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code)
__visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
__visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
/**
* DEFINE_IDTENTRY_IST - Emit code for IST entry points
@ -355,33 +355,24 @@ static __always_inline void __##func(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
/**
* DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler
which runs on a safe stack.
* DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
when raised from kernel mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func)
#define DEFINE_IDTENTRY_VC_KERNEL(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
/**
* DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler
which runs on the VC fall-back stack
* DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
when raised from user mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
#define DEFINE_IDTENTRY_VC_IST(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func)
/**
* DEFINE_IDTENTRY_VC - Emit code for VMM communication handler
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
#define DEFINE_IDTENTRY_VC(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
#define DEFINE_IDTENTRY_VC_USER(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
#else /* CONFIG_X86_64 */

View File

@ -85,7 +85,7 @@
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_HV_TLB_FLUSH \
#define KVM_REQ_TLB_FLUSH_GUEST \
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
@ -1464,6 +1464,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot,

View File

@ -478,6 +478,7 @@ struct x86_pmu_lbr {
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
extern void perf_clear_dirty_counters(void);
extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)

View File

@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*

View File

@ -2,10 +2,12 @@
#ifndef _ASM_X86_HWCAP2_H
#define _ASM_X86_HWCAP2_H
#include <linux/const.h>
/* MONITOR/MWAIT enabled in Ring 3 */
#define HWCAP2_RING3MWAIT (1 << 0)
#define HWCAP2_RING3MWAIT _BITUL(0)
/* Kernel allows FSGSBASE instructions available in Ring 3 */
#define HWCAP2_FSGSBASE BIT(1)
#define HWCAP2_FSGSBASE _BITUL(1)
#endif

View File

@ -236,7 +236,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
for_each_present_cpu(i) {
if (i == 0)
continue;
ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
ret = hv_call_add_logical_proc(numa_cpu_node(i), i, i);
BUG_ON(ret);
}

View File

@ -549,6 +549,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_CNL_IDS(&gen9_early_ops),
INTEL_ICL_11_IDS(&gen11_early_ops),
INTEL_EHL_IDS(&gen11_early_ops),
INTEL_JSL_IDS(&gen11_early_ops),
INTEL_TGL_12_IDS(&gen11_early_ops),
INTEL_RKL_IDS(&gen11_early_ops),
INTEL_ADLS_IDS(&gen11_early_ops),

View File

@ -7,12 +7,11 @@
* Author: Joerg Roedel <jroedel@suse.de>
*/
#define pr_fmt(fmt) "SEV-ES: " fmt
#define pr_fmt(fmt) "SEV: " fmt
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/mem_encrypt.h>
#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
@ -192,11 +191,19 @@ void noinstr __sev_es_ist_exit(void)
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}
static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
/*
* Nothing shall interrupt this code path while holding the per-CPU
* GHCB. The backup GHCB is only for NMIs interrupting this path.
*
* Callers must disable local interrupts around it.
*/
static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
WARN_ON(!irqs_disabled());
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
@ -213,7 +220,9 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
data->ghcb_active = false;
data->backup_ghcb_active = false;
instrumentation_begin();
panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
instrumentation_end();
}
/* Mark backup_ghcb active before writing to it */
@ -479,11 +488,13 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
static noinstr void __sev_put_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
WARN_ON(!irqs_disabled());
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
@ -507,7 +518,7 @@ void noinstr __sev_es_nmi_complete(void)
struct ghcb_state state;
struct ghcb *ghcb;
ghcb = sev_es_get_ghcb(&state);
ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
@ -517,7 +528,7 @@ void noinstr __sev_es_nmi_complete(void)
sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
VMGEXIT();
sev_es_put_ghcb(&state);
__sev_put_ghcb(&state);
}
static u64 get_jump_table_addr(void)
@ -529,7 +540,7 @@ static u64 get_jump_table_addr(void)
local_irq_save(flags);
ghcb = sev_es_get_ghcb(&state);
ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
@ -543,7 +554,7 @@ static u64 get_jump_table_addr(void)
ghcb_sw_exit_info_2_is_valid(ghcb))
ret = ghcb->save.sw_exit_info_2;
sev_es_put_ghcb(&state);
__sev_put_ghcb(&state);
local_irq_restore(flags);
@ -668,7 +679,7 @@ static void sev_es_ap_hlt_loop(void)
struct ghcb_state state;
struct ghcb *ghcb;
ghcb = sev_es_get_ghcb(&state);
ghcb = __sev_get_ghcb(&state);
while (true) {
vc_ghcb_invalidate(ghcb);
@ -685,7 +696,7 @@ static void sev_es_ap_hlt_loop(void)
break;
}
sev_es_put_ghcb(&state);
__sev_put_ghcb(&state);
}
/*
@ -775,7 +786,7 @@ void __init sev_es_init_vc_handling(void)
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
}
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
@ -1213,14 +1224,6 @@ static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
return ES_EXCEPTION;
}
static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
{
if (user_mode(regs))
noist_exc_debug(regs);
else
exc_debug(regs);
}
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
struct ghcb *ghcb,
unsigned long exit_code)
@ -1316,44 +1319,15 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
/*
* Main #VC exception handler. It is called when the entry code was able to
* switch off the IST to a safe kernel stack.
*
* With the current implementation it is always possible to switch to a safe
* stack because #VC exceptions only happen at known places, like intercepted
* instructions or accesses to MMIO areas/IO ports. They can also happen with
* code instrumentation when the hypervisor intercepts #DB, but the critical
* paths are forbidden to be instrumented, so #DB exceptions currently also
* only happen in safe places.
*/
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
{
irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
bool ret = true;
/*
* Handle #DB before calling into !noinstr code to avoid recursive #DB.
*/
if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
vc_handle_trap_db(regs);
return;
}
irq_state = irqentry_nmi_enter(regs);
lockdep_assert_irqs_disabled();
instrumentation_begin();
/*
* This is invoked through an interrupt gate, so IRQs are disabled. The
* code below might walk page-tables for user or kernel addresses, so
* keep the IRQs disabled to protect us against concurrent TLB flushes.
*/
ghcb = sev_es_get_ghcb(&state);
ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
result = vc_init_em_ctxt(&ctxt, regs, error_code);
@ -1361,7 +1335,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
sev_es_put_ghcb(&state);
__sev_put_ghcb(&state);
/* Done - now check the result */
switch (result) {
@ -1371,15 +1345,18 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
case ES_UNSUPPORTED:
pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
error_code, regs->ip);
goto fail;
ret = false;
break;
case ES_VMM_ERROR:
pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
goto fail;
ret = false;
break;
case ES_DECODE_FAILED:
pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
goto fail;
ret = false;
break;
case ES_EXCEPTION:
vc_forward_exception(&ctxt);
break;
@ -1395,24 +1372,52 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
BUG();
}
out:
instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
return ret;
}
return;
static __always_inline bool vc_is_db(unsigned long error_code)
{
return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
}
fail:
if (user_mode(regs)) {
/*
* Do not kill the machine if user-space triggered the
* exception. Send SIGBUS instead and let user-space deal with
* it.
*/
force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
} else {
pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
result);
/*
* Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
* and will panic when an error happens.
*/
DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
{
irqentry_state_t irq_state;
/*
* With the current implementation it is always possible to switch to a
* safe stack because #VC exceptions only happen at known places, like
* intercepted instructions or accesses to MMIO areas/IO ports. They can
* also happen with code instrumentation when the hypervisor intercepts
* #DB, but the critical paths are forbidden to be instrumented, so #DB
* exceptions currently also only happen in safe places.
*
* But keep this here in case the noinstr annotations are violated due
* to bug elsewhere.
*/
if (unlikely(on_vc_fallback_stack(regs))) {
instrumentation_begin();
panic("Can't handle #VC exception from unsupported context\n");
instrumentation_end();
}
/*
* Handle #DB before calling into !noinstr code to avoid recursive #DB.
*/
if (vc_is_db(error_code)) {
exc_debug(regs);
return;
}
irq_state = irqentry_nmi_enter(regs);
instrumentation_begin();
if (!vc_raw_handle_exception(regs, error_code)) {
/* Show some debug info */
show_regs(regs);
@ -1423,23 +1428,38 @@ fail:
panic("Returned from Terminate-Request to Hypervisor\n");
}
goto out;
}
/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
{
instrumentation_begin();
panic("Can't handle #VC exception from unsupported context\n");
instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
}
DEFINE_IDTENTRY_VC(exc_vmm_communication)
/*
* Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
* and will kill the current task with SIGBUS when an error happens.
*/
DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
{
if (likely(!on_vc_fallback_stack(regs)))
safe_stack_exc_vmm_communication(regs, error_code);
else
ist_exc_vmm_communication(regs, error_code);
/*
* Handle #DB before calling into !noinstr code to avoid recursive #DB.
*/
if (vc_is_db(error_code)) {
noist_exc_debug(regs);
return;
}
irqentry_enter_from_user_mode(regs);
instrumentation_begin();
if (!vc_raw_handle_exception(regs, error_code)) {
/*
* Do not kill the machine if user-space triggered the
* exception. Send SIGBUS instead and let user-space deal with
* it.
*/
force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
}
instrumentation_end();
irqentry_exit_to_user_mode(regs);
}
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
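The hunks above split the runtime #VC handler into a shared worker, vc_raw_handle_exception(), plus thin kernel-mode and user-mode entry points: the kernel-mode path runs NMI-like and panics on failure, while the user-mode path runs in IRQ context and sends SIGBUS. Purely as an illustration of that shape, here is a minimal userspace sketch; the names and the abort()/SIGBUS behaviour are analogies, not the kernel's code:

/* Userspace illustration only; a simplified analogy of the split-handler shape. */
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_regs { unsigned long ip; };

/* Shared worker: returns false when the exit code cannot be handled. */
static bool raw_handle(struct fake_regs *regs, unsigned long error_code)
{
	(void)regs;
	return error_code == 0;		/* pretend only code 0 is handled */
}

/* Kernel-mode entry point: a failure here is fatal. */
static void handle_from_kernel(struct fake_regs *regs, unsigned long error_code)
{
	if (!raw_handle(regs, error_code)) {
		fprintf(stderr, "PANIC: unhandled exception (code=%lu ip=%#lx)\n",
			error_code, regs->ip);
		abort();
	}
}

/* User-mode entry point: turn the failure into a signal for the task. */
static void handle_from_user(struct fake_regs *regs, unsigned long error_code)
{
	if (!raw_handle(regs, error_code))
		raise(SIGBUS);
}

int main(void)
{
	struct fake_regs regs = { .ip = 0x1234 };

	handle_from_kernel(&regs, 0);
	handle_from_user(&regs, 0);
	puts("both wrappers handled the benign case");
	return 0;
}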

View File

@ -236,7 +236,6 @@ static void notrace start_secondary(void *unused)
cpu_init();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
enable_start_cpu0 = 0;

View File

@ -1152,7 +1152,8 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_VALID_FOR_HRES |
CLOCK_SOURCE_MUST_VERIFY,
CLOCK_SOURCE_MUST_VERIFY |
CLOCK_SOURCE_VERIFY_PERCPU,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,

View File

@ -202,10 +202,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
/*
* Except for the MMU, which needs to be reset after any vendor
* specific adjustments to the reserved GPA bits.
* Except for the MMU, which needs to do its thing after any vendor
* specific adjustments to the reserved GPA bits.
*/
kvm_mmu_reset_context(vcpu);
kvm_mmu_after_set_cpuid(vcpu);
}
static int is_efer_nx(void)

View File

@ -1704,7 +1704,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
NULL, vcpu_mask, &hv_vcpu->tlb_flush);
ret_success:

View File

@ -4168,7 +4168,15 @@ static inline u64 reserved_hpa_bits(void)
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
bool uses_nx = context->nx ||
/*
* KVM uses NX when TDP is disabled to handle a variety of scenarios,
* notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
* to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
* The iTLB multi-hit workaround can be toggled at any time, so assume
* NX can be used by any non-nested shadow MMU to avoid having to reset
* MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
*/
bool uses_nx = context->nx || !tdp_enabled ||
context->mmu_role.base.smep_andnot_wp;
struct rsvd_bits_validate *shadow_zero_check;
int i;
@ -4851,6 +4859,18 @@ kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
return role.base;
}
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
kvm_mmu_reset_context(vcpu);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
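kvm_mmu_after_set_cpuid() above forces every cached MMU role to be recomputed by clearing its valid bit before resetting the context. The general pattern is: drop cached derived state whenever its inputs change, and recompute lazily on the next use. A minimal userspace sketch of that pattern, with hypothetical names and a stand-in derivation:

/* Userspace illustration only; the struct and derivation are invented. */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical cached state derived from some configuration input. */
struct cached_role {
	bool valid;
	unsigned int derived;		/* recomputed from the input when needed */
};

static void recompute_if_needed(struct cached_role *role, unsigned int input)
{
	if (role->valid)
		return;			/* fast path: cache still trusted */
	role->derived = input * 2;	/* stand-in for the real derivation */
	role->valid = true;
	printf("recomputed derived=%u\n", role->derived);
}

/* Called when the configuration changes: drop the cache unconditionally. */
static void config_changed(struct cached_role *role)
{
	role->valid = false;
}

int main(void)
{
	struct cached_role role = { 0 };

	recompute_if_needed(&role, 21);	/* computes 42 */
	recompute_if_needed(&role, 99);	/* cache hit, nothing recomputed */
	config_changed(&role);		/* like clearing mmu_role.ext.valid */
	recompute_if_needed(&role, 99);	/* forced to recompute: 198 */
	return 0;
}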

View File

@ -471,8 +471,7 @@ retry_walk:
error:
errcode |= write_fault | user_fault;
if (fetch_fault && (mmu->nx ||
kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
if (fetch_fault && (mmu->nx || mmu->mmu_role.ext.cr4_smep))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;

View File

@ -102,13 +102,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
else if (kvm_vcpu_ad_need_write_protect(vcpu))
spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
/*
* Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set
* if PAE paging may be employed (shadow paging or any 32-bit KVM).
*/
WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
(spte & SPTE_TDP_AD_MASK));
/*
* For the EPT case, shadow_present_mask is 0 if hardware
* supports exec-only page table entries. In that case,

View File

@ -912,7 +912,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
kvm_pfn_t pfn, bool prefault)
{
u64 new_spte;
int ret = 0;
int ret = RET_PF_FIXED;
int make_spte_ret = 0;
if (unlikely(is_noslot_pfn(pfn)))
@ -949,7 +949,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
rcu_dereference(iter->sptep));
}
if (!prefault)
/*
* Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
* consistent with legacy MMU behavior.
*/
if (ret != RET_PF_SPURIOUS)
vcpu->stat.pf_fixed++;
return ret;

View File

@ -1127,12 +1127,19 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
/*
* Unconditionally skip the TLB flush on fast CR3 switch, all TLB
* flushes are handled by nested_vmx_transition_tlb_flush(). See
* nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
* flushes are handled by nested_vmx_transition_tlb_flush().
*/
if (!nested_ept)
kvm_mmu_new_pgd(vcpu, cr3, true,
!nested_vmx_transition_mmu_sync(vcpu));
if (!nested_ept) {
kvm_mmu_new_pgd(vcpu, cr3, true, true);
/*
* A TLB flush on VM-Enter/VM-Exit flushes all linear mappings
* across all PCIDs, i.e. all PGDs need to be synchronized.
* See nested_vmx_transition_mmu_sync() for more details.
*/
if (nested_vmx_transition_mmu_sync(vcpu))
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
@ -3682,7 +3689,7 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
}
}
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@ -3690,17 +3697,17 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
u16 status;
if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
return;
return 0;
vmx->nested.pi_pending = false;
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return;
return 0;
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
return;
return 0;
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
@ -3713,6 +3720,7 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
}
nested_mark_vmcs12_pages_dirty(vcpu);
return 0;
}
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
@ -3887,8 +3895,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
no_vmexit:
vmx_complete_nested_posted_interrupt(vcpu);
return 0;
return vmx_complete_nested_posted_interrupt(vcpu);
}
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@ -5481,8 +5488,6 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
{
u32 index = kvm_rcx_read(vcpu);
u64 new_eptp;
bool accessed_dirty;
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
if (!nested_cpu_has_eptp_switching(vmcs12) ||
!nested_cpu_has_ept(vmcs12))
@ -5491,13 +5496,10 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
if (index >= VMFUNC_EPTP_ENTRIES)
return 1;
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
&new_eptp, index * 8, 8))
return 1;
accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
/*
* If the (L2) guest does a vmfunc to the currently
* active ept pointer, we don't have to do anything else
@ -5506,8 +5508,6 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
mmu->ept_ad = accessed_dirty;
mmu->mmu_role.base.ad_disabled = !accessed_dirty;
vmcs12->ept_pointer = new_eptp;
kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
@ -5533,7 +5533,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
}
vmcs12 = get_vmcs12(vcpu);
if ((vmcs12->vm_function_control & (1 << function)) == 0)
if (!(vmcs12->vm_function_control & BIT_ULL(function)))
goto fail;
switch (function) {
@ -5806,6 +5806,9 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
else if (is_breakpoint(intr_info) &&
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return true;
else if (is_alignment_check(intr_info) &&
!vmx_guest_inject_ac(vcpu))
return true;
return false;
case EXIT_REASON_EXTERNAL_INTERRUPT:
return true;
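One small fix in the handle_vmfunc() hunk above replaces (1 << function) with BIT_ULL(function): vm_function_control is a 64-bit field, and a plain int shift can never reach bits 32-63 and is undefined behaviour for shift counts of 31 or more. A standalone sketch of the 64-bit-safe form, using a local macro rather than the kernel's BIT_ULL():

/* Userspace illustration only; MY_BIT_ULL is a local stand-in. */
#include <stdint.h>
#include <stdio.h>

#define MY_BIT_ULL(n)	(1ULL << (n))	/* shift performed in 64-bit width */

static int bit_is_set(uint64_t mask, unsigned int n)
{
	/*
	 * (1 << n) would be an int shift: undefined for n >= 31 on common
	 * ABIs, and unable to reach bits 32..63 in any case. Doing the
	 * shift as unsigned long long keeps all 64 bits reachable.
	 */
	return (mask & MY_BIT_ULL(n)) != 0;
}

int main(void)
{
	uint64_t control = MY_BIT_ULL(0) | MY_BIT_ULL(40);

	printf("bit 0:  %d\n", bit_is_set(control, 0));		/* 1 */
	printf("bit 40: %d\n", bit_is_set(control, 40));	/* 1 */
	printf("bit 7:  %d\n", bit_is_set(control, 7));		/* 0 */
	return 0;
}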

View File

@ -117,6 +117,11 @@ static inline bool is_gp_fault(u32 intr_info)
return is_exception_n(intr_info, GP_VECTOR);
}
static inline bool is_alignment_check(u32 intr_info)
{
return is_exception_n(intr_info, AC_VECTOR);
}
static inline bool is_machine_check(u32 intr_info)
{
return is_exception_n(intr_info, MC_VECTOR);

View File

@ -4829,7 +4829,7 @@ static int handle_machine_check(struct kvm_vcpu *vcpu)
* - Guest has #AC detection enabled in CR0
* - Guest EFLAGS has AC bit set
*/
static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
@ -4937,7 +4937,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.exception = ex_no;
break;
case AC_VECTOR:
if (guest_inject_ac(vcpu)) {
if (vmx_guest_inject_ac(vcpu)) {
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
return 1;
}

View File

@ -387,6 +387,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);

View File

@ -9171,7 +9171,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@ -10454,6 +10454,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
kvm_lapic_reset(vcpu, init_event);
vcpu->arch.hflags = 0;
@ -10522,6 +10524,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.ia32_xss = 0;
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
/*
* Reset the MMU context if paging was enabled prior to INIT (which is
* implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
* standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
* checked because it is unconditionally cleared on INIT and all other
* paging related bits are ignored if paging is disabled, i.e. CR0.WP,
* CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
*/
if (old_cr0 & X86_CR0_PG)
kvm_mmu_reset_context(vcpu);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)

View File

@ -14,6 +14,7 @@
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
#include "mm_internal.h"
@ -404,9 +405,14 @@ static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
(!static_branch_unlikely(&rdpmc_never_available_key) &&
atomic_read(&mm->context.perf_rdpmc_allowed)))
atomic_read(&mm->context.perf_rdpmc_allowed))) {
/*
* Clear the existing dirty counters to
* prevent the leak for an RDPMC task.
*/
perf_clear_dirty_counters();
cr4_set_bits_irqsoff(X86_CR4_PCE);
else
} else
cr4_clear_bits_irqsoff(X86_CR4_PCE);
}

View File

@ -1297,7 +1297,7 @@ st: if (is_imm8(insn->off))
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen;
u8 *_insn = image + proglen + (start_of_ldx - temp);
s64 delta;
/* populate jmp_offset for JMP above */

View File

@ -145,7 +145,6 @@ void secondary_start_kernel(void)
cpumask_set_cpu(cpu, mm_cpumask(mm));
enter_lazy_tlb(mm, current);
preempt_disable();
trace_hardirqs_off();
calibrate_delay();

View File

@ -2695,9 +2695,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* costly and complicated.
*/
if (unlikely(!bfqd->nonrot_with_queueing)) {
if (bic->stable_merge_bfqq &&
/*
* Also make sure that bfqq is sync, because
* bic->stable_merge_bfqq may point to some queue (for
* stable merging) even if bic is associated with a
* sync queue but this bfqq is async.
*/
if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq &&
!bfq_bfqq_just_created(bfqq) &&
time_is_after_jiffies(bfqq->split_time +
time_is_before_jiffies(bfqq->split_time +
msecs_to_jiffies(200))) {
struct bfq_queue *stable_merge_bfqq =
bic->stable_merge_bfqq;
@ -6129,11 +6135,13 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* of other queues. But a false waker will unjustly steal
* bandwidth to its supposedly woken queue. So considering
* also shared queues in the waking mechanism may cause more
* control troubles than throughput benefits. Then do not set
* last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
* control troubles than throughput benefits. Then reset
* last_completed_rq_bfqq if bfqq is a shared queue.
*/
if (!bfq_bfqq_coop(bfqq))
bfqd->last_completed_rq_bfqq = bfqq;
else
bfqd->last_completed_rq_bfqq = NULL;
/*
* If we are waiting to discover whether the request pattern

View File

@ -1375,8 +1375,7 @@ static inline bool bio_remaining_done(struct bio *bio)
*
* bio_endio() can be called several times on a bio that has been chained
* using bio_chain(). The ->bi_end_io() function will only be called the
* last time. At this point the BLK_TA_COMPLETE tracing event will be
* generated if BIO_TRACE_COMPLETION is set.
* last time.
**/
void bio_endio(struct bio *bio)
{
@ -1389,6 +1388,11 @@ again:
if (bio->bi_bdev)
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
/*
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
@ -1402,11 +1406,6 @@ again:
goto again;
}
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
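The bio_endio() change above moves the BLK_TA_COMPLETE trace in front of the loop that walks a chained bio up to its parent, so chained bios get traced instead of being skipped once the loop replaces the bio with its parent. Below is a userspace model of that walk-to-parent completion loop; struct fake_bio and its fields are invented for illustration and only approximate bio_chain() semantics:

/* Userspace illustration only; a rough model of chained completion. */
#include <stdbool.h>
#include <stdio.h>

struct fake_bio {
	int remaining;			/* outstanding completions */
	bool trace;			/* should completion be traced? */
	struct fake_bio *parent;	/* non-NULL when chained */
	void (*end_io)(struct fake_bio *);
};

static void trace_complete(struct fake_bio *bio)
{
	if (bio->trace) {
		printf("trace: bio %p completed\n", (void *)bio);
		bio->trace = false;
	}
}

static void fake_bio_endio(struct fake_bio *bio)
{
again:
	if (--bio->remaining > 0)
		return;			/* not the last completion yet */

	/* Trace before walking to the parent, so chained bios are not skipped. */
	trace_complete(bio);

	if (bio->parent) {		/* chained: propagate completion upward */
		bio = bio->parent;
		goto again;
	}

	if (bio->end_io)
		bio->end_io(bio);	/* only the last bio in the chain gets here */
}

static void done(struct fake_bio *bio)
{
	printf("end_io: chain finished at %p\n", (void *)bio);
}

int main(void)
{
	struct fake_bio parent = { .remaining = 2, .trace = true, .end_io = done };
	struct fake_bio child  = { .remaining = 1, .trace = true, .parent = &parent };

	fake_bio_endio(&child);		/* completes the child, drops one parent ref */
	fake_bio_endio(&parent);	/* final ref: traces the parent, runs end_io */
	return 0;
}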

View File

@ -219,8 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
blk_account_io_flush(flush_rq);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
@ -230,6 +228,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
return;
}
blk_account_io_flush(flush_rq);
/*
* Flush request has to be marked as IDLE when it is really ended
* because its .end_io() is called from timeout code path too for

View File

@ -559,10 +559,14 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
if (blk_integrity_merge_bio(req->q, req, bio) == false)
/* discard request merge won't add new segment */
if (req_op(req) == REQ_OP_DISCARD)
return 1;
if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
goto no_merge;
/*

View File

@ -199,6 +199,20 @@ struct bt_iter_data {
bool reserved;
};
static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
unsigned int bitnr)
{
struct request *rq;
unsigned long flags;
spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
if (!rq || !refcount_inc_not_zero(&rq->ref))
rq = NULL;
spin_unlock_irqrestore(&tags->lock, flags);
return rq;
}
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_iter_data *iter_data = data;
@ -206,18 +220,22 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
struct blk_mq_tags *tags = hctx->tags;
bool reserved = iter_data->reserved;
struct request *rq;
bool ret = true;
if (!reserved)
bitnr += tags->nr_reserved_tags;
rq = tags->rqs[bitnr];
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
return iter_data->fn(hctx, rq, iter_data->data, reserved);
return true;
rq = blk_mq_find_and_get_req(tags, bitnr);
if (!rq)
return true;
if (rq->q == hctx->queue && rq->mq_hctx == hctx)
ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
blk_mq_put_rq_ref(rq);
return ret;
}
/**
@ -264,6 +282,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
struct blk_mq_tags *tags = iter_data->tags;
bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
bool ret = true;
bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);
if (!reserved)
bitnr += tags->nr_reserved_tags;
@ -272,16 +292,19 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
if (iter_static_rqs)
rq = tags->static_rqs[bitnr];
else
rq = tags->rqs[bitnr];
rq = blk_mq_find_and_get_req(tags, bitnr);
if (!rq)
return true;
if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
!blk_mq_request_started(rq))
return true;
return iter_data->fn(rq, iter_data->data, reserved);
if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
blk_mq_request_started(rq))
ret = iter_data->fn(rq, iter_data->data, reserved);
if (!iter_static_rqs)
blk_mq_put_rq_ref(rq);
return ret;
}
/**
@ -348,6 +371,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
*
* We grab one request reference before calling @fn and release it after
* @fn returns.
*/
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
@ -516,6 +542,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags))
return tags;
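The new blk_mq_find_and_get_req() above looks the request up under tags->lock and only returns it when a reference could still be taken; the iterators then drop that reference once the callback returns. The same "look up under a lock, pin with get-unless-zero, put when done" pattern in plain userspace C, with hypothetical names and C11 atomics plus a pthread mutex standing in for refcount_t and the spinlock:

/* Userspace illustration only; not the blk-mq code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int ref;			/* 0 means the object is being freed */
	int tag;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table[16];

/* Take a reference only while the count is still non-zero. */
static int ref_get_unless_zero(struct obj *o)
{
	int old = atomic_load(&o->ref);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->ref, &old, old + 1))
			return 1;
	}
	return 0;
}

static void ref_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		free(o);		/* last reference dropped */
}

/* Look up a slot under the lock and pin the object before using it. */
static struct obj *find_and_get(unsigned int slot)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = table[slot];
	if (o && !ref_get_unless_zero(o))
		o = NULL;		/* being torn down concurrently */
	pthread_mutex_unlock(&table_lock);
	return o;
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->ref, 1);
	o->tag = 7;
	table[3] = o;

	struct obj *pinned = find_and_get(3);
	if (pinned) {
		printf("found tag %d\n", pinned->tag);
		ref_put(pinned);	/* drop the lookup reference */
	}
	table[3] = NULL;
	ref_put(o);			/* drop the owner's reference: frees it */
	return 0;
}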

View File

@ -20,6 +20,12 @@ struct blk_mq_tags {
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
/*
* Used to clear request references in rqs[] before freeing a
* request pool.
*/
spinlock_t lock;
};
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,

View File

@ -909,6 +909,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
}
void blk_mq_put_rq_ref(struct request *rq)
{
if (is_flush_rq(rq, rq->mq_hctx))
rq->end_io(rq, 0);
else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
}
static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
@ -942,11 +950,7 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
if (is_flush_rq(rq, hctx))
rq->end_io(rq, 0);
else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
blk_mq_put_rq_ref(rq);
return true;
}
@ -1220,9 +1224,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
unsigned int ewma;
if (hctx->queue->elevator)
return;
ewma = hctx->dispatch_busy;
if (!ewma && !busy)
@ -2303,6 +2304,45 @@ queue_exit:
return BLK_QC_T_NONE;
}
static size_t order_to_size(unsigned int order)
{
return (size_t)PAGE_SIZE << order;
}
/* called before freeing request pool in @tags */
static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags, unsigned int hctx_idx)
{
struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
struct page *page;
unsigned long flags;
list_for_each_entry(page, &tags->page_list, lru) {
unsigned long start = (unsigned long)page_address(page);
unsigned long end = start + order_to_size(page->private);
int i;
for (i = 0; i < set->queue_depth; i++) {
struct request *rq = drv_tags->rqs[i];
unsigned long rq_addr = (unsigned long)rq;
if (rq_addr >= start && rq_addr < end) {
WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
cmpxchg(&drv_tags->rqs[i], rq, NULL);
}
}
}
/*
* Wait until all pending iteration is done.
*
* Request reference is cleared and it is guaranteed to be observed
* after the ->lock is released.
*/
spin_lock_irqsave(&drv_tags->lock, flags);
spin_unlock_irqrestore(&drv_tags->lock, flags);
}
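blk_mq_clear_rq_mapping() above ends with an empty lock/unlock pair on drv_tags->lock: taking and releasing the lock waits out any iterator still inside its critical section and guarantees that later iterators observe the cleared rqs[] slots before the request pages are freed. A userspace sketch of that "drain by cycling the lock" idea, with invented names:

/* Userspace illustration only; the pointer clearing here is simplified. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char * _Atomic shared;		/* read by workers under the lock */

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	char *p = shared;		/* may see the old value or NULL */
	if (p)
		printf("worker sees: %s\n", p);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	char *buf = malloc(32);

	strcpy(buf, "still mapped");
	shared = buf;
	pthread_create(&t, NULL, worker, NULL);

	char *old = shared;
	shared = NULL;			/* clear the published pointer first */

	/*
	 * Empty critical section: once we have taken and released the lock,
	 * any worker that grabbed it earlier has finished using 'old', and
	 * any later worker will observe the cleared pointer. Only then is
	 * it safe to free the memory behind it.
	 */
	pthread_mutex_lock(&lock);
	pthread_mutex_unlock(&lock);
	free(old);

	pthread_join(t, NULL);
	return 0;
}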
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
@ -2321,6 +2361,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
blk_mq_clear_rq_mapping(set, tags, hctx_idx);
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
list_del_init(&page->lru);
@ -2380,11 +2422,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
return tags;
}
static size_t order_to_size(unsigned int order)
{
return (size_t)PAGE_SIZE << order;
}
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, int node)
{

View File

@ -47,6 +47,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
void blk_mq_put_rq_ref(struct request *rq);
/*
* Internal helpers for allocating/freeing the request map

View File

@ -7,6 +7,7 @@
#include <linux/blk_types.h>
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/blk-mq.h>
#include "blk-mq-debugfs.h"
@ -99,8 +100,21 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
* is fine since we only support rq_qos for blk-mq queue.
*
* Reuse ->queue_lock for protecting against other concurrent
* rq_qos adding/deleting
*/
blk_mq_freeze_queue(q);
spin_lock_irq(&q->queue_lock);
rqos->next = q->rq_qos;
q->rq_qos = rqos;
spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
if (rqos->ops->debugfs_attrs)
blk_mq_debugfs_register_rqos(rqos);
@ -110,12 +124,22 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
{
struct rq_qos **cur;
/*
* See comment in rq_qos_add() about freezing queue & using
* ->queue_lock.
*/
blk_mq_freeze_queue(q);
spin_lock_irq(&q->queue_lock);
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
break;
}
}
spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
blk_mq_debugfs_unregister_rqos(rqos);
}
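rq_qos_del() above unlinks an element from the singly linked rq_qos list by walking a pointer to the link itself, so removing the head and removing an interior node are the same assignment; the surrounding freeze/queue_lock changes only serialize who may do it. The unlink idiom on its own, as a small userspace example with made-up types:

/* Userspace illustration only; node/unlink names are invented. */
#include <stdio.h>

struct node {
	int id;
	struct node *next;
};

/*
 * Unlink 'victim' from a singly linked list. Iterating over a pointer to
 * the link itself means the head pointer and interior ->next pointers are
 * updated by exactly the same assignment, with no special case for the
 * first node.
 */
static void unlink_node(struct node **head, struct node *victim)
{
	struct node **cur;

	for (cur = head; *cur; cur = &(*cur)->next) {
		if (*cur == victim) {
			*cur = victim->next;
			break;
		}
	}
}

int main(void)
{
	struct node c = { .id = 3, .next = NULL };
	struct node b = { .id = 2, .next = &c };
	struct node a = { .id = 1, .next = &b };
	struct node *head = &a;

	unlink_node(&head, &a);		/* removing the head: no special case */
	unlink_node(&head, &c);		/* removing an interior/tail node */

	for (struct node *n = head; n; n = n->next)
		printf("id=%d\n", n->id);	/* prints only id=2 */
	return 0;
}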

Some files were not shown because too many files have changed in this diff.