Merge branch 'x86/alternatives' into x86/core, to merge dependent commits

Prepare to resolve conflicts with an upstream series of fixes that conflict
with pending x86 changes:

  6f5bf947ba Merge tag 'its-for-linus-20250509' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 2025-05-13 10:32:24 +02:00
commit 11d8f542d9
21 changed files with 352 additions and 340 deletions

View File

@ -477,7 +477,7 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
return text_poke(addr, opcode, len);
}
void text_poke_sync(void)
void smp_text_poke_sync_each_cpu(void)
{
}

View File

@ -153,6 +153,7 @@ config X86
select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_WATCHDOG

View File

@ -2803,8 +2803,15 @@ static unsigned long get_segment_base(unsigned int segment)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
/*
* If we're not in a valid context with a real (not just lazy)
* user mm, then don't even try.
*/
if (!nmi_uaccess_okay())
return 0;
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = READ_ONCE(current->active_mm->context.ldt);
ldt = smp_load_acquire(&current->mm->context.ldt);
if (!ldt || idx >= ldt->nr_entries)
return 0;
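
For reference, the acquire in the new read path pairs with a release on the LDT installation side; a minimal sketch of that pairing, assuming the writer is install_ldt() in arch/x86/kernel/ldt.c (shown only for illustration, it is not part of this hunk):

/*
 * Assumed writer side: publish a fully initialized table. The release
 * pairs with the acquire in the perf/NMI read path above.
 */
smp_store_release(&mm->context.ldt, new_ldt);

/*
 * Reader side, as in the hunk above: sees either NULL or a table whose
 * nr_entries and entries[] are guaranteed to be visible.
 */
ldt = smp_load_acquire(&current->mm->context.ldt);
if (!ldt || idx >= ldt->nr_entries)
	return 0;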

View File

@ -82,6 +82,12 @@ struct alt_instr {
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern s32 __smp_locks[], __smp_locks_end[];
/*
* Debug flag that can be tested to see whether alternative
* instructions were patched in already:
@ -335,11 +341,6 @@ void nop_func(void);
__ALTERNATIVE(\oldinstr, \newinstr, \ft_flags)
.endm
#define old_len 141b-140b
#define new_len1 144f-143f
#define new_len2 145f-144f
#define new_len3 146f-145f
/*
* Same as ALTERNATIVE macro above but for two alternatives. If CPU
* has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
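
For readers new to this header, a usage sketch of the assembly-side ALTERNATIVE macros touched here; the single-alternative line mirrors how ASM_CLAC is built from it, while the two-alternative line uses placeholder instructions and feature flags (X86_FEATURE_A and X86_FEATURE_B are not real flags):

	/* Emit nothing on old CPUs, CLAC where SMAP exists: */
	ALTERNATIVE "", "clac", X86_FEATURE_SMAP

	/* Two alternatives, as described by the comment above: */
	ALTERNATIVE_2 "call default_func", "call func_a", X86_FEATURE_A, "call func_b", X86_FEATURE_B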

View File

@ -16,6 +16,8 @@
#define MM_CONTEXT_LOCK_LAM 2
/* Allow LAM and SVA coexisting */
#define MM_CONTEXT_FORCE_TAGGED_SVA 3
/* Tracks mm_cpumask */
#define MM_CONTEXT_NOTRACK 4
/*
* x86 has arch-specific MMU state beyond what lives in mm_struct.
@ -44,9 +46,7 @@ typedef struct {
struct ldt_struct *ldt;
#endif
#ifdef CONFIG_X86_64
unsigned long flags;
#endif
#ifdef CONFIG_ADDRESS_MASKING
/* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */

View File

@ -190,7 +190,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#define activate_mm(prev, next) \
do { \
paravirt_enter_mmap(next); \
switch_mm((prev), (next), NULL); \
switch_mm_irqs_off((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
@ -247,6 +247,16 @@ static inline bool is_64bit_mm(struct mm_struct *mm)
}
#endif
static inline bool is_notrack_mm(struct mm_struct *mm)
{
return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
}
static inline void set_notrack_mm(struct mm_struct *mm)
{
set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@ -272,4 +282,7 @@ unsigned long __get_current_cr3_fast(void);
#include <asm-generic/mmu_context.h>
extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm);
extern void unuse_temporary_mm(struct mm_struct *prev_mm);
#endif /* _ASM_X86_MMU_CONTEXT_H */
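
The new notrack and temporary-mm helpers are easiest to read together with their first users later in this diff (poking_init() and the EFI mm setup). A minimal sketch of the intended pattern, with error handling omitted and the mm freshly allocated just for the sketch:

struct mm_struct *mm, *prev_mm;

mm = mm_alloc();
paravirt_enter_mmap(mm);	/* Xen PV guests need the PGD pinned */
set_notrack_mm(mm);		/* mm_cpumask is not maintained for this mm */

/* Later, on a single CPU with preemption disabled: */
prev_mm = use_temporary_mm(mm);
/* ... touch mappings that exist only in this mm ... */
unuse_temporary_mm(prev_mm);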

View File

@ -11,11 +11,11 @@
* JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
* Raise it if needed.
*/
#define POKE_MAX_OPCODE_SIZE 5
#define TEXT_POKE_MAX_OPCODE_SIZE 5
extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
@ -32,17 +32,17 @@ extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u
* an inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void smp_text_poke_sync_each_cpu(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
#define text_poke_copy text_poke_copy
extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
extern void *text_poke_set(void *addr, int c, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
extern int smp_text_poke_int3_handler(struct pt_regs *regs);
extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate);
extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
extern void text_poke_finish(void);
extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate);
extern void smp_text_poke_batch_finish(void);
#define INT3_INSN_SIZE 1
#define INT3_INSN_OPCODE 0xCC
@ -82,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode)
}
union text_poke_insn {
u8 text[POKE_MAX_OPCODE_SIZE];
u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
struct {
u8 opcode;
s32 disp;
@ -128,8 +128,8 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
}
extern int after_bootmem;
extern __ro_after_init struct mm_struct *poking_mm;
extern __ro_after_init unsigned long poking_addr;
extern __ro_after_init struct mm_struct *text_poke_mm;
extern __ro_after_init unsigned long text_poke_mm_addr;
#ifndef CONFIG_UML_X86
static __always_inline
@ -142,13 +142,14 @@ static __always_inline
void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
/*
* The int3 handler in entry_64.S adds a gap between the
* The INT3 handler in entry_64.S adds a gap between the
* stack where the break point happened, and the saving of
* pt_regs. We can extend the original stack because of
* this gap. See the idtentry macro's create_gap option.
* this gap. See the idtentry macro's X86_TRAP_BP logic.
*
* Similarly entry_32.S will have a gap on the stack for (any) hardware
* exception and pt_regs; see FIXUP_FRAME.
* Similarly, entry_32.S will have a gap on the stack for
* (any) hardware exception and pt_regs; see the
* FIXUP_FRAME macro.
*/
regs->sp -= sizeof(unsigned long);
*(unsigned long *)regs->sp = val;
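
Since most of the remaining hunks in this commit are plain renames at the call sites, here is the calling pattern the renamed SMP patching API expects, matching how the ftrace and jump-label hunks below use it; addr1..addr3, new_insn1..new_insn3 and the lengths are placeholders:

/* All SMP text poking runs under text_mutex: */
mutex_lock(&text_mutex);

/*
 * Queue patch sites; the queue is flushed automatically when the
 * internal array fills up or an out-of-order address is added:
 */
smp_text_poke_batch_add(addr1, new_insn1, len1, NULL);
smp_text_poke_batch_add(addr2, new_insn2, len2, NULL);

/* Apply whatever is still queued (previously text_poke_finish()): */
smp_text_poke_batch_finish();

/* Patch a single site immediately (previously text_poke_bp()): */
smp_text_poke_single(addr3, new_insn3, len3, NULL);

mutex_unlock(&text_mutex);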

View File

@ -1,36 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mmu_context.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
#include <asm/cfi.h>
#include <asm/nmi.h>
int __read_mostly alternatives_patched;
@ -171,13 +149,6 @@ static void add_nop(u8 *buf, unsigned int len)
*buf = INT3_INSN_OPCODE;
}
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
/*
* Matches NOP and NOPL, not any of the other possible NOPs.
*/
@ -369,7 +340,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen,
}
}
void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
__apply_relocation(buf, instr, instrlen, repl, repl_len);
optimize_nops(instr, buf, instrlen);
@ -525,7 +496,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@ -2010,7 +1981,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg)
static noinline void __init alt_reloc_selftest(void)
{
/*
* Tests apply_relocation().
* Tests text_poke_apply_relocation().
*
* This has a relative immediate (CALL) in a place other than the first
* instruction and additionally on x86_64 we get a RIP-relative LEA:
@ -2140,76 +2111,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
}
}
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes
* that override the kernel memory protections (e.g., W^X), without exposing the
* temporary page-table mappings that are required for these write operations to
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
* mapping is torn down.
*
* Context: The temporary mm needs to be used exclusively by a single core. To
* harden security IRQs must be disabled while the temporary mm is
* loaded, thereby preventing interrupt handler bugs from overriding
* the kernel memory protection.
*/
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
/*
* Make sure not to be in TLB lazy mode, as otherwise we'll end up
* with a stale address space WITHOUT being in lazy mode after
* restoring the previous mm.
*/
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
leave_mm();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return temp_state;
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
__ro_after_init struct mm_struct *text_poke_mm;
__ro_after_init unsigned long text_poke_mm_addr;
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
@ -2229,7 +2132,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
temp_mm_state_t prev;
struct mm_struct *prev_mm;
unsigned long flags;
pte_t pte, *ptep;
spinlock_t *ptl;
@ -2266,7 +2169,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
/*
* The lock is not really needed, but this allows to avoid open-coding.
*/
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
/*
* This must not fail; preallocated in poking_init().
@ -2276,21 +2179,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
local_irq_save(flags);
pte = mk_pte(pages[0], pgprot);
set_pte_at(poking_mm, poking_addr, ptep, pte);
set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
}
/*
* Loading the temporary mm behaves as a compiler barrier, which
* guarantees that the PTE will be set at the time memcpy() is done.
*/
prev = use_temporary_mm(poking_mm);
prev_mm = use_temporary_mm(text_poke_mm);
kasan_disable_current();
func((u8 *)poking_addr + offset_in_page(addr), src, len);
func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
kasan_enable_current();
/*
@ -2299,22 +2202,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
*/
barrier();
pte_clear(poking_mm, poking_addr, ptep);
pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
if (cross_page_boundary)
pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
/*
* Loading the previous page-table hierarchy requires a serializing
* instruction that already allows the core to see the updated version.
* Xen-PV is assumed to serialize execution in a similar manner.
*/
unuse_temporary_mm(prev);
unuse_temporary_mm(prev_mm);
/*
* Flushing the TLB might involve IPIs, which would require enabled
* IRQs, but not if the mm is not used, as it is in this point.
*/
flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
@ -2450,7 +2353,7 @@ static void do_sync_core(void *info)
sync_core();
}
void text_poke_sync(void)
void smp_text_poke_sync_each_cpu(void)
{
on_each_cpu(do_sync_core, NULL, 1);
}
@ -2460,64 +2363,66 @@ void text_poke_sync(void)
* this thing. When len == 6 everything is prefixed with 0x0f and we map
* opcode to Jcc.d8, using len to distinguish.
*/
struct text_poke_loc {
struct smp_text_poke_loc {
/* addr := _stext + rel_addr */
s32 rel_addr;
s32 disp;
u8 len;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
/* see text_poke_bp_batch() */
const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
/* see smp_text_poke_batch_finish() */
u8 old;
};
struct bp_patching_desc {
struct text_poke_loc *vec;
#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
static struct smp_text_poke_array {
struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
int nr_entries;
atomic_t refs;
};
} text_poke_array;
static struct bp_patching_desc bp_desc;
static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
static __always_inline
struct bp_patching_desc *try_get_desc(void)
/*
* These four __always_inline annotations imply noinstr, necessary
* due to smp_text_poke_int3_handler() being noinstr:
*/
static __always_inline bool try_get_text_poke_array(void)
{
struct bp_patching_desc *desc = &bp_desc;
atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
if (!raw_atomic_inc_not_zero(&desc->refs))
return NULL;
if (!raw_atomic_inc_not_zero(refs))
return false;
return desc;
return true;
}
static __always_inline void put_desc(void)
static __always_inline void put_text_poke_array(void)
{
struct bp_patching_desc *desc = &bp_desc;
atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
smp_mb__before_atomic();
raw_atomic_dec(&desc->refs);
raw_atomic_dec(refs);
}
static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
{
return _stext + tp->rel_addr;
return _stext + tpl->rel_addr;
}
static __always_inline int patch_cmp(const void *key, const void *elt)
static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
if (key < text_poke_addr(tp))
if (tpl_a < text_poke_addr(tpl_b))
return -1;
if (key > text_poke_addr(tp))
if (tpl_a > text_poke_addr(tpl_b))
return 1;
return 0;
}
noinstr int poke_int3_handler(struct pt_regs *regs)
noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
struct smp_text_poke_loc *tpl;
int ret = 0;
void *ip;
@ -2526,41 +2431,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
/*
* Having observed our INT3 instruction, we now must observe
* bp_desc with non-zero refcount:
* text_poke_array with non-zero refcount:
*
* bp_desc.refs = 1 INT3
* WMB RMB
* write INT3 if (bp_desc.refs != 0)
* text_poke_array_refs = 1 INT3
* WMB RMB
* write INT3 if (text_poke_array_refs != 0)
*/
smp_rmb();
desc = try_get_desc();
if (!desc)
if (!try_get_text_poke_array())
return 0;
/*
* Discount the INT3. See text_poke_bp_batch().
* Discount the INT3. See smp_text_poke_batch_finish().
*/
ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
*/
if (unlikely(desc->nr_entries > 1)) {
tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
sizeof(struct text_poke_loc),
if (unlikely(text_poke_array.nr_entries > 1)) {
tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
sizeof(struct smp_text_poke_loc),
patch_cmp);
if (!tp)
if (!tpl)
goto out_put;
} else {
tp = desc->vec;
if (text_poke_addr(tp) != ip)
tpl = text_poke_array.vec;
if (text_poke_addr(tpl) != ip)
goto out_put;
}
ip += tp->len;
ip += tpl->len;
switch (tp->opcode) {
switch (tpl->opcode) {
case INT3_INSN_OPCODE:
/*
* Someone poked an explicit INT3, they'll want to handle it,
@ -2573,16 +2477,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
int3_emulate_call(regs, (long)ip + tp->disp);
int3_emulate_call(regs, (long)ip + tpl->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
int3_emulate_jmp(regs, (long)ip + tp->disp);
int3_emulate_jmp(regs, (long)ip + tpl->disp);
break;
case 0x70 ... 0x7f: /* Jcc */
int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
break;
default:
@ -2592,51 +2496,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
ret = 1;
out_put:
put_desc();
put_text_poke_array();
return ret;
}
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;
/**
* text_poke_bp_batch() -- update instructions on live kernel on SMP
* @tp: vector of instructions to patch
* @nr_entries: number of entries in the vector
* smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
*
* Modify multi-byte instruction by using int3 breakpoint on SMP.
* We completely avoid stop_machine() here, and achieve the
* synchronization using int3 breakpoint.
* Input state:
* text_poke_array.vec: vector of instructions to patch
* text_poke_array.nr_entries: number of entries in the vector
*
* Modify multi-byte instructions by using INT3 breakpoints on SMP.
* We completely avoid using stop_machine() here, and achieve the
* synchronization using INT3 breakpoints and SMP cross-calls.
*
* The way it is done:
* - For each entry in the vector:
* - add a int3 trap to the address that will be patched
* - sync cores
* - add an INT3 trap to the address that will be patched
* - SMP sync all CPUs
* - For each entry in the vector:
* - update all but the first byte of the patched range
* - sync cores
* - SMP sync all CPUs
* - For each entry in the vector:
* - replace the first byte (int3) by the first byte of
* - replace the first byte (INT3) by the first byte of the
* replacing opcode
* - sync cores
* - SMP sync all CPUs
*/
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
void smp_text_poke_batch_finish(void)
{
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
if (!text_poke_array.nr_entries)
return;
lockdep_assert_held(&text_mutex);
bp_desc.vec = tp;
bp_desc.nr_entries = nr_entries;
/*
* Corresponds to the implicit memory barrier in try_get_desc() to
* ensure reading a non-zero refcount provides up to date bp_desc data.
* Corresponds to the implicit memory barrier in try_get_text_poke_array() to
* ensure reading a non-zero refcount provides up to date text_poke_array data.
*/
atomic_set_release(&bp_desc.refs, 1);
for_each_possible_cpu(i)
atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
/*
* Function tracing can enable thousands of places that need to be
@ -2649,33 +2552,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
cond_resched();
/*
* Corresponding read barrier in int3 notifier for making sure the
* nr_entries and handler are correctly ordered wrt. patching.
* Corresponding read barrier in INT3 notifier for making sure the
* text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
/*
* First step: add a int3 trap to the address that will be patched.
* First step: add a INT3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++) {
tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
for (i = 0; i < text_poke_array.nr_entries; i++) {
text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
}
text_poke_sync();
smp_text_poke_sync_each_cpu();
/*
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
u8 _new[POKE_MAX_OPCODE_SIZE+1];
const u8 *new = tp[i].text;
int len = tp[i].len;
for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
const u8 *new = text_poke_array.vec[i].text;
int len = text_poke_array.vec[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
if (len == 6) {
@ -2684,7 +2587,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
new = _new;
}
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
@ -2715,7 +2618,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
}
if (do_sync) {
@ -2724,63 +2627,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
text_poke_sync();
smp_text_poke_sync_each_cpu();
}
/*
* Third step: replace the first byte (int3) by the first byte of
* Third step: replace the first byte (INT3) by the first byte of the
* replacing opcode.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
u8 byte = tp[i].text[0];
for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
u8 byte = text_poke_array.vec[i].text[0];
if (tp[i].len == 6)
if (text_poke_array.vec[i].len == 6)
byte = 0x0f;
if (byte == INT3_INSN_OPCODE)
continue;
text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
do_sync++;
}
if (do_sync)
text_poke_sync();
smp_text_poke_sync_each_cpu();
/*
* Remove and wait for refs to be zero.
*
* Notably, if after step-3 above the INT3 got removed, then the
* smp_text_poke_sync_each_cpu() will have serialized against any running INT3
* handlers and the below spin-wait will not happen.
*
* IOW. unless the replacement instruction is INT3, this case goes
* unused.
*/
if (!atomic_dec_and_test(&bp_desc.refs))
atomic_cond_read_acquire(&bp_desc.refs, !VAL);
for_each_possible_cpu(i) {
atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
if (unlikely(!atomic_dec_and_test(refs)))
atomic_cond_read_acquire(refs, !VAL);
}
/* They are all completed: */
text_poke_array.nr_entries = 0;
}
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct smp_text_poke_loc *tpl;
struct insn insn;
int ret, i = 0;
tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
if (len == 6)
i = 1;
memcpy((void *)tp->text, opcode+i, len-i);
memcpy((void *)tpl->text, opcode+i, len-i);
if (!emulate)
emulate = opcode;
ret = insn_decode_kernel(&insn, emulate);
BUG_ON(ret < 0);
tp->rel_addr = addr - (void *)_stext;
tp->len = len;
tp->opcode = insn.opcode.bytes[0];
tpl->rel_addr = addr - (void *)_stext;
tpl->len = len;
tpl->opcode = insn.opcode.bytes[0];
if (is_jcc32(&insn)) {
/*
* Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
*/
tp->opcode = insn.opcode.bytes[1] - 0x10;
tpl->opcode = insn.opcode.bytes[1] - 0x10;
}
switch (tp->opcode) {
switch (tpl->opcode) {
case RET_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
@ -2789,14 +2708,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
* next instruction can be padded with INT3.
*/
for (i = insn.length; i < len; i++)
BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
break;
default:
BUG_ON(len != insn.length);
}
switch (tp->opcode) {
switch (tpl->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@ -2805,21 +2724,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
case 0x70 ... 0x7f: /* Jcc */
tp->disp = insn.immediate.value;
tpl->disp = insn.immediate.value;
break;
default: /* assume NOP */
switch (len) {
case 2: /* NOP2 -- emulate as JMP8+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP8_INSN_OPCODE;
tp->disp = 0;
tpl->opcode = JMP8_INSN_OPCODE;
tpl->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP32_INSN_OPCODE;
tp->disp = 0;
tpl->opcode = JMP32_INSN_OPCODE;
tpl->disp = 0;
break;
default: /* unknown instruction */
@ -2830,51 +2749,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
}
/*
* We hard rely on the tp_vec being ordered; ensure this is so by flushing
* We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing
* early if needed.
*/
static bool tp_order_fail(void *addr)
static bool text_poke_addr_ordered(void *addr)
{
struct text_poke_loc *tp;
WARN_ON_ONCE(!addr);
if (!tp_vec_nr)
if (!text_poke_array.nr_entries)
return true;
/*
* If the last current entry's address is higher than the
* new entry's address we'd like to add, then ordering
* is violated and we must first flush all pending patching
* requests:
*/
if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
return false;
if (!addr) /* force */
return true;
tp = &tp_vec[tp_vec_nr - 1];
if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
return true;
return false;
}
static void text_poke_flush(void *addr)
{
if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
text_poke_bp_batch(tp_vec, tp_vec_nr);
tp_vec_nr = 0;
}
}
void text_poke_finish(void)
{
text_poke_flush(NULL);
}
void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc *tp;
text_poke_flush(addr);
tp = &tp_vec[tp_vec_nr++];
text_poke_loc_init(tp, addr, opcode, len, emulate);
return true;
}
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
* @emulate: instruction to be emulated
*
* Add a new instruction to the current queue of to-be-patched instructions
* the kernel maintains. The patching request will not be executed immediately,
* but becomes part of an array of patching requests, optimized for batched
* execution. All pending patching requests will be executed on the next
* smp_text_poke_batch_finish() call.
*/
void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
smp_text_poke_batch_finish();
__smp_text_poke_batch_add(addr, opcode, len, emulate);
}
/**
* smp_text_poke_single() -- update instruction on live kernel on SMP immediately
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
@ -2882,12 +2800,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
* not possible to allocate memory for a vector. The single instruction
* is patched in immediately.
*/
void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc tp;
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
__smp_text_poke_batch_add(addr, opcode, len, emulate);
smp_text_poke_batch_finish();
}
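
One subtle part of the code above is the len == 6 handling for 32-bit conditional jumps. A worked example, using illustrative byte values that are not taken from the patch:

/* jnz rel32, 6 bytes, jumping 0x20 bytes forward: */
static const u8 jcc32_example[6] = { 0x0f, 0x85, 0x20, 0x00, 0x00, 0x00 };

/*
 * __smp_text_poke_batch_add() strips the leading 0x0f (i = 1), so:
 *
 *   tpl->text   = { 0x85, 0x20, 0x00, 0x00, 0x00 },  tpl->len == 6
 *   tpl->opcode = insn.opcode.bytes[1] - 0x10 = 0x75  (the Jcc.d8 form of JNZ)
 *   tpl->disp   = insn.immediate.value = 0x20
 *
 * smp_text_poke_int3_handler() then emulates the site via
 *   int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp)
 * with condition code 0x5 (NZ), and the third patching step writes the
 * 0x0f prefix back as the "first byte", which is why the loop above
 * substitutes byte = 0x0f when len == 6.
 */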

View File

@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct)
u8 *pad = dest - tsize;
memcpy(insn_buff, skl_call_thunk_template, tsize);
apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
/* Already patched? */
if (!bcmp(pad, insn_buff, tsize))
@ -294,7 +294,7 @@ static bool is_callthunk(void *addr)
pad = (void *)(dest - tmpl_size);
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
return !bcmp(pad, insn_buff, tmpl_size);
}
@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
return 0;
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;

View File

@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void)
{
/*
* ftrace_make_{call,nop}() may be called during
* module load, and we need to finish the text_poke_queue()
* module load, and we need to finish the smp_text_poke_batch_add()
* that they do, here.
*/
text_poke_finish();
smp_text_poke_batch_finish();
ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
}
@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
/* replace the text with the new text */
if (ftrace_poke_late)
text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
else
text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ip = (unsigned long)(&ftrace_call);
new = ftrace_call_replace(ip, (unsigned long)func);
text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
ip = (unsigned long)(&ftrace_regs_call);
new = ftrace_call_replace(ip, (unsigned long)func);
text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
@ -247,10 +247,10 @@ void ftrace_replace_code(int enable)
break;
}
text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
ftrace_update_record(rec, enable);
}
text_poke_finish();
smp_text_poke_batch_finish();
}
void arch_ftrace_update_code(int command)
@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
mutex_unlock(&text_mutex);
}
@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
const char *new;
new = ftrace_jmp_replace(ip, (unsigned long)func);
text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}

View File

@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry,
return;
}
text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}
static void __ref jump_label_transform(struct jump_entry *entry,
@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
mutex_lock(&text_mutex);
jlp = __jump_label_patch(entry, type);
text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
mutex_unlock(&text_mutex);
return true;
}
@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
void arch_jump_label_transform_apply(void)
{
mutex_lock(&text_mutex);
text_poke_finish();
smp_text_poke_batch_finish();
mutex_unlock(&text_mutex);
}

View File

@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p)
u8 int3 = INT3_INSN_OPCODE;
text_poke(p->addr, &int3, 1);
text_poke_sync();
smp_text_poke_sync_each_cpu();
perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p)
perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
text_poke_sync();
smp_text_poke_sync_each_cpu();
}
void arch_remove_kprobe(struct kprobe *p)

View File

@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist)
insn_buff[0] = JMP32_INSN_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
list_del_init(&op->list);
}
@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op)
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke(addr, new, INT3_INSN_SIZE);
text_poke_sync();
smp_text_poke_sync_each_cpu();
text_poke(addr + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke_sync();
smp_text_poke_sync_each_cpu();
perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}

View File

@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs,
write, apply);
if (!early) {
text_poke_sync();
smp_text_poke_sync_each_cpu();
mutex_unlock(&text_mutex);
}

View File

@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
if (system_state == SYSTEM_BOOTING || modinit)
return text_poke_early(insn, code, size);
text_poke_bp(insn, code, size, emulate);
smp_text_poke_single(insn, code, size, emulate);
}
static void __static_call_validate(u8 *insn, bool tail, bool tramp)

View File

@ -882,16 +882,16 @@ static void do_int3_user(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW(exc_int3)
{
/*
* poke_int3_handler() is completely self contained code; it does (and
* smp_text_poke_int3_handler() is completely self contained code; it does (and
* must) *NOT* call out to anything, lest it hits upon yet another
* INT3.
*/
if (poke_int3_handler(regs))
if (smp_text_poke_int3_handler(regs))
return;
/*
* irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
* and therefore can trigger INT3, hence poke_int3_handler() must
* and therefore can trigger INT3, hence smp_text_poke_int3_handler() must
* be done before. If the entry came from kernel mode, then use
* nmi_enter() because the INT3 could have been hit in any context
* including NMI.

View File

@ -631,14 +631,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel)
/* Bits [15:3] contain the index of the desired entry. */
sel >>= 3;
mutex_lock(&current->active_mm->context.lock);
ldt = current->active_mm->context.ldt;
/*
* If we're not in a valid context with a real (not just lazy)
* user mm, then don't even try.
*/
if (!nmi_uaccess_okay())
return false;
mutex_lock(&current->mm->context.lock);
ldt = current->mm->context.ldt;
if (ldt && sel < ldt->nr_entries) {
*out = ldt->entries[sel];
success = true;
}
mutex_unlock(&current->active_mm->context.lock);
mutex_unlock(&current->mm->context.lock);
return success;
}

View File

@ -28,6 +28,7 @@
#include <asm/text-patching.h>
#include <asm/memtype.h>
#include <asm/paravirt.h>
#include <asm/mmu_context.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@ -824,31 +825,33 @@ void __init poking_init(void)
spinlock_t *ptl;
pte_t *ptep;
poking_mm = mm_alloc();
BUG_ON(!poking_mm);
text_poke_mm = mm_alloc();
BUG_ON(!text_poke_mm);
/* Xen PV guests need the PGD to be pinned. */
paravirt_enter_mmap(poking_mm);
paravirt_enter_mmap(text_poke_mm);
set_notrack_mm(text_poke_mm);
/*
* Randomize the poking address, but make sure that the following page
* will be mapped at the same PMD. We need 2 pages, so find space for 3,
* and adjust the address if the PMD ends after the first one.
*/
poking_addr = TASK_UNMAPPED_BASE;
text_poke_mm_addr = TASK_UNMAPPED_BASE;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
poking_addr += PAGE_SIZE;
if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
text_poke_mm_addr += PAGE_SIZE;
/*
* We need to trigger the allocation of the page-tables that will be
* needed for poking now. Later, poking may be performed in an atomic
* section, which might cause allocation to fail.
*/
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
BUG_ON(!ptep);
pte_unmap_unlock(ptep, ptl);
}
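
The PMD check in poking_init() above is easier to follow with concrete numbers; the address below is made up purely for illustration (x86-64, 4K pages, 2M PMDs):

/*
 * text_poke_mm_addr             = 0x7f871f5ff000
 * text_poke_mm_addr + PAGE_SIZE = 0x7f871f600000, which is PMD-aligned,
 * so the second poking page would start a new PMD. After the
 * += PAGE_SIZE adjustment, the two pages live at 0x7f871f600000 and
 * 0x7f871f601000 and share a single PMD.
 */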

View File

@ -847,7 +847,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm &&
if (IS_ENABLED(CONFIG_DEBUG_VM) &&
WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
@ -906,14 +907,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
/*
* Leave this CPU in prev's mm_cpumask. Atomic writes to
* mm_cpumask can be expensive under contention. The CPU
* will be removed lazily at TLB flush time.
*/
VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
mm_cpumask(prev)));
/* Start receiving IPIs and then read tlb_gen (and LAM below) */
if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
cpumask_set_cpu(cpu, mm_cpumask(next));
@ -972,6 +965,77 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes
* that override the kernel memory protections (e.g., W^X), without exposing the
* temporary page-table mappings that are required for these write operations to
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
* mapping is torn down. Temporary mms can also be used for EFI runtime service
* calls or similar functionality.
*
* It is illegal to schedule while using a temporary mm -- the context switch
* code is unaware of the temporary mm and does not know how to context switch.
* Use a real (non-temporary) mm in a kernel thread if you need to sleep.
*
* Note: For sensitive memory writes, the temporary mm needs to be used
* exclusively by a single core, and IRQs should be disabled while the
* temporary mm is loaded, thereby preventing interrupt handler bugs from
* overriding the kernel memory protection.
*/
struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm)
{
struct mm_struct *prev_mm;
lockdep_assert_preemption_disabled();
guard(irqsave)();
/*
* Make sure not to be in TLB lazy mode, as otherwise we'll end up
* with a stale address space WITHOUT being in lazy mode after
* restoring the previous mm.
*/
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
leave_mm();
prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, temp_mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return prev_mm;
}
void unuse_temporary_mm(struct mm_struct *prev_mm)
{
lockdep_assert_preemption_disabled();
guard(irqsave)();
/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm)));
switch_mm_irqs_off(NULL, prev_mm, current);
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
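
A minimal usage sketch of the two helpers now living in this file, mirroring how __text_poke() and the EFI hunks in this same commit use them; text_poke_mm is just one possible argument:

struct mm_struct *prev_mm;

/*
 * Preemption must already be disabled; the helpers disable IRQs around
 * the mm switch themselves via guard(irqsave)(). Callers such as
 * __text_poke() additionally keep IRQs off across the whole critical
 * section.
 */
prev_mm = use_temporary_mm(text_poke_mm);

/* ... access mappings that exist only in the temporary mm ... */

unuse_temporary_mm(prev_mm);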

View File

@ -629,7 +629,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
goto out;
ret = 1;
if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL);
ret = 0;
}
out:

View File

@ -89,6 +89,7 @@ int __init efi_alloc_page_tables(void)
efi_mm.pgd = efi_pgd;
mm_init_cpumask(&efi_mm);
init_new_context(NULL, &efi_mm);
set_notrack_mm(&efi_mm);
return 0;
@ -434,15 +435,12 @@ void __init efi_dump_pagetable(void)
*/
static void efi_enter_mm(void)
{
efi_prev_mm = current->active_mm;
current->active_mm = &efi_mm;
switch_mm(efi_prev_mm, &efi_mm, NULL);
efi_prev_mm = use_temporary_mm(&efi_mm);
}
static void efi_leave_mm(void)
{
current->active_mm = efi_prev_mm;
switch_mm(&efi_mm, efi_prev_mm, NULL);
unuse_temporary_mm(efi_prev_mm);
}
void arch_efi_call_virt_setup(void)