From 4334336e769bea1351ab82b22b06118c81bd217f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 11 Apr 2025 07:40:13 +0200
Subject: [PATCH 01/63] x86/alternatives: Improve code-patching scalability by removing false sharing in poke_int3_handler()

eBPF programs can be run 50,000,000 times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off, hundreds of
call sites are patched from text_poke_bp_batch() and we see a huge loss
of performance, due to false sharing on bp_desc.refs, lasting up to
three seconds.

   51.30%  server_bin  [kernel.kallsyms]  [k] poke_int3_handler
           |
           |--46.45%--poke_int3_handler
           |          exc_int3
           |          asm_exc_int3
           |          |
           |          |--24.26%--cls_bpf_classify
           |          |          tcf_classify
           |          |          __dev_queue_xmit
           |          |          ip6_finish_output2
           |          |          ip6_output
           |          |          ip6_xmit
           |          |          inet6_csk_xmit
           |          |          __tcp_transmit_skb

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.

Before the patch, on a host with 240 cores (480 threads):

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 2655300 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl
  run_time_ns 3009063719 run_cnt 82757845 : average cost is 36 nsec per call

After this patch:

  $ sysctl -wq kernel.bpf_stats_enabled=0

  text_poke_bp_batch(nr_entries=164) : Took 702 usec

  $ bpftool prog | grep run_time_ns
  ...
  105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl
  run_time_ns 1928223019 run_cnt 67682728 : average cost is 28 nsec per call

Ie. text-patching performance improved 3700x: from 2.65 seconds to
0.0007 seconds.

Since the atomic_cond_read_acquire(refs, !VAL) spin-loop was not
triggered even once in my tests, add an unlikely() annotation, because
the refcount dropping straight to zero appears to be the common case.

[ mingo: Improved the changelog some more. ]

Signed-off-by: Eric Dumazet
Signed-off-by: Ingo Molnar
Cc: Juergen Gross
Cc: "H .
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Brian Gerst Cc: Kees Cook Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20250411054105.2341982-2-mingo@kernel.org --- arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bf82c6f7d690..85089c79a828 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2474,28 +2474,29 @@ struct text_poke_loc { struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; - atomic_t refs; }; +static DEFINE_PER_CPU(atomic_t, bp_refs); + static struct bp_patching_desc bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&bp_refs); - if (!raw_atomic_inc_not_zero(&desc->refs)) + if (!raw_atomic_inc_not_zero(refs)) return NULL; - return desc; + return &bp_desc; } static __always_inline void put_desc(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&bp_refs); smp_mb__before_atomic(); - raw_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) @@ -2528,9 +2529,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs) * Having observed our INT3 instruction, we now must observe * bp_desc with non-zero refcount: * - * bp_desc.refs = 1 INT3 - * WMB RMB - * write INT3 if (bp_desc.refs != 0) + * bp_refs = 1 INT3 + * WMB RMB + * write INT3 if (bp_refs != 0) */ smp_rmb(); @@ -2636,7 +2637,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Corresponds to the implicit memory barrier in try_get_desc() to * ensure reading a non-zero refcount provides up to date bp_desc data. */ - atomic_set_release(&bp_desc.refs, 1); + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&bp_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2750,8 +2752,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * Remove and wait for refs to be zero. */ - if (!atomic_dec_and_test(&bp_desc.refs)) - atomic_cond_read_acquire(&bp_desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&bp_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, From d60e4b2410e1b9f7c5ca347c78c6b07175c2e873 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2025 07:40:14 +0200 Subject: [PATCH 02/63] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Eric Dumazet Cc: Brian Gerst Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20250411054105.2341982-3-mingo@kernel.org --- arch/x86/kernel/alternative.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 85089c79a828..5f448142aa99 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2751,6 +2751,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * Remove and wait for refs to be zero. 
+ * + * Notably, if after step-3 above the INT3 got removed, then the + * text_poke_sync() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. */ for_each_possible_cpu(i) { atomic_t *refs = per_cpu_ptr(&bp_refs, i); From 84e5ba949b0a3fcf2fd0a1b6c9ce14d8436dbbb8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:15 +0200 Subject: [PATCH 03/63] x86/alternatives: Rename 'struct bp_patching_desc' to 'struct text_poke_int3_vec' Follow the INT3 text-poking nomenclature, and also adopt the 'vector' name for the entire object, instead of the rather opaque 'descriptor' naming. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-4-mingo@kernel.org --- arch/x86/kernel/alternative.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5f448142aa99..8edf7d3fd184 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2471,17 +2471,17 @@ struct text_poke_loc { u8 old; }; -struct bp_patching_desc { +struct text_poke_int3_vec { struct text_poke_loc *vec; int nr_entries; }; static DEFINE_PER_CPU(atomic_t, bp_refs); -static struct bp_patching_desc bp_desc; +static struct text_poke_int3_vec bp_desc; static __always_inline -struct bp_patching_desc *try_get_desc(void) +struct text_poke_int3_vec *try_get_desc(void) { atomic_t *refs = this_cpu_ptr(&bp_refs); @@ -2517,7 +2517,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) noinstr int poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; + struct text_poke_int3_vec *desc; struct text_poke_loc *tp; int ret = 0; void *ip; From 28fb79092d9f7db3397e886d637d3006551693b3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:16 +0200 Subject: [PATCH 04/63] x86/alternatives: Rename 'bp_refs' to 'text_poke_array_refs' Make it clear that these reference counts lock access to text_poke_array. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-5-mingo@kernel.org --- arch/x86/kernel/alternative.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8edf7d3fd184..9bd71c017cfd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2476,14 +2476,14 @@ struct text_poke_int3_vec { int nr_entries; }; -static DEFINE_PER_CPU(atomic_t, bp_refs); +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); static struct text_poke_int3_vec bp_desc; static __always_inline struct text_poke_int3_vec *try_get_desc(void) { - atomic_t *refs = this_cpu_ptr(&bp_refs); + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); if (!raw_atomic_inc_not_zero(refs)) return NULL; @@ -2493,7 +2493,7 @@ struct text_poke_int3_vec *try_get_desc(void) static __always_inline void put_desc(void) { - atomic_t *refs = this_cpu_ptr(&bp_refs); + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); raw_atomic_dec(refs); @@ -2529,9 +2529,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs) * Having observed our INT3 instruction, we now must observe * bp_desc with non-zero refcount: * - * bp_refs = 1 INT3 + * text_poke_array_refs = 1 INT3 * WMB RMB - * write INT3 if (bp_refs != 0) + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); @@ -2638,7 +2638,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * ensure reading a non-zero refcount provides up to date bp_desc data. */ for_each_possible_cpu(i) - atomic_set_release(per_cpu_ptr(&bp_refs, i), 1); + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2760,7 +2760,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * unused. */ for_each_possible_cpu(i) { - atomic_t *refs = per_cpu_ptr(&bp_refs, i); + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); if (unlikely(!atomic_dec_and_test(refs))) atomic_cond_read_acquire(refs, !VAL); From bee4fcfbc128c3ad604539f88307dc2c0fc6f843 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:17 +0200 Subject: [PATCH 05/63] x86/alternatives: Rename 'text_poke_bp_batch()' to 'smp_text_poke_batch_process()' Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-6-mingo@kernel.org --- arch/x86/kernel/alternative.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9bd71c017cfd..78024e5def29 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2467,7 +2467,7 @@ struct text_poke_loc { u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + /* see smp_text_poke_batch_process() */ u8 old; }; @@ -2540,7 +2540,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_process(). 
*/ ip = (void *) regs->ip - INT3_INSN_SIZE; @@ -2602,7 +2602,7 @@ static struct text_poke_loc tp_vec[TP_VEC_MAX]; static int tp_vec_nr; /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP + * smp_text_poke_batch_process() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch * @nr_entries: number of entries in the vector * @@ -2622,7 +2622,7 @@ static int tp_vec_nr; * replacing opcode * - sync cores */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +static void smp_text_poke_batch_process(struct text_poke_loc *tp, unsigned int nr_entries) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; @@ -2866,7 +2866,7 @@ static bool tp_order_fail(void *addr) static void text_poke_flush(void *addr) { if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); + smp_text_poke_batch_process(tp_vec, tp_vec_nr); tp_vec_nr = 0; } } @@ -2902,5 +2902,5 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void * struct text_poke_loc tp; text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + smp_text_poke_batch_process(&tp, 1); } From 9586ae48e785b48b8dd25136c34c8d0e3e1d2cc8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:18 +0200 Subject: [PATCH 06/63] x86/alternatives: Rename 'text_poke_bp()' to 'smp_text_poke_single()' Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-7-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/ftrace.c | 8 ++++---- arch/x86/kernel/jump_label.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/static_call.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ab9e143ec9fe..5189188b5e49 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -39,7 +39,7 @@ extern void *text_poke_copy(void *addr, const void *opcode, size_t len); extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); +extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 78024e5def29..222021af7906 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2887,7 +2887,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instructions on live kernel on SMP * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -2897,7 +2897,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. 
*/ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc tp; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cace6e8d7cc7..7175a0404def 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ip = (unsigned long)(&ftrace_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); ip = (unsigned long)(&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); mutex_unlock(&text_mutex); } @@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) const char *new; new = ftrace_jmp_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f5b8ef02d172..166e12037199 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 36d6809c6c9e..9307a40f4983 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index a59c72e77645..8164a7323c17 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, emulate); + smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f..e2b9991c3326 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -629,7 +629,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: From 5236b6a0fe921f5de53b8eeea2d8fdd6d643dd7f Mon 
Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:19 +0200 Subject: [PATCH 07/63] x86/alternatives: Rename 'poke_int3_handler()' to 'smp_text_poke_int3_handler()' All related functions in this subsystem already have a text_poke_int3_ prefix - add it to the trap handler as well. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-8-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/traps.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 5189188b5e49..93a6b7bc78bd 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -38,7 +38,7 @@ extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); +extern int smp_text_poke_int3_handler(struct pt_regs *regs); extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 222021af7906..d2cd0d815130 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2515,7 +2515,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { struct text_poke_int3_vec *desc; struct text_poke_loc *tp; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f88b8a78e50..d67407c623f3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -882,16 +882,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. From a5c832e0476e461af46a0aa9bda43a573adbe63f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:20 +0200 Subject: [PATCH 08/63] x86/alternatives: Rename 'poking_mm' to 'text_poke_mm' Put it into the text_poke_* namespace of . Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-9-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 18 +++++++++--------- arch/x86/mm/init.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 93a6b7bc78bd..7a95c0820b3e 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -128,7 +128,7 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest) } extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; +extern __ro_after_init struct mm_struct *text_poke_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d2cd0d815130..8ce0d469e32f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2191,7 +2191,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) return temp_state; } -__ro_after_init struct mm_struct *poking_mm; +__ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long poking_addr; static inline void unuse_temporary_mm(temp_mm_state_t prev_state) @@ -2201,7 +2201,7 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) switch_mm_irqs_off(NULL, prev_state.mm, current); /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(text_poke_mm)); /* * Restore the breakpoints if they were disabled before the temporary mm @@ -2266,7 +2266,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, poking_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -2276,18 +2276,18 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, poking_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ - prev = use_temporary_mm(poking_mm); + prev = use_temporary_mm(text_poke_mm); kasan_disable_current(); func((u8 *)poking_addr + offset_in_page(addr), src, len); @@ -2299,9 +2299,9 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, poking_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, poking_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing @@ -2314,7 +2314,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. 
*/ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, poking_addr, poking_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bfa444a7dbb0..84b52a1ebd48 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -824,11 +824,11 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = mm_alloc(); - BUG_ON(!poking_mm); + text_poke_mm = mm_alloc(); + BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_enter_mmap(poking_mm); + paravirt_enter_mmap(text_poke_mm); /* * Randomize the poking address, but make sure that the following page @@ -848,7 +848,7 @@ void __init poking_init(void) * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, poking_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } From da364fc547897ed98fbf2192d86b5242439d7762 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:21 +0200 Subject: [PATCH 09/63] x86/alternatives: Rename 'poking_addr' to 'text_poke_mm_addr' Put it into the text_poke_* namespace of . Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-10-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 16 ++++++++-------- arch/x86/mm/init.c | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 7a95c0820b3e..c8eac8cf4737 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -129,7 +129,7 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest) extern int after_bootmem; extern __ro_after_init struct mm_struct *text_poke_mm; -extern __ro_after_init unsigned long poking_addr; +extern __ro_after_init unsigned long text_poke_mm_addr; #ifndef CONFIG_UML_X86 static __always_inline diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8ce0d469e32f..62d74442cbd1 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2192,7 +2192,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) } __ro_after_init struct mm_struct *text_poke_mm; -__ro_after_init unsigned long poking_addr; +__ro_after_init unsigned long text_poke_mm_addr; static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { @@ -2266,7 +2266,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(text_poke_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). 
@@ -2276,11 +2276,11 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(text_poke_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(text_poke_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* @@ -2290,7 +2290,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l prev = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -2299,9 +2299,9 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(text_poke_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(text_poke_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing @@ -2314,7 +2314,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(text_poke_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 84b52a1ebd48..f8c74d19bebb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -835,20 +835,20 @@ void __init poking_init(void) * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ - poking_addr = TASK_UNMAPPED_BASE; + text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); - if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) - poking_addr += PAGE_SIZE; + if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ - ptep = get_locked_pte(text_poke_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } From e84c31b9c9ac75ceaa2597bce021c529e76edd26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:22 +0200 Subject: [PATCH 10/63] x86/alternatives: Rename 'bp_desc' to 'int3_desc' Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-11-mingo@kernel.org --- arch/x86/kernel/alternative.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 62d74442cbd1..c6d0ca3d683b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2478,7 +2478,7 @@ struct text_poke_int3_vec { static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static struct text_poke_int3_vec bp_desc; +static struct text_poke_int3_vec int3_desc; static __always_inline struct text_poke_int3_vec *try_get_desc(void) @@ -2488,7 +2488,7 @@ struct text_poke_int3_vec *try_get_desc(void) if (!raw_atomic_inc_not_zero(refs)) return NULL; - return &bp_desc; + return &int3_desc; } static __always_inline void put_desc(void) @@ -2527,7 +2527,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc with non-zero refcount: + * int3_desc with non-zero refcount: * * text_poke_array_refs = 1 INT3 * WMB RMB @@ -2630,12 +2630,12 @@ static void smp_text_poke_batch_process(struct text_poke_loc *tp, unsigned int n lockdep_assert_held(&text_mutex); - bp_desc.vec = tp; - bp_desc.nr_entries = nr_entries; + int3_desc.vec = tp; + int3_desc.nr_entries = nr_entries; /* * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date bp_desc data. + * ensure reading a non-zero refcount provides up to date int3_desc data. */ for_each_possible_cpu(i) atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); From 762255b743b81c5307c0e93b2eeee4e8b7424152 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:23 +0200 Subject: [PATCH 11/63] x86/alternatives: Remove duplicate 'text_poke_early()' prototype It's declared in already. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-12-mingo@kernel.org --- arch/x86/kernel/alternative.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c6d0ca3d683b..b8794f756ea4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -176,7 +176,6 @@ extern s32 __return_sites[], __return_sites_end[]; extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); /* * Matches NOP and NOPL, not any of the other possible NOPs. From 5224f09a7b57fcf2024245d89dcb26b0756fb1c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:24 +0200 Subject: [PATCH 12/63] x86/alternatives: Update comments in int3_emulate_push() The idtentry macro in entry_64.S hasn't had a create_gap option for 5 years - update the comment. (Also clean up the entire comment block while at it.) Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-13-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index c8eac8cf4737..7e3527385708 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -142,13 +142,14 @@ static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* - * The int3 handler in entry_64.S adds a gap between the + * The INT3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of - * this gap. See the idtentry macro's create_gap option. + * this gap. See the idtentry macro's X86_TRAP_BP logic. * - * Similarly entry_32.S will have a gap on the stack for (any) hardware - * exception and pt_regs; see FIXUP_FRAME. + * Similarly, entry_32.S will have a gap on the stack for + * (any) hardware exception and pt_regs; see the + * FIXUP_FRAME macro. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; From f5afa2e8efda592ecc69cea7528ff660ac1d8096 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:25 +0200 Subject: [PATCH 13/63] x86/alternatives: Remove the confusing, inaccurate & unnecessary 'temp_mm_state_t' abstraction So the temp_mm_state_t abstraction used by use_temporary_mm() and unuse_temporary_mm() is super confusing: - The whole machinery is about temporarily switching to the text_poke_mm utility MM that got allocated during bootup for text-patching purposes alone: temp_mm_state_t prev; /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ prev = use_temporary_mm(text_poke_mm); - Yet the value that gets saved in the temp_mm_state_t variable is not the temporary MM ... but the previous MM... - Ie. we temporarily put the non-temporary MM into a variable that has the temp_mm_state_t type. This makes no sense whatsoever. - The confusion continues in unuse_temporary_mm(): static inline void unuse_temporary_mm(temp_mm_state_t prev_state) Here we unuse an MM that is ... not the temporary MM, but the previous MM. :-/ Fix up all this confusion by removing the unnecessary layer of abstraction and using a bog-standard 'struct mm_struct *prev_mm' variable to save the MM to. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-14-mingo@kernel.org --- arch/x86/kernel/alternative.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b8794f756ea4..0ee43aa70adf 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2139,10 +2139,6 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - /* * Using a temporary mm allows to set temporary mappings that are not accessible * by other CPUs. Such mappings are needed to perform sensitive memory writes @@ -2156,9 +2152,9 @@ typedef struct { * loaded, thereby preventing interrupt handler bugs from overriding * the kernel memory protection. 
*/ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +static inline struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) { - temp_mm_state_t temp_state; + struct mm_struct *prev_mm; lockdep_assert_irqs_disabled(); @@ -2170,8 +2166,8 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) leave_mm(); - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); /* * If breakpoints are enabled, disable them while the temporary mm is @@ -2187,17 +2183,17 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) if (hw_breakpoint_active()) hw_breakpoint_disable(); - return temp_state; + return prev_mm; } __ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long text_poke_mm_addr; -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +static inline void unuse_temporary_mm(struct mm_struct *prev_mm) { lockdep_assert_irqs_disabled(); - switch_mm_irqs_off(NULL, prev_state.mm, current); + switch_mm_irqs_off(NULL, prev_mm, current); /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(text_poke_mm)); @@ -2228,7 +2224,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -2286,7 +2282,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ - prev = use_temporary_mm(text_poke_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); @@ -2307,7 +2303,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled From aedb60c2c66c82be7cceb89e2b1a0d491bb7ca2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:26 +0200 Subject: [PATCH 14/63] x86/alternatives: Rename 'text_poke_flush()' to 'smp_text_poke_batch_flush()' This name is actually actively confusing, because the simple text_poke*() APIs use MM-switching based code patching, while text_poke_flush() is part of the INT3 based text_poke_int3_*() machinery that is an additional layer of functionality on top of regular text_poke*() functionality. Rename it to smp_text_poke_batch_flush() to make it clear which layer it belongs to. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-15-mingo@kernel.org --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 0ee43aa70adf..6c8bf2fd7a1c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2858,7 +2858,7 @@ static bool tp_order_fail(void *addr) return false; } -static void text_poke_flush(void *addr) +static void smp_text_poke_batch_flush(void *addr) { if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { smp_text_poke_batch_process(tp_vec, tp_vec_nr); @@ -2868,14 +2868,14 @@ static void text_poke_flush(void *addr) void text_poke_finish(void) { - text_poke_flush(NULL); + smp_text_poke_batch_flush(NULL); } void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc *tp; - text_poke_flush(addr); + smp_text_poke_batch_flush(addr); tp = &tp_vec[tp_vec_nr++]; text_poke_loc_init(tp, addr, opcode, len, emulate); From e8d7b8c2bbcd5e50c93902af4ba53029fc0497fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:27 +0200 Subject: [PATCH 15/63] x86/alternatives: Rename 'text_poke_finish()' to 'smp_text_poke_batch_finish()' This name is actively confusing as well, because the simple text_poke*() APIs use MM-switching based code patching, while text_poke_finish() is part of the INT3 based text_poke_int3_*() machinery that is an additional layer of functionality on top of regular text_poke*() functionality. Rename it to smp_text_poke_batch_finish() to make it clear which layer it belongs to. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-16-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/ftrace.c | 4 ++-- arch/x86/kernel/jump_label.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 7e3527385708..f27d29042f8c 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -42,7 +42,7 @@ extern int smp_text_poke_int3_handler(struct pt_regs *regs); extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_finish(void); +extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6c8bf2fd7a1c..0589c051fe83 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2866,7 +2866,7 @@ static void smp_text_poke_batch_flush(void *addr) } } -void text_poke_finish(void) +void smp_text_poke_batch_finish(void) { smp_text_poke_batch_flush(NULL); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7175a0404def..c35a928364b9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -58,7 +58,7 @@ void ftrace_arch_code_modify_post_process(void) * module load, and we need to finish the text_poke_queue() * that they do, here. 
*/ - text_poke_finish(); + smp_text_poke_batch_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); } @@ -250,7 +250,7 @@ void ftrace_replace_code(int enable) text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } - text_poke_finish(); + smp_text_poke_batch_finish(); } void arch_ftrace_update_code(int command) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 166e12037199..28be6eb6cb3d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); - text_poke_finish(); + smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); } From 732c7c33a0c17f68393497766445cbd2878ee95e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:28 +0200 Subject: [PATCH 16/63] x86/alternatives: Rename 'text_poke_queue()' to 'smp_text_poke_batch_add()' This name is actively confusing as well, because the simple text_poke*() APIs use MM-switching based code patching, while text_poke_queue() is part of the INT3 based text_poke_int3_*() machinery that is an additional layer of functionality on top of regular text_poke*() functionality. Rename it to smp_text_poke_batch_add() to make it clear which layer it belongs to. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-17-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/ftrace.c | 6 +++--- arch/x86/kernel/jump_label.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index f27d29042f8c..f3c9b70afb0c 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -41,7 +41,7 @@ extern void *text_poke_set(void *addr, int c, size_t len); extern int smp_text_poke_int3_handler(struct pt_regs *regs); extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); +extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate); extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 0589c051fe83..6865296920d5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2871,7 +2871,7 @@ void smp_text_poke_batch_finish(void) smp_text_poke_batch_flush(NULL); } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc *tp; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c35a928364b9..0853ba3fd04a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -55,7 +55,7 @@ void ftrace_arch_code_modify_post_process(void) { /* * ftrace_make_{call,nop}() may be called during - * module load, and we need to finish the text_poke_queue() + * module load, and we need to finish the smp_text_poke_batch_add() * that they do, here. 
*/ smp_text_poke_batch_finish(); @@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, /* replace the text with the new text */ if (ftrace_poke_late) - text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); else text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; @@ -247,7 +247,7 @@ void ftrace_replace_code(int enable) break; } - text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } smp_text_poke_batch_finish(); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 28be6eb6cb3d..a7949a54a0ff 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); - text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } From fb802d639340d041e32d48057c7f15175a57c2de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:29 +0200 Subject: [PATCH 17/63] x86/alternatives: Rename 'text_poke_loc_init()' to 'text_poke_int3_loc_init()' This name is actively confusing as well, because the simple text_poke*() APIs use MM-switching based code patching, while text_poke_loc_init() is part of the INT3 based text_poke_int3_*() machinery that is an additional layer of functionality on top of regular text_poke*() functionality. Rename it to text_poke_int3_loc_init() to make it clear which layer it belongs to. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-18-mingo@kernel.org --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6865296920d5..ebfd364f947a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2762,7 +2762,7 @@ static void smp_text_poke_batch_process(struct text_poke_loc *tp, unsigned int n } } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, +static void text_poke_int3_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; @@ -2878,7 +2878,7 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c smp_text_poke_batch_flush(addr); tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + text_poke_int3_loc_init(tp, addr, opcode, len, emulate); } /** @@ -2896,6 +2896,6 @@ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, cons { struct text_poke_loc tp; - text_poke_loc_init(&tp, addr, opcode, len, emulate); + text_poke_int3_loc_init(&tp, addr, opcode, len, emulate); smp_text_poke_batch_process(&tp, 1); } From a81d43c46e6e89fef1961147c5f3faca31f6b84e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:30 +0200 Subject: [PATCH 18/63] x86/alternatives: Rename 'struct text_poke_loc' to 'struct smp_text_poke_loc' Make it clear that this structure is part of the INT3 based SMP patching facility, not the regular text_poke*() MM-switch based facility. 
Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-19-mingo@kernel.org --- arch/x86/kernel/alternative.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ebfd364f947a..9abb8f0b8416 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2455,7 +2455,7 @@ void text_poke_sync(void) * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ -struct text_poke_loc { +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; @@ -2467,7 +2467,7 @@ struct text_poke_loc { }; struct text_poke_int3_vec { - struct text_poke_loc *vec; + struct smp_text_poke_loc *vec; int nr_entries; }; @@ -2494,14 +2494,14 @@ static __always_inline void put_desc(void) raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(struct smp_text_poke_loc *tp) { return _stext + tp->rel_addr; } static __always_inline int patch_cmp(const void *key, const void *elt) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; + struct smp_text_poke_loc *tp = (struct smp_text_poke_loc *) elt; if (key < text_poke_addr(tp)) return -1; @@ -2513,7 +2513,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { struct text_poke_int3_vec *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tp; int ret = 0; void *ip; @@ -2544,7 +2544,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) */ if (unlikely(desc->nr_entries > 1)) { tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + sizeof(struct smp_text_poke_loc), patch_cmp); if (!tp) goto out_put; @@ -2592,8 +2592,8 @@ out_put: return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) +static struct smp_text_poke_loc tp_vec[TP_VEC_MAX]; static int tp_vec_nr; /** @@ -2617,7 +2617,7 @@ static int tp_vec_nr; * replacing opcode * - sync cores */ -static void smp_text_poke_batch_process(struct text_poke_loc *tp, unsigned int nr_entries) +static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned int nr_entries) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; @@ -2762,7 +2762,7 @@ static void smp_text_poke_batch_process(struct text_poke_loc *tp, unsigned int n } } -static void text_poke_int3_loc_init(struct text_poke_loc *tp, void *addr, +static void text_poke_int3_loc_init(struct smp_text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; @@ -2843,7 +2843,7 @@ static void text_poke_int3_loc_init(struct text_poke_loc *tp, void *addr, */ static bool tp_order_fail(void *addr) { - struct text_poke_loc *tp; + struct smp_text_poke_loc *tp; if (!tp_vec_nr) return false; @@ -2873,7 +2873,7 @@ void smp_text_poke_batch_finish(void) void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; + struct smp_text_poke_loc *tp; smp_text_poke_batch_flush(addr); @@ -2894,7 +2894,7 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref 
smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; + struct smp_text_poke_loc tp; text_poke_int3_loc_init(&tp, addr, opcode, len, emulate); smp_text_poke_batch_process(&tp, 1); From 3bd7546ff24ecf9dbd74adf92b843eebd2862d1c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:32 +0200 Subject: [PATCH 19/63] x86/alternatives: Rename 'int3_desc' to 'int3_vec' Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-21-mingo@kernel.org --- arch/x86/kernel/alternative.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9abb8f0b8416..b97abfb38c3b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2473,7 +2473,7 @@ struct text_poke_int3_vec { static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static struct text_poke_int3_vec int3_desc; +static struct text_poke_int3_vec int3_vec; static __always_inline struct text_poke_int3_vec *try_get_desc(void) @@ -2483,7 +2483,7 @@ struct text_poke_int3_vec *try_get_desc(void) if (!raw_atomic_inc_not_zero(refs)) return NULL; - return &int3_desc; + return &int3_vec; } static __always_inline void put_desc(void) @@ -2522,7 +2522,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * int3_desc with non-zero refcount: + * int3_vec with non-zero refcount: * * text_poke_array_refs = 1 INT3 * WMB RMB @@ -2625,12 +2625,12 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i lockdep_assert_held(&text_mutex); - int3_desc.vec = tp; - int3_desc.nr_entries = nr_entries; + int3_vec.vec = tp; + int3_vec.nr_entries = nr_entries; /* * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date int3_desc data. + * ensure reading a non-zero refcount provides up to date int3_vec data. */ for_each_possible_cpu(i) atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); From 87836af1eafc6616bde680be556f49ba3325f798 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:33 +0200 Subject: [PATCH 20/63] x86/alternatives: Add text_mutex) assert to smp_text_poke_batch_flush() It's possible to escape the text_mutex-held assert in smp_text_poke_batch_process() if the caller uses a properly batched and sorted series of patch requests, so add an explicit lockdep_assert_held() to make sure it's held by all callers. All text_poke_int3_*() APIs will call either smp_text_poke_batch_process() or smp_text_poke_batch_flush() internally. The text_mutex must be held, because tp_vec and tp_vec_nr et al are all globals, and the INT3 patching machinery itself relies on external serialization. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-22-mingo@kernel.org --- arch/x86/kernel/alternative.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b97abfb38c3b..c53eb3b0454e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2860,6 +2860,8 @@ static bool tp_order_fail(void *addr) static void smp_text_poke_batch_flush(void *addr) { + lockdep_assert_held(&text_mutex); + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { smp_text_poke_batch_process(tp_vec, tp_vec_nr); tp_vec_nr = 0; From 2d0cf10a1eb60deded109c2357326a5ca44e3845 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:34 +0200 Subject: [PATCH 21/63] x86/alternatives: Use non-inverted logic instead of 'tp_order_fail()' tp_order_fail() uses inverted logic: it returns true in case something is false, which is only a plus at the IOCCC. Instead rename it to regular parity as 'text_poke_addr_ordered()', and adjust the code accordingly. Also add a comment explaining how the address ordering should be understood. No change in functionality intended. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-23-mingo@kernel.org --- arch/x86/kernel/alternative.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c53eb3b0454e..c5801b4cbb66 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2841,28 +2841,34 @@ static void text_poke_int3_loc_init(struct smp_text_poke_loc *tp, void *addr, * We hard rely on the tp_vec being ordered; ensure this is so by flushing * early if needed. */ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { struct smp_text_poke_loc *tp; if (!tp_vec_nr) - return false; + return true; if (!addr) /* force */ - return true; + return false; - tp = &tp_vec[tp_vec_nr - 1]; + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + tp = &tp_vec[tp_vec_nr-1]; if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) - return true; + return false; - return false; + return true; } static void smp_text_poke_batch_flush(void *addr) { lockdep_assert_held(&text_mutex); - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + if (tp_vec_nr == TP_VEC_MAX || !text_poke_addr_ordered(addr)) { smp_text_poke_batch_process(tp_vec, tp_vec_nr); tp_vec_nr = 0; } From eaa24c9177c8c765ec9b9ccab392ac07ae8acda0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:35 +0200 Subject: [PATCH 22/63] x86/alternatives: Remove the 'addr == NULL means forced-flush' hack from smp_text_poke_batch_finish()/smp_text_poke_batch_flush()/text_poke_addr_ordered() There's this weird hack used by smp_text_poke_batch_finish() to indicate a 'forced flush': smp_text_poke_batch_flush(NULL); Just open-code the vector-flush in a straightforward fashion: smp_text_poke_batch_process(tp_vec, tp_vec_nr); tp_vec_nr = 0; And get rid of !addr hack from text_poke_addr_ordered(). Leave a WARN_ON_ONCE(), just in case some external code learned to rely on this behavior. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-24-mingo@kernel.org --- arch/x86/kernel/alternative.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c5801b4cbb66..16a41e29db17 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2845,12 +2845,11 @@ static bool text_poke_addr_ordered(void *addr) { struct smp_text_poke_loc *tp; + WARN_ON_ONCE(!addr); + if (!tp_vec_nr) return true; - if (!addr) /* force */ - return false; - /* * If the last current entry's address is higher than the * new entry's address we'd like to add, then ordering @@ -2864,6 +2863,14 @@ static bool text_poke_addr_ordered(void *addr) return true; } +void smp_text_poke_batch_finish(void) +{ + if (tp_vec_nr) { + smp_text_poke_batch_process(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + static void smp_text_poke_batch_flush(void *addr) { lockdep_assert_held(&text_mutex); @@ -2874,11 +2881,6 @@ static void smp_text_poke_batch_flush(void *addr) } } -void smp_text_poke_batch_finish(void) -{ - smp_text_poke_batch_flush(NULL); -} - void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { struct smp_text_poke_loc *tp; From c8976ade0c1b4c0629b8a080d712ff402e8343b3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:36 +0200 Subject: [PATCH 23/63] x86/alternatives: Simplify smp_text_poke_single() by using tp_vec and existing APIs Instead of constructing a vector on-stack, just use the already available batch-patching vector - which should always be empty at this point. This will allow subsequent simplifications. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-25-mingo@kernel.org --- arch/x86/kernel/alternative.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 16a41e29db17..b4cb8676c2f0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2904,8 +2904,13 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct smp_text_poke_loc tp; + struct smp_text_poke_loc *tp; - text_poke_int3_loc_init(&tp, addr, opcode, len, emulate); - smp_text_poke_batch_process(&tp, 1); + /* Batch-patching should not be mixed with single-patching: */ + WARN_ON_ONCE(tp_vec_nr != 0); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_int3_loc_init(tp, addr, opcode, len, emulate); + + smp_text_poke_batch_finish(); } From 476ad071c678d70623cdb14fadd21818f64cc45b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:37 +0200 Subject: [PATCH 24/63] x86/alternatives: Assert that smp_text_poke_int3_handler() can only ever handle 'tp_vec[]' based requests Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-26-mingo@kernel.org --- arch/x86/kernel/alternative.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b4cb8676c2f0..329f6ee6b3bc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2510,6 +2510,10 @@ static __always_inline int patch_cmp(const void *key, const void *elt) return 0; } +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) +static struct smp_text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { struct text_poke_int3_vec *desc; @@ -2534,6 +2538,8 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) if (!desc) return 0; + WARN_ON_ONCE(desc->vec != tp_vec); + /* * Discount the INT3. See smp_text_poke_batch_process(). */ @@ -2592,10 +2598,6 @@ out_put: return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) -static struct smp_text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** * smp_text_poke_batch_process() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch From 37725b64a9912292841cea7e9aebfd0f084ed8c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:38 +0200 Subject: [PATCH 25/63] x86/alternatives: Assert input parameters in smp_text_poke_batch_process() At this point the 'tp' input parameter must always be the global 'tp_vec' array, and 'nr_entries' must always be equal to 'tp_vec_nr'. Assert these conditions - which will allow the removal of a layer of indirection between these values. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-27-mingo@kernel.org --- arch/x86/kernel/alternative.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 329f6ee6b3bc..4fa26a4351a6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2627,6 +2627,9 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i lockdep_assert_held(&text_mutex); + WARN_ON_ONCE(tp != tp_vec); + WARN_ON_ONCE(nr_entries != tp_vec_nr); + int3_vec.vec = tp; int3_vec.nr_entries = nr_entries; From 6e7dc03aeeb52fb0147c03377e4f44fea780ef53 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:39 +0200 Subject: [PATCH 26/63] x86/alternatives: Introduce 'struct smp_text_poke_array' and move tp_vec and tp_vec_nr to it struct text_poke_array is an equivalent structure to these global variables: static struct smp_text_poke_loc tp_vec[TP_VEC_MAX]; static int tp_vec_nr; Note that we intentionally mirror much of the naming of 'struct text_poke_int3_vec', which will further highlight the unecessary layering going on in this code, and will ease its removal. No change in functionality. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-28-mingo@kernel.org --- arch/x86/kernel/alternative.c | 43 +++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4fa26a4351a6..0af220394480 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2467,14 +2467,21 @@ struct smp_text_poke_loc { }; struct text_poke_int3_vec { - struct smp_text_poke_loc *vec; int nr_entries; + struct smp_text_poke_loc *vec; }; static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); static struct text_poke_int3_vec int3_vec; +#define TP_ARRAY_NR_ENTRIES_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + int nr_entries; + struct smp_text_poke_loc vec[TP_ARRAY_NR_ENTRIES_MAX]; +} text_poke_array; + static __always_inline struct text_poke_int3_vec *try_get_desc(void) { @@ -2510,10 +2517,6 @@ static __always_inline int patch_cmp(const void *key, const void *elt) return 0; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) -static struct smp_text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { struct text_poke_int3_vec *desc; @@ -2538,7 +2541,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) if (!desc) return 0; - WARN_ON_ONCE(desc->vec != tp_vec); + WARN_ON_ONCE(desc->vec != text_poke_array.vec); /* * Discount the INT3. See smp_text_poke_batch_process(). @@ -2627,8 +2630,8 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i lockdep_assert_held(&text_mutex); - WARN_ON_ONCE(tp != tp_vec); - WARN_ON_ONCE(nr_entries != tp_vec_nr); + WARN_ON_ONCE(tp != text_poke_array.vec); + WARN_ON_ONCE(nr_entries != text_poke_array.nr_entries); int3_vec.vec = tp; int3_vec.nr_entries = nr_entries; @@ -2843,7 +2846,7 @@ static void text_poke_int3_loc_init(struct smp_text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. 
*/ static bool text_poke_addr_ordered(void *addr) @@ -2852,7 +2855,7 @@ static bool text_poke_addr_ordered(void *addr) WARN_ON_ONCE(!addr); - if (!tp_vec_nr) + if (!text_poke_array.nr_entries) return true; /* @@ -2861,7 +2864,7 @@ static bool text_poke_addr_ordered(void *addr) * is violated and we must first flush all pending patching * requests: */ - tp = &tp_vec[tp_vec_nr-1]; + tp = &text_poke_array.vec[text_poke_array.nr_entries-1]; if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) return false; @@ -2870,9 +2873,9 @@ static bool text_poke_addr_ordered(void *addr) void smp_text_poke_batch_finish(void) { - if (tp_vec_nr) { - smp_text_poke_batch_process(tp_vec, tp_vec_nr); - tp_vec_nr = 0; + if (text_poke_array.nr_entries) { + smp_text_poke_batch_process(text_poke_array.vec, text_poke_array.nr_entries); + text_poke_array.nr_entries = 0; } } @@ -2880,9 +2883,9 @@ static void smp_text_poke_batch_flush(void *addr) { lockdep_assert_held(&text_mutex); - if (tp_vec_nr == TP_VEC_MAX || !text_poke_addr_ordered(addr)) { - smp_text_poke_batch_process(tp_vec, tp_vec_nr); - tp_vec_nr = 0; + if (text_poke_array.nr_entries == TP_ARRAY_NR_ENTRIES_MAX || !text_poke_addr_ordered(addr)) { + smp_text_poke_batch_process(text_poke_array.vec, text_poke_array.nr_entries); + text_poke_array.nr_entries = 0; } } @@ -2892,7 +2895,7 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c smp_text_poke_batch_flush(addr); - tp = &tp_vec[tp_vec_nr++]; + tp = &text_poke_array.vec[text_poke_array.nr_entries++]; text_poke_int3_loc_init(tp, addr, opcode, len, emulate); } @@ -2912,9 +2915,9 @@ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, cons struct smp_text_poke_loc *tp; /* Batch-patching should not be mixed with single-patching: */ - WARN_ON_ONCE(tp_vec_nr != 0); + WARN_ON_ONCE(text_poke_array.nr_entries != 0); - tp = &tp_vec[tp_vec_nr++]; + tp = &text_poke_array.vec[text_poke_array.nr_entries++]; text_poke_int3_loc_init(tp, addr, opcode, len, emulate); smp_text_poke_batch_finish(); From 0494b16b9caed22ff78adb84cedb7460532eb3f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:40 +0200 Subject: [PATCH 27/63] x86/alternatives: Remove the tp_vec indirection At this point we are always working out of an uptodate text_poke_array, there's no need for smp_text_poke_int3_handler() to read via the int3_vec indirection - remove it. This simplifies the code: 1 file changed, 5 insertions(+), 15 deletions(-) Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-29-mingo@kernel.org --- arch/x86/kernel/alternative.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 0af220394480..9937345e4c5b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2466,15 +2466,6 @@ struct smp_text_poke_loc { u8 old; }; -struct text_poke_int3_vec { - int nr_entries; - struct smp_text_poke_loc *vec; -}; - -static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); - -static struct text_poke_int3_vec int3_vec; - #define TP_ARRAY_NR_ENTRIES_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) static struct smp_text_poke_array { @@ -2482,15 +2473,17 @@ static struct smp_text_poke_array { struct smp_text_poke_loc vec[TP_ARRAY_NR_ENTRIES_MAX]; } text_poke_array; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); + static __always_inline -struct text_poke_int3_vec *try_get_desc(void) +struct smp_text_poke_array *try_get_desc(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); if (!raw_atomic_inc_not_zero(refs)) return NULL; - return &int3_vec; + return &text_poke_array; } static __always_inline void put_desc(void) @@ -2519,7 +2512,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct text_poke_int3_vec *desc; + struct smp_text_poke_array *desc; struct smp_text_poke_loc *tp; int ret = 0; void *ip; @@ -2529,7 +2522,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * int3_vec with non-zero refcount: + * text_poke_array with non-zero refcount: * * text_poke_array_refs = 1 INT3 * WMB RMB @@ -2633,12 +2626,9 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i WARN_ON_ONCE(tp != text_poke_array.vec); WARN_ON_ONCE(nr_entries != text_poke_array.nr_entries); - int3_vec.vec = tp; - int3_vec.nr_entries = nr_entries; - /* * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date int3_vec data. + * ensure reading a non-zero refcount provides up to date text_poke_array data. */ for_each_possible_cpu(i) atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); From 46f3d9d329dec169c54a3b5b48deb1ba258689fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:41 +0200 Subject: [PATCH 28/63] x86/alternatives: Rename 'try_get_desc()' to 'try_get_text_poke_array()' This better reflects what the underlying code is doing, there's no 'descriptor' indirection anymore. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-30-mingo@kernel.org --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9937345e4c5b..02f123cff898 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2476,7 +2476,7 @@ static struct smp_text_poke_array { static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); static __always_inline -struct smp_text_poke_array *try_get_desc(void) +struct smp_text_poke_array *try_get_text_poke_array(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); @@ -2530,7 +2530,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) */ smp_rmb(); - desc = try_get_desc(); + desc = try_get_text_poke_array(); if (!desc) return 0; @@ -2627,7 +2627,7 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i WARN_ON_ONCE(nr_entries != text_poke_array.nr_entries); /* - * Corresponds to the implicit memory barrier in try_get_desc() to + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to * ensure reading a non-zero refcount provides up to date text_poke_array data. */ for_each_possible_cpu(i) From 3916eec5160dd42c8409c2032149470a474cb5f2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:42 +0200 Subject: [PATCH 29/63] x86/alternatives: Rename 'put_desc()' to 'put_text_poke_array()' Just like with try_get_text_poke_array(), this name better reflects what the underlying code is doing, there's no 'descriptor' indirection anymore. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-31-mingo@kernel.org --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 02f123cff898..f909f4e2f908 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2486,7 +2486,7 @@ struct smp_text_poke_array *try_get_text_poke_array(void) return &text_poke_array; } -static __always_inline void put_desc(void) +static __always_inline void put_text_poke_array(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); @@ -2590,7 +2590,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(); + put_text_poke_array(); return ret; } From b6a25841c171c42b02d316a6bf784fb32e39c786 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:43 +0200 Subject: [PATCH 30/63] x86/alternatives: Simplify try_get_text_poke_array() There's no need to return a pointer on success - it's always the same pointer. Return a bool instead. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-32-mingo@kernel.org --- arch/x86/kernel/alternative.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f909f4e2f908..edc18be4fac4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2475,15 +2475,14 @@ static struct smp_text_poke_array { static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct smp_text_poke_array *try_get_text_poke_array(void) +static __always_inline bool try_get_text_poke_array(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); if (!raw_atomic_inc_not_zero(refs)) - return NULL; + return false; - return &text_poke_array; + return true; } static __always_inline void put_text_poke_array(void) @@ -2530,9 +2529,9 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) */ smp_rmb(); - desc = try_get_text_poke_array(); - if (!desc) + if (!try_get_text_poke_array()) return 0; + desc = &text_poke_array; WARN_ON_ONCE(desc->vec != text_poke_array.vec); From 8e35752f0c334ee3eb2bf075ac0bdb243ad25fac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:44 +0200 Subject: [PATCH 31/63] x86/alternatives: Simplify smp_text_poke_int3_handler() Remove the 'desc' local variable indirection and use text_poke_array directly. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-33-mingo@kernel.org --- arch/x86/kernel/alternative.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index edc18be4fac4..97cb954467ff 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2511,7 +2511,6 @@ static __always_inline int patch_cmp(const void *key, const void *elt) noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct smp_text_poke_array *desc; struct smp_text_poke_loc *tp; int ret = 0; void *ip; @@ -2531,9 +2530,6 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) if (!try_get_text_poke_array()) return 0; - desc = &text_poke_array; - - WARN_ON_ONCE(desc->vec != text_poke_array.vec); /* * Discount the INT3. See smp_text_poke_batch_process(). @@ -2543,14 +2539,14 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + if (unlikely(text_poke_array.nr_entries > 1)) { + tp = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, sizeof(struct smp_text_poke_loc), patch_cmp); if (!tp) goto out_put; } else { - tp = desc->vec; + tp = text_poke_array.vec; if (text_poke_addr(tp) != ip) goto out_put; } From 74e8e2bf950e8deb0965583e2130f0fb5a705085 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:45 +0200 Subject: [PATCH 32/63] x86/alternatives: Simplify smp_text_poke_batch_process() This function is now using the text_poke_array state exclusively, make that explicit by removing the redundant input parameters. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-34-mingo@kernel.org --- arch/x86/kernel/alternative.c | 43 ++++++++++++++++------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 97cb954467ff..08ac3c7ad6f8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2591,8 +2591,8 @@ out_put: /** * smp_text_poke_batch_process() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * @text_poke_array.vec: vector of instructions to patch + * @text_poke_array.nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the @@ -2610,7 +2610,7 @@ out_put: * replacing opcode * - sync cores */ -static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned int nr_entries) +static void smp_text_poke_batch_process(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; @@ -2618,9 +2618,6 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i lockdep_assert_held(&text_mutex); - WARN_ON_ONCE(tp != text_poke_array.vec); - WARN_ON_ONCE(nr_entries != text_poke_array.nr_entries); - /* * Corresponds to the implicit memory barrier in try_get_text_poke_array() to * ensure reading a non-zero refcount provides up to date text_poke_array data. @@ -2640,16 +2637,16 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i /* * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } text_poke_sync(); @@ -2657,15 +2654,15 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i /* * Second step: update all but the first byte of the patched range. 
*/ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; u8 _new[POKE_MAX_OPCODE_SIZE+1]; - const u8 *new = tp[i].text; - int len = tp[i].len; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { @@ -2674,7 +2671,7 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i new = _new; } - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); @@ -2705,7 +2702,7 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -2721,16 +2718,16 @@ static void smp_text_poke_batch_process(struct smp_text_poke_loc *tp, unsigned i * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 byte = tp[i].text[0]; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; - if (tp[i].len == 6) + if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } @@ -2859,7 +2856,7 @@ static bool text_poke_addr_ordered(void *addr) void smp_text_poke_batch_finish(void) { if (text_poke_array.nr_entries) { - smp_text_poke_batch_process(text_poke_array.vec, text_poke_array.nr_entries); + smp_text_poke_batch_process(); text_poke_array.nr_entries = 0; } } @@ -2869,7 +2866,7 @@ static void smp_text_poke_batch_flush(void *addr) lockdep_assert_held(&text_mutex); if (text_poke_array.nr_entries == TP_ARRAY_NR_ENTRIES_MAX || !text_poke_addr_ordered(addr)) { - smp_text_poke_batch_process(text_poke_array.vec, text_poke_array.nr_entries); + smp_text_poke_batch_process(); text_poke_array.nr_entries = 0; } } From 0e351aec2b0052d33d6e44ded622223043d4dcd5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:47 +0200 Subject: [PATCH 33/63] x86/alternatives: Move the text_poke_array manipulation into text_poke_int3_loc_init() and rename it to __smp_text_poke_batch_add() This simplifies the code and code generation a bit: text data bss dec hex filename 14802 1029 4112 19943 4de7 alternative.o.before 14784 1029 4112 19925 4dd5 alternative.o.after Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-36-mingo@kernel.org --- arch/x86/kernel/alternative.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 08ac3c7ad6f8..eb0da270043b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2752,12 +2752,14 @@ static void smp_text_poke_batch_process(void) } } -static void text_poke_int3_loc_init(struct smp_text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tp; struct insn insn; int ret, i = 0; + tp = &text_poke_array.vec[text_poke_array.nr_entries++]; + if (len == 6) i = 1; memcpy((void *)tp->text, opcode+i, len-i); @@ -2873,12 +2875,8 @@ static void smp_text_poke_batch_flush(void *addr) void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct smp_text_poke_loc *tp; - smp_text_poke_batch_flush(addr); - - tp = &text_poke_array.vec[text_poke_array.nr_entries++]; - text_poke_int3_loc_init(tp, addr, opcode, len, emulate); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** @@ -2894,13 +2892,9 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct smp_text_poke_loc *tp; - /* Batch-patching should not be mixed with single-patching: */ WARN_ON_ONCE(text_poke_array.nr_entries != 0); - tp = &text_poke_array.vec[text_poke_array.nr_entries++]; - text_poke_int3_loc_init(tp, addr, opcode, len, emulate); - + __smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); } From 8a6a1b4e0ef15dab908a365588e06f23f9c0bad5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:48 +0200 Subject: [PATCH 34/63] x86/alternatives: Remove the mixed-patching restriction on smp_text_poke_single() At this point smp_text_poke_single(addr, opcode, len, emulate) is equivalent to: smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); So remove the restriction on mixing single-instruction patching with multi-instruction patching. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-37-mingo@kernel.org --- arch/x86/kernel/alternative.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index eb0da270043b..f0bb2158f27b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2892,9 +2892,6 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - /* Batch-patching should not be mixed with single-patching: */ - WARN_ON_ONCE(text_poke_array.nr_entries != 0); - __smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); } From 9647ce465265720509f80f21f0b36c00bb0c0d18 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:49 +0200 Subject: [PATCH 35/63] x86/alternatives: Document 'smp_text_poke_single()' Extend the documentation to better describe its purpose. 
Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-38-mingo@kernel.org --- arch/x86/kernel/alternative.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f0bb2158f27b..a9726cc96972 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2880,7 +2880,7 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c } /** - * smp_text_poke_single() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -2888,7 +2888,8 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { From cca3473956be6ca5c7ad5d2ced5516eb509c1936 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:50 +0200 Subject: [PATCH 36/63] x86/alternatives: Add documentation for smp_text_poke_batch_add() Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-39-mingo@kernel.org --- arch/x86/kernel/alternative.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a9726cc96972..b47ad0853589 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2873,6 +2873,19 @@ static void smp_text_poke_batch_flush(void *addr) } } +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { smp_text_poke_batch_flush(addr); From 7fbadb50d95a8bbc0de720e0857c77d4f13ddcaf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:51 +0200 Subject: [PATCH 37/63] x86/alternatives: Move text_poke_array completion from smp_text_poke_batch_finish() and smp_text_poke_batch_flush() to smp_text_poke_batch_process() Simplifies the code and improves code generation a bit: text data bss dec hex filename 14769 1017 4112 19898 4dba alternative.o.before 14742 1017 4112 19871 4d9f alternative.o.after Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-40-mingo@kernel.org --- arch/x86/kernel/alternative.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b47ad0853589..556a82f576cd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2750,6 +2750,9 @@ static void smp_text_poke_batch_process(void) if (unlikely(!atomic_dec_and_test(refs))) atomic_cond_read_acquire(refs, !VAL); } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) @@ -2857,20 +2860,16 @@ static bool text_poke_addr_ordered(void *addr) void smp_text_poke_batch_finish(void) { - if (text_poke_array.nr_entries) { + if (text_poke_array.nr_entries) smp_text_poke_batch_process(); - text_poke_array.nr_entries = 0; - } } static void smp_text_poke_batch_flush(void *addr) { lockdep_assert_held(&text_mutex); - if (text_poke_array.nr_entries == TP_ARRAY_NR_ENTRIES_MAX || !text_poke_addr_ordered(addr)) { + if (text_poke_array.nr_entries == TP_ARRAY_NR_ENTRIES_MAX || !text_poke_addr_ordered(addr)) smp_text_poke_batch_process(); - text_poke_array.nr_entries = 0; - } } /** From 6e4955a9d73ebdc8496e6bff7f6d2bf83c01959f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:52 +0200 Subject: [PATCH 38/63] x86/alternatives: Rename 'text_poke_sync()' to 'smp_text_poke_sync_each_cpu()' Unlike sync_core(), text_poke_sync() is a very heavy operation, as it sends an IPI to every online CPU in the system and waits for completion. Reflect this in the name. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-41-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 12 ++++++------ arch/x86/kernel/kprobes/core.c | 4 ++-- arch/x86/kernel/kprobes/opt.c | 4 ++-- arch/x86/kernel/module.c | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index f3c9b70afb0c..d9dbbe9d9667 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -32,7 +32,7 @@ extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u * an inconsistent instruction while you patch. 
*/ extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void text_poke_sync(void); +extern void smp_text_poke_sync_each_cpu(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 556a82f576cd..e4c51d81a72f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2445,7 +2445,7 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } @@ -2469,8 +2469,8 @@ struct smp_text_poke_loc { #define TP_ARRAY_NR_ENTRIES_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) static struct smp_text_poke_array { - int nr_entries; struct smp_text_poke_loc vec[TP_ARRAY_NR_ENTRIES_MAX]; + int nr_entries; } text_poke_array; static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); @@ -2649,7 +2649,7 @@ static void smp_text_poke_batch_process(void) text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. @@ -2711,7 +2711,7 @@ static void smp_text_poke_batch_process(void) * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* @@ -2732,13 +2732,13 @@ static void smp_text_poke_batch_process(void) } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. * * Notably, if after step-3 above the INT3 got removed, then the - * text_poke_sync() will have serialized against any running INT3 + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 * handlers and the below spin-wait will not happen. * * IOW. 
unless the replacement instruction is INT3, this case goes diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 09608fd93687..47cb8eb138ba 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p) u8 int3 = INT3_INSN_OPCODE; text_poke(p->addr, &int3, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } @@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p) perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9307a40f4983..0aabd4c4e2c4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) JMP32_INSN_SIZE - INT3_INSN_SIZE); text_poke(addr, new, INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); text_poke(addr + INT3_INSN_SIZE, new + INT3_INSN_SIZE, JMP32_INSN_SIZE - INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a7998f351701..231d6326d1fd 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, write, apply); if (!early) { - text_poke_sync(); + smp_text_poke_sync_each_cpu(); mutex_unlock(&text_mutex); } From 0e67e587e2e07be8d6775a1444e679c6afbc87f4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:53 +0200 Subject: [PATCH 39/63] x86/alternatives: Simplify text_poke_addr_ordered() - Use direct 'void *' pointer comparison, there's no need to force the type to 'unsigned long'. - Remove the 'tp' local variable indirection Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-42-mingo@kernel.org --- arch/x86/kernel/alternative.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e4c51d81a72f..a747b0885f9a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2838,8 +2838,6 @@ static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len */ static bool text_poke_addr_ordered(void *addr) { - struct smp_text_poke_loc *tp; - WARN_ON_ONCE(!addr); if (!text_poke_array.nr_entries) @@ -2851,8 +2849,7 @@ static bool text_poke_addr_ordered(void *addr) * is violated and we must first flush all pending patching * requests: */ - tp = &text_poke_array.vec[text_poke_array.nr_entries-1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) return false; return true; From 6af9540379628a769c63b0a101ff371d7719ec04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:54 +0200 Subject: [PATCH 40/63] x86/alternatives: Constify text_poke_addr() This will also allow the simplification of patch_cmp(). Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-43-mingo@kernel.org --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a747b0885f9a..14ca17dc36e8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2493,7 +2493,7 @@ static __always_inline void put_text_poke_array(void) raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct smp_text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tp) { return _stext + tp->rel_addr; } From 3e6f47573ec3ed1d9dc50243fbcf50c87f740853 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:55 +0200 Subject: [PATCH 41/63] x86/alternatives: Simplify and clean up patch_cmp() - No need to cast over to 'struct smp_text_poke_loc *', void * is just fine for a binary search, - Use the canonical (a, b) input parameter nomenclature of cmp_func_t functions and rename the input parameters from (tp, elt) to (tpl_a, tpl_b). Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-44-mingo@kernel.org --- arch/x86/kernel/alternative.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 14ca17dc36e8..f278655f0950 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2498,13 +2498,11 @@ static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tp) return _stext + tp->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct smp_text_poke_loc *tp = (struct smp_text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return -1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } From 22b9662313034e42c780a9d6ebcc1811d47d359b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:56 +0200 Subject: [PATCH 42/63] x86/alternatives: Standardize on 'tpl' local variable names for 'struct smp_text_poke_loc *' There's no toilet paper in this code. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-45-mingo@kernel.org --- arch/x86/kernel/alternative.c | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f278655f0950..c5abcf95daea 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2493,9 +2493,9 @@ static __always_inline void put_text_poke_array(void) raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) @@ -2509,7 +2509,7 @@ static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct smp_text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -2538,20 +2538,20 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) * Skip the binary search if there is a single member in the vector. */ if (unlikely(text_poke_array.nr_entries > 1)) { - tp = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = text_poke_array.vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -2564,16 +2564,16 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ - int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -2755,33 +2755,33 @@ static void smp_text_poke_batch_process(void) static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct smp_text_poke_loc *tp; + struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; - tp = &text_poke_array.vec[text_poke_array.nr_entries++]; + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; if (len == 6) i = 1; - memcpy((void *)tp->text, opcode+i, len-i); + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. 
*/ - tp->opcode = insn.opcode.bytes[1] - 0x10; + tpl->opcode = insn.opcode.bytes[1] - 0x10; } - switch (tp->opcode) { + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -2790,14 +2790,14 @@ static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -2806,21 +2806,21 @@ static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ - tp->disp = insn.immediate.value; + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ From 8036fbe5a5d618be3694c4719afb14fd14cc972d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:57 +0200 Subject: [PATCH 43/63] x86/alternatives: Rename 'TP_ARRAY_NR_ENTRIES_MAX' to 'TEXT_POKE_ARRAY_MAX' Standardize on TEXT_POKE_ namespace for CPP constants too. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-46-mingo@kernel.org --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c5abcf95daea..4b460dea4f4e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2466,10 +2466,10 @@ struct smp_text_poke_loc { u8 old; }; -#define TP_ARRAY_NR_ENTRIES_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) static struct smp_text_poke_array { - struct smp_text_poke_loc vec[TP_ARRAY_NR_ENTRIES_MAX]; + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; } text_poke_array; @@ -2863,7 +2863,7 @@ static void smp_text_poke_batch_flush(void *addr) { lockdep_assert_held(&text_mutex); - if (text_poke_array.nr_entries == TP_ARRAY_NR_ENTRIES_MAX || !text_poke_addr_ordered(addr)) + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) smp_text_poke_batch_process(); } From 3c8454dfc9143c992375a166a620ea3d62c3e434 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:58 +0200 Subject: [PATCH 44/63] x86/alternatives: Rename 'POKE_MAX_OPCODE_SIZE' to 'TEXT_POKE_MAX_OPCODE_SIZE' Join the TEXT_POKE_ namespace. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-47-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 4 ++-- arch/x86/kernel/alternative.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index d9dbbe9d9667..a45ac8a223f3 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -11,7 +11,7 @@ * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ -#define POKE_MAX_OPCODE_SIZE 5 +#define TEXT_POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); @@ -82,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode) } union text_poke_insn { - u8 text[POKE_MAX_OPCODE_SIZE]; + u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4b460dea4f4e..b8e0b1b2383f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2461,7 +2461,7 @@ struct smp_text_poke_loc { s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; /* see smp_text_poke_batch_process() */ u8 old; }; @@ -2653,8 +2653,8 @@ static void smp_text_poke_batch_process(void) * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; - u8 _new[POKE_MAX_OPCODE_SIZE+1]; + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; const u8 *new = text_poke_array.vec[i].text; int len = text_poke_array.vec[i].len; From db5c68c88c07337b7e5801abfb926f9191fa7945 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:40:59 +0200 Subject: [PATCH 45/63] x86/alternatives: Simplify the #include section We accumulated lots of unnecessary header inclusions over the years, trim them. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-48-mingo@kernel.org --- arch/x86/kernel/alternative.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b8e0b1b2383f..eb3be5d3efef 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,36 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include -#include +#include #include -#include -#include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include + #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include +#include int __read_mostly alternatives_patched; From b1bb39185df6f5ddafcb912304b73e70c6b70c5f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:41:00 +0200 Subject: [PATCH 46/63] x86/alternatives: Move declarations of vmlinux.lds.S defined section symbols to Move it from the middle of a .c file next to the similar declarations of __alt_instructions[] et al. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-49-mingo@kernel.org --- arch/x86/include/asm/alternative.h | 6 ++++++ arch/x86/kernel/alternative.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a37a8bd87fd..ef84739a77f5 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -82,6 +82,12 @@ struct alt_instr { extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index eb3be5d3efef..cd828c236fd2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -149,12 +149,6 @@ static void add_nop(u8 *buf, unsigned int len) *buf = INT3_INSN_OPCODE; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __cfi_sites[], __cfi_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; - /* * Matches NOP and NOPL, not any of the other possible NOPs. */ From 2c373ca0640f25361aee1bcf382fc8081830938e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:41:01 +0200 Subject: [PATCH 47/63] x86/alternatives: Remove 'smp_text_poke_batch_flush()' It only has a single user left, merge it into smp_text_poke_batch_add() and remove the helper function. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-50-mingo@kernel.org --- arch/x86/kernel/alternative.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cd828c236fd2..c0be0663425e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2831,14 +2831,6 @@ void smp_text_poke_batch_finish(void) smp_text_poke_batch_process(); } -static void smp_text_poke_batch_flush(void *addr) -{ - lockdep_assert_held(&text_mutex); - - if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) - smp_text_poke_batch_process(); -} - /** * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched * @addr: address to patch @@ -2854,7 +2846,8 @@ static void smp_text_poke_batch_flush(void *addr) */ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - smp_text_poke_batch_flush(addr); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_process(); __smp_text_poke_batch_add(addr, opcode, len, emulate); } From dac0d7542782bae98a4d8cedde3028a07f1915d2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:41:02 +0200 Subject: [PATCH 48/63] x86/alternatives: Update the comments in smp_text_poke_batch_process() - Capitalize 'INT3' consistently, - make it clear that 'sync cores' means an SMP sync to all CPUs, - fix typos and spelling. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-51-mingo@kernel.org --- arch/x86/kernel/alternative.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c0be0663425e..9ee6f879bce5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2561,24 +2561,26 @@ out_put: /** * smp_text_poke_batch_process() -- update instructions on live kernel on SMP - * @text_poke_array.vec: vector of instructions to patch - * @text_poke_array.nr_entries: number of entries in the vector * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector + * + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. * * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ static void smp_text_poke_batch_process(void) { @@ -2606,13 +2608,13 @@ static void smp_text_poke_batch_process(void) cond_resched(); /* - * Corresponding read barrier in int3 notifier for making sure the + * Corresponding read barrier in INT3 notifier for making sure the * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ for (i = 0; i < text_poke_array.nr_entries; i++) { text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); @@ -2685,7 +2687,7 @@ static void smp_text_poke_batch_process(void) } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { From 023f42dd59203be8ad2fc0574af32d3b4ad041ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:41:03 +0200 Subject: [PATCH 49/63] x86/alternatives: Rename 'apply_relocation()' to 'text_poke_apply_relocation()' Join the text_poke_*() API namespace. Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-52-mingo@kernel.org --- arch/x86/include/asm/text-patching.h | 2 +- arch/x86/kernel/alternative.c | 6 +++--- arch/x86/kernel/callthunks.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index a45ac8a223f3..5337f1be18f6 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -15,7 +15,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); -extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); +extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); /* * Clear and restore the kernel write-protection flag on the local CPU. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9ee6f879bce5..231b2acbe360 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -340,7 +340,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, } } -void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); @@ -496,7 +496,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -1981,7 +1981,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg) static noinline void __init alt_reloc_selftest(void) { /* - * Tests apply_relocation(). + * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index d86d7d6e750c..a951333c5995 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); + text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? 
*/ if (!bcmp(pad, insn_buff, tsize)) @@ -294,7 +294,7 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } @@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; From 4f9534719e524affb1aa8e0ff0c8b30c1c65e574 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Apr 2025 07:41:04 +0200 Subject: [PATCH 50/63] x86/alternatives: Add comment about noinstr expectations Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-53-mingo@kernel.org --- arch/x86/kernel/alternative.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 231b2acbe360..604dd608d281 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2447,6 +2447,11 @@ static struct smp_text_poke_array { static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + static __always_inline bool try_get_text_poke_array(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); From 23a76739d6afe6f98ccdb2517d7985b4335c7a3a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 11 Apr 2025 07:41:05 +0200 Subject: [PATCH 51/63] x86/alternatives: Make smp_text_poke_batch_process() subsume smp_text_poke_batch_finish() Simplify the alternatives interface some more by moving the poke_batch_finish check into poke_batch_process and renaming the latter. The net effect is one less function name to consider when reading the code. Signed-off-by: Nikolay Borisov Signed-off-by: Ingo Molnar Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250411054105.2341982-54-mingo@kernel.org --- arch/x86/kernel/alternative.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 604dd608d281..f785d2335812 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2434,7 +2434,7 @@ struct smp_text_poke_loc { u8 len; u8 opcode; const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; - /* see smp_text_poke_batch_process() */ + /* see smp_text_poke_batch_finish() */ u8 old; }; @@ -2507,7 +2507,7 @@ noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) return 0; /* - * Discount the INT3. See smp_text_poke_batch_process(). + * Discount the INT3. See smp_text_poke_batch_finish(). 
*/ ip = (void *) regs->ip - INT3_INSN_SIZE; @@ -2565,7 +2565,7 @@ out_put: } /** - * smp_text_poke_batch_process() -- update instructions on live kernel on SMP + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP * * Input state: * text_poke_array.vec: vector of instructions to patch @@ -2587,12 +2587,15 @@ out_put: * replacing opcode * - SMP sync all CPUs */ -static void smp_text_poke_batch_process(void) +void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; + if (!text_poke_array.nr_entries) + return; + lockdep_assert_held(&text_mutex); /* @@ -2832,12 +2835,6 @@ static bool text_poke_addr_ordered(void *addr) return true; } -void smp_text_poke_batch_finish(void) -{ - if (text_poke_array.nr_entries) - smp_text_poke_batch_process(); -} - /** * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched * @addr: address to patch @@ -2854,7 +2851,7 @@ void smp_text_poke_batch_finish(void) void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) - smp_text_poke_batch_process(); + smp_text_poke_batch_finish(); __smp_text_poke_batch_add(addr, opcode, len, emulate); } From 0812e096cff0fd58d88a21a413fba56c0e6c3caa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Apr 2025 11:45:34 +0200 Subject: [PATCH 52/63] x86/mm: Add 'mm' argument to unuse_temporary_mm() In commit 209954cbc7d0 ("x86/mm/tlb: Update mm_cpumask lazily") unuse_temporary_mm() grew the assumption that it gets used on poking_mm exclusively. While this is currently true, let's not hard-code this assumption. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20250402094540.3586683-2-mingo@kernel.org --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f785d2335812..95053e8a1378 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2161,14 +2161,14 @@ static inline struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) __ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long text_poke_mm_addr; -static inline void unuse_temporary_mm(struct mm_struct *prev_mm) +static inline void unuse_temporary_mm(struct mm_struct *mm, struct mm_struct *prev_mm) { lockdep_assert_irqs_disabled(); switch_mm_irqs_off(NULL, prev_mm, current); /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(text_poke_mm)); + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm)); /* * Restore the breakpoints if they were disabled before the temporary mm * was loaded. @@ -2275,7 +2275,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. 
*/ - unuse_temporary_mm(prev_mm); + unuse_temporary_mm(text_poke_mm, prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled From 81e3cbdef230fd9adfa8569044b07290afd66708 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 2 Apr 2025 11:45:35 +0200 Subject: [PATCH 53/63] x86/events, x86/insn-eval: Remove incorrect current->active_mm references When decoding an instruction or handling a perf event that references an LDT segment, if we don't have a valid user context, trying to access the LDT by any means other than SLDT is racy. Certainly, using current->active_mm is wrong, as active_mm can point to a real user mm when CR3 and LDTR no longer reference that mm. Clean up the code. If nmi_uaccess_okay() says we don't have a valid context, just fail. Otherwise use current->mm. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20250402094540.3586683-3-mingo@kernel.org --- arch/x86/events/core.c | 9 ++++++++- arch/x86/lib/insn-eval.c | 13 ++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..95118b52b606 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2803,8 +2803,15 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return 0; + /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = READ_ONCE(current->active_mm->context.ldt); + ldt = smp_load_acquire(¤t->mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 98631c0e7a11..f786401ac15d 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -631,14 +631,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel) /* Bits [15:3] contain the index of the desired entry. */ sel >>= 3; - mutex_lock(¤t->active_mm->context.lock); - ldt = current->active_mm->context.ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return false; + + mutex_lock(¤t->mm->context.lock); + ldt = current->mm->context.ldt; if (ldt && sel < ldt->nr_entries) { *out = ldt->entries[sel]; success = true; } - mutex_unlock(¤t->active_mm->context.lock); + mutex_unlock(¤t->mm->context.lock); return success; } From d376972c9825ac4e8ad74872ee0730a5b4292e44 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 2 Apr 2025 11:45:36 +0200 Subject: [PATCH 54/63] x86/mm: Make use_/unuse_temporary_mm() non-static This prepares them for use outside of the alternative machinery. The code is unchanged. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Rik van Riel Cc: "H. 
Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20250402094540.3586683-4-mingo@kernel.org --- arch/x86/include/asm/mmu_context.h | 3 ++ arch/x86/kernel/alternative.c | 64 ------------------------------ arch/x86/mm/tlb.c | 64 ++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 64 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2398058b6e83..b103e1709a67 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -272,4 +272,7 @@ unsigned long __get_current_cr3_fast(void); #include +extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); +extern void unuse_temporary_mm(struct mm_struct *mm, struct mm_struct *prev_mm); + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 95053e8a1378..bdbdfa0e4d06 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2111,73 +2111,9 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) -{ - struct mm_struct *prev_mm; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(); - - prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, temp_mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return prev_mm; -} - __ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long text_poke_mm_addr; -static inline void unuse_temporary_mm(struct mm_struct *mm, struct mm_struct *prev_mm) -{ - lockdep_assert_irqs_disabled(); - - switch_mm_irqs_off(NULL, prev_mm, current); - - /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm)); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. 
- */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} - static void text_poke_memcpy(void *dst, const void *src, size_t len) { memcpy(dst, src, len); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e459d97ef397..f3da20bfcf0e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -971,6 +971,70 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) this_cpu_write(cpu_tlbstate_shared.is_lazy, true); } +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *prev_mm; + + lockdep_assert_irqs_disabled(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(); + + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return prev_mm; +} + +void unuse_temporary_mm(struct mm_struct *mm, struct mm_struct *prev_mm) +{ + lockdep_assert_irqs_disabled(); + + switch_mm_irqs_off(NULL, prev_mm, current); + + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm)); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + /* * Call this when reinitializing a CPU. It fixes the following potential * problems: From 4873f494bbe4670f353a9b76ce44e6028c811cbb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Apr 2025 11:45:37 +0200 Subject: [PATCH 55/63] x86/mm: Remove 'mm' argument from unuse_temporary_mm() again Now that unuse_temporary_mm() lives in tlb.c it can access cpu_tlbstate.loaded_mm. [ mingo: Merged it on top of x86/alternatives ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Rik van Riel Cc: "H. 
Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20250402094540.3586683-5-mingo@kernel.org --- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/mm/tlb.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index b103e1709a67..988c11792634 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -273,6 +273,6 @@ unsigned long __get_current_cr3_fast(void); #include extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); -extern void unuse_temporary_mm(struct mm_struct *mm, struct mm_struct *prev_mm); +extern void unuse_temporary_mm(struct mm_struct *prev_mm); #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bdbdfa0e4d06..ddbc303e41e3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2211,7 +2211,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(text_poke_mm, prev_mm); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index f3da20bfcf0e..38fdcf875d5f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1018,14 +1018,14 @@ struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) return prev_mm; } -void unuse_temporary_mm(struct mm_struct *mm, struct mm_struct *prev_mm) +void unuse_temporary_mm(struct mm_struct *prev_mm) { lockdep_assert_irqs_disabled(); - switch_mm_irqs_off(NULL, prev_mm, current); - /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); + + switch_mm_irqs_off(NULL, prev_mm, current); /* * Restore the breakpoints if they were disabled before the temporary mm From 58f8ffa917669a0c8c027e24d5349f0b488f8181 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 2 Apr 2025 11:45:38 +0200 Subject: [PATCH 56/63] x86/mm: Allow temporary MMs when IRQs are on EFI runtime services should use temporary MMs, but EFI runtime services want IRQs on. Preemption must still be disabled in a temporary MM context. At some point, the entirely temporary MM mechanism should be moved out of arch code. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Cc: Ard Biesheuvel Link: https://lore.kernel.org/r/20250402094540.3586683-6-mingo@kernel.org --- arch/x86/mm/tlb.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 38fdcf875d5f..c9b87e5f569a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -977,18 +977,23 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) * that override the kernel memory protections (e.g., W^X), without exposing the * temporary page-table mappings that are required for these write operations to * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. + * mapping is torn down. 
Temporary mms can also be used for EFI runtime service + * calls or similar functionality. * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. + * It is illegal to schedule while using a temporary mm -- the context switch + * code is unaware of the temporary mm and does not know how to context switch. + * Use a real (non-temporary) mm in a kernel thread if you need to sleep. + * + * Note: For sensitive memory writes, the temporary mm needs to be used + * exclusively by a single core, and IRQs should be disabled while the + * temporary mm is loaded, thereby preventing interrupt handler bugs from + * overriding the kernel memory protection. */ struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) { struct mm_struct *prev_mm; - lockdep_assert_irqs_disabled(); + lockdep_assert_preemption_disabled(); /* * Make sure not to be in TLB lazy mode, as otherwise we'll end up @@ -1020,7 +1025,7 @@ struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) void unuse_temporary_mm(struct mm_struct *prev_mm) { - lockdep_assert_irqs_disabled(); + lockdep_assert_preemption_disabled(); /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); From e7021e2fe0b4335523d3f6e2221000bdfc633b62 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 2 Apr 2025 11:45:39 +0200 Subject: [PATCH 57/63] x86/efi: Make efi_enter/leave_mm() use the use_/unuse_temporary_mm() machinery This should be considerably more robust. It's also necessary for optimized for_each_possible_lazymm_cpu() on x86 -- without this patch, EFI calls in lazy context would remove the lazy mm from mm_cpumask(). [ mingo: Merged it on top of x86/alternatives ] Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20250402094540.3586683-7-mingo@kernel.org --- arch/x86/platform/efi/efi_64.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac57259a432b..a5d3496d32a5 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -434,15 +434,12 @@ void __init efi_dump_pagetable(void) */ static void efi_enter_mm(void) { - efi_prev_mm = current->active_mm; - current->active_mm = &efi_mm; - switch_mm(efi_prev_mm, &efi_mm, NULL); + efi_prev_mm = use_temporary_mm(&efi_mm); } static void efi_leave_mm(void) { - current->active_mm = efi_prev_mm; - switch_mm(&efi_mm, efi_prev_mm, NULL); + unuse_temporary_mm(efi_prev_mm); } void arch_efi_call_virt_setup(void) From af8967158f9ad759a93e8e7a933c10e7cbb01ba2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 2 Apr 2025 11:45:40 +0200 Subject: [PATCH 58/63] x86/mm: Opt-in to IRQs-off activate_mm() We gain nothing by having the core code enable IRQs right before calling activate_mm() only for us to turn them right back off again in switch_mm(). This will save a few cycles, so execve() should be blazingly fast with this patch applied! Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Rik van Riel Cc: "H. 
Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Link: https://lore.kernel.org/r/20250402094540.3586683-8-mingo@kernel.org --- arch/x86/Kconfig | 1 + arch/x86/include/asm/mmu_context.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b9f378e05f6..aeac63b11fc2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,6 +153,7 @@ config X86 select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_WATCHDOG diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 988c11792634..c511f8584ae4 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -190,7 +190,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ - switch_mm((prev), (next), NULL); \ + switch_mm_irqs_off((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 From f99002b9a9cc441a8f362e6fb32cf8a5a990261a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Apr 2025 09:39:33 +0200 Subject: [PATCH 59/63] x86/alternatives, um: Rename UML's text_poke_sync() wrapper to smp_text_poke_sync_each_cpu() Missed this UML wrapper in the rename. Fixes: 6e4955a9d73e ("x86/alternatives: Rename 'text_poke_sync()' to 'smp_text_poke_sync_each_cpu()'") Reported-by: kernel test robot Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/202504141003.kc69fVoj-lkp@intel.com --- arch/um/kernel/um_arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index d4b3b6742ec8..2f5ee045bc7a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -477,7 +477,7 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) return text_poke(addr, opcode, len); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { } From 52ebfe7412ce4b3af54fe962af58efe9b25cd9a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Apr 2025 14:34:13 +0200 Subject: [PATCH 60/63] x86/mm: Remove the mm_cpumask(prev) warning from switch_mm_irqs_off() The CONFIG_DEBUG_VM=y warning in switch_mm_irqs_off() started triggering in testing: VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(prev))); AFAIU what happens is that unuse_temporary_mm() clears the mm_cpumask() for the current CPU, while switch_mm_irqs_off() then checks that the mm_cpumask() bit is set for the current CPU. While this behaviour hasn't really changed since the following commit: 209954cbc7d0 ("x86/mm/tlb: Update mm_cpumask lazily") introduced both, but the warning is wrong, so remove it. [ mingo: Patchified Peter's email. ] Reported-by: syzbot+c2537ce72a879a38113e@syzkaller.appspotmail.com Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Juergen Gross Cc: Andrew Cooper Cc: Rik van Riel Cc: "H. 
Peter Anvin" Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/20250414135629.GA17910@noisy.programming.kicks-ass.net --- arch/x86/mm/tlb.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c9b87e5f569a..79c124f6f3f2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -905,14 +905,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); - /* - * Leave this CPU in prev's mm_cpumask. Atomic writes to - * mm_cpumask can be expensive under contention. The CPU - * will be removed lazily at TLB flush time. - */ - VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, - mm_cpumask(prev))); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); From aef1d0209ddf127a8069aca5fa3a062be4136b76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Apr 2025 11:50:34 +0200 Subject: [PATCH 61/63] x86/mm: Fix {,un}use_temporary_mm() IRQ state As the function switch_mm_irqs_off() implies, it ought to be called with IRQs *off*. Commit 58f8ffa91766 ("x86/mm: Allow temporary MMs when IRQs are on") caused this to not be the case for EFI. Ensure IRQs are off where it matters. Fixes: 58f8ffa91766 ("x86/mm: Allow temporary MMs when IRQs are on") Reported-by: Borislav Petkov (AMD) Tested-by: Borislav Petkov (AMD) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: H. Peter Anvin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Rik van Riel Link: https://lore.kernel.org/r/20250418095034.GR38216@noisy.programming.kicks-ass.net --- arch/x86/mm/tlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 79c124f6f3f2..39761c7765bd 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -986,6 +986,7 @@ struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) struct mm_struct *prev_mm; lockdep_assert_preemption_disabled(); + guard(irqsave)(); /* * Make sure not to be in TLB lazy mode, as otherwise we'll end up @@ -1018,6 +1019,7 @@ struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) void unuse_temporary_mm(struct mm_struct *prev_mm) { lockdep_assert_preemption_disabled(); + guard(irqsave)(); /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); From 43c2df7e2b08db6d65ce9707e4090f1f0c61f2f6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 5 May 2025 15:16:46 +0200 Subject: [PATCH 62/63] x86/alternative: Remove unused header #defines Remove some unfortunately-named unused macros which could potentially result in weird build failures. Fortunately, they are under an #ifdef __ASSEMBLER__ which has kept them from causing problems so far. 
[ dhansen: subject and changelog tweaks ] Fixes: 1a6ade825079 ("x86/alternative: Convert the asm ALTERNATIVE_3() macro") Signed-off-by: Juergen Gross Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250505131646.29288-1-jgross%40suse.com --- arch/x86/include/asm/alternative.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ef84739a77f5..e18cdaa1573c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -341,11 +341,6 @@ void nop_func(void); __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags) .endm -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f -#define new_len3 146f-145f - /* * Same as ALTERNATIVE macro above but for two alternatives. If CPU * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has From 7f9958230d8a79d474829bee25ec9426397335ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Apr 2025 10:11:54 +0200 Subject: [PATCH 63/63] x86/mm: Fix false positive warning in switch_mm_irqs_off() Multiple testers reported the following new warning: WARNING: CPU: 0 PID: 0 at arch/x86/mm/tlb.c:795 Which corresponds to: if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); So the problem is that unuse_temporary_mm() explicitly clears that bit; and it has to, because otherwise the flush_tlb_mm_range() in __text_poke() will try sending IPIs, which are not at all needed. See also: https://lore.kernel.org/all/20241113095550.GBZzR3pg-RhJKPDazS@fat_crate.local/ Notably, the whole {,un}use_temporary_mm() thing requires preemption to be disabled across it with the express purpose of keeping all TLB nonsense CPU local, such that invalidations can also stay local etc. However, as a side-effect, we violate this above WARN(), which sorta makes sense for the normal case, but very much doesn't make sense here. Change unuse_temporary_mm() to mark the mm_struct such that a further exception (beyond init_mm) can be grafted, to keep the warning for all the other cases. Reported-by: Chaitanya Kumar Borah Reported-by: Jani Nikula Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Rik van Riel Link: https://lore.kernel.org/r/20250430081154.GH4439@noisy.programming.kicks-ass.net --- arch/x86/include/asm/mmu.h | 4 ++-- arch/x86/include/asm/mmu_context.h | 10 ++++++++++ arch/x86/mm/init.c | 3 +++ arch/x86/mm/tlb.c | 3 ++- arch/x86/platform/efi/efi_64.c | 1 + 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 8b8055a8eb9e..0fe9c569d171 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -16,6 +16,8 @@ #define MM_CONTEXT_LOCK_LAM 2 /* Allow LAM and SVA coexisting */ #define MM_CONTEXT_FORCE_TAGGED_SVA 3 +/* Tracks mm_cpumask */ +#define MM_CONTEXT_NOTRACK 4 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. 
@@ -44,9 +46,7 @@ typedef struct { struct ldt_struct *ldt; #endif -#ifdef CONFIG_X86_64 unsigned long flags; -#endif #ifdef CONFIG_ADDRESS_MASKING /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c511f8584ae4..73bf3b1b44e8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -247,6 +247,16 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif +static inline bool is_notrack_mm(struct mm_struct *mm) +{ + return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + +static inline void set_notrack_mm(struct mm_struct *mm) +{ + set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f8c74d19bebb..aa56d9ac0b8f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * We need to define the tracepoints somewhere, and tlb.c @@ -830,6 +831,8 @@ void __init poking_init(void) /* Xen PV guests need the PGD to be pinned. */ paravirt_enter_mmap(text_poke_mm); + set_notrack_mm(text_poke_mm); + /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 39761c7765bd..f5b990e46d7b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -847,7 +847,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && + WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a5d3496d32a5..ce4c08adca88 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -89,6 +89,7 @@ int __init efi_alloc_page_tables(void) efi_mm.pgd = efi_pgd; mm_init_cpumask(&efi_mm); init_new_context(NULL, &efi_mm); + set_notrack_mm(&efi_mm); return 0;