From fe37c699ae3eed6e02ee55fbf5cb9ceb7fcfd76c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 6 Feb 2025 14:18:44 -0500 Subject: [PATCH 01/16] x86/nmi: Add an emergency handler in nmi_desc & use it in nmi_shootdown_cpus() Depending on the type of panics, it was found that the __register_nmi_handler() function can be called in NMI context from nmi_shootdown_cpus() leading to a lockdep splat: WARNING: inconsistent lock state inconsistent {INITIAL USE} -> {IN-NMI} usage. lock(&nmi_desc[0].lock); lock(&nmi_desc[0].lock); Call Trace: _raw_spin_lock_irqsave __register_nmi_handler nmi_shootdown_cpus kdump_nmi_shootdown_cpus native_machine_crash_shutdown __crash_kexec In this particular case, the following panic message was printed before: Kernel panic - not syncing: Fatal hardware error! This message seemed to be given out from __ghes_panic() running in NMI context. The __register_nmi_handler() function which takes the nmi_desc lock with irq disabled shouldn't be called from NMI context as this can lead to deadlock. The nmi_shootdown_cpus() function can only be invoked once. After the first invocation, all other CPUs should be stuck in the newly added crash_nmi_callback() and cannot respond to a second NMI. Fix it by adding a new emergency NMI handler to the nmi_desc structure and provide a new set_emergency_nmi_handler() helper to set crash_nmi_callback() in any context. The new emergency handler will preempt other handlers in the linked list. That will eliminate the need to take any lock and serve the panic in NMI use case. Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Rik van Riel Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20250206191844.131700-1-longman@redhat.com --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/nmi.c | 42 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/reboot.c | 10 +++------ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 41a0ebb699ec..f677382093f3 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -56,6 +56,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler); + void stop_nmi(void); void restart_nmi(void); void local_touch_nmi(void); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ed163c8c8604..9a95d00f1423 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -40,8 +40,12 @@ #define CREATE_TRACE_POINTS #include +/* + * An emergency handler can be set in any context including NMI + */ struct nmi_desc { raw_spinlock_t lock; + nmi_handler_t emerg_handler; struct list_head head; }; @@ -132,9 +136,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); + nmi_handler_t ehandler; struct nmiaction *a; int handled=0; + /* + * Call the emergency handler, if set + * + * In the case of crash_nmi_callback() emergency handler, it will + * return in the case of the crashing CPU to enable it to complete + * other necessary crashing actions ASAP. Other handlers in the + * linked list won't need to be run. 
+ */ + ehandler = desc->emerg_handler; + if (ehandler) + return ehandler(type, regs); + rcu_read_lock(); /* @@ -224,6 +241,31 @@ void unregister_nmi_handler(unsigned int type, const char *name) } EXPORT_SYMBOL_GPL(unregister_nmi_handler); +/** + * set_emergency_nmi_handler - Set emergency handler + * @type: NMI type + * @handler: the emergency handler to be stored + * + * Set an emergency NMI handler which, if set, will preempt all the other + * handlers in the linked list. If a NULL handler is passed in, it will clear + * it. It is expected that concurrent calls to this function will not happen + * or the system is screwed beyond repair. + */ +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler) +{ + struct nmi_desc *desc = nmi_to_desc(type); + + if (WARN_ON_ONCE(desc->emerg_handler == handler)) + return; + desc->emerg_handler = handler; + + /* + * Ensure the emergency handler is visible to other CPUs before + * function return + */ + smp_wmb(); +} + static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index dc1dd3f3e67f..9aaac1f9f45b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -926,15 +926,11 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) shootdown_callback = callback; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, - NMI_FLAG_FIRST, "crash")) - return; /* Return what? */ + /* - * Ensure the new callback function is set before sending - * out the NMI + * Set emergency handler to preempt other handlers. */ - wmb(); + set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback); apic_send_IPI_allbutself(NMI_VECTOR); -- 2.51.0 From 9a54fb31343362f93680543e37afc14484c185d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:04 +0100 Subject: [PATCH 02/16] x86/cfi: Add 'cfi=warn' boot option Rebuilding with CONFIG_CFI_PERMISSIVE=y enabled is such a pain, esp. since clang is so slow. 
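The new "cfi=warn" boot option gives the CONFIG_CFI_PERMISSIVE=y behaviour at boot time instead, without a rebuild. A minimal usage sketch (the combined form is an assumption based on the comma-separated cfi= option parsing, not something this patch documents):

    cfi=warn             # keep the configured CFI mode, WARN on a mismatch instead of dying
    cfi=fineibt,warn     # assumed combination: select FineIBT and make mismatches non-fatal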
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124159.924496481@infradead.org --- arch/x86/kernel/alternative.c | 3 +++ include/linux/cfi.h | 2 ++ kernel/cfi.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 247ee5ffbff4..1142ebd3bb49 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1022,6 +1022,9 @@ static __init int cfi_parse_cmdline(char *str) cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; + } else if (!strcmp(str, "warn")) { + pr_alert("CFI mismatch non-fatal!\n"); + cfi_warn = true; } else { pr_err("Ignoring unknown cfi option (%s).", str); } diff --git a/include/linux/cfi.h b/include/linux/cfi.h index f0df518e11dd..1db17ecbb86c 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,8 @@ #include #include +extern bool cfi_warn; + #ifndef cfi_get_offset static inline int cfi_get_offset(void) { diff --git a/kernel/cfi.c b/kernel/cfi.c index 08caad776717..19be79639542 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -7,6 +7,8 @@ #include +bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); + enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type) { @@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, pr_err("CFI failure at %pS (no target information)\n", (void *)addr); - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + if (cfi_warn) { __warn(NULL, 0, (void *)addr, 0, regs, NULL); return BUG_TRAP_TYPE_WARN; } -- 2.51.0 From 500a41acb05a973cb6826ee56df082a97e210a95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:05 +0100 Subject: [PATCH 03/16] x86/ibt: Add exact_endbr() helper For when we want to exactly match ENDBR, and not everything that we can scribble it with. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.059556588@infradead.org --- arch/x86/kernel/alternative.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1142ebd3bb49..83316ea470b2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -863,6 +863,21 @@ Efault: return false; } +#ifdef CONFIG_FINEIBT + +static __noendbr bool exact_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return endbr == gen_endbr(); + +Efault: + return false; +} + +#endif + static void poison_cfi(void *addr); static void __init_or_module poison_endbr(void *addr) @@ -1426,10 +1441,9 @@ static void poison_cfi(void *addr) bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_preamble_ud2; - u32 endbr, hash; + u32 hash; - __get_kernel_nofault(&endbr, addr, u32, Efault); - if (endbr != gen_endbr()) + if (!exact_endbr((void *)addr)) return false; *target = addr + fineibt_preamble_size; -- 2.51.0 From 5d703825fde301677e8a79b0738927490407f435 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Feb 2025 12:15:15 +0100 Subject: [PATCH 04/16] x86/alternatives: Clean up preprocessor conditional block comments When in the middle of a kernel source code file a kernel developer sees a lone #else or #endif: ... #else ... 
It's not obvious at a glance what those preprocessor blocks are conditional on, if the starting #ifdef is outside visible range. So apply the standard pattern we use in such cases elsewhere in the kernel for large preprocessor blocks: #ifdef CONFIG_XXX ... ... ... #endif /* CONFIG_XXX */ ... #ifdef CONFIG_XXX ... ... ... #else /* !CONFIG_XXX: */ ... ... ... #endif /* !CONFIG_XXX */ ( Note that in the #else case we use the /* !CONFIG_XXX */ marker in the final #endif, not /* CONFIG_XXX */, which serves as an easy visual marker to differentiate #else or #elif related #endif closures from singular #ifdef/#endif blocks. ) Also clean up __CFI_DEFAULT definition with a bit more vertical alignment applied, and a pointless tab converted to the standard space we use in such definitions. Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: Peter Zijlstra (Intel) --- arch/x86/kernel/alternative.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 83316ea470b2..ea68f0e59cec 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -839,16 +839,16 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) } } } -#else +#else /* !CONFIG_MITIGATION_RETHUNK: */ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_MITIGATION_RETHUNK */ +#endif /* !CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT @@ -916,18 +916,18 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) } } -#else +#else /* !CONFIG_X86_KERNEL_IBT: */ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif /* !CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_CFI_AUTO_DEFAULT -#define __CFI_DEFAULT CFI_AUTO +# define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI_CLANG) -#define __CFI_DEFAULT CFI_KCFI +# define __CFI_DEFAULT CFI_KCFI #else -#define __CFI_DEFAULT CFI_OFF +# define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; @@ -1457,7 +1457,7 @@ Efault: return false; } -#else +#else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) @@ -1468,7 +1468,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, static void poison_cfi(void *addr) { } #endif -#endif +#endif /* !CONFIG_FINEIBT */ void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi) -- 2.51.0 From 2e044911be75ce3321c5b3d10205ac0b54f8cb92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:06 +0100 Subject: [PATCH 05/16] x86/traps: Decode 0xEA instructions as #UD FineIBT will start using 0xEA as #UD. Normally '0xEA' is a 'bad', invalid instruction for the CPU. 
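The reason a lone 0xEA byte can end up being executed at all is the overlapping-code trick used by the FineIBT preamble later in this series: the failure branch jumps back into the immediate bytes of the preceding SUB, so execution resumes on its 0xEA ModRM byte, which decodes as a one-byte '(bad)' instruction and raises #UD. A worked byte-level sketch of that preamble layout (illustrative, based on the sequence introduced in a later patch of this series):

    0:  f3 0f 1e fa              endbr64
    4:  41 81 ea 78 56 34 12     subl   $0x12345678, %r10d
    b:  75 f9                    jne    0x6      ; taken on hash mismatch
    d:  0f 1f 00                 nopl   (%rax)

    ; 0x6 is the 0xea ModRM byte inside the SUB encoding, so a mismatch traps with #UD.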
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.166774696@infradead.org --- arch/x86/include/asm/bug.h | 1 + arch/x86/kernel/traps.c | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 1a5e4b372694..bc8a2ca3c82e 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -25,6 +25,7 @@ #define BUG_UD2 0xfffe #define BUG_UD1 0xfffd #define BUG_UD1_UBSAN 0xfffc +#define BUG_EA 0xffea #ifdef CONFIG_GENERIC_BUG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b86c05e446..a02a51bf433f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -96,6 +96,7 @@ __always_inline int is_valid_bugaddr(unsigned long addr) * Check for UD1 or UD2, accounting for Address Size Override Prefixes. * If it's a UD1, further decode to determine its use: * + * FineIBT: ea (bad) * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx @@ -113,6 +114,10 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) v = *(u8 *)(addr++); if (v == INSN_ASOP) v = *(u8 *)(addr++); + if (v == 0xea) { + *len = addr - start; + return BUG_EA; + } if (v != OPCODE_ESCAPE) return BUG_NONE; @@ -309,10 +314,16 @@ static noinstr bool handle_bug(struct pt_regs *regs) switch (ud_type) { case BUG_UD2: - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += ud_len; + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + fallthrough; + + case BUG_EA: + if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { handled = true; + break; } break; @@ -328,6 +339,9 @@ static noinstr bool handle_bug(struct pt_regs *regs) break; } + if (handled) + regs->ip += ud_len; + if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); -- 2.51.0 From e33d805a1005bf7b8a5a53559c81a8e17f0b981b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:07 +0100 Subject: [PATCH 06/16] x86/traps: Allow custom fixups in handle_bug() The normal fixup in handle_bug() is simply continuing at the next instruction. However upcoming patches make this the wrong thing, so allow handlers (specifically handle_cfi_failure()) to over-ride regs->ip. The callchain is such that the fixup needs to be done before it is determined if the exception is fatal, as such, revert any changes in that case. Additionally, have handle_cfi_failure() remember the regs->ip value it starts with for reporting. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.275223080@infradead.org --- arch/x86/kernel/cfi.c | 8 ++++---- arch/x86/kernel/traps.c | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c index f6905bef0af8..77086cf565ec 100644 --- a/arch/x86/kernel/cfi.c +++ b/arch/x86/kernel/cfi.c @@ -67,16 +67,16 @@ static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, */ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { - unsigned long target; + unsigned long target, addr = regs->ip; u32 type; switch (cfi_mode) { case CFI_KCFI: - if (!is_cfi_trap(regs->ip)) + if (!is_cfi_trap(addr)) return BUG_TRAP_TYPE_NONE; if (!decode_cfi_insn(regs, &target, &type)) - return report_cfi_failure_noaddr(regs, regs->ip); + return report_cfi_failure_noaddr(regs, addr); break; @@ -90,7 +90,7 @@ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) return BUG_TRAP_TYPE_NONE; } - return report_cfi_failure(regs, regs->ip, &target, type); + return report_cfi_failure(regs, addr, &target, type); } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a02a51bf433f..c169f3bd3c6c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -287,11 +287,12 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { + unsigned long addr = regs->ip; bool handled = false; int ud_type, ud_len; s32 ud_imm; - ud_type = decode_bug(regs->ip, &ud_imm, &ud_len); + ud_type = decode_bug(addr, &ud_imm, &ud_len); if (ud_type == BUG_NONE) return handled; @@ -339,8 +340,17 @@ static noinstr bool handle_bug(struct pt_regs *regs) break; } - if (handled) - regs->ip += ud_len; + /* + * When continuing, and regs->ip hasn't changed, move it to the next + * instruction. When not continuing execution, restore the instruction + * pointer. + */ + if (handled) { + if (regs->ip == addr) + regs->ip += ud_len; + } else { + regs->ip = addr; + } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); -- 2.51.0 From 06926c6cdb955bf24521d4e13af95b4d062d02d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:08 +0100 Subject: [PATCH 07/16] x86/ibt: Optimize the FineIBT instruction sequence Scott notes that non-taken branches are faster. Abuse overlapping code that traps instead of explicit UD2 instructions. And LEA does not modify flags and will have less dependencies. 
Suggested-by: Scott Constable Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.371942555@infradead.org --- arch/x86/kernel/alternative.c | 61 +++++++++++++++++++++++------------ arch/x86/net/bpf_jit_comp.c | 5 ++- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ea68f0e59cec..a2e8ee8029eb 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1057,9 +1057,9 @@ early_param("cfi", cfi_parse_cmdline); * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 - * nop jz 1f // 2 - * nop ud2 // 2 - * nop 1: nop // 1 + * nop jne __cfi_\func+6 // 2 + * nop nop3 // 3 + * nop * nop * nop * nop @@ -1071,37 +1071,50 @@ early_param("cfi", cfi_parse_cmdline); * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 - * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 - * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 * */ -asm( ".pushsection .rodata \n" - "fineibt_preamble_start: \n" - " endbr64 \n" - " subl $0x12345678, %r10d \n" - " je fineibt_preamble_end \n" - "fineibt_preamble_ud2: \n" - " ud2 \n" - " nop \n" - "fineibt_preamble_end: \n" +/* + * : + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 78 56 34 12 sub $0x12345678, %r10d + * b: 75 f9 jne 6 + * d: 0f 1f 00 nopl (%rax) + * + * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as + * (bad) on x86_64 and raises #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " jne fineibt_preamble_start+6 \n" + ASM_NOP3 + "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; -extern u8 fineibt_preamble_ud2[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) -#define fineibt_preamble_ud2 (fineibt_preamble_ud2 - fineibt_preamble_start) +#define fineibt_preamble_ud 6 #define fineibt_preamble_hash 7 +/* + * : + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * a: 0f 1f 40 00 nopl 0x0(%rax) + */ asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" - " sub $16, %r11 \n" + " lea -0x10(%r11), %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" @@ -1432,15 +1445,15 @@ static void poison_cfi(void *addr) } /* - * regs->ip points to a UD2 instruction, return true and fill out target and - * type when this UD2 is from a FineIBT preamble. + * When regs->ip points to a 0xEA byte in the FineIBT preamble, + * return true and fill out target and type. * * We check the preamble by checking for the ENDBR instruction relative to the - * UD2 instruction. + * 0xEA instruction. 
*/ bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) { - unsigned long addr = regs->ip - fineibt_preamble_ud2; + unsigned long addr = regs->ip - fineibt_preamble_ud; u32 hash; if (!exact_endbr((void *)addr)) @@ -1451,6 +1464,12 @@ bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); *type = (u32)regs->r10 + hash; + /* + * Since regs->ip points to the middle of an instruction; it cannot + * continue with the normal fixup. + */ + regs->ip = *target; + return true; Efault: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f36508b67278..ce033e63bc27 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -417,9 +417,8 @@ static void emit_fineibt(u8 **pprog, u32 hash) EMIT_ENDBR(); EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */ - EMIT2(0x74, 0x07); /* jz.d8 +7 */ - EMIT2(0x0f, 0x0b); /* ud2 */ - EMIT1(0x90); /* nop */ + EMIT2(0x75, 0xf9); /* jne.d8 .-7 */ + EMIT3(0x0f, 0x1f, 0x00); /* nop3 */ EMIT_ENDBR_POISON(); *pprog = prog; -- 2.51.0 From 029f718fedd72872f7475604fe71b2a841108834 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:09 +0100 Subject: [PATCH 08/16] x86/traps: Decode LOCK Jcc.d8 as #UD Because overlapping code sequences are all the rage. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.486463917@infradead.org --- arch/x86/include/asm/bug.h | 2 ++ arch/x86/kernel/traps.c | 26 +++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index bc8a2ca3c82e..f0e9acf72547 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -17,6 +17,7 @@ * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. */ #define INSN_ASOP 0x67 +#define INSN_LOCK 0xf0 #define OPCODE_ESCAPE 0x0f #define SECOND_BYTE_OPCODE_UD1 0xb9 #define SECOND_BYTE_OPCODE_UD2 0x0b @@ -26,6 +27,7 @@ #define BUG_UD1 0xfffd #define BUG_UD1_UBSAN 0xfffc #define BUG_EA 0xffea +#define BUG_LOCK 0xfff0 #ifdef CONFIG_GENERIC_BUG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c169f3bd3c6c..f4263cb3d21e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -97,6 +97,7 @@ __always_inline int is_valid_bugaddr(unsigned long addr) * If it's a UD1, further decode to determine its use: * * FineIBT: ea (bad) + * FineIBT: f0 75 f9 lock jne . - 6 * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx @@ -106,6 +107,7 @@ __always_inline int is_valid_bugaddr(unsigned long addr) __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { unsigned long start = addr; + bool lock = false; u8 v; if (addr < TASK_SIZE_MAX) @@ -114,12 +116,29 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) v = *(u8 *)(addr++); if (v == INSN_ASOP) v = *(u8 *)(addr++); - if (v == 0xea) { + + if (v == INSN_LOCK) { + lock = true; + v = *(u8 *)(addr++); + } + + switch (v) { + case 0x70 ... 
0x7f: /* Jcc.d8 */ + addr += 1; /* d8 */ + *len = addr - start; + WARN_ON_ONCE(!lock); + return BUG_LOCK; + + case 0xea: *len = addr - start; return BUG_EA; - } - if (v != OPCODE_ESCAPE) + + case OPCODE_ESCAPE: + break; + + default: return BUG_NONE; + } v = *(u8 *)(addr++); if (v == SECOND_BYTE_OPCODE_UD2) { @@ -322,6 +341,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) fallthrough; case BUG_EA: + case BUG_LOCK: if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; -- 2.51.0 From 97e59672a9d2aec0c27f6cd6a6b0edfdd6e5a85c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Feb 2025 12:25:17 +0100 Subject: [PATCH 09/16] x86/ibt: Add paranoid FineIBT mode Due to concerns about circumvention attacks against FineIBT on 'naked' ENDBR, add an additional caller side hash check to FineIBT. This should make it impossible to pivot over such a 'naked' ENDBR instruction at the cost of an additional load. The specific pivot reported was against the SYSCALL entry site and FRED will have all those holes fixed up. https://lore.kernel.org/linux-hardening/Z60NwR4w%2F28Z7XUa@ubun/ This specific fineibt_paranoid_start[] sequence was concocted by Scott. Suggested-by: Scott Constable Reported-by: Jennifer Miller Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.598033084@infradead.org --- arch/x86/kernel/alternative.c | 144 ++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a2e8ee8029eb..c698c9e27ce3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -741,6 +741,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) op2 = insn.opcode.bytes[1]; switch (op1) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + /* See cfi_paranoid. */ + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: break; @@ -998,6 +1003,8 @@ u32 cfi_get_func_hash(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; +static bool cfi_paranoid __ro_after_init = false; + /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. @@ -1040,6 +1047,12 @@ static __init int cfi_parse_cmdline(char *str) } else if (!strcmp(str, "warn")) { pr_alert("CFI mismatch non-fatal!\n"); cfi_warn = true; + } else if (!strcmp(str, "paranoid")) { + if (cfi_mode == CFI_FINEIBT) { + cfi_paranoid = true; + } else { + pr_err("Ignoring paranoid; depends on fineibt.\n"); + } } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -1128,6 +1141,52 @@ extern u8 fineibt_caller_end[]; #define fineibt_caller_jmp (fineibt_caller_size - 2) +/* + * Since FineIBT does hash validation on the callee side it is prone to + * circumvention attacks where a 'naked' ENDBR instruction exists that + * is not part of the fineibt_preamble sequence. + * + * Notably the x86 entry points must be ENDBR and equally cannot be + * fineibt_preamble. + * + * The fineibt_paranoid caller sequence adds additional caller side + * hash validation. This stops such circumvention attacks dead, but at the cost + * of adding a load. 
+ * + * : + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b lea -0x10(%r11), %r11 + * e: 75 fd jne d + * 10: 41 ff d3 call *%r11 + * 13: 90 nop + * + * Notably LEA does not modify flags and can be reordered with the CMP, + * avoiding a dependency. Again, using a non-taken (backwards) branch + * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the + * Jcc.d8, causing #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_paranoid_start: \n" + " movl $0x12345678, %r10d \n" + " cmpl -9(%r11), %r10d \n" + " lea -0x10(%r11), %r11 \n" + " jne fineibt_paranoid_start+0xd \n" + "fineibt_paranoid_ind: \n" + " call *%r11 \n" + " nop \n" + "fineibt_paranoid_end: \n" + ".popsection \n" +); + +extern u8 fineibt_paranoid_start[]; +extern u8 fineibt_paranoid_ind[]; +extern u8 fineibt_paranoid_end[]; + +#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) +#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) +#define fineibt_paranoid_ud 0xd + static u32 decode_preamble_hash(void *addr) { u8 *p = addr; @@ -1291,18 +1350,48 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; + BUG_ON(fineibt_paranoid_size != 20); + for (s = start; s < end; s++) { void *addr = (void *)s + *s; + struct insn insn; + u8 bytes[20]; u32 hash; + int ret; + u8 op; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); - if (hash) { + if (!hash) + continue; + + if (!cfi_paranoid) { text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); text_poke_early(addr + fineibt_caller_hash, &hash, 4); + /* rely on apply_retpolines() */ + continue; + } + + /* cfi_paranoid */ + ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { + WARN_ON_ONCE(1); + continue; } - /* rely on apply_retpolines() */ + + memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); + memcpy(bytes + fineibt_caller_hash, &hash, 4); + + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + + text_poke_early(addr, bytes, fineibt_paranoid_size); } return 0; @@ -1319,8 +1408,15 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; - if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { + /* + * FRED has much saner context on exception entry and + * is less easy to take advantage of. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + cfi_paranoid = true; cfi_mode = CFI_FINEIBT; + } } /* @@ -1377,8 +1473,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, /* now that nobody targets func()+0, remove ENDBR there */ cfi_rewrite_endbr(start_cfi, end_cfi); - if (builtin) - pr_info("Using FineIBT CFI\n"); + if (builtin) { + pr_info("Using %sFineIBT CFI\n", + cfi_paranoid ? "paranoid " : ""); + } return; default: @@ -1451,7 +1549,7 @@ static void poison_cfi(void *addr) * We check the preamble by checking for the ENDBR instruction relative to the * 0xEA instruction. 
*/ -bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_preamble_ud; u32 hash; @@ -1476,6 +1574,40 @@ Efault: return false; } +/* + * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] + * sequence. + */ +static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_paranoid_ud; + u32 hash; + + if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + +Efault: + return false; +} + +bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + if (decode_fineibt_paranoid(regs, target, type)) + return true; + + return decode_fineibt_preamble(regs, target, type); +} + #else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, -- 2.51.0 From b815f6877d8063482fe745f098eeef632679aa8a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:11 +0100 Subject: [PATCH 10/16] x86/bhi: Add BHI stubs Add an array of code thunks, to be called from the FineIBT preamble, clobbering the first 'n' argument registers for speculative execution. Notably the 0th entry will clobber no argument registers and will never be used, it exists so the array can be naturally indexed, while the 7th entry will clobber all the 6 argument registers and also RSP in order to mess up stack based arguments. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.717378681@infradead.org --- arch/x86/include/asm/cfi.h | 4 + arch/x86/lib/Makefile | 3 +- arch/x86/lib/bhi.S | 147 +++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 arch/x86/lib/bhi.S diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 7dd5ab239c87..7c15c4bfe65f 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -101,6 +101,10 @@ enum cfi_mode { extern enum cfi_mode cfi_mode; +typedef u8 bhi_thunk[32]; +extern bhi_thunk __bhi_args[]; +extern bhi_thunk __bhi_args_end[]; + struct pt_regs; #ifdef CONFIG_CFI_CLANG diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 8a59c61624c2..f453507649d4 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -66,5 +66,6 @@ endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_uncached_64.o - lib-y += cmpxchg16b_emu.o + lib-y += cmpxchg16b_emu.o + lib-y += bhi.o endif diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S new file mode 100644 index 000000000000..58891681261b --- /dev/null +++ b/arch/x86/lib/bhi.S @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + +/* + * Notably, the FineIBT preamble calling these will have ZF set and r10 zero. 
+ * + * The very last element is in fact larger than 32 bytes, but since its the + * last element, this does not matter, + * + * There are 2 #UD sites, located between 0,1-2,3 and 4,5-6,7 such that they + * can be reached using Jcc.d8, these elements (1 and 5) have sufficiently + * big alignment holes for this to not stagger the array. + */ + +.pushsection .noinstr.text, "ax" + + .align 32 +SYM_CODE_START(__bhi_args) + +#ifdef CONFIG_FINEIBT_BHI + + .align 32 +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 8 + ANNOTATE_REACHABLE +.Lud_1: ud2 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + cmovne %r10, %rsi + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 8 + ANNOTATE_REACHABLE +.Lud_2: ud2 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + cmovne %r10, %r9 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + cmovne %r10, %r9 + cmovne %r10, %rsp + ANNOTATE_UNRET_SAFE + ret + int3 + +#endif /* CONFIG_FINEIBT_BHI */ + + .align 32 +SYM_INNER_LABEL(__bhi_args_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + nop /* Work around toolchain+objtool quirk */ +SYM_CODE_END(__bhi_args) + +.popsection -- 2.51.0 From 0c92385dc05ee9637c04372ea95a11bbf6e010ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:12 +0100 Subject: [PATCH 11/16] x86/ibt: Implement FineIBT-BHI mitigation While WAIT_FOR_ENDBR is specified to be a full speculation stop; it has been shown that some implementations are 'leaky' to such an extend that speculation can escape even the FineIBT preamble. To deal with this, add additional hardening to the FineIBT preamble. Notably, using a new LLVM feature: https://github.com/llvm/llvm-project/commit/e223485c9b38a5579991b8cebb6a200153eee245 which encodes the number of arguments in the kCFI preamble's register. Using this register<->arity mapping, have the FineIBT preamble CALL into a stub clobbering the relevant argument registers in the speculative case. 
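For reference, the register<->arity mapping this relies on, as implied by the preamble decode added below (the movl destination register's number is taken directly as the arity index; treat the table itself as an assumption about the compiler feature rather than a documented ABI):

    movl $hash, %eax  -> 0 args      movl $hash, %esp  -> 4 args
    movl $hash, %ecx  -> 1 arg       movl $hash, %ebp  -> 5 args
    movl $hash, %edx  -> 2 args      movl $hash, %esi  -> 6 args
    movl $hash, %ebx  -> 3 args      movl $hash, %edi  -> 7+ args (stub also clobbers RSP)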
Scott sayeth thusly: Microarchitectural attacks such as Branch History Injection (BHI) and Intra-mode Branch Target Injection (IMBTI) [1] can cause an indirect call to mispredict to an adversary-influenced target within the same hardware domain (e.g., within the kernel). Instructions at the mispredicted target may execute speculatively and potentially expose kernel data (e.g., to a user-mode adversary) through a microarchitectural covert channel such as CPU cache state. CET-IBT [2] is a coarse-grained control-flow integrity (CFI) ISA extension that enforces that each indirect call (or indirect jump) must land on an ENDBR (end branch) instruction, even speculatively*. FineIBT is a software technique that refines CET-IBT by associating each function type with a 32-bit hash and enforcing (at the callee) that the hash of the caller's function pointer type matches the hash of the callee's function type. However, recent research [3] has demonstrated that the conditional branch that enforces FineIBT's hash check can be coerced to mispredict, potentially allowing an adversary to speculatively bypass the hash check: __cfi_foo: ENDBR64 SUB R10d, 0x01234567 JZ foo # Even if the hash check fails and ZF=0, this branch could still mispredict as taken UD2 foo: ... The techniques demonstrated in [3] require the attacker to be able to control the contents of at least one live register at the mispredicted target. Therefore, this patch set introduces a sequence of CMOV instructions at each indirect-callable target that poisons every live register with data that the attacker cannot control whenever the FineIBT hash check fails, thus mitigating any potential attack. The security provided by this scheme has been discussed in detail on an earlier thread [4]. [1] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/branch-history-injection.html [2] Intel Software Developer's Manual, Volume 1, Chapter 18 [3] https://www.vusec.net/projects/native-bhi/ [4] https://lore.kernel.org/lkml/20240927194925.707462984@infradead.org/ *There are some caveats for certain processors, see [1] for more info Suggested-by: Scott Constable Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.820402212@infradead.org --- Makefile | 3 + arch/x86/Kconfig | 8 +++ arch/x86/include/asm/cfi.h | 6 ++ arch/x86/kernel/alternative.c | 107 ++++++++++++++++++++++++++++++---- arch/x86/net/bpf_jit_comp.c | 29 ++++++--- 5 files changed, 134 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 96407c1d6be1..f19431f25ed7 100644 --- a/Makefile +++ b/Makefile @@ -1014,6 +1014,9 @@ CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers endif +ifdef CONFIG_FINEIBT_BHI + CC_FLAGS_CFI += -fsanitize-kcfi-arity +endif ifdef CONFIG_RUST # Always pass -Zsanitizer-cfi-normalize-integers as CONFIG_RUST selects # CONFIG_CFI_ICALL_NORMALIZE_INTEGERS. 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c4175f4635ee..5c277261507e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2473,6 +2473,10 @@ config CC_HAS_RETURN_THUNK config CC_HAS_ENTRY_PADDING def_bool $(cc-option,-fpatchable-function-entry=16,16) +config CC_HAS_KCFI_ARITY + def_bool $(cc-option,-fsanitize=kcfi -fsanitize-kcfi-arity) + depends on CC_IS_CLANG && !RUST + config FUNCTION_PADDING_CFI int default 59 if FUNCTION_ALIGNMENT_64B @@ -2498,6 +2502,10 @@ config FINEIBT depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE select CALL_PADDING +config FINEIBT_BHI + def_bool y + depends on FINEIBT && CC_HAS_KCFI_ARITY + config HAVE_CALL_THUNKS def_bool y depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 7c15c4bfe65f..2f6a01f098b5 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -100,6 +100,7 @@ enum cfi_mode { }; extern enum cfi_mode cfi_mode; +extern bool cfi_bhi; typedef u8 bhi_thunk[32]; extern bhi_thunk __bhi_args[]; @@ -129,6 +130,7 @@ static inline int cfi_get_offset(void) #define cfi_get_offset cfi_get_offset extern u32 cfi_get_func_hash(void *func); +extern int cfi_get_func_arity(void *func); #ifdef CONFIG_FINEIBT extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type); @@ -152,6 +154,10 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } +static inline int cfi_get_func_arity(void *func) +{ + return 0; +} #endif /* CONFIG_CFI_CLANG */ #if HAS_KERNEL_IBT == 1 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c698c9e27ce3..b8d65d512066 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -936,6 +936,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; +bool cfi_bhi __ro_after_init = false; #ifdef CONFIG_CFI_CLANG struct bpf_insn; @@ -996,6 +997,21 @@ u32 cfi_get_func_hash(void *func) return hash; } + +int cfi_get_func_arity(void *func) +{ + bhi_thunk *target; + s32 disp; + + if (cfi_mode != CFI_FINEIBT && !cfi_bhi) + return 0; + + if (get_kernel_nofault(disp, func - 4)) + return 0; + + target = func + disp; + return target - __bhi_args; +} #endif #ifdef CONFIG_FINEIBT @@ -1053,6 +1069,12 @@ static __init int cfi_parse_cmdline(char *str) } else { pr_err("Ignoring paranoid; depends on fineibt.\n"); } + } else if (!strcmp(str, "bhi")) { + if (cfi_mode == CFI_FINEIBT) { + cfi_bhi = true; + } else { + pr_err("Ignoring bhi; depends on fineibt.\n"); + } } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -1105,6 +1127,7 @@ asm( ".pushsection .rodata \n" "fineibt_preamble_start: \n" " endbr64 \n" " subl $0x12345678, %r10d \n" + "fineibt_preamble_bhi: \n" " jne fineibt_preamble_start+6 \n" ASM_NOP3 "fineibt_preamble_end: \n" @@ -1112,9 +1135,11 @@ asm( ".pushsection .rodata \n" ); extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_bhi[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) #define fineibt_preamble_ud 6 #define fineibt_preamble_hash 7 @@ -1187,13 +1212,16 @@ extern u8 fineibt_paranoid_end[]; #define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) #define fineibt_paranoid_ud 0xd -static u32 decode_preamble_hash(void *addr) +static u32 decode_preamble_hash(void *addr, int 
*reg) { u8 *p = addr; - /* b8 78 56 34 12 mov $0x12345678,%eax */ - if (p[0] == 0xb8) + /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ + if (p[0] >= 0xb8 && p[0] < 0xc0) { + if (reg) + *reg = p[0] - 0xb8; return *(u32 *)(addr + 1); + } return 0; /* invalid hash value */ } @@ -1202,11 +1230,11 @@ static u32 decode_caller_hash(void *addr) { u8 *p = addr; - /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); - /* e8 0c 78 56 34 12 jmp.d8 +12 */ + /* e8 0c 88 a9 cb ed jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); @@ -1271,7 +1299,7 @@ static int cfi_rand_preamble(s32 *start, s32 *end) void *addr = (void *)s + *s; u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(addr, NULL); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; @@ -1289,6 +1317,7 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + int arity; u32 hash; /* @@ -1299,7 +1328,7 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) if (!is_endbr(addr + 16)) continue; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(addr, &arity); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; @@ -1307,6 +1336,19 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + + WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, + "kCFI preamble has wrong register at: %pS %*ph\n", + addr, 5, addr); + + if (!cfi_bhi || !arity) + continue; + + text_poke_early(addr + fineibt_preamble_bhi, + text_gen_insn(CALL_INSN_OPCODE, + addr + fineibt_preamble_bhi, + __bhi_args[arity]), + CALL_INSN_SIZE); } return 0; @@ -1474,8 +1516,9 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) { - pr_info("Using %sFineIBT CFI\n", - cfi_paranoid ? "paranoid " : ""); + pr_info("Using %sFineIBT%s CFI\n", + cfi_paranoid ? "paranoid " : "", + cfi_bhi ? "+BHI" : ""); } return; @@ -1526,7 +1569,7 @@ static void poison_cfi(void *addr) /* * kCFI prefix should start with a valid hash. */ - if (!decode_preamble_hash(addr)) + if (!decode_preamble_hash(addr, NULL)) break; /* @@ -1574,6 +1617,47 @@ Efault: return false; } +/* + * regs->ip points to one of the UD2 in __bhi_args[]. + */ +static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr; + u32 hash; + + if (!cfi_bhi) + return false; + + if (regs->ip < (unsigned long)__bhi_args || + regs->ip >= (unsigned long)__bhi_args_end) + return false; + + /* + * Fetch the return address from the stack, this points to the + * FineIBT preamble. Since the CALL instruction is in the 5 last + * bytes of the preamble, the return address is in fact the target + * address. + */ + __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); + *target = addr; + + addr -= fineibt_preamble_size; + if (!exact_endbr((void *)addr)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * The UD2 sites are constructed with a RET immediately following, + * as such the non-fatal case can use the regular fixup. 
+ */ + return true; + +Efault: + return false; +} + /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] * sequence. @@ -1605,6 +1689,9 @@ bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) if (decode_fineibt_paranoid(regs, target, type)) return true; + if (decode_fineibt_bhi(regs, target, type)) + return true; + return decode_fineibt_preamble(regs, target, type); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ce033e63bc27..72776dcb75aa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -410,15 +410,20 @@ static void emit_nops(u8 **pprog, int len) * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT * in arch/x86/kernel/alternative.c */ +static int emit_call(u8 **prog, void *func, void *ip); -static void emit_fineibt(u8 **pprog, u32 hash) +static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; EMIT_ENDBR(); EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */ - EMIT2(0x75, 0xf9); /* jne.d8 .-7 */ - EMIT3(0x0f, 0x1f, 0x00); /* nop3 */ + if (cfi_bhi) { + emit_call(&prog, __bhi_args[arity], ip + 11); + } else { + EMIT2(0x75, 0xf9); /* jne.d8 .-7 */ + EMIT3(0x0f, 0x1f, 0x00); /* nop3 */ + } EMIT_ENDBR_POISON(); *pprog = prog; @@ -447,13 +452,13 @@ static void emit_kcfi(u8 **pprog, u32 hash) *pprog = prog; } -static void emit_cfi(u8 **pprog, u32 hash) +static void emit_cfi(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; switch (cfi_mode) { case CFI_FINEIBT: - emit_fineibt(&prog, hash); + emit_fineibt(&prog, ip, hash, arity); break; case CFI_KCFI: @@ -504,13 +509,17 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog) * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, +static void emit_prologue(u8 **pprog, u8 *ip, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog, bool is_exception_cb) { u8 *prog = *pprog; - emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash); + if (is_subprog) { + emit_cfi(&prog, ip, cfi_bpf_subprog_hash, 5); + } else { + emit_cfi(&prog, ip, cfi_bpf_hash, 1); + } /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ @@ -1479,7 +1488,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image detect_reg_usage(insn, insn_cnt, callee_regs_used); - emit_prologue(&prog, stack_depth, + emit_prologue(&prog, image, stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); /* Exception callback will clobber callee regs for its own use, and @@ -3046,7 +3055,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* * Indirect call for bpf_struct_ops */ - emit_cfi(&prog, cfi_get_func_hash(func_addr)); + emit_cfi(&prog, image, + cfi_get_func_hash(func_addr), + cfi_get_func_arity(func_addr)); } else { /* * Direct-call fentry stub, as such it needs accounting for the -- 2.51.0 From dfebe7362f6f461d771cdb9ac2c5172a4721f064 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:13 +0100 Subject: [PATCH 12/16] x86/ibt: Optimize the fineibt-bhi arity 1 case Saves a CALL to an out-of-line thunk for the common case of 1 argument. 
Suggested-by: Scott Constable Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124200.927885784@infradead.org --- arch/x86/include/asm/ibt.h | 4 +++ arch/x86/kernel/alternative.c | 59 +++++++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index f0ca5c0b1a6b..9423a2967f50 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -70,6 +70,10 @@ static inline bool __is_endbr(u32 val) if (val == gen_endbr_poison()) return true; + /* See cfi_fineibt_bhi_preamble() */ + if (IS_ENABLED(CONFIG_FINEIBT_BHI) && val == 0x001f0ff5) + return true; + val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */ return val == gen_endbr(); } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b8d65d512066..32e4b801db99 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1311,6 +1311,53 @@ static int cfi_rand_preamble(s32 *start, s32 *end) return 0; } +static void cfi_fineibt_bhi_preamble(void *addr, int arity) +{ + if (!arity) + return; + + if (!cfi_warn && arity == 1) { + /* + * Crazy scheme to allow arity-1 inline: + * + * __cfi_foo: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 78 56 34 12 sub 0x12345678, %r10d + * b: 49 0f 45 fa cmovne %r10, %rdi + * f: 75 f5 jne __cfi_foo+6 + * 11: 0f 1f 00 nopl (%rax) + * + * Code that direct calls to foo()+0, decodes the tail end as: + * + * foo: + * 0: f5 cmc + * 1: 0f 1f 00 nopl (%rax) + * + * which clobbers CF, but does not affect anything ABI + * wise. + * + * Notably, this scheme is incompatible with permissive CFI + * because the CMOVcc is unconditional and RDI will have been + * clobbered. + */ + const u8 magic[9] = { + 0x49, 0x0f, 0x45, 0xfa, + 0x75, 0xf5, + BYTES_NOP3, + }; + + text_poke_early(addr + fineibt_preamble_bhi, magic, 9); + + return; + } + + text_poke_early(addr + fineibt_preamble_bhi, + text_gen_insn(CALL_INSN_OPCODE, + addr + fineibt_preamble_bhi, + __bhi_args[arity]), + CALL_INSN_SIZE); +} + static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; @@ -1341,14 +1388,8 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) "kCFI preamble has wrong register at: %pS %*ph\n", addr, 5, addr); - if (!cfi_bhi || !arity) - continue; - - text_poke_early(addr + fineibt_preamble_bhi, - text_gen_insn(CALL_INSN_OPCODE, - addr + fineibt_preamble_bhi, - __bhi_args[arity]), - CALL_INSN_SIZE); + if (cfi_bhi) + cfi_fineibt_bhi_preamble(addr, arity); } return 0; @@ -1361,7 +1402,7 @@ static void cfi_rewrite_endbr(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - if (!is_endbr(addr + 16)) + if (!exact_endbr(addr + 16)) continue; poison_endbr(addr + 16); -- 2.51.0 From 73e8079be9e7ae5ed197d074e0ba6c43674c52f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Mar 2025 10:21:47 +0100 Subject: [PATCH 13/16] x86/ibt: Make cfi_bhi a constant for FINEIBT_BHI=n Robot yielded a .config that tripped: vmlinux.o: warning: objtool: do_jit+0x276: relocation to !ENDBR: .noinstr.text+0x6a60 This is the result of using __bhi_args[1] in unreachable code; make sure the compiler is able to determine this is unreachable and trigger DCE. 
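A minimal stand-alone illustration of the effect relied on here (hypothetical names, not kernel code): once the flag folds to a literal 0, the compiler can prove the guarded use unreachable and emits no reference to the out-of-line symbol at all, which is what makes the !ENDBR relocation disappear.

    /* Sketch: with FLAG a compile-time 0, the call below is dead code and
     * no relocation to out_of_line_thunk() is emitted (with optimization on). */
    #define FLAG 0                        /* mirrors: #define cfi_bhi (0) */

    extern void out_of_line_thunk(void);  /* stand-in for __bhi_args[] */

    void caller(void)
    {
            if (FLAG)
                    out_of_line_thunk();
    }

With cfi_bhi defined to (0) for FINEIBT_BHI=n, the __bhi_args[1] reference in the BPF JIT becomes provably unreachable in the same way and is dropped.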
Closes: https://lore.kernel.org/oe-kbuild-all/202503030704.H9KFysNS-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250303094911.GL5880@noisy.programming.kicks-ass.net --- arch/x86/include/asm/cfi.h | 5 +++++ arch/x86/kernel/alternative.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 2f6a01f098b5..3e51ba459154 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -100,7 +100,12 @@ enum cfi_mode { }; extern enum cfi_mode cfi_mode; + +#ifdef CONFIG_FINEIBT_BHI extern bool cfi_bhi; +#else +#define cfi_bhi (0) +#endif typedef u8 bhi_thunk[32]; extern bhi_thunk __bhi_args[]; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e4b801db99..bf82c6f7d690 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -936,7 +936,10 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; + +#ifdef CONFIG_FINEIBT_BHI bool cfi_bhi __ro_after_init = false; +#endif #ifdef CONFIG_CFI_CLANG struct bpf_insn; @@ -1070,11 +1073,15 @@ static __init int cfi_parse_cmdline(char *str) pr_err("Ignoring paranoid; depends on fineibt.\n"); } } else if (!strcmp(str, "bhi")) { +#ifdef CONFIG_FINEIBT_BHI if (cfi_mode == CFI_FINEIBT) { cfi_bhi = true; } else { pr_err("Ignoring bhi; depends on fineibt.\n"); } +#else + pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); +#endif } else { pr_err("Ignoring unknown cfi option (%s).", str); } -- 2.51.0 From 3101900218d7b6acbdee8af3e7bcf04acf5bf9ef Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 3 Mar 2025 00:44:41 +0000 Subject: [PATCH 14/16] x86/paravirt: Remove unused paravirt_disable_iospace() The last use of paravirt_disable_iospace() was removed in 2015 by commit d1c29465b8a5 ("lguest: don't disable iospace.") Remove it. Note the comment above it about 'entry.S' is unrelated to this but stayed when intervening code got deleted. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20250303004441.250451-1-linux@treblig.org --- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/kernel/paravirt.c | 20 -------------------- 2 files changed, 22 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8b36a9562191..8e21a1a85e74 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -242,8 +242,6 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) -int paravirt_disable_iospace(void); - /* * This generates an indirect call based on the operation type number. * diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1dc114c810ac..d0b789d04245 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -90,26 +90,6 @@ void paravirt_set_sched_clock(u64 (*func)(void)) static_call_update(pv_sched_clock, func); } -/* These are in entry.S */ -static struct resource reserve_ioports = { - .start = 0, - .end = IO_SPACE_LIMIT, - .name = "paravirt-ioport", - .flags = IORESOURCE_IO | IORESOURCE_BUSY, -}; - -/* - * Reserve the whole legacy IO space to prevent any legacy drivers - * from wasting time probing for their hardware. 
This is a fairly - * brute-force approach to disabling all non-virtual drivers. - * - * Note that this must be called very early to have any effect. - */ -int paravirt_disable_iospace(void) -{ - return request_resource(&ioport_resource, &reserve_ioports); -} - #ifdef CONFIG_PARAVIRT_XXL static noinstr void pv_native_write_cr2(unsigned long val) { -- 2.51.0 From 399fd7a26441586021ca3722f6a98ff33ed32caf Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 3 Mar 2025 13:31:11 -0500 Subject: [PATCH 15/16] x86/asm: Merge KSTK_ESP() implementations Commit: 263042e4630a ("Save user RSP in pt_regs->sp on SYSCALL64 fastpath") simplified the 64-bit implementation of KSTK_ESP() which is now identical to 32-bit. Merge them into a common definition. No functional change. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250303183111.2245129-1-brgerst@gmail.com --- arch/x86/include/asm/processor.h | 5 +---- arch/x86/kernel/process_64.c | 5 ----- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0cd10182e90..0c911083fdd1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -668,8 +668,6 @@ static __always_inline void prefetchw(const void *x) .sysenter_cs = __KERNEL_CS, \ } -#define KSTK_ESP(task) (task_pt_regs(task)->sp) - #else extern unsigned long __top_init_kernel_stack[]; @@ -677,8 +675,6 @@ extern unsigned long __top_init_kernel_stack[]; .sp = (unsigned long)&__top_init_kernel_stack, \ } -extern unsigned long KSTK_ESP(struct task_struct *task); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, @@ -692,6 +688,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) /* Get/set a process' ability to use the timestamp counter instruction */ #define GET_TSC_CTL(adr) get_tsc_mode((adr)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4ca73ddfb30b..f983d2a57ac3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -979,8 +979,3 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) return ret; } - -unsigned long KSTK_ESP(struct task_struct *task) -{ - return task_pt_regs(task)->sp; -} -- 2.51.0 From 604ea3e90b17f27928a64d86259c57710c254438 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 3 Mar 2025 12:01:15 -0500 Subject: [PATCH 16/16] x86/smp/32: Remove safe_smp_processor_id() The safe_smp_processor_id() function was originally implemented in: dc2bc768a009 ("stack overflow safe kdump: safe_smp_processor_id()") to mitigate the CPU number corruption on a stack overflow. At the time, x86-32 stored the CPU number in thread_struct, which was located at the bottom of the task stack and thus vulnerable to an overflow. The CPU number is now located in percpu memory, so this workaround is no longer needed. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Uros Bizjak Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20250303170115.2176553-1-brgerst@gmail.com --- arch/x86/include/asm/cpu.h | 1 - arch/x86/include/asm/smp.h | 6 ------ arch/x86/kernel/apic/ipi.c | 30 ------------------------------ arch/x86/kernel/crash.c | 2 +- arch/x86/kernel/reboot.c | 2 +- 5 files changed, 2 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 98eced5084ca..f44bbce18859 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -12,7 +12,6 @@ #ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 -#define safe_smp_processor_id() 0 #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ca073f40698f..2419e5086c9e 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -136,12 +136,6 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); #define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) #define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) -#ifdef CONFIG_X86_32 -extern int safe_smp_processor_id(void); -#else -# define safe_smp_processor_id() smp_processor_id() -#endif - static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return per_cpu(cpu_llc_shared_map, cpu); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 5da693d633b7..23025a3a1db4 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -287,34 +287,4 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } - -#ifdef CONFIG_SMP -static int convert_apicid_to_cpu(u32 apic_id) -{ - int i; - - for_each_possible_cpu(i) { - if (per_cpu(x86_cpu_to_apicid, i) == apic_id) - return i; - } - return -1; -} - -int safe_smp_processor_id(void) -{ - u32 apicid; - int cpuid; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - - apicid = read_apic_id(); - if (apicid == BAD_APICID) - return 0; - - cpuid = convert_apicid_to_cpu(apicid); - - return cpuid >= 0 ? cpuid : 0; -} -#endif #endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 340af8155658..0be61c45400c 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -140,7 +140,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) x86_platform.guest.enc_kexec_begin(); x86_platform.guest.enc_kexec_finish(); - crash_save_cpu(regs, safe_smp_processor_id()); + crash_save_cpu(regs, smp_processor_id()); } #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9aaac1f9f45b..964f6b0a3d68 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -921,7 +921,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) return; /* Make a note of crashing cpu. Will be used in NMI callback. */ - crashing_cpu = safe_smp_processor_id(); + crashing_cpu = smp_processor_id(); shootdown_callback = callback; -- 2.51.0