x86,retpoline: Optimize patch_retpoline()
author Peter Zijlstra <peterz@infradead.org>
Tue, 2 Sep 2025 09:20:35 +0000 (11:20 +0200)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 4 Sep 2025 19:59:09 +0000 (21:59 +0200)
Currently the very common retpoline: "CS CALL __x86_indirect_thunk_r11"
is transformed into "CALL *R11; NOP3" for eIBRS/BHI_NO parts.

Similarly, paranoid fineibt has: "CALL *R11; NOP".

Recognise that CS stuffing can avoid the extra NOP. However, due to
prefix decode penalties, make sure to not emit too many CS prefixes.
Notably: "CS CALL __x86_indirect_thunk_rax" must not become "CS CS CS
CS CALL *RAX". Prefix decode penalties are typically many more cycles
than decoding an extra NOP.
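
Purely as an illustration (not part of the patch; the 6-byte site length is an
assumption, the R11/RAX registers are taken from the examples above), the byte
sequences involved look like this:

/* compiler output at the call site: CS prefix + direct CALL rel32 (6 bytes)  */
static const unsigned char site_r11[] = { 0x2e, 0xe8, 0x00, 0x00, 0x00, 0x00 };
/* old rewrite: CALL *%r11 plus padding, effectively a NOP3                   */
static const unsigned char old_r11[]  = { 0x41, 0xff, 0xd3, 0x0f, 0x1f, 0x00 };
/* new rewrite: the three excess bytes become CS prefixes before CALL *%r11   */
static const unsigned char new_r11[]  = { 0x2e, 0x2e, 0x2e, 0x41, 0xff, 0xd3 };
/* RAX case: CALL *%rax is only 2 bytes, 4 excess bytes > 3, so keep the NOP  */
static const unsigned char new_rax[]  = { 0xff, 0xd0, 0x0f, 0x1f, 0x40, 0x00 };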

Additionally, if the retpoline is a tail-call, the "JMP *%\reg" should be
followed by INT3 for the straight-line-speculation mitigation. Since
emit_indirect() now has a length argument, move this INT3 emission into
emit_indirect() so that other users (paranoid-fineibt) get it as well.
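
For the tail-call case, again as an illustration only (same assumed 6-byte
site, R11 as the register):

/* tail-call site: CS prefix + direct JMP rel32 (6 bytes)                     */
static const unsigned char site_jmp[] = { 0x2e, 0xe9, 0x00, 0x00, 0x00, 0x00 };
/* rewrite: JMP *%r11 with the excess bytes filled by INT3 against SLS        */
static const unsigned char new_jmp[]  = { 0x41, 0xff, 0xe3, 0xcc, 0xcc, 0xcc };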

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250902104627.GM4068168@noisy.programming.kicks-ass.net
arch/x86/kernel/alternative.c

index 3d6a884436161d6097a874490b83166a4dd92e6b..69fb818df2eedc598e0a37d92adcb5477f888742 100644
@@ -713,20 +713,33 @@ static inline bool is_jcc32(struct insn *insn)
 #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
 
 /*
- * CALL/JMP *%\reg
+ * [CS]{,3} CALL/JMP *%\reg [INT3]*
  */
-static int emit_indirect(int op, int reg, u8 *bytes)
+static int emit_indirect(int op, int reg, u8 *bytes, int len)
 {
+       int cs = 0, bp = 0;
        int i = 0;
        u8 modrm;
 
+       /*
+        * Set @len to the excess bytes after writing the instruction.
+        */
+       len -= 2 + (reg >= 8);
+       WARN_ON_ONCE(len < 0);
+
        switch (op) {
        case CALL_INSN_OPCODE:
                modrm = 0x10; /* Reg = 2; CALL r/m */
+               /*
+                * Additional NOP is better than prefix decode penalty.
+                */
+               if (len <= 3)
+                       cs = len;
                break;
 
        case JMP32_INSN_OPCODE:
                modrm = 0x20; /* Reg = 4; JMP r/m */
+               bp = len;
                break;
 
        default:
@@ -734,6 +747,9 @@ static int emit_indirect(int op, int reg, u8 *bytes)
                return -1;
        }
 
+       while (cs--)
+               bytes[i++] = 0x2e; /* CS-prefix */
+
        if (reg >= 8) {
                bytes[i++] = 0x41; /* REX.B prefix */
                reg -= 8;
@@ -745,6 +761,9 @@ static int emit_indirect(int op, int reg, u8 *bytes)
        bytes[i++] = 0xff; /* opcode */
        bytes[i++] = modrm;
 
+       while (bp--)
+               bytes[i++] = 0xcc; /* INT3 */
+
        return i;
 }
 
@@ -918,20 +937,11 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
                return emit_its_trampoline(addr, insn, reg, bytes);
 #endif
 
-       ret = emit_indirect(op, reg, bytes + i);
+       ret = emit_indirect(op, reg, bytes + i, insn->length - i);
        if (ret < 0)
                return ret;
        i += ret;
 
-       /*
-        * The compiler is supposed to EMIT an INT3 after every unconditional
-        * JMP instruction due to AMD BTC. However, if the compiler is too old
-        * or MITIGATION_SLS isn't enabled, we still need an INT3 after
-        * indirect JMPs even on Intel.
-        */
-       if (op == JMP32_INSN_OPCODE && i < insn->length)
-               bytes[i++] = INT3_INSN_OPCODE;
-
        for (; i < insn->length;)
                bytes[i++] = BYTES_NOP1;
 
@@ -1421,8 +1431,7 @@ asm(      ".pushsection .rodata                           \n"
        "#fineibt_caller_size:                          \n"
        "       jne     fineibt_paranoid_start+0xd      \n"
        "fineibt_paranoid_ind:                          \n"
-       "       call    *%r11                           \n"
-       "       nop                                     \n"
+       "       cs call *%r11                           \n"
        "fineibt_paranoid_end:                          \n"
        ".popsection                                    \n"
 );
@@ -1724,8 +1733,9 @@ static int cfi_rewrite_callers(s32 *start, s32 *end)
                        emit_paranoid_trampoline(addr + fineibt_caller_size,
                                                 &insn, 11, bytes + fineibt_caller_size);
                } else {
-                       ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
-                       if (WARN_ON_ONCE(ret != 3))
+                       int len = fineibt_paranoid_size - fineibt_paranoid_ind;
+                       ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind, len);
+                       if (WARN_ON_ONCE(ret != len))
                                continue;
                }
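
For anyone who wants to poke at the new emission logic outside the kernel,
below is a minimal userspace sketch that mirrors the patched emit_indirect()
(the CALL_INSN_OPCODE/JMP32_INSN_OPCODE values match the kernel's; everything
else is only a test harness, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CALL_INSN_OPCODE	0xe8
#define JMP32_INSN_OPCODE	0xe9

static int emit_indirect(int op, int reg, uint8_t *bytes, int len)
{
	int cs = 0, bp = 0;
	int i = 0;
	uint8_t modrm;

	/* excess bytes left over after the CALL/JMP *%reg instruction itself */
	len -= 2 + (reg >= 8);
	assert(len >= 0);

	switch (op) {
	case CALL_INSN_OPCODE:
		modrm = 0x10;		/* Reg = 2; CALL r/m */
		if (len <= 3)		/* stuff CS prefixes, else keep NOPs */
			cs = len;
		break;

	case JMP32_INSN_OPCODE:
		modrm = 0x20;		/* Reg = 4; JMP r/m */
		bp = len;		/* trailing INT3s against SLS */
		break;

	default:
		return -1;
	}

	while (cs--)
		bytes[i++] = 0x2e;	/* CS prefix */

	if (reg >= 8) {
		bytes[i++] = 0x41;	/* REX.B prefix */
		reg -= 8;
	}

	modrm |= 0xc0;			/* Mod = 3 */
	modrm += reg;

	bytes[i++] = 0xff;		/* opcode */
	bytes[i++] = modrm;

	while (bp--)
		bytes[i++] = 0xcc;	/* INT3 */

	return i;
}

static void dump(const char *what, int op, int reg, int len)
{
	uint8_t buf[16];
	int n = emit_indirect(op, reg, buf, len);

	printf("%-24s", what);
	for (int i = 0; i < n; i++)
		printf(" %02x", buf[i]);
	printf("\n");
}

int main(void)
{
	dump("call *%r11 in 6 bytes:", CALL_INSN_OPCODE, 11, 6);	/* 2e 2e 2e 41 ff d3 */
	dump("call *%rax in 6 bytes:", CALL_INSN_OPCODE,  0, 6);	/* ff d0; kernel pads rest with NOPs */
	dump("jmp  *%r11 in 6 bytes:", JMP32_INSN_OPCODE, 11, 6);	/* 41 ff e3 cc cc cc */
	return 0;
}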