Ensure that retpolines do the proper call accounting so that the return
accounting works correctly.
Specifically, retpolines are used to replace both 'jmp *%reg' and
'call *%reg'; however, these two cases do not have the same accounting
requirements. Therefore split things up and provide two different
retpoline arrays for SKL.
The 'jmp *%reg' case needs no accounting; the
__x86_indirect_jump_thunk_array[] covers this. The retpoline is
changed to not use the return thunk; it's a simple call;ret construct.
[ strictly speaking it should do:
	andq $(~0x1f), PER_CPU_VAR(__x86_call_depth)
  but we can argue this can be covered by the fuzz we already have
  in the accounting depth (12) vs the RSB depth (16) ]
The 'call *%reg' case does need accounting; the
__x86_indirect_call_thunk_array[] covers this. Again, this retpoline
avoids the use of the return-thunk, in this case to avoid double
accounting.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220915111147.996634749@infradead.org
 
 typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
 extern retpoline_thunk_t __x86_indirect_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
 
 extern void __x86_return_thunk(void);
 extern void zen_untrain_ret(void);
 #include <asm/GEN-for-each-reg.h>
 #undef GEN
 
+#define GEN(reg)                                               \
+       extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#define GEN(reg)                                               \
+       extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
 #ifdef CONFIG_X86_64
 
 /*
 
        return i;
 }
 
+/*
+ * Tell whether @insn is a Jcc.d32: a two-byte opcode whose first byte is
+ * 0x0f and whose second byte falls in the range 0x80-0x8f.
+ */
+static inline bool is_jcc32(struct insn *insn)
+{
+       /* Not a two-byte (0x0f-escaped) opcode: cannot be Jcc.d32. */
+       if (insn->opcode.bytes[0] != 0x0f)
+               return false;
+
+       /* The high nibble of the second opcode byte selects 0x80-0x8f. */
+       return (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
+/*
+ * Generate the replacement for a compiler-generated retpoline thunk
+ * CALL/JMP when call depth tracking is in use.
+ *
+ * @addr:  address of the instruction being rewritten
+ * @insn:  decoded instruction at @addr
+ * @reg:   index of the register used by the indirect branch; selects the
+ *         entry in the call/jump thunk arrays
+ * @bytes: output buffer receiving the replacement instruction bytes
+ *
+ * A CALL is redirected to __x86_indirect_call_thunk_array[reg]; a JMP (or
+ * a Jcc.d32 conditional tail-call) is redirected to
+ * __x86_indirect_jump_thunk_array[reg].
+ *
+ * Returns the number of bytes written to @bytes, or -1 if the opcode is
+ * neither of the expected forms.
+ */
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+       u8 op = insn->opcode.bytes[0];
+       int i = 0;
+
+       /*
+        * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+        * tail-calls. Deal with them.
+        *
+        * Keep the first opcode byte as-is and let the second opcode byte
+        * act as the opcode for the generated branch to the jump thunk.
+        */
+       if (is_jcc32(insn)) {
+               bytes[i++] = op;
+               op = insn->opcode.bytes[1];
+               goto clang_jcc;
+       }
+
+       /* A 6 byte CALL/JMP carries a one byte prefix; re-emit it. */
+       if (insn->length == 6)
+               bytes[i++] = 0x2e; /* CS-prefix */
+
+       switch (op) {
+       case CALL_INSN_OPCODE:
+               __text_gen_insn(bytes+i, op, addr+i,
+                               __x86_indirect_call_thunk_array[reg],
+                               CALL_INSN_SIZE);
+               i += CALL_INSN_SIZE;
+               break;
+
+       case JMP32_INSN_OPCODE:
+clang_jcc:
+               __text_gen_insn(bytes+i, op, addr+i,
+                               __x86_indirect_jump_thunk_array[reg],
+                               JMP32_INSN_SIZE);
+               i += JMP32_INSN_SIZE;
+               break;
+
+       default:
+               /*
+                * WARN() takes the condition as its first argument; pass 1
+                * so that the format string is not consumed as the condition
+                * and @addr is not interpreted as the format.
+                */
+               WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+               return -1;
+       }
+
+       /* The replacement must be exactly as long as the original insn. */
+       WARN_ON_ONCE(i != insn->length);
+
+       return i;
+}
+
 /*
  * Rewrite the compiler generated retpoline thunk calls.
  *
        BUG_ON(reg == 4);
 
        if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
-           !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
+           !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+               if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+                       return emit_call_track_retpoline(addr, insn, reg, bytes);
+
                return -1;
+       }
 
        op = insn->opcode.bytes[0];
 
         *   [ NOP ]
         * 1:
         */
-       /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
-       if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
+       if (is_jcc32(insn)) {
                cc = insn->opcode.bytes[1] & 0xf;
                cc ^= 1; /* invert condition */
 
 
 
        .section .text.__x86.indirect_thunk
 
-.macro RETPOLINE reg
+
+.macro POLINE reg
        ANNOTATE_INTRA_FUNCTION_CALL
        call    .Ldo_rop_\@
-.Lspec_trap_\@:
-       UNWIND_HINT_EMPTY
-       pause
-       lfence
-       jmp .Lspec_trap_\@
+       int3
 .Ldo_rop_\@:
        mov     %\reg, (%_ASM_SP)
        UNWIND_HINT_FUNC
+.endm
+
+.macro RETPOLINE reg
+       POLINE \reg
        RET
 .endm
 
  */
 
 #define __EXPORT_THUNK(sym)    _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
-#define EXPORT_THUNK(reg)      __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
 
        .align RETPOLINE_THUNK_SIZE
 SYM_CODE_START(__x86_indirect_thunk_array)
        .align RETPOLINE_THUNK_SIZE
 SYM_CODE_END(__x86_indirect_thunk_array)
 
-#define GEN(reg) EXPORT_THUNK(reg)
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+/*
+ * CALL_THUNK reg - emit the 'call *%reg' thunk for \reg.
+ *
+ * Defines the global label __x86_indirect_call_thunk_\reg, aligned to
+ * RETPOLINE_THUNK_SIZE. The call is accounted via CALL_DEPTH_ACCOUNT and
+ * the retpoline sequence (POLINE) is finished with a plain 'ret' instead
+ * of the return thunk, so that the call is not accounted twice.
+ */
+.macro CALL_THUNK reg
+       .align RETPOLINE_THUNK_SIZE
+
+SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL)
+       UNWIND_HINT_EMPTY
+       ANNOTATE_NOENDBR
+
+       CALL_DEPTH_ACCOUNT
+       POLINE \reg
+       ANNOTATE_UNRET_SAFE
+       ret
+       int3
+.endm
+
+       .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_call_thunk_array)
+
+#define GEN(reg) CALL_THUNK reg
 #include <asm/GEN-for-each-reg.h>
 #undef GEN
 
+       .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_call_thunk_array)
+
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+/*
+ * JUMP_THUNK reg - emit the 'jmp *%reg' thunk for \reg.
+ *
+ * Defines the global label __x86_indirect_jump_thunk_\reg, aligned to
+ * RETPOLINE_THUNK_SIZE. The jump case needs no call accounting, so this is
+ * only the retpoline sequence (POLINE) followed by a plain 'ret' rather
+ * than the return thunk.
+ */
+.macro JUMP_THUNK reg
+       .align RETPOLINE_THUNK_SIZE
+
+SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL)
+       UNWIND_HINT_EMPTY
+       ANNOTATE_NOENDBR
+       POLINE \reg
+       ANNOTATE_UNRET_SAFE
+       ret
+       int3
+.endm
+
+       .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_jump_thunk_array)
+
+#define GEN(reg) JUMP_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+       .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_jump_thunk_array)
+
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+#endif
 /*
  * This function name is magical and is used by -mfunction-return=thunk-extern
  * for the compiler to generate JMPs to it.
 
                EMIT2(0xFF, 0xE0 + reg);
        } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
                OPTIMIZER_HIDE_VAR(reg);
-               emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
+               if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+                       emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip);
+               else
+                       emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
        } else {
                EMIT2(0xFF, 0xE0 + reg);        /* jmp *%\reg */
                if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))