Since ftrace has trampolines, don't use thunks for the __fentry__ site
but instead require that every function called from there includes
accounting. This very much includes all the direct-call functions.
Additionally, ftrace uses ROP tricks in two places:
 - return_to_handler(), and
 - ftrace_regs_caller() when pt_regs->orig_ax is set by a direct-call.
return_to_handler() already uses a retpoline in place of an
indirect jump in order to defeat IBT. Since this is a jump-type
retpoline, make sure no depth accounting is done for it, and use
ALTERNATIVE to patch the RET into a bare ret.
ftrace_regs_caller() does much the same and gets the same treatment.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220915111148.927545073@infradead.org
 {
        x86_return_thunk = &__x86_return_skl;
 }
+
+#define CALL_DEPTH_ACCOUNT                                     \
+       ALTERNATIVE("",                                         \
+                   __stringify(INCREMENT_CALL_DEPTH),          \
+                   X86_FEATURE_CALL_DEPTH)
+
 #ifdef CONFIG_CALL_THUNKS_DEBUG
 DECLARE_PER_CPU(u64, __x86_call_count);
 DECLARE_PER_CPU(u64, __x86_ret_count);
 #endif
 #else
 static inline void x86_set_skl_return_thunk(void) {}
+
+#define CALL_DEPTH_ACCOUNT ""
+
 #endif
 
 #ifdef CONFIG_RETPOLINE
 
                return 0;
 
        /* Is function call target a thunk? */
-       if (is_callthunk(func))
+       if (func && is_callthunk(func))
                return 0;
 
        memcpy(*pprog, tmpl, tmpl_size);
 
 
 static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
+       /*
+        * No need to translate into a callthunk. The trampoline does
+        * the depth accounting itself.
+        */
        return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
 }
 
        unsigned long size;
        unsigned long *ptr;
        void *trampoline;
-       void *ip;
+       void *ip, *dest;
        /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
        unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
        unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
        /* put in the call to the function */
        mutex_lock(&text_mutex);
        call_offset -= start_offset;
+       /*
+        * No need to translate into a callthunk. The trampoline does
+        * the depth accounting before the call already.
+        */
+       dest = ftrace_ops_get_func(ops);
        memcpy(trampoline + call_offset,
-              text_gen_insn(CALL_INSN_OPCODE,
-                            trampoline + call_offset,
-                            ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+              text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+              CALL_INSN_SIZE);
        mutex_unlock(&text_mutex);
 
        /* ALLOC_TRAMP flags lets us know we created it */
 
  */
 
 #include <linux/linkage.h>
+#include <asm/asm-offsets.h>
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
 #include <asm/export.h>
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 SYM_FUNC_START(__fentry__)
+       CALL_DEPTH_ACCOUNT
        RET
 SYM_FUNC_END(__fentry__)
 EXPORT_SYMBOL(__fentry__)
        /* save_mcount_regs fills in first two parameters */
        save_mcount_regs
 
+       CALL_DEPTH_ACCOUNT
+
        /* Stack - skipping return address of ftrace_caller */
        leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
        movq %rcx, RSP(%rsp)
        /* Only ops with REGS flag set should have CS register set */
        movq $0, CS(%rsp)
 
+       /* Account for the function call below */
+       CALL_DEPTH_ACCOUNT
+
 SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR
        call ftrace_stub
        save_mcount_regs 8
        /* save_mcount_regs fills in first two parameters */
 
+       CALL_DEPTH_ACCOUNT
+
 SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR
        /* Load the ftrace_ops into the 3rd parameter */
        /* regs go into 4th parameter */
        leaq (%rsp), %rcx
 
+       /* Account for the function call below */
+       CALL_DEPTH_ACCOUNT
+
 SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
        ANNOTATE_NOENDBR
        call ftrace_stub
        int3
 .Ldo_rebalance:
        add $8, %rsp
-       RET
+       ALTERNATIVE __stringify(RET), \
+                   __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+                   X86_FEATURE_CALL_DEPTH
 
 SYM_FUNC_END(ftrace_regs_caller)
 STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 SYM_FUNC_START(__fentry__)
+       CALL_DEPTH_ACCOUNT
+
        cmpq $ftrace_stub, ftrace_trace_function
        jnz trace
 
        int3
 .Ldo_rop:
        mov %rdi, (%rsp)
-       RET
+       ALTERNATIVE __stringify(RET), \
+                   __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+                   X86_FEATURE_CALL_DEPTH
 SYM_CODE_END(return_to_handler)
 #endif
 
 #include <linux/memory.h>
 #include <linux/sort.h>
 #include <asm/extable.h>
+#include <asm/ftrace.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
        prog = image;
 
        EMIT_ENDBR();
+       /*
+        * This is the direct-call trampoline, as such it needs accounting
+        * for the __fentry__ call.
+        */
+       x86_call_depth_emit_accounting(&prog, NULL);
        EMIT1(0x55);             /* push rbp */
        EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
        EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
 
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-noinline __noclone static void trace_direct_tramp(void) { }
+#ifndef CALL_DEPTH_ACCOUNT
+#define CALL_DEPTH_ACCOUNT ""
+#endif
+
+noinline __noclone static void trace_direct_tramp(void)
+{
+       asm(CALL_DEPTH_ACCOUNT);
+}
 #endif
 
 /*
 
 #include <linux/kthread.h>
 #include <linux/ftrace.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-branch.h>
 
 extern void my_direct_func1(void);
 extern void my_direct_func2(void);
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      call my_direct_func1\n"
 "      leave\n"
 "      .size           my_tramp1, .-my_tramp1\n"
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      call my_direct_func2\n"
 "      leave\n"
        ASM_RET
 
 #include <linux/kthread.h>
 #include <linux/ftrace.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-branch.h>
 
 extern void my_direct_func1(unsigned long ip);
 extern void my_direct_func2(unsigned long ip);
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      pushq %rdi\n"
 "      movq 8(%rbp), %rdi\n"
 "      call my_direct_func1\n"
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      pushq %rdi\n"
 "      movq 8(%rbp), %rdi\n"
 "      call my_direct_func2\n"
 
 #include <linux/ftrace.h>
 #include <linux/sched/stat.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-branch.h>
 
 extern void my_direct_func(unsigned long ip);
 
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      pushq %rdi\n"
 "      movq 8(%rbp), %rdi\n"
 "      call my_direct_func\n"
 
 #include <linux/mm.h> /* for handle_mm_fault() */
 #include <linux/ftrace.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-branch.h>
 
 extern void my_direct_func(struct vm_area_struct *vma,
                           unsigned long address, unsigned int flags);
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      pushq %rdi\n"
 "      pushq %rsi\n"
 "      pushq %rdx\n"
 
 #include <linux/sched.h> /* for wake_up_process() */
 #include <linux/ftrace.h>
 #include <asm/asm-offsets.h>
+#include <asm/nospec-branch.h>
 
 extern void my_direct_func(struct task_struct *p);
 
        ASM_ENDBR
 "      pushq %rbp\n"
 "      movq %rsp, %rbp\n"
+       CALL_DEPTH_ACCOUNT
 "      pushq %rdi\n"
 "      call my_direct_func\n"
 "      popq %rdi\n"