#define RET_DEPTH_INIT_FROM_CALL       0xfc00000000000000ULL
 #define RET_DEPTH_CREDIT               0xffffffffffffffffULL
 
+#ifdef CONFIG_CALL_THUNKS_DEBUG /* debug build: each macro is one incq on a %gs-relative counter */
+# define CALL_THUNKS_DEBUG_INC_CALLS                           \
+       incq    %gs:__x86_call_count;
+# define CALL_THUNKS_DEBUG_INC_RETS                            \
+       incq    %gs:__x86_ret_count;
+# define CALL_THUNKS_DEBUG_INC_STUFFS                          \
+       incq    %gs:__x86_stuffs_count;
+# define CALL_THUNKS_DEBUG_INC_CTXSW                           \
+       incq    %gs:__x86_ctxsw_count;
+#else /* !CONFIG_CALL_THUNKS_DEBUG: all four macros expand to nothing */
+# define CALL_THUNKS_DEBUG_INC_CALLS
+# define CALL_THUNKS_DEBUG_INC_RETS
+# define CALL_THUNKS_DEBUG_INC_STUFFS
+# define CALL_THUNKS_DEBUG_INC_CTXSW
+#endif /* CONFIG_CALL_THUNKS_DEBUG */
+
 #if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
 
 #include <asm/asm-offsets.h>
#define RESET_CALL_DEPTH_FROM_CALL                             \
        mov     $0xfc, %rax;                                    \
        shl     $56, %rax;                                      \
-       movq    %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+       movq    %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);   \
+       CALL_THUNKS_DEBUG_INC_CALLS /* call counter; empty unless CONFIG_CALL_THUNKS_DEBUG */
 
 #define INCREMENT_CALL_DEPTH                                   \
-       sarq    $5, %gs:pcpu_hot + X86_call_depth;
+       sarq    $5, %gs:pcpu_hot + X86_call_depth;              \
+       CALL_THUNKS_DEBUG_INC_CALLS /* call counter; empty unless CONFIG_CALL_THUNKS_DEBUG */
 
 #define ASM_INCREMENT_CALL_DEPTH                               \
-       sarq    $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+       sarq    $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);     \
+       CALL_THUNKS_DEBUG_INC_CALLS /* call counter; empty unless CONFIG_CALL_THUNKS_DEBUG */
 
 #else
 #define CREDIT_CALL_DEPTH
+#define ASM_CREDIT_CALL_DEPTH
 #define RESET_CALL_DEPTH
 #define INCREMENT_CALL_DEPTH
+#define ASM_INCREMENT_CALL_DEPTH
 #define RESET_CALL_DEPTH_FROM_CALL
 #endif
 
        jnz     771b;                                   \
        /* barrier for jnz misprediction */             \
        lfence;                                         \
-       ASM_CREDIT_CALL_DEPTH
+       ASM_CREDIT_CALL_DEPTH                           \
+       CALL_THUNKS_DEBUG_INC_CTXSW
 #else
 /*
  * i386 doesn't unconditionally have LFENCE, as such it can't
 {
        x86_return_thunk = &__x86_return_skl;
 }
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DECLARE_PER_CPU(u64, __x86_call_count);   /* incremented by CALL_THUNKS_DEBUG_INC_CALLS */
+DECLARE_PER_CPU(u64, __x86_ret_count);    /* incremented by CALL_THUNKS_DEBUG_INC_RETS */
+DECLARE_PER_CPU(u64, __x86_stuffs_count); /* incremented by CALL_THUNKS_DEBUG_INC_STUFFS */
+DECLARE_PER_CPU(u64, __x86_ctxsw_count);  /* incremented by CALL_THUNKS_DEBUG_INC_CTXSW */
+#endif /* CONFIG_CALL_THUNKS_DEBUG */
 #else
 static inline void x86_set_skl_return_thunk(void) {}
 #endif
 
 
 #define pr_fmt(fmt) "callthunks: " fmt
 
+#include <linux/debugfs.h>
 #include <linux/kallsyms.h>
 #include <linux/memory.h>
 #include <linux/moduleloader.h>
 }
 __setup("debug-callthunks", debug_thunks);
 
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DEFINE_PER_CPU(u64, __x86_call_count);   /* printed as "C:" by callthunks_debug_show() */
+DEFINE_PER_CPU(u64, __x86_ret_count);    /* printed as "R:" by callthunks_debug_show() */
+DEFINE_PER_CPU(u64, __x86_stuffs_count); /* printed as "S:" by callthunks_debug_show() */
+DEFINE_PER_CPU(u64, __x86_ctxsw_count);  /* printed as "X:" by callthunks_debug_show() */
+EXPORT_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_call_count);
+#endif /* CONFIG_CALL_THUNKS_DEBUG */
+
 extern s32 __call_sites[], __call_sites_end[];
 
 struct thunk_desc {
        mutex_unlock(&text_mutex);
 }
 #endif /* CONFIG_MODULES */
+
+#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
+/*
+ * seq_file show callback for one per-CPU debugfs file.
+ *
+ * @m: seq_file being filled; m->private carries the CPU number as a
+ *     pointer-sized integer.
+ * @p: unused.
+ *
+ * Prints a single line with that CPU's counters:
+ *   C = __x86_call_count, R = __x86_ret_count,
+ *   S = __x86_stuffs_count, X = __x86_ctxsw_count.
+ *
+ * Always returns 0.
+ */
+static int callthunks_debug_show(struct seq_file *m, void *p)
+{
+       unsigned long cpu = (unsigned long)m->private;
+
+       /* The line ends at the newline; no trailing characters after it. */
+       seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n",
+                  per_cpu(__x86_call_count, cpu),
+                  per_cpu(__x86_ret_count, cpu),
+                  per_cpu(__x86_stuffs_count, cpu),
+                  per_cpu(__x86_ctxsw_count, cpu));
+       return 0;
+}
+
+/*
+ * open() handler: hand the inode's private data to single_open() so that
+ * callthunks_debug_show() receives it as m->private.
+ */
+static int callthunks_debug_open(struct inode *inode, struct file *file)
+{
+       void *cpu_cookie = inode->i_private;
+
+       return single_open(file, callthunks_debug_show, cpu_cookie);
+}
+
+static const struct file_operations dfs_ops = { /* ops for the per-CPU files; no .write handler */
+       .open           = callthunks_debug_open, /* sets up callthunks_debug_show() via single_open() */
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release, /* pairs with single_open() done in .open */
+};
+
+/*
+ * Create a "callthunks" debugfs directory (parent NULL) containing one
+ * "cpu<N>" file per possible CPU. The CPU number is passed as the file's
+ * private data and read back by callthunks_debug_show().
+ *
+ * The results of the debugfs calls are not checked. Always returns 0.
+ */
+static int __init callthunks_debugfs_init(void)
+{
+       struct dentry *dir;
+       unsigned long cpu;
+
+       dir = debugfs_create_dir("callthunks", NULL);
+       for_each_possible_cpu(cpu) {
+               void *arg = (void *)cpu;
+               /* "cpu" + up to 20 decimal digits of a 64-bit value + NUL */
+               char name[24];
+
+               /* Bounded write: never runs past the end of name[]. */
+               snprintf(name, sizeof(name), "cpu%lu", cpu);
+               debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
+       }
+       return 0;
+}
+__initcall(callthunks_debugfs_init);
+#endif
 
        .align 64
 SYM_FUNC_START(__x86_return_skl)
        ANNOTATE_NOENDBR
-       /* Keep the hotpath in a 16byte I-fetch */
+       /*
+        * Keep the hotpath in a 16byte I-fetch for the non-debug
+        * case.
+        */
+       CALL_THUNKS_DEBUG_INC_RETS
        shlq    $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
        jz      1f
        ANNOTATE_UNRET_SAFE
        ret
        int3
 1:
+       CALL_THUNKS_DEBUG_INC_STUFFS
        .rept   16
        ANNOTATE_INTRA_FUNCTION_CALL
        call    2f