#ifndef _SPARC_DTRACE_ARCH_H
#define _SPARC_DTRACE_ARCH_H
+#include <linux/module.h>
+
typedef uint32_t asm_instr_t;
-#define SDT_TRAMP_SIZE 11
-#define DTRACE_PDATA_SIZE 64
-#define DTRACE_PDATA_EXTRA (dtrace_sdt_nprobes * SDT_TRAMP_SIZE * \
- sizeof(asm_instr_t))
-#define DTRACE_PDATA_MAXSIZE (DTRACE_PDATA_SIZE + DTRACE_PDATA_EXTRA)
+/*
+ * Maximum size (in instruction count) of SDT and FBT trampolines.
+ */
+#define SDT_TRAMP_SIZE 11
+#define FBT_TRAMP_SIZE 13
+
+/*
+ * Maximum number of SDT and FBT probes. The actual number available to
+ * DTrace may be lower due to runtime filtering of troublesome functions.
+ */
+#define DTRACE_SDT_MAX(mp) ((mp)->sdt_probec)
+#define DTRACE_FBT_MAX(mp) ((mp)->num_ftrace_callsites)
+
+/*
+ * The following macros are used to partition the PDATA memory block. The SDT
+ * trampolines are stored first, followed by the FBT trampolines.
+ *
+ * DTRACE_PD_SDT_OFF:
+ * Offset (in the PDATA memory block) for space to store SDT trampolines.
+ * DTRACE_PD_FBT_OFF:
+ * Offset (in the PDATA memory block) for space to store FBT trampolines.
+ * DTRACE_PD_MAXSIZE:
+ * Maximum size of the PDATA memory block (if no SDT or FBT probes get
+ * filtered out).
+ * DTRACE_PD_MAXSIZE_KERNEL:
+ * Maximum size of the PDATA memory block for the kernel pseudo-module.
+ * There is a separate macro for this because (at boot time) the maximum
+ * number of SDT and FBT probes is stored in global constants. When the
+ * kernel pseudo-module is initialized, the value of those constants is
+ * assigned to the appropriate module struct members so that the macros
+ * above (DTRACE_SDT_MAX and DTRACE_FBT_MAX) can be used after that point.
+ */
+#define DTRACE_PD_SDT_OFF_(sc, fc) 0
+#define DTRACE_PD_SDT_OFF(mp) DTRACE_PD_SDT_OFF_(DTRACE_SDT_MAX(mp), \
+ DTRACE_FBT_MAX(mp))
+#define DTRACE_PD_FBT_OFF_(sc, fc) (DTRACE_PD_SDT_OFF_((sc), (fc)) + \
+ (sc) * SDT_TRAMP_SIZE * \
+ sizeof(asm_instr_t))
+#define DTRACE_PD_FBT_OFF(mp) DTRACE_PD_FBT_OFF_(DTRACE_SDT_MAX(mp), \
+ DTRACE_FBT_MAX(mp))
+#define DTRACE_PD_MAXSIZE_(sc, fc) (DTRACE_PD_FBT_OFF_((sc), (fc)) + \
+ (fc) * FBT_TRAMP_SIZE * \
+ sizeof(asm_instr_t))
+#define DTRACE_PD_MAXSIZE(mp) DTRACE_PD_MAXSIZE_(DTRACE_SDT_MAX(mp), \
+ DTRACE_FBT_MAX(mp))
+
+#define DTRACE_PD_MAXSIZE_KERNEL DTRACE_PD_MAXSIZE_(dtrace_sdt_nprobes, \
+ dtrace_fbt_nfuncs)
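+
+/*
+ * Worked example with hypothetical counts sc = 2, fc = 3 and
+ * sizeof(asm_instr_t) == 4:
+ *	DTRACE_PD_SDT_OFF_(2, 3) == 0
+ *	DTRACE_PD_FBT_OFF_(2, 3) == 2 * 11 * 4 == 88
+ *	DTRACE_PD_MAXSIZE_(2, 3) == 88 + 3 * 13 * 4 == 244
+ */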
#endif /* _SPARC_DTRACE_ARCH_H */
--- /dev/null
+/*
+ * FILE: dtrace_fbt.c
+ * DESCRIPTION: Dynamic Tracing: FBT registration code (arch-specific)
+ *
+ * Copyright (C) 2010-2016 Oracle Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/kallsyms.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/dtrace_os.h>
+#include <linux/dtrace_fbt.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+#include <asm/dtrace_arch.h>
+#include <asm/sections.h>
+
+#define ASM_REG_G0 0
+#define ASM_REG_G1 1
+#define ASM_REG_O0 8
+#define ASM_REG_O1 9
+#define ASM_REG_O2 10
+#define ASM_REG_O3 11
+#define ASM_REG_O4 12
+#define ASM_REG_O5 13
+#define ASM_REG_O6 14
+#define ASM_REG_O7 15
+#define ASM_REG_I0 24
+#define ASM_REG_I1 25
+#define ASM_REG_I2 26
+#define ASM_REG_I3 27
+#define ASM_REG_I4 28
+#define ASM_REG_I7 31
+#define ASM_REG_L0 16
+#define ASM_REG_L1 17
+#define ASM_REG_L2 18
+#define ASM_REG_L3 19
+#define ASM_REG_PC 5
+
+#define ASM_REG_ISOUTPUT(r) ((r) >= 8 && (r) < 16)
+#define ASM_REG_ISINPUT(r) ((r) >= 24 && (r) < 32)
+
+#define ASM_OP_MASK 0xc0000000
+#define ASM_OP_SHIFT 30
+#define ASM_OP(val) ((val) & ASM_OP_MASK)
+
+#define ASM_SIMM13_MASK 0x1fff
+#define ASM_SIMM13_MAX ((int32_t)0xfff)
+#define ASM_IMM22_MASK 0x3fffff
+#define ASM_IMM22_SHIFT 10
+
+#define ASM_OP0 (((uint32_t)0) << ASM_OP_SHIFT)
+#define ASM_OP2 (((uint32_t)2) << ASM_OP_SHIFT)
+
+#define ASM_FMT3_OP3_SHIFT 19
+#define ASM_FMT3_OP_MASK 0xc1f80000
+#define ASM_FMT3_OP(val) ((val) & ASM_FMT3_OP_MASK)
+
+#define ASM_FMT3_RD_SHIFT 25
+#define ASM_FMT3_RD_MASK (0x1f << ASM_FMT3_RD_SHIFT)
+#define ASM_FMT3_RD(val) \
+ (((val) & ASM_FMT3_RD_MASK) >> ASM_FMT3_RD_SHIFT)
+
+#define ASM_FMT3_RS1_SHIFT 14
+#define ASM_FMT3_RS1_MASK (0x1f << ASM_FMT3_RS1_SHIFT)
+#define ASM_FMT3_RS1(val) \
+ (((val) & ASM_FMT3_RS1_MASK) >> ASM_FMT3_RS1_SHIFT)
+#define ASM_FMT3_RS1_SET(val, rs1) \
+ (val) = ((val) & ~ASM_FMT3_RS1_MASK) | ((rs1) << ASM_FMT3_RS1_SHIFT)
+
+#define ASM_FMT3_RS2_SHIFT 0
+#define ASM_FMT3_RS2_MASK (0x1f << ASM_FMT3_RS2_SHIFT)
+#define ASM_FMT3_RS2(val) \
+ (((val) & ASM_FMT3_RS2_MASK) >> ASM_FMT3_RS2_SHIFT)
+#define ASM_FMT3_RS2_SET(val, rs2) \
+ (val) = ((val) & ~ASM_FMT3_RS2_MASK) | ((rs2) << ASM_FMT3_RS2_SHIFT)
+
+#define ASM_FMT3_IMM_SHIFT 13
+#define ASM_FMT3_IMM (1 << ASM_FMT3_IMM_SHIFT)
+#define ASM_FMT3_SIMM13_MASK ASM_SIMM13_MASK
+
+#define ASM_FMT3_ISIMM(val) ((val) & ASM_FMT3_IMM)
+#define ASM_FMT3_SIMM13(val) ((val) & ASM_FMT3_SIMM13_MASK)
+
+#define ASM_FMT2_OP2_SHIFT 22
+#define ASM_FMT2_OP2_MASK (0x7 << ASM_FMT2_OP2_SHIFT)
+#define ASM_FMT2_RD_SHIFT 25
+
+#define ASM_FMT2_OP2_BPCC (0x01 << ASM_FMT2_OP2_SHIFT)
+#define ASM_FMT2_OP2_BCC (0x02 << ASM_FMT2_OP2_SHIFT)
+#define ASM_FMT2_OP2_BPR (0x03 << ASM_FMT2_OP2_SHIFT)
+#define ASM_FMT2_OP2_SETHI (0x04 << ASM_FMT2_OP2_SHIFT)
+
+#define ASM_FMT2_COND_SHIFT 25
+#define ASM_FMT2_COND_BA (0x8 << ASM_FMT2_COND_SHIFT)
+#define ASM_FMT2_COND_BL (0x3 << ASM_FMT2_COND_SHIFT)
+#define ASM_FMT2_COND_BGE (0xb << ASM_FMT2_COND_SHIFT)
+
+#define ASM_OP_SAVE (ASM_OP2 | (0x3c << ASM_FMT3_OP3_SHIFT))
+#define ASM_OP_SETHI (ASM_OP0 | ASM_FMT2_OP2_SETHI)
+
+#define ASM_SETHI(val, reg) \
+ (ASM_OP_SETHI | (reg << ASM_FMT2_RD_SHIFT) | \
+ ((val >> ASM_IMM22_SHIFT) & ASM_IMM22_MASK))
+
+#define ASM_NOP ASM_SETHI(0, 0)
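+
+/*
+ * ASM_SETHI(0, 0) encodes "sethi %hi(0), %g0", i.e. 0x01000000, the
+ * canonical SPARC nop.
+ */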
+
+/*
+ * We're only going to treat a save as safe if
+ * (a) both rs1 and rd are %sp and
+ * (b) if the instruction has a simm, the value isn't 0.
+ */
+#define ASM_IS_SAVE(instr) \
+ (ASM_FMT3_OP(instr) == ASM_OP_SAVE && \
+ ASM_FMT3_RD(instr) == ASM_REG_O6 && \
+ ASM_FMT3_RS1(instr) == ASM_REG_O6 && \
+ !(ASM_FMT3_ISIMM(instr) && ASM_FMT3_SIMM13(instr) == 0))
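+
+/*
+ * Example: "save %sp, -192, %sp" (0x9de3bf40) passes ASM_IS_SAVE, whereas
+ * a save with a zero immediate, or with rd/rs1 other than %sp, does not.
+ */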
+
+#define ASM_IS_NOP(instr) ((instr) == ASM_NOP)
+
+#define ASM_MOD_INPUTS(instr) (ASM_OP(instr) == ASM_OP2 && \
+ ASM_REG_ISINPUT(ASM_FMT3_RD(instr)))
+#define ASM_MOD_OUTPUTS(instr) (ASM_OP(instr) == ASM_OP2 && \
+ ASM_REG_ISOUTPUT(ASM_FMT3_RD(instr)))
+
+#define BL_SENTRY(tp, nm) extern tp nm;
+#define BL_DENTRY(tp, nm)
+#include "fbt_blacklist.h"
+#undef BL_DENTRY
+#undef BL_SENTRY
+
+typedef struct _bl_entry {
+ void *addr;
+ const char *name;
+} bl_entry;
+
+static bl_entry blacklist[] = {
+#define BL_SENTRY(tp, nm) { (void *)&nm, __stringify(nm) },
+#define BL_DENTRY(tp, nm) { NULL, __stringify(nm) },
+#include "fbt_blacklist.h"
+#undef BL_DENTRY
+#undef BL_SENTRY
+};
+static int blacklist_len = ARRAY_SIZE(blacklist);
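+
+/*
+ * Illustration: a BL_SENTRY(tp, nm) line in fbt_blacklist.h expands above
+ * to { (void *)&nm, "nm" }, while a BL_DENTRY(tp, nm) line yields
+ * { NULL, "nm" } so that dtrace_fbt_init() can resolve the address at
+ * runtime via kallsyms_lookup_name().
+ */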
+
+static int bl_entry_cmp(const void *xx, const void *yy)
+{
+ bl_entry *x = (bl_entry *)xx;
+ bl_entry *y = (bl_entry *)yy;
+
+ return x->addr > y->addr ? 1
+ : x->addr < y->addr ? -1 : 0;
+}
+
+void dtrace_fbt_init(fbt_add_probe_fn fbt_add_probe)
+{
+ loff_t pos;
+ struct kallsym_iter sym;
+ size_t blpos = 0;
+ asm_instr_t *paddr = NULL;
+
+ /*
+ * Look up any unresolved symbols in the blacklist, and sort the list
+ * by ascending address.
+ */
+ for (pos = 0; pos < blacklist_len; pos++) {
+ bl_entry *be = &blacklist[pos];
+
+ if (!be->addr)
+ be->addr = (void *)kallsyms_lookup_name(be->name);
+ }
+ sort(blacklist, blacklist_len, sizeof(bl_entry), bl_entry_cmp, NULL);
+
+ pos = 0;
+ kallsyms_iter_reset(&sym, 0);
+ while (kallsyms_iter_update(&sym, pos++)) {
+ asm_instr_t *addr, *end;
+
+ /*
+ * There is no point considering non-function symbols for FBT,
+ * or symbols that have a zero size. We could consider weak
+ * symbols but that gets quite complicated and there is no
+ * demand for that (so far).
+ */
+ if (sym.type != 'T' && sym.type != 't')
+ continue;
+ if (!sym.size)
+ continue;
+
+ /*
+ * The symbol must be at a properly aligned address.
+ */
+ if (!IS_ALIGNED(sym.value, 4))
+ continue;
+
+ /*
+ * Only core kernel symbols are of interest here.
+ */
+ if (!core_kernel_text(sym.value))
+ continue;
+
+ /*
+ * See if the symbol is on the blacklist. Since both lists are
+ * sorted by ascending address we can use concurrent traversal
+ * of both lists.
+ */
+ while (blpos < blacklist_len &&
+ blacklist[blpos].addr < (void *)sym.value)
+ blpos++;
+
+ if (blpos < blacklist_len &&
+ blacklist[blpos].addr == (void *)sym.value)
+ continue;
+
+ /*
+ * No FBT tracing for DTrace functions. Also weed out symbols
+ * that are not relevant here.
+ */
+ if (strncmp(sym.name, "dtrace_", 7) == 0)
+ continue;
+ if (strncmp(sym.name, "_GLOBAL_", 8) == 0)
+ continue;
+ if (strncmp(sym.name, "do_", 3) == 0)
+ continue;
+
+ addr = (asm_instr_t *)sym.value;
+ end = (asm_instr_t *)(sym.value + sym.size);
+
+ /*
+ * When there are multiple symbols for the same address we
+ * should link them together as probes that are associated with
+ * the same function. When a probe for that function is
+ * triggered, all the associated probes should fire.
+ *
+ * For now, we're ignoring all but the first symbol...
+ */
+ if (addr == paddr)
+ continue;
+ paddr = addr;
+
+ if (ASM_IS_SAVE(*addr)) {
+ asm_instr_t *ins = addr;
+
+ /*
+ * If there are other saves, this function has multiple
+ * entry points or some other complex construct - we'll
+ * skip it.
+ */
+ while (++ins < end) {
+ if (ASM_IS_SAVE(*ins))
+ break;
+ }
+ if (ins != end)
+ continue;
+
+ /*
+ * What we are really looking for is a sequence like:
+ * save %sp, <num>, %sp
+ * call _mcount
+ * nop
+ * but due to ftrace patching in executable code, that
+ * call actually gets rewritten as a NOP before we even
+ * get to looking at it. We depend on ftrace already
+ * to get a count of functions that are potential
+ * candidates for FBT.
+ */
+ if (!ASM_IS_NOP(*(addr + 1)))
+ continue;
+
+ /*
+ * We should be OK as long as the instruction in the
+ * delay slot after the call to the trampoline does not
+ * modify input or output registers.
+ */
+ if (!ASM_IS_NOP(*(addr + 2)) &&
+ (ASM_MOD_INPUTS(*(addr + 2)) ||
+ ASM_MOD_OUTPUTS(*(addr + 2))))
+ continue;
+
+ fbt_add_probe(dtrace_kmod, sym.name, FBT_ENTRY, 32,
+ addr + 1, NULL);
+ } else
+ continue;
+ }
+}
+EXPORT_SYMBOL(dtrace_fbt_init);
--- /dev/null
+BL_DENTRY(void *, read_tsc)
+BL_DENTRY(void *, notifier_call_chain)
+BL_SENTRY(typeof(__atomic_notifier_call_chain), __atomic_notifier_call_chain)
+BL_SENTRY(typeof(atomic_notifier_call_chain), atomic_notifier_call_chain)
+BL_SENTRY(typeof(__raw_notifier_call_chain), __raw_notifier_call_chain)
+BL_SENTRY(typeof(raw_notifier_call_chain), raw_notifier_call_chain)
+BL_SENTRY(typeof(getrawmonotonic64), getrawmonotonic64)
+BL_DENTRY(void *, update_fast_timekeeper)
+BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath)
+BL_DENTRY(void *, kprobe_exceptions_notify)
+BL_SENTRY(void *, notify_die)
+BL_SENTRY(void *, rcu_nmi_exit)
+BL_SENTRY(void *, rcu_nmi_enter)
+BL_SENTRY(void *, get_kprobe)
}
# ifdef CONFIG_DTRACE
- if (me->sdt_probec > 0)
- me->pdata = module_alloc(me->sdt_probec * SDT_TRAMP_SIZE *
- sizeof(asm_instr_t));
+ if (DTRACE_SDT_MAX(me) + DTRACE_FBT_MAX(me) > 0)
+ me->pdata = module_alloc(DTRACE_PD_MAXSIZE(me));
else
me->pdata = NULL;
# endif
typedef uint8_t asm_instr_t;
-#define DTRACE_PDATA_SIZE 64
-#define DTRACE_PDATA_EXTRA 0
-#define DTRACE_PDATA_MAXSIZE (DTRACE_PDATA_SIZE + DTRACE_PDATA_EXTRA)
+/*
+ * No additional memory needs to be allocated for the PDATA section on x86.
+ */
+#define DTRACE_PD_MAXSIZE(mp) (0)
-#define ASM_CALL_SIZE 5
+#define DTRACE_PD_MAXSIZE_KERNEL (0)
+
+#define ASM_CALL_SIZE 5
#endif /* _X86_DTRACE_ARCH_H */
#ifndef _X86_DTRACE_UTIL_H
#define _X86_DTRACE_UTIL_H
+#define DTRACE_INVOP_NOPS 0x0f /* 5-byte NOP sequence */
+#define DTRACE_INVOP_MOV_RSP_RBP 0x48 /* mov %rsp, %rbp = 48 89 e5 */
+#define DTRACE_INVOP_PUSH_BP 0x55 /* push %rbp = 55 */
+#define DTRACE_INVOP_NOP 0x90 /* nop = 90 */
+#define DTRACE_INVOP_LEAVE 0xc9 /* leave = c9 */
+#define DTRACE_INVOP_RET 0xc3 /* ret = c3 */
+
+#ifndef __ASSEMBLY__
+
#include <asm/ptrace.h>
extern int dtrace_invop_add(uint8_t (*func)(struct pt_regs *));
extern void dtrace_invop_enable(uint8_t *);
extern void dtrace_invop_disable(uint8_t *, uint8_t);
+#endif /* !__ASSEMBLY__ */
+
#endif /* _X86_DTRACE_UTIL_H */
--- /dev/null
+/*
+ * FILE: dtrace_fbt.c
+ * DESCRIPTION: Dynamic Tracing: FBT registration code (arch-specific)
+ *
+ * Copyright (C) 2010-2014 Oracle Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/kallsyms.h>
+#include <linux/dtrace_os.h>
+#include <linux/dtrace_fbt.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <asm/insn.h>
+#include <asm/sections.h>
+
+#define FBT_MOV_RSP_RBP_1 0x48
+#define FBT_MOV_RSP_RBP_2 0x89
+#define FBT_MOV_RSP_RBP_3 0xe5
+#define FBT_PUSHL_EBP 0x55
+#define FBT_NOP 0x90
+#define FBT_RET_IMM16 0xc2
+#define FBT_RET 0xc3
+#define FBT_LEAVE 0xc9
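+
+/*
+ * For reference: the frame-setup prologue this scanner looks for is the
+ * byte sequence 55 48 89 e5, i.e. "push %rbp; mov %rsp,%rbp".
+ */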
+
+#define BL_SENTRY(tp, nm) extern tp nm;
+#define BL_DENTRY(tp, nm)
+#include "fbt_blacklist.h"
+#undef BL_DENTRY
+#undef BL_SENTRY
+
+typedef struct _bl_entry {
+ void *addr;
+ const char *name;
+} bl_entry;
+
+static bl_entry blacklist[] = {
+#define BL_SENTRY(tp, nm) { (void *)&nm, __stringify(nm) },
+#define BL_DENTRY(tp, nm) { NULL, __stringify(nm) },
+#include "fbt_blacklist.h"
+#undef BL_DENTRY
+#undef BL_SENTRY
+};
+static int blacklist_len = ARRAY_SIZE(blacklist);
+
+static int bl_entry_cmp(const void *xx, const void *yy)
+{
+ bl_entry *x = (bl_entry *)xx;
+ bl_entry *y = (bl_entry *)yy;
+
+ if (x->addr > y->addr)
+ return 1;
+ else if (x->addr < y->addr)
+ return -1;
+ else
+ return 0;
+}
+
+void dtrace_fbt_init(fbt_add_probe_fn fbt_add_probe)
+{
+ loff_t pos;
+ struct kallsym_iter sym;
+ size_t blpos = 0;
+ asm_instr_t *paddr = NULL;
+
+ /*
+ * Look up any unresolved symbols in the blacklist, and sort the list
+ * by ascending address.
+ */
+ for (pos = 0; pos < blacklist_len; pos++) {
+ bl_entry *be = &blacklist[pos];
+
+ if (!be->addr)
+ be->addr = (void *)kallsyms_lookup_name(be->name);
+ }
+ sort(blacklist, blacklist_len, sizeof(bl_entry), bl_entry_cmp, NULL);
+
+ pos = 0;
+ kallsyms_iter_reset(&sym, 0);
+ while (kallsyms_iter_update(&sym, pos++)) {
+ asm_instr_t *addr, *end;
+ int state = 0;
+ void *fbtp = NULL;
+
+ /*
+ * There is no point considering non-function symbols for FBT,
+ * or symbols that have a zero size. We could consider weak
+ * symbols but that gets quite complicated and there is no
+ * demand for that (so far).
+ */
+ if (sym.type != 'T' && sym.type != 't')
+ continue;
+ if (!sym.size)
+ continue;
+
+ /*
+ * Only core kernel symbols are of interest here.
+ */
+ if (!core_kernel_text(sym.value))
+ continue;
+
+ /*
+ * See if the symbol is on the blacklist. Since both lists are
+ * sorted by ascending address we can use concurrent traversal
+ * of both lists.
+ */
+ while (blpos < blacklist_len &&
+ blacklist[blpos].addr < (void *)sym.value)
+ blpos++;
+ if (blpos < blacklist_len &&
+ blacklist[blpos].addr == (void *)sym.value)
+ continue;
+
+ /*
+ * No FBT tracing for DTrace functions. Also weed out symbols
+ * that are not relevant here.
+ */
+ if (strncmp(sym.name, "dtrace_", 7) == 0)
+ continue;
+ if (strncmp(sym.name, "_GLOBAL_", 8) == 0)
+ continue;
+ if (strncmp(sym.name, "do_", 3) == 0)
+ continue;
+ if (strncmp(sym.name, "xen_", 4) == 0)
+ continue;
+
+ addr = (asm_instr_t *)sym.value;
+ end = (asm_instr_t *)(sym.value + sym.size);
+
+ /*
+ * FIXME:
+ * When there are multiple symbols for the same address, we
+ * should link them together as probes associated with the
+ * same function. When a probe for that function is triggered,
+ * all associated probes should fire.
+ *
+ * For now, we ignore duplicates.
+ */
+ if (addr == paddr)
+ continue;
+ paddr = addr;
+
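+ /*
+ * Walk the function one decoded instruction at a time with a
+ * small state machine:
+ * 0 = at function entry, expecting "push %rbp"
+ * 1 = "push %rbp" seen, expecting "mov %rsp,%rbp"
+ * 2 = scanning for a "ret" to probe the function return
+ * 3 = done with this function
+ */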
+ while (addr < end) {
+ struct insn insn;
+
+ switch (state) {
+ case 0: /* start of function */
+ if (*addr == FBT_PUSHL_EBP)
+ state = 1;
+ else
+ state = 2;
+ break;
+ case 1: /* push %rbp seen */
+ if (*addr == FBT_MOV_RSP_RBP_1 &&
+ *(addr + 1) == FBT_MOV_RSP_RBP_2 &&
+ *(addr + 2) == FBT_MOV_RSP_RBP_3)
+ fbtp = fbt_add_probe(
+ dtrace_kmod, sym.name,
+ FBT_ENTRY, *addr, addr, NULL);
+ state = 2;
+ break;
+ case 2: /* look for ret */
+ if (*addr == FBT_RET &&
+ (*(addr + 1) == FBT_PUSHL_EBP ||
+ *(addr + 1) == FBT_NOP)) {
+ fbt_add_probe(
+ dtrace_kmod, sym.name,
+ FBT_RETURN, *addr, addr, fbtp);
+ state = 3;
+ }
+ break;
+ }
+
+ if (state == 3)
+ break;
+
+ kernel_insn_init(&insn, addr, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+
+ addr += insn.length;
+ }
+ }
+}
+EXPORT_SYMBOL(dtrace_fbt_init);
#include <linux/slab.h>
#include <asm/insn.h>
#include <asm/ptrace.h>
+#include <asm/dtrace_arch.h>
+#include <asm/dtrace_util.h>
/*
 * Move the instruction pointer forward to the next instruction, effectively
static dtrace_invop_hdlr_t *dtrace_invop_hdlrs;
-#define INVOP_TRAP_INSTR 0xf0
+#if 1
+# define INVOP_TRAP_INSTR 0xf0
+#else
+# define INVOP_TRAP_INSTR 0xcc
+#endif
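+
+/*
+ * 0xf0 is the LOCK prefix, which raises the invalid-opcode fault for the
+ * instruction patterns we patch in; 0xcc (int3) is the alternative
+ * single-byte breakpoint encoding (see the DIE_INT3 handling below).
+ */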
/*
* Trap notification handler.
return NOTIFY_OK | NOTIFY_STOP_MASK;
}
case DIE_GPF: {
- struct insn insn;
-
- kernel_insn_init(&insn, (void *)dargs->regs->ip, MAX_INSN_SIZE);
- insn_get_length(&insn);
-
/*
- * It would seem that the invalid opcode generated by the LOCK
- * prefix (0xF0) used for SDT probe points may get delivered as
- * a general protection failure on Xen. We need to ignore them
- * as general protection failures...
+ * This gets messy... For one, some versions of Xen deliver
+ * the invalid opcode generated by the LOCK prefix (0xf0) as a
+ * GP fault rather than a UD fault. So, we need to figure out
+ * whether the GP we're processing here is one of those
+ * misreported faults.
+ *
+ * But, it is possible that the instruction that caused the
+ * fault (0xf0) gets overwritten by a different CPU with the
+ * original valid opcode before we get to look at it here,
+ * which makes it kind of hard to recognize.
+ *
+ * So... we're going to assume that a GP fault that gets
+ * triggered for the LOCK prefix opcode (0xf0) *or* for an
+ * opcode that can get overwritten with the LOCK prefix for
+ * probing is actually a UD fault.
+ *
+ * If we are wrong, the handlers will simply see a fault that
+ * isn't theirs, and return without consuming it. And in that
+ * case, the kernel will report a UD fault that may have been
+ * a real GP fault... Sorry.
*/
- if (insn.length != 5 || insn.prefixes.bytes[0] != 0xf0 ||
- insn.opcode.bytes[0] != 0x90) {
+ asm_instr_t opc = *(asm_instr_t *)dargs->regs->ip;
+
+ if (opc != 0xf0 && opc != 0x55 && opc != 0xc3) {
if (!DTRACE_CPUFLAG_ISSET(CPU_DTRACE_NOFAULT))
return NOTIFY_DONE;
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
- dargs->regs->ip += insn.length;
+ dargs->regs->ip += 1;
return NOTIFY_OK | NOTIFY_STOP_MASK;
}
break;
}
- if (rval != 0) {
- dargs->regs->ip += rval;
+ switch (rval) {
+ case DTRACE_INVOP_NOPS:
+ /*
+ * Probe points are encoded as a single-byte NOP,
+ * followed by a multi-byte NOP. We can therefore
+ * safely report this case as equivalent to a single
+ * NOP that needs to be emulated. Execution will
+ * continue with the multi-byte NOP.
+ */
+ rval = DTRACE_INVOP_NOP;
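+ /* fall through */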
+ case DTRACE_INVOP_MOV_RSP_RBP:
+ case DTRACE_INVOP_NOP:
+ case DTRACE_INVOP_PUSH_BP:
+ case DTRACE_INVOP_RET:
+ return notifier_from_errno(-rval);
+ default:
+ /*
+ * This must not have been a trap triggered from a
+ * probe point. Re-adjust the instruction pointer
+ * and let someone else deal with it...
+ */
+ dargs->regs->ip++;
+ }
- return NOTIFY_OK | NOTIFY_STOP_MASK;
+ return NOTIFY_DONE;
+ }
+ case DIE_INT3: {
+ dtrace_invop_hdlr_t *hdlr;
+ int rval = 0;
+
+ /*
+ * Let's assume that this is a DTrace probe firing, so we need
+ * to adjust the IP (to be consistent with #UD processing) so
+ * that it reflects the address of the #BP rather than the
+ * following instruction.
+ *
+ * If it turns out that this was not DTrace related, we'll have
+ * to reverse this adjustment.
+ */
+ dargs->regs->ip--;
+ for (hdlr = dtrace_invop_hdlrs; hdlr != NULL;
+ hdlr = hdlr->dtih_next) {
+ rval = hdlr->dtih_func(dargs->regs);
+ if (rval != 0)
+ break;
+ }
+
+ switch (rval) {
+ case DTRACE_INVOP_NOPS:
+ /*
+ * Probe points are encoded as a single-byte NOP,
+ * followed by a multi-byte NOP. We can therefore
+ * safely report this case as equivalent to a single
+ * NOP that needs to be emulated. Execution will
+ * continue with the multi-byte NOP.
+ */
+ rval = DTRACE_INVOP_NOP;
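+ /* fall through */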
+ case DTRACE_INVOP_MOV_RSP_RBP:
+ case DTRACE_INVOP_NOP:
+ case DTRACE_INVOP_PUSH_BP:
+ case DTRACE_INVOP_RET:
+ return notifier_from_errno(-rval);
+ default:
+ /*
+ * This must not have been a trap triggered from a
+ * probe point. Re-adjust the instruction pointer
+ * and let someone else deal with it...
+ */
+ dargs->regs->ip++;
}
}
default:
#include <asm/context_tracking.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
+#include <asm/dtrace_util.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
+#ifdef CONFIG_DTRACE
+ test %rax,%rax
+ jnz dtrace_error_exit
+#endif
+
/* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
jmp paranoid_exit
END(\sym)
.endm
+#ifdef CONFIG_DTRACE
+ENTRY(dtrace_error_exit)
+ DEFAULT_FRAME
+ RESTORE_EXTRA_REGS
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+
+#ifdef CONFIG_PREEMPT
+ /* Interrupts are off */
+ /* Check if we need preemption */
+ bt $9,EFLAGS(%rsp) /* interrupts were off? */
+ jnc 1f
+0: cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz 1f
+ call preempt_schedule_irq
+ jmp 0b
+1:
+#endif
+
+ /*
+ * The iretq could re-enable interrupts:
+ */
+ TRACE_IRQS_IRETQ
+
+ negq %rax
+
+ cmpl $DTRACE_INVOP_MOV_RSP_RBP,%eax
+ je dtrace_emu_mov
+ cmpl $DTRACE_INVOP_PUSH_BP,%eax
+ je dtrace_emu_push
+ cmpl $DTRACE_INVOP_LEAVE,%eax
+ je dtrace_emu_leave
+ cmpl $DTRACE_INVOP_NOP,%eax
+ je dtrace_emu_nop
+ cmpl $DTRACE_INVOP_RET,%eax
+ je dtrace_emu_ret
+
+ leaq dtrace_error_msg(%rip),%rdi
+ movq %rax,%rsi
+ movq (%rsp),%rdx
+ call printk
+
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+ INTERRUPT_RETURN
+
+dtrace_emu_mov:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+
+ /* Emulate "mov %rsp, %rbp" instruction. */
+ pushq %rax /* push temp */
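+ /*
+ * At this point the stack holds the hardware trap frame with
+ * our temp %rax pushed on top:
+ * 0(%rsp) saved %rax, 8(%rsp) RIP, 16(%rsp) CS,
+ * 24(%rsp) RFLAGS, 32(%rsp) RSP, 40(%rsp) SS
+ */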
+ movq 8(%rsp),%rax /* load calling RIP */
+ addq $3,%rax /* increment over trapping instr */
+ movq %rax,8(%rsp) /* store calling RIP */
+ movq 32(%rsp),%rbp /* load %rsp into %rbp */
+ popq %rax /* pop off temp */
+
+ INTERRUPT_RETURN
+
+dtrace_emu_push:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+
+ /*
+ * Emulate a "pushq %rbp" instruction. We need to move the stack down
+ * to make room for the extra address getting pushed.
+ */
+ subq $16,%rsp /* make room for %rbp */
+ pushq %rax /* push temp */
+ movq 24(%rsp),%rax /* load calling RIP */
+ addq $1,%rax /* increment over trapping instr */
+ movq %rax,8(%rsp) /* store calling RIP */
+ movq 32(%rsp),%rax /* load calling CS */
+ movq %rax,16(%rsp) /* store calling CS */
+ movq 40(%rsp),%rax /* load calling RFLAGS */
+ movq %rax,24(%rsp) /* store calling RFLAGS */
+ movq 48(%rsp),%rax /* load calling RSP */
+ subq $8,%rax /* make room for %rbp */
+ movq %rax,32(%rsp) /* store calling RSP */
+ movq 56(%rsp),%rax /* load calling SS */
+ movq %rax,40(%rsp) /* store calling SS */
+ movq 32(%rsp),%rax /* reload calling RSP */
+ movq %rbp,(%rax) /* store %rbp there */
+ popq %rax /* pop off temp */
+
+ INTERRUPT_RETURN
+
+dtrace_emu_nop:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+
+ /* Emulate a "nop" instruction. */
+ incq (%rsp)
+
+ INTERRUPT_RETURN
+
+dtrace_emu_leave:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+
+ /*
+ * Emulate a "leave" instruction. This is equivalent to the sequence:
+ * movq %rbp,%rsp
+ * popq %rbp
+ * We can use the fact that on x86_64 %rsp is saved explicitly, so we
+ * do not need to move any data around.
+ */
+ pushq %rax /* push temp */
+ movq 8(%rsp),%rax /* load calling RIP */
+ addq $1,%rax /* increment over trapping instr */
+ movq %rax,8(%rsp) /* store calling RIP */
+ movq (%rbp),%rax /* get new %rbp */
+ addq $8,%rbp /* adjust new %rsp */
+ movq %rbp,32(%rsp) /* store new %rsp */
+ movq %rax,%rbp /* set new %rbp */
+ popq %rax /* pop off temp */
+
+ INTERRUPT_RETURN
+
+dtrace_emu_ret:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+
+ /* Emulate a "ret" instruction. */
+ pushq %rax /* push temp */
+ movq 32(%rsp),%rax /* load %rsp */
+ movq (%rax),%rax /* load calling RIP */
+ movq %rax,8(%rsp) /* store calling RIP */
+ addq $8,32(%rsp) /* adjust new %rsp */
+ popq %rax /* pop off temp */
+
+ INTERRUPT_RETURN
+
+ CFI_ENDPROC
+END(dtrace_error_exit)
+
+dtrace_error_msg:
+ .asciz "DTRACE: non-zero (%x) return from trap at %lx\n"
+#endif
+
#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
--- /dev/null
+BL_SENTRY(void *, update_vsyscall)
+BL_DENTRY(void *, read_tsc)
+BL_DENTRY(void *, notifier_call_chain)
+BL_SENTRY(typeof(__atomic_notifier_call_chain), __atomic_notifier_call_chain)
+BL_SENTRY(typeof(atomic_notifier_call_chain), atomic_notifier_call_chain)
+BL_SENTRY(typeof(__raw_notifier_call_chain), __raw_notifier_call_chain)
+BL_SENTRY(typeof(raw_notifier_call_chain), raw_notifier_call_chain)
+BL_DENTRY(void *, timekeeping_get_ns)
+BL_SENTRY(typeof(getrawmonotonic64), getrawmonotonic64)
+BL_DENTRY(void *, update_fast_timekeeper)
+BL_DENTRY(void *, timekeeping_update.clone.3)
+BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath)
+BL_SENTRY(typeof(poke_int3_handler), poke_int3_handler) /* MAYBE */
+BL_SENTRY(void *, ftrace_int3_handler) /* MAYBE */
+BL_SENTRY(void *, kprobe_int3_handler) /* MAYBE */
+BL_DENTRY(void *, set_intr_gate_ist) /* MAYBE */
+BL_DENTRY(void *, ist_enter) /* MAYBE */
+BL_DENTRY(void *, ist_exit) /* MAYBE */
+BL_DENTRY(void *, hw_breakpoint_exceptions_notify)
+BL_DENTRY(void *, kprobe_exceptions_notify)
+BL_SENTRY(void *, notify_die)
+BL_SENTRY(void *, rcu_nmi_exit)
+BL_SENTRY(void *, rcu_nmi_enter)
+BL_SENTRY(void *, get_kprobe)
+BL_DENTRY(void *, xen_timer_interrupt)
--- /dev/null
+/* Copyright (C) 2015 Oracle, Inc. */
+
+#ifndef _LINUX_DTRACE_FBT_H
+#define _LINUX_DTRACE_FBT_H
+
+#include <linux/module.h>
+#include <asm/dtrace_arch.h>
+
+extern unsigned long dtrace_fbt_nfuncs __attribute__((weak));
+
+/*
+ * Prototype for callback function that handles the actual creation of FBT
+ * probes.
+ *
+ * Arguments to pass:
+ * - Pointer to module the probe will belong to
+ * - function name
+ * - probe type (FBT_ENTRY or FBT_RETURN)
+ * - probe subtype (arch-specific)
+ * - address (location of the probe)
+ * - return value from previous callback invocation
+ * Returns:
+ * - generic pointer (only to be used to pass back in)
+ */
+#define FBT_ENTRY 0
+#define FBT_RETURN 1
+
+typedef void *(*fbt_add_probe_fn)(struct module *, char *, int, int,
+ asm_instr_t *, void *);
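+
+/*
+ * Hypothetical provider-side sketch (for illustration only; my_add_probe
+ * and create_probe are not part of this API):
+ *
+ *	static void *my_add_probe(struct module *mp, char *func, int type,
+ *				  int stype, asm_instr_t *addr, void *prev)
+ *	{
+ *		return create_probe(mp, func, type, addr, prev);
+ *	}
+ *
+ *	dtrace_fbt_init(my_add_probe);
+ */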
+extern void dtrace_fbt_init(fbt_add_probe_fn);
+
+#endif /* _LINUX_DTRACE_FBT_H */
Provides the perf provider, containing a DTrace probe for each
perf-events tracepoint in the system.
+config DT_FBT
+ tristate "Function boundary tracing"
+ default m
+ select FTRACE
+ help
+ Provides function boundary tracing for functions in the kernel.
+
config DT_SYSTRACE
tristate "System Call Tracing"
default m
DT_CORE_ARCH_OBJS = $(addprefix ../../arch/$(SRCARCH)/kernel/, \
dtrace_syscall.o dtrace_syscall_stubs.o \
- dtrace_sdt.o dtrace_util.o)
+ dtrace_fbt.o dtrace_sdt.o dtrace_util.o)
ifdef CONFIG_DT_CORE
obj-y += cyclic.o dtrace_os.o dtrace_cpu.o \
* DTrace uses an architecture-specific structure (hidden from us here)
* to hold some data, and since we do not know the layout or the size,
 * we ensure that we allocate enough memory to accommodate the largest
- * of those structures.
+ * of those structures. On some architectures there may not be a need
+ * for additional data. In that case, pdata will be NULL.
+ *
* So, the memory we allocate will hold:
* - the dtrace_kmod module structure
* - a block of memory (aligned at a structure boundary) to be
- * used for pdata and other related data
+ * used for pdata and other related data [optional]
* The memory is allocated from the modules space.
*/
- module_size = ALIGN(sizeof(struct module), 8) + DTRACE_PDATA_MAXSIZE;
+ module_size = ALIGN(sizeof(struct module), 8) +
+ DTRACE_PD_MAXSIZE_KERNEL;
dtrace_kmod = module_alloc(module_size);
if (dtrace_kmod == NULL) {
pr_warning("%s: cannot allocate kernel pseudo-module\n",
memset(dtrace_kmod, 0, module_size);
strlcpy(dtrace_kmod->name, "vmlinux", MODULE_NAME_LEN);
+
+ if (DTRACE_PD_MAXSIZE_KERNEL > 0)
+ dtrace_kmod->pdata = (char *)dtrace_kmod +
+ ALIGN(sizeof(struct module), 8);
+ else
+ dtrace_kmod->pdata = NULL;
+
+ dtrace_kmod->core_size = DTRACE_PD_MAXSIZE_KERNEL;
+ dtrace_kmod->num_ftrace_callsites = dtrace_fbt_nfuncs;
dtrace_kmod->state = MODULE_STATE_LIVE;
atomic_inc(&dtrace_kmod->refcnt);
- dtrace_kmod->pdata = (char *)dtrace_kmod +
- ALIGN(sizeof(struct module), 8);
- dtrace_kmod->core_size = DTRACE_PDATA_MAXSIZE;
psinfo_cachep = kmem_cache_create("psinfo_cache",
sizeof(dtrace_psinfo_t), 0,
sub(/\..*$/, "", fname);
alias = $3;
+ if ($1 != prev)
+ funcc++;
+ prev = $1;
+
next;
}
END {
print "";
print ".globl dtrace_sdt_nprobes";
+ print ".globl dtrace_fbt_nfuncs";
print "\tALGN";
print "dtrace_sdt_nprobes:";
printf "\tPTR\t%d\n", probec;
+ print "dtrace_fbt_nfuncs:";
+ printf "\tPTR\t%d\n", funcc;
exit(errc == 0 ? 0 : 1);
}' > $tfn