This commit continues the implementation of Function Boundary Tracing
(FBT), fixing various problems with the original implementation and
with other parts of DTrace that it caused to break. It is done as a
single commit due to the intertwined nature of the code it touches.
1. We were only handling unaligned memory access traps as part of the
NOFAULT access protection. This commit adds handling for data access
and instruction access traps as well.
2. When an OOPS takes place, we now add output indicating whether we
are in DTrace probe context and what the last probe being processed
was (if any). That last data item is not guaranteed to always hold a
valid value, but it is helpful.
3. New ustack stack walker implementation (moved from module to kernel
for consistency and because we need access to low level structures
like the page tables) for both x86 and sparc. The new code avoids
any locking or sleeping. The new user stack walker is accessed as
a sub-function of dtrace_stacktrace(), selected using the flags
field of stacktrace_state_t (a usage sketch follows this list).
4. We added a new field to the dtrace_psinfo_t structure (ustack) to
hold the bottom address of the stack. This is needed in the stack
walker (specifically for x86) to know when we have reached the end
of the stack. It is initialized from copy_process() (in DTrace
specific code) when stack_start is passed as a parameter to clone.
It is also set from dtrace_psinfo_alloc() (which is generally called
when performing an exec), where it gets its value from
mm->start_stack.
5. The FBT blacklists have been updated with functions that may be
invoked during probe processing. In addition, for x86_64 we added
explicit filtering of functions that start with insn_* or inat_*,
because they are used for instruction analysis during probe
processing.
6. On sparc64, per-cpu data is accessed by means of a global register
that holds the base address for this memory area. Some assembler
code clobbers that register in some cases, so it is not safe to
depend on it in probe context. Instead, we explicitly access the
data based on smp_processor_id().
7. We added a new CPU DTrace flag (CPU_DTRACE_PROBE_CTX) to flag that
we are processing in DTrace probe context. It is primarily used to
detect attempts to re-enter dtrace_probe() (a sketch of the re-entry
check also follows this list).
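
A caller that wants the user stack walker would set things up roughly
as follows. This is a minimal sketch only: the field layout of
stacktrace_state_t is abbreviated, and pcbuf/nframes are placeholders
for whatever buffer and frame limit the probe processing code supplies.

	stacktrace_state_t st = {
		.pcs   = pcbuf,			/* output buffer for PCs */
		.fps   = NULL,
		.limit = nframes,		/* maximum frames to record */
		.depth = 0,
		.flags = STACKTRACE_USER,	/* select the user walker */
	};

	dtrace_stacktrace(&st);	/* dispatches to dtrace_user_stacktrace() */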
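
The probe context flag is meant to be used along these lines inside
dtrace_probe(). This is only a sketch: DTRACE_CPUFLAG_SET() and
DTRACE_CPUFLAG_CLEAR() are assumed companions of DTRACE_CPUFLAG_ISSET(),
and the real re-entry handling may do more than simply return.

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_PROBE_CTX))
		return;			/* re-entry into dtrace_probe() */

	DTRACE_CPUFLAG_SET(CPU_DTRACE_PROBE_CTX);
	/* ... process the probe ... */
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_PROBE_CTX);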
Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
Acked-by: Nick Alcock <nick.alcock@oracle.com>
Orabug: 21220305
Orabug: 24829326
#ifndef _SPARC_DTRACE_UTIL_H
#define _SPARC_DTRACE_UTIL_H
-/* Nothing for now */
+extern int dtrace_user_addr_is_exec(uintptr_t);
#endif /* _SPARC_DTRACE_UTIL_H */
*/
#include <linux/dtrace_cpu.h>
+#include <linux/dtrace_os.h>
#include <linux/kdebug.h>
+#include <linux/mm.h>
#include <linux/notifier.h>
+#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
#include <asm/ptrace.h>
+#include <asm/switch_to.h>
void dtrace_skip_instruction(struct pt_regs *regs)
{
return NOTIFY_OK | NOTIFY_STOP_MASK;
}
case DIE_TRAP: {
- if (dargs->trapnr != 0x34)
+ if (dargs->trapnr != 0x34 && dargs->trapnr != 0x08)
return NOTIFY_DONE;
if (!DTRACE_CPUFLAG_ISSET(CPU_DTRACE_NOFAULT))
return NOTIFY_OK | NOTIFY_STOP_MASK;
}
+ case DIE_OOPS: {
+ printk("DTrace: probe ctx %d last probe %ld\n",
+ !!DTRACE_CPUFLAG_ISSET(CPU_DTRACE_PROBE_CTX),
+ this_cpu_core->cpu_dtrace_caller);
+ return NOTIFY_DONE;
+ }
default:
return NOTIFY_DONE;
}
}
+
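+/*
+ * Determine whether a userspace address maps to an executable page by
+ * walking the page tables directly.  No locks are taken and interrupts
+ * are disabled for the duration of the walk, so this can be used from
+ * probe context.
+ */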
+int dtrace_user_addr_is_exec(uintptr_t addr)
+{
+ struct mm_struct *mm = current->mm;
+ pgd_t pgd;
+ pud_t pud;
+ pmd_t pmd;
+ unsigned long flags;
+ int ret = 0;
+
+ if (mm == NULL)
+ return 0;
+
+ addr &= PAGE_MASK;
+
+ local_irq_save(flags);
+
+ pgd = *pgd_offset(mm, addr);
+ if (pgd_none(pgd))
+ goto out;
+
+ pud = *pud_offset(&pgd, addr);
+ if (pud_none(pud))
+ goto out;
+
+ pmd = *pmd_offset(&pud, addr);
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ goto out;
+ if (unlikely(pmd_large(pmd))) {
+		/* Huge PMD mappings are not handled yet; treat as not executable. */
+ goto out;
+ } else {
+ pte_t pte;
+
+ pte = *pte_offset_kernel(&pmd, addr);
+
+ ret = pte_exec(pte);
+ }
+
+out:
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(dtrace_user_addr_is_exec);
+
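+/*
+ * Walk the userspace stack of the current task: first report the register
+ * windows that have not yet been flushed to the user stack save area, then
+ * follow the frame pointers on the stack itself.  Pagefaults are disabled
+ * around every access to user memory, so this is safe in probe context.
+ */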
+void dtrace_user_stacktrace(stacktrace_state_t *st)
+{
+ struct thread_info *t = current_thread_info();
+ struct pt_regs *regs = current_pt_regs();
+ uint64_t *pcs = st->pcs;
+ uint64_t *fps = st->fps;
+ int limit = st->limit;
+ unsigned long window;
+ unsigned long sp = user_stack_pointer(regs);
+ int ret;
+
+ if (!user_mode(regs))
+ goto out;
+
+ flush_user_windows();
+
+ st->depth = 1;
+ if (pcs) {
+ *pcs++ = (uint64_t)instruction_pointer(regs);
+ limit--;
+ }
+
+ if (!limit)
+ goto out;
+
+ if (test_thread_flag(TIF_32BIT))
+ sp = (uint32_t)sp;
+
+ /*
+ * First we have to process all user windows that have not been flushed
+ * to the stack save area.
+ */
+ window = get_thread_wsaved();
+ while (window--) {
+ unsigned long addr;
+
+ sp = t->rwbuf_stkptrs[window];
+
+ if (test_thread_64bit_stack((unsigned long)sp)) {
+ addr = t->reg_window[window].ins[7];
+ } else {
+ addr = ((struct reg_window32 *)(&t->reg_window[window]))->ins[7];
+ }
+
+ if (pcs) {
+ *pcs++ = addr;
+ limit--;
+ }
+ st->depth++;
+
+ if (!limit)
+ goto out;
+
+ /* Grab %fp so we can continue iteration on stack. */
+ if (window == 0) {
+ if (test_thread_64bit_stack((unsigned long)sp)) {
+ sp = t->reg_window[window].ins[6];
+ } else {
+ sp = ((struct reg_window32 *)(&t->reg_window[window]))->ins[6];
+ }
+ }
+ }
+
+ /* continue iteration on the stack */
+	while ((sp != 0 && sp != STACK_BIAS) && limit > 0) {
+ unsigned long addr;
+
+ pagefault_disable();
+ if (test_thread_64bit_stack(sp)) {
+ ret = __copy_from_user_inatomic(&addr, (unsigned long *)(sp + STACK_BIAS + SF_V9_PC),
+ sizeof(addr));
+ } else {
+ unsigned int addr32;
+
+ ret = __copy_from_user_inatomic(&addr32, (unsigned int *)(sp + SF_PC), sizeof(addr32));
+ addr = addr32;
+ }
+ pagefault_enable();
+
+ if (ret)
+ break;
+
+ if (pcs) {
+ *pcs++ = addr;
+ limit--;
+ }
+ st->depth++;
+
+ pagefault_disable();
+ if (test_thread_64bit_stack(sp)) {
+ ret = __copy_from_user_inatomic(&sp, (unsigned long *)(sp + STACK_BIAS + SF_V9_FP),
+ sizeof (sp));
+ } else {
+ unsigned int sp_tmp;
+
+ ret = __copy_from_user_inatomic(&sp_tmp, (unsigned int *)(sp + SF_FP), sizeof (sp_tmp));
+ sp = sp_tmp;
+ }
+ pagefault_enable();
+
+ if (ret)
+ break;
+ }
+
+out:
+ if (pcs) {
+ while (limit--)
+ *pcs++ = 0;
+ }
+}
-BL_DENTRY(void *, read_tsc)
BL_DENTRY(void *, notifier_call_chain)
BL_SENTRY(typeof(__atomic_notifier_call_chain), __atomic_notifier_call_chain)
BL_SENTRY(typeof(atomic_notifier_call_chain), atomic_notifier_call_chain)
BL_SENTRY(typeof(__raw_notifier_call_chain), __raw_notifier_call_chain)
BL_SENTRY(typeof(raw_notifier_call_chain), raw_notifier_call_chain)
-BL_SENTRY(typeof(getrawmonotonic64), getrawmonotonic64)
-BL_DENTRY(void *, update_fast_timekeeper)
-BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath)
-BL_DENTRY(void *, kprobe_exceptions_notify)
BL_SENTRY(void *, notify_die)
BL_SENTRY(void *, rcu_nmi_exit)
BL_SENTRY(void *, rcu_nmi_enter)
-BL_SENTRY(void *, get_kprobe)
+BL_SENTRY(typeof(ktime_get_raw_fast_ns), ktime_get_raw_fast_ns)
+BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath)
+BL_DENTRY(void *, kprobe_exceptions_notify)
+BL_DENTRY(void *, arch_uprobe_exception_notify)
+BL_DENTRY(void *, sun4v_data_access_exception)
+BL_DENTRY(void *, sun4v_do_mna)
+BL_DENTRY(void *, get_fault_insn)
+BL_DENTRY(void *, kernel_unaligned_trap)
+BL_DENTRY(typeof(save_stack_trace), save_stack_trace)
+BL_DENTRY(typeof(__save_stack_trace), __save_stack_trace)
+BL_DENTRY(typeof(stack_trace_flush), stack_trace_flush)
+BL_DENTRY(typeof(in_sched_functions), in_sched_functions)
+
+BL_SENTRY(typeof(search_exception_tables), search_exception_tables)
+
+BL_DENTRY(void *, down_read_trylock)
+BL_DENTRY(void *, __down_read_trylock)
+BL_DENTRY(void *, __get_user_pages_fast)
+BL_DENTRY(void *, gup_pud_range)
+BL_DENTRY(void *, gup_pmd_range)
+BL_DENTRY(void *, gup_huge_pmd)
+BL_DENTRY(void *, gup_pte_range)
extern void dtrace_invop_enable(uint8_t *);
extern void dtrace_invop_disable(uint8_t *, uint8_t);
+extern int dtrace_user_addr_is_exec(uintptr_t);
+
#endif
#endif /* _X86_DTRACE_UTIL_H */
continue;
/*
- * No FBT tracing for DTrace functions. Also weed out symbols
- * that are not relevant here.
+ * No FBT tracing for DTrace functions, and functions that are
+ * crucial to probe processing.
+ * Also weed out symbols that are not relevant here.
*/
if (strncmp(sym.name, "dtrace_", 7) == 0)
continue;
+ if (strncmp(sym.name, "insn_", 5) == 0)
+ continue;
+ if (strncmp(sym.name, "inat_", 5) == 0)
+ continue;
if (strncmp(sym.name, "_GLOBAL_", 8) == 0)
continue;
if (strncmp(sym.name, "do_", 3) == 0)
*/
#include <linux/dtrace_cpu.h>
+#include <linux/dtrace_os.h>
#include <linux/kdebug.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/notifier.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/insn.h>
+#include <asm/pgtable.h>
#include <asm/ptrace.h>
#include <asm/dtrace_arch.h>
#include <asm/dtrace_util.h>
text_poke(addr, ((unsigned char []){opcode}), 1);
}
EXPORT_SYMBOL(dtrace_invop_disable);
+
+static inline int dtrace_bad_address(void *addr)
+{
+ unsigned long dummy;
+
+ return probe_kernel_address((unsigned long *)addr, dummy);
+}
+
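+/*
+ * Determine whether a userspace address maps to an executable page by
+ * walking the page tables.  Interrupts are disabled and every table
+ * pointer is validated with dtrace_bad_address() before it is
+ * dereferenced, so the walk itself cannot fault in probe context.
+ */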
+int dtrace_user_addr_is_exec(uintptr_t addr)
+{
+ struct mm_struct *mm = current->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long flags;
+ int ret = 0;
+
+ if (mm == NULL)
+ return 0;
+
+ addr &= PAGE_MASK;
+
+ local_irq_save(flags);
+
+ pgd = pgd_offset(mm, addr);
+ if (dtrace_bad_address(pgd))
+ goto out;
+ if (pgd_none(*pgd) || !pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, addr);
+ if (dtrace_bad_address(pud))
+ goto out;
+ if (pud_none(*pud) || !pud_present(*pud))
+ goto out;
+ if (unlikely(pud_large(*pud))) {
+ pte = (pte_t *)pud;
+ if (dtrace_bad_address(pte))
+ goto out;
+
+ ret = pte_exec(*pte);
+ goto out;
+ }
+
+ pmd = pmd_offset(pud, addr);
+ if (dtrace_bad_address(pmd))
+ goto out;
+ if (pmd_none(*pmd) || pmd_trans_splitting(*pmd))
+ goto out;
+ if (unlikely(pmd_large(*pmd) || !pmd_present(*pmd))) {
+ pte = (pte_t *)pmd;
+ if (dtrace_bad_address(pte))
+ goto out;
+
+ ret = pte_exec(*pte);
+ goto out;
+ }
+
+ pte = pte_offset_map(pmd, addr);
+ if (dtrace_bad_address(pte))
+ goto out;
+ if (pte_protnone(*pte))
+ goto out;
+ if ((pte_flags(*pte) & (_PAGE_PRESENT|_PAGE_USER|_PAGE_SPECIAL)) !=
+ (_PAGE_PRESENT|_PAGE_USER))
+ goto out;
+
+ ret = pte_exec(*pte);
+
+out:
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(dtrace_user_addr_is_exec);
+
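+/*
+ * Walk the userspace stack of the current task by scanning from the
+ * current stack pointer up to the recorded bottom of the stack
+ * (dtrace_psinfo->ustack), reporting every word that refers to an
+ * executable mapping as a probable return address.
+ */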
+void dtrace_user_stacktrace(stacktrace_state_t *st)
+{
+ struct thread_info *t = current_thread_info();
+ struct pt_regs *regs = current_pt_regs();
+ uint64_t *pcs = st->pcs;
+ uint64_t *fps = st->fps;
+ int limit = st->limit;
+ unsigned long *bos;
+ unsigned long *sp = (unsigned long *)user_stack_pointer(regs);
+ int ret;
+
+ if (!user_mode(regs))
+ goto out;
+
+ if (!current->dtrace_psinfo)
+ goto out;
+
+ bos = current->dtrace_psinfo->ustack;
+
+ st->depth = 1;
+ if (pcs) {
+ *pcs++ = (uint64_t)instruction_pointer(regs);
+ limit--;
+ }
+
+ if (!limit)
+ goto out;
+
+ while (sp <= bos && limit) {
+ unsigned long pc;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(&pc, sp, sizeof(pc));
+ pagefault_enable();
+
+ if (ret)
+ break;
+
+ if (dtrace_user_addr_is_exec(pc) && pcs) {
+ *pcs++ = pc;
+ limit--;
+ }
+ st->depth++;
+
+ sp++;
+ }
+
+out:
+ if (pcs) {
+ while (limit--)
+ *pcs++ = 0;
+ }
+}
-BL_SENTRY(void *, update_vsyscall)
-BL_DENTRY(void *, read_tsc)
BL_DENTRY(void *, notifier_call_chain)
BL_SENTRY(typeof(__atomic_notifier_call_chain), __atomic_notifier_call_chain)
BL_SENTRY(typeof(atomic_notifier_call_chain), atomic_notifier_call_chain)
BL_SENTRY(typeof(__raw_notifier_call_chain), __raw_notifier_call_chain)
BL_SENTRY(typeof(raw_notifier_call_chain), raw_notifier_call_chain)
-BL_DENTRY(void *, timekeeping_get_ns)
-BL_SENTRY(typeof(getrawmonotonic64), getrawmonotonic64)
-BL_DENTRY(void *, update_fast_timekeeper)
-BL_DENTRY(void *, timekeeping_update.clone.3)
BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath)
-BL_SENTRY(typeof(poke_int3_handler), poke_int3_handler) /* MAYBE */
-BL_SENTRY(void *, ftrace_int3_handler) /* MAYBE */
-BL_SENTRY(void *, kprobe_int3_handler) /* MAYBE */
-BL_DENTRY(void *, set_intr_gate_ist) /* MAYBE */
-BL_DENTRY(void *, ist_enter) /* MAYBE */
-BL_DENTRY(void *, ist_exit) /* MAYBE */
BL_DENTRY(void *, hw_breakpoint_exceptions_notify)
BL_DENTRY(void *, kprobe_exceptions_notify)
BL_SENTRY(void *, notify_die)
-BL_SENTRY(void *, rcu_nmi_exit)
-BL_SENTRY(void *, rcu_nmi_enter)
-BL_SENTRY(void *, get_kprobe)
-BL_DENTRY(void *, xen_timer_interrupt)
+BL_DENTRY(void *, pvclock_clocksource_read)
+BL_SENTRY(typeof(ktime_get_raw_fast_ns), ktime_get_raw_fast_ns)
+BL_DENTRY(void *, fixup_exception)
+
+BL_SENTRY(void *, do_page_fault)
+BL_DENTRY(void *, __do_page_fault)
+BL_DENTRY(void *, down_read_trylock)
+BL_DENTRY(void *, __get_user_pages_fast)
+BL_DENTRY(void *, gup_pud_range)
+BL_DENTRY(void *, gup_huge_pud)
+BL_DENTRY(void *, gup_pmd_range)
+BL_DENTRY(void *, gup_huge_pmd)
+BL_DENTRY(void *, gup_pte_range)
+BL_DENTRY(void *, pte_mfn_to_pfn)
#define CPUC_PADSIZE (192 - CPUC_SIZE)
#define per_cpu_core(cpu) (&per_cpu(dtrace_cpu_core, (cpu)))
-#define this_cpu_core (this_cpu_ptr(&dtrace_cpu_core))
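+/*
+ * Do not use this_cpu_ptr() here: on sparc64 the per-cpu base register
+ * may be clobbered by assembler code that can run in probe context, so
+ * access the per-cpu data explicitly via smp_processor_id().
+ */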
+#if 0
+# define this_cpu_core (this_cpu_ptr(&dtrace_cpu_core))
+#else
+# define this_cpu_core (per_cpu_core(smp_processor_id()))
+#endif
#define DTRACE_CPUFLAG_ISSET(flag) \
(this_cpu_core->cpuc_dtrace_flags & (flag))
#define CPU_DTRACE_BADSTACK 0x1000
#define CPU_DTRACE_NOPF 0x2000
#define CPU_DTRACE_PF_TRAPPED 0x4000
+#define CPU_DTRACE_PROBE_CTX 0x8000
#define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \
CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \
#ifdef CONFIG_DTRACE
#include <linux/ktime.h>
+#include <linux/mm.h>
#include <linux/notifier.h>
#if defined(CONFIG_DT_FASTTRAP) || defined(CONFIG_DT_FASTTRAP_MODULE)
#include <linux/uprobes.h>
#endif
-#include <asm/dtrace_util.h>
#include <asm/unistd.h>
#include <asm/asm-offsets.h>
#include <linux/dtrace_cpu.h>
#define STACKTRACE_KERNEL 0x01
#define STACKTRACE_USER 0x02
-#define STACKTRACE_SKIP 0x10
+#define STACKTRACE_TYPE 0x0f
typedef struct stacktrace_state {
uint64_t *pcs;
} stacktrace_state_t;
extern void dtrace_stacktrace(stacktrace_state_t *);
+extern void dtrace_user_stacktrace(stacktrace_state_t *);
extern void dtrace_handle_badaddr(struct pt_regs *);
+#include <asm/dtrace_util.h>
+
/*
* This is only safe to call if we know this is a userspace fault
* or that the call happens after early boot.
unsigned long envc;
char **envp;
char psargs[PR_PSARGS_SZ];
+#ifndef __GENKSYMS__
+ void *ustack;
+#endif
} dtrace_psinfo_t;
extern void dtrace_psinfo_alloc(struct task_struct *);
#include <linux/vmalloc.h>
#include <linux/kallsyms.h>
#include <linux/workqueue.h>
-#include <linux/mm.h>
#include <asm/ptrace.h>
#if defined(CONFIG_DT_FASTTRAP) || defined(CONFIG_DT_FASTTRAP_MODULE)
}
psinfo->envp[len] = NULL;
+	psinfo->ustack = (void *)mm->start_stack;
+
mmput(mm);
} else {
size_t len = min(TASK_COMM_LEN, PR_PSARGS_SZ);
struct stack_trace trace;
int i;
+ if ((st->flags & STACKTRACE_TYPE) == STACKTRACE_USER) {
+ dtrace_user_stacktrace(st);
+ return;
+ }
+
trace.nr_entries = 0;
trace.max_entries = st->limit ? st->limit : 512;
trace.entries = (typeof(trace.entries))st->pcs;
write_unlock_irq(&tasklist_lock);
#ifdef CONFIG_DTRACE
+ /*
+ * If we're called with stack_start != 0, this is almost certainly a
+ * thread being created in current. Make sure it gets its own psinfo
+ * data, because we need to record a new bottom of stack value.
+ */
+ if (p->mm && stack_start) {
+ dtrace_psinfo_alloc(p);
+	p->dtrace_psinfo->ustack = (void *)stack_start;
+ }
+
/*
* We make this call fairly late into the copy_process() handling,
* because we need to ensure that we can look up this task based on