From: Kris Van Hees Date: Wed, 1 Mar 2017 04:37:11 +0000 (-0500) Subject: dtrace: continuing the FBT implementation and fixes X-Git-Tag: v4.1.12-98.0.20170517_2143~41^2~2 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=6bd26f8d9d2acb7b1a5153f4168cf0e923b5d3b1;p=users%2Fjedix%2Flinux-maple.git dtrace: continuing the FBT implementation and fixes This commit continues the implementation of Function Boundary Tracing (FBT) and fixes various problems with the original implementation and other things in DTrace that it caused to break. It is done as a single commit due to the intertwined nature of the code it touches. 1. We were only handling unaligned memory access traps as part of the NOFAULT access protection. This commit adds handling of data and instruction access traps as well. 2. When an OOPS takes place, we now add output about whether we are in DTrace probe context and what the last probe was that was being processed (if any). That last data item isn't guaranteed to always have a valid value. But it is helpful. 3. New ustack stack walker implementation (moved from module to kernel for consistency and because we need access to low level structures like the page tables) for both x86 and sparc. The new code avoids any locking or sleeping. The new user stack walker is accessed as a sub-function of dtrace_stacktrace(), selected using the flags field of stacktrace_state_t. 4. We added a new field to the dtrace_psinfo_t structure (ustack) to hold the bottom address of the stack. This is needed in the stack walker (specifically for x86) to know when we have reached the end of the stack. It is initialized from copy_process (in DTrace specific code) when stack_start is passed as parameter to clone. It is also set from dtrace_psinfo_alloc() (which is generally called from performing an exec), and there it gets its value from the mm->start_stack value. 5. The FBT black lists have been updated with functions that may be invoked during probe processing. 
In addition, for x86_64 we added explicit filter out of functions that start with insn_* or inat_* because they are used for instruction analysis during probe processing. 6. On sparc64, per-cpu data gets access by means of a global register that holds the base address for this memory area. Some assembler code clobbers that register in some cases, so it is not safe to depend on this in probe context. Instead, we explicitly access the data based on the smp_processor_id(). 7. We added a new CPU DTTrace flag (CPU_DTRACE_PROBE_CTX) to flag that we are processing in DTrace probe context. It is primarily used to detect attempts of re-entry into dtrace_probe(). Signed-off-by: Kris Van Hees Acked-by: Nick Alcock Orabug: 21220305 Orabug: 24829326 --- diff --git a/arch/sparc/include/asm/dtrace_util.h b/arch/sparc/include/asm/dtrace_util.h index 9dd517add3c2..32f2158dde07 100644 --- a/arch/sparc/include/asm/dtrace_util.h +++ b/arch/sparc/include/asm/dtrace_util.h @@ -3,6 +3,6 @@ #ifndef _SPARC_DTRACE_UTIL_H #define _SPARC_DTRACE_UTIL_H -/* Nothing for now */ +extern int dtrace_user_addr_is_exec(uintptr_t); #endif /* _SPARC_DTRACE_UTIL_H */ diff --git a/arch/sparc/kernel/dtrace_util.c b/arch/sparc/kernel/dtrace_util.c index cd68baf37acf..9e6629068225 100644 --- a/arch/sparc/kernel/dtrace_util.c +++ b/arch/sparc/kernel/dtrace_util.c @@ -6,10 +6,17 @@ */ #include +#include #include +#include #include +#include #include +#include +#include +#include #include +#include void dtrace_skip_instruction(struct pt_regs *regs) { @@ -51,7 +58,7 @@ int dtrace_die_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_OK | NOTIFY_STOP_MASK; } case DIE_TRAP: { - if (dargs->trapnr != 0x34) + if (dargs->trapnr != 0x34 && dargs->trapnr != 0x08) return NOTIFY_DONE; if (!DTRACE_CPUFLAG_ISSET(CPU_DTRACE_NOFAULT)) @@ -61,7 +68,169 @@ int dtrace_die_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_OK | NOTIFY_STOP_MASK; } + case DIE_OOPS: { + printk("DTrace: probe ctx 
%d last probe %ld\n", + !!DTRACE_CPUFLAG_ISSET(CPU_DTRACE_PROBE_CTX), + this_cpu_core->cpu_dtrace_caller); + return NOTIFY_DONE; + } default: return NOTIFY_DONE; } } + +int dtrace_user_addr_is_exec(uintptr_t addr) +{ + struct mm_struct *mm = current->mm; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + unsigned long flags; + int ret = 0; + + if (mm == NULL) + return 0; + + addr &= PAGE_MASK; + + local_irq_save(flags); + + pgd = *pgd_offset(mm, addr); + if (pgd_none(pgd)) + goto out; + + pud = *pud_offset(&pgd, addr); + if (pud_none(pud)) + goto out; + + pmd = *pmd_offset(&pud, addr); + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + goto out; + if (unlikely(pmd_large(pmd))) { + /* not sure how to do this */ + goto out; + } else { + pte_t pte; + + pte = *pte_offset_kernel(&pmd, addr); + + ret = pte_exec(pte); + } + +out: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL(dtrace_user_addr_is_exec); + +void dtrace_user_stacktrace(stacktrace_state_t *st) +{ + struct thread_info *t = current_thread_info(); + struct pt_regs *regs = current_pt_regs(); + uint64_t *pcs = st->pcs; + uint64_t *fps = st->fps; + int limit = st->limit; + unsigned long window; + unsigned long sp = user_stack_pointer(regs); + int ret; + + if (!user_mode(regs)) + goto out; + + flush_user_windows(); + + st->depth = 1; + if (pcs) { + *pcs++ = (uint64_t)instruction_pointer(regs); + limit--; + } + + if (!limit) + goto out; + + if (test_thread_flag(TIF_32BIT)) + sp = (uint32_t)sp; + + /* + * First we have to process all user windows that have not been flushed + * to the stack save area. 
+ */ + window = get_thread_wsaved(); + while (window--) { + unsigned long addr; + + sp = t->rwbuf_stkptrs[window]; + + if (test_thread_64bit_stack((unsigned long)sp)) { + addr = t->reg_window[window].ins[7]; + } else { + addr = ((struct reg_window32 *)(&t->reg_window[window]))->ins[7]; + } + + if (pcs) { + *pcs++ = addr; + limit--; + } + st->depth++; + + if (!limit) + goto out; + + /* Grab %fp so we can continue iteration on stack. */ + if (window == 0) { + if (test_thread_64bit_stack((unsigned long)sp)) { + sp = t->reg_window[window].ins[6]; + } else { + sp = ((struct reg_window32 *)(&t->reg_window[window]))->ins[6]; + } + } + } + + /* continue iteration on the stack */ + while ((sp != 0 || sp != STACK_BIAS) && limit > 0) { + unsigned long addr; + + pagefault_disable(); + if (test_thread_64bit_stack(sp)) { + ret = __copy_from_user_inatomic(&addr, (unsigned long *)(sp + STACK_BIAS + SF_V9_PC), + sizeof(addr)); + } else { + unsigned int addr32; + + ret = __copy_from_user_inatomic(&addr32, (unsigned int *)(sp + SF_PC), sizeof(addr32)); + addr = addr32; + } + pagefault_enable(); + + if (ret) + break; + + if (pcs) { + *pcs++ = addr; + limit--; + } + st->depth++; + + pagefault_disable(); + if (test_thread_64bit_stack(sp)) { + ret = __copy_from_user_inatomic(&sp, (unsigned long *)(sp + STACK_BIAS + SF_V9_FP), + sizeof (sp)); + } else { + unsigned int sp_tmp; + + ret = __copy_from_user_inatomic(&sp_tmp, (unsigned int *)(sp + SF_FP), sizeof (sp_tmp)); + sp = sp_tmp; + } + pagefault_enable(); + + if (ret) + break; + } + +out: + if (pcs) { + while (limit--) + *pcs++ = 0; + } +} diff --git a/arch/sparc/kernel/fbt_blacklist.h b/arch/sparc/kernel/fbt_blacklist.h index 3fd94299f0b8..fc7b2d42f4f0 100644 --- a/arch/sparc/kernel/fbt_blacklist.h +++ b/arch/sparc/kernel/fbt_blacklist.h @@ -1,14 +1,30 @@ -BL_DENTRY(void *, read_tsc) BL_DENTRY(void *, notifier_call_chain) BL_SENTRY(typeof(__atomic_notifier_call_chain), __atomic_notifier_call_chain) 
BL_SENTRY(typeof(atomic_notifier_call_chain), atomic_notifier_call_chain) BL_SENTRY(typeof(__raw_notifier_call_chain), __raw_notifier_call_chain) BL_SENTRY(typeof(raw_notifier_call_chain), raw_notifier_call_chain) -BL_SENTRY(typeof(getrawmonotonic64), getrawmonotonic64) -BL_DENTRY(void *, update_fast_timekeeper) -BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath) -BL_DENTRY(void *, kprobe_exceptions_notify) BL_SENTRY(void *, notify_die) BL_SENTRY(void *, rcu_nmi_exit) BL_SENTRY(void *, rcu_nmi_enter) -BL_SENTRY(void *, get_kprobe) +BL_SENTRY(typeof(ktime_get_raw_fast_ns), ktime_get_raw_fast_ns) +BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath) +BL_DENTRY(void *, kprobe_exceptions_notify) +BL_DENTRY(void *, arch_uprobe_exception_notify) +BL_DENTRY(void *, sun4v_data_access_exception) +BL_DENTRY(void *, sun4v_do_mna) +BL_DENTRY(void *, get_fault_insn) +BL_DENTRY(void *, kernel_unaligned_trap) +BL_DENTRY(typeof(save_stack_trace), save_stack_trace) +BL_DENTRY(typeof(__save_stack_trace), __save_stack_trace) +BL_DENTRY(typeof(stack_trace_flush), stack_trace_flush) +BL_DENTRY(typeof(in_sched_functions), in_sched_functions) + +BL_SENTRY(typeof(search_exception_tables), search_exception_tables) + +BL_DENTRY(void *, down_read_trylock) +BL_DENTRY(void *, __down_read_trylock) +BL_DENTRY(void *, __get_user_pages_fast) +BL_DENTRY(void *, gup_pud_range) +BL_DENTRY(void *, gup_pmd_range) +BL_DENTRY(void *, gup_huge_pmd) +BL_DENTRY(void *, gup_pte_range) diff --git a/arch/x86/include/asm/dtrace_util.h b/arch/x86/include/asm/dtrace_util.h index 99a8385c5341..d53b11f2d1a8 100644 --- a/arch/x86/include/asm/dtrace_util.h +++ b/arch/x86/include/asm/dtrace_util.h @@ -20,6 +20,8 @@ extern void dtrace_invop_remove(uint8_t (*func)(struct pt_regs *)); extern void dtrace_invop_enable(uint8_t *); extern void dtrace_invop_disable(uint8_t *, uint8_t); +extern int dtrace_user_addr_is_exec(uintptr_t); + #endif #endif /* _X86_DTRACE_UTIL_H */ diff --git a/arch/x86/kernel/dtrace_fbt.c 
b/arch/x86/kernel/dtrace_fbt.c index eb0fb0a9d0ee..4d6c0777f67e 100644 --- a/arch/x86/kernel/dtrace_fbt.c +++ b/arch/x86/kernel/dtrace_fbt.c @@ -112,11 +112,16 @@ void dtrace_fbt_init(fbt_add_probe_fn fbt_add_probe) continue; /* - * No FBT tracing for DTrace functions. Also weed out symbols - * that are not relevant here. + * No FBT tracing for DTrace functions, and functions that are + * crucial to probe processing. + * Also weed out symbols that are not relevant here. */ if (strncmp(sym.name, "dtrace_", 7) == 0) continue; + if (strncmp(sym.name, "insn_", 5) == 0) + continue; + if (strncmp(sym.name, "inat_", 5) == 0) + continue; if (strncmp(sym.name, "_GLOBAL_", 8) == 0) continue; if (strncmp(sym.name, "do_", 3) == 0) diff --git a/arch/x86/kernel/dtrace_util.c b/arch/x86/kernel/dtrace_util.c index 3cf71850f360..f246c4c5fdd0 100644 --- a/arch/x86/kernel/dtrace_util.c +++ b/arch/x86/kernel/dtrace_util.c @@ -6,11 +6,17 @@ */ #include +#include #include +#include #include #include +#include +#include #include +#include #include +#include #include #include #include @@ -270,3 +276,133 @@ void dtrace_invop_disable(uint8_t *addr, uint8_t opcode) text_poke(addr, ((unsigned char []){opcode}), 1); } EXPORT_SYMBOL(dtrace_invop_disable); + +static inline dtrace_bad_address(void *addr) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)addr, dummy); +} + +int dtrace_user_addr_is_exec(uintptr_t addr) +{ + struct mm_struct *mm = current->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long flags; + int ret = 0; + + if (mm == NULL) + return 0; + + addr &= PAGE_MASK; + + local_irq_save(flags); + + pgd = pgd_offset(mm, addr); + if (dtrace_bad_address(pgd)) + goto out; + if (pgd_none(*pgd) || !pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (dtrace_bad_address(pud)) + goto out; + if (pud_none(*pud) || !pud_present(*pud)) + goto out; + if (unlikely(pud_large(*pud))) { + pte = (pte_t *)pud; + if 
(dtrace_bad_address(pte)) + goto out; + + ret = pte_exec(*pte); + goto out; + } + + pmd = pmd_offset(pud, addr); + if (dtrace_bad_address(pmd)) + goto out; + if (pmd_none(*pmd) || pmd_trans_splitting(*pmd)) + goto out; + if (unlikely(pmd_large(*pmd) || !pmd_present(*pmd))) { + pte = (pte_t *)pmd; + if (dtrace_bad_address(pte)) + goto out; + + ret = pte_exec(*pte); + goto out; + } + + pte = pte_offset_map(pmd, addr); + if (dtrace_bad_address(pte)) + goto out; + if (pte_protnone(*pte)) + goto out; + if ((pte_flags(*pte) & (_PAGE_PRESENT|_PAGE_USER|_PAGE_SPECIAL)) != + (_PAGE_PRESENT|_PAGE_USER)) + goto out; + + ret = pte_exec(*pte); + +out: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL(dtrace_user_addr_is_exec); + +void dtrace_user_stacktrace(stacktrace_state_t *st) +{ + struct thread_info *t = current_thread_info(); + struct pt_regs *regs = current_pt_regs(); + uint64_t *pcs = st->pcs; + uint64_t *fps = st->fps; + int limit = st->limit; + unsigned long *bos; + unsigned long *sp = (unsigned long *)user_stack_pointer(regs); + int ret; + + if (!user_mode(regs)) + goto out; + + if (!current->dtrace_psinfo) + goto out; + + bos = current->dtrace_psinfo->ustack; + + st->depth = 1; + if (pcs) { + *pcs++ = (uint64_t)instruction_pointer(regs); + limit--; + } + + if (!limit) + goto out; + + while (sp <= bos && limit) { + unsigned long pc; + + pagefault_disable(); + ret = __copy_from_user_inatomic(&pc, sp, sizeof(pc)); + pagefault_enable(); + + if (ret) + break; + + if (dtrace_user_addr_is_exec(pc) && pcs) { + *pcs++ = pc; + limit--; + } + st->depth++; + + sp++; + } + +out: + if (pcs) { + while (limit--) + *pcs++ = 0; + } +} diff --git a/arch/x86/kernel/fbt_blacklist.h b/arch/x86/kernel/fbt_blacklist.h index 1599ba880608..3294cd428a45 100644 --- a/arch/x86/kernel/fbt_blacklist.h +++ b/arch/x86/kernel/fbt_blacklist.h @@ -1,25 +1,23 @@ -BL_SENTRY(void *, update_vsyscall) -BL_DENTRY(void *, read_tsc) BL_DENTRY(void *, notifier_call_chain) 
BL_SENTRY(typeof(__atomic_notifier_call_chain), __atomic_notifier_call_chain) BL_SENTRY(typeof(atomic_notifier_call_chain), atomic_notifier_call_chain) BL_SENTRY(typeof(__raw_notifier_call_chain), __raw_notifier_call_chain) BL_SENTRY(typeof(raw_notifier_call_chain), raw_notifier_call_chain) -BL_DENTRY(void *, timekeeping_get_ns) -BL_SENTRY(typeof(getrawmonotonic64), getrawmonotonic64) -BL_DENTRY(void *, update_fast_timekeeper) -BL_DENTRY(void *, timekeeping_update.clone.3) BL_SENTRY(typeof(idr_find_slowpath), idr_find_slowpath) -BL_SENTRY(typeof(poke_int3_handler), poke_int3_handler) /* MAYBE */ -BL_SENTRY(void *, ftrace_int3_handler) /* MAYBE */ -BL_SENTRY(void *, kprobe_int3_handler) /* MAYBE */ -BL_DENTRY(void *, set_intr_gate_ist) /* MAYBE */ -BL_DENTRY(void *, ist_enter) /* MAYBE */ -BL_DENTRY(void *, ist_exit) /* MAYBE */ BL_DENTRY(void *, hw_breakpoint_exceptions_notify) BL_DENTRY(void *, kprobe_exceptions_notify) BL_SENTRY(void *, notify_die) -BL_SENTRY(void *, rcu_nmi_exit) -BL_SENTRY(void *, rcu_nmi_enter) -BL_SENTRY(void *, get_kprobe) -BL_DENTRY(void *, xen_timer_interrupt) +BL_DENTRY(void *, pvclock_clocksource_read) +BL_SENTRY(typeof(ktime_get_raw_fast_ns), ktime_get_raw_fast_ns) +BL_DENTRY(void *, fixup_exception) + +BL_SENTRY(void *, do_page_fault) +BL_DENTRY(void *, __do_page_fault) +BL_DENTRY(void *, down_read_trylock) +BL_DENTRY(void *, __get_user_pages_fast) +BL_DENTRY(void *, gup_pud_range) +BL_DENTRY(void *, gup_huge_pud) +BL_DENTRY(void *, gup_pmd_range) +BL_DENTRY(void *, gup_huge_pmd) +BL_DENTRY(void *, gup_pte_range) +BL_DENTRY(void *, pte_mfn_to_pfn) diff --git a/include/linux/dtrace_cpu_defines.h b/include/linux/dtrace_cpu_defines.h index 99199ca1f98b..52e777291788 100644 --- a/include/linux/dtrace_cpu_defines.h +++ b/include/linux/dtrace_cpu_defines.h @@ -10,7 +10,11 @@ #define CPUC_PADSIZE (192 - CPUC_SIZE) #define per_cpu_core(cpu) (&per_cpu(dtrace_cpu_core, (cpu))) -#define this_cpu_core (this_cpu_ptr(&dtrace_cpu_core)) +#if 0 +# 
define this_cpu_core (this_cpu_ptr(&dtrace_cpu_core)) +#else +# define this_cpu_core (per_cpu_core(smp_processor_id())) +#endif #define DTRACE_CPUFLAG_ISSET(flag) \ (this_cpu_core->cpuc_dtrace_flags & (flag)) @@ -35,6 +39,7 @@ #define CPU_DTRACE_BADSTACK 0x1000 #define CPU_DTRACE_NOPF 0x2000 #define CPU_DTRACE_PF_TRAPPED 0x4000 +#define CPU_DTRACE_PROBE_CTX 0x8000 #define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \ CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \ diff --git a/include/linux/dtrace_os.h b/include/linux/dtrace_os.h index 8c7819ae55c8..f5daa4832f36 100644 --- a/include/linux/dtrace_os.h +++ b/include/linux/dtrace_os.h @@ -10,11 +10,11 @@ typedef uint32_t dtrace_id_t; #ifdef CONFIG_DTRACE #include +#include #include #if defined(CONFIG_DT_FASTTRAP) || defined(CONFIG_DT_FASTTRAP_MODULE) #include #endif -#include #include #include #include @@ -49,7 +49,7 @@ extern int dtrace_die_notifier(struct notifier_block *, unsigned long, void *); #define STACKTRACE_KERNEL 0x01 #define STACKTRACE_USER 0x02 -#define STACKTRACE_SKIP 0x10 +#define STACKTRACE_TYPE 0x0f typedef struct stacktrace_state { uint64_t *pcs; @@ -60,8 +60,11 @@ typedef struct stacktrace_state { } stacktrace_state_t; extern void dtrace_stacktrace(stacktrace_state_t *); +extern void dtrace_user_stacktrace(stacktrace_state_t *); extern void dtrace_handle_badaddr(struct pt_regs *); +#include + /* * This is only safe to call if we know this is a userspace fault * or that the call happens after early boot. 
diff --git a/include/linux/dtrace_psinfo.h b/include/linux/dtrace_psinfo.h index 0151fd8679ee..f6091ab49575 100644 --- a/include/linux/dtrace_psinfo.h +++ b/include/linux/dtrace_psinfo.h @@ -20,6 +20,9 @@ typedef struct dtrace_psinfo { unsigned long envc; char **envp; char psargs[PR_PSARGS_SZ]; +#ifndef __GENKSYMS__ + void *ustack; +#endif } dtrace_psinfo_t; extern void dtrace_psinfo_alloc(struct task_struct *); diff --git a/kernel/dtrace/dtrace_os.c b/kernel/dtrace/dtrace_os.c index b84a2d22f470..a1257d7c836b 100644 --- a/kernel/dtrace/dtrace_os.c +++ b/kernel/dtrace/dtrace_os.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #if defined(CONFIG_DT_FASTTRAP) || defined(CONFIG_DT_FASTTRAP_MODULE) @@ -224,6 +223,8 @@ void dtrace_psinfo_alloc(struct task_struct *tsk) } psinfo->envp[len] = NULL; + psinfo->ustack = mm->start_stack; + mmput(mm); } else { size_t len = min(TASK_COMM_LEN, PR_PSARGS_SZ); @@ -409,6 +410,11 @@ void dtrace_stacktrace(stacktrace_state_t *st) struct stack_trace trace; int i; + if ((st->flags & STACKTRACE_TYPE) == STACKTRACE_USER) { + dtrace_user_stacktrace(st); + return; + } + trace.nr_entries = 0; trace.max_entries = st->limit ? st->limit : 512; trace.entries = (typeof(trace.entries))st->pcs; diff --git a/kernel/fork.c b/kernel/fork.c index 923f3f5599eb..623d452f290f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1606,6 +1606,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); #ifdef CONFIG_DTRACE + /* + * If we're called with stack_start != 0, this is almost certainly a + * thread being created in current. Make sure it gets its own psinfo + * data, because we need to record a new bottom of stack value. + */ + if (p->mm && stack_start) { + dtrace_psinfo_alloc(p); + p->dtrace_psinfo->ustack = stack_start; + } + /* * We make this call fairly late into the copy_process() handling, * because we need to ensure that we can look up this task based on