From 69d3172b5f17f803e6fa5d77d798e0eb79bd920f Mon Sep 17 00:00:00 2001 From: Kris Van Hees Date: Thu, 2 Mar 2017 21:02:01 -0500 Subject: [PATCH] dtrace: continuing the FBT implementation and fixes This commit continues the implementation of Function Boundary Tracing (FBT) and fixes various problems with the original implementation and other things in DTrace that it caused to break. It is done as a single commit due to the intertwined nature of the code it touches. 1. The sparc64 fast path implementation (dtrace_caller) for the D 'caller' variable was trampling the %g4 register which Linux uses to hold the 'current' task pointer. By passing in a dummy argument, we ensure that we can use the %o1 register to temporarily store %g4. 2. For consistency, we are now using stacktrace_state_t instead of struct stacktrace_state. 3. We now call dtrace_stacktrace() under NOFAULT protection. 4. The ustack stack walker has been rewritten (in the kernel), so the previous implementation has been removed. 5. We no longer process probes when the kernel panics, to avoid DTrace disrupting output that could be crucial to debugging. 6. We now ensure that re-entry of dtrace_probe() can no longer happen, except for the ERROR probe (which is a re-entry by design). 7. Since FBT now works, the restriction to only support SyS_* functions has been removed. 
Signed-off-by: Kris Van Hees Signed-off-by: Tomas Jedlicka Reviewed-by: Nick Alcock Orabug: 21220305 Orabug: 24829326 --- dtrace/dtrace_asm_sparc64.S | 37 +++--- dtrace/dtrace_dif.c | 4 +- dtrace/dtrace_isa.c | 168 ++++------------------------ dtrace/dtrace_probe.c | 38 ++++++- dtrace/fbt_dev.c | 8 +- dtrace/include/dtrace/dtrace_impl.h | 28 ++++- 6 files changed, 104 insertions(+), 179 deletions(-) diff --git a/dtrace/dtrace_asm_sparc64.S b/dtrace/dtrace_asm_sparc64.S index f9daf136fe4b..c0522e15b8a7 100644 --- a/dtrace/dtrace_asm_sparc64.S +++ b/dtrace/dtrace_asm_sparc64.S @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010-2014 Oracle, Inc. All rights reserved. + * Copyright 2010-2017 Oracle, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,6 +47,7 @@ mov %o2, %o0 ENDPROC(dtrace_casptr) +/* FIXME */ ENTRY(dtrace_fish) rd %pc, %g5 ba 0f @@ -110,8 +111,16 @@ add %g2, 1, %o0 ! Failure; return deepest frame + 1 ENDPROC(dtrace_fish) -/* FIXME */ + /* + * Try to find caller in register windows. + * + * This is tricky as we must use globals during window rotation + * and must preserve them at the same time. For that reason the 2nd + * argument is not used. This allows us to use %o1 as scratch space to + * hold %g4, and prevents the caller from keeping anything in %o1. + */ ENTRY(dtrace_caller) + mov %g4, %o1 mov nwin_minus_one, %g4 rdpr %canrestore, %g2 cmp %g2, %o0 @@ -120,25 +129,27 @@ sub %g1, %o0, %g3 brgez,a,pt %g3, 0f wrpr %g3, %cwp - ! - ! CWP minus the number of frames is negative; we must perform the - ! arithmetic modulo MAXWIN. - ! + /* + * CWP minus the number of frames is negative; we must perform the + * arithmetic modulo MAXWIN. + */ add %g4, %g3, %g3 inc %g3 wrpr %g3, %cwp 0: mov %i7, %g4 wrpr %g1, %cwp - retl mov %g4, %o0 + retl + mov %o1, %g4 1: - ! - ! The caller has been flushed to the stack. This is unlikely - ! (interrupts are disabled in dtrace_probe()), but possible (the - ! 
interrupt inducing the spill may have been taken before the - ! call to dtrace_probe()). - ! + /* + * The caller has been flushed to the stack. This is unlikely + * (interrupts are disabled in dtrace_probe()), but possible (the + * interrupt inducing the spill may have been taken before the + * call to dtrace_probe()). + */ + mov %o1, %g4 retl mov -1, %o0 ENDPROC(dtrace_caller) diff --git a/dtrace/dtrace_dif.c b/dtrace/dtrace_dif.c index f434df47c1e7..6cd62dc63bc1 100644 --- a/dtrace/dtrace_dif.c +++ b/dtrace/dtrace_dif.c @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * Copyright 2010-2014 Oracle, Inc. All rights reserved. + * Copyright 2010-2017 Oracle, Inc. All rights reserved. * Use is subject to license terms. */ @@ -2169,7 +2169,7 @@ static uint64_t dtrace_dif_variable(dtrace_mstate_t *mstate, mstate->dtms_arg[0]); mstate->dtms_caller = caller[1]; } else if ((mstate->dtms_caller = - dtrace_caller(aframes)) == -1) { + dtrace_caller(aframes, 0)) == -1) { /* * We have failed to do this the quick way; * we must resort to the slower approach of diff --git a/dtrace/dtrace_isa.c b/dtrace/dtrace_isa.c index 8b5e8c40ccb6..cb29accd2782 100644 --- a/dtrace/dtrace_isa.c +++ b/dtrace/dtrace_isa.c @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * Copyright 2010 -- 2016 Oracle, Inc. All rights reserved. + * Copyright 2010-2017 Oracle, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -129,7 +129,7 @@ ktime_t dtrace_gethrestime(void) void dtrace_getpcstack(uint64_t *pcstack, int pcstack_limit, int aframes, uint32_t *intrpc) { - struct stacktrace_state st = { + stacktrace_state_t st = { pcstack, NULL, pcstack_limit, @@ -137,44 +137,15 @@ void dtrace_getpcstack(uint64_t *pcstack, int pcstack_limit, int aframes, STACKTRACE_KERNEL }; + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); dtrace_stacktrace(&st); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); while (st.depth < st.limit) pcstack[st.depth++] = 0; } EXPORT_SYMBOL(dtrace_getpcstack); -static struct vm_area_struct *find_user_vma(struct task_struct *tsk, - struct mm_struct *mm, - struct page **page, - unsigned long addr, - int need_incore) -{ - struct vm_area_struct *vma = NULL; - int nonblocking = 1; - int flags = FOLL_IMMED; - int ret; - - if (page) - flags |= FOLL_GET; - - ret = __get_user_pages(tsk, mm, addr, 1, flags, page, &vma, - &nonblocking); - - if ((nonblocking == 0) && need_incore) { - if ((ret > 0) && page) { - size_t i; - for (i = 0; i < ret; i++) - put_page(page[i]); - } - return NULL; - } - else if (ret <= 0) - return NULL; - else - return vma; -} - /* * Get user stack entries up to the pcstack_limit; return the number of entries * acquired. 
If pcstack is NULL, return the number of entries potentially @@ -184,13 +155,8 @@ unsigned long dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, int pcstack_limit) { struct task_struct *p = current; - struct mm_struct *mm = p->mm; - unsigned long tos, bos, fpc; - unsigned long *sp; - unsigned long depth = 0; - struct vm_area_struct *stack_vma; - struct page *stack_page = NULL; - struct pt_regs *regs = current_pt_regs(); + stacktrace_state_t st; + unsigned long depth; if (pcstack) { if (unlikely(pcstack_limit < 2)) { @@ -202,117 +168,21 @@ unsigned long dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, pcstack_limit -= 2; } - if (!user_mode(regs)) - goto out; - - /* - * There is always at least one address to report: the instruction - * pointer itself (frame 0). - */ - depth++; - - fpc = instruction_pointer(regs); - if (pcstack) { - *pcstack++ = (uint64_t)fpc; - pcstack_limit--; - } + st.pcs = pcstack; + st.fps = fpstack; + st.limit = pcstack_limit; + st.depth = 0; + st.flags = STACKTRACE_USER; - /* - * We cannot ustack() if this task has no mm, if this task is a kernel - * thread, or when someone else has the mmap_sem or the page_table_lock - * (because find_user_vma() ultimately does a __get_user_pages() and - * thence a follow_page(), which can take that lock). - */ - if (mm == NULL || (p->flags & PF_KTHREAD) || - spin_is_locked(&mm->page_table_lock)) - goto out; - - if (!down_read_trylock(&mm->mmap_sem)) - goto out; - atomic_inc(&mm->mm_users); - -#ifdef CONFIG_X86_64 - tos = current_user_stack_pointer(); -#elif defined(STACK_BIAS) - tos = user_stack_pointer(current_pt_regs()) + STACK_BIAS; -#else -#error Not x86-64 nor a stack-biased platform, porting needed -#endif - stack_vma = find_user_vma(p, mm, NULL, (unsigned long) tos, 0); - if (!stack_vma || - stack_vma->vm_start > (unsigned long) tos) - goto unlock_out; - -#ifdef CONFIG_STACK_GROWSUP -#error This code does not yet work on STACK_GROWSUP platforms. 
-#endif - bos = stack_vma->vm_end; - if (stack_guard_page_end(stack_vma, bos)) - bos -= PAGE_SIZE; - - /* - * If we have a pcstack, loop as long as we are within the stack limit. - * Otherwise, loop until we run out of stack. - */ - for (sp = (unsigned long *)tos; - sp <= ((unsigned long *)bos - sizeof(unsigned long)) && - ((pcstack && pcstack_limit > 0) || - !pcstack); - sp++) { - struct vm_area_struct *code_vma; - unsigned long addr; - int copyret; - - /* - * Recheck for faultedness and pin at page boundaries. - */ - if (!stack_page || (((unsigned long)sp & PAGE_MASK) == 0)) { - if (stack_page) { - put_page(stack_page); - stack_page = NULL; - } - - if (!find_user_vma(p, mm, &stack_page, - (unsigned long) sp, 1)) - break; - } + dtrace_stacktrace(&st); - pagefault_disable(); - copyret = copy_from_user(&addr, sp, sizeof(addr)); - pagefault_enable(); - if (copyret) - break; - - if (addr == fpc) - continue; - - code_vma = find_user_vma(p, mm, NULL, addr, 0); - - if (!code_vma || code_vma->vm_start > addr) - continue; - - if ((addr >= tos && addr <= bos) || - (code_vma->vm_flags & VM_GROWSDOWN)) { - /* stack address - may need it for the fpstack. 
*/ - } else if (code_vma->vm_flags & VM_EXEC) { - if (pcstack) { - *pcstack++ = addr; - pcstack_limit--; - } - depth++; - } + depth = st.depth; + while (st.depth < st.limit) { + if (pcstack) + pcstack[st.depth++] = 0; + if (fpstack) + fpstack[st.depth++] = 0; } - if (stack_page != NULL) - put_page(stack_page); - -unlock_out: - atomic_dec(&mm->mm_users); - up_read(&mm->mmap_sem); - -out: - if (pcstack) - while (pcstack_limit--) - *pcstack++ = 0; return depth; } @@ -326,7 +196,7 @@ int dtrace_getstackdepth(dtrace_mstate_t *mstate, int aframes) { uintptr_t old = mstate->dtms_scratch_ptr; size_t size; - struct stacktrace_state st = { + stacktrace_state_t st = { NULL, NULL, 0, diff --git a/dtrace/dtrace_probe.c b/dtrace/dtrace_probe.c index 854f463b6ad9..3ace8ee087a0 100644 --- a/dtrace/dtrace_probe.c +++ b/dtrace/dtrace_probe.c @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * Copyright 2010-2014 Oracle, Inc. All rights reserved. + * Copyright 2010-2017 Oracle, Inc. All rights reserved. * Use is subject to license terms. */ @@ -599,8 +599,7 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, return; } -#ifdef FIXME - if (panic_quiesce) { + if (oops_in_progress) { /* * We don't trace anything if we're panicking. */ @@ -609,7 +608,26 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, local_irq_restore(cookie); return; } -#endif + + flags = (volatile uint16_t *)&this_cpu_core->cpuc_dtrace_flags; + + /* + * Probe context is not re-entrant, unless we're getting called to + * process an ERROR probe. 
+ */ + if ((*flags & CPU_DTRACE_PROBE_CTX) && id != dtrace_probeid_error) { + dt_dbg_probe("Attempt to fire probe from within a probe " \ + "(ID %d, CPoID %d, U %d, pflag %d)\n", id, + (int)this_cpu_core->cpu_dtrace_caller, cpuid, + pflag); + if (pflag) + dtrace_preempt_on(); + local_irq_restore(cookie); + return; + } + + *flags |= CPU_DTRACE_PROBE_CTX; + this_cpu_core->cpu_dtrace_caller = id; now = dtrace_gethrtime(); vtime = (dtrace_vtime_references > 0); @@ -628,8 +646,6 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, mstate.dtms_arg[3] = arg3; mstate.dtms_arg[4] = arg4; - flags = (volatile uint16_t *)&this_cpu_core->cpuc_dtrace_flags; - for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) { dtrace_predicate_t *pred = ecb->dte_predicate; dtrace_state_t *state = ecb->dte_state; @@ -1270,6 +1286,16 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, if (vtime) current->dtrace_start = dtrace_gethrtime(); + /* + * Only clear the flag if this is not the ERROR probe. We know that + * an ERROR probe executes from within another probe, and therefore + * we need to retain the probe context flag in the flags. + */ + if (id != dtrace_probeid_error) { + *flags &= ~CPU_DTRACE_PROBE_CTX; + this_cpu_core->cpu_dtrace_caller = 0; + } + if (pflag) dtrace_preempt_on(); local_irq_restore(cookie); diff --git a/dtrace/fbt_dev.c b/dtrace/fbt_dev.c index b0cc6838b508..0b5d0f2ef490 100644 --- a/dtrace/fbt_dev.c +++ b/dtrace/fbt_dev.c @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * Copyright 2010, 2011, 2012, 2013 Oracle, Inc. All rights reserved. + * Copyright 2010-2017 Oracle, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,12 +48,6 @@ static void *fbt_provide_probe(struct module *mp, char *func, int type, int fbt_probe_t *fbp; fbt_probe_t *prev; - /* - * Only syscalls for now... 
- */ - if (strncmp(func, "SyS_", 4)) - return NULL; - switch (type) { case FBT_ENTRY: fbp = kzalloc(sizeof(fbt_probe_t), GFP_KERNEL); diff --git a/dtrace/include/dtrace/dtrace_impl.h b/dtrace/include/dtrace/dtrace_impl.h index 2b237e973bd5..8dcc36e447c5 100644 --- a/dtrace/include/dtrace/dtrace_impl.h +++ b/dtrace/include/dtrace/dtrace_impl.h @@ -28,7 +28,7 @@ * * CDDL HEADER END * - * Copyright 2009-2014 Oracle, Inc. All rights reserved. + * Copyright 2009-2017 Oracle, Inc. All rights reserved. * Use is subject to license terms. */ @@ -905,7 +905,31 @@ extern void dtrace_copyinstr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); extern void dtrace_copyoutstr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); -extern uintptr_t dtrace_caller(int); + +/* + * Platforms that support a fast path to obtain the caller implement the + * dtrace_caller() function. + * + * The first argument is the number of frames that should be skipped when + * looking for a caller address. The 2nd argument is a dummy argument that + * is necessary for SPARC. + * + * On x86 this is effectively a NOP. + * + * On SPARC it is possible to retrieve the caller address from the register + * windows without flushing them to the stack. This involves performing + * explicit rotation of the register windows. Modification of the windowing + * mechanism state alters all %i, %o, and %l registers so we can only use + * %g registers to store temporary data. + * + * On Linux a lot of %g registers are already allocated for specific purposes. + * Saving temporaries to the stack would be a violation of the fast path code + * logic. Therefore, the function prototype declares a 2nd argument that serves + * as a temporary value. A compiler will not expect that the value in %o1 + * will survive the call and therefore dtrace_caller() can use %o1 as a + * temporary register. 
+ */ +extern uintptr_t dtrace_caller(int, int); extern void dtrace_copyin_arch(uintptr_t, uintptr_t, size_t, volatile uint16_t *); -- 2.50.1