From 81e24eecf39f787a5ca6b99f6a1ea1af7483e931 Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Mon, 24 Mar 2014 22:43:17 +0000 Subject: [PATCH] dtrace: Improve dtrace_getufpstack() (locks, stack detection, faults) dtrace_getufpstack() had several flaws exposed by ustack() of multithreaded processes. All the flaws touch the same small body of code, and none could be verified to work until all were in place: hence this rather do-everything commit. Firstly, it was detecting the end of the stack using mm->start_stack. This is incorrect for all threads but the first, and is even incorrect for the first thread in languages such as Go with split stacks. As it is, this causes the stack traversal to attempt to walk over a gap with no VMAs, causing a crash. The correct solution is of course to look at the VMAs to find the VMA which covers the user's stack address. We are already looking at the VMAs in is_code_addr(), but this is both a linear scan when all but no-mmu platforms have better ways, and a *lockless* scan. This is barely safe in the single-threaded case, but in the multithreaded case other tasks sharing the same mm may well be executing in parallel, and it becomes crucial that scanning the VMAs be done under the mmap_sem. Unfortunately we cannot always take the mmap_sem: DTrace may well be invoked in contexts in which sleeping is prohibited, and in which other threads have the semaphore. So we must do a down_read_trylock() on the mmap_sem, aborting the ustack() if we cannot take it just as we already do if this is a process with no mm at all. (We also need to boost the mm_users to prevent problems with group exits.) We are also accessing the pages themselves without pinning, which means concurrent memory pressure could swap them out, or memory compaction move them around. 
We can use __get_user_pages() to get the VMA and pin the pages we need simultaneously, as long as we use the newly-introduced FOLL_NOFAULT to ensure that __get_user_pages() does not incur page faults. We wrap __get_user_pages() in a local find_user_vma(), which also arranges to optionally fail if particular pages (such as the stack pages) are not in core. (We need the VMA for some pages so we can see if they are likely to be text-segment VMAs or not: such pages do not need to be in core and ustack() need not fail if they are swapped out.) For efficiency's sake, we pin each stack page as we cross the page boundary into it, releasing it afterwards. But even this does not suffice. FOLL_NOFAULT ensures that __get_user_pages() will not fault, but does not ensure that a page fault will not happen when accessing the page itself. So we use the newly-introduced CPU_DTRACE_NOPF machinery to entirely suppress page faults inside get_user() (and nowhere else), and check it afterwards. As an additional feature, dtrace_getufpstack() can now be called with a NULL pcstack and a pcstack_limit of zero, meaning that the stack frame entries are only counted, not recorded. We use this feature to reimplement dtrace_getustackdepth() in terms of dtrace_getufpstack(). With this change, multithreaded ustack()s appear to work, even in the presence of non-glibc stack layouts (as used by Java and other non-glibc threading libraries) and concurrent group exits and VMA changes. Orabug: 18412802 Signed-off-by: Nick Alcock Reviewed-by: Kris Van Hees Acked-by: Chuck Anderson --- dtrace/dtrace_isa.c | 190 +++++++++++++++++++--------- dtrace/include/dtrace/dtrace_impl.h | 2 +- 2 files changed, 134 insertions(+), 58 deletions(-) diff --git a/dtrace/dtrace_isa.c b/dtrace/dtrace_isa.c index 0c85bf9f3798..fd5f6cf0252c 100644 --- a/dtrace/dtrace_isa.c +++ b/dtrace/dtrace_isa.c @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * Copyright 2010, 2011, 2012 Oracle, Inc. All rights reserved. 
+ * Copyright 2010, 2011, 2012, 2013, 2014 Oracle, Inc. All rights reserved. * Use is subject to license terms. */ @@ -259,70 +259,158 @@ void dtrace_getpcstack(uint64_t *pcstack, int pcstack_limit, int aframes, } EXPORT_SYMBOL(dtrace_getpcstack); -static int is_code_addr(unsigned long addr) { - struct vm_area_struct *vma, *first; - - first = NULL; - for (vma = current->mm->mmap; - vma != NULL && vma != first; - vma = vma->vm_next) { - if (!first) - first = vma; - - if (!(vma->vm_flags & VM_EXEC)) - continue; - - if (addr < vma->vm_start) - return 0; - if (addr <= vma->vm_end) - return 1; +static struct vm_area_struct *find_user_vma(struct task_struct *tsk, + struct mm_struct *mm, + struct page **page, + unsigned long addr, + int need_incore) +{ + struct vm_area_struct *vma = NULL; + int nonblocking = 1; + int flags = FOLL_NOFAULT; + int ret; + + if (page) + flags |= FOLL_GET; + + ret = __get_user_pages(tsk, mm, addr, 1, flags, page, &vma, + &nonblocking); + + if ((nonblocking == 0) && need_incore) { + if ((ret > 0) && page) { + size_t i; + for (i = 0; i < ret; i++) + put_page(page[i]); + } + return NULL; } - - return 0; + else if (ret <= 0) + return NULL; + else + return vma; } -void dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, - int pcstack_limit) +/* + * Get user stack entries up to the pcstack_limit; return the number of entries + * acquired. If pcstack is NULL, return the number of entries potentially + * acquirable. 
+ */ +unsigned long dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, + int pcstack_limit) { struct task_struct *p = current; - unsigned long bos, tos; + struct mm_struct *mm = p->mm; + unsigned long tos, bos; unsigned long *sp; + unsigned long depth = 0; + struct vm_area_struct *stack_vma; + struct page *stack_page = NULL; + + if (pcstack) { + if (unlikely(pcstack_limit < 2)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return 0; + } + *pcstack++ = (uint64_t)p->pid; + *pcstack++ = (uint64_t)p->tgid; + pcstack_limit-=2; + } - *pcstack++ = (uint64_t)p->pid; - *pcstack++ = (uint64_t)p->tgid; - pcstack_limit-=2; - - if (p->mm == NULL || (p->flags & PF_KTHREAD)) - goto out; - - tos = this_cpu_read(old_rsp); - bos = p->mm->start_stack; + /* + * We cannot ustack() if this task has no mm, if this task is a kernel + * thread, or when someone else has the mmap_sem or the page_table_lock + * (because find_user_vma() ultimately does a __get_user_pages() and + * thence a follow_page(), which can take that lock). + */ + if (mm == NULL || (p->flags & PF_KTHREAD) || + spin_is_locked(&mm->page_table_lock)) + goto out; + + if (!down_read_trylock(&mm->mmap_sem)) + goto out; + atomic_inc(&mm->mm_users); + + tos = current_user_stack_pointer(); + stack_vma = find_user_vma(p, mm, NULL, (unsigned long) tos, 0); + if (!stack_vma || + stack_vma->vm_start > (unsigned long) tos) + goto unlock_out; + +#ifdef CONFIG_STACK_GROWSUP +#error This code does not yet work on STACK_GROWSUP platforms. +#endif + bos = stack_vma->vm_end; + if (stack_guard_page_end(stack_vma, bos)) + bos -= PAGE_SIZE; + /* + * If we have a pcstack, loop as long as we are within the stack limit. + * Otherwise, loop until we run out of stack (bos, from vm_end, is exclusive). 
+ */ for (sp = (unsigned long *)tos; - sp <= (unsigned long *)bos && pcstack_limit; sp++) { - unsigned long addr; + sp < (unsigned long *)bos && + ((pcstack && pcstack_limit > 0) || + !pcstack); + sp++) { + struct vm_area_struct *code_vma; + unsigned long addr; + /* + * Recheck for faultedness and pin at page boundaries (sp page-aligned). + */ + if (!stack_page || (((unsigned long)sp & ~PAGE_MASK) == 0)) { + if (stack_page) { + put_page(stack_page); + stack_page = NULL; + } + + if (!find_user_vma(p, mm, &stack_page, + (unsigned long) sp, 1)) + break; + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOPF); DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); get_user(addr, sp); DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOPF); - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) + if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) { + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_BADADDR); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_PF_TRAPPED); break; + } + + code_vma = find_user_vma(p, mm, NULL, addr, 0); + + if (!code_vma || code_vma->vm_start > addr) + continue; - if (addr >= tos && addr <= bos) { + if ((addr >= tos && addr <= bos) || + (code_vma->vm_flags & VM_GROWSDOWN)) { /* stack address - may need it for the fpstack. 
*/ - } else if (is_code_addr(addr)) { - *pcstack++ = addr; - pcstack_limit--; + } else if (code_vma->vm_flags & VM_EXEC) { + if (pcstack) { + *pcstack++ = addr; + pcstack_limit--; + } + depth++; } - - sp++; } + if (stack_page != NULL) + put_page(stack_page); + +unlock_out: + atomic_dec(&mm->mm_users); + up_read(&mm->mmap_sem); out: - while (pcstack_limit--) - *pcstack++ = 0; -} + if (pcstack) + while (pcstack_limit--) + *pcstack++ = 0; + + return depth; +} void dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit) { @@ -349,17 +437,5 @@ int dtrace_getstackdepth(int aframes) int dtrace_getustackdepth(void) { - int depth = 0; - struct task_struct *p = current; - unsigned long *sp = (unsigned long *)p->thread.usersp; - unsigned long *bos = (unsigned long *)p->mm->start_stack; - - while (sp <= bos) { - if (is_code_addr(*sp)) - depth++; - - sp++; - } - - return depth; + return dtrace_getufpstack(NULL, NULL, 0); } diff --git a/dtrace/include/dtrace/dtrace_impl.h b/dtrace/include/dtrace/dtrace_impl.h index 73a8b7d160ca..c1198cb3b4aa 100644 --- a/dtrace/include/dtrace/dtrace_impl.h +++ b/dtrace/include/dtrace/dtrace_impl.h @@ -885,7 +885,7 @@ extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int, int, extern void dtrace_getpcstack(uint64_t *, int, int, uint32_t *); extern void dtrace_getupcstack(uint64_t *, int); -extern void dtrace_getufpstack(uint64_t *, uint64_t *, int); +extern unsigned long dtrace_getufpstack(uint64_t *, uint64_t *, int); extern uintptr_t dtrace_getfp(void); extern uint64_t dtrace_getarg(int, int); extern int dtrace_getstackdepth(int); -- 2.50.1