From: Tomas Jedlicka
Date: Thu, 20 Jul 2017 11:54:33 +0000 (-0400)
Subject: dtrace: Removal of XCalls from dtrace_sync()
X-Git-Tag: v4.1.12-111.0.20170918_2215~182^2~2
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=e23d1365cc6cd2d9d78e23483f07ef2df3035e42;p=users%2Fjedix%2Flinux-maple.git

dtrace: Removal of XCalls from dtrace_sync()

Replaces the synchronization mechanism in the framework with a lock-free
algorithm.

Orabug: 26671843
Signed-off-by: Tomas Jedlicka
Reviewed-by: Nick Alcock
---

diff --git a/arch/sparc/kernel/fbt_blacklist.h b/arch/sparc/kernel/fbt_blacklist.h
index ee9e341b6768..a1eccd0e9bec 100644
--- a/arch/sparc/kernel/fbt_blacklist.h
+++ b/arch/sparc/kernel/fbt_blacklist.h
@@ -28,3 +28,9 @@ BL_DENTRY(void *, gup_pud_range)
 BL_DENTRY(void *, gup_pmd_range)
 BL_DENTRY(void *, gup_huge_pmd)
 BL_DENTRY(void *, gup_pte_range)
+
+/*
+ * Functions used in dtrace_sync().
+ */
+BL_DENTRY(void *, find_next_bit)
+BL_DENTRY(void *, _find_next_bit)
diff --git a/arch/x86/kernel/fbt_blacklist.h b/arch/x86/kernel/fbt_blacklist.h
index 41b7244199cb..db887b4889af 100644
--- a/arch/x86/kernel/fbt_blacklist.h
+++ b/arch/x86/kernel/fbt_blacklist.h
@@ -66,3 +66,9 @@ BL_DENTRY(void *, gup_pmd_range)
 BL_DENTRY(void *, gup_huge_pmd)
 BL_DENTRY(void *, gup_pte_range)
 BL_DENTRY(void *, pte_mfn_to_pfn)
+
+/*
+ * Functions used in dtrace_sync().
+ */
+BL_DENTRY(void *, find_next_bit)
+BL_DENTRY(void *, _find_next_bit)
diff --git a/dtrace/dtrace_isa.c b/dtrace/dtrace_isa.c
index a0fe47b48f7b..0b92f9fe6470 100644
--- a/dtrace/dtrace_isa.c
+++ b/dtrace/dtrace_isa.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -36,10 +37,6 @@ int dtrace_getipl(void)
 	return in_interrupt();
 }
 
-static void dtrace_sync_func(void)
-{
-}
-
 void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
 {
 	if (cpu == DTRACE_CPUALL) {
@@ -48,14 +45,133 @@ void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
 		smp_call_function_single(cpu, func, arg, 1);
 }
 
-void dtrace_sync(void)
+void dtrace_toxic_ranges(void (*func)(uintptr_t, uintptr_t))
 {
-	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
+	/* FIXME */
 }
 
-void dtrace_toxic_ranges(void (*func)(uintptr_t, uintptr_t))
+/*
+ * Note: not called from probe context. This function is called
+ * asynchronously (and at a regular interval) from outside of probe context
+ * by the DTrace framework to sync shared data which DTrace probe context
+ * may access without locks.
+ *
+ * Whenever the framework updates data which can be accessed from probe context,
+ * the framework then calls dtrace_sync(). dtrace_sync() guarantees all probes
+ * are using the new data before returning.
+ *
+ * See the comment in dtrace_impl.h which describes this algorithm.
+ * The cpuc_in_probe_ctxt flag is an increasing count. It is odd when
+ * in DTrace probe context and even when not in DTrace probe context.
+ * The upper bits are a counter which is incremented when exiting DTrace
+ * probe context. These upper bits are used to detect "sample aliasing":
+ * i.e. the target CPU is not in DTrace probe context between samples but
+ * continually enters probe context just before being sampled.
+ *
+ * dtrace_sync() loops over NCPUs. CPUs which are not in DTrace probe context
+ * (cpuc_in_probe_ctxt is even) are removed from the list. This is repeated
+ * until there are no CPUs left in the sync list.
+ * + * In the rare cases where dtrace_sync() loops over all NCPUs more than + * dtrace_sync_sample_count times, dtrace_sync() then spins on one CPU's + * cpuc_in_probe_ctxt count until the count increments. This is intended to + * avoid sample aliasing. + */ +void dtrace_sync(void) { - /* FIXME */ + /* + * sync_cpus is a bitmap of CPUs that need to be synced with. + */ + cpumask_t sync_cpus; + uint64_t sample_count = 0; + int cpuid, sample_cpuid; + int outstanding; + + /* + * Create bitmap of CPUs that need to be synced with. + */ + cpumask_copy(&sync_cpus, cpu_online_mask); + outstanding = 0; + for_each_cpu(cpuid, &sync_cpus) { + ++outstanding; + + /* + * Set a flag to let the CPU know we are syncing with it. + */ + DTRACE_SYNC_START(cpuid); + } + + /* + * The preceding stores by DTRACE_SYNC_START() must complete before + * subsequent loads or stores. No membar is needed because the + * atomic-add operation in DTRACE_SYNC_START is a memory barrier on + * SPARC and X86. + */ + + while (outstanding > 0) { + /* + * Loop over the map of CPUs that need to be synced with. + */ + for_each_cpu(cpuid, &sync_cpus) { + if (!DTRACE_SYNC_IN_CRITICAL(cpuid)) { + + /* Clear the CPU's sync request flag */ + DTRACE_SYNC_END(cpuid); + + /* + * remove cpuid from list of CPUs that + * still need to be synced with. + */ + DTRACE_SYNC_DONE(cpuid, &sync_cpus); + --outstanding; + } else { + /* + * Remember one of the outstanding CPUs to spin + * on once we reach the sampling limit. + */ + sample_cpuid = cpuid; + } + } + + /* + * dtrace_probe may be running in sibling threads in this core. + */ + if (outstanding > 0) { + dtrace_safe_smt_pause(); + + /* + * After sample_count loops, spin on one CPU's count + * instead of just checking for odd/even. + */ + if (++sample_count > dtrace_sync_sample_count) { + uint64_t count = + DTRACE_SYNC_CRITICAL_COUNT(sample_cpuid); + + /* + * Spin until critical section count increments. + */ + if (DTRACE_SYNC_IN_CRITICAL(sample_cpuid)) { + while (count == + DTRACE_SYNC_CRITICAL_COUNT( + sample_cpuid)) { + + dtrace_safe_smt_pause(); + } + } + + DTRACE_SYNC_END(sample_cpuid); + DTRACE_SYNC_DONE(sample_cpuid, &sync_cpus); + --outstanding; + } + } + } + +/* + * All preceding loads by DTRACE_SYNC_IN_CRITICAL() and + * DTRACE_SYNC_CRITICAL_COUNT() must complete before subsequent loads + * or stores. No membar is needed because the atomic-add operation in + * DTRACE_SYNC_END() is a memory barrier on SPARC and X86. + */ } /* diff --git a/dtrace/dtrace_probe.c b/dtrace/dtrace_probe.c index 2a107b81542b..de6e6edc3418 100644 --- a/dtrace/dtrace_probe.c +++ b/dtrace/dtrace_probe.c @@ -515,6 +515,7 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, int onintr; volatile uint16_t *flags; int pflag = 0; + uint32_t re_entry; #ifdef FIXME /* @@ -526,12 +527,13 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, return; #endif + DTRACE_SYNC_ENTER_CRITICAL(cookie, re_entry); + /* * If preemption has already been disabled before we get here, we * accept it as a free gift. We just need to make sure that we don't * re-enable preemption on the way out... 
*/ - local_irq_save(cookie); if ((pflag = dtrace_is_preemptive())) dtrace_preempt_off(); @@ -547,7 +549,7 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, */ if (pflag) dtrace_preempt_on(); - local_irq_restore(cookie); + DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry); return; } @@ -557,7 +559,7 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, */ if (pflag) dtrace_preempt_on(); - local_irq_restore(cookie); + DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry); return; } @@ -574,7 +576,7 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, pflag); if (pflag) dtrace_preempt_on(); - local_irq_restore(cookie); + DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry); return; } @@ -1250,7 +1252,7 @@ void dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, if (pflag) dtrace_preempt_on(); - local_irq_restore(cookie); + DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry); if (current->dtrace_sig != 0) { int sig = current->dtrace_sig; diff --git a/dtrace/dtrace_spec.c b/dtrace/dtrace_spec.c index 03d8d2845746..7c1b918dd6c1 100644 --- a/dtrace/dtrace_spec.c +++ b/dtrace/dtrace_spec.c @@ -281,11 +281,12 @@ void dtrace_speculation_clean_here(dtrace_state_t *state) processorid_t cpu = smp_processor_id(); dtrace_buffer_t *dest = &state->dts_buffer[cpu]; dtrace_specid_t i; + uint32_t re_entry; - local_irq_save(cookie); + DTRACE_SYNC_ENTER_CRITICAL(cookie, re_entry); if (dest->dtb_tomax == NULL) { - local_irq_restore(cookie); + DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry); return; } @@ -310,7 +311,7 @@ void dtrace_speculation_clean_here(dtrace_state_t *state) dtrace_speculation_commit(state, cpu, i + 1); } - local_irq_restore(cookie); + DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry); } void dtrace_speculation_clean(dtrace_state_t *state) diff --git a/dtrace/dtrace_state.c b/dtrace/dtrace_state.c index 9c6ec36aaef7..249642965a23 100644 --- a/dtrace/dtrace_state.c +++ b/dtrace/dtrace_state.c @@ -44,6 +44,7 @@ dtrace_optval_t dtrace_jstackstrsize_default = 512; ktime_t dtrace_deadman_interval = KTIME_INIT(1, 0); ktime_t dtrace_deadman_timeout = KTIME_INIT(10, 0); uint64_t dtrace_deadman_user = SECS_TO_JIFFIES(30); +uint64_t dtrace_sync_sample_count = 100; /* Sampling before counting */ dtrace_id_t dtrace_probeid_begin; dtrace_id_t dtrace_probeid_end; diff --git a/include/dtrace/dtrace_impl.h b/include/dtrace/dtrace_impl.h index c0091acf0a2d..94430006db2b 100644 --- a/include/dtrace/dtrace_impl.h +++ b/include/dtrace/dtrace_impl.h @@ -758,6 +758,283 @@ extern intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t, extern void dtrace_buffer_polish(dtrace_buffer_t *); extern void dtrace_buffer_free(dtrace_buffer_t *); +/* + * DTrace framework/probe data synchronization + * ------------------------------------------- + * + * The dtrace_sync() facility is used to synchronize global DTrace framework + * data with DTrace probe context. The framework updates data and then calls + * dtrace_sync(). dtrace_sync() loops until it observes all CPUs have been out + * of probe context at least once. This ensures all consumers are using the + * updated data. + * + * DTrace probes have several requirements. First DTrace probe context cannot + * block. DTrace probes execute with interrupts disabled. Locks cannot be + * acquired in DTrace probe context. A second requirement is that DTrace + * probes need to be as high performance as possible to minimize the effect of + * enabled probes. + * + * DTrace framework data changes have their own requirements. 
DTrace data + * changes/syncs are extremely infrequent compared to DTrace probe firings. + * Probes can be in commonly executed code. A good trade-off is to favor + * DTrace probe context performance over DTrace sync performance. + * + * To meet the above requirements, the DTrace data synchronization algorithm + * is lock-less. The DTrace probe path is wait-free. The DTrace probe path + * is memory-barrier-free in the common case to minimize probe effect. + * dtrace_probe has been made membar free in the common case by adding a read + * in dtrace_probe and adding an additional write and membar to dtrace_sync(). + * + * A simple algorithm is to have dtrace_probe set a flag for its CPU when + * entering DTrace probe context and clear the flag when it exits DTrace probe + * context. A producer of DTrace framework data checks the flag to detect and + * synchronize with probe context. Unfortunately memory ordering issues + * complicate the implementation. Memory barriers are required in probe + * context for this simple approach to work. + * + * A simple implementation to sync with one CPU that works with any memory + * ordering model is: + * + * DTrace probe: + * 1. CPU->in_probe_context = B_TRUE; + * 2. dtrace_membar_enter()// membar #StoreLoad|#StoreStore + * 3. access framework shared data// critical section + * 4. dtrace_membar_exit()// membar #LoadStore|#StoreStore + * 5. CPU->in_probe_context = B_FALSE; + * + * DTrace framework dtrace_sync: + * 0. update framework shared data + * 1. dtrace_membar_enter()// membar #StoreLoad|#StoreStore + * 2. while (CPU->in_probe_context == B_TRUE) + * 3. spin + * 4. dtrace_membar_exit()// membar #LoadStore|#StoreStore + * 5. produce shared dtrace data + * + * A note on memory ordering + * ------------------------- + * + * dtrace_membar_enter() guarantees later loads cannot complete before earlier + * stores, and it guarantees later stores cannot complete before earlier stores. + * dtrace_membar_enter() is, in SPARC parlance, a membar #StoreLoad|#StoreStore. + * + * dtrace_membar_exit() guarantees later stores cannot complete before earlier + * loads, and it guarantees later stores cannot complete before earlier stores. + * dtrace_membar_exit() is, in SPARC parlance, a membar #LoadStore|#StoreStore. + * + * Please see the SPARC and Intel processor guides on memory ordering. + * All sun4v and Fujitsu processors are TSO (Total Store Order). Modern + * supported Intel and AMD processors have similar load and store ordering + * to SPARC. All processors currently supported by Solaris have these memory + * ordering properties: + * 1) Loads are ordered with respect to earlier loads. + * 2) Stores are ordered with respect to earlier stores. + * 3a) SPARC Atomic load-store behaves as if it were followed by a + * MEMBAR #LoadLoad, #LoadStore, and #StoreStore. + * 3b) X86 Atomic operations serialize load and store. + * 4) Stores cannot bypass earlier loads. + * + * The above implementation details allow the membars to be simplified thus: + * A) dtrace_membar_enter() can be reduced to "membar #StoreLoad" on sparc. + * See property number 4 above. + * Since dtrace_membar_enter() is an atomic operation on x86, it cannot be + * reduced further. + * B) dtrace_membar_exit() becomes a NOP on both SPARC and x86. + * See properties 2 and 4. 
+ *
+ *
+ * Elimination of membar #StoreLoad from dtrace probe context
+ * ----------------------------------------------------------
+ *
+ * Furthermore, it is possible to eliminate all memory barriers from the common
+ * dtrace_probe() entry case. The only membar needed in dtrace_probe is there
+ * to prevent Loads of global DTrace framework data from passing the Store to
+ * the "in_probe_context" flag (i.e. the dtrace_membar_enter()).
+ * A Load at the beginning of the algorithm is also ordered with these later
+ * Loads and Stores: the membar #StoreLoad can be replaced with an early Load of
+ * a "sync_request" flag and a conditional branch on the flag value.
+ *
+ * dtrace_sync() first Stores to the "sync_request" flag, and dtrace_probe()
+ * starts by Loading the flag. This Load in dtrace_probe() of "sync_request"
+ * is ordered with its later Store to the "in_probe_context" flag and
+ * dtrace_probe's later Loads of DTrace framework data. dtrace_probe() only
+ * needs a membar #StoreLoad iff the "sync_request" flag is set.
+ *
+ * Optimized Synchronization Algorithm
+ * -----------------------------------
+ *
+ * DTrace probe:
+ * + 1a. request_flag = CPU->sync_request // Load
+ * 1b. CPU->in_probe_context = B_TRUE // Store
+ * + 2. if request_flag > 0
+ *        dtrace_membar_enter() // membar #StoreLoad
+ * 3. access framework shared data // critical section
+ * -
+ * 5. CPU->in_probe_context = B_FALSE // Store
+ *
+ * DTrace framework dtrace_sync:
+ * + 1a. atomically add 1 to CPU->sync_request // Store and
+ * 1b. dtrace_membar_enter() // membar #StoreLoad
+ * 2. while (CPU->in_probe_context == B_TRUE) // Load
+ * 3. spin
+ * + 4a. atomically subtract 1 from CPU->sync_request // Load + Store
+ * -
+ * 5. produce shared dtrace data
+ *
+ * This algorithm has been proven correct by analysis of all interleaving
+ * scenarios of the above operations with the hardware memory ordering
+ * described above.
+ *
+ * The Load and Store of the flag pair are very inexpensive. The cacheline with
+ * the flag pair is never accessed by a different CPU except by dtrace_sync.
+ * dtrace_sync is very uncommon compared to typical probe firings. The removal
+ * of membars from DTrace probe context at the expense of a Load and Store and
+ * a conditional branch is a good performance win.
+ *
+ * As implemented there is one pair of flags per CPU. The flags are in one
+ * cacheline; they could be split into two cachelines if dtrace_sync was more
+ * common. dtrace_sync loops over all NCPU sets of flags. dtrace_sync lazily
+ * only does one dtrace_membar_enter() (step 1b) after setting all NCPU
+ * sync_request flags.
+ *
+ * Sample aliasing could cause dtrace_sync() to always sample a CPU's
+ * in_probe_context flag when the CPU is in probe context even if the CPU
+ * left and returned to probe context one or more times since the last sample.
+ * cpuc_in_probe_ctxt is implemented as an even/odd counter instead of a
+ * boolean flag. cpuc_in_probe_ctxt is odd when in probe context and even
+ * when not in probe context. Probe context increments cpuc_in_probe_ctxt when
+ * entering and exiting. dtrace_probe() handles re-entry by not incrementing the
+ * counter for re-entrant entry and exit.
+ */
+
+/*
+ * dtrace_membar_exit() is a NOP on current SPARC and X86 hardware.
+ * It is defined as an inline asm statement to prevent the C optimizer from
+ * moving C statements around the membar.
+ */
+#define dtrace_membar_exit() \
+	__asm__ __volatile__("" ::: "memory")
+
+/*
+ * dtrace_membar_enter() does not need an explicit membar #StoreStore because
+ * modern SPARC hardware is TSO: stores are ordered with other stores.
+ */
+#define dtrace_membar_enter() \
+	mb();
+
+#define dtrace_safe_smt_pause() \
+	cpu_relax();
+
+/*
+ * Used by dtrace_probe() to flag entry to the critical section.
+ * dtrace_probe() context may be consuming DTrace framework data.
+ *
+ * cpuc_in_probe_ctxt is odd when in probe context and even when not in
+ * probe context. The flag must not be incremented when re-entering from
+ * probe context.
+ */
+#define DTRACE_SYNC_ENTER_CRITICAL(cookie, re_entry) \
+{ \
+	uint64_t requests; \
+	uint64_t count; \
+	\
+	local_irq_save(cookie); \
+	\
+	requests = atomic64_read(&this_cpu_core->cpuc_sync_requests); \
+	\
+	/* Increment flag iff it is even */ \
+	count = atomic64_read(&this_cpu_core->cpuc_in_probe_ctx); \
+	re_entry = count & 0x1; \
+	atomic64_set(&this_cpu_core->cpuc_in_probe_ctx, count | 0x1); \
+	ASSERT(DTRACE_SYNC_IN_CRITICAL(smp_processor_id())); \
+	\
+	/* \
+	 * Later Loads are ordered with respect to the Load of \
+	 * cpuc_sync_requests. The Load is also guaranteed to complete \
+	 * before the store to cpuc_in_probe_ctxt. Thus a membar_enter \
+	 * is only needed when requests is not 0. This is very \
+	 * uncommon. \
+	 */ \
+	if (requests > 0) { \
+		dtrace_membar_enter(); \
+	} \
+}
+
+/*
+ * Used by dtrace_probe() to flag exit from the critical section.
+ * dtrace_probe context is no longer using DTrace framework data.
+ */
+#define DTRACE_SYNC_EXIT_CRITICAL(cookie, re_entry) \
+{ \
+	dtrace_membar_exit(); \
+	ASSERT((re_entry | 0x1) == 0x1); \
+	\
+	/* \
+	 * flag must not be incremented when returning to probe context.\
+	 */ \
+	atomic64_add(~re_entry & 0x1, &this_cpu_core->cpuc_in_probe_ctx); \
+	ASSERT(re_entry == \
+	    (atomic64_read(&this_cpu_core->cpuc_in_probe_ctx) & 0x1)); \
+	local_irq_restore(cookie); \
+}
+
+/*
+ * Used by dtrace_sync to inform dtrace_probe it needs to synchronize with
+ * dtrace_sync. dtrace_probe consumes the cpuc_sync_requests flag to determine
+ * if it needs a membar_enter. Not called from probe context.
+ *
+ * cpuc_sync_requests must be updated atomically by dtrace_sync because there
+ * may be multiple dtrace_sync operations executing at the same time.
+ * cpuc_sync_requests is a simple count of the number of concurrent
+ * dtrace_sync requests.
+ */
+#define DTRACE_SYNC_START(cpuid) \
+{ \
+	atomic64_add(1, &(per_cpu_core(cpuid))->cpuc_sync_requests); \
+	ASSERT(atomic64_read(&per_cpu_core(cpuid)->cpuc_sync_requests) > 0); \
+}
+
+/*
+ * Used by dtrace_sync to flag dtrace_probe that it no longer needs to
+ * synchronize with dtrace_sync. Not called from probe context.
+ */
+#define DTRACE_SYNC_END(cpuid) \
+{ \
+	atomic64_add(-1, &(per_cpu_core(cpuid))->cpuc_sync_requests); \
+	ASSERT(atomic64_read(&per_cpu_core(cpuid)->cpuc_sync_requests) >= 0); \
+}
+
+/*
+ * The next two macros are used by dtrace_sync to check if the target CPU is in
+ * DTrace probe context. cpuc_in_probe_ctxt is a monotonically increasing
+ * count which dtrace_probe() increments when entering and exiting probe
+ * context. The flag is odd when in probe context, and even when not in probe
+ * context.
+ */
+#define DTRACE_SYNC_IN_CRITICAL(cpuid) \
+	(atomic64_read(&per_cpu_core(cpuid)->cpuc_in_probe_ctx) & 0x1)
+
+/*
+ * Used to check if the target CPU left and then entered probe context again.
+ */ +#define DTRACE_SYNC_CRITICAL_COUNT(cpuid) \ + (atomic64_read(&per_cpu_core(cpuid)->cpuc_in_probe_ctx)) + +/* + * The next three macros are bitmap operations used by dtrace_sync to keep track + * of which CPUs it still needs to synchronize with. + */ +#define DTRACE_SYNC_OUTSTANDING(cpuid, bitmap) \ + (cpumask_test_cpu(cpuid, bitmap) == 1) + +#define DTRACE_SYNC_NEEDED(cpuid, bitmap) \ + cpumask_set_cpu(cpuid, bitmap) + +#define DTRACE_SYNC_DONE(cpuid, bitmap) \ + cpumask_clear_cpu(cpuid, bitmap) + +extern uint64_t dtrace_sync_sample_count; +extern void dtrace_sync(void); + /* * DTrace Enabling Functions */ @@ -857,7 +1134,6 @@ typedef unsigned long dtrace_icookie_t; extern struct mutex cpu_lock; -extern void dtrace_sync(void); extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t)); extern void dtrace_vpanic(const char *, va_list); extern int dtrace_getipl(void); diff --git a/include/linux/dtrace_cpu.h b/include/linux/dtrace_cpu.h index d6aa57f08fc4..20691ccd203c 100644 --- a/include/linux/dtrace_cpu.h +++ b/include/linux/dtrace_cpu.h @@ -1,4 +1,6 @@ -/* Copyright (C) 2011-2014 Oracle, Inc. */ +/* + * Copyright (c) 2004, 2017, Oracle and/or its affiliates. All rights reserved. + */ #ifndef _LINUX_DTRACE_CPU_H_ #define _LINUX_DTRACE_CPU_H_ @@ -21,6 +23,8 @@ typedef struct cpu_core { ktime_t cpu_dtrace_chillmark; ktime_t cpu_dtrace_chilled; rwlock_t cpu_ft_lock; + atomic64_t cpuc_sync_requests; + atomic64_t cpuc_in_probe_ctx; } cpu_core_t; DECLARE_PER_CPU_SHARED_ALIGNED(cpu_core_t, dtrace_cpu_core);
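For illustration only, and not part of the patch above: the dtrace_sync() contract described in the comments is the usual publish-then-wait pattern. A hypothetical framework-side caller might look like the sketch below; my_table_t, dtrace_my_table and dtrace_my_table_replace() are invented names for this example, and the fragment is not meant to compile as-is.

/*
 * Hypothetical framework-side update (sketch only, not from the DTrace
 * sources).  Probe context dereferences dtrace_my_table without locks;
 * dtrace_sync() is what makes the kfree() safe, much as synchronize_rcu()
 * is used before kfree() in RCU-protected code.
 */
static my_table_t *dtrace_my_table;	/* read locklessly from probe context */

static void dtrace_my_table_replace(my_table_t *new_table)
{
	my_table_t *old = dtrace_my_table;

	dtrace_my_table = new_table;	/* 1. publish the new data */
	dtrace_sync();			/* 2. wait until every CPU has been
					 *    out of probe context once */
	kfree(old);			/* 3. no probe can still be using old */
}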
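Also for illustration (again not part of the patch): the loop structure of the new dtrace_sync() can be modelled in user space. It builds the set of CPUs to wait for, drops a CPU as soon as it is observed with an even counter, and after dtrace_sync_sample_count passes spins on a single CPU's counter to defeat sample aliasing. The stand-alone sketch below mirrors that structure with C11 atomics at their default sequentially consistent ordering, so it deliberately does not reproduce the conditional-membar optimization that the kernel code gets from cpuc_sync_requests on TSO hardware. NWORKERS, SAMPLE_LIMIT and the function names are assumptions made up for the example.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NWORKERS	4
#define SAMPLE_LIMIT	100	/* plays the role of dtrace_sync_sample_count */

/* Odd value = "in probe context", even = not (like cpuc_in_probe_ctx). */
static atomic_ulong in_ctxt[NWORKERS];
static atomic_int generation;		/* stands in for framework data */
static atomic_bool stop;

static void *worker(void *arg)
{
	int id = (int)(intptr_t)arg;

	while (!atomic_load(&stop)) {
		atomic_fetch_add(&in_ctxt[id], 1);	/* enter: count goes odd */
		(void)atomic_load(&generation);		/* "use" the shared data */
		atomic_fetch_add(&in_ctxt[id], 1);	/* exit: count goes even */
	}
	return NULL;
}

/* Wait until every worker has been outside its critical window at least once. */
static void sync_with_workers(void)
{
	bool pending[NWORKERS];
	int left = NWORKERS, samples = 0, victim = 0;

	for (int i = 0; i < NWORKERS; i++)
		pending[i] = true;

	while (left > 0) {
		for (int i = 0; i < NWORKERS; i++) {
			if (pending[i] && (atomic_load(&in_ctxt[i]) & 1) == 0) {
				pending[i] = false;	/* seen outside the window */
				left--;
			} else if (pending[i]) {
				victim = i;		/* still inside; remember it */
			}
		}
		if (left > 0 && ++samples > SAMPLE_LIMIT) {
			/* Sample-aliasing fallback: wait for one counter to move. */
			unsigned long seen = atomic_load(&in_ctxt[victim]);

			while (atomic_load(&in_ctxt[victim]) == seen)
				sched_yield();
			pending[victim] = false;
			left--;
			samples = 0;
		}
	}
}

int main(void)
{
	pthread_t tid[NWORKERS];

	for (intptr_t i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);

	atomic_fetch_add(&generation, 1);	/* publish an update ... */
	sync_with_workers();			/* ... and wait out the "probes" */
	printf("every worker has left its critical window since the update\n");

	atomic_store(&stop, true);
	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}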
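One more illustrative aside, not part of the patch: the even/odd bookkeeping in DTRACE_SYNC_ENTER_CRITICAL() and DTRACE_SYNC_EXIT_CRITICAL() can be checked in isolation. Enter forces the counter odd and records whether it was already odd (a re-entry); exit adds one only when the saved re_entry bit is clear, so nested probe entries neither double-count nor mark the CPU as outside probe context too early. The small stand-alone program below mirrors just that arithmetic on a plain counter; enter() and exit_crit() are names made up for the demo.

#include <inttypes.h>
#include <stdio.h>

static uint64_t ctxt;			/* stands in for cpuc_in_probe_ctx */

static uint64_t enter(void)		/* returns re_entry, as the macro does */
{
	uint64_t re_entry = ctxt & 0x1;	/* already odd?  then this is re-entry */

	ctxt |= 0x1;			/* force odd; a no-op on re-entry */
	return re_entry;
}

static void exit_crit(uint64_t re_entry)
{
	ctxt += ~re_entry & 0x1;	/* +1 only for the outermost exit */
}

int main(void)
{
	uint64_t outer, inner;

	printf("idle      ctxt=%" PRIu64 " (even)\n", ctxt);		/* 0 */
	outer = enter();
	printf("probe     ctxt=%" PRIu64 " (odd)\n", ctxt);		/* 1 */
	inner = enter();						/* re-entrant probe */
	printf("nested    ctxt=%" PRIu64 " (still odd)\n", ctxt);	/* 1 */
	exit_crit(inner);
	printf("unnested  ctxt=%" PRIu64 " (still odd)\n", ctxt);	/* 1 */
	exit_crit(outer);
	printf("idle      ctxt=%" PRIu64 " (even again)\n", ctxt);	/* 2 */
	return 0;
}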