From: Babu Moger
Date: Tue, 27 Jun 2017 15:51:45 +0000 (-0600)
Subject: sparc64: revert pause instruction patch for atomic backoff and cpu_relax()
X-Git-Tag: v4.1.12-106.0.20170720_1900~94
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=5bf37a7fc439efcf4ab2a2fe53e0d3b8e5b17375;p=users%2Fjedix%2Flinux-maple.git

sparc64: revert pause instruction patch for atomic backoff and cpu_relax()

This patch reverts commit e9b9eb59ffcdee09ec96b040f85c919618f4043e
("sparc64: Use pause instruction when available").

This all started with our TPCC results on UEK4. During T7 testing, the
TPCC results were much lower compared to UEK2, and the atomic calls like
atomic_add and atomic_sub were at the top of the perf results. Karl
found out that this was caused by upstream commit
e9b9eb59ffcdee09ec96b040f85c919618f4043e ("sparc64: Use pause
instruction when available"). After reverting this commit on UEK4, the
TPCC numbers were back to the UEK2 level.

However, things changed after Atish's scheduler fixes on UEK4. The TPCC
numbers improved, and the upstream commit ("sparc64: Use pause
instruction when available") no longer seemed to make any difference, so
Karl's "revert pause instruction" patch was removed from UEK4.

Now, with T8 testing, we are seeing the same old behaviour: the atomic
calls like atomic_add and atomic_sub are again at the top of the perf
results. With Karl's patch (revert pause instruction patch for atomic
backoff) applied, the TPCC numbers improved (about 25% better than T7)
and the atomic calls no longer show at the top of the perf results. So
we are adding this patch back again. This is a temporary fix; the
long-term solution is still under discussion.
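For context, the BACKOFF_SPIN this revert restores is a plain
exponential-backoff countdown. The C sketch below only illustrates that
logic; it is not code from this patch. The backoff_spin() name and the
volatile counter (standing in for the assembly countdown loop) are made
up for the example.

	/* Illustrative sketch (not kernel code) of what the restored
	 * BACKOFF_SPIN assembly macro does on a failed atomic retry.
	 */
	#define BACKOFF_LIMIT	(4 * 1024)

	static void backoff_spin(unsigned long *backoff)
	{
		/* volatile so the compiler keeps the delay loop */
		volatile unsigned long tmp;

		/* Burn roughly *backoff iterations before the caller
		 * retries its failed compare-and-swap.
		 */
		for (tmp = *backoff; tmp != 0; tmp--)
			;

		/* Exponential backoff: double the next delay while it
		 * is still within the limit.
		 */
		if (*backoff <= BACKOFF_LIMIT)
			*backoff <<= 1;
	}

Callers start with a backoff value of 1 (BACKOFF_SETUP) and spin after
every failed retry, so contended strands back off the cacheline
exponentially. The reverted pause-based variant instead blocked the
strand by writing the delay to %asr27, which is what degrades
throughput on the M7/M8 platforms.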
The original patch is from Karl.
http://ca-git.us.oracle.com/?p=linux-uek-sparc.git;a=commit;h=f214eebf2223d23a2b1499be5b54719bdd7651e3
All the credit should go to Karl. Rebased it on latest sparc tree.

Orabug: 26306832

Signed-off-by: Karl Volz
Reviewed-by: Atish Patra
Signed-off-by: Henry Willard
Signed-off-by: Babu Moger
Reviewed-by: Karl Volz
Signed-off-by: Allen Pais
---

diff --git a/arch/sparc/include/asm/backoff.h b/arch/sparc/include/asm/backoff.h
index 4e02086b839c..5653a6fc1169 100644
--- a/arch/sparc/include/asm/backoff.h
+++ b/arch/sparc/include/asm/backoff.h
@@ -25,20 +25,9 @@
  * between 40 and 50 cpu cycles.
  *
  * For SPARC-T4 and later we have a special "pause" instruction
- * available.  This is implemented using writes to register %asr27.
- * The cpu will block the number of cycles written into the register,
- * unless a disrupting trap happens first.  SPARC-T4 specifically
- * implements pause with a granularity of 8 cycles.  Each strand has
- * an internal pause counter which decrements every 8 cycles.  So the
- * chip shifts the %asr27 value down by 3 bits, and writes the result
- * into the pause counter.  If a value smaller than 8 is written, the
- * chip blocks for 1 cycle.
+ * available.  NOTE: pause is currently not used due to performance degradation
+ * in M7/M8 platforms.
  *
- * To achieve the same amount of backoff as the three %ccr reads give
- * on earlier chips, we shift the backoff value up by 7 bits.  (Three
- * %ccr reads block for about 128 cycles, 1 << 7 == 128) We write the
- * whole amount we want to block into the pause register, rather than
- * loop writing 128 each time.
  */
 
 #define BACKOFF_LIMIT	(4 * 1024)
 
@@ -51,25 +40,16 @@
 #define BACKOFF_LABEL(spin_label, continue_label) \
 	spin_label
 
-#define BACKOFF_SPIN(reg, tmp, label)	\
-	mov	reg, tmp; \
-88:	rd	%ccr, %g0; \
-	rd	%ccr, %g0; \
-	rd	%ccr, %g0; \
-	.section	.pause_3insn_patch,"ax";\
-	.word	88b; \
-	sllx	tmp, 7, tmp; \
-	wr	tmp, 0, %asr27; \
-	clr	tmp; \
-	.previous; \
-	brnz,pt	tmp, 88b; \
-	 sub	tmp, 1, tmp; \
-	set	BACKOFF_LIMIT, tmp; \
-	cmp	reg, tmp; \
-	bg,pn	%xcc, label; \
-	 nop; \
-	ba,pt	%xcc, label; \
-	 sllx	reg, 1, reg;
+#define BACKOFF_SPIN(reg, tmp, label)	\
+	mov	reg, tmp; \
+88:	brnz,pt	tmp, 88b; \
+	 sub	tmp, 1, tmp; \
+	set	BACKOFF_LIMIT, tmp; \
+	cmp	reg, tmp; \
+	bg,pn	%xcc, label; \
+	 nop; \
+	ba,pt	%xcc, label; \
+	 sllx	reg, 1, reg;
 
 #else
 
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index ebb6fea53b84..5edfa4834448 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -205,25 +205,12 @@ unsigned long get_wchan(struct task_struct *task);
  * To make a long story short, we are trying to yield the current cpu
  * strand during busy loops.
  */
-#ifdef BUILD_VDSO
 #define cpu_relax()	asm volatile("\n99:\n\t"			\
 				     "rd	%%ccr, %%g0\n\t"	\
 				     "rd	%%ccr, %%g0\n\t"	\
 				     "rd	%%ccr, %%g0\n\t"	\
 				     ::: "memory")
-#else /* ! BUILD_VDSO */
-#define cpu_relax()	asm volatile("\n99:\n\t"			\
-				     "rd	%%ccr, %%g0\n\t"	\
-				     "rd	%%ccr, %%g0\n\t"	\
-				     "rd	%%ccr, %%g0\n\t"	\
-				     ".section	.pause_3insn_patch,\"ax\"\n\t"\
-				     ".word	99b\n\t"		\
-				     "wr	%%g0, 128, %%asr27\n\t"	\
-				     "nop\n\t"				\
-				     "nop\n\t"				\
-				     ".previous"			\
-				     ::: "memory")
-#endif
+
 #define cpu_relax_lowlatency() cpu_relax()
 
 /* Prefetch support.  This is tuned for UltraSPARC-III and later.
diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h
index ab30aa21d33f..4f4384460617 100644
--- a/arch/sparc/kernel/entry.h
+++ b/arch/sparc/kernel/entry.h
@@ -58,13 +58,6 @@ struct popc_6insn_patch_entry {
 extern struct popc_6insn_patch_entry __popc_6insn_patch,
 	__popc_6insn_patch_end;
 
-struct pause_patch_entry {
-	unsigned int	addr;
-	unsigned int	insns[3];
-};
-extern struct pause_patch_entry __pause_3insn_patch,
-	__pause_3insn_patch_end;
-
 void sun4v_patch_1insn_range(struct sun4v_1insn_patch_entry *,
 			     struct sun4v_1insn_patch_entry *);
 void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *,
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index 410a09a107d6..a4fddcbfc9e4 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -331,25 +331,6 @@ static void __init popc_patch(void)
 	}
 }
 
-static void __init pause_patch(void)
-{
-	struct pause_patch_entry *p;
-
-	p = &__pause_3insn_patch;
-	while (p < &__pause_3insn_patch_end) {
-		unsigned long i, addr = p->addr;
-
-		for (i = 0; i < 3; i++) {
-			*(unsigned int *) (addr + (i * 4)) = p->insns[i];
-			wmb();
-			__asm__ __volatile__("flush	%0"
-					     : : "r" (addr + (i * 4)));
-		}
-
-		p++;
-	}
-}
-
 void __init start_early_boot(void)
 {
 	int cpu;
@@ -627,8 +608,6 @@ static void __init init_sparc64_elf_hwcap(void)
 
 	if (sparc64_elf_hwcap & AV_SPARC_POPC)
 		popc_patch();
-	if (sparc64_elf_hwcap & AV_SPARC_PAUSE)
-		pause_patch();
 }
 
 void __init alloc_irqstack_bootmem(void)
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index ec931a3b362d..f0f4ea82ec78 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -133,11 +133,6 @@ SECTIONS
 		*(.popc_6insn_patch)
 		__popc_6insn_patch_end = .;
 	}
-	.pause_3insn_patch : {
-		__pause_3insn_patch = .;
-		*(.pause_3insn_patch)
-		__pause_3insn_patch_end = .;
-	}
 	.sun4v_adi_1insn_patch : {
 		__sun4v_adi_1insn_patch = .;
 		*(.sun4v_adi_1insn_patch)