This reverts commit e9b9eb59ffcdee09ec96b040f85c919618f4043e
("sparc64: Use pause instruction when available").
This all started with our TPCC results on UEK4. During T7 testing, the
TPCC results were much lower than on UEK2, and atomic calls such as
atomic_add and atomic_sub were at the top of the perf results. Karl
found that this was caused by upstream commit
e9b9eb59ffcdee09ec96b040f85c919618f4043e
("sparc64: Use pause instruction when available"). After reverting this
commit on UEK4, the TPCC numbers were back to the UEK2 level. However,
things changed after Atish's scheduler fixes on UEK4: the TPCC numbers
improved and the upstream commit ("sparc64: Use pause instruction when
available") no longer seemed to make any difference, so Karl's "revert
pause instruction" patch was removed from UEK4.

Now, with T8 testing, we are seeing the same old behaviour. Atomic
calls like atomic_add and atomic_sub are again at the top of the perf
results. After applying Karl's patch (reverting the pause instruction
patch for atomic backoff), the TPCC numbers improved (about 25% better
than T7) and the atomic calls no longer show up at the top of the perf
results. So, we are adding this patch back. This is a temporary fix;
the long-term solution is still under discussion.

The original patch is from Karl:
http://ca-git.us.oracle.com/?p=linux-uek-sparc.git;a=commit;h=f214eebf2223d23a2b1499be5b54719bdd7651e3
All the credit goes to Karl. Rebased it onto the latest sparc tree.
Orabug: 26306832
Signed-off-by: Karl Volz <karl.volz@Oracle.com>
Reviewed-by: Atish Patra <atish.patra@oracle.com>
Signed-off-by: Henry Willard <henry.willard@oracle.com>
Signed-off-by: Babu Moger <babu.moger@oracle.com>
Reviewed-by: Karl Volz <karl.volz@Oracle.com>
Signed-off-by: Allen Pais <allen.pais@oracle.com>
* between 40 and 50 cpu cycles.
*
* For SPARC-T4 and later we have a special "pause" instruction
- * available. This is implemented using writes to register %asr27.
- * The cpu will block the number of cycles written into the register,
- * unless a disrupting trap happens first. SPARC-T4 specifically
- * implements pause with a granularity of 8 cycles. Each strand has
- * an internal pause counter which decrements every 8 cycles. So the
- * chip shifts the %asr27 value down by 3 bits, and writes the result
- * into the pause counter. If a value smaller than 8 is written, the
- * chip blocks for 1 cycle.
+ * available. NOTE: pause is currently not used due to performance degradation
+ * on M7/M8 platforms.
*
- * To achieve the same amount of backoff as the three %ccr reads give
- * on earlier chips, we shift the backoff value up by 7 bits. (Three
- * %ccr reads block for about 128 cycles, 1 << 7 == 128) We write the
- * whole amount we want to block into the pause register, rather than
- * loop writing 128 each time.
*/
#define BACKOFF_LIMIT (4 * 1024)
#define BACKOFF_LABEL(spin_label, continue_label) \
spin_label
-#define BACKOFF_SPIN(reg, tmp, label) \
- mov reg, tmp; \
-88: rd %ccr, %g0; \
- rd %ccr, %g0; \
- rd %ccr, %g0; \
- .section .pause_3insn_patch,"ax";\
- .word 88b; \
- sllx tmp, 7, tmp; \
- wr tmp, 0, %asr27; \
- clr tmp; \
- .previous; \
- brnz,pt tmp, 88b; \
- sub tmp, 1, tmp; \
- set BACKOFF_LIMIT, tmp; \
- cmp reg, tmp; \
- bg,pn %xcc, label; \
- nop; \
- ba,pt %xcc, label; \
- sllx reg, 1, reg;
+#define BACKOFF_SPIN(reg, tmp, label) \
+ mov reg, tmp; \
+88: brnz,pt tmp, 88b; \
+ sub tmp, 1, tmp; \
+ set BACKOFF_LIMIT, tmp; \
+ cmp reg, tmp; \
+ bg,pn %xcc, label; \
+ nop; \
+ ba,pt %xcc, label; \
+ sllx reg, 1, reg;
#else
* To make a long story short, we are trying to yield the current cpu
* strand during busy loops.
*/
-#ifdef BUILD_VDSO
#define cpu_relax() asm volatile("\n99:\n\t" \
"rd %%ccr, %%g0\n\t" \
"rd %%ccr, %%g0\n\t" \
"rd %%ccr, %%g0\n\t" \
::: "memory")
-#else /* ! BUILD_VDSO */
-#define cpu_relax() asm volatile("\n99:\n\t" \
- "rd %%ccr, %%g0\n\t" \
- "rd %%ccr, %%g0\n\t" \
- "rd %%ccr, %%g0\n\t" \
- ".section .pause_3insn_patch,\"ax\"\n\t"\
- ".word 99b\n\t" \
- "wr %%g0, 128, %%asr27\n\t" \
- "nop\n\t" \
- "nop\n\t" \
- ".previous" \
- ::: "memory")
-#endif
+
#define cpu_relax_lowlatency() cpu_relax()
/* Prefetch support. This is tuned for UltraSPARC-III and later.
extern struct popc_6insn_patch_entry __popc_6insn_patch,
__popc_6insn_patch_end;
-struct pause_patch_entry {
- unsigned int addr;
- unsigned int insns[3];
-};
-extern struct pause_patch_entry __pause_3insn_patch,
- __pause_3insn_patch_end;
-
void sun4v_patch_1insn_range(struct sun4v_1insn_patch_entry *,
struct sun4v_1insn_patch_entry *);
void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *,
}
}
-static void __init pause_patch(void)
-{
- struct pause_patch_entry *p;
-
- p = &__pause_3insn_patch;
- while (p < &__pause_3insn_patch_end) {
- unsigned long i, addr = p->addr;
-
- for (i = 0; i < 3; i++) {
- *(unsigned int *) (addr + (i * 4)) = p->insns[i];
- wmb();
- __asm__ __volatile__("flush %0"
- : : "r" (addr + (i * 4)));
- }
-
- p++;
- }
-}
-
void __init start_early_boot(void)
{
int cpu;
if (sparc64_elf_hwcap & AV_SPARC_POPC)
popc_patch();
- if (sparc64_elf_hwcap & AV_SPARC_PAUSE)
- pause_patch();
}
void __init alloc_irqstack_bootmem(void)
*(.popc_6insn_patch)
__popc_6insn_patch_end = .;
}
- .pause_3insn_patch : {
- __pause_3insn_patch = .;
- *(.pause_3insn_patch)
- __pause_3insn_patch_end = .;
- }
.sun4v_adi_1insn_patch : {
__sun4v_adi_1insn_patch = .;
*(.sun4v_adi_1insn_patch)