diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
        int result;
 
        smp_mb();
+       prefetchw(&v->counter);
 
        __asm__ __volatile__("@ atomic_add_return\n"
 "1:    ldrex   %0, [%3]\n"
        int result;
 
        smp_mb();
+       prefetchw(&v->counter);
 
        __asm__ __volatile__("@ atomic_sub_return\n"
 "1:    ldrex   %0, [%3]\n"
        unsigned long res;
 
        smp_mb();
+       prefetchw(&ptr->counter);
 
        do {
                __asm__ __volatile__("@ atomic_cmpxchg\n"
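Every hunk above (arch/arm/include/asm/atomic.h) has the same shape: before entering the ldrex/strex loop, the target word is prefetched for write so the cache line arrives in a unique (writable) state, rather than being fetched shared by the ldrex and then needing a second coherence transaction before the strex can succeed. A minimal user-space sketch of the same idea, using C11 atomics and the GCC builtin instead of the kernel helpers (the function name is made up for illustration):

#include <stdatomic.h>

static inline int sketch_add_return(int i, _Atomic int *v)
{
	/* Prefetch the target line for write; on an MP-capable ARMv7
	 * core this can be emitted as a PLDW hint. */
	__builtin_prefetch((const void *)v, 1);

	/* On ARM the compiler lowers this to an LDREX/STREX retry loop;
	 * fetch_add returns the old value, so add i for the new one. */
	return atomic_fetch_add_explicit(v, i, memory_order_seq_cst) + i;
}

Without the prefetch, the first strex of a contended loop is more likely to miss or fail, costing an extra coherence round trip on the line.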
        unsigned long tmp;
 
        smp_mb();
+       prefetchw(&v->counter);
 
        __asm__ __volatile__("@ atomic64_add_return\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
        unsigned long tmp;
 
        smp_mb();
+       prefetchw(&v->counter);
 
        __asm__ __volatile__("@ atomic64_sub_return\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
        unsigned long res;
 
        smp_mb();
+       prefetchw(&ptr->counter);
 
        do {
                __asm__ __volatile__("@ atomic64_cmpxchg\n"
        unsigned long tmp;
 
        smp_mb();
+       prefetchw(&ptr->counter);
 
        __asm__ __volatile__("@ atomic64_xchg\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
        unsigned long tmp;
 
        smp_mb();
+       prefetchw(&v->counter);
 
        __asm__ __volatile__("@ atomic64_dec_if_positive\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
        int ret = 1;
 
        smp_mb();
+       prefetchw(&v->counter);
 
        __asm__ __volatile__("@ atomic64_add_unless\n"
 "1:    ldrexd  %0, %H0, [%4]\n"
 
diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
 #define __ASM_ARM_CMPXCHG_H
 
 #include <linux/irqflags.h>
+#include <linux/prefetch.h>
 #include <asm/barrier.h>
 
 #if defined(CONFIG_CPU_SA1100) || defined(CONFIG_CPU_SA110)
 #endif
 
        smp_mb();
+       prefetchw((const void *)ptr);
 
        switch (size) {
 #if __LINUX_ARM_ARCH__ >= 6
 {
        unsigned long oldval, res;
 
+       prefetchw((const void *)ptr);
+
        switch (size) {
 #ifndef CONFIG_CPU_V6  /* min ARCH >= ARMv6K */
        case 1:
        unsigned long long oldval;
        unsigned long res;
 
+       prefetchw(ptr);
+
        __asm__ __volatile__(
 "1:    ldrexd          %1, %H1, [%3]\n"
 "      teq             %1, %4\n"
 
 
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
 #define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg)        \
        smp_mb();                                               \
+       prefetchw(uaddr);                                       \
        __asm__ __volatile__(                                   \
        "1:     ldrex   %1, [%3]\n"                             \
        "       " insn "\n"                                     \
                return -EFAULT;
 
        smp_mb();
+       /* Prefetching cannot fault */
+       prefetchw(uaddr);
        __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
        "1:     ldrex   %1, [%4]\n"
        "       teq     %1, %2\n"
 
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
        add     r1, r1, r0, lsl #2      @ Get word offset
        mov     r3, r2, lsl r3          @ create mask
        smp_dmb
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+       .arch_extension mp
+       ALT_SMP(W(pldw) [r1])
+       ALT_UP(W(nop))
+#endif
 1:     ldrex   r2, [r1]
        ands    r0, r2, r3              @ save old value of bit
        \instr  r2, r2, r3              @ toggle bit
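In the assembly bit operations the prefetch is open-coded instead of calling prefetchw(): .arch_extension mp makes the assembler accept pldw (it belongs to the multiprocessing extensions), and the ALT_SMP()/ALT_UP() pair emits the pldw in SMP kernels while patching in a nop when such a kernel boots on a uniprocessor machine, where the hint buys nothing. The W() wrapper forces the wide 32-bit encoding so both alternatives occupy the same size under Thumb-2, and the #if guard compiles the block out entirely on pre-ARMv7 or non-SMP builds, where pldw is unavailable.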