#endif /* !CONFIG_M386 */
 
 #ifdef CONFIG_X86_CMPXCHG64
-#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)                  \
+#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2)            \
 ({                                                                     \
-       char __ret;                                                     \
-       typeof(o1) __o1 = o1;                                           \
-       typeof(o1) __n1 = n1;                                           \
-       typeof(o2) __o2 = o2;                                           \
-       typeof(o2) __n2 = n2;                                           \
-       typeof(o2) __dummy = n2;                                        \
+       bool __ret;                                                     \
+       typeof(pcp1) __o1 = (o1), __n1 = (n1);                          \
+       typeof(pcp2) __o2 = (o2), __n2 = (n2);                          \
        asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"       \
-                   : "=a"(__ret), "=m" (pcp1), "=d"(__dummy)           \
-                   :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));     \
+                   : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
+                   :  "b" (__n1), "c" (__n2), "a" (__o1));             \
        __ret;                                                          \
 })
 
-#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)                percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)          percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)       percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_4    percpu_cmpxchg8b_double
+#define this_cpu_cmpxchg_double_4      percpu_cmpxchg8b_double
+#define irqsafe_cpu_cmpxchg_double_4   percpu_cmpxchg8b_double
 #endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
  * it in software.  The address used in the cmpxchg16 instruction must be
  * aligned to a 16 byte boundary.
  */
-#ifdef CONFIG_SMP
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
-#else
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
-#endif
-#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)                 \
+#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2)           \
 ({                                                                     \
-       char __ret;                                                     \
-       typeof(o1) __o1 = o1;                                           \
-       typeof(o1) __n1 = n1;                                           \
-       typeof(o2) __o2 = o2;                                           \
-       typeof(o2) __n2 = n2;                                           \
-       typeof(o2) __dummy;                                             \
-       alternative_io(CMPXCHG16B_EMU_CALL,                             \
-                      "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t",  \
+       bool __ret;                                                     \
+       typeof(pcp1) __o1 = (o1), __n1 = (n1);                          \
+       typeof(pcp2) __o2 = (o2), __n2 = (n2);                          \
+       alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
+                      "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
                       X86_FEATURE_CX16,                                \
-                      ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),         \
-                      "S" (&pcp1), "b"(__n1), "c"(__n2),               \
-                      "a"(__o1), "d"(__o2) : "memory");                \
+                      ASM_OUTPUT2("=a" (__ret), "+m" (pcp1),           \
+                                  "+m" (pcp2), "+d" (__o2)),           \
+                      "b" (__n1), "c" (__n2), "a" (__o1) : "rsi");     \
        __ret;                                                          \
 })
 
-#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)                percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)          percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)       percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_8    percpu_cmpxchg16b_double
+#define this_cpu_cmpxchg_double_8      percpu_cmpxchg16b_double
+#define irqsafe_cpu_cmpxchg_double_8   percpu_cmpxchg16b_double
 
 #endif