#define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 
 #undef memmove
 
  */
 ENTRY(memmove)
        CFI_STARTPROC
+
        /* Handle more 32bytes in loop */
        mov %rdi, %rax
        cmp $0x20, %rdx
 
        /* Decide forward/backward copy mode */
        cmp %rdi, %rsi
-       jb      2f
+       jge .Lmemmove_begin_forward
+       mov %rsi, %r8
+       add %rdx, %r8
+       cmp %rdi, %r8
+       jg 2f
 
+.Lmemmove_begin_forward:
        /*
         * movsq instruction have many startup latency
         * so we handle small size by general register.
        rep movsq
        movq %r11, (%r10)
        jmp 13f
+.Lmemmove_end_forward:
+
        /*
         * Handle data backward by movsq.
         */
 13:
        retq
        CFI_ENDPROC
+
+       .section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+       /* Forward moving data. */
+       movq %rdx, %rcx
+       rep movsb
+       retq
+.Lmemmove_end_forward_efs:
+       .previous
+
+       .section .altinstructions,"a"
+       .align 8
+       .quad .Lmemmove_begin_forward
+       .quad .Lmemmove_begin_forward_efs
+       .word X86_FEATURE_ERMS
+       .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+       .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+       .previous
 ENDPROC(memmove)