x86, mem: Optimize memcpy by avoiding memory false dependence

author Ma Ling <ling.ma@intel.com>

Mon, 28 Jun 2010 19:24:25 +0000 (03:24 +0800)

committer H. Peter Anvin <hpa@linux.intel.com>

Mon, 23 Aug 2010 21:56:41 +0000 (14:56 -0700)
author Ma Ling <ling.ma@intel.com>
Mon, 28 Jun 2010 19:24:25 +0000 (03:24 +0800)
committer H. Peter Anvin <hpa@linux.intel.com>
Mon, 23 Aug 2010 21:56:41 +0000 (14:56 -0700)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c

index be424dfcf3654ab76da38d486fceb0c17dad0684..81130d477ee2c48904e0f910b4ab73b303273e20 100644 (file)
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n)
                                  "1" (src),
                                  "2" (dest)
                                 :"memory");
-
         } else {
-
-               if((src + count) < dest)
-                       return memcpy(dest, src, count);
+               if((src + n) < dest)
+                       return memcpy(dest, src, n);
                 else
                         __asm__ __volatile__(
                                 "std\n\t"
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S

index bcbcd1e0f7d57fe4b3972adc24785dc6837386f6..75ef61e35e38aee1cf62b05a48f1add62b148f98 100644 (file)
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
  ENTRY(__memcpy)
  ENTRY(memcpy)
         CFI_STARTPROC
+       movq %rdi, %rax
  
         /*
-        * Put the number of full 64-byte blocks into %ecx.
-        * Tail portion is handled at the end:
+        * Use 32bit CMP here to avoid long NOP padding.
          */
-       movq %rdi, %rax
-       movl %edx, %ecx
-       shrl   $6, %ecx
-       jz .Lhandle_tail
+       cmp  $0x20, %edx
+       jb .Lhandle_tail
  
-       .p2align 4
-.Lloop_64:
         /*
-        * We decrement the loop index here - and the zero-flag is
-        * checked at the end of the loop (instructions inbetween do
-        * not change the zero flag):
+        * We check whether memory false dependence could occur,
+        * then jump to corresponding copy mode.
          */
-       decl %ecx
+       cmp  %dil, %sil
+       jl .Lcopy_backward
+       subl $0x20, %edx
+.Lcopy_forward_loop:
+       subq $0x20,     %rdx
  
         /*
-        * Move in blocks of 4x16 bytes:
+        * Move in blocks of 4x8 bytes:
          */
-       movq 0*8(%rsi),         %r11
-       movq 1*8(%rsi),         %r8
-       movq %r11,              0*8(%rdi)
-       movq %r8,               1*8(%rdi)
-
-       movq 2*8(%rsi),         %r9
-       movq 3*8(%rsi),         %r10
-       movq %r9,               2*8(%rdi)
-       movq %r10,              3*8(%rdi)
-
-       movq 4*8(%rsi),         %r11
-       movq 5*8(%rsi),         %r8
-       movq %r11,              4*8(%rdi)
-       movq %r8,               5*8(%rdi)
-
-       movq 6*8(%rsi),         %r9
-       movq 7*8(%rsi),         %r10
-       movq %r9,               6*8(%rdi)
-       movq %r10,              7*8(%rdi)
-
-       leaq 64(%rsi), %rsi
-       leaq 64(%rdi), %rdi
-
-       jnz  .Lloop_64
+       movq 0*8(%rsi), %r8
+       movq 1*8(%rsi), %r9
+       movq 2*8(%rsi), %r10
+       movq 3*8(%rsi), %r11
+       leaq 4*8(%rsi), %rsi
+
+       movq %r8,       0*8(%rdi)
+       movq %r9,       1*8(%rdi)
+       movq %r10,      2*8(%rdi)
+       movq %r11,      3*8(%rdi)
+       leaq 4*8(%rdi), %rdi
+       jae  .Lcopy_forward_loop
+       addq $0x20,     %rdx
+       jmp  .Lhandle_tail
+
+.Lcopy_backward:
+       /*
+        * Calculate copy position to tail.
+        */
+       addq %rdx,      %rsi
+       addq %rdx,      %rdi
+       subq $0x20,     %rdx
+       /*
+        * At most 3 ALU operations in one cycle,
+        * so append NOPs in the same 16-byte chunk.
+        */
+       .p2align 4
+.Lcopy_backward_loop:
+       subq $0x20,     %rdx
+       movq -1*8(%rsi),        %r8
+       movq -2*8(%rsi),        %r9
+       movq -3*8(%rsi),        %r10
+       movq -4*8(%rsi),        %r11
+       leaq -4*8(%rsi),        %rsi
+       movq %r8,               -1*8(%rdi)
+       movq %r9,               -2*8(%rdi)
+       movq %r10,              -3*8(%rdi)
+       movq %r11,              -4*8(%rdi)
+       leaq -4*8(%rdi),        %rdi
+       jae  .Lcopy_backward_loop
  
+       /*
+        * Calculate copy position to head.
+        */
+       addq $0x20,     %rdx
+       subq %rdx,      %rsi
+       subq %rdx,      %rdi
  .Lhandle_tail:
-       movl %edx, %ecx
-       andl  $63, %ecx
-       shrl   $3, %ecx
-       jz   .Lhandle_7
+       cmpq $16,       %rdx
+       jb   .Lless_16bytes
  
+       /*
+        * Move data from 16 bytes to 31 bytes.
+        */
+       movq 0*8(%rsi), %r8
+       movq 1*8(%rsi), %r9
+       movq -2*8(%rsi, %rdx),  %r10
+       movq -1*8(%rsi, %rdx),  %r11
+       movq %r8,       0*8(%rdi)
+       movq %r9,       1*8(%rdi)
+       movq %r10,      -2*8(%rdi, %rdx)
+       movq %r11,      -1*8(%rdi, %rdx)
+       retq
         .p2align 4
-.Lloop_8:
-       decl %ecx
-       movq (%rsi),            %r8
-       movq %r8,               (%rdi)
-       leaq 8(%rdi),           %rdi
-       leaq 8(%rsi),           %rsi
-       jnz  .Lloop_8
-
-.Lhandle_7:
-       movl %edx, %ecx
-       andl $7, %ecx
-       jz .Lend
+.Lless_16bytes:
+       cmpq $8,        %rdx
+       jb   .Lless_8bytes
+       /*
+        * Move data from 8 bytes to 15 bytes.
+        */
+       movq 0*8(%rsi), %r8
+       movq -1*8(%rsi, %rdx),  %r9
+       movq %r8,       0*8(%rdi)
+       movq %r9,       -1*8(%rdi, %rdx)
+       retq
+       .p2align 4
+.Lless_8bytes:
+       cmpq $4,        %rdx
+       jb   .Lless_3bytes
  
+       /*
+        * Move data from 4 bytes to 7 bytes.
+        */
+       movl (%rsi), %ecx
+       movl -4(%rsi, %rdx), %r8d
+       movl %ecx, (%rdi)
+       movl %r8d, -4(%rdi, %rdx)
+       retq
         .p2align 4
+.Lless_3bytes:
+       cmpl $0, %edx
+       je .Lend
+       /*
+        * Move data from 1 byte to 3 bytes.
+        */
  .Lloop_1:
         movb (%rsi), %r8b
         movb %r8b, (%rdi)
         incq %rdi
         incq %rsi
-       decl %ecx
+       decl %edx
         jnz .Lloop_1
  
  .Lend:
-       ret
+       retq
         CFI_ENDPROC
  ENDPROC(memcpy)
  ENDPROC(__memcpy)
author	Ma Ling <ling.ma@intel.com>
	Mon, 28 Jun 2010 19:24:25 +0000 (03:24 +0800)
committer	H. Peter Anvin <hpa@linux.intel.com>
	Mon, 23 Aug 2010 21:56:41 +0000 (14:56 -0700)
arch/x86/lib/memcpy_32.c		patch \| blob \| history
arch/x86/lib/memcpy_64.S		patch \| blob \| history