extern __wsum csum_partial_copy_generic(const void *src, void *dst,
                                              int len, __wsum sum,
                                              int *src_err, int *dst_err);
+
+#ifdef __powerpc64__
+#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER
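+/*
+ * Copy a block from userspace while checksumming it, reporting any
+ * fault via err_ptr. A typical call looks something like this
+ * (illustrative only, names are placeholders):
+ *
+ *	int err;
+ *	__wsum csum = csum_and_copy_from_user(from, to, len, 0, &err);
+ *	if (err)
+ *		return -EFAULT;
+ */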
+extern __wsum csum_and_copy_from_user(const void __user *src, void *dst,
+                                     int len, __wsum sum, int *err_ptr);
+#else
 /*
  * the same as csum_partial, but copies from src to dst while it
  * checksums.
  */
 #define csum_partial_copy_from_user(src, dst, len, sum, errp)   \
         csum_partial_copy_generic((__force const void *)(src), (dst), (len), (sum), (errp), NULL)
+#endif
 
 #define csum_partial_copy_nocheck(src, dst, len, sum)   \
         csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL)
 
        srdi    r3,r3,32
        blr
 
+
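+/*
+ * The source/dest macros add an exception table entry for the load or
+ * store that immediately follows them, so a fault on a source access
+ * branches to .Lsrc_error and a fault on a destination access branches
+ * to .Ldest_error.
+ */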
+       .macro source
+100:
+       .section __ex_table,"a"
+       .align 3
+       .llong 100b,.Lsrc_error
+       .previous
+       .endm
+
+       .macro dest
+200:
+       .section __ex_table,"a"
+       .align 3
+       .llong 200b,.Ldest_error
+       .previous
+       .endm
+
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating the partial checksum, etc.).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-       addic   r0,r6,0
-       subi    r3,r3,4
-       subi    r4,r4,4
-       srwi.   r6,r5,2
-       beq     3f              /* if we're doing < 4 bytes */
-       andi.   r9,r4,2         /* Align dst to longword boundary */
-       beq+    1f
-81:    lhz     r6,4(r3)        /* do 2 bytes to get aligned */
-       addi    r3,r3,2
+       addic   r0,r6,0                 /* clear carry */
+
+       srdi.   r6,r5,3                 /* less than 8 bytes? */
+       beq     .Lcopy_tail_word
+
+       /*
+        * If only halfword aligned, align to a double word. Since odd
+        * aligned addresses should be rare and they would require more
+        * work to calculate the correct checksum, we ignore that case
+        * and take the potential slowdown of unaligned loads.
+        *
+        * If the source and destination are relatively unaligned we only
+        * align the source. This keeps things simple.
+        */
+       rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
+       beq     .Lcopy_aligned
+
+       li      r7,4
+       sub     r6,r7,r6
+       mtctr   r6
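+       /*
+        * ctr = halfwords to copy before the main loop, eg a source
+        * with (src & 0x7) == 2 copies three halfwords (6 bytes) and
+        * one with (src & 0x7) == 6 copies a single halfword.
+        */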
+
+1:
+source;        lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
-91:    sth     r6,4(r4)
-       addi    r4,r4,2
-       addc    r0,r0,r6
-       srwi.   r6,r5,2         /* # words to do */
-       beq     3f
-1:     mtctr   r6
-82:    lwzu    r6,4(r3)        /* the bdnz has zero overhead, so it should */
-92:    stwu    r6,4(r4)        /* be unnecessary to unroll this loop */
-       adde    r0,r0,r6
-       bdnz    82b
-       andi.   r5,r5,3
-3:     cmpwi   0,r5,2
-       blt+    4f
-83:    lhz     r6,4(r3)
        addi    r3,r3,2
-       subi    r5,r5,2
-93:    sth     r6,4(r4)
+       adde    r0,r0,r6
+dest;  sth     r6,0(r4)
        addi    r4,r4,2
+       bdnz    1b
+
+.Lcopy_aligned:
+       /*
+        * We unroll the loop such that each iteration is 64 bytes with an
+        * entry and exit limb of 64 bytes, meaning a minimum size of
+        * 128 bytes.
+        */
+       srdi.   r6,r5,7
+       beq     .Lcopy_tail_doublewords         /* len < 128 */
+
+       srdi    r6,r5,6
+       subi    r6,r6,1
+       mtctr   r6
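+       /*
+        * ctr = len/64 - 1; the final 64 byte chunk is summed and
+        * stored by the exit limb after the loop.
+        */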
+
+       stdu    r1,-STACKFRAMESIZE(r1)
+       std     r14,STK_REG(r14)(r1)
+       std     r15,STK_REG(r15)(r1)
+       std     r16,STK_REG(r16)(r1)
+
+source;        ld      r6,0(r3)
+source;        ld      r9,8(r3)
+
+source;        ld      r10,16(r3)
+source;        ld      r11,24(r3)
+
+       /*
+        * On POWER6 and POWER7 back to back addes take 2 cycles because of
+        * the XER dependency. This means the fastest this loop can go is
+        * 16 cycles per iteration. The scheduling of the loop below has
+        * been shown to hit this on both POWER6 and POWER7.
+        */
+       .align 5
+2:
+       adde    r0,r0,r6
+source;        ld      r12,32(r3)
+source;        ld      r14,40(r3)
+
+       adde    r0,r0,r9
+source;        ld      r15,48(r3)
+source;        ld      r16,56(r3)
+       addi    r3,r3,64
+
+       adde    r0,r0,r10
+dest;  std     r6,0(r4)
+dest;  std     r9,8(r4)
+
+       adde    r0,r0,r11
+dest;  std     r10,16(r4)
+dest;  std     r11,24(r4)
+
+       adde    r0,r0,r12
+dest;  std     r12,32(r4)
+dest;  std     r14,40(r4)
+
+       adde    r0,r0,r14
+dest;  std     r15,48(r4)
+dest;  std     r16,56(r4)
+       addi    r4,r4,64
+
+       adde    r0,r0,r15
+source;        ld      r6,0(r3)
+source;        ld      r9,8(r3)
+
+       adde    r0,r0,r16
+source;        ld      r10,16(r3)
+source;        ld      r11,24(r3)
+       bdnz    2b
+
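+       /* Exit limb: sum and store the last 64 byte chunk not counted in ctr */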
        adde    r0,r0,r6
-4:     cmpwi   0,r5,1
-       bne+    5f
-84:    lbz     r6,4(r3)
-94:    stb     r6,4(r4)
-       slwi    r6,r6,8         /* Upper byte of word */
+source;        ld      r12,32(r3)
+source;        ld      r14,40(r3)
+
+       adde    r0,r0,r9
+source;        ld      r15,48(r3)
+source;        ld      r16,56(r3)
+       addi    r3,r3,64
+
+       adde    r0,r0,r10
+dest;  std     r6,0(r4)
+dest;  std     r9,8(r4)
+
+       adde    r0,r0,r11
+dest;  std     r10,16(r4)
+dest;  std     r11,24(r4)
+
+       adde    r0,r0,r12
+dest;  std     r12,32(r4)
+dest;  std     r14,40(r4)
+
+       adde    r0,r0,r14
+dest;  std     r15,48(r4)
+dest;  std     r16,56(r4)
+       addi    r4,r4,64
+
+       adde    r0,r0,r15
+       adde    r0,r0,r16
+
+       ld      r14,STK_REG(r14)(r1)
+       ld      r15,STK_REG(r15)(r1)
+       ld      r16,STK_REG(r16)(r1)
+       addi    r1,r1,STACKFRAMESIZE
+
+       andi.   r5,r5,63
+
+.Lcopy_tail_doublewords:               /* Up to 127 bytes to go */
+       srdi.   r6,r5,3
+       beq     .Lcopy_tail_word
+
+       mtctr   r6
+3:
+source;        ld      r6,0(r3)
+       addi    r3,r3,8
        adde    r0,r0,r6
-5:     addze   r3,r0           /* add in final carry (unlikely with 64-bit regs) */
-        rldicl  r4,r3,32,0      /* fold 64 bit value */
-        add     r3,r4,r3
-        srdi    r3,r3,32
-       blr
+dest;  std     r6,0(r4)
+       addi    r4,r4,8
+       bdnz    3b
 
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+       andi.   r5,r5,7
 
-       .globl src_error_1
-src_error_1:
-       li      r6,0
-       subi    r5,r5,2
-95:    sth     r6,4(r4)
+.Lcopy_tail_word:                      /* Up to 7 bytes to go */
+       srdi.   r6,r5,2
+       beq     .Lcopy_tail_halfword
+
+source;        lwz     r6,0(r3)
+       addi    r3,r3,4
+       adde    r0,r0,r6
+dest;  stw     r6,0(r4)
+       addi    r4,r4,4
+       subi    r5,r5,4
+
+.Lcopy_tail_halfword:                  /* Up to 3 bytes to go */
+       srdi.   r6,r5,1
+       beq     .Lcopy_tail_byte
+
+source;        lhz     r6,0(r3)
+       addi    r3,r3,2
+       adde    r0,r0,r6
+dest;  sth     r6,0(r4)
        addi    r4,r4,2
-       srwi.   r6,r5,2
-       beq     3f
-       mtctr   r6
-       .globl src_error_2
-src_error_2:
-       li      r6,0
-96:    stwu    r6,4(r4)
-       bdnz    96b
-3:     andi.   r5,r5,3
-       beq     src_error
-       .globl src_error_3
-src_error_3:
-       li      r6,0
-       mtctr   r5
-       addi    r4,r4,3
-97:    stbu    r6,1(r4)
-       bdnz    97b
-       .globl src_error
-src_error:
+       subi    r5,r5,2
+
+.Lcopy_tail_byte:                      /* Up to 1 byte to go */
+       andi.   r6,r5,1
+       beq     .Lcopy_finish
+
+source;        lbz     r6,0(r3)
+       sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
+       adde    r0,r0,r9
+dest;  stb     r6,0(r4)
+
+.Lcopy_finish:
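+       /*
+        * Fold the 64 bit sum into 32 bits: rotating by 32 and adding
+        * leaves hi + lo, plus any carry out of the low word, in the
+        * upper half, which srdi extracts below.
+        */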
+       addze   r0,r0                   /* add in final carry */
+       rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
+       add     r3,r4,r0
+       srdi    r3,r3,32
+       blr
+
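+/*
+ * Exception fixup targets: report -EFAULT through the error pointer,
+ * if one was supplied. The caller cleans up the destination buffer
+ * and the checksum, as described above.
+ */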
+.Lsrc_error:
        cmpdi   0,r7,0
-       beq     1f
+       beqlr
        li      r6,-EFAULT
        stw     r6,0(r7)
-1:     addze   r3,r0
        blr
 
-       .globl dst_error
-dst_error:
+.Ldest_error:
        cmpdi   0,r8,0
-       beq     1f
+       beqlr
        li      r6,-EFAULT
        stw     r6,0(r8)
-1:     addze   r3,r0
        blr
-
-.section __ex_table,"a"
-       .align  3
-       .llong  81b,src_error_1
-       .llong  91b,dst_error
-       .llong  82b,src_error_2
-       .llong  92b,dst_error
-       .llong  83b,src_error_3
-       .llong  93b,dst_error
-       .llong  84b,src_error_3
-       .llong  94b,dst_error
-       .llong  95b,dst_error
-       .llong  96b,dst_error
-       .llong  97b,dst_error
 
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/checksum.h>
+#include <asm/uaccess.h>
+
+__wsum csum_and_copy_from_user(const void __user *src, void *dst,
+                              int len, __wsum sum, int *err_ptr)
+{
+       unsigned int csum;
+
+       might_sleep();
+
+       *err_ptr = 0;
+
+       if (!len) {
+               csum = 0;
+               goto out;
+       }
+
+       if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) {
+               *err_ptr = -EFAULT;
+               csum = (__force unsigned int)sum;
+               goto out;
+       }
+
+       csum = csum_partial_copy_generic((void __force *)src, dst,
+                                        len, sum, err_ptr, NULL);
+
+       if (unlikely(*err_ptr)) {
+               int missing = __copy_from_user(dst, src, len);
+
+               if (missing) {
+                       memset(dst + len - missing, 0, missing);
+                       *err_ptr = -EFAULT;
+               } else {
+                       *err_ptr = 0;
+               }
+
+               csum = csum_partial(dst, len, sum);
+       }
+
+out:
+       return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_from_user);