From: Allen Pais
Date: Mon, 15 May 2017 13:46:23 +0000 (+0530)
Subject: arch/sparc: Use new misaligned load instructions for memcpy and copy_from_user
X-Git-Tag: v4.1.12-98.0.20170517_2143~3
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=b99f2d11f8dc2bb13c55513757d729d640fbea16;p=users%2Fjedix%2Flinux-maple.git

arch/sparc: Use new misaligned load instructions for memcpy and copy_from_user

Use the new instructions for Load Misaligned Integer and Load Misaligned
Integer to Alternate Space introduced with the M8 architecture.

Decide between the FP and ldm paths based on the following condition:
the FP load/alignaddr logic carries a fixed FP save/restore overhead
regardless of the memcpy length, whereas the overhead of the ldm
instructions grows with the size of the memcpy. In our tests we noticed
that up to lengths of about 4096, the ldm instructions perform
significantly better than the FP alignaddr/load logic. Taking that into
consideration, use the new ldm instructions for lengths of 4096 or less;
for lengths above 4096, continue to use the FP alignaddr/load logic.

Also added a fix for the crypto key corruption noticed while running AES
crypto tests. This is the same problem reported against NG4memcpy, which
commit f4da3628dc7c ("sparc64: Fix FPU register corruption with AES
crypto offload.") fixed. Ported those changes to M8memcpy and verified
the fix.

TODO: The ldmx and ldmxa instructions are hand-encoded as .word
constants for now, since our build servers are not yet updated with the
latest M8 instruction set. Switch back to the assembly mnemonics once
these instructions are available in the toolchain.

Orabug: 25381567

Signed-off-by: Babu Moger
Signed-off-by: Allen Pais

---
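As a reader's aid, here is a minimal C sketch of the length-based
dispatch described above. The names (m8_unaligned_copy, m8_copy_ldm,
m8_copy_fp) are illustrative stand-ins for the two assembly paths, not
symbols from the kernel source:

	#include <stddef.h>
	#include <string.h>

	#define MED_MS_UMAX	4096	/* threshold chosen in this patch */

	/* Placeholder for the .Lunalign_ms_* (ldmx) path. */
	static void m8_copy_ldm(void *dst, const void *src, size_t len)
	{
		memcpy(dst, src, len);
	}

	/* Placeholder for the FP/alignaddr (VIS) path. */
	static void m8_copy_fp(void *dst, const void *src, size_t len)
	{
		memcpy(dst, src, len);
	}

	/*
	 * The ldm path avoids the fixed FP save/restore cost, but each
	 * misaligned load is slower, so it wins only up to about 4096
	 * bytes; beyond that the FP path's fixed setup is amortized.
	 */
	static void m8_unaligned_copy(void *dst, const void *src, size_t len)
	{
		if (len <= MED_MS_UMAX)
			m8_copy_ldm(dst, src, len);
		else
			m8_copy_fp(dst, src, len);
	}

The assembly below makes the same decision with set MED_MS_UMAX, %o3 /
cmp / ble,pt before falling through to VISEntryHalf.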
diff --git a/arch/sparc/lib/M8copy_from_user.S b/arch/sparc/lib/M8copy_from_user.S
index c8344e01174d5..c3e57ebf687ec 100644
--- a/arch/sparc/lib/M8copy_from_user.S
+++ b/arch/sparc/lib/M8copy_from_user.S
@@ -29,6 +29,18 @@
 #define FUNC_NAME		M8copy_from_user
 #define LOAD(type,addr,dest)	type##a [addr] %asi, dest
 
+/*
+ * For the ldmxa instruction, the source address has to be in registers.
+ * See the Oracle SPARC Architecture 2017 specification,
+ * Section 7.89, Load Misaligned Integer to Alternate Space.
+ * So, we need to do this in two instructions:
+ *	mov	offset, %o3
+ *	ldmxa	[%o1 + %o3] %asi, %o4
+ */
+#define LOAD_MS(type,addr,offset,dest)	\
+	mov	offset, %o3;		\
+	EX_LD(.word 0xd98a560b)
+
 #define EX_RETVAL(x)	0
 
 #ifdef __KERNEL__
diff --git a/arch/sparc/lib/M8memcpy.S b/arch/sparc/lib/M8memcpy.S
index 33ec89833fe7d..4bfc7bfcdd12c 100644
--- a/arch/sparc/lib/M8memcpy.S
+++ b/arch/sparc/lib/M8memcpy.S
@@ -91,6 +91,10 @@
 #include
 #include
 
+#if !defined(EX_LD) && !defined(EX_ST)
+#define NON_USER_COPY
+#endif
+
 #ifndef EX_LD
 #define EX_LD(x)	x
 #endif
@@ -113,6 +117,10 @@
 #define LOAD(type,addr,dest)	type [addr], dest
 #endif
 
+#ifndef LOAD_MS
+#define LOAD_MS(type,addr,offset,dest)	EX_LD(.word 0xd98a7400+offset)
+#endif
+
 #ifndef STORE
 #define STORE(type,src,addr)	type src, [addr]
 #endif
@@ -169,6 +177,8 @@
 #define SHORT_LONG	64	/* max copy for short longword-aligned case */
 				/* must be at least 64 */
 #define SMALL_MAX	128
+#define MED_MS_UMAX	4096	/* max copy for medium un-aligned case where */
+				/* ldmx makes a lot of difference */
 #define MED_UMAX	32960	/* max copy for medium un-aligned case */
 #define MED_WMAX	32960	/* max copy for medium word-aligned case */
 #define MED_MAX		32960	/* max copy for medium longword-aligned case */
@@ -801,8 +811,27 @@ FUNC_NAME:
 	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
 .Lunalignsetup:
 .Lunalignrejoin:
+	! At smaller lengths, the new ldm instructions perform better than
+	! the FP logic because of the fixed overhead of the FP save/restore.
+	! That overhead does not depend on the memcpy length, whereas the
+	! overhead of the slower ldmx instruction grows with the memcpy
+	! size. In our experiments, ldm performance starts to converge
+	! with FP performance at lengths of about 4096 and over. Check the
+	! memcpy length here: jump to the mis-aligned logic if the length
+	! is 4096 or less, else continue with the FP logic.
+
+	set	MED_MS_UMAX, %o3
+	cmp	%o2, %o3
+	ble,pt	%ncc, .Lunalign_ms_start
+	 nop
+
+	/* Now let's start the FP/alignaddr logic */
 	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
+#ifdef NON_USER_COPY
+	VISEntryHalfFast(.Lunalign_ms_start)
+#else
 	VISEntryHalf
+#endif
 	mov	%o3, %g1	! restore %g1
@@ -991,8 +1020,60 @@ FUNC_NAME:
 	add	%o0, 8, %o0
 
 .Lunalign_short:
+#ifdef NON_USER_COPY
+	VISExitHalfFast
+#else
 	VISExitHalf
+#endif
 	ba	.Lsmallrest
 	 nop
 
+	! Start using the M8 mis-aligned instructions.
+.Lunalign_ms_start:
+	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
+	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
+	and	%o2, 0x3f, %o2		! residue bytes in %o2
+
+.Lunalign_ms_loop:
+	LOAD_MS(ldmx, %o1, 0, %o4)
+	subcc	%o5, BLOCK_SIZE, %o5
+	EX_ST(STORE(stx, %o4, %o0))
+	LOAD_MS(ldmx, %o1, 8, %o4)
+	EX_ST(STORE(stx, %o4, %o0+8))
+	LOAD_MS(ldmx, %o1, 16, %o4)
+	EX_ST(STORE(stx, %o4, %o0+16))
+	LOAD_MS(ldmx, %o1, 24, %o4)
+	EX_ST(STORE(stx, %o4, %o0+24))
+	LOAD_MS(ldmx, %o1, 32, %o4)
+	EX_ST(STORE(stx, %o4, %o0+32))
+	LOAD_MS(ldmx, %o1, 40, %o4)
+	EX_ST(STORE(stx, %o4, %o0+40))
+	LOAD_MS(ldmx, %o1, 48, %o4)
+	EX_ST(STORE(stx, %o4, %o0+48))
+	LOAD_MS(ldmx, %o1, 56, %o4)
+	add	%o1, BLOCK_SIZE, %o1
+	EX_ST(STORE(stx, %o4, %o0+56))
+	add	%o0, BLOCK_SIZE, %o0
+	bgu,pt	%ncc, .Lunalign_ms_loop
+	 prefetch [%o1 + (5 * BLOCK_SIZE)], 20
+
+	! Handle trailing bytes, 0 to 63
+	! Dest long word aligned, Src not long word aligned
+	cmp	%o2, 15
+	bleu	%ncc, .Lunalign_ms_short
+
+	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
+	and	%o2, 0x7, %o2		! residue bytes in %o2
+
+.Lunalign_ms_by8:
+	subcc	%o5, 8, %o5
+	LOAD_MS(ldmx, %o1, 0, %o4)
+	EX_ST(STORE(stx, %o4, %o0))
+	add	%o1, 8, %o1
+	bgu,pt	%ncc, .Lunalign_ms_by8
+	 add	%o0, 8, %o0
+
+.Lunalign_ms_short:
+	ba	.Lsmallrest
+	 nop
 .size FUNC_NAME, .-FUNC_NAME
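For readers tracing the new .Lunalign_ms_* flow above, a rough C
equivalent of the loop structure follows. It is a sketch only: the
unalign_ms_copy name is made up, 8-byte memcpy calls stand in for the
hardware ldmx misaligned loads and stx stores, and the tail is
simplified to a byte loop where the assembly branches to .Lsmallrest:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define BLOCK_SIZE	64

	static void unalign_ms_copy(char *dst, const char *src, size_t len)
	{
		size_t blocks = len & ~(size_t)0x3f;	/* andn %o2, 0x3f, %o5 */
		size_t i, off;

		len &= 0x3f;				/* and %o2, 0x3f, %o2 */

		/* .Lunalign_ms_loop: one 64-byte block per iteration */
		for (i = 0; i < blocks; i += BLOCK_SIZE)
			for (off = 0; off < BLOCK_SIZE; off += 8) {
				uint64_t v;
				memcpy(&v, src + i + off, 8);	/* ldmx */
				memcpy(dst + i + off, &v, 8);	/* stx  */
			}
		src += blocks;
		dst += blocks;

		/* .Lunalign_ms_by8: 8-byte chunks while > 15 bytes remain */
		if (len > 15) {
			size_t words = len & ~(size_t)0x7;

			len &= 0x7;
			for (i = 0; i < words; i += 8) {
				uint64_t v;
				memcpy(&v, src + i, 8);
				memcpy(dst + i, &v, 8);
			}
			src += words;
			dst += words;
		}

		/* .Lsmallrest, simplified here to a byte loop */
		while (len--)
			*dst++ = *src++;
	}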