crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code
author Ard Biesheuvel <ardb@kernel.org>
Tue, 5 Nov 2024 16:09:03 +0000 (17:09 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
The only remaining user of the fallback implementation of 64x64
polynomial multiplication using 8x8 PMULL instructions is the final
reduction from a 16-byte vector to a 16-bit CRC.

The fallback code is complicated and messy, and this reduction has
little impact on the overall performance, so instead, let's calculate
the final CRC by passing the 16-byte vector to the generic CRC-T10DIF
implementation when running the fallback version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/crct10dif-ce-core.S
arch/arm64/crypto/crct10dif-ce-glue.c
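
As background for the diff below: after this change, the 8x8 PMULL fallback
assembly folds the whole input down to a single 16-byte vector, and the
generic CRC-T10DIF code then reduces that vector to the final CRC. A rough
C sketch of that flow, mirroring the glue change at the end of this diff
(the helper name fold_then_reduce is made up purely for illustration),
might look like this:

    #include <asm/neon.h>           /* kernel_neon_begin()/kernel_neon_end() */
    #include <linux/crc-t10dif.h>   /* crc_t10dif_generic() */
    #include <linux/linkage.h>
    #include <linux/types.h>

    /* Prototype as introduced by this patch: the p8 routine no longer
     * returns the CRC, it writes the folded 16-byte vector to 'out'. */
    asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf,
                                        size_t len, u8 out[16]);

    static u16 fold_then_reduce(u16 crc, const u8 *data, unsigned int len)
    {
            u8 buf[16];

            kernel_neon_begin();
            crc_t10dif_pmull_p8(crc, data, len, buf);  /* fold to 16 bytes */
            kernel_neon_end();

            /* final 16-byte -> 16-bit reduction, done by the generic code */
            return crc_t10dif_generic(0, buf, sizeof(buf));
    }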

index d2acaa2b5a01f00962643be867cb9a83909d71e1..87dd6d46224d8c2e7bb282296e4816cb9752a8df 100644
        init_crc        .req    w0
        buf             .req    x1
        len             .req    x2
-       fold_consts_ptr .req    x3
+       fold_consts_ptr .req    x5
 
        fold_consts     .req    v10
 
-       ad              .req    v14
-
-       k00_16          .req    v15
-       k32_48          .req    v16
-
        t3              .req    v17
        t4              .req    v18
        t5              .req    v19
        t6              .req    v20
        t7              .req    v21
        t8              .req    v22
-       t9              .req    v23
-
-       perm1           .req    v24
-       perm2           .req    v25
-       perm3           .req    v26
-       perm4           .req    v27
-
-       bd1             .req    v28
-       bd2             .req    v29
-       bd3             .req    v30
-       bd4             .req    v31
-
-       .macro          __pmull_init_p64
-       .endm
 
-       .macro          __pmull_pre_p64, bd
-       .endm
-
-       .macro          __pmull_init_p8
-       // k00_16 := 0x0000000000000000_000000000000ffff
-       // k32_48 := 0x00000000ffffffff_0000ffffffffffff
-       movi            k32_48.2d, #0xffffffff
-       mov             k32_48.h[2], k32_48.h[0]
-       ushr            k00_16.2d, k32_48.2d, #32
-
-       // prepare the permutation vectors
-       mov_q           x5, 0x080f0e0d0c0b0a09
-       movi            perm4.8b, #8
-       dup             perm1.2d, x5
-       eor             perm1.16b, perm1.16b, perm4.16b
-       ushr            perm2.2d, perm1.2d, #8
-       ushr            perm3.2d, perm1.2d, #16
-       ushr            perm4.2d, perm1.2d, #24
-       sli             perm2.2d, perm1.2d, #56
-       sli             perm3.2d, perm1.2d, #48
-       sli             perm4.2d, perm1.2d, #40
-
-       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-       movi            bd1.4h, #8, lsl #8
-       orr             bd1.2s, #1, lsl #16
-       orr             bd1.2s, #1, lsl #24
-       zip1            bd1.16b, bd1.16b, bd1.16b
-       zip1            bd1.16b, bd1.16b, bd1.16b
-       .endm
-
-       .macro          __pmull_pre_p8, bd
-       tbl             bd1.16b, {\bd\().16b}, perm1.16b
-       tbl             bd2.16b, {\bd\().16b}, perm2.16b
-       tbl             bd3.16b, {\bd\().16b}, perm3.16b
-       tbl             bd4.16b, {\bd\().16b}, perm4.16b
-       .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
-       ext             t4.8b, ad.8b, ad.8b, #1                 // A1
-       ext             t5.8b, ad.8b, ad.8b, #2                 // A2
-       ext             t6.8b, ad.8b, ad.8b, #3                 // A3
-
-       pmull           t4.8h, t4.8b, fold_consts.8b            // F = A1*B
-       pmull           t8.8h, ad.8b, bd1.8b                    // E = A*B1
-       pmull           t5.8h, t5.8b, fold_consts.8b            // H = A2*B
-       pmull           t7.8h, ad.8b, bd2.8b                    // G = A*B2
-       pmull           t6.8h, t6.8b, fold_consts.8b            // J = A3*B
-       pmull           t9.8h, ad.8b, bd3.8b                    // I = A*B3
-       pmull           t3.8h, ad.8b, bd4.8b                    // K = A*B4
-       b               0f
-
-.L__pmull_p8_core2:
-       tbl             t4.16b, {ad.16b}, perm1.16b             // A1
-       tbl             t5.16b, {ad.16b}, perm2.16b             // A2
-       tbl             t6.16b, {ad.16b}, perm3.16b             // A3
-
-       pmull2          t4.8h, t4.16b, fold_consts.16b          // F = A1*B
-       pmull2          t8.8h, ad.16b, bd1.16b                  // E = A*B1
-       pmull2          t5.8h, t5.16b, fold_consts.16b          // H = A2*B
-       pmull2          t7.8h, ad.16b, bd2.16b                  // G = A*B2
-       pmull2          t6.8h, t6.16b, fold_consts.16b          // J = A3*B
-       pmull2          t9.8h, ad.16b, bd3.16b                  // I = A*B3
-       pmull2          t3.8h, ad.16b, bd4.16b                  // K = A*B4
-
-0:     eor             t4.16b, t4.16b, t8.16b                  // L = E + F
-       eor             t5.16b, t5.16b, t7.16b                  // M = G + H
-       eor             t6.16b, t6.16b, t9.16b                  // N = I + J
-
-       uzp1            t8.2d, t4.2d, t5.2d
-       uzp2            t4.2d, t4.2d, t5.2d
-       uzp1            t7.2d, t6.2d, t3.2d
-       uzp2            t6.2d, t6.2d, t3.2d
-
-       // t4 = (L) (P0 + P1) << 8
-       // t5 = (M) (P2 + P3) << 16
-       eor             t8.16b, t8.16b, t4.16b
-       and             t4.16b, t4.16b, k32_48.16b
-
-       // t6 = (N) (P4 + P5) << 24
-       // t7 = (K) (P6 + P7) << 32
-       eor             t7.16b, t7.16b, t6.16b
-       and             t6.16b, t6.16b, k00_16.16b
-
-       eor             t8.16b, t8.16b, t4.16b
-       eor             t7.16b, t7.16b, t6.16b
-
-       zip2            t5.2d, t8.2d, t4.2d
-       zip1            t4.2d, t8.2d, t4.2d
-       zip2            t3.2d, t7.2d, t6.2d
-       zip1            t6.2d, t7.2d, t6.2d
-
-       ext             t4.16b, t4.16b, t4.16b, #15
-       ext             t5.16b, t5.16b, t5.16b, #14
-       ext             t6.16b, t6.16b, t6.16b, #13
-       ext             t3.16b, t3.16b, t3.16b, #12
-
-       eor             t4.16b, t4.16b, t5.16b
-       eor             t6.16b, t6.16b, t3.16b
-       ret
-SYM_FUNC_END(__pmull_p8_core)
+       perm            .req    v27
 
        .macro          pmull16x64_p64, a16, b64, c64
        pmull2          \c64\().1q, \a16\().2d, \b64\().2d
@@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core)
         */
        .macro          pmull16x64_p8, a16, b64, c64
        ext             t7.16b, \b64\().16b, \b64\().16b, #1
-       tbl             t5.16b, {\a16\().16b}, bd1.16b
+       tbl             t5.16b, {\a16\().16b}, perm.16b
        uzp1            t7.16b, \b64\().16b, t7.16b
        bl              __pmull_p8_16x64
        ext             \b64\().16b, t4.16b, t4.16b, #15
@@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
        ret
 SYM_FUNC_END(__pmull_p8_16x64)
 
-       .macro          __pmull_p8, rq, ad, bd, i
-       .ifnc           \bd, fold_consts
-       .err
-       .endif
-       mov             ad.16b, \ad\().16b
-       .ifb            \i
-       pmull           \rq\().8h, \ad\().8b, \bd\().8b         // D = A*B
-       .else
-       pmull2          \rq\().8h, \ad\().16b, \bd\().16b       // D = A*B
-       .endif
-
-       bl              .L__pmull_p8_core\i
-
-       eor             \rq\().16b, \rq\().16b, t4.16b
-       eor             \rq\().16b, \rq\().16b, t6.16b
-       .endm
 
        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
        // into reg1, reg2.
@@ -340,16 +205,7 @@ CPU_LE(    ext             v12.16b, v12.16b, v12.16b, #8   )
        eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
        .endm
 
-       .macro          __pmull_p64, rd, rn, rm, n
-       .ifb            \n
-       pmull           \rd\().1q, \rn\().1d, \rm\().1d
-       .else
-       pmull2          \rd\().1q, \rn\().2d, \rm\().2d
-       .endif
-       .endm
-
        .macro          crc_t10dif_pmull, p
-       __pmull_init_\p
 
        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
        cmp             len, #256
@@ -479,47 +335,7 @@ CPU_LE(    ext             v0.16b, v0.16b, v0.16b, #8      )
        pmull16x64_\p   fold_consts, v3, v0
        eor             v7.16b, v3.16b, v0.16b
        eor             v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
-       // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-       movi            v2.16b, #0              // init zero register
-
-       // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
-
-       // Fold the high 64 bits into the low 64 bits, while also multiplying by
-       // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
-       // whose low 48 bits are 0.
-       ext             v0.16b, v2.16b, v7.16b, #8
-       __pmull_\p      v7, v7, fold_consts, 2  // high bits * x^48 * (x^80 mod G(x))
-       eor             v0.16b, v0.16b, v7.16b  // + low bits * x^64
-
-       // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
-       // value congruent to x^64 * M(x) and whose low 48 bits are 0.
-       ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
-       mov             v0.s[3], v2.s[0]        // zero high 32 bits
-       __pmull_\p      v1, v1, fold_consts     // high 32 bits * x^48 * (x^48 mod G(x))
-       eor             v0.16b, v0.16b, v1.16b  // + low bits
-
-       // Load G(x) and floor(x^48 / G(x)).
-       ld1             {fold_consts.2d}, [fold_consts_ptr]
-       __pmull_pre_\p  fold_consts
-
-       // Use Barrett reduction to compute the final CRC value.
-       __pmull_\p      v1, v0, fold_consts, 2  // high 32 bits * floor(x^48 / G(x))
-       ushr            v1.2d, v1.2d, #32       // /= x^32
-       __pmull_\p      v1, v1, fold_consts     // *= G(x)
-       ushr            v0.2d, v0.2d, #48
-       eor             v0.16b, v0.16b, v1.16b  // + low 16 nonzero bits
-       // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
-       umov            w0, v0.h[0]
-       .ifc            \p, p8
-       frame_pop
-       .endif
-       ret
+       b               .Lreduce_final_16_bytes_\@
 
 .Lless_than_256_bytes_\@:
        // Checksumming a buffer of length 16...255 bytes
@@ -545,6 +361,8 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
        add             len, len, #16
        b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
        .endm
 
 //
@@ -554,7 +372,22 @@ CPU_LE(    ext             v7.16b, v7.16b, v7.16b, #8      )
 //
 SYM_FUNC_START(crc_t10dif_pmull_p8)
        frame_push      1
+
+       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+       movi            perm.4h, #8, lsl #8
+       orr             perm.2s, #1, lsl #16
+       orr             perm.2s, #1, lsl #24
+       zip1            perm.16b, perm.16b, perm.16b
+       zip1            perm.16b, perm.16b, perm.16b
+
        crc_t10dif_pmull p8
+
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+       str             q7, [x3]
+
+       frame_pop
+       ret
 SYM_FUNC_END(crc_t10dif_pmull_p8)
 
        .align          5
@@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
 //
 SYM_FUNC_START(crc_t10dif_pmull_p64)
        crc_t10dif_pmull        p64
+
+       // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+       movi            v2.16b, #0              // init zero register
+
+       // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+
+       // Fold the high 64 bits into the low 64 bits, while also multiplying by
+       // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+       // whose low 48 bits are 0.
+       ext             v0.16b, v2.16b, v7.16b, #8
+       pmull2          v7.1q, v7.2d, fold_consts.2d    // high bits * x^48 * (x^80 mod G(x))
+       eor             v0.16b, v0.16b, v7.16b          // + low bits * x^64
+
+       // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+       // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+       ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
+       mov             v0.s[3], v2.s[0]                // zero high 32 bits
+       pmull           v1.1q, v1.1d, fold_consts.1d    // high 32 bits * x^48 * (x^48 mod G(x))
+       eor             v0.16b, v0.16b, v1.16b          // + low bits
+
+       // Load G(x) and floor(x^48 / G(x)).
+       ld1             {fold_consts.2d}, [fold_consts_ptr]
+
+       // Use Barrett reduction to compute the final CRC value.
+       pmull2          v1.1q, v0.2d, fold_consts.2d    // high 32 bits * floor(x^48 / G(x))
+       ushr            v1.2d, v1.2d, #32               // /= x^32
+       pmull           v1.1q, v1.1d, fold_consts.1d    // *= G(x)
+       ushr            v0.2d, v0.2d, #48
+       eor             v0.16b, v0.16b, v1.16b          // + low 16 nonzero bits
+       // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+       umov            w0, v0.h[0]
+       ret
 SYM_FUNC_END(crc_t10dif_pmull_p64)
 
        .section        ".rodata", "a"
index 7b05094a048045e89d1822cc9a98ed792c24594c..08bcbd884395f4ffe4eed9fd35e9523767222d26 100644
@@ -20,7 +20,8 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
 
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+                                   u8 out[16]);
 asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
 
 static int crct10dif_init(struct shash_desc *desc)
@@ -34,16 +35,21 @@ static int crct10dif_init(struct shash_desc *desc)
 static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
                            unsigned int length)
 {
-       u16 *crc = shash_desc_ctx(desc);
+       u16 *crcp = shash_desc_ctx(desc);
+       u16 crc = *crcp;
+       u8 buf[16];
 
-       if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+       if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
                kernel_neon_begin();
-               *crc = crc_t10dif_pmull_p8(*crc, data, length);
+               crc_t10dif_pmull_p8(crc, data, length, buf);
                kernel_neon_end();
-       } else {
-               *crc = crc_t10dif_generic(*crc, data, length);
+
+               crc = 0;
+               data = buf;
+               length = sizeof(buf);
        }
 
+       *crcp = crc_t10dif_generic(crc, data, length);
        return 0;
 }
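
For reference, the CRC that crc_t10dif_generic() computes over the folded
16-byte vector is plain CRC-16/T10-DIF: polynomial 0x8BB7, most significant
bit first, initial value 0, no reflection and no final XOR. A minimal
bit-at-a-time sketch of that computation (a reference model only, not the
kernel's table-driven implementation) is:

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
    {
            while (len--) {
                    crc ^= (uint16_t)*buf++ << 8;      /* feed next byte, MSB first */
                    for (int i = 0; i < 8; i++)        /* one polynomial step per bit */
                            crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
                                                 : crc << 1;
            }
            return crc;
    }

    /* e.g. crc_t10dif_ref(0, buf, 16) plays the same role as the
     * crc_t10dif_generic(crc, data, length) call in the glue code above,
     * with crc cleared and data pointing at the 16-byte folded buffer. */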