www.infradead.org Git - users/dwmw2/linux.git/commitdiff
crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply
author Ard Biesheuvel <ardb@kernel.org>
Tue, 5 Nov 2024 16:09:02 +0000 (17:09 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
The CRC-T10DIF implementation for arm64 has a version that uses 8x8
polynomial multiplication, for cores that lack the crypto extensions,
which provide the 64x64 polynomial multiplication (PMULL64) instruction
that the algorithm was built around.

This fallback version rather naively adopted the 64x64 polynomial
multiplication algorithm that I ported from ARM for the GHASH driver,
which needs 8 PMULL8 instructions to implement one PMULL64. This is
reasonable, given that each 8-bit vector element needs to be multiplied
with each element in the other vector, producing 8 vectors with partial
results that need to be combined to yield the correct result.
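
As an illustration (a stand-alone user-space C model, not code from this
patch), the decomposition can be checked by rebuilding a 64x64 carry-less
multiply from 8x8 byte products, one partial-product rank per byte offset,
against a direct bitwise reference:

#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

/* Bitwise carry-less (GF(2) polynomial) multiply used as the reference. */
static u128 clmul(u128 a, uint64_t b)
{
	u128 r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		a <<= 1;
		b >>= 1;
	}
	return r;
}

int main(void)
{
	uint64_t a = 0x0123456789abcdefULL;	/* arbitrary test inputs */
	uint64_t b = 0xfedcba9876543210ULL;
	u128 ref = clmul(a, b);			/* what PMULL64 would compute */
	u128 acc = 0;
	int i, j;

	/*
	 * Rebuild the product from 8x8 byte multiplies: byte i of a times
	 * byte j of b lands at bit offset 8 * (i + j), so every byte of a
	 * meets every byte of b.  The NEON fallback groups these 64 byte
	 * products into 8 PMULL8 vector multiplies plus a combining step.
	 */
	for (i = 0; i < 8; i++)
		for (j = 0; j < 8; j++) {
			uint8_t ai = a >> (8 * i);
			uint8_t bj = b >> (8 * j);

			acc ^= clmul(ai, bj) << (8 * (i + j));
		}

	assert(acc == ref);
	return 0;
}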

However, most PMULL64 invocations in the CRC-T10DIF code involve
multiplication by a pair of 16-bit folding coefficients, and so all the
partial results from higher order bytes will be zero, and there is no
need to calculate them to begin with.

Then, the CRC-T10DIF algorithm always XORs the output values of the
PMULL64 instructions being issued in pairs, and so there is no need to
faithfully implement each individual PMULL64 instruction, as long as
XORing the results pairwise produces the expected result.
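
Combining the two observations, a second sketch in the same style (again a
hypothetical user-space model, with made-up coefficient values rather than
the real CRC-T10DIF fold constants) shows that the folding step only needs
the XOR of the paired products, and that only the two low byte rows of each
product are ever non-zero:

#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

/* Bitwise carry-less (GF(2) polynomial) multiply used as the reference. */
static u128 clmul(u128 a, uint64_t b)
{
	u128 r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		a <<= 1;
		b >>= 1;
	}
	return r;
}

int main(void)
{
	uint16_t k0 = 0x2d56, k1 = 0x06df;	/* made-up 16-bit fold coefficients */
	uint64_t x0 = 0x0123456789abcdefULL;	/* halves of a 128-bit data vector */
	uint64_t x1 = 0xfedcba9876543210ULL;
	int i, j;

	/* What the PMULL64 pair followed by an XOR would produce. */
	u128 ref = clmul(x0, k0) ^ clmul(x1, k1);

	/*
	 * The same value from 8x8 byte products: since k0 and k1 only occupy
	 * bytes 0 and 1, rows i >= 2 of each product are zero and are never
	 * computed.  Both products go straight into one accumulator, because
	 * the algorithm never looks at them individually.
	 */
	u128 acc = 0;

	for (i = 0; i < 2; i++)
		for (j = 0; j < 8; j++) {
			acc ^= clmul((uint8_t)(x0 >> (8 * j)),
				     (uint8_t)(k0 >> (8 * i))) << (8 * (i + j));
			acc ^= clmul((uint8_t)(x1 >> (8 * j)),
				     (uint8_t)(k1 >> (8 * i))) << (8 * (i + j));
		}

	assert(acc == ref);
	return 0;
}

This is the freedom the new pmull16x64_p8 macro exploits: the two outputs it
produces are not the individual PMULL64 results, but their XOR is the same.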

Implementing these improvements results in a speedup of 3.3x on low-end
platforms such as the Raspberry Pi 4 (Cortex-A72).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/crct10dif-ce-core.S

index 5604de61d06d04eeac8fb616859098b53a8cedd8..d2acaa2b5a01f00962643be867cb9a83909d71e1 100644
@@ -1,8 +1,11 @@
 //
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+//          Eric Biggers <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             perm4.2d, perm1.2d, #40
+
+       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+       movi            bd1.4h, #8, lsl #8
+       orr             bd1.2s, #1, lsl #16
+       orr             bd1.2s, #1, lsl #24
+       zip1            bd1.16b, bd1.16b, bd1.16b
+       zip1            bd1.16b, bd1.16b, bd1.16b
        .endm
 
        .macro          __pmull_pre_p8, bd
@@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
        ret
 SYM_FUNC_END(__pmull_p8_core)
 
+       .macro          pmull16x64_p64, a16, b64, c64
+       pmull2          \c64\().1q, \a16\().2d, \b64\().2d
+       pmull           \b64\().1q, \a16\().1d, \b64\().1d
+       .endm
+
+       /*
+        * Pairwise long polynomial multiplication of two 16-bit values
+        *
+        *   { w0, w1 }, { y0, y1 }
+        *
+        * by two 64-bit values
+        *
+        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+        *
+        * where each vector element is a byte, ordered from least to most
+        * significant.
+        *
+        * This can be implemented using 8x8 long polynomial multiplication, by
+        * reorganizing the input so that each pairwise 8x8 multiplication
+        * produces one of the terms from the decomposition below, and
+        * combining the results of each rank and shifting them into place.
+        *
+        * Rank
+        *  0            w0*x0 ^              |        y0*z0 ^
+        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+        *  8            w1*x7      << 64     |        y1*z7      << 64
+        *
+        * The inputs can be reorganized into
+        *
+        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+        *
+        * and after performing 8x8->16 bit long polynomial multiplication of
+        * each of the halves of the first vector with those of the second one,
+        * we obtain the following four vectors of 16-bit elements:
+        *
+        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+        *
+        * Results b and c can be XORed together, as the vector elements have
+        * matching ranks. Then, the final XOR (*) can be pulled forward, and
+        * applied between the halves of each of the remaining three vectors,
+        * which are then shifted into place, and combined to produce two
+        * 80-bit results.
+        *
+        * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+        * to the 64x64 bit one above, but XOR'ing the outputs together will
+        * produce the expected result, and this is sufficient in the context of
+        * this algorithm.
+        */
+       .macro          pmull16x64_p8, a16, b64, c64
+       ext             t7.16b, \b64\().16b, \b64\().16b, #1
+       tbl             t5.16b, {\a16\().16b}, bd1.16b
+       uzp1            t7.16b, \b64\().16b, t7.16b
+       bl              __pmull_p8_16x64
+       ext             \b64\().16b, t4.16b, t4.16b, #15
+       eor             \c64\().16b, t8.16b, t5.16b
+       .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+       ext             t6.16b, t5.16b, t5.16b, #8
+
+       pmull           t3.8h, t7.8b, t5.8b
+       pmull           t4.8h, t7.8b, t6.8b
+       pmull2          t5.8h, t7.16b, t5.16b
+       pmull2          t6.8h, t7.16b, t6.16b
+
+       ext             t8.16b, t3.16b, t3.16b, #8
+       eor             t4.16b, t4.16b, t6.16b
+       ext             t7.16b, t5.16b, t5.16b, #8
+       ext             t6.16b, t4.16b, t4.16b, #8
+       eor             t8.8b, t8.8b, t3.8b
+       eor             t5.8b, t5.8b, t7.8b
+       eor             t4.8b, t4.8b, t6.8b
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
        .macro          __pmull_p8, rq, ad, bd, i
        .ifnc           \bd, fold_consts
        .err
@@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core)
        .macro          fold_32_bytes, p, reg1, reg2
        ldp             q11, q12, [buf], #0x20
 
-       __pmull_\p      v8, \reg1, fold_consts, 2
-       __pmull_\p      \reg1, \reg1, fold_consts
+       pmull16x64_\p   fold_consts, \reg1, v8
 
 CPU_LE(        rev64           v11.16b, v11.16b                )
 CPU_LE(        rev64           v12.16b, v12.16b                )
 
-       __pmull_\p      v9, \reg2, fold_consts, 2
-       __pmull_\p      \reg2, \reg2, fold_consts
+       pmull16x64_\p   fold_consts, \reg2, v9
 
 CPU_LE(        ext             v11.16b, v11.16b, v11.16b, #8   )
 CPU_LE(        ext             v12.16b, v12.16b, v12.16b, #8   )
@@ -238,11 +332,9 @@ CPU_LE(    ext             v12.16b, v12.16b, v12.16b, #8   )
 
        // Fold src_reg into dst_reg, optionally loading the next fold constants
        .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-       __pmull_\p      v8, \src_reg, fold_consts
-       __pmull_\p      \src_reg, \src_reg, fold_consts, 2
+       pmull16x64_\p   fold_consts, \src_reg, v8
        .ifnb           \load_next_consts
        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
        .endif
        eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
        eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
@@ -296,7 +388,6 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
 
        // Load the constants for folding across 128 bytes.
        ld1             {fold_consts.2d}, [fold_consts_ptr]
-       __pmull_pre_\p  fold_consts
 
        // Subtract 128 for the 128 data bytes just consumed.  Subtract another
        // 128 to simplify the termination condition of the following loop.
@@ -318,7 +409,6 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        // Fold across 64 bytes.
        add             fold_consts_ptr, fold_consts_ptr, #16
        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
        fold_16_bytes   \p, v0, v4
        fold_16_bytes   \p, v1, v5
        fold_16_bytes   \p, v2, v6
@@ -339,8 +429,7 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        // into them, storing the result back into v7.
        b.lt            .Lfold_16_bytes_loop_done_\@
 .Lfold_16_bytes_loop_\@:
-       __pmull_\p      v8, v7, fold_consts
-       __pmull_\p      v7, v7, fold_consts, 2
+       pmull16x64_\p   fold_consts, v7, v8
        eor             v7.16b, v7.16b, v8.16b
        ldr             q0, [buf], #16
 CPU_LE(        rev64           v0.16b, v0.16b                  )
@@ -387,9 +476,8 @@ CPU_LE(     ext             v0.16b, v0.16b, v0.16b, #8      )
        bsl             v2.16b, v1.16b, v0.16b
 
        // Fold the first chunk into the second chunk, storing the result in v7.
-       __pmull_\p      v0, v3, fold_consts
-       __pmull_\p      v7, v3, fold_consts, 2
-       eor             v7.16b, v7.16b, v0.16b
+       pmull16x64_\p   fold_consts, v3, v0
+       eor             v7.16b, v3.16b, v0.16b
        eor             v7.16b, v7.16b, v2.16b
 
 .Lreduce_final_16_bytes_\@:
@@ -450,7 +538,6 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
 
        // Load the fold-across-16-bytes constants.
        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
 
        cmp             len, #16
        b.eq            .Lreduce_final_16_bytes_\@      // len == 16