crypto: arm/crct10dif - Macroify PMULL asm code
author     Ard Biesheuvel <ardb@kernel.org>
           Tue, 5 Nov 2024 16:09:05 +0000 (17:09 +0100)
committer  Herbert Xu <herbert@gondor.apana.org.au>
           Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
To allow an alternative version of the PMULL-based CRC-T10DIF algorithm
to be created, turn the bulk of it into a macro, except for the final
reduction, which will only be used by the existing version.

Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm/crypto/crct10dif-ce-core.S
arch/arm/crypto/crct10dif-ce-glue.c
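
For context: CRC-T10DIF is the 16-bit CRC used for T10 Data Integrity
Field protection tags. A minimal bitwise reference, a sketch for
illustration rather than code from this commit, of the function the
PMULL routine accelerates:

	#include <stdint.h>
	#include <stddef.h>

	/* Bitwise CRC-T10DIF reference: polynomial 0x8bb7 (implicit x^16),
	 * MSB first, zero initial value. The PMULL code computes the same
	 * function by folding 128 bytes per iteration with carryless
	 * multiplies. */
	static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
	{
		while (len--) {
			crc ^= (uint16_t)(*buf++) << 8;
			for (int i = 0; i < 8; i++)
				crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8bb7)
						     : (uint16_t)(crc << 1);
		}
		return crc;
	}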

diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index 4dac32e020defd6ed1bc729a7def499967b1d31a..6b72167574b2f85cf01978eafcc3f441cb842392 100644
        FOLD_CONST_L    .req    q10l
        FOLD_CONST_H    .req    q10h
 
+	.macro		pmull16x64_p64, v16, v64
+       vmull.p64       q11, \v64\()l, \v16\()_L
+       vmull.p64       \v64, \v64\()h, \v16\()_H
+       veor            \v64, \v64, q11
+       .endm
+
        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
        // into reg1, reg2.
-       .macro          fold_32_bytes, reg1, reg2
-       vld1.64         {q11-q12}, [buf]!
+       .macro          fold_32_bytes, reg1, reg2, p
+       vld1.64         {q8-q9}, [buf]!
 
-       vmull.p64       q8, \reg1\()h, FOLD_CONST_H
-       vmull.p64       \reg1, \reg1\()l, FOLD_CONST_L
-       vmull.p64       q9, \reg2\()h, FOLD_CONST_H
-       vmull.p64       \reg2, \reg2\()l, FOLD_CONST_L
+       pmull16x64_\p   FOLD_CONST, \reg1
+       pmull16x64_\p   FOLD_CONST, \reg2
 
-CPU_LE(        vrev64.8        q11, q11        )
-CPU_LE(        vrev64.8        q12, q12        )
-       vswp            q11l, q11h
-       vswp            q12l, q12h
+CPU_LE(        vrev64.8        q8, q8  )
+CPU_LE(        vrev64.8        q9, q9  )
+       vswp            q8l, q8h
+       vswp            q9l, q9h
 
        veor.8          \reg1, \reg1, q8
        veor.8          \reg2, \reg2, q9
-       veor.8          \reg1, \reg1, q11
-       veor.8          \reg2, \reg2, q12
        .endm
 
        // Fold src_reg into dst_reg, optionally loading the next fold constants
-       .macro          fold_16_bytes, src_reg, dst_reg, load_next_consts
-       vmull.p64       q8, \src_reg\()l, FOLD_CONST_L
-       vmull.p64       \src_reg, \src_reg\()h, FOLD_CONST_H
+       .macro          fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+       pmull16x64_\p   FOLD_CONST, \src_reg
        .ifnb           \load_next_consts
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
        .endif
-       veor.8          \dst_reg, \dst_reg, q8
        veor.8          \dst_reg, \dst_reg, \src_reg
        .endm
 
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull)
-
+       .macro          crct10dif, p
        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
        cmp             len, #256
-       blt             .Lless_than_256_bytes
+       blt             .Lless_than_256_bytes\@
 
        mov_l           fold_consts_ptr, .Lfold_across_128_bytes_consts
 
@@ -194,27 +188,27 @@ CPU_LE(   vrev64.8        q7, q7  )
 
        // While >= 128 data bytes remain (not counting q0-q7), fold the 128
        // bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop:
-       fold_32_bytes   q0, q1
-       fold_32_bytes   q2, q3
-       fold_32_bytes   q4, q5
-       fold_32_bytes   q6, q7
+.Lfold_128_bytes_loop\@:
+       fold_32_bytes   q0, q1, \p
+       fold_32_bytes   q2, q3, \p
+       fold_32_bytes   q4, q5, \p
+       fold_32_bytes   q6, q7, \p
        subs            len, len, #128
-       bge             .Lfold_128_bytes_loop
+       bge             .Lfold_128_bytes_loop\@
 
        // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
 
        // Fold across 64 bytes.
        vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-       fold_16_bytes   q0, q4
-       fold_16_bytes   q1, q5
-       fold_16_bytes   q2, q6
-       fold_16_bytes   q3, q7, 1
+       fold_16_bytes   q0, q4, \p
+       fold_16_bytes   q1, q5, \p
+       fold_16_bytes   q2, q6, \p
+       fold_16_bytes   q3, q7, \p, 1
        // Fold across 32 bytes.
-       fold_16_bytes   q4, q6
-       fold_16_bytes   q5, q7, 1
+       fold_16_bytes   q4, q6, \p
+       fold_16_bytes   q5, q7, \p, 1
        // Fold across 16 bytes.
-       fold_16_bytes   q6, q7
+       fold_16_bytes   q6, q7, \p
 
        // Add 128 to get the correct number of data bytes remaining in 0...127
        // (not counting q7), following the previous extra subtraction by 128.
@@ -224,25 +218,23 @@ CPU_LE(   vrev64.8        q7, q7  )
 
        // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
        // into them, storing the result back into q7.
-       blt             .Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
-       vmull.p64       q8, q7l, FOLD_CONST_L
-       vmull.p64       q7, q7h, FOLD_CONST_H
-       veor.8          q7, q7, q8
+       blt             .Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+       pmull16x64_\p   FOLD_CONST, q7
        vld1.64         {q0}, [buf]!
 CPU_LE(        vrev64.8        q0, q0  )
        vswp            q0l, q0h
        veor.8          q7, q7, q0
        subs            len, len, #16
-       bge             .Lfold_16_bytes_loop
+       bge             .Lfold_16_bytes_loop\@
 
-.Lfold_16_bytes_loop_done:
+.Lfold_16_bytes_loop_done\@:
        // Add 16 to get the correct number of data bytes remaining in 0...15
        // (not counting q7), following the previous extra subtraction by 16.
        adds            len, len, #16
-       beq             .Lreduce_final_16_bytes
+       beq             .Lreduce_final_16_bytes\@
 
-.Lhandle_partial_segment:
+.Lhandle_partial_segment\@:
        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
        // 16 bytes are in q7 and the rest are the remaining data in 'buf'.  To
        // do this without needing a fold constant for each possible 'len',
@@ -277,12 +269,46 @@ CPU_LE(   vrev64.8        q0, q0  )
        vbsl.8          q2, q1, q0
 
        // Fold the first chunk into the second chunk, storing the result in q7.
-       vmull.p64       q0, q3l, FOLD_CONST_L
-       vmull.p64       q7, q3h, FOLD_CONST_H
-       veor.8          q7, q7, q0
-       veor.8          q7, q7, q2
+       pmull16x64_\p   FOLD_CONST, q3
+       veor.8          q7, q3, q2
+       b               .Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+       // Checksumming a buffer of length 16...255 bytes
+
+       mov_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+       // Load the first 16 data bytes.
+       vld1.64         {q7}, [buf]!
+CPU_LE(        vrev64.8        q7, q7  )
+       vswp            q7l, q7h
+
+       // XOR the first 16 data *bits* with the initial CRC value.
+       vmov.i8         q0h, #0
+       vmov.u16        q0h[3], init_crc
+       veor.8          q7h, q7h, q0h
+
+       // Load the fold-across-16-bytes constants.
+       vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+       cmp             len, #16
+       beq             .Lreduce_final_16_bytes\@       // len == 16
+       subs            len, len, #32
+       addlt           len, len, #16
+       blt             .Lhandle_partial_segment\@      // 17 <= len <= 31
+       b               .Lfold_16_bytes_loop\@          // 32 <= len <= 255
+
+.Lreduce_final_16_bytes\@:
+       .endm
+
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+       crct10dif       p64
 
-.Lreduce_final_16_bytes:
        // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
 
        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -316,31 +342,7 @@ CPU_LE(    vrev64.8        q0, q0  )
        vmov.u16        r0, q0l[0]
        bx              lr
 
-.Lless_than_256_bytes:
-       // Checksumming a buffer of length 16...255 bytes
-
-       mov_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
-
-       // Load the first 16 data bytes.
-       vld1.64         {q7}, [buf]!
-CPU_LE(        vrev64.8        q7, q7  )
-       vswp            q7l, q7h
-
-       // XOR the first 16 data *bits* with the initial CRC value.
-       vmov.i8         q0h, #0
-       vmov.u16        q0h[3], init_crc
-       veor.8          q7h, q7h, q0h
-
-       // Load the fold-across-16-bytes constants.
-       vld1.64         {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
-       cmp             len, #16
-       beq             .Lreduce_final_16_bytes         // len == 16
-       subs            len, len, #32
-       addlt           len, len, #16
-       blt             .Lhandle_partial_segment        // 17 <= len <= 31
-       b               .Lfold_16_bytes_loop            // 32 <= len <= 255
-ENDPROC(crc_t10dif_pmull)
+ENDPROC(crc_t10dif_pmull64)
 
        .section        ".rodata", "a"
        .align          4
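
The pmull16x64_p64 macro above isolates the only PMULL-specific step:
folding a 128-bit register by multiplying its two 64-bit halves against
the _L and _H fold constants and XORing the products together. (The \@
suffixes are the gas pseudo-variable that expands to a unique count per
macro execution, keeping the branch labels local to each instantiation
of the crct10dif macro.) A plain C sketch of the primitive that a
single vmull.p64 performs, for illustration only:

	#include <stdint.h>

	/* 64x64 -> 128 bit carryless (GF(2) polynomial) multiplication,
	 * the operation performed by one vmull.p64 instruction. */
	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
	{
		uint64_t h = 0, l = 0;

		for (int i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				l ^= a << i;
				if (i)	/* avoid the undefined a >> 64 */
					h ^= a >> (64 - i);
			}
		}
		*hi = h;
		*lo = l;
	}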
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index 79f3b204d8c03bb4630988e94fb4e6ab0ed9251e..60aa79c2fcdb63f7ab0cf91286754d734274e6f6 100644
@@ -19,7 +19,7 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
 
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -29,14 +29,14 @@ static int crct10dif_init(struct shash_desc *desc)
        return 0;
 }
 
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
-                           unsigned int length)
+static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
+                              unsigned int length)
 {
        u16 *crc = shash_desc_ctx(desc);
 
        if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
                kernel_neon_begin();
-               *crc = crc_t10dif_pmull(*crc, data, length);
+               *crc = crc_t10dif_pmull64(*crc, data, length);
                kernel_neon_end();
        } else {
                *crc = crc_t10dif_generic(*crc, data, length);
@@ -56,7 +56,7 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
 static struct shash_alg crc_t10dif_alg = {
        .digestsize             = CRC_T10DIF_DIGEST_SIZE,
        .init                   = crct10dif_init,
-       .update                 = crct10dif_update,
+       .update                 = crct10dif_update_ce,
        .final                  = crct10dif_final,
        .descsize               = CRC_T10DIF_DIGEST_SIZE,