crypto: arm/crct10dif - Implement plain NEON variant

author Ard Biesheuvel <ardb@kernel.org>

Tue, 5 Nov 2024 16:09:06 +0000 (17:09 +0100)

committer Herbert Xu <herbert@gondor.apana.org.au>

Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
author Ard Biesheuvel <ardb@kernel.org>
Tue, 5 Nov 2024 16:09:06 +0000 (17:09 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S

index 6b72167574b2f85cf01978eafcc3f441cb842392..2bbf2df9c1e2fe71e110fa98e3c1f07a23c6f25c 100644 (file)
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -112,6 +112,82 @@
         FOLD_CONST_L    .req    q10l
         FOLD_CONST_H    .req    q10h
  
+       /*
+        * Pairwise long polynomial multiplication of two 16-bit values
+        *
+        *   { w0, w1 }, { y0, y1 }
+        *
+        * by two 64-bit values
+        *
+        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+        *
+        * where each vector element is a byte, ordered from least to most
+        * significant. The resulting 80-bit vectors are XOR'ed together.
+        *
+        * This can be implemented using 8x8 long polynomial multiplication, by
+        * reorganizing the input so that each pairwise 8x8 multiplication
+        * produces one of the terms from the decomposition below, and
+        * combining the results of each rank and shifting them into place.
+        *
+        * Rank
+        *  0            w0*x0 ^              |        y0*z0 ^
+        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+        *  8            w1*x7      << 64     |        y1*z7      << 64
+        *
+        * The inputs can be reorganized into
+        *
+        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+        *
+        * and after performing 8x8->16 bit long polynomial multiplication of
+        * each of the halves of the first vector with those of the second one,
+        * we obtain the following four vectors of 16-bit elements:
+        *
+        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+        *
+        * Results b and c can be XORed together, as the vector elements have
+        * matching ranks. Then, the final XOR can be pulled forward, and
+        * applied between the halves of each of the remaining three vectors,
+        * which are then shifted into place, and XORed together to produce the
+        * final 80-bit result.
+        */
+        .macro         pmull16x64_p8, v16, v64
+       vext.8          q11, \v64, \v64, #1
+       vld1.64         {q12}, [r4, :128]
+       vuzp.8          q11, \v64
+       vtbl.8          d24, {\v16\()_L-\v16\()_H}, d24
+       vtbl.8          d25, {\v16\()_L-\v16\()_H}, d25
+       bl              __pmull16x64_p8
+       veor            \v64, q12, q14
+        .endm
+
+__pmull16x64_p8:
+       vmull.p8        q13, d23, d24
+       vmull.p8        q14, d23, d25
+       vmull.p8        q15, d22, d24
+       vmull.p8        q12, d22, d25
+
+       veor            q14, q14, q15
+       veor            d24, d24, d25
+       veor            d26, d26, d27
+       veor            d28, d28, d29
+       vmov.i32        d25, #0
+       vmov.i32        d29, #0
+       vext.8          q12, q12, q12, #14
+       vext.8          q14, q14, q14, #15
+       veor            d24, d24, d26
+       bx              lr
+ENDPROC(__pmull16x64_p8)
+
          .macro         pmull16x64_p64, v16, v64
         vmull.p64       q11, \v64\()l, \v16\()_L
         vmull.p64       \v64, \v64\()h, \v16\()_H
@@ -249,9 +325,9 @@ CPU_LE(     vrev64.8        q0, q0  )
         vswp            q0l, q0h
  
         // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
-       mov_l           r3, .Lbyteshift_table + 16
-       sub             r3, r3, len
-       vld1.8          {q2}, [r3]
+       mov_l           r1, .Lbyteshift_table + 16
+       sub             r1, r1, len
+       vld1.8          {q2}, [r1]
         vtbl.8          q1l, {q7l-q7h}, q2l
         vtbl.8          q1h, {q7l-q7h}, q2h
  
@@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64)
  
         vmov.u16        r0, q0l[0]
         bx              lr
-
  ENDPROC(crc_t10dif_pmull64)
  
+ENTRY(crc_t10dif_pmull8)
+       push            {r4, lr}
+       mov_l           r4, .L16x64perm
+
+       crct10dif       p8
+
+CPU_LE(        vrev64.8        q7, q7  )
+       vswp            q7l, q7h
+       vst1.64         {q7}, [r3, :128]
+       pop             {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
         .section        ".rodata", "a"
         .align          4
  
@@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64)
         .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
+
+.L16x64perm:
+       .quad           0x808080800000000, 0x909090901010101
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c

index 60aa79c2fcdb63f7ab0cf91286754d734274e6f6..a8b74523729ee759cb41775886e3fa51edc50b07 100644 (file)
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -20,6 +20,8 @@
  #define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
  
  asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+                                 u8 out[16]);
  
  static int crct10dif_init(struct shash_desc *desc)
  {
@@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
         return 0;
  }
  
+static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
+                                unsigned int length)
+{
+       u16 *crcp = shash_desc_ctx(desc);
+       u8 buf[16] __aligned(16);
+       u16 crc = *crcp;
+
+       if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+               kernel_neon_begin();
+               crc_t10dif_pmull8(crc, data, length, buf);
+               kernel_neon_end();
+
+               crc = 0;
+               data = buf;
+               length = sizeof(buf);
+       }
+
+       *crcp = crc_t10dif_generic(crc, data, length);
+       return 0;
+}
+
  static int crct10dif_final(struct shash_desc *desc, u8 *out)
  {
         u16 *crc = shash_desc_ctx(desc);
@@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
         return 0;
  }
  
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg algs[] = {{
+       .digestsize             = CRC_T10DIF_DIGEST_SIZE,
+       .init                   = crct10dif_init,
+       .update                 = crct10dif_update_neon,
+       .final                  = crct10dif_final,
+       .descsize               = CRC_T10DIF_DIGEST_SIZE,
+
+       .base.cra_name          = "crct10dif",
+       .base.cra_driver_name   = "crct10dif-arm-neon",
+       .base.cra_priority      = 150,
+       .base.cra_blocksize     = CRC_T10DIF_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+}, {
         .digestsize             = CRC_T10DIF_DIGEST_SIZE,
         .init                   = crct10dif_init,
         .update                 = crct10dif_update_ce,
@@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
         .base.cra_priority      = 200,
         .base.cra_blocksize     = CRC_T10DIF_BLOCK_SIZE,
         .base.cra_module        = THIS_MODULE,
-};
+}};
  
  static int __init crc_t10dif_mod_init(void)
  {
-       if (!(elf_hwcap2 & HWCAP2_PMULL))
+       if (!(elf_hwcap & HWCAP_NEON))
                 return -ENODEV;
  
-       return crypto_register_shash(&crc_t10dif_alg);
+       return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
  }
  
  static void __exit crc_t10dif_mod_exit(void)
  {
-       crypto_unregister_shash(&crc_t10dif_alg);
+       crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
  }
  
  module_init(crc_t10dif_mod_init);
author	Ard Biesheuvel <ardb@kernel.org>
	Tue, 5 Nov 2024 16:09:06 +0000 (17:09 +0100)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
arch/arm/crypto/crct10dif-ce-core.S		patch | blob | history
arch/arm/crypto/crct10dif-ce-glue.c		patch | blob | history