www.infradead.org Git - users/dwmw2/linux.git/commitdiff
crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply
author Ard Biesheuvel <ardb@kernel.org>
Tue, 5 Nov 2024 16:09:02 +0000 (17:09 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 15 Nov 2024 11:52:51 +0000 (19:52 +0800)
The CRC-T10DIF implementation for arm64 has a version that uses 8x8
polynomial multiplication, for cores that lack the crypto extensions,
which provide the 64x64 polynomial multiplication (PMULL64) instruction
that the algorithm was built around.

This fallback version rather naively adopted the 64x64 polynomial
multiplication algorithm that I ported from ARM for the GHASH driver,
which needs 8 PMULL8 instructions to implement one PMULL64. This is
reasonable, given that each 8-bit vector element needs to be multiplied
with each element in the other vector, producing 8 vectors with partial
results that need to be combined to yield the correct result.
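
As an illustration (a stand-alone user-space C model, not code from this
patch), the decomposition can be checked by rebuilding a 64x64 carry-less
multiply from 8x8 byte products, one partial-product rank per byte offset,
against a direct bitwise reference:

#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

/* Bitwise carry-less (GF(2) polynomial) multiply used as the reference. */
static u128 clmul(u128 a, uint64_t b)
{
	u128 r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		a <<= 1;
		b >>= 1;
	}
	return r;
}

int main(void)
{
	uint64_t a = 0x0123456789abcdefULL;	/* arbitrary test inputs */
	uint64_t b = 0xfedcba9876543210ULL;
	u128 ref = clmul(a, b);			/* what PMULL64 would compute */
	u128 acc = 0;
	int i, j;

	/*
	 * Rebuild the product from 8x8 byte multiplies: byte i of a times
	 * byte j of b lands at bit offset 8 * (i + j), so every byte of a
	 * meets every byte of b.  The NEON fallback groups these 64 byte
	 * products into 8 PMULL8 vector multiplies plus a combining step.
	 */
	for (i = 0; i < 8; i++)
		for (j = 0; j < 8; j++) {
			uint8_t ai = a >> (8 * i);
			uint8_t bj = b >> (8 * j);

			acc ^= clmul(ai, bj) << (8 * (i + j));
		}

	assert(acc == ref);
	return 0;
}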

However, most PMULL64 invocations in the CRC-T10DIF code involve
multiplication by a pair of 16-bit folding coefficients, and so all the
partial results from higher order bytes will be zero, and there is no
need to calculate them to begin with.

Then, the CRC-T10DIF algorithm always XORs the output values of the
PMULL64 instructions being issued in pairs, and so there is no need to
faithfully implement each individual PMULL64 instruction, as long as
XORing the results pairwise produces the expected result.
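
Combining the two observations, a second sketch in the same style (again a
hypothetical user-space model, with made-up coefficient values rather than
the real CRC-T10DIF fold constants) shows that the folding step only needs
the XOR of the paired products, and that only the two low byte rows of each
product are ever non-zero:

#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

/* Bitwise carry-less (GF(2) polynomial) multiply used as the reference. */
static u128 clmul(u128 a, uint64_t b)
{
	u128 r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		a <<= 1;
		b >>= 1;
	}
	return r;
}

int main(void)
{
	uint16_t k0 = 0x2d56, k1 = 0x06df;	/* made-up 16-bit fold coefficients */
	uint64_t x0 = 0x0123456789abcdefULL;	/* halves of a 128-bit data vector */
	uint64_t x1 = 0xfedcba9876543210ULL;
	int i, j;

	/* What the PMULL64 pair followed by an XOR would produce. */
	u128 ref = clmul(x0, k0) ^ clmul(x1, k1);

	/*
	 * The same value from 8x8 byte products: since k0 and k1 only occupy
	 * bytes 0 and 1, rows i >= 2 of each product are zero and are never
	 * computed.  Both products go straight into one accumulator, because
	 * the algorithm never looks at them individually.
	 */
	u128 acc = 0;

	for (i = 0; i < 2; i++)
		for (j = 0; j < 8; j++) {
			acc ^= clmul((uint8_t)(x0 >> (8 * j)),
				     (uint8_t)(k0 >> (8 * i))) << (8 * (i + j));
			acc ^= clmul((uint8_t)(x1 >> (8 * j)),
				     (uint8_t)(k1 >> (8 * i))) << (8 * (i + j));
		}

	assert(acc == ref);
	return 0;
}

This is the freedom the new pmull16x64_p8 macro exploits: the two outputs it
produces are not the individual PMULL64 results, but their XOR is the same.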

Implementing these improvements results in a speedup of 3.3x on low-end
platforms such as the Raspberry Pi 4 (Cortex-A72).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/crct10dif-ce-core.S

index 5604de61d06d04eeac8fb616859098b53a8cedd8..d2acaa2b5a01f00962643be867cb9a83909d71e1 100644
@@ -1,8 +1,11 @@
 //
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+//          Eric Biggers <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             perm4.2d, perm1.2d, #40
+
+       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+       movi            bd1.4h, #8, lsl #8
+       orr             bd1.2s, #1, lsl #16
+       orr             bd1.2s, #1, lsl #24
+       zip1            bd1.16b, bd1.16b, bd1.16b
+       zip1            bd1.16b, bd1.16b, bd1.16b
        .endm
 
        .macro          __pmull_pre_p8, bd
@@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
        ret
 SYM_FUNC_END(__pmull_p8_core)
 
+       .macro          pmull16x64_p64, a16, b64, c64
+       pmull2          \c64\().1q, \a16\().2d, \b64\().2d
+       pmull           \b64\().1q, \a16\().1d, \b64\().1d
+       .endm
+
+       /*
+        * Pairwise long polynomial multiplication of two 16-bit values
+        *
+        *   { w0, w1 }, { y0, y1 }
+        *
+        * by two 64-bit values
+        *
+        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+        *
+        * where each vector element is a byte, ordered from least to most
+        * significant.
+        *
+        * This can be implemented using 8x8 long polynomial multiplication, by
+        * reorganizing the input so that each pairwise 8x8 multiplication
+        * produces one of the terms from the decomposition below, and
+        * combining the results of each rank and shifting them into place.
+        *
+        * Rank
+        *  0            w0*x0 ^              |        y0*z0 ^
+        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+        *  8            w1*x7      << 64     |        y1*z7      << 64
+        *
+        * The inputs can be reorganized into
+        *
+        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+        *
+        * and after performing 8x8->16 bit long polynomial multiplication of
+        * each of the halves of the first vector with those of the second one,
+        * we obtain the following four vectors of 16-bit elements:
+        *
+        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+        *
+        * Results b and c can be XORed together, as the vector elements have
+        * matching ranks. Then, the final XOR (*) can be pulled forward, and
+        * applied between the halves of each of the remaining three vectors,
+        * which are then shifted into place, and combined to produce two
+        * 80-bit results.
+        *
+        * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+        * to the 64x64 bit one above, but XOR'ing the outputs together will
+        * produce the expected result, and this is sufficient in the context of
+        * this algorithm.
+        */
+       .macro          pmull16x64_p8, a16, b64, c64
+       ext             t7.16b, \b64\().16b, \b64\().16b, #1
+       tbl             t5.16b, {\a16\().16b}, bd1.16b
+       uzp1            t7.16b, \b64\().16b, t7.16b
+       bl              __pmull_p8_16x64
+       ext             \b64\().16b, t4.16b, t4.16b, #15
+       eor             \c64\().16b, t8.16b, t5.16b
+       .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+       ext             t6.16b, t5.16b, t5.16b, #8
+
+       pmull           t3.8h, t7.8b, t5.8b
+       pmull           t4.8h, t7.8b, t6.8b
+       pmull2          t5.8h, t7.16b, t5.16b
+       pmull2          t6.8h, t7.16b, t6.16b
+
+       ext             t8.16b, t3.16b, t3.16b, #8
+       eor             t4.16b, t4.16b, t6.16b
+       ext             t7.16b, t5.16b, t5.16b, #8
+       ext             t6.16b, t4.16b, t4.16b, #8
+       eor             t8.8b, t8.8b, t3.8b
+       eor             t5.8b, t5.8b, t7.8b
+       eor             t4.8b, t4.8b, t6.8b
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
        .macro          __pmull_p8, rq, ad, bd, i
        .ifnc           \bd, fold_consts
        .err
@@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core)
        .macro          fold_32_bytes, p, reg1, reg2
        ldp             q11, q12, [buf], #0x20
 
-       __pmull_\p      v8, \reg1, fold_consts, 2
-       __pmull_\p      \reg1, \reg1, fold_consts
+       pmull16x64_\p   fold_consts, \reg1, v8
 
 CPU_LE(        rev64           v11.16b, v11.16b                )
 CPU_LE(        rev64           v12.16b, v12.16b                )
 
-       __pmull_\p      v9, \reg2, fold_consts, 2
-       __pmull_\p      \reg2, \reg2, fold_consts
+       pmull16x64_\p   fold_consts, \reg2, v9
 
 CPU_LE(        ext             v11.16b, v11.16b, v11.16b, #8   )
 CPU_LE(        ext             v12.16b, v12.16b, v12.16b, #8   )
@@ -238,11 +332,9 @@ CPU_LE(    ext             v12.16b, v12.16b, v12.16b, #8   )
 
        // Fold src_reg into dst_reg, optionally loading the next fold constants
        .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-       __pmull_\p      v8, \src_reg, fold_consts
-       __pmull_\p      \src_reg, \src_reg, fold_consts, 2
+       pmull16x64_\p   fold_consts, \src_reg, v8
        .ifnb           \load_next_consts
        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
        .endif
        eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
        eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
@@ -296,7 +388,6 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
 
        // Load the constants for folding across 128 bytes.
        ld1             {fold_consts.2d}, [fold_consts_ptr]
-       __pmull_pre_\p  fold_consts
 
        // Subtract 128 for the 128 data bytes just consumed.  Subtract another
        // 128 to simplify the termination condition of the following loop.
@@ -318,7 +409,6 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        // Fold across 64 bytes.
        add             fold_consts_ptr, fold_consts_ptr, #16
        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
        fold_16_bytes   \p, v0, v4
        fold_16_bytes   \p, v1, v5
        fold_16_bytes   \p, v2, v6
@@ -339,8 +429,7 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        // into them, storing the result back into v7.
        b.lt            .Lfold_16_bytes_loop_done_\@
 .Lfold_16_bytes_loop_\@:
-       __pmull_\p      v8, v7, fold_consts
-       __pmull_\p      v7, v7, fold_consts, 2
+       pmull16x64_\p   fold_consts, v7, v8
        eor             v7.16b, v7.16b, v8.16b
        ldr             q0, [buf], #16
 CPU_LE(        rev64           v0.16b, v0.16b                  )
@@ -387,9 +476,8 @@ CPU_LE(     ext             v0.16b, v0.16b, v0.16b, #8      )
        bsl             v2.16b, v1.16b, v0.16b
 
        // Fold the first chunk into the second chunk, storing the result in v7.
-       __pmull_\p      v0, v3, fold_consts
-       __pmull_\p      v7, v3, fold_consts, 2
-       eor             v7.16b, v7.16b, v0.16b
+       pmull16x64_\p   fold_consts, v3, v0
+       eor             v7.16b, v3.16b, v0.16b
        eor             v7.16b, v7.16b, v2.16b
 
 .Lreduce_final_16_bytes_\@:
@@ -450,7 +538,6 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
 
        // Load the fold-across-16-bytes constants.
        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       __pmull_pre_\p  fold_consts
 
        cmp             len, #16
        b.eq            .Lreduce_final_16_bytes_\@      // len == 16