From a10549fcce2913be7dc581562ffd8ea35653853e Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Thu, 31 Oct 2024 19:27:55 +0800 Subject: [PATCH 01/16] crypto: inside-secure - Fix the return value of safexcel_xcbcmac_cra_init() The commit 320406cb60b6 ("crypto: inside-secure - Replace generic aes with libaes") replaced crypto_alloc_cipher() with kmalloc(), but did not modify the handling of the return value. When kmalloc() returns NULL, PTR_ERR_OR_ZERO(NULL) returns 0, but in fact, the memory allocation has failed, and -ENOMEM should be returned. Fixes: 320406cb60b6 ("crypto: inside-secure - Replace generic aes with libaes") Signed-off-by: Li Huafei Acked-by: Antoine Tenart Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index e17577b785c3..f44c08f5f5ec 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -2093,7 +2093,7 @@ static int safexcel_xcbcmac_cra_init(struct crypto_tfm *tfm) safexcel_ahash_cra_init(tfm); ctx->aes = kmalloc(sizeof(*ctx->aes), GFP_KERNEL); - return PTR_ERR_OR_ZERO(ctx->aes); + return ctx->aes == NULL ? -ENOMEM : 0; } static void safexcel_xcbcmac_cra_exit(struct crypto_tfm *tfm) -- 2.51.0 From e45f0ab6ee48531f8bd4cae94a498893a983a5e1 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Fri, 1 Nov 2024 09:13:24 +0000 Subject: [PATCH 02/16] padata: Clean up in padata_do_multithreaded() In commit 24cc57d8faaa ("padata: Honor the caller's alignment in case of chunk_size 0"), the line 'ps.chunk_size = max(ps.chunk_size, 1ul)' was added, making 'ps.chunk_size = 1U' redundant and never executed. Signed-off-by: Zicheng Qu Acked-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/padata.c b/kernel/padata.c index d899f34558af..d51bbc76b227 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -521,13 +521,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, 1ul); ps.chunk_size = roundup(ps.chunk_size, job->align); - /* - * chunk_size can be 0 if the caller sets min_chunk to 0. So force it - * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` - */ - if (!ps.chunk_size) - ps.chunk_size = 1U; - list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); -- 2.51.0 From c0559d24560d04a808e4c7838e38a8d556679843 Mon Sep 17 00:00:00 2001 From: Markus Mayer Date: Fri, 1 Nov 2024 14:13:14 -0700 Subject: [PATCH 03/16] dt-bindings: rng: add binding for BCM74110 RNG Add a binding for the random number generator used on the BCM74110. 
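Stepping back to the first patch in this series (safexcel_xcbcmac_cra_init): the pitfall there is that PTR_ERR_OR_ZERO() only recognises ERR_PTR()-encoded pointers, while kmalloc() signals failure with plain NULL, so the failure is silently mapped to 0. A minimal sketch of the broken and fixed return paths follows; example_ctx and the two function names are illustrative stand-ins rather than the driver's real code, and only the shape of the check matches the safexcel fix.

#include <crypto/aes.h>
#include <linux/err.h>
#include <linux/slab.h>

struct example_ctx {				/* stand-in for the driver context */
	struct crypto_aes_ctx *aes;
};

static int example_cra_init_buggy(struct example_ctx *ctx)
{
	ctx->aes = kmalloc(sizeof(*ctx->aes), GFP_KERNEL);
	/* kmalloc() returns NULL on failure, never an ERR_PTR() value,
	 * so PTR_ERR_OR_ZERO(NULL) evaluates to 0 and the failure is lost */
	return PTR_ERR_OR_ZERO(ctx->aes);
}

static int example_cra_init_fixed(struct example_ctx *ctx)
{
	ctx->aes = kmalloc(sizeof(*ctx->aes), GFP_KERNEL);
	/* a NULL result must be reported as -ENOMEM explicitly */
	return ctx->aes == NULL ? -ENOMEM : 0;
}

The same consideration applies to any allocator that reports failure with NULL rather than with an ERR_PTR() value.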
Signed-off-by: Markus Mayer Reviewed-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu --- .../bindings/rng/brcm,bcm74110-rng.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml diff --git a/Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml b/Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml new file mode 100644 index 000000000000..8e89d4a70b53 --- /dev/null +++ b/Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rng/brcm,bcm74110-rng.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: BCM74110 Random number generator + +description: + Random number generator used on the BCM74110. + +maintainers: + - Markus Mayer + - Florian Fainelli + +properties: + compatible: + enum: + - brcm,bcm74110-rng + + reg: + maxItems: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + rng@83ba000 { + compatible = "brcm,bcm74110-rng"; + reg = <0x83ba000 0x14>; + }; -- 2.51.0 From 35b2237f27c33c9d84733e03d84d79b6a6062715 Mon Sep 17 00:00:00 2001 From: Markus Mayer Date: Fri, 1 Nov 2024 14:13:15 -0700 Subject: [PATCH 04/16] hwrng: bcm74110 - Add Broadcom BCM74110 RNG driver Add a driver for the random number generator present on the Broadcom BCM74110 SoC. Signed-off-by: Markus Mayer Signed-off-by: Herbert Xu --- drivers/char/hw_random/Kconfig | 13 +++ drivers/char/hw_random/Makefile | 1 + drivers/char/hw_random/bcm74110-rng.c | 125 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 drivers/char/hw_random/bcm74110-rng.c diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index bda283f290bc..bf9b2354766a 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -112,6 +112,19 @@ config HW_RANDOM_BCM2835 If unsure, say Y. +config HW_RANDOM_BCM74110 + tristate "Broadcom BCM74110 Random Number Generator support" + depends on ARCH_BRCMSTB || COMPILE_TEST + default HW_RANDOM + help + This driver provides kernel-side support for the Random Number + Generator hardware found on the Broadcom BCM74110 SoCs. + + To compile this driver as a module, choose M here: the + module will be called bcm74110-rng + + If unsure, say Y. 
+ config HW_RANDOM_IPROC_RNG200 tristate "Broadcom iProc/STB RNG200 support" depends on ARCH_BCM_IPROC || ARCH_BCM2835 || ARCH_BCMBCA || ARCH_BRCMSTB || COMPILE_TEST diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index dfb717b12f0b..b9132b3f5d21 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_HW_RANDOM_POWERNV) += powernv-rng.o obj-$(CONFIG_HW_RANDOM_HISI) += hisi-rng.o obj-$(CONFIG_HW_RANDOM_HISTB) += histb-rng.o obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o +obj-$(CONFIG_HW_RANDOM_BCM74110) += bcm74110-rng.o obj-$(CONFIG_HW_RANDOM_IPROC_RNG200) += iproc-rng200.o obj-$(CONFIG_HW_RANDOM_ST) += st-rng.o obj-$(CONFIG_HW_RANDOM_XGENE) += xgene-rng.o diff --git a/drivers/char/hw_random/bcm74110-rng.c b/drivers/char/hw_random/bcm74110-rng.c new file mode 100644 index 000000000000..5c64148e91f1 --- /dev/null +++ b/drivers/char/hw_random/bcm74110-rng.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Broadcom + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define HOST_REV_ID 0x00 +#define HOST_FIFO_DEPTH 0x04 +#define HOST_FIFO_COUNT 0x08 +#define HOST_FIFO_THRESHOLD 0x0c +#define HOST_FIFO_DATA 0x10 + +#define HOST_FIFO_COUNT_MASK 0xffff + +/* Delay range in microseconds */ +#define FIFO_DELAY_MIN_US 3 +#define FIFO_DELAY_MAX_US 7 +#define FIFO_DELAY_MAX_COUNT 10 + +struct bcm74110_priv { + void __iomem *base; +}; + +static inline int bcm74110_rng_fifo_count(void __iomem *mem) +{ + return readl_relaxed(mem) & HOST_FIFO_COUNT_MASK; +} + +static int bcm74110_rng_read(struct hwrng *rng, void *buf, size_t max, + bool wait) +{ + struct bcm74110_priv *priv = (struct bcm74110_priv *)rng->priv; + void __iomem *fc_addr = priv->base + HOST_FIFO_COUNT; + void __iomem *fd_addr = priv->base + HOST_FIFO_DATA; + unsigned underrun_count = 0; + u32 max_words = max / sizeof(u32); + u32 num_words; + unsigned i; + + /* + * We need to check how many words are available in the RNG FIFO. If + * there aren't any, we need to wait for some to become available. + */ + while ((num_words = bcm74110_rng_fifo_count(fc_addr)) == 0) { + if (!wait) + return 0; + /* + * As a precaution, limit how long we wait. If the FIFO doesn't + * refill within the allotted time, return 0 (=no data) to the + * caller. 
+ */ + if (likely(underrun_count < FIFO_DELAY_MAX_COUNT)) + usleep_range(FIFO_DELAY_MIN_US, FIFO_DELAY_MAX_US); + else + return 0; + underrun_count++; + } + if (num_words > max_words) + num_words = max_words; + + /* Bail early if we run out of random numbers unexpectedly */ + for (i = 0; i < num_words && bcm74110_rng_fifo_count(fc_addr) > 0; i++) + ((u32 *)buf)[i] = readl_relaxed(fd_addr); + + return i * sizeof(u32); +} + +static struct hwrng bcm74110_hwrng = { + .read = bcm74110_rng_read, +}; + +static int bcm74110_rng_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct bcm74110_priv *priv; + int rc; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + bcm74110_hwrng.name = pdev->name; + bcm74110_hwrng.priv = (unsigned long)priv; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + rc = devm_hwrng_register(dev, &bcm74110_hwrng); + if (rc) + dev_err(dev, "hwrng registration failed (%d)\n", rc); + else + dev_info(dev, "hwrng registered\n"); + + return rc; +} + +static const struct of_device_id bcm74110_rng_match[] = { + { .compatible = "brcm,bcm74110-rng", }, + {}, +}; +MODULE_DEVICE_TABLE(of, bcm74110_rng_match); + +static struct platform_driver bcm74110_rng_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = bcm74110_rng_match, + }, + .probe = bcm74110_rng_probe, +}; +module_platform_driver(bcm74110_rng_driver); + +MODULE_AUTHOR("Markus Mayer "); +MODULE_DESCRIPTION("BCM 74110 Random Number Generator (RNG) driver"); +MODULE_LICENSE("GPL v2"); -- 2.51.0 From b64140c74e954f1db6eae5548ca3a1f41b6fad79 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 4 Nov 2024 12:15:11 +0000 Subject: [PATCH 05/16] crypto: caam - add error check to caam_rsa_set_priv_key_form MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The caam_rsa_set_priv_key_form did not check for memory allocation errors. Add the checks to the caam_rsa_set_priv_key_form functions. 
Fixes: 52e26d77b8b3 ("crypto: caam - add support for RSA key form 2") Signed-off-by: Chen Ridong Reviewed-by: Gaurav Jain Reviewed-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/caampkc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c index 887a5f2fb927..cb001aa1de66 100644 --- a/drivers/crypto/caam/caampkc.c +++ b/drivers/crypto/caam/caampkc.c @@ -984,7 +984,7 @@ err: return -ENOMEM; } -static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx, +static int caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx, struct rsa_key *raw_key) { struct caam_rsa_key *rsa_key = &ctx->key; @@ -994,7 +994,7 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx, rsa_key->p = caam_read_raw_data(raw_key->p, &p_sz); if (!rsa_key->p) - return; + return -ENOMEM; rsa_key->p_sz = p_sz; rsa_key->q = caam_read_raw_data(raw_key->q, &q_sz); @@ -1029,7 +1029,7 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx, rsa_key->priv_form = FORM3; - return; + return 0; free_dq: kfree_sensitive(rsa_key->dq); @@ -1043,6 +1043,7 @@ free_q: kfree_sensitive(rsa_key->q); free_p: kfree_sensitive(rsa_key->p); + return -ENOMEM; } static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, @@ -1088,7 +1089,9 @@ static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, rsa_key->e_sz = raw_key.e_sz; rsa_key->n_sz = raw_key.n_sz; - caam_rsa_set_priv_key_form(ctx, &raw_key); + ret = caam_rsa_set_priv_key_form(ctx, &raw_key); + if (ret) + goto err; return 0; -- 2.51.0 From 19630cf57233e845b6ac57c9c969a4888925467b Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 4 Nov 2024 12:17:45 +0000 Subject: [PATCH 06/16] crypto: bcm - add error check in the ahash_hmac_init function The ahash_init functions may return fails. The ahash_hmac_init should not return ok when ahash_init returns error. For an example, ahash_init will return -ENOMEM when allocation memory is error. Fixes: 9d12ba86f818 ("crypto: brcm - Add Broadcom SPU driver") Signed-off-by: Chen Ridong Signed-off-by: Herbert Xu --- drivers/crypto/bcm/cipher.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 7540eb7cd331..9e6798efbfb7 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -2415,6 +2415,7 @@ static int ahash_hmac_setkey(struct crypto_ahash *ahash, const u8 *key, static int ahash_hmac_init(struct ahash_request *req) { + int ret; struct iproc_reqctx_s *rctx = ahash_request_ctx(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm); @@ -2424,7 +2425,9 @@ static int ahash_hmac_init(struct ahash_request *req) flow_log("ahash_hmac_init()\n"); /* init the context as a hash */ - ahash_init(req); + ret = ahash_init(req); + if (ret) + return ret; if (!spu_no_incr_hash(ctx)) { /* SPU-M can do incr hashing but needs sw for outer HMAC */ -- 2.51.0 From 7048c21e6b50e4dec0de1ed48b12db50b94b3f57 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:01 +0100 Subject: [PATCH 07/16] crypto: arm64/crct10dif - Remove obsolete chunking logic This is a partial revert of commit fc754c024a343b, which moved the logic into C code which ensures that kernel mode NEON code does not hog the CPU for too long. This is no longer needed now that kernel mode NEON no longer disables preemption, so we can drop this. 
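With the chunking gone, the p64 update path reduces to bracketing a single call with kernel_neon_begin()/kernel_neon_end(). A sketch of the resulting shape is below; it mirrors the post-patch hunk, with example_update() as an illustrative wrapper name rather than the actual shash callback.

#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/simd.h>
#include <linux/crc-t10dif.h>
#include <linux/linkage.h>
#include <linux/types.h>

#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U

/* PMULL-accelerated folding routine implemented in crct10dif-ce-core.S */
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);

/*
 * With preemptible kernel-mode NEON there is no need to carve the input
 * into 4 KiB chunks any more; the whole buffer is handled in one call.
 */
static u16 example_update(u16 crc, const u8 *data, unsigned int length)
{
	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
		kernel_neon_begin();
		crc = crc_t10dif_pmull_p64(crc, data, length);
		kernel_neon_end();
	} else {
		crc = crc_t10dif_generic(crc, data, length);
	}
	return crc;
}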
Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-glue.c | 30 ++++++--------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 606d25c559ed..7b05094a0480 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -37,18 +37,9 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p8(*crc, data, length); + kernel_neon_end(); } else { *crc = crc_t10dif_generic(*crc, data, length); } @@ -62,18 +53,9 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p64(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p64(*crc, data, length); + kernel_neon_end(); } else { *crc = crc_t10dif_generic(*crc, data, length); } -- 2.51.0 From 67dfb1b73f423622a0096ea43fb1f5b7336f49e0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:02 +0100 Subject: [PATCH 08/16] crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply The CRC-T10DIF implementation for arm64 has a version that uses 8x8 polynomial multiplication, for cores that lack the crypto extensions, which cover the 64x64 polynomial multiplication instruction that the algorithm was built around. This fallback version rather naively adopted the 64x64 polynomial multiplication algorithm that I ported from ARM for the GHASH driver, which needs 8 PMULL8 instructions to implement one PMULL64. This is reasonable, given that each 8-bit vector element needs to be multiplied with each element in the other vector, producing 8 vectors with partial results that need to be combined to yield the correct result. However, most PMULL64 invocations in the CRC-T10DIF code involve multiplication by a pair of 16-bit folding coefficients, and so all the partial results from higher order bytes will be zero, and there is no need to calculate them to begin with. Then, the CRC-T10DIF algorithm always XORs the output values of the PMULL64 instructions being issued in pairs, and so there is no need to faithfully implement each individual PMULL64 instruction, as long as XORing the results pairwise produces the expected result. 
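The decomposition described above is easy to model in plain C. The following user-space sketch (purely illustrative, not part of the patch) builds a 16x64-bit carryless product out of 8x8-bit carryless multiplies, one pair of partial products per rank, and checks the result against a bit-serial reference:

#include <stdint.h>
#include <stdio.h>

/* 8x8 -> 16 bit carryless multiply, i.e. what a single PMULL8 lane computes */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;
	int i;

	for (i = 0; i < 8; i++)
		if (a & (1u << i))
			r ^= (uint16_t)b << i;
	return r;
}

/* XOR a 16-bit partial product into an 80-bit accumulator at a byte-aligned
 * bit position (0, 8, ..., 64); acc[0] holds the low 64 bits */
static void xor_at(uint64_t acc[2], uint16_t v, unsigned int bitpos)
{
	if (bitpos < 64)
		acc[0] ^= (uint64_t)v << bitpos;
	if (bitpos + 16 > 64)
		acc[1] ^= (uint64_t)v >> (64 - bitpos);
}

/* 16x64 -> 80 bit carryless multiply via the rank decomposition above */
static void clmul16x64(uint16_t w, uint64_t x, uint64_t res[2])
{
	uint8_t w0 = w & 0xff, w1 = w >> 8;
	unsigned int i;

	res[0] = res[1] = 0;
	for (i = 0; i < 8; i++) {
		uint8_t xi = x >> (8 * i);

		xor_at(res, clmul8(w0, xi), 8 * i);	  /* rank i     */
		xor_at(res, clmul8(w1, xi), 8 * (i + 1)); /* rank i + 1 */
	}
}

/* bit-serial reference implementation for comparison */
static void clmul16x64_ref(uint16_t w, uint64_t x, uint64_t res[2])
{
	int i;

	res[0] = res[1] = 0;
	for (i = 0; i < 16; i++)
		if (w & (1u << i)) {
			res[0] ^= x << i;
			if (i)
				res[1] ^= x >> (64 - i);
		}
}

int main(void)
{
	uint16_t w = 0x8bb7;			/* arbitrary 16-bit coefficient */
	uint64_t x = 0x0123456789abcdefULL;	/* arbitrary 64-bit operand */
	uint64_t a[2], b[2];

	clmul16x64(w, x, a);
	clmul16x64_ref(w, x, b);
	printf("%016llx%016llx vs %016llx%016llx\n",
	       (unsigned long long)a[1], (unsigned long long)a[0],
	       (unsigned long long)b[1], (unsigned long long)b[0]);
	return a[0] != b[0] || a[1] != b[1];
}

The NEON macro implements the same decomposition with tbl/uzp1 reshuffling and four pmull instructions, with the additional liberty noted above that only the XOR of its two output registers has to be correct.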
Implementing these improvements results in a speedup of 3.3x on low-end platforms such as Raspberry Pi 4 (Cortex-A72) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-core.S | 121 ++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 17 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 5604de61d06d..d2acaa2b5a01 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -1,8 +1,11 @@ // // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions // -// Copyright (C) 2016 Linaro Ltd -// Copyright (C) 2019 Google LLC +// Copyright (C) 2016 Linaro Ltd +// Copyright (C) 2019-2024 Google LLC +// +// Authors: Ard Biesheuvel +// Eric Biggers // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License version 2 as @@ -122,6 +125,13 @@ sli perm2.2d, perm1.2d, #56 sli perm3.2d, perm1.2d, #48 sli perm4.2d, perm1.2d, #40 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi bd1.4h, #8, lsl #8 + orr bd1.2s, #1, lsl #16 + orr bd1.2s, #1, lsl #24 + zip1 bd1.16b, bd1.16b, bd1.16b + zip1 bd1.16b, bd1.16b, bd1.16b .endm .macro __pmull_pre_p8, bd @@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core) ret SYM_FUNC_END(__pmull_p8_core) + .macro pmull16x64_p64, a16, b64, c64 + pmull2 \c64\().1q, \a16\().2d, \b64\().2d + pmull \b64\().1q, \a16\().1d, \b64\().1d + .endm + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. + * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR (*) can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and combined to produce two + * 80-bit results. 
+ * + * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent + * to the 64x64 bit one above, but XOR'ing the outputs together will + * produce the expected result, and this is sufficient in the context of + * this algorithm. + */ + .macro pmull16x64_p8, a16, b64, c64 + ext t7.16b, \b64\().16b, \b64\().16b, #1 + tbl t5.16b, {\a16\().16b}, bd1.16b + uzp1 t7.16b, \b64\().16b, t7.16b + bl __pmull_p8_16x64 + ext \b64\().16b, t4.16b, t4.16b, #15 + eor \c64\().16b, t8.16b, t5.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_16x64) + ext t6.16b, t5.16b, t5.16b, #8 + + pmull t3.8h, t7.8b, t5.8b + pmull t4.8h, t7.8b, t6.8b + pmull2 t5.8h, t7.16b, t5.16b + pmull2 t6.8h, t7.16b, t6.16b + + ext t8.16b, t3.16b, t3.16b, #8 + eor t4.16b, t4.16b, t6.16b + ext t7.16b, t5.16b, t5.16b, #8 + ext t6.16b, t4.16b, t4.16b, #8 + eor t8.8b, t8.8b, t3.8b + eor t5.8b, t5.8b, t7.8b + eor t4.8b, t4.8b, t6.8b + ext t5.16b, t5.16b, t5.16b, #14 + ret +SYM_FUNC_END(__pmull_p8_16x64) + .macro __pmull_p8, rq, ad, bd, i .ifnc \bd, fold_consts .err @@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core) .macro fold_32_bytes, p, reg1, reg2 ldp q11, q12, [buf], #0x20 - __pmull_\p v8, \reg1, fold_consts, 2 - __pmull_\p \reg1, \reg1, fold_consts + pmull16x64_\p fold_consts, \reg1, v8 CPU_LE( rev64 v11.16b, v11.16b ) CPU_LE( rev64 v12.16b, v12.16b ) - __pmull_\p v9, \reg2, fold_consts, 2 - __pmull_\p \reg2, \reg2, fold_consts + pmull16x64_\p fold_consts, \reg2, v9 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) @@ -238,11 +332,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) // Fold src_reg into dst_reg, optionally loading the next fold constants .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts - __pmull_\p v8, \src_reg, fold_consts - __pmull_\p \src_reg, \src_reg, fold_consts, 2 + pmull16x64_\p fold_consts, \src_reg, v8 .ifnb \load_next_consts ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts .endif eor \dst_reg\().16b, \dst_reg\().16b, v8.16b eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b @@ -296,7 +388,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Load the constants for folding across 128 bytes. ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts // Subtract 128 for the 128 data bytes just consumed. Subtract another // 128 to simplify the termination condition of the following loop. @@ -318,7 +409,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Fold across 64 bytes. add fold_consts_ptr, fold_consts_ptr, #16 ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts fold_16_bytes \p, v0, v4 fold_16_bytes \p, v1, v5 fold_16_bytes \p, v2, v6 @@ -339,8 +429,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // into them, storing the result back into v7. b.lt .Lfold_16_bytes_loop_done_\@ .Lfold_16_bytes_loop_\@: - __pmull_\p v8, v7, fold_consts - __pmull_\p v7, v7, fold_consts, 2 + pmull16x64_\p fold_consts, v7, v8 eor v7.16b, v7.16b, v8.16b ldr q0, [buf], #16 CPU_LE( rev64 v0.16b, v0.16b ) @@ -387,9 +476,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) bsl v2.16b, v1.16b, v0.16b // Fold the first chunk into the second chunk, storing the result in v7. - __pmull_\p v0, v3, fold_consts - __pmull_\p v7, v3, fold_consts, 2 - eor v7.16b, v7.16b, v0.16b + pmull16x64_\p fold_consts, v3, v0 + eor v7.16b, v3.16b, v0.16b eor v7.16b, v7.16b, v2.16b .Lreduce_final_16_bytes_\@: @@ -450,7 +538,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Load the fold-across-16-bytes constants. 
ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts cmp len, #16 b.eq .Lreduce_final_16_bytes_\@ // len == 16 -- 2.51.0 From 779cee8209c67aae195a81c3a72bac9e127fdaee Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:03 +0100 Subject: [PATCH 09/16] crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code The only remaining user of the fallback implementation of 64x64 polynomial multiplication using 8x8 PMULL instructions is the final reduction from a 16 byte vector to a 16-bit CRC. The fallback code is complicated and messy, and this reduction has little impact on the overall performance, so instead, let's calculate the final CRC by passing the 16 byte vector to the generic CRC-T10DIF implementation when running the fallback version. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-core.S | 244 ++++++-------------------- arch/arm64/crypto/crct10dif-ce-glue.c | 18 +- 2 files changed, 68 insertions(+), 194 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index d2acaa2b5a01..87dd6d46224d 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -74,137 +74,18 @@ init_crc .req w0 buf .req x1 len .req x2 - fold_consts_ptr .req x3 + fold_consts_ptr .req x5 fold_consts .req v10 - ad .req v14 - - k00_16 .req v15 - k32_48 .req v16 - t3 .req v17 t4 .req v18 t5 .req v19 t6 .req v20 t7 .req v21 t8 .req v22 - t9 .req v23 - - perm1 .req v24 - perm2 .req v25 - perm3 .req v26 - perm4 .req v27 - - bd1 .req v28 - bd2 .req v29 - bd3 .req v30 - bd4 .req v31 - - .macro __pmull_init_p64 - .endm - .macro __pmull_pre_p64, bd - .endm - - .macro __pmull_init_p8 - // k00_16 := 0x0000000000000000_000000000000ffff - // k32_48 := 0x00000000ffffffff_0000ffffffffffff - movi k32_48.2d, #0xffffffff - mov k32_48.h[2], k32_48.h[0] - ushr k00_16.2d, k32_48.2d, #32 - - // prepare the permutation vectors - mov_q x5, 0x080f0e0d0c0b0a09 - movi perm4.8b, #8 - dup perm1.2d, x5 - eor perm1.16b, perm1.16b, perm4.16b - ushr perm2.2d, perm1.2d, #8 - ushr perm3.2d, perm1.2d, #16 - ushr perm4.2d, perm1.2d, #24 - sli perm2.2d, perm1.2d, #56 - sli perm3.2d, perm1.2d, #48 - sli perm4.2d, perm1.2d, #40 - - // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } - movi bd1.4h, #8, lsl #8 - orr bd1.2s, #1, lsl #16 - orr bd1.2s, #1, lsl #24 - zip1 bd1.16b, bd1.16b, bd1.16b - zip1 bd1.16b, bd1.16b, bd1.16b - .endm - - .macro __pmull_pre_p8, bd - tbl bd1.16b, {\bd\().16b}, perm1.16b - tbl bd2.16b, {\bd\().16b}, perm2.16b - tbl bd3.16b, {\bd\().16b}, perm3.16b - tbl bd4.16b, {\bd\().16b}, perm4.16b - .endm - -SYM_FUNC_START_LOCAL(__pmull_p8_core) -.L__pmull_p8_core: - ext t4.8b, ad.8b, ad.8b, #1 // A1 - ext t5.8b, ad.8b, ad.8b, #2 // A2 - ext t6.8b, ad.8b, ad.8b, #3 // A3 - - pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B - pmull t8.8h, ad.8b, bd1.8b // E = A*B1 - pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B - pmull t7.8h, ad.8b, bd2.8b // G = A*B2 - pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B - pmull t9.8h, ad.8b, bd3.8b // I = A*B3 - pmull t3.8h, ad.8b, bd4.8b // K = A*B4 - b 0f - -.L__pmull_p8_core2: - tbl t4.16b, {ad.16b}, perm1.16b // A1 - tbl t5.16b, {ad.16b}, perm2.16b // A2 - tbl t6.16b, {ad.16b}, perm3.16b // A3 - - pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B - pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1 - pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B - pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2 - pmull2 t6.8h, t6.16b, fold_consts.16b // J = 
A3*B - pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3 - pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4 - -0: eor t4.16b, t4.16b, t8.16b // L = E + F - eor t5.16b, t5.16b, t7.16b // M = G + H - eor t6.16b, t6.16b, t9.16b // N = I + J - - uzp1 t8.2d, t4.2d, t5.2d - uzp2 t4.2d, t4.2d, t5.2d - uzp1 t7.2d, t6.2d, t3.2d - uzp2 t6.2d, t6.2d, t3.2d - - // t4 = (L) (P0 + P1) << 8 - // t5 = (M) (P2 + P3) << 16 - eor t8.16b, t8.16b, t4.16b - and t4.16b, t4.16b, k32_48.16b - - // t6 = (N) (P4 + P5) << 24 - // t7 = (K) (P6 + P7) << 32 - eor t7.16b, t7.16b, t6.16b - and t6.16b, t6.16b, k00_16.16b - - eor t8.16b, t8.16b, t4.16b - eor t7.16b, t7.16b, t6.16b - - zip2 t5.2d, t8.2d, t4.2d - zip1 t4.2d, t8.2d, t4.2d - zip2 t3.2d, t7.2d, t6.2d - zip1 t6.2d, t7.2d, t6.2d - - ext t4.16b, t4.16b, t4.16b, #15 - ext t5.16b, t5.16b, t5.16b, #14 - ext t6.16b, t6.16b, t6.16b, #13 - ext t3.16b, t3.16b, t3.16b, #12 - - eor t4.16b, t4.16b, t5.16b - eor t6.16b, t6.16b, t3.16b - ret -SYM_FUNC_END(__pmull_p8_core) + perm .req v27 .macro pmull16x64_p64, a16, b64, c64 pmull2 \c64\().1q, \a16\().2d, \b64\().2d @@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core) */ .macro pmull16x64_p8, a16, b64, c64 ext t7.16b, \b64\().16b, \b64\().16b, #1 - tbl t5.16b, {\a16\().16b}, bd1.16b + tbl t5.16b, {\a16\().16b}, perm.16b uzp1 t7.16b, \b64\().16b, t7.16b bl __pmull_p8_16x64 ext \b64\().16b, t4.16b, t4.16b, #15 @@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64) ret SYM_FUNC_END(__pmull_p8_16x64) - .macro __pmull_p8, rq, ad, bd, i - .ifnc \bd, fold_consts - .err - .endif - mov ad.16b, \ad\().16b - .ifb \i - pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B - .else - pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B - .endif - - bl .L__pmull_p8_core\i - - eor \rq\().16b, \rq\().16b, t4.16b - eor \rq\().16b, \rq\().16b, t6.16b - .endm // Fold reg1, reg2 into the next 32 data bytes, storing the result back // into reg1, reg2. @@ -340,16 +205,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b .endm - .macro __pmull_p64, rd, rn, rm, n - .ifb \n - pmull \rd\().1q, \rn\().1d, \rm\().1d - .else - pmull2 \rd\().1q, \rn\().2d, \rm\().2d - .endif - .endm - .macro crc_t10dif_pmull, p - __pmull_init_\p // For sizes less than 256 bytes, we can't fold 128 bytes at a time. cmp len, #256 @@ -479,47 +335,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) pmull16x64_\p fold_consts, v3, v0 eor v7.16b, v3.16b, v0.16b eor v7.16b, v7.16b, v2.16b - -.Lreduce_final_16_bytes_\@: - // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. - - movi v2.16b, #0 // init zero register - - // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - - // Fold the high 64 bits into the low 64 bits, while also multiplying by - // x^64. This produces a 128-bit value congruent to x^64 * M(x) and - // whose low 48 bits are 0. - ext v0.16b, v2.16b, v7.16b, #8 - __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x)) - eor v0.16b, v0.16b, v7.16b // + low bits * x^64 - - // Fold the high 32 bits into the low 96 bits. This produces a 96-bit - // value congruent to x^64 * M(x) and whose low 48 bits are 0. - ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits - mov v0.s[3], v2.s[0] // zero high 32 bits - __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x)) - eor v0.16b, v0.16b, v1.16b // + low bits - - // Load G(x) and floor(x^48 / G(x)). 
- ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts - - // Use Barrett reduction to compute the final CRC value. - __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x)) - ushr v1.2d, v1.2d, #32 // /= x^32 - __pmull_\p v1, v1, fold_consts // *= G(x) - ushr v0.2d, v0.2d, #48 - eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits - // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. - - umov w0, v0.h[0] - .ifc \p, p8 - frame_pop - .endif - ret + b .Lreduce_final_16_bytes_\@ .Lless_than_256_bytes_\@: // Checksumming a buffer of length 16...255 bytes @@ -545,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 add len, len, #16 b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + +.Lreduce_final_16_bytes_\@: .endm // @@ -554,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // SYM_FUNC_START(crc_t10dif_pmull_p8) frame_push 1 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi perm.4h, #8, lsl #8 + orr perm.2s, #1, lsl #16 + orr perm.2s, #1, lsl #24 + zip1 perm.16b, perm.16b, perm.16b + zip1 perm.16b, perm.16b, perm.16b + crc_t10dif_pmull p8 + +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + str q7, [x3] + + frame_pop + ret SYM_FUNC_END(crc_t10dif_pmull_p8) .align 5 @@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8) // SYM_FUNC_START(crc_t10dif_pmull_p64) crc_t10dif_pmull p64 + + // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. + + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Use Barrett reduction to compute the final CRC value. + pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. 
+ + umov w0, v0.h[0] + ret SYM_FUNC_END(crc_t10dif_pmull_p64) .section ".rodata", "a" diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 7b05094a0480..08bcbd884395 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -20,7 +20,8 @@ #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U -asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); static int crct10dif_init(struct shash_desc *desc) @@ -34,16 +35,21 @@ static int crct10dif_init(struct shash_desc *desc) static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, unsigned int length) { - u16 *crc = shash_desc_ctx(desc); + u16 *crcp = shash_desc_ctx(desc); + u16 crc = *crcp; + u8 buf[16]; - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { + if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, length); + crc_t10dif_pmull_p8(crc, data, length, buf); kernel_neon_end(); - } else { - *crc = crc_t10dif_generic(*crc, data, length); + + crc = 0; + data = buf; + length = sizeof(buf); } + *crcp = crc_t10dif_generic(crc, data, length); return 0; } -- 2.51.0 From fcf27785ae51b259ea2a9b340f10f9d393954887 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:04 +0100 Subject: [PATCH 10/16] crypto: arm/crct10dif - Use existing mov_l macro instead of __adrl Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/crct10dif-ce-core.S | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 46c02c518a30..4dac32e020de 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -144,11 +144,6 @@ CPU_LE( vrev64.8 q12, q12 ) veor.8 \dst_reg, \dst_reg, \src_reg .endm - .macro __adrl, out, sym - movw \out, #:lower16:\sym - movt \out, #:upper16:\sym - .endm - // // u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); // @@ -160,7 +155,7 @@ ENTRY(crc_t10dif_pmull) cmp len, #256 blt .Lless_than_256_bytes - __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts + mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts // Load the first 128 data bytes. Byte swapping is necessary to make // the bit order match the polynomial coefficient order. @@ -262,7 +257,7 @@ CPU_LE( vrev64.8 q0, q0 ) vswp q0l, q0h // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. - __adrl r3, .Lbyteshift_table + 16 + mov_l r3, .Lbyteshift_table + 16 sub r3, r3, len vld1.8 {q2}, [r3] vtbl.8 q1l, {q7l-q7h}, q2l @@ -324,7 +319,7 @@ CPU_LE( vrev64.8 q0, q0 ) .Lless_than_256_bytes: // Checksumming a buffer of length 16...255 bytes - __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts // Load the first 16 data bytes. vld1.64 {q7}, [buf]! 
-- 2.51.0 From 802d8d110ce2b3ae979221551f4cb168e2f5e464 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:05 +0100 Subject: [PATCH 11/16] crypto: arm/crct10dif - Macroify PMULL asm code To allow an alternative version to be created of the PMULL based CRC-T10DIF algorithm, turn the bulk of it into a macro, except for the final reduction, which will only be used by the existing version. Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/crct10dif-ce-core.S | 154 ++++++++++++++-------------- arch/arm/crypto/crct10dif-ce-glue.c | 10 +- 2 files changed, 83 insertions(+), 81 deletions(-) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 4dac32e020de..6b72167574b2 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -112,48 +112,42 @@ FOLD_CONST_L .req q10l FOLD_CONST_H .req q10h + .macro pmull16x64_p64, v16, v64 + vmull.p64 q11, \v64\()l, \v16\()_L + vmull.p64 \v64, \v64\()h, \v16\()_H + veor \v64, \v64, q11 + .endm + // Fold reg1, reg2 into the next 32 data bytes, storing the result back // into reg1, reg2. - .macro fold_32_bytes, reg1, reg2 - vld1.64 {q11-q12}, [buf]! + .macro fold_32_bytes, reg1, reg2, p + vld1.64 {q8-q9}, [buf]! - vmull.p64 q8, \reg1\()h, FOLD_CONST_H - vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L - vmull.p64 q9, \reg2\()h, FOLD_CONST_H - vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L + pmull16x64_\p FOLD_CONST, \reg1 + pmull16x64_\p FOLD_CONST, \reg2 -CPU_LE( vrev64.8 q11, q11 ) -CPU_LE( vrev64.8 q12, q12 ) - vswp q11l, q11h - vswp q12l, q12h +CPU_LE( vrev64.8 q8, q8 ) +CPU_LE( vrev64.8 q9, q9 ) + vswp q8l, q8h + vswp q9l, q9h veor.8 \reg1, \reg1, q8 veor.8 \reg2, \reg2, q9 - veor.8 \reg1, \reg1, q11 - veor.8 \reg2, \reg2, q12 .endm // Fold src_reg into dst_reg, optionally loading the next fold constants - .macro fold_16_bytes, src_reg, dst_reg, load_next_consts - vmull.p64 q8, \src_reg\()l, FOLD_CONST_L - vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H + .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts + pmull16x64_\p FOLD_CONST, \src_reg .ifnb \load_next_consts vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! .endif - veor.8 \dst_reg, \dst_reg, q8 veor.8 \dst_reg, \dst_reg, \src_reg .endm -// -// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -ENTRY(crc_t10dif_pmull) - + .macro crct10dif, p // For sizes less than 256 bytes, we can't fold 128 bytes at a time. cmp len, #256 - blt .Lless_than_256_bytes + blt .Lless_than_256_bytes\@ mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts @@ -194,27 +188,27 @@ CPU_LE( vrev64.8 q7, q7 ) // While >= 128 data bytes remain (not counting q0-q7), fold the 128 // bytes q0-q7 into them, storing the result back into q0-q7. -.Lfold_128_bytes_loop: - fold_32_bytes q0, q1 - fold_32_bytes q2, q3 - fold_32_bytes q4, q5 - fold_32_bytes q6, q7 +.Lfold_128_bytes_loop\@: + fold_32_bytes q0, q1, \p + fold_32_bytes q2, q3, \p + fold_32_bytes q4, q5, \p + fold_32_bytes q6, q7, \p subs len, len, #128 - bge .Lfold_128_bytes_loop + bge .Lfold_128_bytes_loop\@ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7. // Fold across 64 bytes. vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! - fold_16_bytes q0, q4 - fold_16_bytes q1, q5 - fold_16_bytes q2, q6 - fold_16_bytes q3, q7, 1 + fold_16_bytes q0, q4, \p + fold_16_bytes q1, q5, \p + fold_16_bytes q2, q6, \p + fold_16_bytes q3, q7, \p, 1 // Fold across 32 bytes. 
- fold_16_bytes q4, q6 - fold_16_bytes q5, q7, 1 + fold_16_bytes q4, q6, \p + fold_16_bytes q5, q7, \p, 1 // Fold across 16 bytes. - fold_16_bytes q6, q7 + fold_16_bytes q6, q7, \p // Add 128 to get the correct number of data bytes remaining in 0...127 // (not counting q7), following the previous extra subtraction by 128. @@ -224,25 +218,23 @@ CPU_LE( vrev64.8 q7, q7 ) // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7 // into them, storing the result back into q7. - blt .Lfold_16_bytes_loop_done -.Lfold_16_bytes_loop: - vmull.p64 q8, q7l, FOLD_CONST_L - vmull.p64 q7, q7h, FOLD_CONST_H - veor.8 q7, q7, q8 + blt .Lfold_16_bytes_loop_done\@ +.Lfold_16_bytes_loop\@: + pmull16x64_\p FOLD_CONST, q7 vld1.64 {q0}, [buf]! CPU_LE( vrev64.8 q0, q0 ) vswp q0l, q0h veor.8 q7, q7, q0 subs len, len, #16 - bge .Lfold_16_bytes_loop + bge .Lfold_16_bytes_loop\@ -.Lfold_16_bytes_loop_done: +.Lfold_16_bytes_loop_done\@: // Add 16 to get the correct number of data bytes remaining in 0...15 // (not counting q7), following the previous extra subtraction by 16. adds len, len, #16 - beq .Lreduce_final_16_bytes + beq .Lreduce_final_16_bytes\@ -.Lhandle_partial_segment: +.Lhandle_partial_segment\@: // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To // do this without needing a fold constant for each possible 'len', @@ -277,12 +269,46 @@ CPU_LE( vrev64.8 q0, q0 ) vbsl.8 q2, q1, q0 // Fold the first chunk into the second chunk, storing the result in q7. - vmull.p64 q0, q3l, FOLD_CONST_L - vmull.p64 q7, q3h, FOLD_CONST_H - veor.8 q7, q7, q0 - veor.8 q7, q7, q2 + pmull16x64_\p FOLD_CONST, q3 + veor.8 q7, q3, q2 + b .Lreduce_final_16_bytes\@ + +.Lless_than_256_bytes\@: + // Checksumming a buffer of length 16...255 bytes + + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + vld1.64 {q7}, [buf]! +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q0h, #0 + vmov.u16 q0h[3], init_crc + veor.8 q7h, q7h, q0h + + // Load the fold-across-16-bytes constants. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + cmp len, #16 + beq .Lreduce_final_16_bytes\@ // len == 16 + subs len, len, #32 + addlt len, len, #16 + blt .Lhandle_partial_segment\@ // 17 <= len <= 31 + b .Lfold_16_bytes_loop\@ // 32 <= len <= 255 + +.Lreduce_final_16_bytes\@: + .endm + +// +// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +ENTRY(crc_t10dif_pmull64) + crct10dif p64 -.Lreduce_final_16_bytes: // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC. // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. @@ -316,31 +342,7 @@ CPU_LE( vrev64.8 q0, q0 ) vmov.u16 r0, q0l[0] bx lr -.Lless_than_256_bytes: - // Checksumming a buffer of length 16...255 bytes - - mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts - - // Load the first 16 data bytes. - vld1.64 {q7}, [buf]! -CPU_LE( vrev64.8 q7, q7 ) - vswp q7l, q7h - - // XOR the first 16 data *bits* with the initial CRC value. - vmov.i8 q0h, #0 - vmov.u16 q0h[3], init_crc - veor.8 q7h, q7h, q0h - - // Load the fold-across-16-bytes constants. - vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! 
- - cmp len, #16 - beq .Lreduce_final_16_bytes // len == 16 - subs len, len, #32 - addlt len, len, #16 - blt .Lhandle_partial_segment // 17 <= len <= 31 - b .Lfold_16_bytes_loop // 32 <= len <= 255 -ENDPROC(crc_t10dif_pmull) +ENDPROC(crc_t10dif_pmull64) .section ".rodata", "a" .align 4 diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index 79f3b204d8c0..60aa79c2fcdb 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -19,7 +19,7 @@ #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U -asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); +asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); static int crct10dif_init(struct shash_desc *desc) { @@ -29,14 +29,14 @@ static int crct10dif_init(struct shash_desc *desc) return 0; } -static int crct10dif_update(struct shash_desc *desc, const u8 *data, - unsigned int length) +static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data, + unsigned int length) { u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); - *crc = crc_t10dif_pmull(*crc, data, length); + *crc = crc_t10dif_pmull64(*crc, data, length); kernel_neon_end(); } else { *crc = crc_t10dif_generic(*crc, data, length); @@ -56,7 +56,7 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out) static struct shash_alg crc_t10dif_alg = { .digestsize = CRC_T10DIF_DIGEST_SIZE, .init = crct10dif_init, - .update = crct10dif_update, + .update = crct10dif_update_ce, .final = crct10dif_final, .descsize = CRC_T10DIF_DIGEST_SIZE, -- 2.51.0 From e7c1d1c9b2023decb855ec4c921a7d78abbf64eb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:06 +0100 Subject: [PATCH 12/16] crypto: arm/crct10dif - Implement plain NEON variant The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in the folding coefficients, which are also only 16 bits wide. This means that the polynomial multiplications involving these coefficients can be performed using 8-bit long polynomial multiplication (8x8 -> 16) in only a few steps, and this is an instruction that is part of the base NEON ISA, which is all most real ARMv7 cores implement. (The 64-bit PMULL instruction is part of the crypto extensions, which are only implemented by 64-bit cores) The final reduction is a bit more involved, but we can delegate that to the generic CRC-T10DIF implementation after folding the entire input into a 16 byte vector. This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit mode. On Cortex-A8 (BeagleBone White), the results are substantially better than that, but not sufficiently reproducible (with tcrypt) to quote a number here. 
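The structure described in the last paragraph shows up directly in the glue code added further below: the NEON helper folds the whole input into a single 16-byte vector, and the generic table-driven implementation performs the final reduction. A condensed sketch of that update path follows; example_update_neon() is an illustrative name, while the asm helper prototype is the one added by this patch.

#include <asm/neon.h>
#include <crypto/internal/simd.h>
#include <linux/crc-t10dif.h>
#include <linux/linkage.h>
#include <linux/types.h>

#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U

/* NEON-only helper from crct10dif-ce-core.S: folds 'len' input bytes
 * (with init_crc mixed into the first block) down to 16 bytes in 'out' */
asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
				  u8 out[16]);

static u16 example_update_neon(u16 crc, const u8 *data, unsigned int length)
{
	u8 buf[16] __aligned(16);

	if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
		kernel_neon_begin();
		crc_t10dif_pmull8(crc, data, length, buf);
		kernel_neon_end();

		/* the seed CRC is already folded into the 16-byte remainder */
		crc = 0;
		data = buf;
		length = sizeof(buf);
	}

	/* final reduction to a 16-bit CRC is left to the generic code */
	return crc_t10dif_generic(crc, data, length);
}

Passing a zero seed together with the folded buffer works because the initial CRC value has already been absorbed into the data during folding.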
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/crct10dif-ce-core.S | 98 +++++++++++++++++++++++++++-- arch/arm/crypto/crct10dif-ce-glue.c | 45 +++++++++++-- 2 files changed, 134 insertions(+), 9 deletions(-) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 6b72167574b2..2bbf2df9c1e2 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -112,6 +112,82 @@ FOLD_CONST_L .req q10l FOLD_CONST_H .req q10h + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. The resulting 80-bit vectors are XOR'ed together. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. + * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and XORed together to produce the + * final 80-bit result. + */ + .macro pmull16x64_p8, v16, v64 + vext.8 q11, \v64, \v64, #1 + vld1.64 {q12}, [r4, :128] + vuzp.8 q11, \v64 + vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24 + vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25 + bl __pmull16x64_p8 + veor \v64, q12, q14 + .endm + +__pmull16x64_p8: + vmull.p8 q13, d23, d24 + vmull.p8 q14, d23, d25 + vmull.p8 q15, d22, d24 + vmull.p8 q12, d22, d25 + + veor q14, q14, q15 + veor d24, d24, d25 + veor d26, d26, d27 + veor d28, d28, d29 + vmov.i32 d25, #0 + vmov.i32 d29, #0 + vext.8 q12, q12, q12, #14 + vext.8 q14, q14, q14, #15 + veor d24, d24, d26 + bx lr +ENDPROC(__pmull16x64_p8) + .macro pmull16x64_p64, v16, v64 vmull.p64 q11, \v64\()l, \v16\()_L vmull.p64 \v64, \v64\()h, \v16\()_H @@ -249,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 ) vswp q0l, q0h // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. 
- mov_l r3, .Lbyteshift_table + 16 - sub r3, r3, len - vld1.8 {q2}, [r3] + mov_l r1, .Lbyteshift_table + 16 + sub r1, r1, len + vld1.8 {q2}, [r1] vtbl.8 q1l, {q7l-q7h}, q2l vtbl.8 q1h, {q7l-q7h}, q2h @@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64) vmov.u16 r0, q0l[0] bx lr - ENDPROC(crc_t10dif_pmull64) +ENTRY(crc_t10dif_pmull8) + push {r4, lr} + mov_l r4, .L16x64perm + + crct10dif p8 + +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + vst1.64 {q7}, [r3, :128] + pop {r4, pc} +ENDPROC(crc_t10dif_pmull8) + .section ".rodata", "a" .align 4 @@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64) .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 + +.L16x64perm: + .quad 0x808080800000000, 0x909090901010101 diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index 60aa79c2fcdb..a8b74523729e 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -20,6 +20,8 @@ #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); static int crct10dif_init(struct shash_desc *desc) { @@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data, return 0; } +static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u16 *crcp = shash_desc_ctx(desc); + u8 buf[16] __aligned(16); + u16 crc = *crcp; + + if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { + kernel_neon_begin(); + crc_t10dif_pmull8(crc, data, length, buf); + kernel_neon_end(); + + crc = 0; + data = buf; + length = sizeof(buf); + } + + *crcp = crc_t10dif_generic(crc, data, length); + return 0; +} + static int crct10dif_final(struct shash_desc *desc, u8 *out) { u16 *crc = shash_desc_ctx(desc); @@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out) return 0; } -static struct shash_alg crc_t10dif_alg = { +static struct shash_alg algs[] = {{ + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = crct10dif_init, + .update = crct10dif_update_neon, + .final = crct10dif_final, + .descsize = CRC_T10DIF_DIGEST_SIZE, + + .base.cra_name = "crct10dif", + .base.cra_driver_name = "crct10dif-arm-neon", + .base.cra_priority = 150, + .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { .digestsize = CRC_T10DIF_DIGEST_SIZE, .init = crct10dif_init, .update = crct10dif_update_ce, @@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = { .base.cra_priority = 200, .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, .base.cra_module = THIS_MODULE, -}; +}}; static int __init crc_t10dif_mod_init(void) { - if (!(elf_hwcap2 & HWCAP2_PMULL)) + if (!(elf_hwcap & HWCAP_NEON)) return -ENODEV; - return crypto_register_shash(&crc_t10dif_alg); + return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL)); } static void __exit crc_t10dif_mod_exit(void) { - crypto_unregister_shash(&crc_t10dif_alg); + crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL)); } module_init(crc_t10dif_mod_init); -- 2.51.0 From 5465951e3f539aaff21823706d631b75407fb8d5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 6 Nov 2024 09:13:43 +0100 Subject: [PATCH 13/16] hwrng: amd - remove reference to removed PPC_MAPLE config MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Commit 62f8f307c80e ("powerpc/64: Remove 
maple platform") removes the PPC_MAPLE config as a consequence of the platform’s removal. The config definition of HW_RANDOM_AMD refers to this removed config option in its dependencies. Remove the reference to the removed config option. Signed-off-by: Lukas Bulwahn Acked-by: Michael Ellerman (powerpc) Signed-off-by: Herbert Xu --- drivers/char/hw_random/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index bf9b2354766a..17854f052386 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -50,7 +50,7 @@ config HW_RANDOM_INTEL config HW_RANDOM_AMD tristate "AMD HW Random Number Generator support" - depends on (X86 || PPC_MAPLE || COMPILE_TEST) + depends on (X86 || COMPILE_TEST) depends on PCI && HAS_IOPORT_MAP default HW_RANDOM help -- 2.51.0 From 3574a5168ff3b6bddc4cd235878491f75967c8d4 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Wed, 6 Nov 2024 13:09:33 +0100 Subject: [PATCH 14/16] crypto: aes-gcm-p10 - Use the correct bit to test for P10 A hwcap feature bit is passed to cpu_has_feature, resulting in testing for CPU_FTR_MMCRA instead of the 3.1 platform revision. Fixes: c954b252dee9 ("crypto: powerpc/p10-aes-gcm - Register modules as SIMD") Reported-by: Nicolai Stange Signed-off-by: Michal Suchanek Acked-by: Michael Ellerman (powerpc) Signed-off-by: Herbert Xu --- arch/powerpc/crypto/aes-gcm-p10-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c index e52629334cf8..1f8b67775658 100644 --- a/arch/powerpc/crypto/aes-gcm-p10-glue.c +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -414,7 +414,7 @@ static int __init p10_init(void) { int ret; - if (!cpu_has_feature(PPC_FEATURE2_ARCH_3_1)) + if (!cpu_has_feature(CPU_FTR_ARCH_31)) return 0; ret = simd_register_aeads_compat(gcm_aes_algs, -- 2.51.0 From 0594ad6184598b5b9a6eb5619785f37f825e6ffd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 7 Nov 2024 13:37:35 +0800 Subject: [PATCH 15/16] crypto: lib/mpi - Export mpi_set_bit This function is part of the exposed API and should be exported. Otherwise a modular user would fail to build, e.g., crypto/rsa. Signed-off-by: Herbert Xu --- lib/crypto/mpi/mpi-bit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 835a2f0622a0..934d81311360 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -95,6 +95,7 @@ int mpi_set_bit(MPI a, unsigned int n) a->d[limbno] |= (A_LIMB_1< Date: Thu, 7 Nov 2024 17:57:06 +0800 Subject: [PATCH 16/16] crypto: aesni - Move back to module_init This patch reverts commit 0fbafd06bdde938884f7326548d3df812b267c3c ("crypto: aesni - fix failing setkey for rfc4106-gcm-aesni") by moving the aesni init function back to module_init from late_initcall. The original patch was needed because tests were synchronous. This is no longer the case so there is no need to postpone the registration. 
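On the mpi_set_bit export in the previous patch: a symbol that is part of a library's public API but only ever linked built-in will build fine until the first modular user appears, at which point that module fails to link or load with an undefined reference. The sketch below shows the general pattern with made-up names (example_lib_helper); it is not the MPI code itself.

#include <linux/export.h>
#include <linux/module.h>

/* built-in library side: a helper that is part of the exposed API */
int example_lib_helper(unsigned int n)
{
	return n + 1;
}
/* make the symbol visible to modules; without this line, a modular user
 * (in the way crypto/rsa uses mpi_set_bit) hits an undefined symbol */
EXPORT_SYMBOL_GPL(example_lib_helper);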
Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index b0dd83555499..fbf43482e1f5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1747,7 +1747,7 @@ static void __exit aesni_exit(void) unregister_avx_algs(); } -late_initcall(aesni_init); +module_init(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); -- 2.51.0
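To close, the init-ordering point of the last patch in self-contained form: with asynchronous crypto self-tests there is no reason to defer algorithm registration, so an ordinary module_init() suffices and a late_initcall() only delays it. A minimal, hypothetical module skeleton (names and messages are illustrative):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>

static int __init example_init(void)
{
	/* register algorithms here; self-tests now run asynchronously, so
	 * there is no need to postpone this to late_initcall() */
	pr_info("example: algorithms registered\n");
	return 0;
}

static void __exit example_exit(void)
{
	pr_info("example: algorithms unregistered\n");
}

module_init(example_init);	/* rather than late_initcall(example_init) */
module_exit(example_exit);

MODULE_DESCRIPTION("Init ordering sketch");
MODULE_LICENSE("GPL");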