From bdd9155560d09c471886c46e4e2f60d246598b8c Mon Sep 17 00:00:00 2001 From: Vishal Chourasia Date: Tue, 15 Oct 2024 16:25:51 +0530 Subject: [PATCH 01/16] crypto: nx - Rename devdata_mutex to devdata_spinlock Rename devdata_mutex to devdata_spinlock to accurately reflect its implementation as a spinlock. [1] v1 https://lore.kernel.org/all/ZwyqD-w5hEhrnqTB@linux.ibm.com Signed-off-by: Vishal Chourasia Signed-off-by: Herbert Xu --- drivers/crypto/nx/nx-common-pseries.c | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/crypto/nx/nx-common-pseries.c b/drivers/crypto/nx/nx-common-pseries.c index 35f2d0d8507e..a0eb900383af 100644 --- a/drivers/crypto/nx/nx-common-pseries.c +++ b/drivers/crypto/nx/nx-common-pseries.c @@ -133,7 +133,7 @@ struct nx842_devdata { }; static struct nx842_devdata __rcu *devdata; -static DEFINE_SPINLOCK(devdata_mutex); +static DEFINE_SPINLOCK(devdata_spinlock); #define NX842_COUNTER_INC(_x) \ static inline void nx842_inc_##_x( \ @@ -750,15 +750,15 @@ static int nx842_OF_upd(struct property *new_prop) if (!new_devdata) return -ENOMEM; - spin_lock_irqsave(&devdata_mutex, flags); + spin_lock_irqsave(&devdata_spinlock, flags); old_devdata = rcu_dereference_check(devdata, - lockdep_is_held(&devdata_mutex)); + lockdep_is_held(&devdata_spinlock)); if (old_devdata) of_node = old_devdata->dev->of_node; if (!old_devdata || !of_node) { pr_err("%s: device is not available\n", __func__); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); kfree(new_devdata); return -ENODEV; } @@ -810,7 +810,7 @@ out: old_devdata->max_sg_len); rcu_assign_pointer(devdata, new_devdata); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); synchronize_rcu(); dev_set_drvdata(new_devdata->dev, new_devdata); kfree(old_devdata); @@ -821,13 +821,13 @@ error_out: dev_info(old_devdata->dev, "%s: device disabled\n", __func__); 
nx842_OF_set_defaults(new_devdata); rcu_assign_pointer(devdata, new_devdata); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); synchronize_rcu(); dev_set_drvdata(new_devdata->dev, new_devdata); kfree(old_devdata); } else { dev_err(old_devdata->dev, "%s: could not update driver from hardware\n", __func__); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); } if (!ret) @@ -1045,9 +1045,9 @@ static int nx842_probe(struct vio_dev *viodev, return -ENOMEM; } - spin_lock_irqsave(&devdata_mutex, flags); + spin_lock_irqsave(&devdata_spinlock, flags); old_devdata = rcu_dereference_check(devdata, - lockdep_is_held(&devdata_mutex)); + lockdep_is_held(&devdata_spinlock)); if (old_devdata && old_devdata->vdev != NULL) { dev_err(&viodev->dev, "%s: Attempt to register more than one instance of the hardware\n", __func__); @@ -1062,7 +1062,7 @@ static int nx842_probe(struct vio_dev *viodev, nx842_OF_set_defaults(new_devdata); rcu_assign_pointer(devdata, new_devdata); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); synchronize_rcu(); kfree(old_devdata); @@ -1101,7 +1101,7 @@ static int nx842_probe(struct vio_dev *viodev, return 0; error_unlock: - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); if (new_devdata) kfree(new_devdata->counters); kfree(new_devdata); @@ -1122,12 +1122,12 @@ static void nx842_remove(struct vio_dev *viodev) crypto_unregister_alg(&nx842_pseries_alg); - spin_lock_irqsave(&devdata_mutex, flags); + spin_lock_irqsave(&devdata_spinlock, flags); old_devdata = rcu_dereference_check(devdata, - lockdep_is_held(&devdata_mutex)); + lockdep_is_held(&devdata_spinlock)); of_reconfig_notifier_unregister(&nx842_of_nb); RCU_INIT_POINTER(devdata, NULL); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); synchronize_rcu(); 
dev_set_drvdata(&viodev->dev, NULL); if (old_devdata) @@ -1257,11 +1257,11 @@ static void __exit nx842_pseries_exit(void) crypto_unregister_alg(&nx842_pseries_alg); - spin_lock_irqsave(&devdata_mutex, flags); + spin_lock_irqsave(&devdata_spinlock, flags); old_devdata = rcu_dereference_check(devdata, - lockdep_is_held(&devdata_mutex)); + lockdep_is_held(&devdata_spinlock)); RCU_INIT_POINTER(devdata, NULL); - spin_unlock_irqrestore(&devdata_mutex, flags); + spin_unlock_irqrestore(&devdata_spinlock, flags); synchronize_rcu(); if (old_devdata && old_devdata->dev) dev_set_drvdata(old_devdata->dev, NULL); -- 2.51.0 From 69b062072739404f403bab2710b770919ce2f1ce Mon Sep 17 00:00:00 2001 From: Vishal Chourasia Date: Tue, 15 Oct 2024 16:25:52 +0530 Subject: [PATCH 02/16] crypto: nx - Fix invalid wait context during kexec reboot nx842_remove() calls of_reconfig_notifier_unregister() while holding the devdata_spinlock. This could lead to an invalid wait context error during kexec reboot, as of_reconfig_notifier_unregister() tries to acquire a read-write semaphore (check logs) while holding a spinlock. Move the of_reconfig_notifier_unregister() call before acquiring the spinlock to prevent this invalid wait context during system shutdown or kexec operations. 
Log: [ BUG: Invalid wait context ] 6.11.0-test2-10547-g684a64bf32b6-dirty #79 Not tainted ----------------------------- kexec/61926 is trying to lock: c000000002d8b590 ((of_reconfig_chain).rwsem){++++}-{4:4}, at: blocking_notifier_chain_unregister+0x44/0xa0 other info that might help us debug this: context-{5:5} 4 locks held by kexec/61926: #0: c000000002926c70 (system_transition_mutex){+.+.}-{4:4}, at: __do_sys_reboot+0xf8/0x2e0 #1: c00000000291af30 (&dev->mutex){....}-{4:4}, at: device_shutdown+0x160/0x310 #2: c000000051011938 (&dev->mutex){....}-{4:4}, at: device_shutdown+0x174/0x310 #3: c000000002d88070 (devdata_mutex){....}-{3:3}, at: nx842_remove+0xac/0x1bc stack backtrace: CPU: 2 UID: 0 PID: 61926 Comm: kexec Not tainted 6.11.0-test2-10547-g684a64bf32b6-dirty #79 Hardware name: IBM,9080-HEX POWER10 (architected) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_012) hv:phyp pSeries Call Trace: [c0000000bb577400] [c000000001239704] dump_stack_lvl+0xc8/0x130 (unreliable) [c0000000bb577440] [c000000000248398] __lock_acquire+0xb68/0xf00 [c0000000bb577550] [c000000000248820] lock_acquire.part.0+0xf0/0x2a0 [c0000000bb577670] [c00000000127faa0] down_write+0x70/0x1e0 [c0000000bb5776b0] [c0000000001acea4] blocking_notifier_chain_unregister+0x44/0xa0 [c0000000bb5776e0] [c000000000e2312c] of_reconfig_notifier_unregister+0x2c/0x40 [c0000000bb577700] [c000000000ded24c] nx842_remove+0x148/0x1bc [c0000000bb577790] [c00000000011a114] vio_bus_remove+0x54/0xc0 [c0000000bb5777c0] [c000000000c1a44c] device_shutdown+0x20c/0x310 [c0000000bb577850] [c0000000001b0ab4] kernel_restart_prepare+0x54/0x70 [c0000000bb577870] [c000000000308718] kernel_kexec+0xa8/0x110 [c0000000bb5778e0] [c0000000001b1144] __do_sys_reboot+0x214/0x2e0 [c0000000bb577a40] [c000000000032f98] system_call_exception+0x148/0x310 [c0000000bb577e50] [c00000000000cedc] system_call_vectored_common+0x15c/0x2ec --- interrupt: 3000 at 0x7fffa07e7df8 NIP: 00007fffa07e7df8 LR: 00007fffa07e7df8 CTR: 0000000000000000 REGS: 
c0000000bb577e80 TRAP: 3000 Not tainted (6.11.0-test2-10547-g684a64bf32b6-dirty) MSR: 800000000280f033 CR: 48022484 XER: 00000000 IRQMASK: 0 GPR00: 0000000000000058 00007ffff961f1e0 00007fffa08f7100 fffffffffee1dead GPR04: 0000000028121969 0000000045584543 0000000000000000 0000000000000003 GPR08: 0000000000000003 0000000000000000 0000000000000000 0000000000000000 GPR12: 0000000000000000 00007fffa0a9b360 ffffffffffffffff 0000000000000000 GPR16: 0000000000000001 0000000000000002 0000000000000001 0000000000000001 GPR20: 000000011710f520 0000000000000000 0000000000000000 0000000000000001 GPR24: 0000000129be0480 0000000000000003 0000000000000003 00007ffff961f2b0 GPR28: 00000001170f2d30 00000001170f2d28 00007fffa08f18d0 0000000129be04a0 NIP [00007fffa07e7df8] 0x7fffa07e7df8 LR [00007fffa07e7df8] 0x7fffa07e7df8 --- interrupt: 3000 Suggested-by: Michael Ellerman Signed-off-by: Vishal Chourasia Signed-off-by: Herbert Xu --- drivers/crypto/nx/nx-common-pseries.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/nx/nx-common-pseries.c b/drivers/crypto/nx/nx-common-pseries.c index a0eb900383af..1660c5cf3641 100644 --- a/drivers/crypto/nx/nx-common-pseries.c +++ b/drivers/crypto/nx/nx-common-pseries.c @@ -1122,10 +1122,11 @@ static void nx842_remove(struct vio_dev *viodev) crypto_unregister_alg(&nx842_pseries_alg); + of_reconfig_notifier_unregister(&nx842_of_nb); + spin_lock_irqsave(&devdata_spinlock, flags); old_devdata = rcu_dereference_check(devdata, lockdep_is_held(&devdata_spinlock)); - of_reconfig_notifier_unregister(&nx842_of_nb); RCU_INIT_POINTER(devdata, NULL); spin_unlock_irqrestore(&devdata_spinlock, flags); synchronize_rcu(); -- 2.51.0 From 7b90df78184de90fe5afcc45393c8ad83b5b18a1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 15 Oct 2024 14:11:22 +0100 Subject: [PATCH 03/16] crypto: tegra - remove redundant error check on ret Currently there is an unnecessary error check on ret without a preceding assignment to ret that 
needs checking. The check is redundant and can be removed. Signed-off-by: Colin Ian King Acked-by: Akhil R Signed-off-by: Herbert Xu --- drivers/crypto/tegra/tegra-se-aes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/crypto/tegra/tegra-se-aes.c b/drivers/crypto/tegra/tegra-se-aes.c index ae7a0f8435fc..9d130592cc0a 100644 --- a/drivers/crypto/tegra/tegra-se-aes.c +++ b/drivers/crypto/tegra/tegra-se-aes.c @@ -1180,8 +1180,6 @@ static int tegra_ccm_do_one_req(struct crypto_engine *engine, void *areq) goto out; } else { rctx->cryptlen = req->cryptlen - ctx->authsize; - if (ret) - goto out; /* CTR operation */ ret = tegra_ccm_do_ctr(ctx, rctx); -- 2.51.0 From 4eb10daba80d65a18f56624d183e5304e17c3459 Mon Sep 17 00:00:00 2001 From: Gatien Chevallier Date: Wed, 16 Oct 2024 10:04:18 +0200 Subject: [PATCH 04/16] dt-bindings: rng: add st,stm32mp25-rng support Add RNG STM32MP25x platforms compatible. Update the clock properties management to support all versions. Signed-off-by: Gatien Chevallier Reviewed-by: Rob Herring (Arm) Signed-off-by: Herbert Xu --- .../devicetree/bindings/rng/st,stm32-rng.yaml | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml b/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml index 340d01d481d1..7db65f49773b 100644 --- a/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml +++ b/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml @@ -18,12 +18,19 @@ properties: enum: - st,stm32-rng - st,stm32mp13-rng + - st,stm32mp25-rng reg: maxItems: 1 clocks: - maxItems: 1 + minItems: 1 + maxItems: 2 + + clock-names: + items: + - const: core + - const: bus resets: maxItems: 1 @@ -57,6 +64,25 @@ allOf: properties: st,rng-lock-conf: false + - if: + properties: + compatible: + contains: + enum: + - st,stm32-rng + - st,stm32mp13-rng + then: + properties: + clocks: + maxItems: 1 + clock-names: false + else: + properties: + clocks: + minItems: 2 + 
required: + - clock-names + additionalProperties: false examples: -- 2.51.0 From 842285d4ce1cecbe768ea01bed42ad5a938ab3dd Mon Sep 17 00:00:00 2001 From: Gatien Chevallier Date: Wed, 16 Oct 2024 10:04:19 +0200 Subject: [PATCH 05/16] hwrng: stm32 - implement support for STM32MP25x platforms Implement the support for STM32MP25x platforms. On this platform, a security clock is shared between some hardware blocks. For the RNG, it is the RNG kernel clock. Therefore, the gate is no more shared between the RNG bus and kernel clocks as on STM32MP1x platforms and the bus clock has to be managed on its own. Signed-off-by: Gatien Chevallier Reviewed-by: Marek Vasut Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 74 ++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 9d041a67c295..279328902bf8 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -49,6 +50,7 @@ struct stm32_rng_data { uint max_clock_rate; + uint nb_clock; u32 cr; u32 nscr; u32 htcr; @@ -72,7 +74,7 @@ struct stm32_rng_private { struct hwrng rng; struct device *dev; void __iomem *base; - struct clk *clk; + struct clk_bulk_data *clk_bulk; struct reset_control *rst; struct stm32_rng_config pm_conf; const struct stm32_rng_data *data; @@ -266,7 +268,7 @@ static uint stm32_rng_clock_freq_restrain(struct hwrng *rng) unsigned long clock_rate = 0; uint clock_div = 0; - clock_rate = clk_get_rate(priv->clk); + clock_rate = clk_get_rate(priv->clk_bulk[0].clk); /* * Get the exponent to apply on the CLKDIV field in RNG_CR register @@ -276,7 +278,7 @@ static uint stm32_rng_clock_freq_restrain(struct hwrng *rng) while ((clock_rate >> clock_div) > priv->data->max_clock_rate) clock_div++; - pr_debug("RNG clk rate : %lu\n", clk_get_rate(priv->clk) >> clock_div); + pr_debug("RNG clk rate : 
%lu\n", clk_get_rate(priv->clk_bulk[0].clk) >> clock_div); return clock_div; } @@ -288,7 +290,7 @@ static int stm32_rng_init(struct hwrng *rng) int err; u32 reg; - err = clk_prepare_enable(priv->clk); + err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk); if (err) return err; @@ -328,7 +330,7 @@ static int stm32_rng_init(struct hwrng *rng) (!(reg & RNG_CR_CONDRST)), 10, 50000); if (err) { - clk_disable_unprepare(priv->clk); + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); dev_err(priv->dev, "%s: timeout %x!\n", __func__, reg); return -EINVAL; } @@ -356,12 +358,13 @@ static int stm32_rng_init(struct hwrng *rng) reg & RNG_SR_DRDY, 10, 100000); if (err || (reg & ~RNG_SR_DRDY)) { - clk_disable_unprepare(priv->clk); + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); dev_err(priv->dev, "%s: timeout:%x SR: %x!\n", __func__, err, reg); + return -EINVAL; } - clk_disable_unprepare(priv->clk); + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); return 0; } @@ -379,7 +382,8 @@ static int __maybe_unused stm32_rng_runtime_suspend(struct device *dev) reg = readl_relaxed(priv->base + RNG_CR); reg &= ~RNG_CR_RNGEN; writel_relaxed(reg, priv->base + RNG_CR); - clk_disable_unprepare(priv->clk); + + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); return 0; } @@ -389,7 +393,7 @@ static int __maybe_unused stm32_rng_suspend(struct device *dev) struct stm32_rng_private *priv = dev_get_drvdata(dev); int err; - err = clk_prepare_enable(priv->clk); + err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk); if (err) return err; @@ -403,7 +407,7 @@ static int __maybe_unused stm32_rng_suspend(struct device *dev) writel_relaxed(priv->pm_conf.cr, priv->base + RNG_CR); - clk_disable_unprepare(priv->clk); + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); return 0; } @@ -414,7 +418,7 @@ static int __maybe_unused stm32_rng_runtime_resume(struct device *dev) int err; u32 reg; - err = 
clk_prepare_enable(priv->clk); + err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk); if (err) return err; @@ -434,7 +438,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev) int err; u32 reg; - err = clk_prepare_enable(priv->clk); + err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk); if (err) return err; @@ -462,7 +466,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev) reg & ~RNG_CR_CONDRST, 10, 100000); if (err) { - clk_disable_unprepare(priv->clk); + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); dev_err(priv->dev, "%s: timeout:%x CR: %x!\n", __func__, err, reg); return -EINVAL; } @@ -472,7 +476,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev) writel_relaxed(reg, priv->base + RNG_CR); } - clk_disable_unprepare(priv->clk); + clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk); return 0; } @@ -484,9 +488,19 @@ static const struct dev_pm_ops __maybe_unused stm32_rng_pm_ops = { stm32_rng_resume) }; +static const struct stm32_rng_data stm32mp25_rng_data = { + .has_cond_reset = true, + .max_clock_rate = 48000000, + .nb_clock = 2, + .cr = 0x00F00D00, + .nscr = 0x2B5BB, + .htcr = 0x969D, +}; + static const struct stm32_rng_data stm32mp13_rng_data = { .has_cond_reset = true, .max_clock_rate = 48000000, + .nb_clock = 1, .cr = 0x00F00D00, .nscr = 0x2B5BB, .htcr = 0x969D, @@ -495,9 +509,14 @@ static const struct stm32_rng_data stm32mp13_rng_data = { static const struct stm32_rng_data stm32_rng_data = { .has_cond_reset = false, .max_clock_rate = 3000000, + .nb_clock = 1, }; static const struct of_device_id stm32_rng_match[] = { + { + .compatible = "st,stm32mp25-rng", + .data = &stm32mp25_rng_data, + }, { .compatible = "st,stm32mp13-rng", .data = &stm32mp13_rng_data, @@ -516,6 +535,7 @@ static int stm32_rng_probe(struct platform_device *ofdev) struct device_node *np = ofdev->dev.of_node; struct stm32_rng_private *priv; struct resource *res; + int ret; priv = 
devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -525,10 +545,6 @@ static int stm32_rng_probe(struct platform_device *ofdev) if (IS_ERR(priv->base)) return PTR_ERR(priv->base); - priv->clk = devm_clk_get(&ofdev->dev, NULL); - if (IS_ERR(priv->clk)) - return PTR_ERR(priv->clk); - priv->rst = devm_reset_control_get(&ofdev->dev, NULL); if (!IS_ERR(priv->rst)) { reset_control_assert(priv->rst); @@ -551,6 +567,28 @@ static int stm32_rng_probe(struct platform_device *ofdev) priv->rng.read = stm32_rng_read; priv->rng.quality = 900; + if (!priv->data->nb_clock || priv->data->nb_clock > 2) + return -EINVAL; + + ret = devm_clk_bulk_get_all(dev, &priv->clk_bulk); + if (ret != priv->data->nb_clock) + return dev_err_probe(dev, -EINVAL, "Failed to get clocks: %d\n", ret); + + if (priv->data->nb_clock == 2) { + const char *id = priv->clk_bulk[1].id; + struct clk *clk = priv->clk_bulk[1].clk; + + if (!priv->clk_bulk[0].id || !priv->clk_bulk[1].id) + return dev_err_probe(dev, -EINVAL, "Missing clock name\n"); + + if (strcmp(priv->clk_bulk[0].id, "core")) { + priv->clk_bulk[1].id = priv->clk_bulk[0].id; + priv->clk_bulk[1].clk = priv->clk_bulk[0].clk; + priv->clk_bulk[0].id = id; + priv->clk_bulk[0].clk = clk; + } + } + pm_runtime_set_autosuspend_delay(dev, 100); pm_runtime_use_autosuspend(dev); pm_runtime_enable(dev); -- 2.51.0 From 5a61fd622b07b17b6fa3c231fc7d83cbcba0229e Mon Sep 17 00:00:00 2001 From: Gatien Chevallier Date: Wed, 16 Oct 2024 10:04:20 +0200 Subject: [PATCH 06/16] hwrng: stm32 - update STM32MP15 RNG max clock frequency RNG max clock frequency can be updated to 48MHz for stm32mp1x platforms according to the latest specifications. 
Signed-off-by: Gatien Chevallier Reviewed-by: Marek Vasut Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 279328902bf8..5b4fb35bcb5c 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -508,7 +508,7 @@ static const struct stm32_rng_data stm32mp13_rng_data = { static const struct stm32_rng_data stm32_rng_data = { .has_cond_reset = false, - .max_clock_rate = 3000000, + .max_clock_rate = 48000000, .nb_clock = 1, }; -- 2.51.0 From a1ba22921e7186f2b3b8b056a607191e603104db Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 16 Oct 2024 15:57:28 +0200 Subject: [PATCH 07/16] crypto: drbg - Use str_true_false() and str_enabled_disabled() helpers Remove hard-coded strings by using the helper functions str_true_false() and str_enabled_disabled(). Signed-off-by: Thorsten Blum Signed-off-by: Herbert Xu --- crypto/drbg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crypto/drbg.c b/crypto/drbg.c index 3addce90930c..c323f40bed4f 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -101,6 +101,7 @@ #include #include #include +#include /*************************************************************** * Backend cipher definitions available to DRBG @@ -1412,7 +1413,7 @@ static int drbg_generate(struct drbg_state *drbg, if (drbg->pr || drbg->seeded == DRBG_SEED_STATE_UNSEEDED) { pr_devel("DRBG: reseeding before generation (prediction " "resistance: %s, state %s)\n", - drbg->pr ? "true" : "false", + str_true_false(drbg->pr), (drbg->seeded == DRBG_SEED_STATE_FULL ? "seeded" : "unseeded")); /* 9.3.1 steps 7.1 through 7.3 */ @@ -1562,7 +1563,7 @@ static int drbg_instantiate(struct drbg_state *drbg, struct drbg_string *pers, bool reseed = true; pr_devel("DRBG: Initializing DRBG core %d with prediction resistance " - "%s\n", coreref, pr ? 
"enabled" : "disabled"); + "%s\n", coreref, str_enabled_disabled(pr)); mutex_lock(&drbg->drbg_mutex); /* 9.1 step 1 is implicit with the selected DRBG type */ -- 2.51.0 From a37e55791f204bd65da07d281d95629df15ccf81 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 16 Oct 2024 20:57:24 +0200 Subject: [PATCH 08/16] crypto: crc32 - Provide crc32-arch driver for accelerated library code crc32-generic is currently backed by the architecture's CRC-32 library code, which may offer a variety of implementations depending on the capabilities of the platform. These are not covered by the crypto subsystem's fuzz testing capabilities because crc32-generic is the reference driver that the fuzzing logic uses as a source of truth. Fix this by providing a crc32-arch implementation which is based on the arch library code if available, and modify crc32-generic so it is always based on the generic C implementation. If the arch has no CRC-32 library code, this change does nothing. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Makefile | 1 + crypto/crc32_generic.c | 94 +++++++++++++++++++++++++++++++----------- lib/crc32.c | 2 + 3 files changed, 73 insertions(+), 24 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 81be78d39c2d..3e49a820f148 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -155,6 +155,7 @@ obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o +CFLAGS_crc32_generic.o += -DARCH=$(ARCH) obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT) += crc64_rocksoft_generic.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c index a989cb44fd16..8d6e1baec509 100644 --- a/crypto/crc32_generic.c +++ b/crypto/crc32_generic.c @@ -59,6 +59,15 @@ static int crc32_update(struct shash_desc 
*desc, const u8 *data, { u32 *crcp = shash_desc_ctx(desc); + *crcp = crc32_le_base(*crcp, data, len); + return 0; +} + +static int crc32_update_arch(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + *crcp = crc32_le(*crcp, data, len); return 0; } @@ -66,6 +75,13 @@ static int crc32_update(struct shash_desc *desc, const u8 *data, /* No final XOR 0xFFFFFFFF, like crc32_le */ static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(crc32_le_base(*crcp, data, len), out); + return 0; +} + +static int __crc32_finup_arch(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) { put_unaligned_le32(crc32_le(*crcp, data, len), out); return 0; @@ -77,6 +93,12 @@ static int crc32_finup(struct shash_desc *desc, const u8 *data, return __crc32_finup(shash_desc_ctx(desc), data, len, out); } +static int crc32_finup_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup_arch(shash_desc_ctx(desc), data, len, out); +} + static int crc32_final(struct shash_desc *desc, u8 *out) { u32 *crcp = shash_desc_ctx(desc); @@ -88,38 +110,62 @@ static int crc32_final(struct shash_desc *desc, u8 *out) static int crc32_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, - out); + return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out); } -static struct shash_alg alg = { - .setkey = crc32_setkey, - .init = crc32_init, - .update = crc32_update, - .final = crc32_final, - .finup = crc32_finup, - .digest = crc32_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32", - .cra_driver_name = "crc32-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32_cra_init, - } -}; + +static int 
crc32_digest_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup_arch(crypto_shash_ctx(desc->tfm), data, len, out); +} + +static struct shash_alg algs[] = {{ + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update, + .final = crc32_final, + .finup = crc32_finup, + .digest = crc32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-generic", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(u32), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32_cra_init, +}, { + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update_arch, + .final = crc32_final, + .finup = crc32_finup_arch, + .digest = crc32_digest_arch, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-" __stringify(ARCH), + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(u32), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32_cra_init, +}}; static int __init crc32_mod_init(void) { - return crypto_register_shash(&alg); + /* register the arch flavor only if it differs from the generic one */ + return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base)); } static void __exit crc32_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base)); } subsys_initcall(crc32_mod_init); diff --git a/lib/crc32.c b/lib/crc32.c index 5649847d0a8d..a54ba87b7073 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -205,6 +205,8 @@ EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); +EXPORT_SYMBOL(crc32_le_base); + u32 __pure __crc32c_le_base(u32, 
unsigned char const *, size_t) __alias(__crc32c_le); u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be); -- 2.51.0 From 16739efac6e1ea40df5ec7a263e664481840e73a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 16 Oct 2024 20:57:25 +0200 Subject: [PATCH 09/16] crypto: crc32c - Provide crc32c-arch driver for accelerated library code crc32c-generic is currently backed by the architecture's CRC-32c library code, which may offer a variety of implementations depending on the capabilities of the platform. These are not covered by the crypto subsystem's fuzz testing capabilities because crc32c-generic is the reference driver that the fuzzing logic uses as a source of truth. Fix this by providing a crc32c-arch implementation which is based on the arch library code if available, and modify crc32c-generic so it is always based on the generic C implementation. If the arch has no CRC-32c library code, this change does nothing. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Makefile | 1 + crypto/crc32c_generic.c | 94 +++++++++++++++++++++++++++++++---------- lib/crc32.c | 2 + 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 3e49a820f148..77abca715445 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -155,6 +155,7 @@ obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o +CFLAGS_crc32c_generic.o += -DARCH=$(ARCH) CFLAGS_crc32_generic.o += -DARCH=$(ARCH) obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT) += crc64_rocksoft_generic.o diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c index 768614738541..3f928d7f4ade 100644 --- a/crypto/crc32c_generic.c +++ b/crypto/crc32c_generic.c @@ -85,6 +85,15 @@ static int chksum_update(struct shash_desc *desc, const u8 *data, { 
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + ctx->crc = __crc32c_le_base(ctx->crc, data, length); + return 0; +} + +static int chksum_update_arch(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + ctx->crc = __crc32c_le(ctx->crc, data, length); return 0; } @@ -98,6 +107,13 @@ static int chksum_final(struct shash_desc *desc, u8 *out) } static int __chksum_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) +{ + put_unaligned_le32(~__crc32c_le_base(*crcp, data, len), out); + return 0; +} + +static int __chksum_finup_arch(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) { put_unaligned_le32(~__crc32c_le(*crcp, data, len), out); return 0; @@ -111,6 +127,14 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, return __chksum_finup(&ctx->crc, data, len, out); } +static int chksum_finup_arch(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup_arch(&ctx->crc, data, len, out); +} + static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { @@ -119,6 +143,14 @@ static int chksum_digest(struct shash_desc *desc, const u8 *data, return __chksum_finup(&mctx->key, data, length, out); } +static int chksum_digest_arch(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); + + return __chksum_finup_arch(&mctx->key, data, length, out); +} + static int crc32c_cra_init(struct crypto_tfm *tfm) { struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); @@ -127,35 +159,53 @@ static int crc32c_cra_init(struct crypto_tfm *tfm) return 0; } -static struct shash_alg alg = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = chksum_setkey, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = 
sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct chksum_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32c_cra_init, - } -}; +static struct shash_alg algs[] = {{ + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-generic", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct chksum_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32c_cra_init, +}, { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = chksum_setkey, + .init = chksum_init, + .update = chksum_update_arch, + .final = chksum_final, + .finup = chksum_finup_arch, + .digest = chksum_digest_arch, + .descsize = sizeof(struct chksum_desc_ctx), + + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-" __stringify(ARCH), + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_blocksize = CHKSUM_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct chksum_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_init = crc32c_cra_init, +}}; static int __init crc32c_mod_init(void) { - return crypto_register_shash(&alg); + /* register the arch flavor only if it differs from the generic one */ + return crypto_register_shashes(algs, 1 + (&__crc32c_le != &__crc32c_le_base)); } static void __exit crc32c_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, 1 + (&__crc32c_le != &__crc32c_le_base)); } subsys_initcall(crc32c_mod_init); diff --git a/lib/crc32.c b/lib/crc32.c index a54ba87b7073..ff587fee3893 100644 --- 
a/lib/crc32.c +++ b/lib/crc32.c @@ -208,6 +208,8 @@ u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); EXPORT_SYMBOL(crc32_le_base); u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); +EXPORT_SYMBOL(__crc32c_le_base); + u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be); /* -- 2.51.0 From 3b2f2d22fb424e9bebda4dbf6676cbfc7f9f62cd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:42 -0700 Subject: [PATCH 10/16] crypto: x86/aegis128 - access 32-bit arguments as 32-bit Fix the AEGIS assembly code to access 'unsigned int' arguments as 32-bit values instead of 64-bit, since the upper bits of the corresponding 64-bit registers are not guaranteed to be zero. Note: there haven't been any reports of this bug actually causing incorrect behavior. Neither gcc nor clang guarantee zero-extension to 64 bits, but zero-extension is likely to happen in practice because most instructions that operate on 32-bit registers zero-extend to 64 bits. 
Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations") Cc: stable@vger.kernel.org Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-asm.S | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index ad7f4c891625..2de859173940 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -21,7 +21,7 @@ #define T1 %xmm7 #define STATEP %rdi -#define LEN %rsi +#define LEN %esi #define SRC %rdx #define DST %rcx @@ -76,32 +76,32 @@ SYM_FUNC_START_LOCAL(__load_partial) xor %r9d, %r9d pxor MSG, MSG - mov LEN, %r8 + mov LEN, %r8d and $0x1, %r8 jz .Lld_partial_1 - mov LEN, %r8 + mov LEN, %r8d and $0x1E, %r8 add SRC, %r8 mov (%r8), %r9b .Lld_partial_1: - mov LEN, %r8 + mov LEN, %r8d and $0x2, %r8 jz .Lld_partial_2 - mov LEN, %r8 + mov LEN, %r8d and $0x1C, %r8 add SRC, %r8 shl $0x10, %r9 mov (%r8), %r9w .Lld_partial_2: - mov LEN, %r8 + mov LEN, %r8d and $0x4, %r8 jz .Lld_partial_4 - mov LEN, %r8 + mov LEN, %r8d and $0x18, %r8 add SRC, %r8 shl $32, %r9 @@ -111,11 +111,11 @@ SYM_FUNC_START_LOCAL(__load_partial) .Lld_partial_4: movq %r9, MSG - mov LEN, %r8 + mov LEN, %r8d and $0x8, %r8 jz .Lld_partial_8 - mov LEN, %r8 + mov LEN, %r8d and $0x10, %r8 add SRC, %r8 pslldq $8, MSG @@ -139,7 +139,7 @@ SYM_FUNC_END(__load_partial) * %r10 */ SYM_FUNC_START_LOCAL(__store_partial) - mov LEN, %r8 + mov LEN, %r8d mov DST, %r9 movq T0, %r10 @@ -677,7 +677,7 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) call __store_partial /* mask with byte count: */ - movq LEN, T0 + movd LEN, T0 punpcklbw T0, T0 punpcklbw T0, T0 punpcklbw T0, T0 @@ -702,7 +702,8 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) /* * void crypto_aegis128_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); + * unsigned int assoclen, + * unsigned int cryptlen); */ 
SYM_FUNC_START(crypto_aegis128_aesni_final) FRAME_BEGIN @@ -715,8 +716,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu 0x40(STATEP), STATE4 /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 + movd %edx, MSG + movd %ecx, T0 pslldq $8, T0 pxor T0, MSG psllq $3, MSG /* multiply by 8 (to get bit count) */ -- 2.51.0 From ebb445f5e7950a9e052a7df9e6f56c32539f2e55 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:43 -0700 Subject: [PATCH 11/16] crypto: x86/aegis128 - remove no-op init and exit functions Don't bother providing empty stubs for the init and exit methods in struct aead_alg, since they are optional anyway. Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-glue.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 4623189000d8..96586470154e 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -227,22 +227,11 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? 
-EBADMSG : 0; } -static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - static struct aead_alg crypto_aegis128_aesni_alg = { .setkey = crypto_aegis128_aesni_setkey, .setauthsize = crypto_aegis128_aesni_setauthsize, .encrypt = crypto_aegis128_aesni_encrypt, .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, -- 2.51.0 From b8d2e7bac3f768e5ab0b52a4a6dd65aa130113be Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:44 -0700 Subject: [PATCH 12/16] crypto: x86/aegis128 - eliminate some indirect calls Instead of using a struct of function pointers to decide whether to call the encryption or decryption assembly functions, use a conditional branch on a bool. Force-inline the functions to avoid actually generating the branch. This improves performance slightly since indirect calls are slow. Remove the now-unnecessary CFI stubs. Note that just force-inlining the existing functions might cause the compiler to optimize out the indirect branches, but that would not be a reliable way to do it and the CFI stubs would still be required. 
Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-asm.S | 9 ++-- arch/x86/crypto/aegis128-aesni-glue.c | 74 +++++++++++++-------------- 2 files changed, 40 insertions(+), 43 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 2de859173940..1b57558548c7 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -7,7 +7,6 @@ */ #include <linux/linkage.h> -#include <linux/cfi_types.h> #include <asm/frame.h> #define STATE0 %xmm0 @@ -403,7 +402,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad) * void crypto_aegis128_aesni_enc(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) +SYM_FUNC_START(crypto_aegis128_aesni_enc) FRAME_BEGIN cmp $0x10, LEN @@ -500,7 +499,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc) * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) FRAME_BEGIN /* load the state: */ @@ -557,7 +556,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) * void crypto_aegis128_aesni_dec(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) +SYM_FUNC_START(crypto_aegis128_aesni_dec) FRAME_BEGIN cmp $0x10, LEN @@ -654,7 +653,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec) * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) FRAME_BEGIN /* load the state: */ diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 96586470154e..deb39cef0be1 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -56,16 +56,6 @@ struct aegis_ctx { struct aegis_block key; }; -struct
aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; - static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen) @@ -114,20 +104,37 @@ static void crypto_aegis128_aesni_process_ad( } } -static void crypto_aegis128_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_process_crypt(struct aegis_state *state, + struct skcipher_walk *walk, bool enc) { while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { - ops->crypt_blocks(state, - round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); + if (enc) + crypto_aegis128_aesni_enc( + state, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE), + walk->src.virt.addr, + walk->dst.virt.addr); + else + crypto_aegis128_aesni_dec( + state, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE), + walk->src.virt.addr, + walk->dst.virt.addr); skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); } if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); + if (enc) + crypto_aegis128_aesni_enc_tail(state, walk->nbytes, + walk->src.virt.addr, + walk->dst.virt.addr); + else + crypto_aegis128_aesni_dec_tail(state, walk->nbytes, + walk->src.virt.addr, + walk->dst.virt.addr); skcipher_walk_done(walk, 0); } } @@ -162,23 +169,26 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, return 0; } -static void crypto_aegis128_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_crypt(struct aead_request *req, 
+ struct aegis_block *tag_xor, + unsigned int cryptlen, bool enc) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; - ops->skcipher_walk_init(&walk, req, true); + if (enc) + skcipher_walk_aead_encrypt(&walk, req, true); + else + skcipher_walk_aead_decrypt(&walk, req, true); kernel_fpu_begin(); crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128_aesni_process_crypt(&state, &walk, ops); + crypto_aegis128_aesni_process_crypt(&state, &walk, enc); crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); @@ -186,18 +196,12 @@ static void crypto_aegis128_aesni_crypt(struct aead_request *req, static int crypto_aegis128_aesni_encrypt(struct aead_request *req) { - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis128_aesni_enc, - .crypt_tail = crypto_aegis128_aesni_enc_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen; - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); @@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) { static const struct aegis_block zeros = {}; - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis128_aesni_dec, - .crypt_tail = crypto_aegis128_aesni_dec_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); @@ -222,7 +220,7 @@ static int crypto_aegis128_aesni_decrypt(struct 
aead_request *req) scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; } -- 2.51.0 From 595bca25a632a83544d5509e4c92ed3de0a2db51 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:45 -0700 Subject: [PATCH 13/16] crypto: x86/aegis128 - don't bother with special code for aligned data Remove the AEGIS assembly code paths that were "optimized" to operate on 16-byte aligned data using movdqa, and instead just use the code paths that use movdqu and can handle data with any alignment. This does not reduce performance. movdqa is basically a historical artifact; on aligned data, movdqu and movdqa have had the same performance since Intel Nehalem (2008) and AMD Bulldozer (2011). And code that requires AES-NI cannot run on CPUs older than those anyway. Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-asm.S | 122 +++++---------------------- 1 file changed, 22 insertions(+), 100 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 1b57558548c7..5541aca2fd0d 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -245,52 +245,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - movdqa 0x00(SRC), MSG - aegis128_update - pxor MSG, STATE4 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 - - movdqa 0x10(SRC), MSG - aegis128_update - pxor MSG, STATE3 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 - - movdqa 0x20(SRC), MSG - aegis128_update - pxor MSG, STATE2 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 - - movdqa 0x30(SRC), MSG - aegis128_update - 
pxor MSG, STATE1 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 - - movdqa 0x40(SRC), MSG - aegis128_update - pxor MSG, STATE0 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 - - add $0x50, SRC - jmp .Lad_a_loop - .align 8 -.Lad_u_loop: +.Lad_loop: movdqu 0x00(SRC), MSG aegis128_update pxor MSG, STATE4 @@ -327,7 +283,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) jl .Lad_out_0 add $0x50, SRC - jmp .Lad_u_loop + jmp .Lad_loop /* store the state: */ .Lad_out_0: @@ -380,15 +336,15 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) RET SYM_FUNC_END(crypto_aegis128_aesni_ad) -.macro encrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro encrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG movdqa MSG, T0 pxor \s1, T0 pxor \s4, T0 movdqa \s2, T1 pand \s3, T1 pxor T1, T0 - movdq\a T0, (\i * 0x10)(DST) + movdqu T0, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 @@ -415,34 +371,17 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - .align 8 -.Lenc_a_loop: - encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Lenc_loop: + encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block u STATE1 STATE2 STATE3 STATE4 
STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Lenc_u_loop + jmp .Lenc_loop /* store the state: */ .Lenc_out_0: @@ -535,14 +474,14 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) RET SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) -.macro decrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro decrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG pxor \s1, MSG pxor \s4, MSG movdqa \s2, T1 pand \s3, T1 pxor T1, MSG - movdq\a MSG, (\i * 0x10)(DST) + movdqu MSG, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 @@ -569,34 +508,17 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 8 -.Ldec_a_loop: - decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Ldec_a_loop - .align 8 -.Ldec_u_loop: - decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Ldec_loop: + decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Ldec_u_loop + jmp .Ldec_loop /* store the state: */ .Ldec_out_0: -- 2.51.0 From af2aff7caf8afb7abbe219a838d61b4c17d88a47 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:46 -0700 Subject: [PATCH 14/16] crypto: x86/aegis128 - optimize length block preparation using SSE4.1 Start 
using SSE4.1 instructions in the AES-NI AEGIS code, with the first use case being preparing the length block in fewer instructions. In practice this does not reduce the set of CPUs on which the code can run, because all Intel and AMD CPUs with AES-NI also have SSE4.1. Upgrade the existing SSE2 feature check to SSE4.1, though it seems this check is not strictly necessary; the aesni-intel module has been getting away with using SSE4.1 despite checking for AES-NI only. Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Kconfig | 4 ++-- arch/x86/crypto/aegis128-aesni-asm.S | 6 ++---- arch/x86/crypto/aegis128-aesni-glue.c | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 7b1bebed879d..3d2e38ba5240 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -363,7 +363,7 @@ config CRYPTO_CHACHA20_X86_64 - AVX-512VL (Advanced Vector Extensions-512VL) config CRYPTO_AEGIS128_AESNI_SSE2 - tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" + tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" depends on X86 && 64BIT select CRYPTO_AEAD select CRYPTO_SIMD @@ -372,7 +372,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 Architecture: x86_64 using: - AES-NI (AES New Instructions) - - SSE2 (Streaming SIMD Extensions 2) + - SSE4.1 (Streaming SIMD Extensions 4.1) config CRYPTO_NHPOLY1305_SSE2 tristate "Hash functions: NHPoly1305 (SSE2)" diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 5541aca2fd0d..6ed4bc452c29 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * AES-NI + SSE2 implementation of AEGIS-128 + * AES-NI + SSE4.1 implementation of AEGIS-128 * * Copyright (c) 2017-2018 Ondrej Mosnacek * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
@@ -638,9 +638,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) /* prepare length block: */ movd %edx, MSG - movd %ecx, T0 - pslldq $8, T0 - pxor T0, MSG + pinsrd $2, %ecx, MSG psllq $3, MSG /* multiply by 8 (to get bit count) */ pxor STATE3, MSG diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index deb39cef0be1..4dd2d981a514 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation + * Glue for AES-NI + SSE4.1 implementation * * Copyright (c) 2017-2018 Ondrej Mosnacek * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. @@ -254,7 +254,7 @@ static struct simd_aead_alg *simd_alg; static int __init crypto_aegis128_aesni_module_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2) || + if (!boot_cpu_has(X86_FEATURE_XMM4_1) || !boot_cpu_has(X86_FEATURE_AES) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; @@ -273,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-aesni"); -- 2.51.0 From 8da94b300f67240fbd8880d918200aa9046fc398 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:47 -0700 Subject: [PATCH 15/16] crypto: x86/aegis128 - improve assembly function prototypes Adjust the prototypes of the AEGIS assembly functions: - Use proper types instead of 'void *', when applicable. - Move the length parameter to after the buffers it describes rather than before, to match the usual convention. Also shorten its name to just len (which is the name used in the assembly code). 
- Declare register aliases at the beginning of each function rather than once per file. This was necessary because len was moved, but also it allows adding some aliases where raw registers were used before. - Put assoclen and cryptlen in the correct order when declaring the finalization function in the .c file. - Remove the unnecessary "crypto_" prefix. Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-asm.S | 105 ++++++++++++++++---------- arch/x86/crypto/aegis128-aesni-glue.c | 92 +++++++++++----------- 2 files changed, 112 insertions(+), 85 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 6ed4bc452c29..9dfdbe0b1fb8 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -19,11 +19,6 @@ #define T0 %xmm6 #define T1 %xmm7 -#define STATEP %rdi -#define LEN %esi -#define SRC %rdx -#define DST %rcx - .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 .align 16 .Laegis128_const_0: @@ -72,6 +67,8 @@ * %r9 */ SYM_FUNC_START_LOCAL(__load_partial) + .set LEN, %ecx + .set SRC, %rsi xor %r9d, %r9d pxor MSG, MSG @@ -138,6 +135,8 @@ SYM_FUNC_END(__load_partial) * %r10 */ SYM_FUNC_START_LOCAL(__store_partial) + .set LEN, %ecx + .set DST, %rdx mov LEN, %r8d mov DST, %r9 @@ -184,16 +183,21 @@ SYM_FUNC_START_LOCAL(__store_partial) SYM_FUNC_END(__store_partial) /* - * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + * void aegis128_aesni_init(struct aegis_state *state, + * const struct aegis_block *key, + * const u8 iv[AEGIS128_NONCE_SIZE]); */ -SYM_FUNC_START(crypto_aegis128_aesni_init) +SYM_FUNC_START(aegis128_aesni_init) + .set STATEP, %rdi + .set KEYP, %rsi + .set IVP, %rdx FRAME_BEGIN /* load IV: */ - movdqu (%rdx), T1 + movdqu (IVP), T1 /* load key: */ - movdqa (%rsi), KEY + movdqa (KEYP), KEY pxor KEY, T1 movdqa T1, STATE0 movdqa KEY, STATE3 @@ -226,13 +230,16 @@ 
SYM_FUNC_START(crypto_aegis128_aesni_init) FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_init) +SYM_FUNC_END(aegis128_aesni_init) /* - * void crypto_aegis128_aesni_ad(void *state, unsigned int length, - * const void *data); + * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + * unsigned int len); */ -SYM_FUNC_START(crypto_aegis128_aesni_ad) +SYM_FUNC_START(aegis128_aesni_ad) + .set STATEP, %rdi + .set SRC, %rsi + .set LEN, %edx FRAME_BEGIN cmp $0x10, LEN @@ -334,7 +341,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) .Lad_out: FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_ad) +SYM_FUNC_END(aegis128_aesni_ad) .macro encrypt_block s0 s1 s2 s3 s4 i movdqu (\i * 0x10)(SRC), MSG @@ -355,10 +362,14 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad) .endm /* - * void crypto_aegis128_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); */ -SYM_FUNC_START(crypto_aegis128_aesni_enc) +SYM_FUNC_START(aegis128_aesni_enc) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx FRAME_BEGIN cmp $0x10, LEN @@ -432,13 +443,17 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) .Lenc_out: FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc) +SYM_FUNC_END(aegis128_aesni_enc) /* - * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_START(aegis128_aesni_enc_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx FRAME_BEGIN /* load the state: */ @@ -472,7 +487,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_END(aegis128_aesni_enc_tail) .macro decrypt_block s0 s1 s2 s3 s4 i movdqu (\i * 0x10)(SRC), MSG @@ -492,10 +507,14 @@ 
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) .endm /* - * void crypto_aegis128_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); */ -SYM_FUNC_START(crypto_aegis128_aesni_dec) +SYM_FUNC_START(aegis128_aesni_dec) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx FRAME_BEGIN cmp $0x10, LEN @@ -569,13 +588,17 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) .Ldec_out: FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec) +SYM_FUNC_END(aegis128_aesni_dec) /* - * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_START(aegis128_aesni_dec_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx FRAME_BEGIN /* load the state: */ @@ -619,14 +642,18 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_END(aegis128_aesni_dec_tail) /* - * void crypto_aegis128_aesni_final(void *state, void *tag_xor, - * unsigned int assoclen, - * unsigned int cryptlen); + * void aegis128_aesni_final(struct aegis_state *state, + * struct aegis_block *tag_xor, + * unsigned int assoclen, unsigned int cryptlen); */ -SYM_FUNC_START(crypto_aegis128_aesni_final) +SYM_FUNC_START(aegis128_aesni_final) + .set STATEP, %rdi + .set TAG_XOR, %rsi + .set ASSOCLEN, %edx + .set CRYPTLEN, %ecx FRAME_BEGIN /* load the state: */ @@ -637,8 +664,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu 0x40(STATEP), STATE4 /* prepare length block: */ - movd %edx, MSG - pinsrd $2, %ecx, MSG + movd ASSOCLEN, MSG + pinsrd $2, CRYPTLEN, MSG psllq $3, MSG /* multiply by 8 (to get bit count) */ pxor STATE3, MSG @@ -653,7 +680,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) 
aegis128_update; pxor MSG, STATE3 /* xor tag: */ - movdqu (%rsi), MSG + movdqu (TAG_XOR), MSG pxor STATE0, MSG pxor STATE1, MSG @@ -661,8 +688,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) pxor STATE3, MSG pxor STATE4, MSG - movdqu MSG, (%rsi) + movdqu MSG, (TAG_XOR) FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_final) +SYM_FUNC_END(aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 4dd2d981a514..9555958e4089 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -23,27 +23,6 @@ #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 -asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis128_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis128_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - struct aegis_block { u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); }; @@ -56,6 +35,32 @@ struct aegis_ctx { struct aegis_block key; }; +asmlinkage void aegis128_aesni_init(struct aegis_state *state, + const struct aegis_block *key, + const u8 iv[AEGIS128_NONCE_SIZE]); + +asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + unsigned int len); + +asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const 
u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_final(struct aegis_state *state, + struct aegis_block *tag_xor, + unsigned int assoclen, + unsigned int cryptlen); + static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen) @@ -75,15 +80,14 @@ static void crypto_aegis128_aesni_process_ad( if (pos > 0) { unsigned int fill = AEGIS128_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); - crypto_aegis128_aesni_ad(state, - AEGIS128_BLOCK_SIZE, - buf.bytes); + aegis128_aesni_ad(state, buf.bytes, + AEGIS128_BLOCK_SIZE); pos = 0; left -= fill; src += fill; } - crypto_aegis128_aesni_ad(state, left, src); + aegis128_aesni_ad(state, src, left); src += left & ~(AEGIS128_BLOCK_SIZE - 1); left &= AEGIS128_BLOCK_SIZE - 1; @@ -100,7 +104,7 @@ static void crypto_aegis128_aesni_process_ad( if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); - crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); + aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); } } @@ -110,31 +114,27 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state, { while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { if (enc) - crypto_aegis128_aesni_enc( - state, - round_down(walk->nbytes, - AEGIS128_BLOCK_SIZE), - walk->src.virt.addr, - walk->dst.virt.addr); + aegis128_aesni_enc(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); else - crypto_aegis128_aesni_dec( - state, - round_down(walk->nbytes, - AEGIS128_BLOCK_SIZE), - walk->src.virt.addr, - walk->dst.virt.addr); + aegis128_aesni_dec(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); 
skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); } if (walk->nbytes) { if (enc) - crypto_aegis128_aesni_enc_tail(state, walk->nbytes, - walk->src.virt.addr, - walk->dst.virt.addr); + aegis128_aesni_enc_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); else - crypto_aegis128_aesni_dec_tail(state, walk->nbytes, - walk->src.virt.addr, - walk->dst.virt.addr); + aegis128_aesni_dec_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); skcipher_walk_done(walk, 0); } } @@ -186,10 +186,10 @@ crypto_aegis128_aesni_crypt(struct aead_request *req, kernel_fpu_begin(); - crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); + aegis128_aesni_init(&state, &ctx->key, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); crypto_aegis128_aesni_process_crypt(&state, &walk, enc); - crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); } -- 2.51.0 From 933e8974312e348c017c07591bec56677bdfc3dc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Oct 2024 17:00:48 -0700 Subject: [PATCH 16/16] crypto: x86/aegis128 - optimize partial block handling using SSE4.1 Optimize the code that loads and stores partial blocks, taking advantage of SSE4.1. The code is adapted from that in aes-gcm-aesni-x86_64.S. Reviewed-by: Ondrej Mosnacek Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-asm.S | 236 +++++++++++---------------- 1 file changed, 95 insertions(+), 141 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 9dfdbe0b1fb8..e650330ef695 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -4,6 +4,7 @@ * * Copyright (c) 2017-2018 Ondrej Mosnacek * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
+ * Copyright 2024 Google LLC */ #include @@ -28,11 +29,11 @@ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd -.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 -.align 16 -.Laegis128_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 +.align 32 +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 .text @@ -55,132 +56,86 @@ .endm /* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 + * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register + * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__load_partial) - .set LEN, %ecx - .set SRC, %rsi - xor %r9d, %r9d - pxor MSG, MSG - - mov LEN, %r8d - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8d - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8d - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8d - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8d - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8d - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov LEN, %r8d - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8d - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - RET -SYM_FUNC_END(__load_partial) +.macro load_partial + sub $8, %ecx /* LEN - 8 */ + jle .Lle8\@ + + /* Load 9 <= LEN <= 15 bytes: */ + movq (SRC), MSG /* Load first 8 bytes */ + mov (SRC, %rcx), %rax /* Load last 8 bytes */ + neg %ecx + shl $3, %ecx + shr %cl, %rax /* Discard overlapping bytes */ + pinsrq $1, %rax, MSG + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ 
+ + /* Load 4 <= LEN <= 8 bytes: */ + mov (SRC), %eax /* Load first 4 bytes */ + mov (SRC, %rcx), %r8d /* Load last 4 bytes */ + jmp .Lcombine\@ + +.Llt4\@: + /* Load 1 <= LEN <= 3 bytes: */ + add $2, %ecx /* LEN - 2 */ + movzbl (SRC), %eax /* Load first byte */ + jl .Lmovq\@ + movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ +.Lcombine\@: + shl $3, %ecx + shl %cl, %r8 + or %r8, %rax /* Combine the two parts */ +.Lmovq\@: + movq %rax, MSG +.Ldone\@: +.endm /* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 + * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer + * DST. Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__store_partial) - .set LEN, %ecx - .set DST, %rdx - mov LEN, %r8d - mov DST, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - RET -SYM_FUNC_END(__store_partial) +.macro store_partial msg + sub $8, %ecx /* LEN - 8 */ + jl .Llt8\@ + + /* Store 8 <= LEN <= 15 bytes: */ + pextrq $1, \msg, %rax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %rax + mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ + movq \msg, (DST) /* Store first 8 bytes */ + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Store 4 <= LEN <= 7 bytes: */ + pextrd $1, \msg, %eax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %eax + mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ + movd \msg, (DST) /* Store first 4 bytes */ + jmp .Ldone\@ + +.Llt4\@: + /* Store 1 <= LEN <= 3 bytes: */ + pextrb $0, \msg, 0(DST) + cmp $-2, 
%ecx /* LEN - 4 == -2, i.e. LEN == 2? */ + jl .Ldone\@ + pextrb $1, \msg, 1(DST) + je .Ldone\@ + pextrb $2, \msg, 2(DST) +.Ldone\@: +.endm /* * void aegis128_aesni_init(struct aegis_state *state, @@ -453,7 +408,7 @@ SYM_FUNC_START(aegis128_aesni_enc_tail) .set STATEP, %rdi .set SRC, %rsi .set DST, %rdx - .set LEN, %ecx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ FRAME_BEGIN /* load the state: */ @@ -464,7 +419,8 @@ SYM_FUNC_START(aegis128_aesni_enc_tail) movdqu 0x40(STATEP), STATE4 /* encrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial movdqa MSG, T0 pxor STATE1, T0 @@ -473,7 +429,8 @@ SYM_FUNC_START(aegis128_aesni_enc_tail) pand STATE3, T1 pxor T1, T0 - call __store_partial + mov %r9d, LEN + store_partial T0 aegis128_update pxor MSG, STATE4 @@ -598,7 +555,7 @@ SYM_FUNC_START(aegis128_aesni_dec_tail) .set STATEP, %rdi .set SRC, %rsi .set DST, %rdx - .set LEN, %ecx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ FRAME_BEGIN /* load the state: */ @@ -609,7 +566,8 @@ SYM_FUNC_START(aegis128_aesni_dec_tail) movdqu 0x40(STATEP), STATE4 /* decrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial pxor STATE1, MSG pxor STATE4, MSG @@ -617,17 +575,13 @@ SYM_FUNC_START(aegis128_aesni_dec_tail) pand STATE3, T1 pxor T1, MSG - movdqa MSG, T0 - call __store_partial + mov %r9d, LEN + store_partial MSG /* mask with byte count: */ - movd LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Laegis128_counter(%rip), T1 - pcmpgtb T1, T0 + lea .Lzeropad_mask+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), T0 pand T0, MSG aegis128_update -- 2.51.0