#include <asm/hwcap.h>
 #include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/poly1305.h>
-#include <crypto/internal/simd.h>
+#include <crypto/internal/poly1305.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
 #include <linux/unaligned.h>
 
-void poly1305_init_arm(void *state, const u8 *key);
-void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
-
-void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
+asmlinkage void poly1305_block_init_arch(
+       struct poly1305_block_state *state,
+       const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+                                   const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+                                    const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
+                                  u8 digest[POLY1305_DIGEST_SIZE],
+                                  const u32 nonce[4]);
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
+                                const u8 *src, u32 len, u32 hibit)
 {
 }
 
 
 void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
 {
-       poly1305_init_arm(&dctx->h, key);
        dctx->s[0] = get_unaligned_le32(key + 16);
        dctx->s[1] = get_unaligned_le32(key + 20);
        dctx->s[2] = get_unaligned_le32(key + 24);
        dctx->s[3] = get_unaligned_le32(key + 28);
        dctx->buflen = 0;
+       poly1305_block_init_arch(&dctx->state, key);
 }
 EXPORT_SYMBOL(poly1305_init_arch);
 
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+                         unsigned int len, u32 padbit)
+{
+       len = round_down(len, POLY1305_BLOCK_SIZE);
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon)) {
+               do {
+                       unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+                       kernel_neon_begin();
+                       poly1305_blocks_neon(state, src, todo, padbit);
+                       kernel_neon_end();
+
+                       len -= todo;
+                       src += todo;
+               } while (len);
+       } else
+               poly1305_blocks_arm(state, src, len, padbit);
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
                          unsigned int nbytes)
 {
-       bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-                      crypto_simd_usable();
-
        if (unlikely(dctx->buflen)) {
                u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
 
                dctx->buflen += bytes;
 
                if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-                       poly1305_blocks_arm(&dctx->h, dctx->buf,
-                                           POLY1305_BLOCK_SIZE, 1);
+                       poly1305_blocks_arch(&dctx->state, dctx->buf,
+                                            POLY1305_BLOCK_SIZE, 1);
                        dctx->buflen = 0;
                }
        }
 
        if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-               unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-
-               if (static_branch_likely(&have_neon) && do_neon) {
-                       do {
-                               unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
-                               kernel_neon_begin();
-                               poly1305_blocks_neon(&dctx->h, src, todo, 1);
-                               kernel_neon_end();
-
-                               len -= todo;
-                               src += todo;
-                       } while (len);
-               } else {
-                       poly1305_blocks_arm(&dctx->h, src, len, 1);
-                       src += len;
-               }
+               poly1305_blocks_arch(&dctx->state, src, nbytes, 1);
+               src += round_down(nbytes, POLY1305_BLOCK_SIZE);
                nbytes %= POLY1305_BLOCK_SIZE;
        }
 
                dctx->buf[dctx->buflen++] = 1;
                memset(dctx->buf + dctx->buflen, 0,
                       POLY1305_BLOCK_SIZE - dctx->buflen);
-               poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+               poly1305_blocks_arch(&dctx->state, dctx->buf, POLY1305_BLOCK_SIZE, 0);
        }
 
-       poly1305_emit_arm(&dctx->h, dst, dctx->s);
+       poly1305_emit_arch(&dctx->h, dst, dctx->s);
        *dctx = (struct poly1305_desc_ctx){};
 }
 EXPORT_SYMBOL(poly1305_final_arch);