poly1305_block_sse2(a, m, b, 1);
 }
 
+/*
+ * Scalar (non-SIMD) fallback: if the one-time key has not been fully set
+ * yet, consume it from the head of @src, then hash every complete
+ * POLY1305_BLOCK_SIZE block with the portable core implementation.
+ *
+ * Returns the number of leftover bytes (< POLY1305_BLOCK_SIZE) that the
+ * caller must buffer for the next update.
+ */
+static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
+                                          const u8 *src, unsigned int srclen)
+{
+       unsigned int datalen;
+
+       if (unlikely(!dctx->sset)) {
+               /* Absorb (the rest of) the key material before hashing data. */
+               datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+               src += srclen - datalen;
+               srclen = datalen;
+       }
+       if (srclen >= POLY1305_BLOCK_SIZE) {
+               poly1305_core_blocks(&dctx->h, dctx->r, src,
+                                    srclen / POLY1305_BLOCK_SIZE, 1);
+               srclen %= POLY1305_BLOCK_SIZE;
+       }
+       return srclen;
+}
+
+/*
+ * NOTE(review): the visible context line names poly1305_simd_blocks(dctx, ...)
+ * but the body reads 'desc' — this hunk's context appears garbled; confirm
+ * the real enclosing function against the full file before relying on it.
+ */
 static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
                                         const u8 *src, unsigned int srclen)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
        unsigned int bytes;
 
-       /* kernel_fpu_begin/end is costly, use fallback for small updates */
-       if (srclen <= 288 || !crypto_simd_usable())
-               return crypto_poly1305_update(desc, src, srclen);
-
-       kernel_fpu_begin();
-
        if (unlikely(dctx->buflen)) {
                bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
                memcpy(dctx->buf + dctx->buflen, src, bytes);
                dctx->buflen += bytes;
 
                if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-                       poly1305_simd_blocks(dctx, dctx->buf,
-                                            POLY1305_BLOCK_SIZE);
+                       /*
+                        * SIMD needs the FPU: hold kernel_fpu_begin/end only
+                        * around the block processing itself, and fall back
+                        * to the scalar path when the FPU is unusable.
+                        */
+                       if (likely(crypto_simd_usable())) {
+                               kernel_fpu_begin();
+                               poly1305_simd_blocks(dctx, dctx->buf,
+                                                    POLY1305_BLOCK_SIZE);
+                               kernel_fpu_end();
+                       } else {
+                               poly1305_scalar_blocks(dctx, dctx->buf,
+                                                      POLY1305_BLOCK_SIZE);
+                       }
                        dctx->buflen = 0;
                }
        }
 
        if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+               /* Same pattern for the bulk data: FPU section per batch only. */
+               if (likely(crypto_simd_usable())) {
+                       kernel_fpu_begin();
+                       bytes = poly1305_simd_blocks(dctx, src, srclen);
+                       kernel_fpu_end();
+               } else {
+                       bytes = poly1305_scalar_blocks(dctx, src, srclen);
+               }
                src += srclen - bytes;
                srclen = bytes;
        }
 
-       kernel_fpu_end();
-
        if (unlikely(srclen)) {
+               /* Stash the partial trailing block for the next update call. */
                dctx->buflen = srclen;
                memcpy(dctx->buf, src, srclen);
        }
+}
+
+/*
+ * shash ->init: start a fresh Poly1305 computation.  The r and s key
+ * halves are not set here; they are consumed from the start of the data
+ * stream on the first update (see poly1305_scalar_blocks()).
+ */
+static int crypto_poly1305_init(struct shash_desc *desc)
+{
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+       poly1305_core_init(&dctx->h);
+       dctx->buflen = 0;
+       dctx->rset = 0;
+       dctx->sset = false;
+
+       return 0;
+}
+
+/*
+ * shash ->final: emit the MAC.  Fails with -ENOKEY if the input stream
+ * never supplied the full one-time key (dctx->sset still false).
+ */
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+       if (unlikely(!dctx->sset))
+               return -ENOKEY;
 
+       poly1305_final_generic(dctx, dst);
+       return 0;
+}
 
 
 #include <linux/module.h>
 #include <asm/unaligned.h>
 
-int crypto_poly1305_init(struct shash_desc *desc)
+/* No longer exported: only referenced via poly1305_alg in this file. */
+static int crypto_poly1305_init(struct shash_desc *desc)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 
 static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
                            unsigned int srclen)
                             srclen / POLY1305_BLOCK_SIZE, 1);
 }
 
-int crypto_poly1305_update(struct shash_desc *desc,
-                          const u8 *src, unsigned int srclen)
+/* No longer exported: only referenced via poly1305_alg in this file. */
+static int crypto_poly1305_update(struct shash_desc *desc,
+                                 const u8 *src, unsigned int srclen)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
        unsigned int bytes;
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(crypto_poly1305_update);
 
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+/* No longer exported: only referenced via poly1305_alg in this file. */
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
        poly1305_final_generic(dctx, dst);
        return 0;
 }
-EXPORT_SYMBOL_GPL(crypto_poly1305_final);
 
 static struct shash_alg poly1305_alg = {
        .digestsize     = POLY1305_DIGEST_SIZE,
 
 #include <linux/types.h>
 #include <crypto/poly1305.h>
 
-struct shash_desc;
-
 /*
  * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
  * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
                          unsigned int nblocks, u32 hibit);
 void poly1305_core_emit(const struct poly1305_state *state, void *dst);
 
-/* Crypto API helper functions for the Poly1305 MAC */
-int crypto_poly1305_init(struct shash_desc *desc);
-
-int crypto_poly1305_update(struct shash_desc *desc,
-                          const u8 *src, unsigned int srclen);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
-
 /*
  * Poly1305 requires a unique key for each tag, which implies that we can't set
  * it on the tfm that gets accessed by multiple users simultaneously. Instead we