--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
+ * as specified in RFC 8998
+ * https://datatracker.ietf.org/doc/html/rfc8998
+ *
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "sm4-ce-asm.h"
+
+.arch  armv8-a+crypto
+
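+/*
+ * The sm4e instruction is emitted via .inst with a hand-built encoding so
+ * that this file assembles even with toolchains that lack SM4 support; the
+ * .irp block below maps the usable vector registers to their encodings.
+ */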
+.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
+       .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+       .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+#define RMAC   v16
+
+/* Helper macros. */
+
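+/*
+ * x7 (high) and x8 (low) hold the 128-bit counter in native word order.
+ * inc_le128 stores the current value into \vctr as a big-endian block
+ * (rev64 byte-swaps each 64-bit lane), then increments x7:x8.
+ */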
+#define inc_le128(vctr)                                        \
+               mov             vctr.d[1], x8;          \
+               mov             vctr.d[0], x7;          \
+               adds            x8, x8, #1;             \
+               rev64           vctr.16b, vctr.16b;     \
+               adc             x7, x7, xzr;
+
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbcmac_update)
+       /* input:
+        *   x0: round key array, CTX
+        *   x1: mac
+        *   x2: src
+        *   w3: nblocks  (must be non-zero)
+        */
+       SM4_PREPARE(x0)
+
+       ld1             {RMAC.16b}, [x1]
+
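+       /*
+        * The MAC state is kept one encryption step behind plain CBC-MAC:
+        * each block computes mac = SM4(mac) ^ block, and the final
+        * encryption of the MAC is deferred to sm4_ce_ccm_final().
+        */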
+.Lcbcmac_loop_4x:
+       cmp             w3, #4
+       blt             .Lcbcmac_loop_1x
+
+       sub             w3, w3, #4
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+
+       SM4_CRYPT_BLK(RMAC)
+       eor             RMAC.16b, RMAC.16b, v0.16b
+       SM4_CRYPT_BLK(RMAC)
+       eor             RMAC.16b, RMAC.16b, v1.16b
+       SM4_CRYPT_BLK(RMAC)
+       eor             RMAC.16b, RMAC.16b, v2.16b
+       SM4_CRYPT_BLK(RMAC)
+       eor             RMAC.16b, RMAC.16b, v3.16b
+
+       cbz             w3, .Lcbcmac_end
+       b               .Lcbcmac_loop_4x
+
+.Lcbcmac_loop_1x:
+       sub             w3, w3, #1
+
+       ld1             {v0.16b}, [x2], #16
+
+       SM4_CRYPT_BLK(RMAC)
+       eor             RMAC.16b, RMAC.16b, v0.16b
+
+       cbnz            w3, .Lcbcmac_loop_1x
+
+.Lcbcmac_end:
+       st1             {RMAC.16b}, [x1]
+       ret
+SYM_FUNC_END(sm4_ce_cbcmac_update)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ccm_final)
+       /* input:
+        *   x0: round key array, CTX
+        *   x1: ctr0 (big endian, 128 bit)
+        *   x2: mac
+        */
+       SM4_PREPARE(x0)
+
+       ld1             {RMAC.16b}, [x2]
+       ld1             {v0.16b}, [x1]
+
+       SM4_CRYPT_BLK2(RMAC, v0)
+
+       /* en-/decrypt the mac with ctr0 */
+       eor             RMAC.16b, RMAC.16b, v0.16b
+       st1             {RMAC.16b}, [x2]
+
+       ret
+SYM_FUNC_END(sm4_ce_ccm_final)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ccm_enc)
+       /* input:
+        *   x0: round key array, CTX
+        *   x1: dst
+        *   x2: src
+        *   x3: ctr (big endian, 128 bit)
+        *   w4: nbytes  (must be non-zero)
+        *   x5: mac
+        */
+       SM4_PREPARE(x0)
+
+       ldp             x7, x8, [x3]
+       rev             x7, x7
+       rev             x8, x8
+
+       ld1             {RMAC.16b}, [x5]
+
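+       /*
+        * Each step encrypts a CTR block and the pending MAC state as a
+        * pair via SM4_CRYPT_BLK2, letting the two SM4 computations be
+        * interleaved; the MAC absorbs the plaintext input.
+        */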
+.Lccm_enc_loop_4x:
+       cmp             w4, #(4 * 16)
+       blt             .Lccm_enc_loop_1x
+
+       sub             w4, w4, #(4 * 16)
+
+       /* construct CTRs */
+       inc_le128(v8)                   /* +0 */
+       inc_le128(v9)                   /* +1 */
+       inc_le128(v10)                  /* +2 */
+       inc_le128(v11)                  /* +3 */
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+
+       SM4_CRYPT_BLK2(v8, RMAC)
+       eor             v8.16b, v8.16b, v0.16b
+       eor             RMAC.16b, RMAC.16b, v0.16b
+       SM4_CRYPT_BLK2(v9, RMAC)
+       eor             v9.16b, v9.16b, v1.16b
+       eor             RMAC.16b, RMAC.16b, v1.16b
+       SM4_CRYPT_BLK2(v10, RMAC)
+       eor             v10.16b, v10.16b, v2.16b
+       eor             RMAC.16b, RMAC.16b, v2.16b
+       SM4_CRYPT_BLK2(v11, RMAC)
+       eor             v11.16b, v11.16b, v3.16b
+       eor             RMAC.16b, RMAC.16b, v3.16b
+
+       st1             {v8.16b-v11.16b}, [x1], #64
+
+       cbz             w4, .Lccm_enc_end
+       b               .Lccm_enc_loop_4x
+
+.Lccm_enc_loop_1x:
+       cmp             w4, #16
+       blt             .Lccm_enc_tail
+
+       sub             w4, w4, #16
+
+       /* construct CTRs */
+       inc_le128(v8)
+
+       ld1             {v0.16b}, [x2], #16
+
+       SM4_CRYPT_BLK2(v8, RMAC)
+       eor             v8.16b, v8.16b, v0.16b
+       eor             RMAC.16b, RMAC.16b, v0.16b
+
+       st1             {v8.16b}, [x1], #16
+
+       cbz             w4, .Lccm_enc_end
+       b               .Lccm_enc_loop_1x
+
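+       /*
+        * Partial final block: encrypt the MAC state together with one
+        * last keystream block, then handle the remaining bytes one at a
+        * time, rotating both registers so byte 0 is always the next byte.
+        */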
+.Lccm_enc_tail:
+       /* construct CTRs */
+       inc_le128(v8)
+
+       SM4_CRYPT_BLK2(RMAC, v8)
+
+       /* store new MAC */
+       st1             {RMAC.16b}, [x5]
+
+.Lccm_enc_tail_loop:
+       ldrb            w0, [x2], #1            /* get 1 byte from input */
+       umov            w9, v8.b[0]             /* get top encrypted CTR byte */
+       umov            w6, RMAC.b[0]           /* get top MAC byte */
+
+       eor             w9, w9, w0              /* w9 = CTR ^ input */
+       eor             w6, w6, w0              /* w6 = MAC ^ input */
+
+       strb            w9, [x1], #1            /* store out byte */
+       strb            w6, [x5], #1            /* store MAC byte */
+
+       subs            w4, w4, #1
+       beq             .Lccm_enc_ret
+
+       /* shift out one byte */
+       ext             RMAC.16b, RMAC.16b, RMAC.16b, #1
+       ext             v8.16b, v8.16b, v8.16b, #1
+
+       b               .Lccm_enc_tail_loop
+
+.Lccm_enc_end:
+       /* store new MAC */
+       st1             {RMAC.16b}, [x5]
+
+       /* store new CTR */
+       rev             x7, x7
+       rev             x8, x8
+       stp             x7, x8, [x3]
+
+.Lccm_enc_ret:
+       ret
+SYM_FUNC_END(sm4_ce_ccm_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ccm_dec)
+       /* input:
+        *   x0: round key array, CTX
+        *   x1: dst
+        *   x2: src
+        *   x3: ctr (big endian, 128 bit)
+        *   w4: nbytes  (must be non-zero)
+        *   x5: mac
+        */
+       SM4_PREPARE(x0)
+
+       ldp             x7, x8, [x3]
+       rev             x7, x7
+       rev             x8, x8
+
+       ld1             {RMAC.16b}, [x5]
+
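+       /*
+        * Same structure as the encryption path, except that the MAC
+        * absorbs the decrypted plaintext (v8-v11) instead of the input.
+        */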
+.Lccm_dec_loop_4x:
+       cmp             w4, #(4 * 16)
+       blt             .Lccm_dec_loop_1x
+
+       sub             w4, w4, #(4 * 16)
+
+       /* construct CTRs */
+       inc_le128(v8)                   /* +0 */
+       inc_le128(v9)                   /* +1 */
+       inc_le128(v10)                  /* +2 */
+       inc_le128(v11)                  /* +3 */
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+
+       SM4_CRYPT_BLK2(v8, RMAC)
+       eor             v8.16b, v8.16b, v0.16b
+       eor             RMAC.16b, RMAC.16b, v8.16b
+       SM4_CRYPT_BLK2(v9, RMAC)
+       eor             v9.16b, v9.16b, v1.16b
+       eor             RMAC.16b, RMAC.16b, v9.16b
+       SM4_CRYPT_BLK2(v10, RMAC)
+       eor             v10.16b, v10.16b, v2.16b
+       eor             RMAC.16b, RMAC.16b, v10.16b
+       SM4_CRYPT_BLK2(v11, RMAC)
+       eor             v11.16b, v11.16b, v3.16b
+       eor             RMAC.16b, RMAC.16b, v11.16b
+
+       st1             {v8.16b-v11.16b}, [x1], #64
+
+       cbz             w4, .Lccm_dec_end
+       b               .Lccm_dec_loop_4x
+
+.Lccm_dec_loop_1x:
+       cmp             w4, #16
+       blt             .Lccm_dec_tail
+
+       sub             w4, w4, #16
+
+       /* construct CTRs */
+       inc_le128(v8)
+
+       ld1             {v0.16b}, [x2], #16
+
+       SM4_CRYPT_BLK2(v8, RMAC)
+       eor             v8.16b, v8.16b, v0.16b
+       eor             RMAC.16b, RMAC.16b, v8.16b
+
+       st1             {v8.16b}, [x1], #16
+
+       cbz             w4, .Lccm_dec_end
+       b               .Lccm_dec_loop_1x
+
+.Lccm_dec_tail:
+       /* construct CTRs */
+       inc_le128(v8)
+
+       SM4_CRYPT_BLK2(RMAC, v8)
+
+       /* store new MAC */
+       st1             {RMAC.16b}, [x5]
+
+.Lccm_dec_tail_loop:
+       ldrb            w0, [x2], #1            /* get 1 byte from input */
+       umov            w9, v8.b[0]             /* get top encrypted CTR byte */
+       umov            w6, RMAC.b[0]           /* get top MAC byte */
+
+       eor             w9, w9, w0              /* w9 = CTR ^ input */
+       eor             w6, w6, w9              /* w6 = MAC ^ output */
+
+       strb            w9, [x1], #1            /* store out byte */
+       strb            w6, [x5], #1            /* store MAC byte */
+
+       subs            w4, w4, #1
+       beq             .Lccm_dec_ret
+
+       /* shift out one byte */
+       ext             RMAC.16b, RMAC.16b, RMAC.16b, #1
+       ext             v8.16b, v8.16b, v8.16b, #1
+
+       b               .Lccm_dec_tail_loop
+
+.Lccm_dec_end:
+       /* store new MAC */
+       st1             {RMAC.16b}, [x5]
+
+       /* store new CTR */
+       rev             x7, x7
+       rev             x8, x8
+       stp             x7, x8, [x3]
+
+.Lccm_dec_ret:
+       ret
+SYM_FUNC_END(sm4_ce_ccm_dec)
 
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
+ * as specified in RFC 8998
+ * https://datatracker.ietf.org/doc/html/rfc8998
+ *
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-ce.h"
+
+asmlinkage void sm4_ce_cbcmac_update(const u32 *rkey_enc, u8 *mac,
+                                    const u8 *src, unsigned int nblocks);
+asmlinkage void sm4_ce_ccm_enc(const u32 *rkey_enc, u8 *dst, const u8 *src,
+                              u8 *iv, unsigned int nbytes, u8 *mac);
+asmlinkage void sm4_ce_ccm_dec(const u32 *rkey_enc, u8 *dst, const u8 *src,
+                              u8 *iv, unsigned int nbytes, u8 *mac);
+asmlinkage void sm4_ce_ccm_final(const u32 *rkey_enc, u8 *iv, u8 *mac);
+
+static int ccm_setkey(struct crypto_aead *tfm, const u8 *key,
+                     unsigned int key_len)
+{
+       struct sm4_ctx *ctx = crypto_aead_ctx(tfm);
+
+       if (key_len != SM4_KEY_SIZE)
+               return -EINVAL;
+
+       kernel_neon_begin();
+       sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec,
+                         crypto_sm4_fk, crypto_sm4_ck);
+       kernel_neon_end();
+
+       return 0;
+}
+
+static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
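+       /* per RFC 3610, valid tag lengths are even values from 4 to 16 */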
+       if ((authsize & 1) || authsize < 4)
+               return -EINVAL;
+       return 0;
+}
+
+static int ccm_format_input(u8 info[], struct aead_request *req,
+                           unsigned int msglen)
+{
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       unsigned int l = req->iv[0] + 1;
+       unsigned int m;
+       __be32 len;
+
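+       /*
+        * Build the B0 block: a flags byte, the nonce, and the message
+        * length in the trailing L bytes (RFC 3610).
+        */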
+       /* verify the CCM dimension 'L' (iv[0] encodes L - 1): 2 <= L <= 8 */
+       if (l < 2 || l > 8)
+               return -EINVAL;
+       if (l < 4 && msglen >> (8 * l))
+               return -EOVERFLOW;
+
+       memset(&req->iv[SM4_BLOCK_SIZE - l], 0, l);
+
+       memcpy(info, req->iv, SM4_BLOCK_SIZE);
+
+       m = crypto_aead_authsize(aead);
+
+       /* format flags field per RFC 3610/NIST 800-38C */
+       *info |= ((m - 2) / 2) << 3;
+       if (req->assoclen)
+               *info |= (1 << 6);
+
+       /*
+        * format the message length field; Linux represents msglen as a
+        * u32, so at most four length bytes are needed
+        */
+       if (l >= 4)
+               l = 4;
+
+       len = cpu_to_be32(msglen);
+       memcpy(&info[SM4_BLOCK_SIZE - l], (u8 *)&len + 4 - l, l);
+
+       return 0;
+}
+
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
+{
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       struct sm4_ctx *ctx = crypto_aead_ctx(aead);
+       struct __packed { __be16 l; __be32 h; } aadlen;
+       u32 assoclen = req->assoclen;
+       struct scatter_walk walk;
+       unsigned int len;
+
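+       /*
+        * Encode the associated data length per RFC 3610: lengths below
+        * 0xff00 take two bytes; larger ones use the 0xfffe marker plus a
+        * four-byte length (assoclen is a u32, so the eight-byte form is
+        * never needed).
+        */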
+       if (assoclen < 0xff00) {
+               aadlen.l = cpu_to_be16(assoclen);
+               len = 2;
+       } else {
+               aadlen.l = cpu_to_be16(0xfffe);
+               put_unaligned_be32(assoclen, &aadlen.h);
+               len = 6;
+       }
+
+       sm4_ce_crypt_block(ctx->rkey_enc, mac, mac);
+       crypto_xor(mac, (const u8 *)&aadlen, len);
+
+       scatterwalk_start(&walk, req->src);
+
+       do {
+               u32 n = scatterwalk_clamp(&walk, assoclen);
+               u8 *p, *ptr;
+
+               if (!n) {
+                       scatterwalk_start(&walk, sg_next(walk.sg));
+                       n = scatterwalk_clamp(&walk, assoclen);
+               }
+
+               p = ptr = scatterwalk_map(&walk);
+               assoclen -= n;
+               scatterwalk_advance(&walk, n);
+
+               while (n > 0) {
+                       unsigned int l, nblocks;
+
+                       if (len == SM4_BLOCK_SIZE) {
+                               if (n < SM4_BLOCK_SIZE) {
+                                       sm4_ce_crypt_block(ctx->rkey_enc,
+                                                          mac, mac);
+
+                                       len = 0;
+                               } else {
+                                       nblocks = n / SM4_BLOCK_SIZE;
+                                       sm4_ce_cbcmac_update(ctx->rkey_enc,
+                                                            mac, ptr, nblocks);
+
+                                       ptr += nblocks * SM4_BLOCK_SIZE;
+                                       n %= SM4_BLOCK_SIZE;
+
+                                       continue;
+                               }
+                       }
+
+                       l = min(n, SM4_BLOCK_SIZE - len);
+                       if (l) {
+                               crypto_xor(mac + len, ptr, l);
+                               len += l;
+                               ptr += l;
+                               n -= l;
+                       }
+               }
+
+               scatterwalk_unmap(p);
+               scatterwalk_done(&walk, 0, assoclen);
+       } while (assoclen);
+}
+
+static int ccm_crypt(struct aead_request *req, struct skcipher_walk *walk,
+                    u32 *rkey_enc, u8 mac[],
+                    void (*sm4_ce_ccm_crypt)(const u32 *rkey_enc, u8 *dst,
+                                       const u8 *src, u8 *iv,
+                                       unsigned int nbytes, u8 *mac))
+{
+       u8 __aligned(8) ctr0[SM4_BLOCK_SIZE];
+       int err;
+
+       /* preserve the initial ctr0 for the TAG */
+       memcpy(ctr0, walk->iv, SM4_BLOCK_SIZE);
+       crypto_inc(walk->iv, SM4_BLOCK_SIZE);
+
+       kernel_neon_begin();
+
+       if (req->assoclen)
+               ccm_calculate_auth_mac(req, mac);
+
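+       /*
+        * Process the walk in block-sized chunks; a partial tail is only
+        * handled in the final chunk, where the assembly consumes the
+        * remaining bytes and updates the MAC byte by byte.
+        */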
+       do {
+               unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE;
+               const u8 *src = walk->src.virt.addr;
+               u8 *dst = walk->dst.virt.addr;
+
+               if (walk->nbytes == walk->total)
+                       tail = 0;
+
+               if (walk->nbytes - tail)
+                       sm4_ce_ccm_crypt(rkey_enc, dst, src, walk->iv,
+                                        walk->nbytes - tail, mac);
+
+               if (walk->nbytes == walk->total)
+                       sm4_ce_ccm_final(rkey_enc, ctr0, mac);
+
+               kernel_neon_end();
+
+               if (walk->nbytes) {
+                       err = skcipher_walk_done(walk, tail);
+                       if (err)
+                               return err;
+                       if (walk->nbytes)
+                               kernel_neon_begin();
+               }
+       } while (walk->nbytes > 0);
+
+       return 0;
+}
+
+static int ccm_encrypt(struct aead_request *req)
+{
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       struct sm4_ctx *ctx = crypto_aead_ctx(aead);
+       u8 __aligned(8) mac[SM4_BLOCK_SIZE];
+       struct skcipher_walk walk;
+       int err;
+
+       err = ccm_format_input(mac, req, req->cryptlen);
+       if (err)
+               return err;
+
+       err = skcipher_walk_aead_encrypt(&walk, req, false);
+       if (err)
+               return err;
+
+       err = ccm_crypt(req, &walk, ctx->rkey_enc, mac, sm4_ce_ccm_enc);
+       if (err)
+               return err;
+
+       /* copy authtag to end of dst */
+       scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen,
+                                crypto_aead_authsize(aead), 1);
+
+       return 0;
+}
+
+static int ccm_decrypt(struct aead_request *req)
+{
+       struct crypto_aead *aead = crypto_aead_reqtfm(req);
+       unsigned int authsize = crypto_aead_authsize(aead);
+       struct sm4_ctx *ctx = crypto_aead_ctx(aead);
+       u8 __aligned(8) mac[SM4_BLOCK_SIZE];
+       u8 authtag[SM4_BLOCK_SIZE];
+       struct skcipher_walk walk;
+       int err;
+
+       err = ccm_format_input(mac, req, req->cryptlen - authsize);
+       if (err)
+               return err;
+
+       err = skcipher_walk_aead_decrypt(&walk, req, false);
+       if (err)
+               return err;
+
+       err = ccm_crypt(req, &walk, ctx->rkey_enc, mac, sm4_ce_ccm_dec);
+       if (err)
+               return err;
+
+       /* compare calculated auth tag with the stored one */
+       scatterwalk_map_and_copy(authtag, req->src,
+                                req->assoclen + req->cryptlen - authsize,
+                                authsize, 0);
+
+       if (crypto_memneq(authtag, mac, authsize))
+               return -EBADMSG;
+
+       return 0;
+}
+
+static struct aead_alg sm4_ccm_alg = {
+       .base = {
+               .cra_name               = "ccm(sm4)",
+               .cra_driver_name        = "ccm-sm4-ce",
+               .cra_priority           = 400,
+               .cra_blocksize          = 1,
+               .cra_ctxsize            = sizeof(struct sm4_ctx),
+               .cra_module             = THIS_MODULE,
+       },
+       .ivsize         = SM4_BLOCK_SIZE,
+       .chunksize      = SM4_BLOCK_SIZE,
+       .maxauthsize    = SM4_BLOCK_SIZE,
+       .setkey         = ccm_setkey,
+       .setauthsize    = ccm_setauthsize,
+       .encrypt        = ccm_encrypt,
+       .decrypt        = ccm_decrypt,
+};
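+
+/*
+ * Kernel-side usage sketch (illustrative only, error handling omitted):
+ *
+ *     tfm = crypto_alloc_aead("ccm(sm4)", 0, 0);
+ *     crypto_aead_setkey(tfm, key, SM4_KEY_SIZE);
+ *     crypto_aead_setauthsize(tfm, SM4_BLOCK_SIZE);
+ *     aead_request_set_crypt(req, src, dst, cryptlen, iv);
+ *     crypto_aead_encrypt(req);
+ */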
+
+static int __init sm4_ce_ccm_init(void)
+{
+       return crypto_register_aead(&sm4_ccm_alg);
+}
+
+static void __exit sm4_ce_ccm_exit(void)
+{
+       crypto_unregister_aead(&sm4_ccm_alg);
+}
+
+module_cpu_feature_match(SM4, sm4_ce_ccm_init);
+module_exit(sm4_ce_ccm_exit);
+
+MODULE_DESCRIPTION("Synchronous SM4 in CCM mode using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("ccm(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");