#define RTMP3  v19
 
 #define RIV    v20
+#define RMASK  v21
 
 
 .align 3
 SYM_FUNC_END(sm4_ce_ctr_enc)
 
 
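+/*
+ * tweak_next(vt, vin, RTMP): vt = vin * x in GF(2^128), i.e. the next
+ * XTS tweak.  add doubles each 64-bit lane; sshr #63 broadcasts each
+ * lane's carried-out top bit, which is masked with RMASK ({0x1, 0x87},
+ * set up below) and swapped across lanes by ext #8, so the low-lane
+ * carry propagates into the high lane and the high-lane carry is
+ * reduced with the XTS polynomial constant 0x87.
+ */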
+#define tweak_next(vt, vin, RTMP)                                      \
+               sshr            RTMP.2d, vin.2d, #63;                   \
+               and             RTMP.16b, RTMP.16b, RMASK.16b;          \
+               add             vt.2d, vin.2d, vin.2d;                  \
+               ext             RTMP.16b, RTMP.16b, RTMP.16b, #8;       \
+               eor             vt.16b, vt.16b, RTMP.16b;
+
+.align 3
+SYM_FUNC_START(sm4_ce_xts_enc)
+       /* input:
+        *   x0: round key array, CTX
+        *   x1: dst
+        *   x2: src
+        *   x3: tweak (big endian, 128 bit)
+        *   w4: nbytes
+        *   x5: round key array for IV
+        */
+       ld1             {v8.16b}, [x3]
+
+       cbz             x5, .Lxts_enc_nofirst
+
+       SM4_PREPARE(x5)
+
+       /* Generate first tweak */
+       SM4_CRYPT_BLK(v8)
+
+.Lxts_enc_nofirst:
+       SM4_PREPARE(x0)
+
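+       /*
+        * w4 = number of full 16-byte blocks, w5/x5 = tail bytes;
+        * if a tail exists, hold the last full block back for
+        * ciphertext stealing.
+        */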
+       ands            w5, w4, #15
+       lsr             w4, w4, #4
+       sub             w6, w4, #1
+       csel            w4, w4, w6, eq
+       uxtw            x5, w5
+
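+       /* RMASK = {0x1, 0x87} (64-bit lanes): carry bit and reduction constant */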
+       movi            RMASK.2s, #0x1
+       movi            RTMP0.2s, #0x87
+       uzp1            RMASK.4s, RMASK.4s, RTMP0.4s
+
+       cbz             w4, .Lxts_enc_cts
+
+.Lxts_enc_loop_8x:
+       sub             w4, w4, #8
+       tbnz            w4, #31, .Lxts_enc_4x
+
+       tweak_next( v9,  v8, RTMP0)
+       tweak_next(v10,  v9, RTMP1)
+       tweak_next(v11, v10, RTMP2)
+       tweak_next(v12, v11, RTMP3)
+       tweak_next(v13, v12, RTMP0)
+       tweak_next(v14, v13, RTMP1)
+       tweak_next(v15, v14, RTMP2)
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+       ld1             {v4.16b-v7.16b}, [x2], #64
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+       eor             v4.16b, v4.16b, v12.16b
+       eor             v5.16b, v5.16b, v13.16b
+       eor             v6.16b, v6.16b, v14.16b
+       eor             v7.16b, v7.16b, v15.16b
+
+       SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+       eor             v4.16b, v4.16b, v12.16b
+       eor             v5.16b, v5.16b, v13.16b
+       eor             v6.16b, v6.16b, v14.16b
+       eor             v7.16b, v7.16b, v15.16b
+       st1             {v0.16b-v3.16b}, [x1], #64
+       st1             {v4.16b-v7.16b}, [x1], #64
+
+       tweak_next(v8, v15, RTMP3)
+
+       cbz             w4, .Lxts_enc_cts
+       b               .Lxts_enc_loop_8x
+
+.Lxts_enc_4x:
+       add             w4, w4, #8
+       cmp             w4, #4
+       blt             .Lxts_enc_loop_1x
+
+       sub             w4, w4, #4
+
+       tweak_next( v9,  v8, RTMP0)
+       tweak_next(v10,  v9, RTMP1)
+       tweak_next(v11, v10, RTMP2)
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+
+       SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+       st1             {v0.16b-v3.16b}, [x1], #64
+
+       tweak_next(v8, v11, RTMP3)
+
+       cbz             w4, .Lxts_enc_cts
+
+.Lxts_enc_loop_1x:
+       sub             w4, w4, #1
+
+       ld1             {v0.16b}, [x2], #16
+       eor             v0.16b, v0.16b, v8.16b
+
+       SM4_CRYPT_BLK(v0)
+
+       eor             v0.16b, v0.16b, v8.16b
+       st1             {v0.16b}, [x1], #16
+
+       tweak_next(v8, v8, RTMP0)
+
+       cbnz            w4, .Lxts_enc_loop_1x
+
+.Lxts_enc_cts:
+       cbz             x5, .Lxts_enc_end
+
+       /* cipher text stealing */
+
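+       /*
+        * Encrypt the last full block Pn-1 with the current tweak (v8),
+        * giving En-1; Cn is the first tail bytes of En-1, and Cn-1 is
+        * Pn padded with the rest of En-1, encrypted under the next
+        * tweak (v9).
+        */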
+       tweak_next(v9, v8, RTMP0)
+       ld1             {v0.16b}, [x2]
+       eor             v0.16b, v0.16b, v8.16b
+       SM4_CRYPT_BLK(v0)
+       eor             v0.16b, v0.16b, v8.16b
+
+       /* load permute table */
+       adr_l           x6, .Lcts_permute_table
+       add             x7, x6, #32
+       add             x6, x6, x5
+       sub             x7, x7, x5
+       ld1             {v3.16b}, [x6]
+       ld1             {v4.16b}, [x7]
+
+       /* overlapping loads */
+       add             x2, x2, x5
+       ld1             {v1.16b}, [x2]
+
+       /* create Cn from En-1 */
+       tbl             v2.16b, {v0.16b}, v3.16b
+       /* pad Pn to a full block with the tail of En-1 */
+       tbx             v0.16b, {v1.16b}, v4.16b
+
+       eor             v0.16b, v0.16b, v9.16b
+       SM4_CRYPT_BLK(v0)
+       eor             v0.16b, v0.16b, v9.16b
+
+       /* overlapping stores */
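+       /* v0 (Cn-1) is stored last; only the final tail bytes
+        * of v2 survive past it as the short block Cn */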
+       add             x5, x1, x5
+       st1             {v2.16b}, [x5]
+       st1             {v0.16b}, [x1]
+
+       b               .Lxts_enc_ret
+
+.Lxts_enc_end:
+       /* store new tweak */
+       st1             {v8.16b}, [x3]
+
+.Lxts_enc_ret:
+       ret
+SYM_FUNC_END(sm4_ce_xts_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_xts_dec)
+       /* input:
+        *   x0: round key array, CTX
+        *   x1: dst
+        *   x2: src
+        *   x3: tweak (big endian, 128 bit)
+        *   w4: nbytes
+        *   x5: round key array for IV
+        */
+       ld1             {v8.16b}, [x3]
+
+       cbz             x5, .Lxts_dec_nofirst
+
+       SM4_PREPARE(x5)
+
+       /* Generate first tweak */
+       SM4_CRYPT_BLK(v8)
+
+.Lxts_dec_nofirst:
+       SM4_PREPARE(x0)
+
+       ands            w5, w4, #15
+       lsr             w4, w4, #4
+       sub             w6, w4, #1
+       csel            w4, w4, w6, eq
+       uxtw            x5, w5
+
+       movi            RMASK.2s, #0x1
+       movi            RTMP0.2s, #0x87
+       uzp1            RMASK.4s, RMASK.4s, RTMP0.4s
+
+       cbz             w4, .Lxts_dec_cts
+
+.Lxts_dec_loop_8x:
+       sub             w4, w4, #8
+       tbnz            w4, #31, .Lxts_dec_4x
+
+       tweak_next( v9,  v8, RTMP0)
+       tweak_next(v10,  v9, RTMP1)
+       tweak_next(v11, v10, RTMP2)
+       tweak_next(v12, v11, RTMP3)
+       tweak_next(v13, v12, RTMP0)
+       tweak_next(v14, v13, RTMP1)
+       tweak_next(v15, v14, RTMP2)
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+       ld1             {v4.16b-v7.16b}, [x2], #64
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+       eor             v4.16b, v4.16b, v12.16b
+       eor             v5.16b, v5.16b, v13.16b
+       eor             v6.16b, v6.16b, v14.16b
+       eor             v7.16b, v7.16b, v15.16b
+
+       SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+       eor             v4.16b, v4.16b, v12.16b
+       eor             v5.16b, v5.16b, v13.16b
+       eor             v6.16b, v6.16b, v14.16b
+       eor             v7.16b, v7.16b, v15.16b
+       st1             {v0.16b-v3.16b}, [x1], #64
+       st1             {v4.16b-v7.16b}, [x1], #64
+
+       tweak_next(v8, v15, RTMP3)
+
+       cbz             w4, .Lxts_dec_cts
+       b               .Lxts_dec_loop_8x
+
+.Lxts_dec_4x:
+       add             w4, w4, #8
+       cmp             w4, #4
+       blt             .Lxts_dec_loop_1x
+
+       sub             w4, w4, #4
+
+       tweak_next( v9,  v8, RTMP0)
+       tweak_next(v10,  v9, RTMP1)
+       tweak_next(v11, v10, RTMP2)
+
+       ld1             {v0.16b-v3.16b}, [x2], #64
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+
+       SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+       eor             v0.16b, v0.16b,  v8.16b
+       eor             v1.16b, v1.16b,  v9.16b
+       eor             v2.16b, v2.16b, v10.16b
+       eor             v3.16b, v3.16b, v11.16b
+       st1             {v0.16b-v3.16b}, [x1], #64
+
+       tweak_next(v8, v11, RTMP3)
+
+       cbz             w4, .Lxts_dec_cts
+
+.Lxts_dec_loop_1x:
+       sub             w4, w4, #1
+
+       ld1             {v0.16b}, [x2], #16
+       eor             v0.16b, v0.16b, v8.16b
+
+       SM4_CRYPT_BLK(v0)
+
+       eor             v0.16b, v0.16b, v8.16b
+       st1             {v0.16b}, [x1], #16
+
+       tweak_next(v8, v8, RTMP0)
+
+       cbnz            w4, .Lxts_dec_loop_1x
+
+.Lxts_dec_cts:
+       cbz             x5, .Lxts_dec_end
+
+       /* cipher text stealing */
+
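+       /*
+        * Decryption swaps the order of the final two tweaks: Cn-1 is
+        * decrypted under the next tweak (v9), and the rebuilt block
+        * En-1 under the current tweak (v8).
+        */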
+       tweak_next(v9, v8, RTMP0)
+       ld1             {v0.16b}, [x2]
+       eor             v0.16b, v0.16b, v9.16b
+       SM4_CRYPT_BLK(v0)
+       eor             v0.16b, v0.16b, v9.16b
+
+       /* load permute table */
+       adr_l           x6, .Lcts_permute_table
+       add             x7, x6, #32
+       add             x6, x6, x5
+       sub             x7, x7, x5
+       ld1             {v3.16b}, [x6]
+       ld1             {v4.16b}, [x7]
+
+       /* overlapping loads */
+       add             x2, x2, x5
+       ld1             {v1.16b}, [x2]
+
+       /* extract the plaintext tail Pn from the decrypted block */
+       tbl             v2.16b, {v0.16b}, v3.16b
+       /* rebuild En-1: merge Cn into the head of the decrypted block */
+       tbx             v0.16b, {v1.16b}, v4.16b
+
+       eor             v0.16b, v0.16b, v8.16b
+       SM4_CRYPT_BLK(v0)
+       eor             v0.16b, v0.16b, v8.16b
+
+       /* overlapping stores */
+       add             x5, x1, x5
+       st1             {v2.16b}, [x5]
+       st1             {v0.16b}, [x1]
+
+       b               .Lxts_dec_ret
+
+.Lxts_dec_end:
+       /* store new tweak */
+       st1             {v8.16b}, [x3]
+
+.Lxts_dec_ret:
+       ret
+SYM_FUNC_END(sm4_ce_xts_dec)
+
+
        .section        ".rodata", "a"
        .align 4
 .Lbswap128_mask:
 
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
+#include <crypto/xts.h>
 #include <crypto/sm4.h>
 
 #define BYTES2BLKS(nbytes)     ((nbytes) >> 4)
                               u8 *iv, unsigned int nblks);
 asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src,
                               u8 *iv, unsigned int nblks);
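+
+/*
+ * The XTS helpers take a byte count rather than a block count.  When
+ * rkey2_enc is non-NULL, the tweak buffer holds the raw IV, which is
+ * first encrypted with key2; when NULL, it already holds a tweak
+ * computed by an earlier call.
+ */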
+asmlinkage void sm4_ce_xts_enc(const u32 *rkey1, u8 *dst, const u8 *src,
+                              u8 *tweak, unsigned int nbytes,
+                              const u32 *rkey2_enc);
+asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src,
+                              u8 *tweak, unsigned int nbytes,
+                              const u32 *rkey2_enc);
 
 EXPORT_SYMBOL(sm4_ce_expand_key);
 EXPORT_SYMBOL(sm4_ce_crypt_block);
 EXPORT_SYMBOL(sm4_ce_cbc_enc);
 EXPORT_SYMBOL(sm4_ce_cfb_enc);
 
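+/* XTS uses two independent SM4 keys: key1 for the data, key2 for the tweak */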
+struct sm4_xts_ctx {
+       struct sm4_ctx key1;
+       struct sm4_ctx key2;
+};
+
 static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
                      unsigned int key_len)
 {
        return 0;
 }
 
+static int sm4_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                         unsigned int key_len)
+{
+       struct sm4_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int ret;
+
+       if (key_len != SM4_KEY_SIZE * 2)
+               return -EINVAL;
+
+       ret = xts_verify_key(tfm, key, key_len);
+       if (ret)
+               return ret;
+
+       kernel_neon_begin();
+       sm4_ce_expand_key(key, ctx->key1.rkey_enc,
+                         ctx->key1.rkey_dec, crypto_sm4_fk, crypto_sm4_ck);
+       sm4_ce_expand_key(&key[SM4_KEY_SIZE], ctx->key2.rkey_enc,
+                         ctx->key2.rkey_dec, crypto_sm4_fk, crypto_sm4_ck);
+       kernel_neon_end();
+
+       return 0;
+}
+
 static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
 {
        struct skcipher_walk walk;
        return err;
 }
 
+static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct sm4_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int tail = req->cryptlen % SM4_BLOCK_SIZE;
+       const u32 *rkey2_enc = ctx->key2.rkey_enc;
+       struct scatterlist sg_src[2], sg_dst[2];
+       struct skcipher_request subreq;
+       struct scatterlist *src, *dst;
+       struct skcipher_walk walk;
+       unsigned int nbytes;
+       int err;
+
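+       /* XTS requires at least one full block */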
+       if (req->cryptlen < SM4_BLOCK_SIZE)
+               return -EINVAL;
+
+       err = skcipher_walk_virt(&walk, req, false);
+       if (err)
+               return err;
+
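+       /*
+        * With a partial final block, process everything except the
+        * last full block in a subrequest, deferring that block plus
+        * the tail to the ciphertext-stealing pass below.
+        */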
+       if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+               int nblocks = DIV_ROUND_UP(req->cryptlen, SM4_BLOCK_SIZE) - 2;
+
+               skcipher_walk_abort(&walk);
+
+               skcipher_request_set_tfm(&subreq, tfm);
+               skcipher_request_set_callback(&subreq,
+                                             skcipher_request_flags(req),
+                                             NULL, NULL);
+               skcipher_request_set_crypt(&subreq, req->src, req->dst,
+                                          nblocks * SM4_BLOCK_SIZE, req->iv);
+
+               err = skcipher_walk_virt(&walk, &subreq, false);
+               if (err)
+                       return err;
+       } else {
+               tail = 0;
+       }
+
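+       /* round mid-walk chunks down to whole blocks; only the
+        * final chunk may carry the CTS tail */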
+       while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) {
+               if (nbytes < walk.total)
+                       nbytes &= ~(SM4_BLOCK_SIZE - 1);
+
+               kernel_neon_begin();
+
+               if (encrypt)
+                       sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr,
+                                      walk.src.virt.addr, walk.iv, nbytes,
+                                      rkey2_enc);
+               else
+                       sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr,
+                                      walk.src.virt.addr, walk.iv, nbytes,
+                                      rkey2_enc);
+
+               kernel_neon_end();
+
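+               /*
+                * key2 was only needed to derive the first tweak; the
+                * asm saves the next tweak back into walk.iv, so later
+                * calls continue from there.
+                */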
+               rkey2_enc = NULL;
+
+               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+               if (err)
+                       return err;
+       }
+
+       if (likely(tail == 0))
+               return 0;
+
+       /* handle ciphertext stealing */
+
+       dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+       if (req->dst != req->src)
+               dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen);
+
+       skcipher_request_set_crypt(&subreq, src, dst, SM4_BLOCK_SIZE + tail,
+                                  req->iv);
+
+       err = skcipher_walk_virt(&walk, &subreq, false);
+       if (err)
+               return err;
+
+       kernel_neon_begin();
+
+       if (encrypt)
+               sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr,
+                              walk.src.virt.addr, walk.iv, walk.nbytes,
+                              rkey2_enc);
+       else
+               sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr,
+                              walk.src.virt.addr, walk.iv, walk.nbytes,
+                              rkey2_enc);
+
+       kernel_neon_end();
+
+       return skcipher_walk_done(&walk, 0);
+}
+
+static int sm4_xts_encrypt(struct skcipher_request *req)
+{
+       return sm4_xts_crypt(req, true);
+}
+
+static int sm4_xts_decrypt(struct skcipher_request *req)
+{
+       return sm4_xts_crypt(req, false);
+}
+
 static struct skcipher_alg sm4_algs[] = {
        {
                .base = {
                .setkey         = sm4_setkey,
                .encrypt        = sm4_cbc_cts_encrypt,
                .decrypt        = sm4_cbc_cts_decrypt,
+       }, {
+               .base = {
+                       .cra_name               = "xts(sm4)",
+                       .cra_driver_name        = "xts-sm4-ce",
+                       .cra_priority           = 400,
+                       .cra_blocksize          = SM4_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct sm4_xts_ctx),
+                       .cra_module             = THIS_MODULE,
+               },
+               .min_keysize    = SM4_KEY_SIZE * 2,
+               .max_keysize    = SM4_KEY_SIZE * 2,
+               .ivsize         = SM4_BLOCK_SIZE,
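+               /* CTS may need the final two blocks in a single walk chunk */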
+               .walksize       = SM4_BLOCK_SIZE * 2,
+               .setkey         = sm4_xts_setkey,
+               .encrypt        = sm4_xts_encrypt,
+               .decrypt        = sm4_xts_decrypt,
        }
 };
 
 module_cpu_feature_match(SM4, sm4_init);
 module_exit(sm4_exit);
 
-MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions");
+MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR/XTS using ARMv8 Crypto Extensions");
 MODULE_ALIAS_CRYPTO("sm4-ce");
 MODULE_ALIAS_CRYPTO("sm4");
 MODULE_ALIAS_CRYPTO("ecb(sm4)");
 MODULE_ALIAS_CRYPTO("cfb(sm4)");
 MODULE_ALIAS_CRYPTO("ctr(sm4)");
 MODULE_ALIAS_CRYPTO("cts(cbc(sm4))");
+MODULE_ALIAS_CRYPTO("xts(sm4)");
 MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
 MODULE_LICENSE("GPL v2");