/*
  * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
  *
- * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
        beq     8f                              /* out of input? */
        cbnz    w8, 0b
        eor     v0.16b, v0.16b, v1.16b
-1:     ld1     {v3.16b}, [x4]                  /* load first round key */
+1:     ld1     {v3.4s}, [x4]                   /* load first round key */
        prfm    pldl1strm, [x1]
        cmp     w5, #12                         /* which key size? */
        add     x6, x4, #16
        mov     v5.16b, v3.16b
        b       4f
 2:     mov     v4.16b, v3.16b
-       ld1     {v5.16b}, [x6], #16             /* load 2nd round key */
+       ld1     {v5.4s}, [x6], #16              /* load 2nd round key */
 3:     aese    v0.16b, v4.16b
        aesmc   v0.16b, v0.16b
-4:     ld1     {v3.16b}, [x6], #16             /* load next round key */
+4:     ld1     {v3.4s}, [x6], #16              /* load next round key */
        aese    v0.16b, v5.16b
        aesmc   v0.16b, v0.16b
-5:     ld1     {v4.16b}, [x6], #16             /* load next round key */
+5:     ld1     {v4.4s}, [x6], #16              /* load next round key */
        subs    w7, w7, #3
        aese    v0.16b, v3.16b
        aesmc   v0.16b, v0.16b
-       ld1     {v5.16b}, [x6], #16             /* load next round key */
+       ld1     {v5.4s}, [x6], #16              /* load next round key */
        bpl     3b
        aese    v0.16b, v4.16b
        subs    w2, w2, #16                     /* last data? */
         *                       u32 rounds);
         */
 ENTRY(ce_aes_ccm_final)
-       ld1     {v3.16b}, [x2], #16             /* load first round key */
+       ld1     {v3.4s}, [x2], #16              /* load first round key */
        ld1     {v0.16b}, [x0]                  /* load mac */
        cmp     w3, #12                         /* which key size? */
        sub     w3, w3, #2                      /* modified # of rounds */
        mov     v5.16b, v3.16b
        b       2f
 0:     mov     v4.16b, v3.16b
-1:     ld1     {v5.16b}, [x2], #16             /* load next round key */
+1:     ld1     {v5.4s}, [x2], #16              /* load next round key */
        aese    v0.16b, v4.16b
        aesmc   v0.16b, v0.16b
        aese    v1.16b, v4.16b
        aesmc   v1.16b, v1.16b
-2:     ld1     {v3.16b}, [x2], #16             /* load next round key */
+2:     ld1     {v3.4s}, [x2], #16              /* load next round key */
        aese    v0.16b, v5.16b
        aesmc   v0.16b, v0.16b
        aese    v1.16b, v5.16b
        aesmc   v1.16b, v1.16b
-3:     ld1     {v4.16b}, [x2], #16             /* load next round key */
+3:     ld1     {v4.4s}, [x2], #16              /* load next round key */
        subs    w3, w3, #3
        aese    v0.16b, v3.16b
        aesmc   v0.16b, v0.16b
        cmp     w4, #12                         /* which key size? */
        sub     w7, w4, #2                      /* get modified # of rounds */
        ins     v1.d[1], x9                     /* no carry in lower ctr */
-       ld1     {v3.16b}, [x3]                  /* load first round key */
+       ld1     {v3.4s}, [x3]                   /* load first round key */
        add     x10, x3, #16
        bmi     1f
        bne     4f
        mov     v5.16b, v3.16b
        b       3f
 1:     mov     v4.16b, v3.16b
-       ld1     {v5.16b}, [x10], #16            /* load 2nd round key */
+       ld1     {v5.4s}, [x10], #16             /* load 2nd round key */
 2:     /* inner loop: 3 rounds, 2x interleaved */
        aese    v0.16b, v4.16b
        aesmc   v0.16b, v0.16b
        aese    v1.16b, v4.16b
        aesmc   v1.16b, v1.16b
-3:     ld1     {v3.16b}, [x10], #16            /* load next round key */
+3:     ld1     {v3.4s}, [x10], #16             /* load next round key */
        aese    v0.16b, v5.16b
        aesmc   v0.16b, v0.16b
        aese    v1.16b, v5.16b
        aesmc   v1.16b, v1.16b
-4:     ld1     {v4.16b}, [x10], #16            /* load next round key */
+4:     ld1     {v4.4s}, [x10], #16             /* load next round key */
        subs    w7, w7, #3
        aese    v0.16b, v3.16b
        aesmc   v0.16b, v0.16b
        aese    v1.16b, v3.16b
        aesmc   v1.16b, v1.16b
-       ld1     {v5.16b}, [x10], #16            /* load next round key */
+       ld1     {v5.4s}, [x10], #16             /* load next round key */
        bpl     2b
        aese    v0.16b, v4.16b
        aese    v1.16b, v4.16b
 
 /*
  * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
  *
- * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  */
 
 #include <asm/neon.h>
+#include <asm/unaligned.h>
 #include <crypto/aes.h>
 #include <linux/cpufeature.h>
 #include <linux/crypto.h>
        kernel_neon_begin_partial(4);
 
        __asm__("       ld1     {v0.16b}, %[in]                 ;"
-               "       ld1     {v1.16b}, [%[key]], #16         ;"
+               "       ld1     {v1.4s}, [%[key]], #16          ;"
                "       cmp     %w[rounds], #10                 ;"
                "       bmi     0f                              ;"
                "       bne     3f                              ;"
                "       mov     v3.16b, v1.16b                  ;"
                "       b       2f                              ;"
                "0:     mov     v2.16b, v1.16b                  ;"
-               "       ld1     {v3.16b}, [%[key]], #16         ;"
+               "       ld1     {v3.4s}, [%[key]], #16          ;"
                "1:     aese    v0.16b, v2.16b                  ;"
                "       aesmc   v0.16b, v0.16b                  ;"
-               "2:     ld1     {v1.16b}, [%[key]], #16         ;"
+               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
                "       aese    v0.16b, v3.16b                  ;"
                "       aesmc   v0.16b, v0.16b                  ;"
-               "3:     ld1     {v2.16b}, [%[key]], #16         ;"
+               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
                "       subs    %w[rounds], %w[rounds], #3      ;"
                "       aese    v0.16b, v1.16b                  ;"
                "       aesmc   v0.16b, v0.16b                  ;"
-               "       ld1     {v3.16b}, [%[key]], #16         ;"
+               "       ld1     {v3.4s}, [%[key]], #16          ;"
                "       bpl     1b                              ;"
                "       aese    v0.16b, v2.16b                  ;"
                "       eor     v0.16b, v0.16b, v3.16b          ;"
        kernel_neon_begin_partial(4);
 
        __asm__("       ld1     {v0.16b}, %[in]                 ;"
-               "       ld1     {v1.16b}, [%[key]], #16         ;"
+               "       ld1     {v1.4s}, [%[key]], #16          ;"
                "       cmp     %w[rounds], #10                 ;"
                "       bmi     0f                              ;"
                "       bne     3f                              ;"
                "       mov     v3.16b, v1.16b                  ;"
                "       b       2f                              ;"
                "0:     mov     v2.16b, v1.16b                  ;"
-               "       ld1     {v3.16b}, [%[key]], #16         ;"
+               "       ld1     {v3.4s}, [%[key]], #16          ;"
                "1:     aesd    v0.16b, v2.16b                  ;"
                "       aesimc  v0.16b, v0.16b                  ;"
-               "2:     ld1     {v1.16b}, [%[key]], #16         ;"
+               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
                "       aesd    v0.16b, v3.16b                  ;"
                "       aesimc  v0.16b, v0.16b                  ;"
-               "3:     ld1     {v2.16b}, [%[key]], #16         ;"
+               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
                "       subs    %w[rounds], %w[rounds], #3      ;"
                "       aesd    v0.16b, v1.16b                  ;"
                "       aesimc  v0.16b, v0.16b                  ;"
-               "       ld1     {v3.16b}, [%[key]], #16         ;"
+               "       ld1     {v3.4s}, [%[key]], #16          ;"
                "       bpl     1b                              ;"
                "       aesd    v0.16b, v2.16b                  ;"
                "       eor     v0.16b, v0.16b, v3.16b          ;"
            key_len != AES_KEYSIZE_256)
                return -EINVAL;
 
-       memcpy(ctx->key_enc, in_key, key_len);
        ctx->key_length = key_len;
+       for (i = 0; i < kwords; i++)
+               ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
 
        kernel_neon_begin_partial(2);
        for (i = 0; i < sizeof(rcon); i++) {
                u32 *rki = ctx->key_enc + (i * kwords);
                u32 *rko = rki + kwords;
 
-#ifndef CONFIG_CPU_BIG_ENDIAN
                rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
-#else
-               rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^
-                        rki[0];
-#endif
                rko[1] = rko[0] ^ rki[1];
                rko[2] = rko[1] ^ rki[2];
                rko[3] = rko[2] ^ rki[3];
 
        key_dec[0] = key_enc[j];
        for (i = 1, j--; j > 0; i++, j--)
-               __asm__("ld1    {v0.16b}, %[in]         ;"
+               __asm__("ld1    {v0.4s}, %[in]          ;"
                        "aesimc v1.16b, v0.16b          ;"
-                       "st1    {v1.16b}, %[out]        ;"
+                       "st1    {v1.4s}, %[out] ;"
 
                :       [out]   "=Q"(key_dec[i])
                :       [in]    "Q"(key_enc[j])
 
  * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
  *                                    Crypto Extensions
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
        cmp             \rounds, #12
        blo             2222f           /* 128 bits */
        beq             1111f           /* 192 bits */
-       ld1             {v17.16b-v18.16b}, [\rk], #32
-1111:  ld1             {v19.16b-v20.16b}, [\rk], #32
-2222:  ld1             {v21.16b-v24.16b}, [\rk], #64
-       ld1             {v25.16b-v28.16b}, [\rk], #64
-       ld1             {v29.16b-v31.16b}, [\rk]
+       ld1             {v17.4s-v18.4s}, [\rk], #32
+1111:  ld1             {v19.4s-v20.4s}, [\rk], #32
+2222:  ld1             {v21.4s-v24.4s}, [\rk], #64
+       ld1             {v25.4s-v28.4s}, [\rk], #64
+       ld1             {v29.4s-v31.4s}, [\rk]
        .endm
 
        /* prepare for encryption with key in rk[] */