.text
        .align          4
 
-/*
- * There are several ways to instantiate this code:
- * - no interleave, all inline
- * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
- * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
- * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
- * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
- *
- * Macros imported by this code:
- * - enc_prepare       - setup NEON registers for encryption
- * - dec_prepare       - setup NEON registers for decryption
- * - enc_switch_key    - change to new key after having prepared for encryption
- * - encrypt_block     - encrypt a single block
- * - decrypt block     - decrypt a single block
- * - encrypt_block2x   - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - decrypt_block2x   - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - encrypt_block4x   - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
- * - decrypt_block4x   - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
- */
-
-#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
-#define FRAME_PUSH     stp x29, x30, [sp,#-16]! ; mov x29, sp
-#define FRAME_POP      ldp x29, x30, [sp],#16
-
-#if INTERLEAVE == 2
-
-aes_encrypt_block2x:
-       encrypt_block2x v0, v1, w3, x2, x8, w7
-       ret
-ENDPROC(aes_encrypt_block2x)
-
-aes_decrypt_block2x:
-       decrypt_block2x v0, v1, w3, x2, x8, w7
-       ret
-ENDPROC(aes_decrypt_block2x)
-
-#elif INTERLEAVE == 4
-
 aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
        ret
 ENDPROC(aes_decrypt_block4x)
 
-#else
-#error INTERLEAVE should equal 2 or 4
-#endif
-
-       .macro          do_encrypt_block2x
-       bl              aes_encrypt_block2x
-       .endm
-
-       .macro          do_decrypt_block2x
-       bl              aes_decrypt_block2x
-       .endm
-
-       .macro          do_encrypt_block4x
-       bl              aes_encrypt_block4x
-       .endm
-
-       .macro          do_decrypt_block4x
-       bl              aes_decrypt_block4x
-       .endm
-
-#else
-#define FRAME_PUSH
-#define FRAME_POP
-
-       .macro          do_encrypt_block2x
-       encrypt_block2x v0, v1, w3, x2, x8, w7
-       .endm
-
-       .macro          do_decrypt_block2x
-       decrypt_block2x v0, v1, w3, x2, x8, w7
-       .endm
-
-       .macro          do_encrypt_block4x
-       encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
-       .endm
-
-       .macro          do_decrypt_block4x
-       decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
-       .endm
-
-#endif
-
        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */
 
 AES_ENTRY(aes_ecb_encrypt)
-       FRAME_PUSH
+       stp             x29, x30, [sp, #-16]!   /* save fp/lr: bl below clobbers x30 */
+       mov             x29, sp                 /* x29 -> new frame record */
 
        enc_prepare     w3, x2, x5
 
 .LecbencloopNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4              /* at least 4 blocks left? */
        bmi             .Lecbenc1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
-       do_encrypt_block2x
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
-       do_encrypt_block4x
+       bl              aes_encrypt_block4x     /* encrypt v0-v3 out of line */
        st1             {v0.16b-v3.16b}, [x0], #64
-#endif
        b               .LecbencloopNx
 .Lecbenc1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4              /* undo subs: w4 = blocks left */
        beq             .Lecbencout
-#endif
 .Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        subs            w4, w4, #1
        bne             .Lecbencloop
 .Lecbencout:
-       FRAME_POP
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 AES_ENDPROC(aes_ecb_encrypt)
 
 
 AES_ENTRY(aes_ecb_decrypt)
-       FRAME_PUSH
+       stp             x29, x30, [sp, #-16]!   /* save fp/lr: bl below clobbers x30 */
+       mov             x29, sp                 /* x29 -> new frame record */
 
        dec_prepare     w3, x2, x5
 
 .LecbdecloopNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4              /* at least 4 blocks left? */
        bmi             .Lecbdec1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
-       do_decrypt_block2x
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
-       do_decrypt_block4x
+       bl              aes_decrypt_block4x     /* decrypt 4 blocks out of line */
        st1             {v0.16b-v3.16b}, [x0], #64
-#endif
        b               .LecbdecloopNx
 .Lecbdec1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4              /* undo subs: w4 = blocks left */
        beq             .Lecbdecout
-#endif
 .Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        subs            w4, w4, #1
        bne             .Lecbdecloop
 .Lecbdecout:
-       FRAME_POP
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 AES_ENDPROC(aes_ecb_decrypt)
 
 
 
 AES_ENTRY(aes_cbc_decrypt)
-       FRAME_PUSH
+       stp             x29, x30, [sp, #-16]!   /* save fp/lr: bl below clobbers x30 */
+       mov             x29, sp                 /* x29 -> new frame record */
 
        ld1             {v7.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6
 
 .LcbcdecloopNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4              /* at least 4 blocks left? */
        bmi             .Lcbcdec1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
-       mov             v2.16b, v0.16b
-       mov             v3.16b, v1.16b
-       do_decrypt_block2x
-       eor             v0.16b, v0.16b, v7.16b
-       eor             v1.16b, v1.16b, v2.16b
-       mov             v7.16b, v3.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
-       do_decrypt_block4x
+       bl              aes_decrypt_block4x     /* decrypt 4 blocks out of line */
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v4.16b
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
-#endif
        b               .LcbcdecloopNx
 .Lcbcdec1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4              /* undo subs: w4 = blocks left */
        beq             .Lcbcdecout
-#endif
 .Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        subs            w4, w4, #1
        bne             .Lcbcdecloop
 .Lcbcdecout:
-       FRAME_POP
        st1             {v7.16b}, [x5]                  /* return iv */
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 AES_ENDPROC(aes_cbc_decrypt)
 
         */
 
 AES_ENTRY(aes_ctr_encrypt)
-       FRAME_PUSH
+       stp             x29, x30, [sp, #-16]!   /* save fp/lr: bl below clobbers x30 */
+       mov             x29, sp                 /* x29 -> new frame record */
 
        enc_prepare     w3, x2, x6
        ld1             {v4.16b}, [x5]
 
        umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
        rev             x6, x6
-#if INTERLEAVE >= 2
        cmn             w6, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
 .LctrloopNx:
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4              /* at least 4 blocks left? */
        bmi             .Lctr1x
-#if INTERLEAVE == 2
-       mov             v0.8b, v4.8b
-       mov             v1.8b, v4.8b
-       rev             x7, x6
-       add             x6, x6, #1
-       ins             v0.d[1], x7
-       rev             x7, x6
-       add             x6, x6, #1
-       ins             v1.d[1], x7
-       ld1             {v2.16b-v3.16b}, [x1], #32      /* get 2 input blocks */
-       do_encrypt_block2x
-       eor             v0.16b, v0.16b, v2.16b
-       eor             v1.16b, v1.16b, v3.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
        dup             v7.4s, w6
        mov             v0.16b, v4.16b
        mov             v2.s[3], v8.s[1]
        mov             v3.s[3], v8.s[2]
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
-       do_encrypt_block4x
+       bl              aes_encrypt_block4x     /* v0-v3 -> 4 keystream blocks */
        eor             v0.16b, v5.16b, v0.16b
        ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
        eor             v1.16b, v6.16b, v1.16b
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
        st1             {v0.16b-v3.16b}, [x0], #64
-       add             x6, x6, #INTERLEAVE
-#endif
+       add             x6, x6, #4              /* ctr += the 4 blocks just done */
        rev             x7, x6
        ins             v4.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
 .Lctr1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4              /* undo subs: w4 = blocks left */
        beq             .Lctrout
-#endif
 .Lctrloop:
        mov             v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
 
 .Lctrout:
        st1             {v4.16b}, [x5]          /* return next CTR value */
-       FRAME_POP
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 
 .Lctrtailblock:
        st1             {v0.16b}, [x0]
-       FRAME_POP
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 
 .Lctrcarry:
 CPU_BE(        .quad           0x87, 1         )
 
 AES_ENTRY(aes_xts_encrypt)
-       FRAME_PUSH
+       stp             x29, x30, [sp, #-16]!   /* save fp/lr: bl below clobbers x30 */
+       mov             x29, sp                 /* x29 -> new frame record */
+
        ld1             {v4.16b}, [x6]
        cbz             w7, .Lxtsencnotfirst
 
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
 .LxtsencNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4              /* at least 4 blocks left? */
        bmi             .Lxtsenc1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
-       next_tweak      v5, v4, v7, v8
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       do_encrypt_block2x
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-       cbz             w4, .LxtsencoutNx
-       next_tweak      v4, v5, v7, v8
-       b               .LxtsencNx
-.LxtsencoutNx:
-       mov             v4.16b, v5.16b
-       b               .Lxtsencout
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
-       do_encrypt_block4x
+       bl              aes_encrypt_block4x     /* encrypt v0-v3 out of line */
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencout
        b               .LxtsencloopNx
-#endif
 .Lxtsenc1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4              /* undo subs: w4 = blocks left */
        beq             .Lxtsencout
-#endif
 .Lxtsencloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        b               .Lxtsencloop
 .Lxtsencout:
        st1             {v4.16b}, [x6]
-       FRAME_POP
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 AES_ENDPROC(aes_xts_encrypt)
 
 
 AES_ENTRY(aes_xts_decrypt)
-       FRAME_PUSH
+       stp             x29, x30, [sp, #-16]!   /* save fp/lr: bl below clobbers x30 */
+       mov             x29, sp                 /* x29 -> new frame record */
+
        ld1             {v4.16b}, [x6]
        cbz             w7, .Lxtsdecnotfirst
 
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
 .LxtsdecNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4              /* at least 4 blocks left? */
        bmi             .Lxtsdec1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
-       next_tweak      v5, v4, v7, v8
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       do_decrypt_block2x
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-       cbz             w4, .LxtsdecoutNx
-       next_tweak      v4, v5, v7, v8
-       b               .LxtsdecNx
-.LxtsdecoutNx:
-       mov             v4.16b, v5.16b
-       b               .Lxtsdecout
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
-       do_decrypt_block4x
+       bl              aes_decrypt_block4x     /* decrypt 4 blocks out of line */
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        b               .LxtsdecloopNx
-#endif
 .Lxtsdec1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4              /* undo subs: w4 = blocks left */
        beq             .Lxtsdecout
-#endif
 .Lxtsdecloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        b               .Lxtsdecloop
 .Lxtsdecout:
        st1             {v4.16b}, [x6]
-       FRAME_POP
+       ldp             x29, x30, [sp], #16     /* restore fp/lr, drop frame */
        ret
 AES_ENDPROC(aes_xts_decrypt)