vext.8          q10, qzr, q0, #4
 
        // receive the initial 64B data, xor the initial crc value
-       vld1.64         {q0-q1}, [arg2, :128]!
-       vld1.64         {q2-q3}, [arg2, :128]!
-       vld1.64         {q4-q5}, [arg2, :128]!
-       vld1.64         {q6-q7}, [arg2, :128]!
+       vld1.64         {q0-q1}, [arg2]!
+       vld1.64         {q2-q3}, [arg2]!
+       vld1.64         {q4-q5}, [arg2]!
+       vld1.64         {q6-q7}, [arg2]!
 CPU_LE(        vrev64.8        q0, q0                  )
 CPU_LE(        vrev64.8        q1, q1                  )
 CPU_LE(        vrev64.8        q2, q2                  )
 _fold_64_B_loop:
 
        .macro          fold64, reg1, reg2
-       vld1.64         {q11-q12}, [arg2, :128]!
+       vld1.64         {q11-q12}, [arg2]!
 
        vmull.p64       q8, \reg1\()h, d21
        vmull.p64       \reg1, \reg1\()l, d20
        vmull.p64       q7, d15, d21
        veor.8          q7, q7, q8
 
-       vld1.64         {q0}, [arg2, :128]!
+       vld1.64         {q0}, [arg2]!
 CPU_LE(        vrev64.8        q0, q0          )
        vswp            d0, d1
        veor.8          q7, q7, q0
        vmov.i8         q0, #0
        vmov            s3, arg1_low32          // get the initial crc value
 
-       vld1.64         {q7}, [arg2, :128]!
+       vld1.64         {q7}, [arg2]!
 CPU_LE(        vrev64.8        q7, q7          )
        vswp            d14, d15
        veor.8          q7, q7, q0
 
                            unsigned int length)
 {
        u16 *crc = shash_desc_ctx(desc);
-       unsigned int l;
 
-       if (!may_use_simd()) {
-               *crc = crc_t10dif_generic(*crc, data, length);
+       if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+               kernel_neon_begin();
+               *crc = crc_t10dif_pmull(*crc, data, length);
+               kernel_neon_end();
        } else {
-               if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
-                       l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
-                                 ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
-
-                       *crc = crc_t10dif_generic(*crc, data, l);
-
-                       length -= l;
-                       data += l;
-               }
-               if (length > 0) {
-                       kernel_neon_begin();
-                       *crc = crc_t10dif_pmull(*crc, data, length);
-                       kernel_neon_end();
-               }
+               *crc = crc_t10dif_generic(*crc, data, length);
        }
+
        return 0;
 }