(order_base_2(prop->queue_size / 4) - 1));
        tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
                            (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
-       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
+       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 1);
        tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
        tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
        tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
 
        m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
        m->cp_hqd_pq_control |=
                        ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
+       m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK;
        pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
 
        m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);