int slot_cnt;
        int slots_per_op;
        dma_cookie_t cookie;
+       dma_addr_t next_dma;
 
        grp_start = sw_desc->group_head;
        slot_cnt = grp_start->slot_cnt;
                         &old_chain_tail->chain_node);
 
        /* fix up the hardware chain */
-       iop_desc_set_next_desc(old_chain_tail, grp_start->async_tx.phys);
+       next_dma = grp_start->async_tx.phys;
+       iop_desc_set_next_desc(old_chain_tail, next_dma);
+       BUG_ON(iop_desc_get_next_desc(old_chain_tail) != next_dma); /* flush */
 
-       /* 1/ don't add pre-chained descriptors
-        * 2/ dummy read to flush next_desc write
-        */
+       /* check for pre-chained descriptors */
        BUG_ON(iop_desc_get_next_desc(sw_desc));
 
        /* increment the pending count by the number of slots