u8                      stopping;
        struct list_head        queue;
        struct spi_transfer     *current_transfer;
-       unsigned long           remaining_bytes;
+       unsigned long           current_remaining_bytes;
+       struct spi_transfer     *next_transfer;
+       unsigned long           next_remaining_bytes;
 
        void                    *buffer;
        dma_addr_t              buffer_dma;
                gpio_set_value(gpio, !active);
 }
 
+/*
+ * Return nonzero when @xfer is the final entry on @msg's transfer list,
+ * i.e. when the list head's ->prev link points at @xfer's list node.
+ */
+static inline int atmel_spi_xfer_is_last(struct spi_message *msg,
+                                       struct spi_transfer *xfer)
+{
+       struct list_head        *tail = msg->transfers.prev;
+
+       return &xfer->transfer_list == tail;
+}
+
+/*
+ * Return nonzero when @xfer asks for neither a post-transfer delay
+ * (delay_usecs) nor a chip-select change (cs_change).
+ */
+static inline int atmel_spi_xfer_can_be_chained(struct spi_transfer *xfer)
+{
+       return !(xfer->delay_usecs || xfer->cs_change);
+}
+
+/*
+ * Work out the DMA addresses and length for the next chunk of @xfer.
+ *
+ * @master: controller whose driver data holds the scratch buffer
+ * @xfer:   transfer being (partially) submitted
+ * @tx_dma: out: DMA address to transmit from
+ * @rx_dma: out: DMA address to receive into
+ * @plen:   in: number of bytes of @xfer not yet submitted;
+ *          out: number of bytes covered by this chunk (capped at
+ *          BUFFER_SIZE whenever the scratch buffer stands in for a
+ *          missing rx or tx buffer)
+ */
+static void atmel_spi_next_xfer_data(struct spi_master *master,
+                               struct spi_transfer *xfer,
+                               dma_addr_t *tx_dma,
+                               dma_addr_t *rx_dma,
+                               u32 *plen)
+{
+       struct atmel_spi        *as = spi_master_get_devdata(master);
+       u32                     len = *plen;
+       /*
+        * Offset of this chunk within the transfer.  It must be derived
+        * from the caller's remaining count, not from len after clamping:
+        * len may be cut down to BUFFER_SIZE below, and an offset based on
+        * the clamped value would point a caller-supplied buffer address
+        * at the wrong data.
+        */
+       u32                     offset = xfer->len - len;
+
+       /* use scratch buffer only when rx or tx data is unspecified */
+       if (xfer->rx_buf) {
+               *rx_dma = xfer->rx_dma + offset;
+       } else {
+               *rx_dma = as->buffer_dma;
+               if (len > BUFFER_SIZE)
+                       len = BUFFER_SIZE;
+       }
+       if (xfer->tx_buf) {
+               *tx_dma = xfer->tx_dma + offset;
+       } else {
+               *tx_dma = as->buffer_dma;
+               if (len > BUFFER_SIZE)
+                       len = BUFFER_SIZE;
+               /* no tx data supplied: clock out zeroes from the scratch buffer */
+               memset(as->buffer, 0, len);
+               dma_sync_single_for_device(&as->pdev->dev,
+                               as->buffer_dma, len, DMA_TO_DEVICE);
+       }
+
+       *plen = len;
+}
+
 /*
  * Submit next transfer for DMA.
  * lock is held, spi irq is blocked
 {
        struct atmel_spi        *as = spi_master_get_devdata(master);
        struct spi_transfer     *xfer;
-       u32                     len;
+       u32                     len, remaining, total;
        dma_addr_t              tx_dma, rx_dma;
 
-       xfer = as->current_transfer;
-       if (!xfer || as->remaining_bytes == 0) {
-               if (xfer)
-                       xfer = list_entry(xfer->transfer_list.next,
-                                       struct spi_transfer, transfer_list);
-               else
-                       xfer = list_entry(msg->transfers.next,
-                                       struct spi_transfer, transfer_list);
-               as->remaining_bytes = xfer->len;
-               as->current_transfer = xfer;
-       }
+       if (!as->current_transfer)
+               xfer = list_entry(msg->transfers.next,
+                               struct spi_transfer, transfer_list);
+       else if (!as->next_transfer)
+               xfer = list_entry(as->current_transfer->transfer_list.next,
+                               struct spi_transfer, transfer_list);
+       else
+               xfer = NULL;
 
-       len = as->remaining_bytes;
+       if (xfer) {
+               len = xfer->len;
+               atmel_spi_next_xfer_data(master, xfer, &tx_dma, &rx_dma, &len);
+               remaining = xfer->len - len;
 
-       tx_dma = xfer->tx_dma + xfer->len - len;
-       rx_dma = xfer->rx_dma + xfer->len - len;
+               spi_writel(as, RPR, rx_dma);
+               spi_writel(as, TPR, tx_dma);
 
-       /* use scratch buffer only when rx or tx data is unspecified */
-       if (!xfer->rx_buf) {
-               rx_dma = as->buffer_dma;
-               if (len > BUFFER_SIZE)
-                       len = BUFFER_SIZE;
-       }
-       if (!xfer->tx_buf) {
-               tx_dma = as->buffer_dma;
-               if (len > BUFFER_SIZE)
-                       len = BUFFER_SIZE;
-               memset(as->buffer, 0, len);
-               dma_sync_single_for_device(&as->pdev->dev,
-                               as->buffer_dma, len, DMA_TO_DEVICE);
+               if (msg->spi->bits_per_word > 8)
+                       len >>= 1;
+               spi_writel(as, RCR, len);
+               spi_writel(as, TCR, len);
+       } else {
+               xfer = as->next_transfer;
+               remaining = as->next_remaining_bytes;
        }
 
-       spi_writel(as, RPR, rx_dma);
-       spi_writel(as, TPR, tx_dma);
+       as->current_transfer = xfer;
+       as->current_remaining_bytes = remaining;
 
-       as->remaining_bytes -= len;
-       if (msg->spi->bits_per_word > 8)
-               len >>= 1;
+       if (remaining > 0)
+               len = remaining;
+       else if (!atmel_spi_xfer_is_last(msg, xfer) &&
+               atmel_spi_xfer_can_be_chained(xfer)) {
+               xfer = list_entry(xfer->transfer_list.next,
+                               struct spi_transfer, transfer_list);
+               len = xfer->len;
+       } else
+               xfer = NULL;
 
-       /* REVISIT: when xfer->delay_usecs == 0, the PDC "next transfer"
-        * mechanism might help avoid the IRQ latency between transfers
-        * (and improve the nCS0 errata handling on at91rm9200 chips)
-        *
-        * We're also waiting for ENDRX before we start the next
+       as->next_transfer = xfer;
+
+       if (xfer) {
+               total = len;
+               atmel_spi_next_xfer_data(master, xfer, &tx_dma, &rx_dma, &len);
+               as->next_remaining_bytes = total - len;
+
+               spi_writel(as, RNPR, rx_dma);
+               spi_writel(as, TNPR, tx_dma);
+
+               if (msg->spi->bits_per_word > 8)
+                       len >>= 1;
+               spi_writel(as, RNCR, len);
+               spi_writel(as, TNCR, len);
+       } else {
+               spi_writel(as, RNCR, 0);
+               spi_writel(as, TNCR, 0);
+       }
+
+       /* REVISIT: We're waiting for ENDRX before we start the next
         * transfer because we need to handle some difficult timing
         * issues otherwise. If we wait for ENDTX in one transfer and
         * then starts waiting for ENDRX in the next, it's difficult
         *
         * It should be doable, though. Just not now...
         */
-       spi_writel(as, TNCR, 0);
-       spi_writel(as, RNCR, 0);
        spi_writel(as, IER, SPI_BIT(ENDRX) | SPI_BIT(OVRES));
 
        dev_dbg(&msg->spi->dev,
                xfer, xfer->len, xfer->tx_buf, xfer->tx_dma,
                xfer->rx_buf, xfer->rx_dma, spi_readl(as, IMR));
 
-       spi_writel(as, RCR, len);
-       spi_writel(as, TCR, len);
        spi_writel(as, PTCR, SPI_BIT(TXTEN) | SPI_BIT(RXTEN));
 }
 
        spin_lock(&as->lock);
 
        as->current_transfer = NULL;
+       as->next_transfer = NULL;
 
        /* continue if needed */
        if (list_empty(&as->queue) || as->stopping)
 
                spi_writel(as, IDR, pending);
 
-               if (as->remaining_bytes == 0) {
+               if (as->current_remaining_bytes == 0) {
                        msg->actual_length += xfer->len;
 
                        if (!msg->is_dma_mapped)
                        if (xfer->delay_usecs)
                                udelay(xfer->delay_usecs);
 
-                       if (msg->transfers.prev == &xfer->transfer_list) {
+                       if (atmel_spi_xfer_is_last(msg, xfer)) {
                                /* report completed message */
                                atmel_spi_msg_done(master, as, msg, 0,
                                                xfer->cs_change);