queue->max_elements = QETH_MAX_BUFFER_ELEMENTS(card);
                queue->next_buf_to_fill = 0;
                queue->do_pack = 0;
+               queue->prev_hdr = NULL;
+               queue->bulk_start = 0;
                atomic_set(&queue->used_buffers, 0);
                atomic_set(&queue->set_pci_flags_count, 0);
                atomic_set(&queue->state, QETH_OUT_Q_UNLOCKED);
        }
 }
 
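+/* Hand the buffer at bulk_start over to HW, and advance to the next buffer: */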
+static void qeth_flush_queue(struct qeth_qdio_out_q *queue)
+{
+       qeth_flush_buffers(queue, queue->bulk_start, 1);
+
+       queue->bulk_start = QDIO_BUFNR(queue->bulk_start + 1);
+       queue->prev_hdr = NULL;
+}
+
 static void qeth_check_outbound_queue(struct qeth_qdio_out_q *queue)
 {
        int index;
        return 0;
 }
 
-static void __qeth_fill_buffer(struct sk_buff *skb,
-                              struct qeth_qdio_out_buffer *buf,
-                              bool is_first_elem, unsigned int offset)
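+/* Check whether @curr_skb may share an output buffer with the skb(s) that are
+ * already queued in @buffer:
+ */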
+static bool qeth_iqd_may_bulk(struct qeth_qdio_out_q *queue,
+                             struct qeth_qdio_out_buffer *buffer,
+                             struct sk_buff *curr_skb,
+                             struct qeth_hdr *curr_hdr)
+{
+       struct qeth_hdr *prev_hdr = queue->prev_hdr;
+
+       if (!prev_hdr)
+               return true;
+
+       /* All packets must have the same target: */
+       if (curr_hdr->hdr.l2.id == QETH_HEADER_TYPE_LAYER2) {
+               struct sk_buff *prev_skb = skb_peek(&buffer->skb_list);
+
+               return ether_addr_equal(eth_hdr(prev_skb)->h_dest,
+                                       eth_hdr(curr_skb)->h_dest) &&
+                      qeth_l2_same_vlan(&prev_hdr->hdr.l2, &curr_hdr->hdr.l2);
+       }
+
+       return qeth_l3_same_next_hop(&prev_hdr->hdr.l3, &curr_hdr->hdr.l3) &&
+              qeth_l3_iqd_same_vlan(&prev_hdr->hdr.l3, &curr_hdr->hdr.l3);
+}
+
+static unsigned int __qeth_fill_buffer(struct sk_buff *skb,
+                                      struct qeth_qdio_out_buffer *buf,
+                                      bool is_first_elem, unsigned int offset)
 {
        struct qdio_buffer *buffer = buf->buffer;
        int element = buf->next_element_to_fill;
        if (buffer->element[element - 1].eflags)
                buffer->element[element - 1].eflags = SBAL_EFLAGS_LAST_FRAG;
        buf->next_element_to_fill = element;
+       return element;
 }
 
 /**
  * qeth_fill_buffer() - map skb into an output buffer
- * @queue:     QDIO queue to submit the buffer on
  * @buf:       buffer to transport the skb
  * @skb:       skb to map into the buffer
  * @hdr:       qeth_hdr for this skb. Either at skb->data, or allocated
  *             from qeth_core_header_cache.
  * @offset:    when mapping the skb, start at skb->data + offset
  * @hd_len:    if > 0, build a dedicated header element of this size
- * flush:      Prepare the buffer to be flushed, regardless of its fill level.
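+ *
+ * Return: the buffer's next free element, ie. the number of elements that are
+ *         in use after adding this skb.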
  */
-static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
-                           struct qeth_qdio_out_buffer *buf,
-                           struct sk_buff *skb, struct qeth_hdr *hdr,
-                           unsigned int offset, unsigned int hd_len,
-                           bool flush)
+static unsigned int qeth_fill_buffer(struct qeth_qdio_out_buffer *buf,
+                                    struct sk_buff *skb, struct qeth_hdr *hdr,
+                                    unsigned int offset, unsigned int hd_len)
 {
        struct qdio_buffer *buffer = buf->buffer;
        bool is_first_elem = true;
                buf->next_element_to_fill++;
        }
 
-       __qeth_fill_buffer(skb, buf, is_first_elem, offset);
-
-       if (!queue->do_pack) {
-               QETH_CARD_TEXT(queue->card, 6, "fillbfnp");
-       } else {
-               QETH_CARD_TEXT(queue->card, 6, "fillbfpa");
-
-               QETH_TXQ_STAT_INC(queue, skbs_pack);
-               /* If the buffer still has free elements, keep using it. */
-               if (!flush &&
-                   buf->next_element_to_fill < queue->max_elements)
-                       return 0;
-       }
-
-       /* flush out the buffer */
-       atomic_set(&buf->state, QETH_QDIO_BUF_PRIMED);
-       queue->next_buf_to_fill = (queue->next_buf_to_fill + 1) %
-                                 QDIO_MAX_BUFFERS_PER_Q;
-       return 1;
+       return __qeth_fill_buffer(skb, buf, is_first_elem, offset);
 }
 
-static int qeth_do_send_packet_fast(struct qeth_qdio_out_q *queue,
-                                   struct sk_buff *skb, struct qeth_hdr *hdr,
-                                   unsigned int offset, unsigned int hd_len)
+static int __qeth_xmit(struct qeth_card *card, struct qeth_qdio_out_q *queue,
+                      struct sk_buff *skb, unsigned int elements,
+                      struct qeth_hdr *hdr, unsigned int offset,
+                      unsigned int hd_len)
 {
-       int index = queue->next_buf_to_fill;
-       struct qeth_qdio_out_buffer *buffer = queue->bufs[index];
+       struct qeth_qdio_out_buffer *buffer = queue->bufs[queue->bulk_start];
        unsigned int bytes = qdisc_pkt_len(skb);
+       unsigned int next_element;
        struct netdev_queue *txq;
        bool stopped = false;
+       bool flush;
+
+       txq = netdev_get_tx_queue(card->dev, skb_get_queue_mapping(skb));
 
        /* Just a sanity check, the wake/stop logic should ensure that we always
         * get a free buffer.
         */
        if (atomic_read(&buffer->state) != QETH_QDIO_BUF_EMPTY)
                return -EBUSY;
 
-       txq = netdev_get_tx_queue(queue->card->dev, skb_get_queue_mapping(skb));
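+       /* If the skb doesn't fit into the current buffer, or must not share it
+        * with the already-queued skbs, then hand that buffer over to HW and
+        * continue with the next one:
+        */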
+       if ((buffer->next_element_to_fill + elements > queue->max_elements) ||
+           !qeth_iqd_may_bulk(queue, buffer, skb, hdr)) {
+               atomic_set(&buffer->state, QETH_QDIO_BUF_PRIMED);
+               qeth_flush_queue(queue);
+               buffer = queue->bufs[queue->bulk_start];
 
-       if (atomic_inc_return(&queue->used_buffers) >= QDIO_MAX_BUFFERS_PER_Q) {
+               /* Sanity-check again: */
+               if (atomic_read(&buffer->state) != QETH_QDIO_BUF_EMPTY)
+                       return -EBUSY;
+       }
+
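+       /* A buffer only counts as used once the first skb goes into it: */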
+       if (buffer->next_element_to_fill == 0 &&
+           atomic_inc_return(&queue->used_buffers) >= QDIO_MAX_BUFFERS_PER_Q) {
                /* If a TX completion happens right _here_ and misses to wake
                 * the txq, then our re-check below will catch the race.
                 */
                stopped = true;
        }
 
-       qeth_fill_buffer(queue, buffer, skb, hdr, offset, hd_len, stopped);
-       netdev_tx_sent_queue(txq, bytes);
+       next_element = qeth_fill_buffer(buffer, skb, hdr, offset, hd_len);
        buffer->bytes += bytes;
+       queue->prev_hdr = hdr;
 
-       qeth_flush_buffers(queue, index, 1);
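+       /* Do the BQL accounting; this also tells us whether the queued frames
+        * should be flushed to HW right away (no further frame pending, or the
+        * txq got stopped):
+        */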
+       flush = __netdev_tx_sent_queue(txq, bytes,
+                                      !stopped && netdev_xmit_more());
+
+       if (flush || next_element >= queue->max_elements) {
+               atomic_set(&buffer->state, QETH_QDIO_BUF_PRIMED);
+               qeth_flush_queue(queue);
+       }
 
        if (stopped && !qeth_out_queue_is_full(queue))
                netif_tx_start_queue(txq);
                        int elements_needed)
 {
        struct qeth_qdio_out_buffer *buffer;
+       unsigned int next_element;
        struct netdev_queue *txq;
        bool stopped = false;
        int start_index;
                stopped = true;
        }
 
-       flush_count += qeth_fill_buffer(queue, buffer, skb, hdr, offset, hd_len,
-                                       stopped);
+       next_element = qeth_fill_buffer(buffer, skb, hdr, offset, hd_len);
+
+       if (queue->do_pack)
+               QETH_TXQ_STAT_INC(queue, skbs_pack);
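+       /* Keep a packing buffer open while it has free elements and the txq is
+        * running; otherwise close it out so it gets flushed below:
+        */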
+       if (!queue->do_pack || stopped || next_element >= queue->max_elements) {
+               flush_count++;
+               atomic_set(&buffer->state, QETH_QDIO_BUF_PRIMED);
+               queue->next_buf_to_fill = (queue->next_buf_to_fill + 1) %
+                                         QDIO_MAX_BUFFERS_PER_Q;
+       }
+
        if (flush_count)
                qeth_flush_buffers(queue, start_index, flush_count);
        else if (!atomic_read(&queue->set_pci_flags_count))
                                  frame_len - proto_len, skb, proto_len);
 
        if (IS_IQD(card)) {
-               rc = qeth_do_send_packet_fast(queue, skb, hdr, data_offset,
-                                             hd_len);
+               rc = __qeth_xmit(card, queue, skb, elements, hdr, data_offset,
+                                hd_len);
        } else {
                /* TODO: drop skb_orphan() once TX completion is fast enough */
                skb_orphan(skb);