 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
 
-static bool csum = true, gso = true;
+static bool csum = true, gso = true, napi_tx;
 module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
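+/* tx napi is off by default; napi_tx is read when device queues are allocated */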
+module_param(napi_tx, bool, 0644);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 
        /* Name of the send queue: output.$index */
        char name[40];
+
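+       /* napi context for transmit completion processing */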
+       struct napi_struct napi;
 };
 
 /* Internal representation of a receive virtqueue */
 static void skb_xmit_done(struct virtqueue *vq)
 {
        struct virtnet_info *vi = vq->vdev->priv;
+       struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
 
        /* Suppress further interrupts. */
        virtqueue_disable_cb(vq);
 
-       /* We were probably waiting for more output buffers. */
-       netif_wake_subqueue(vi->dev, vq2txq(vq));
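+       /* With tx napi, completed skbs are reclaimed in virtnet_poll_tx */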
+       if (napi->weight)
+               virtqueue_napi_schedule(napi, vq);
+       else
+               /* We were probably waiting for more output buffers. */
+               netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
 static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
        local_bh_enable();
 }
 
+static void virtnet_napi_tx_enable(struct virtnet_info *vi,
+                                  struct virtqueue *vq,
+                                  struct napi_struct *napi)
+{
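+       /* A weight of 0 means tx napi was left disabled via napi_tx */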
+       if (!napi->weight)
+               return;
+
+       /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
+        * enable the feature if this is likely affine with the transmit path.
+        */
+       if (!vi->affinity_hint_set) {
+               napi->weight = 0;
+               return;
+       }
+
+       virtnet_napi_enable(vq, napi);
+}
+
 static void refill_work(struct work_struct *work)
 {
        struct virtnet_info *vi =
                        if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
                                schedule_delayed_work(&vi->refill, 0);
                virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+               virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
        }
 
        return 0;
        u64_stats_update_end(&stats->tx_syncp);
 }
 
+static int virtnet_poll_tx(struct napi_struct *napi, int budget)
+{
+       struct send_queue *sq = container_of(napi, struct send_queue, napi);
+       struct virtnet_info *vi = sq->vq->vdev->priv;
+       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
+
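+       /* Reclaim completed tx skbs under the tx queue lock, then complete
+        * napi and re-enable virtqueue callbacks.
+        */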
+       __netif_tx_lock(txq, raw_smp_processor_id());
+       free_old_xmit_skbs(sq);
+       __netif_tx_unlock(txq);
+
+       virtqueue_napi_complete(napi, sq->vq, 0);
+
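+       /* Wake the queue if enough descriptors were freed for another packet */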
+       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+               netif_tx_wake_queue(txq);
+
+       return 0;
+}
+
 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
        struct virtio_net_hdr_mrg_rxbuf *hdr;
        int err;
        struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
        bool kick = !skb->xmit_more;
+       bool use_napi = sq->napi.weight;
 
        /* Free up any pending old buffers before queueing new ones. */
        free_old_xmit_skbs(sq);
        }
 
        /* Don't wait up for transmitted skbs to be freed. */
-       skb_orphan(skb);
-       nf_reset(skb);
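+       /* With tx napi, completed skbs are freed from the poll handler, so
+        * skip skb_orphan and keep the skb's socket accounting intact.
+        */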
+       if (!use_napi) {
+               skb_orphan(skb);
+               nf_reset(skb);
+       }
 
        /* If running out of space, stop queue to avoid getting packets that we
         * are then unable to transmit.
         */
        if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
                netif_stop_subqueue(dev, qnum);
-               if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
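+               /* With tx napi, the stopped queue is woken from virtnet_poll_tx
+                * once descriptors are reclaimed; skip the callback re-arm.
+                */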
+               if (!use_napi &&
+                   unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
                        /* More just got used, free them then recheck. */
                        free_old_xmit_skbs(sq);
                        if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
        /* Make sure refill_work doesn't re-enable napi! */
        cancel_delayed_work_sync(&vi->refill);
 
-       for (i = 0; i < vi->max_queue_pairs; i++)
+       for (i = 0; i < vi->max_queue_pairs; i++) {
                napi_disable(&vi->rq[i].napi);
+               napi_disable(&vi->sq[i].napi);
+       }
 
        return 0;
 }
        cancel_delayed_work_sync(&vi->refill);
 
        if (netif_running(vi->dev)) {
-               for (i = 0; i < vi->max_queue_pairs; i++)
+               for (i = 0; i < vi->max_queue_pairs; i++) {
                        napi_disable(&vi->rq[i].napi);
+                       napi_disable(&vi->sq[i].napi);
+               }
        }
 }
 
                        if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
                                schedule_delayed_work(&vi->refill, 0);
 
-               for (i = 0; i < vi->max_queue_pairs; i++)
+               for (i = 0; i < vi->max_queue_pairs; i++) {
                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
+                                              &vi->sq[i].napi);
+               }
        }
 
        netif_device_attach(vi->dev);
        for (i = 0; i < vi->max_queue_pairs; i++) {
                napi_hash_del(&vi->rq[i].napi);
                netif_napi_del(&vi->rq[i].napi);
+               netif_napi_del(&vi->sq[i].napi);
        }
 
        /* We called napi_hash_del() before netif_napi_del(),
                vi->rq[i].pages = NULL;
                netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
                               napi_weight);
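+               /* A weight of 0 leaves tx napi disabled for this queue */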
+               netif_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
+                              napi_tx ? napi_weight : 0);
 
                sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
                ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);