 #include <linux/slab.h>
 #include <linux/skbuff.h>
 
+/* Returns true if a gve_tx_pending_packet_dqo object is available. */
+static bool gve_has_pending_packet(struct gve_tx_ring *tx)
+{
+       /* Check TX path's list. */
+       if (tx->dqo_tx.free_pending_packets != -1)
+               return true;
+
+       /* Check completion handler's list. */
+       if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
+               return true;
+
+       return false;
+}
+
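+/* Allocates a pending_packet from the TX path's free list. If that list is
+ * empty, the TX path atomically steals the entire list built up by the
+ * completion path, so the two paths do not contend on a shared list head in
+ * the common case.
+ */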
+static struct gve_tx_pending_packet_dqo *
+gve_alloc_pending_packet(struct gve_tx_ring *tx)
+{
+       struct gve_tx_pending_packet_dqo *pending_packet;
+       s16 index;
+
+       index = tx->dqo_tx.free_pending_packets;
+
+       /* No pending_packets available, try to steal the list from the
+        * completion handler.
+        */
+       if (unlikely(index == -1)) {
+               tx->dqo_tx.free_pending_packets =
+                       atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
+               index = tx->dqo_tx.free_pending_packets;
+
+               if (unlikely(index == -1))
+                       return NULL;
+       }
+
+       pending_packet = &tx->dqo.pending_packets[index];
+
+       /* Remove pending_packet from free list */
+       tx->dqo_tx.free_pending_packets = pending_packet->next;
+       pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
+
+       return pending_packet;
+}
+
+static void
+gve_free_pending_packet(struct gve_tx_ring *tx,
+                       struct gve_tx_pending_packet_dqo *pending_packet)
+{
+       s16 index = pending_packet - tx->dqo.pending_packets;
+
+       pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
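+       /* Lock-free push onto the completion handler's free list; retry the
+        * cmpxchg if another CPU moved the list head in the meantime.
+        */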
+       while (true) {
+               s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);
+
+               pending_packet->next = old_head;
+               if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
+                                  old_head, index) == old_head) {
+                       break;
+               }
+       }
+}
+
 /* gve_tx_free_desc - Cleans up all pending tx requests and buffers.
  */
 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
        }
 }
 
+/* Returns the number of slots available in the ring */
+static inline u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
+{
+       u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
+
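+       /* tx->mask is ring_size - 1, so one slot is always held in reserve;
+        * a completely full ring is never confused with an empty one.
+        */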
+       return tx->mask - num_used;
+}
+
+/* Stops the queue if the number of available descriptors is less than
+ * 'count'. Return: 0 if stop is not required.
+ */
+static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
+{
+       if (likely(gve_has_pending_packet(tx) &&
+                  num_avail_tx_slots(tx) >= count))
+               return 0;
+
+       /* Update cached TX head pointer */
+       tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
+
+       if (likely(gve_has_pending_packet(tx) &&
+                  num_avail_tx_slots(tx) >= count))
+               return 0;
+
+       /* No space, so stop the queue */
+       tx->stop_queue++;
+       netif_tx_stop_queue(tx->netdev_txq);
+
+       /* Sync with restarting queue in `gve_tx_poll_dqo()` */
+       mb();
+
+       /* After stopping the queue, check once more whether we can transmit,
+        * to avoid a TOCTOU race with the completion path restarting it.
+        */
+       tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
+
+       if (likely(!gve_has_pending_packet(tx) ||
+                  num_avail_tx_slots(tx) < count))
+               return -EBUSY;
+
+       netif_tx_start_queue(tx->netdev_txq);
+       tx->wake_queue++;
+       return 0;
+}
+
+static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
+                                       struct gve_tx_metadata_dqo *metadata)
+{
+       memset(metadata, 0, sizeof(*metadata));
+       metadata->version = GVE_TX_METADATA_VERSION_DQO;
+
+       if (skb->l4_hash) {
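+               /* Fold the 32-bit flow hash into 15 bits; zero is avoided,
+                * presumably because the device treats a zero path_hash as
+                * "no hash supplied".
+                */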
+               u16 path_hash = skb->hash ^ (skb->hash >> 16);
+
+               path_hash &= (1 << 15) - 1;
+               if (unlikely(path_hash == 0))
+                       path_hash = ~path_hash;
+
+               metadata->path_hash = path_hash;
+       }
+}
+
+static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
+                                    struct sk_buff *skb, u32 len, u64 addr,
+                                    s16 compl_tag, bool eop, bool is_gso)
+{
+       const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;
+
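+       /* A mapped buffer may exceed the HW's per-descriptor size limit, so
+        * emit one descriptor per GVE_TX_MAX_BUF_SIZE_DQO chunk. Only the last
+        * chunk of the last buffer carries end_of_packet.
+        */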
+       while (len > 0) {
+               struct gve_tx_pkt_desc_dqo *desc =
+                       &tx->dqo.tx_ring[*desc_idx].pkt;
+               u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
+               bool cur_eop = eop && cur_len == len;
+
+               *desc = (struct gve_tx_pkt_desc_dqo){
+                       .buf_addr = cpu_to_le64(addr),
+                       .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
+                       .end_of_packet = cur_eop,
+                       .checksum_offload_enable = checksum_offload_en,
+                       .compl_tag = cpu_to_le16(compl_tag),
+                       .buf_size = cur_len,
+               };
+
+               addr += cur_len;
+               len -= cur_len;
+               *desc_idx = (*desc_idx + 1) & tx->mask;
+       }
+}
+
+/* Validates and prepares `skb` for TSO.
+ *
+ * Returns header length, or < 0 if invalid.
+ */
+static int gve_prep_tso(struct sk_buff *skb)
+{
+       struct tcphdr *tcp;
+       int header_len;
+       u32 paylen;
+       int err;
+
+       /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
+        * of the TSO to be <= 262143.
+        *
+        * However, we don't validate these because:
+        * - Hypervisor enforces a limit of 9K MTU
+        * - Kernel will not produce a TSO larger than 64k
+        */
+
+       if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
+               return -1;
+
+       /* Needed because we will modify header. */
+       err = skb_cow_head(skb, 0);
+       if (err < 0)
+               return err;
+
+       tcp = tcp_hdr(skb);
+
+       /* Remove payload length from checksum. */
+       paylen = skb->len - skb_transport_offset(skb);
+
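+       /* tcp->check holds the pseudo-header checksum, which covers the full
+        * TCP length; backing the payload length out leaves a header-only
+        * seed for the per-segment checksums the HW computes.
+        */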
+       switch (skb_shinfo(skb)->gso_type) {
+       case SKB_GSO_TCPV4:
+       case SKB_GSO_TCPV6:
+               csum_replace_by_diff(&tcp->check,
+                                    (__force __wsum)htonl(paylen));
+
+               /* Compute length of segmentation header. */
+               header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
+               return -EINVAL;
+
+       return header_len;
+}
+
+static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
+                                    const struct sk_buff *skb,
+                                    const struct gve_tx_metadata_dqo *metadata,
+                                    int header_len)
+{
+       *desc = (struct gve_tx_tso_context_desc_dqo){
+               .header_len = header_len,
+               .cmd_dtype = {
+                       .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
+                       .tso = 1,
+               },
+               .flex0 = metadata->bytes[0],
+               .flex5 = metadata->bytes[5],
+               .flex6 = metadata->bytes[6],
+               .flex7 = metadata->bytes[7],
+               .flex8 = metadata->bytes[8],
+               .flex9 = metadata->bytes[9],
+               .flex10 = metadata->bytes[10],
+               .flex11 = metadata->bytes[11],
+       };
+       desc->tso_total_len = skb->len - header_len;
+       desc->mss = skb_shinfo(skb)->gso_size;
+}
+
+static void
+gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
+                            const struct gve_tx_metadata_dqo *metadata)
+{
+       *desc = (struct gve_tx_general_context_desc_dqo){
+               .flex0 = metadata->bytes[0],
+               .flex1 = metadata->bytes[1],
+               .flex2 = metadata->bytes[2],
+               .flex3 = metadata->bytes[3],
+               .flex4 = metadata->bytes[4],
+               .flex5 = metadata->bytes[5],
+               .flex6 = metadata->bytes[6],
+               .flex7 = metadata->bytes[7],
+               .flex8 = metadata->bytes[8],
+               .flex9 = metadata->bytes[9],
+               .flex10 = metadata->bytes[10],
+               .flex11 = metadata->bytes[11],
+               .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
+       };
+}
+
+/* Returns 0 on success, or < 0 on error.
+ *
+ * Before this function is called, the caller must ensure
+ * gve_has_pending_packet(tx) returns true.
+ */
+static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
+                                     struct sk_buff *skb)
+{
+       const struct skb_shared_info *shinfo = skb_shinfo(skb);
+       const bool is_gso = skb_is_gso(skb);
+       u32 desc_idx = tx->dqo_tx.tail;
+
+       struct gve_tx_pending_packet_dqo *pending_packet;
+       struct gve_tx_metadata_dqo metadata;
+       s16 completion_tag;
+       int i;
+
+       pending_packet = gve_alloc_pending_packet(tx);
+       pending_packet->skb = skb;
+       pending_packet->num_bufs = 0;
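+       /* The completion tag is the pending_packet's array index; the HW
+        * echoes it back in completion descriptors so the skb and DMA
+        * mappings can be located and released.
+        */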
+       completion_tag = pending_packet - tx->dqo.pending_packets;
+
+       gve_extract_tx_metadata_dqo(skb, &metadata);
+       if (is_gso) {
+               int header_len = gve_prep_tso(skb);
+
+               if (unlikely(header_len < 0))
+                       goto err;
+
+               gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
+                                        skb, &metadata, header_len);
+               desc_idx = (desc_idx + 1) & tx->mask;
+       }
+
+       gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
+                                    &metadata);
+       desc_idx = (desc_idx + 1) & tx->mask;
+
+       /* Note: HW requires that the size of a non-TSO packet be within the
+        * range of [17, 9728].
+        *
+        * We don't double-check this because:
+        * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
+        * - Hypervisor won't allow MTU larger than 9216.
+        */
+
+       /* Map the linear portion of skb */
+       {
+               struct gve_tx_dma_buf *buf =
+                       &pending_packet->bufs[pending_packet->num_bufs];
+               u32 len = skb_headlen(skb);
+               dma_addr_t addr;
+
+               addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(tx->dev, addr)))
+                       goto err;
+
+               dma_unmap_len_set(buf, len, len);
+               dma_unmap_addr_set(buf, dma, addr);
+               ++pending_packet->num_bufs;
+
+               gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
+                                        completion_tag,
+                                        /*eop=*/shinfo->nr_frags == 0, is_gso);
+       }
+
+       for (i = 0; i < shinfo->nr_frags; i++) {
+               struct gve_tx_dma_buf *buf =
+                       &pending_packet->bufs[pending_packet->num_bufs];
+               const skb_frag_t *frag = &shinfo->frags[i];
+               bool is_eop = i == (shinfo->nr_frags - 1);
+               u32 len = skb_frag_size(frag);
+               dma_addr_t addr;
+
+               addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(tx->dev, addr)))
+                       goto err;
+
+               dma_unmap_len_set(buf, len, len);
+               dma_unmap_addr_set(buf, dma, addr);
+               ++pending_packet->num_bufs;
+
+               gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
+                                        completion_tag, is_eop, is_gso);
+       }
+
+       /* Commit the changes to our state */
+       tx->dqo_tx.tail = desc_idx;
+
+       /* Request a descriptor completion on the last descriptor of the
+        * packet if we are allowed to by the HW enforced interval.
+        */
+       {
+               u32 last_desc_idx = (desc_idx - 1) & tx->mask;
+               u32 last_report_event_interval =
+                       (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
+
+               if (unlikely(last_report_event_interval >=
+                            GVE_TX_MIN_RE_INTERVAL)) {
+                       tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
+                       tx->dqo_tx.last_re_idx = last_desc_idx;
+               }
+       }
+
+       return 0;
+
+err:
+       for (i = 0; i < pending_packet->num_bufs; i++) {
+               struct gve_tx_dma_buf *buf = &pending_packet->bufs[i];
+
+               if (i == 0) {
+                       dma_unmap_single(tx->dev, dma_unmap_addr(buf, dma),
+                                        dma_unmap_len(buf, len),
+                                        DMA_TO_DEVICE);
+               } else {
+                       dma_unmap_page(tx->dev, dma_unmap_addr(buf, dma),
+                                      dma_unmap_len(buf, len), DMA_TO_DEVICE);
+               }
+       }
+
+       pending_packet->skb = NULL;
+       pending_packet->num_bufs = 0;
+       gve_free_pending_packet(tx, pending_packet);
+
+       return -1;
+}
+
+static int gve_num_descs_per_buf(size_t size)
+{
+       return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
+}
+
+static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
+{
+       const struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int num_descs;
+       int i;
+
+       num_descs = gve_num_descs_per_buf(skb_headlen(skb));
+
+       for (i = 0; i < shinfo->nr_frags; i++) {
+               unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);
+
+               num_descs += gve_num_descs_per_buf(frag_size);
+       }
+
+       return num_descs;
+}
+
+/* Returns true if HW is capable of sending TSO represented by `skb`.
+ *
+ * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
+ * - The header is counted as one buffer for every single segment.
+ * - A buffer which is split between two segments is counted for both.
+ * - If a buffer contains both header and payload, it is counted as two buffers.
+ */
+static bool gve_can_send_tso(const struct sk_buff *skb)
+{
+       const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
+       const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
+       const struct skb_shared_info *shinfo = skb_shinfo(skb);
+       const int gso_size = shinfo->gso_size;
+       int cur_seg_num_bufs;
+       int cur_seg_size;
+       int i;
+
+       cur_seg_size = skb_headlen(skb) - header_len;
+       cur_seg_num_bufs = cur_seg_size > 0;
+
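+       /* Walk the frags, tracking how many buffers each gso_size-sized
+        * segment touches; per the rules above, a buffer straddling a segment
+        * boundary is charged to both segments.
+        */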
+       for (i = 0; i < shinfo->nr_frags; i++) {
+               if (cur_seg_size >= gso_size) {
+                       cur_seg_size %= gso_size;
+                       cur_seg_num_bufs = cur_seg_size > 0;
+               }
+
+               if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
+                       return false;
+
+               cur_seg_size += skb_frag_size(&shinfo->frags[i]);
+       }
+
+       return true;
+}
+
+/* Attempt to transmit specified SKB.
+ *
+ * Returns 0 if the SKB was transmitted or dropped.
+ * Returns -1 if there is not currently enough space to transmit the SKB.
+ */
+static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
+                         struct sk_buff *skb)
+{
+       int num_buffer_descs;
+       int total_num_descs;
+
+       if (skb_is_gso(skb)) {
+               /* If TSO doesn't meet HW requirements, attempt to linearize the
+                * packet.
+                */
+               if (unlikely(!gve_can_send_tso(skb) &&
+                            skb_linearize(skb) < 0)) {
+                       net_err_ratelimited("%s: Failed to transmit TSO packet\n",
+                                           priv->dev->name);
+                       goto drop;
+               }
+
+               num_buffer_descs = gve_num_buffer_descs_needed(skb);
+       } else {
+               num_buffer_descs = gve_num_buffer_descs_needed(skb);
+
+               if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
+                       if (unlikely(skb_linearize(skb) < 0))
+                               goto drop;
+
+                       num_buffer_descs = 1;
+               }
+       }
+
+       /* Metadata + (optional TSO) + data descriptors. */
+       total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
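+       /* The extra GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP slots presumably
+        * keep freshly written descriptors out of a cache line the HW may
+        * still be reading.
+        */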
+       if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
+                       GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
+               return -1;
+       }
+
+       if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
+               goto drop;
+
+       netdev_tx_sent_queue(tx->netdev_txq, skb->len);
+       skb_tx_timestamp(skb);
+       return 0;
+
+drop:
+       tx->dropped_pkt++;
+       dev_kfree_skb_any(skb);
+       return 0;
+}
+
+/* Transmit a given skb and ring the doorbell. */
 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
 {
+       struct gve_priv *priv = netdev_priv(dev);
+       struct gve_tx_ring *tx;
+
+       tx = &priv->tx[skb_get_queue_mapping(skb)];
+       if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
+               /* We need to ring the txq doorbell -- we have stopped the Tx
+                * queue for want of resources, but prior calls to gve_tx_dqo()
+                * may have added descriptors without ringing the doorbell.
+                */
+               gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
+               return NETDEV_TX_BUSY;
+       }
+
+       if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
+               return NETDEV_TX_OK;
+
+       gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
        return NETDEV_TX_OK;
 }
 
+static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
+                       struct gve_tx_pending_packet_dqo *pending_packet)
+{
+       s16 old_tail, index;
+
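+       /* The miss/timed-out completion lists are intrusive doubly-linked
+        * lists threaded through the pending_packets array by index, with -1
+        * as the NULL sentinel.
+        */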
+       index = pending_packet - tx->dqo.pending_packets;
+       old_tail = list->tail;
+       list->tail = index;
+       if (old_tail == -1)
+               list->head = index;
+       else
+               tx->dqo.pending_packets[old_tail].next = index;
+
+       pending_packet->next = -1;
+       pending_packet->prev = old_tail;
+}
+
+static void remove_from_list(struct gve_tx_ring *tx,
+                            struct gve_index_list *list,
+                            struct gve_tx_pending_packet_dqo *pending_packet)
+{
+       s16 index, prev_index, next_index;
+
+       index = pending_packet - tx->dqo.pending_packets;
+       prev_index = pending_packet->prev;
+       next_index = pending_packet->next;
+
+       if (prev_index == -1) {
+               /* Node is head */
+               list->head = next_index;
+       } else {
+               tx->dqo.pending_packets[prev_index].next = next_index;
+       }
+       if (next_index == -1) {
+               /* Node is tail */
+               list->tail = prev_index;
+       } else {
+               tx->dqo.pending_packets[next_index].prev = prev_index;
+       }
+}
+
+static void gve_unmap_packet(struct device *dev,
+                            struct gve_tx_pending_packet_dqo *pending_packet)
+{
+       struct gve_tx_dma_buf *buf;
+       int i;
+
+       /* SKB linear portion is guaranteed to be mapped */
+       buf = &pending_packet->bufs[0];
+       dma_unmap_single(dev, dma_unmap_addr(buf, dma),
+                        dma_unmap_len(buf, len), DMA_TO_DEVICE);
+       for (i = 1; i < pending_packet->num_bufs; i++) {
+               buf = &pending_packet->bufs[i];
+               dma_unmap_page(dev, dma_unmap_addr(buf, dma),
+                              dma_unmap_len(buf, len), DMA_TO_DEVICE);
+       }
+       pending_packet->num_bufs = 0;
+}
+
+/* Completion types and expected behavior:
+ * No Miss compl + Packet compl = Packet completed normally.
+ * Miss compl + Re-inject compl = Packet completed normally.
+ * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
+ * Miss compl + Packet compl = Skipped i.e. packet not completed.
+ */
+static void gve_handle_packet_completion(struct gve_priv *priv,
+                                        struct gve_tx_ring *tx, bool is_napi,
+                                        u16 compl_tag, u64 *bytes, u64 *pkts,
+                                        bool is_reinjection)
+{
+       struct gve_tx_pending_packet_dqo *pending_packet;
+
+       if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
+               net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
+                                   priv->dev->name, (int)compl_tag);
+               return;
+       }
+
+       pending_packet = &tx->dqo.pending_packets[compl_tag];
+
+       if (unlikely(is_reinjection)) {
+               if (unlikely(pending_packet->state ==
+                            GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
+                       net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
+                                           priv->dev->name, (int)compl_tag);
+                       /* Packet was already completed as a result of timeout,
+                        * so just remove from list and free pending packet.
+                        */
+                       remove_from_list(tx,
+                                        &tx->dqo_compl.timed_out_completions,
+                                        pending_packet);
+                       gve_free_pending_packet(tx, pending_packet);
+                       return;
+               }
+               if (unlikely(pending_packet->state !=
+                            GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
+                       /* The packet is allocated but there is no outstanding
+                        * miss completion, i.e. a re-injection completion was
+                        * received without a prior miss completion. Return
+                        * without completing the packet.
+                        */
+                       net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
+                                           priv->dev->name, (int)compl_tag);
+                       return;
+               }
+               remove_from_list(tx, &tx->dqo_compl.miss_completions,
+                                pending_packet);
+       } else {
+               /* Packet is allocated but not a pending data completion. */
+               if (unlikely(pending_packet->state !=
+                            GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
+                       net_err_ratelimited("%s: No pending data completion: %d\n",
+                                           priv->dev->name, (int)compl_tag);
+                       return;
+               }
+       }
+       gve_unmap_packet(tx->dev, pending_packet);
+
+       *bytes += pending_packet->skb->len;
+       (*pkts)++;
+       napi_consume_skb(pending_packet->skb, is_napi);
+       pending_packet->skb = NULL;
+       gve_free_pending_packet(tx, pending_packet);
+}
+
+static void gve_handle_miss_completion(struct gve_priv *priv,
+                                      struct gve_tx_ring *tx, u16 compl_tag,
+                                      u64 *bytes, u64 *pkts)
+{
+       struct gve_tx_pending_packet_dqo *pending_packet;
+
+       if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
+               net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
+                                   priv->dev->name, (int)compl_tag);
+               return;
+       }
+
+       pending_packet = &tx->dqo.pending_packets[compl_tag];
+       if (unlikely(pending_packet->state !=
+                               GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
+               net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
+                                   priv->dev->name, (int)pending_packet->state,
+                                   (int)compl_tag);
+               return;
+       }
+
+       pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
+       /* jiffies can wrap around, but the time comparison macros handle overflow. */
+       pending_packet->timeout_jiffies =
+                       jiffies +
+                       msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
+                                        MSEC_PER_SEC);
+       add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
+
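+       /* Report bytes/pkts now (they feed BQL in the caller) even though the
+        * DMA mappings are kept until a re-injection completion or a timeout
+        * resolves the packet.
+        */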
+       *bytes += pending_packet->skb->len;
+       (*pkts)++;
+}
+
+static void remove_miss_completions(struct gve_priv *priv,
+                                   struct gve_tx_ring *tx)
+{
+       struct gve_tx_pending_packet_dqo *pending_packet;
+       s16 next_index;
+
+       next_index = tx->dqo_compl.miss_completions.head;
+       while (next_index != -1) {
+               pending_packet = &tx->dqo.pending_packets[next_index];
+               next_index = pending_packet->next;
+               /* Break early because packets should time out in order. */
+               if (time_is_after_jiffies(pending_packet->timeout_jiffies))
+                       break;
+
+               remove_from_list(tx, &tx->dqo_compl.miss_completions,
+                                pending_packet);
+               /* Unmap the buffers and free the skb, but do not free the
+                * pending packet, i.e. its completion tag stays reserved so
+                * that the driver can take appropriate action if a
+                * corresponding valid completion is received later.
+                */
+               gve_unmap_packet(tx->dev, pending_packet);
+               /* This indicates the packet was dropped. */
+               dev_kfree_skb_any(pending_packet->skb);
+               pending_packet->skb = NULL;
+               tx->dropped_pkt++;
+               net_err_ratelimited("%s: No reinjection completion was received for: %ld.\n",
+                                   priv->dev->name,
+                                   (pending_packet - tx->dqo.pending_packets));
+
+               pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
+               pending_packet->timeout_jiffies =
+                               jiffies +
+                               msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
+                                                MSEC_PER_SEC);
+               /* Maintain pending packet in another list so the packet can be
+                * unallocated at a later time.
+                */
+               add_to_list(tx, &tx->dqo_compl.timed_out_completions,
+                           pending_packet);
+       }
+}
+
+static void remove_timed_out_completions(struct gve_priv *priv,
+                                        struct gve_tx_ring *tx)
+{
+       struct gve_tx_pending_packet_dqo *pending_packet;
+       s16 next_index;
+
+       next_index = tx->dqo_compl.timed_out_completions.head;
+       while (next_index != -1) {
+               pending_packet = &tx->dqo.pending_packets[next_index];
+               next_index = pending_packet->next;
+               /* Break early because packets should time out in order. */
+               if (time_is_after_jiffies(pending_packet->timeout_jiffies))
+                       break;
+
+               remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
+                                pending_packet);
+               gve_free_pending_packet(tx, pending_packet);
+       }
+}
+
 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
                          struct napi_struct *napi)
 {
-       return 0;
+       u64 reinject_compl_bytes = 0;
+       u64 reinject_compl_pkts = 0;
+       int num_descs_cleaned = 0;
+       u64 miss_compl_bytes = 0;
+       u64 miss_compl_pkts = 0;
+       u64 pkt_compl_bytes = 0;
+       u64 pkt_compl_pkts = 0;
+
+       /* Limit in order to avoid blocking for too long */
+       while (!napi || pkt_compl_pkts < napi->weight) {
+               struct gve_tx_compl_desc *compl_desc =
+                       &tx->dqo.compl_ring[tx->dqo_compl.head];
+               u16 type;
+
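+               /* A completion is new only if its generation bit differs from
+                * cur_gen_bit; an equal bit means the HW has not yet written
+                * this slot on the current pass of the ring.
+                */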
+               if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
+                       break;
+
+               /* Prefetch the next descriptor. */
+               prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
+                               tx->dqo.complq_mask]);
+
+               /* Do not read data until we own the descriptor */
+               dma_rmb();
+               type = compl_desc->type;
+
+               if (type == GVE_COMPL_TYPE_DQO_DESC) {
+                       /* This is the last descriptor fetched by HW plus one */
+                       u16 tx_head = le16_to_cpu(compl_desc->tx_head);
+
+                       atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
+               } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
+                       u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
+
+                       gve_handle_packet_completion(priv, tx, !!napi,
+                                                    compl_tag,
+                                                    &pkt_compl_bytes,
+                                                    &pkt_compl_pkts,
+                                                    /*is_reinjection=*/false);
+               } else if (type == GVE_COMPL_TYPE_DQO_MISS) {
+                       u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
+
+                       gve_handle_miss_completion(priv, tx, compl_tag,
+                                                  &miss_compl_bytes,
+                                                  &miss_compl_pkts);
+               } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
+                       u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
+
+                       gve_handle_packet_completion(priv, tx, !!napi,
+                                                    compl_tag,
+                                                    &reinject_compl_bytes,
+                                                    &reinject_compl_pkts,
+                                                    /*is_reinjection=*/true);
+               }
+
+               tx->dqo_compl.head =
+                       (tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
+               /* Flip the generation bit when we wrap around */
+               tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
+               num_descs_cleaned++;
+       }
+
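+       /* Re-injection completions are excluded from BQL accounting here
+        * because their bytes were already reported when the corresponding
+        * miss completion arrived.
+        */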
+       netdev_tx_completed_queue(tx->netdev_txq,
+                                 pkt_compl_pkts + miss_compl_pkts,
+                                 pkt_compl_bytes + miss_compl_bytes);
+
+       remove_miss_completions(priv, tx);
+       remove_timed_out_completions(priv, tx);
+
+       u64_stats_update_begin(&tx->statss);
+       tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
+       tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
+       u64_stats_update_end(&tx->statss);
+       return num_descs_cleaned;
 }
 
 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
 {
-       return false;
+       struct gve_tx_compl_desc *compl_desc;
+       struct gve_tx_ring *tx = block->tx;
+       struct gve_priv *priv = block->priv;
+
+       if (do_clean) {
+               int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
+                                                             &block->napi);
+
+               /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
+               mb();
+
+               if (netif_tx_queue_stopped(tx->netdev_txq) &&
+                   num_descs_cleaned > 0) {
+                       tx->wake_queue++;
+                       netif_tx_wake_queue(tx->netdev_txq);
+               }
+       }
+
+       /* Return true if we still have work. */
+       compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
+       return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
 }