Improve the following in rtl8169_start_xmit:
- tp->cur_tx can be accessed in parallel by rtl_tx(), therefore
  annotate the race by using WRITE_ONCE
- avoid checking stop_queue a second time by moving the doorbell check
  after the stop-queue handling
- netif_stop_queue() uses the atomic operation set_bit(), which already
  implies a full memory barrier on some platforms; therefore use
  smp_mb__after_atomic() instead of smp_mb() to avoid the overhead of a
  redundant barrier on those platforms
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/80085451-3eaf-507a-c7c0-08d607c46fbc@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
 
        /* rtl_tx needs to see descriptor changes before updated tp->cur_tx */
        smp_wmb();
 
-       tp->cur_tx += frags + 1;
+       WRITE_ONCE(tp->cur_tx, tp->cur_tx + frags + 1);
 
        stop_queue = !rtl_tx_slots_avail(tp, MAX_SKB_FRAGS);
        if (unlikely(stop_queue)) {
                 */
                smp_wmb();
                netif_stop_queue(dev);
-               door_bell = true;
-       }
-
-       if (door_bell)
-               rtl8169_doorbell(tp);
-
-       if (unlikely(stop_queue)) {
                /* Sync with rtl_tx:
                 * - publish queue status and cur_tx ring index (write barrier)
                 * - refresh dirty_tx ring index (read barrier).
                 * status and forget to wake up queue, a racing rtl_tx thread
                 * can't.
                 */
-               smp_mb();
+               smp_mb__after_atomic();
                if (rtl_tx_slots_avail(tp, MAX_SKB_FRAGS))
                        netif_start_queue(dev);
+               door_bell = true;
        }
 
+       if (door_bell)
+               rtl8169_doorbell(tp);
+
        return NETDEV_TX_OK;
 
 err_dma_1: