if (skb_cloned(skb) &&
            (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) ||
             !skb_clone_writable(skb, ETH_HLEN) ||
-            sdata->crypto_tx_tailroom_needed_cnt))
+            (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt)))
                I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
        else if (head_need || tail_need)
                I802_DEBUG_INC(local->tx_expand_skb_head);
        return ERR_PTR(ret);
 }
 
+/*
+ * fast-xmit overview
+ *
+ * The core idea of fast-xmit is to remove per-packet checks by performing
+ * them out of band. ieee80211_check_fast_xmit() implements the out-of-band
+ * checks that are needed to get the sta->fast_tx pointer assigned, after which
+ * much less work can be done per packet. For example, fragmentation must be
+ * disabled or the fast_tx pointer will not be set. All the conditions are seen
+ * in the code here.
+ *
+ * Once assigned, the fast_tx data structure also caches the per-packet 802.11
+ * header and other data to aid packet processing in ieee80211_xmit_fast().
+ *
+ * The most difficult part of this is that when any of these assumptions
+ * change, an external trigger (e.g. a call to ieee80211_clear_fast_xmit(),
+ * ieee80211_check_fast_xmit() or friends) is required to reset the data,
+ * since the per-packet code no longer checks the conditions. This is reflected
+ * by the calls to these functions throughout the rest of the code, and must be
+ * maintained if any of the TX path checks change.
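+ *
+ * As an illustrative sketch only (the call sites here are hypothetical,
+ * not part of this patch), a path that installs a new station key would
+ * be expected to do something like
+ *
+ *      rcu_assign_pointer(sta->ptk[idx], new_key);
+ *      ieee80211_check_fast_xmit(sta);
+ *
+ * so that the cached header and key data are rebuilt under sta->lock.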
+ */
+
+void ieee80211_check_fast_xmit(struct sta_info *sta)
+{
+       struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old;
+       struct ieee80211_local *local = sta->local;
+       struct ieee80211_sub_if_data *sdata = sta->sdata;
+       struct ieee80211_hdr *hdr = (void *)build.hdr;
+       struct ieee80211_chanctx_conf *chanctx_conf;
+       __le16 fc;
+
+       if (!(local->hw.flags & IEEE80211_HW_SUPPORT_FAST_XMIT))
+               return;
+
+       /* Locking here protects the fast_tx pointer itself, and also guards
+        * against concurrent invocations of this function racing for access
+        * to, e.g., the key pointer that is used.
+        * Without it, the invocation of this function right after the key
+        * pointer changes wouldn't be sufficient, as another CPU could access
+        * the pointer, then stall, and then do the cache update after the CPU
+        * that invalidated the key.
+        * With the locking, such scenarios cannot happen as the check for the
+        * key and the fast-tx assignment are done atomically, so the CPU that
+        * modifies the key will either wait, or the other one will see the key
+        * cleared/changed already.
+        */
+       spin_lock_bh(&sta->lock);
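+       /* in this configuration mac80211 implements dynamic PS for the
+        * station itself, which needs per-packet processing that fast-xmit
+        * skips
+        */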
+       if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS &&
+           !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) &&
+           sdata->vif.type == NL80211_IFTYPE_STATION)
+               goto out;
+
+       if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+               goto out;
+
+       if (test_sta_flag(sta, WLAN_STA_PS_STA) ||
+           test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
+           test_sta_flag(sta, WLAN_STA_PS_DELIVER))
+               goto out;
+
+       if (sdata->noack_map)
+               goto out;
+
+       /* fast-xmit doesn't handle fragmentation at all */
+       if (local->hw.wiphy->frag_threshold != (u32)-1)
+               goto out;
+
+       rcu_read_lock();
+       chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+       if (!chanctx_conf) {
+               rcu_read_unlock();
+               goto out;
+       }
+       build.band = chanctx_conf->def.chan->band;
+       rcu_read_unlock();
+
+       fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+       switch (sdata->vif.type) {
+       case NL80211_IFTYPE_STATION:
+               if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+                       /* DA SA BSSID */
+                       build.da_offs = offsetof(struct ieee80211_hdr, addr1);
+                       build.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+                       memcpy(hdr->addr3, sdata->u.mgd.bssid, ETH_ALEN);
+                       build.hdr_len = 24;
+                       break;
+               }
+
+               if (sdata->u.mgd.use_4addr) {
+                       /* non-regular ethertype cannot use the fastpath */
+                       fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS |
+                                         IEEE80211_FCTL_TODS);
+                       /* RA TA DA SA */
+                       memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+                       memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+                       build.da_offs = offsetof(struct ieee80211_hdr, addr3);
+                       build.sa_offs = offsetof(struct ieee80211_hdr, addr4);
+                       build.hdr_len = 30;
+                       break;
+               }
+               fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+               /* BSSID SA DA */
+               memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+               build.da_offs = offsetof(struct ieee80211_hdr, addr3);
+               build.sa_offs = offsetof(struct ieee80211_hdr, addr2);
+               build.hdr_len = 24;
+               break;
+       case NL80211_IFTYPE_AP_VLAN:
+               if (sdata->wdev.use_4addr) {
+                       fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS |
+                                         IEEE80211_FCTL_TODS);
+                       /* RA TA DA SA */
+                       memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN);
+                       memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+                       build.da_offs = offsetof(struct ieee80211_hdr, addr3);
+                       build.sa_offs = offsetof(struct ieee80211_hdr, addr4);
+                       build.hdr_len = 30;
+                       break;
+               }
+               /* fall through */
+       case NL80211_IFTYPE_AP:
+               fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+               /* DA BSSID SA */
+               build.da_offs = offsetof(struct ieee80211_hdr, addr1);
+               memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+               build.sa_offs = offsetof(struct ieee80211_hdr, addr3);
+               build.hdr_len = 24;
+               break;
+       default:
+               /* not handled on fast-xmit */
+               goto out;
+       }
+
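+       /* a QoS data frame carries a 2-byte QoS control field at the end of
+        * the 802.11 header
+        */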
+       if (sta->sta.wme) {
+               build.hdr_len += 2;
+               fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+       }
+
+       /* We only store the key pointer here, so there's no point in using
+        * rcu_dereference(); that's fine because the code that changes the
+        * pointers will call this function after doing so. For a single CPU
+        * that would be enough, for multiple see the comment above.
+        */
+       build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]);
+       if (!build.key)
+               build.key = rcu_access_pointer(sdata->default_unicast_key);
+       if (build.key) {
+               bool gen_iv, iv_spc;
+
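+               /* GENERATE_IV means mac80211 builds the IV/PN itself, while
+                * PUT_IV_SPACE only reserves headroom for the driver; either
+                * way the cached header must account for the space
+                */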
+               gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV;
+               iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE;
+
+               /* don't handle software crypto */
+               if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+                       goto out;
+
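+               /* in the key ID octet written below, 0x20 is the Ext IV bit
+                * and the key index occupies bits 6-7
+                */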
+               switch (build.key->conf.cipher) {
+               case WLAN_CIPHER_SUITE_CCMP:
+               case WLAN_CIPHER_SUITE_CCMP_256:
+                       /* add fixed key ID */
+                       if (gen_iv) {
+                               (build.hdr + build.hdr_len)[3] =
+                                       0x20 | (build.key->conf.keyidx << 6);
+                               build.pn_offs = build.hdr_len;
+                       }
+                       if (gen_iv || iv_spc)
+                               build.hdr_len += IEEE80211_CCMP_HDR_LEN;
+                       break;
+               case WLAN_CIPHER_SUITE_GCMP:
+               case WLAN_CIPHER_SUITE_GCMP_256:
+                       /* add fixed key ID */
+                       if (gen_iv) {
+                               (build.hdr + build.hdr_len)[3] =
+                                       0x20 | (build.key->conf.keyidx << 6);
+                               build.pn_offs = build.hdr_len;
+                       }
+                       if (gen_iv || iv_spc)
+                               build.hdr_len += IEEE80211_GCMP_HDR_LEN;
+                       break;
+               default:
+                       /* don't do fast-xmit for these ciphers (yet) */
+                       goto out;
+               }
+
+               fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+       }
+
+       hdr->frame_control = fc;
+
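+       /* the cached data now consists of the 802.11 header, IV space (if
+        * any) and the 6-byte RFC 1042 SNAP prefix appended here; the 2-byte
+        * ethertype kept in the frame completes the SNAP header at xmit time
+        */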
+       memcpy(build.hdr + build.hdr_len,
+              rfc1042_header, sizeof(rfc1042_header));
+       build.hdr_len += sizeof(rfc1042_header);
+
+       fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC);
+       /* if the kmemdup fails, continue w/o fast_tx */
+       if (!fast_tx)
+               goto out;
+
+ out:
+       /* we might have raced against another call to this function */
+       old = rcu_dereference_protected(sta->fast_tx,
+                                       lockdep_is_held(&sta->lock));
+       rcu_assign_pointer(sta->fast_tx, fast_tx);
+       if (old)
+               kfree_rcu(old, rcu_head);
+       spin_unlock_bh(&sta->lock);
+}
+
+void ieee80211_check_fast_xmit_all(struct ieee80211_local *local)
+{
+       struct sta_info *sta;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(sta, &local->sta_list, list)
+               ieee80211_check_fast_xmit(sta);
+       rcu_read_unlock();
+}
+
+void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata)
+{
+       struct ieee80211_local *local = sdata->local;
+       struct sta_info *sta;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(sta, &local->sta_list, list) {
+               if (sdata != sta->sdata &&
+                   (!sta->sdata->bss || sta->sdata->bss != sdata->bss))
+                       continue;
+               ieee80211_check_fast_xmit(sta);
+       }
+
+       rcu_read_unlock();
+}
+
+void ieee80211_clear_fast_xmit(struct sta_info *sta)
+{
+       struct ieee80211_fast_tx *fast_tx;
+
+       spin_lock_bh(&sta->lock);
+       fast_tx = rcu_dereference_protected(sta->fast_tx,
+                                           lockdep_is_held(&sta->lock));
+       RCU_INIT_POINTER(sta->fast_tx, NULL);
+       spin_unlock_bh(&sta->lock);
+
+       if (fast_tx)
+               kfree_rcu(fast_tx, rcu_head);
+}
+
+static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
+                               struct net_device *dev, struct sta_info *sta,
+                               struct ieee80211_fast_tx *fast_tx,
+                               struct sk_buff *skb)
+{
+       struct ieee80211_local *local = sdata->local;
+       u16 ethertype = (skb->data[12] << 8) | skb->data[13];
+       int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2);
+       int hw_headroom = sdata->local->hw.extra_tx_headroom;
+       struct ethhdr eth;
+       struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+       struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
+       struct ieee80211_tx_data tx;
+       ieee80211_tx_result r;
+       struct tid_ampdu_tx *tid_tx = NULL;
+       u8 tid = IEEE80211_NUM_TIDS;
+
+       /* control port protocol needs a lot of special handling */
+       if (cpu_to_be16(ethertype) == sdata->control_port_protocol)
+               return false;
+
+       /* only RFC 1042 SNAP */
+       if (ethertype < ETH_P_802_3_MIN)
+               return false;
+
+       /* don't handle TX status request here either */
+       if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)
+               return false;
+
+       if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+               tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+               tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+               if (tid_tx &&
+                   !test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state))
+                       return false;
+       }
+
+       /* after this point (skb is modified) we cannot return false */
+
+       if (skb_shared(skb)) {
+               struct sk_buff *tmp_skb = skb;
+
+               skb = skb_clone(skb, GFP_ATOMIC);
+               kfree_skb(tmp_skb);
+
+               if (!skb)
+                       return true;
+       }
+
+       dev->stats.tx_packets++;
+       dev->stats.tx_bytes += skb->len + extra_head;
+       dev->trans_start = jiffies;
+
+       /* will not be crypto-handled beyond what we do here, so use false
+        * as the may-encrypt argument for the resize to not account for
+        * more room than we already have in 'extra_head'
+        */
+       if (unlikely(ieee80211_skb_resize(sdata, skb,
+                                         max_t(int, extra_head + hw_headroom -
+                                                    skb_headroom(skb), 0),
+                                         false))) {
+               kfree_skb(skb);
+               return true;
+       }
+
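+       /* copy out DA/SA before they are overwritten; the trailing 2-byte
+        * ethertype stays in place and becomes the tail of the SNAP header
+        * copied from the cached data (hence ETH_HLEN - 2 and extra_head
+        * above)
+        */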
+       memcpy(&eth, skb->data, ETH_HLEN - 2);
+       hdr = (void *)skb_push(skb, extra_head);
+       memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len);
+       memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN);
+       memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN);
+
+       memset(info, 0, sizeof(*info));
+       info->band = fast_tx->band;
+       info->control.vif = &sdata->vif;
+       info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
+                     IEEE80211_TX_CTL_DONTFRAG |
+                     (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+
+       if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+               *ieee80211_get_qos_ctl(hdr) = tid;
+               hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+       } else {
+               info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+               hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
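+               /* the sequence number lives in bits 4-15 of seq_ctrl, so
+                * adding 0x10 advances it by one
+                */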
+               sdata->sequence_number += 0x10;
+       }
+
+       sta->tx_msdu[tid]++;
+
+       info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+       __skb_queue_head_init(&tx.skbs);
+
+       tx.flags = IEEE80211_TX_UNICAST;
+       tx.local = local;
+       tx.sdata = sdata;
+       tx.sta = sta;
+       tx.key = fast_tx->key;
+
+       if (fast_tx->key)
+               info->control.hw_key = &fast_tx->key->conf;
+
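+       /* run software rate control only when the device doesn't implement
+        * rate control itself
+        */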
+       if (!(local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) {
+               tx.skb = skb;
+               r = ieee80211_tx_h_rate_ctrl(&tx);
+               skb = tx.skb;
+               tx.skb = NULL;
+
+               if (r != TX_CONTINUE) {
+                       if (r != TX_QUEUED)
+                               kfree_skb(skb);
+                       return true;
+               }
+       }
+
+       /* statistics normally done by ieee80211_tx_h_stats (but that
+        * has to consider fragmentation, so is more complex)
+        */
+       sta->tx_fragments++;
+       sta->tx_bytes[skb_get_queue_mapping(skb)] += skb->len;
+       sta->tx_packets[skb_get_queue_mapping(skb)]++;
+
+       if (fast_tx->pn_offs) {
+               u64 pn;
+               u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
+
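+               /* bytes 0-1 and 4-7 of the CCMP/GCMP header carry the PN;
+                * bytes 2 (reserved) and 3 (Ext IV/key ID) were already set
+                * when the header was cached
+                */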
+               switch (fast_tx->key->conf.cipher) {
+               case WLAN_CIPHER_SUITE_CCMP:
+               case WLAN_CIPHER_SUITE_CCMP_256:
+                       pn = atomic64_inc_return(&fast_tx->key->u.ccmp.tx_pn);
+                       crypto_hdr[0] = pn;
+                       crypto_hdr[1] = pn >> 8;
+                       crypto_hdr[4] = pn >> 16;
+                       crypto_hdr[5] = pn >> 24;
+                       crypto_hdr[6] = pn >> 32;
+                       crypto_hdr[7] = pn >> 40;
+                       break;
+               case WLAN_CIPHER_SUITE_GCMP:
+               case WLAN_CIPHER_SUITE_GCMP_256:
+                       pn = atomic64_inc_return(&fast_tx->key->u.gcmp.tx_pn);
+                       crypto_hdr[0] = pn;
+                       crypto_hdr[1] = pn >> 8;
+                       crypto_hdr[4] = pn >> 16;
+                       crypto_hdr[5] = pn >> 24;
+                       crypto_hdr[6] = pn >> 32;
+                       crypto_hdr[7] = pn >> 40;
+                       break;
+               }
+       }
+
+       if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+               sdata = container_of(sdata->bss,
+                                    struct ieee80211_sub_if_data, u.ap);
+
+       __skb_queue_tail(&tx.skbs, skb);
+       ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+       return true;
+}
+
 void __ieee80211_subif_start_xmit(struct sk_buff *skb,
                                  struct net_device *dev,
                                  u32 info_flags)
                goto out;
        }
 
+       if (!IS_ERR_OR_NULL(sta)) {
+               struct ieee80211_fast_tx *fast_tx;
+
+               fast_tx = rcu_dereference(sta->fast_tx);
+
+               if (fast_tx &&
+                   ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+                       goto out;
+       }
+
        skb = ieee80211_build_hdr(sdata, skb, info_flags, sta);
        if (IS_ERR(skb))
                goto out;