        destroy_workqueue(wg->handshake_receive_wq);
        destroy_workqueue(wg->handshake_send_wq);
        destroy_workqueue(wg->packet_crypt_wq);
-       wg_packet_queue_free(&wg->decrypt_queue, true);
-       wg_packet_queue_free(&wg->encrypt_queue, true);
+       wg_packet_queue_free(&wg->decrypt_queue);
+       wg_packet_queue_free(&wg->encrypt_queue);
        rcu_barrier(); /* Wait for all the peers to be actually freed. */
        wg_ratelimiter_uninit();
        memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
                goto err_destroy_handshake_send;
 
        ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
-                                  true, MAX_QUEUED_PACKETS);
+                                  MAX_QUEUED_PACKETS);
        if (ret < 0)
                goto err_destroy_packet_crypt;
 
        ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
-                                  true, MAX_QUEUED_PACKETS);
+                                  MAX_QUEUED_PACKETS);
        if (ret < 0)
                goto err_free_encrypt_queue;
 
 err_uninit_ratelimiter:
        wg_ratelimiter_uninit();
 err_free_decrypt_queue:
-       wg_packet_queue_free(&wg->decrypt_queue, true);
+       wg_packet_queue_free(&wg->decrypt_queue);
 err_free_encrypt_queue:
-       wg_packet_queue_free(&wg->encrypt_queue, true);
+       wg_packet_queue_free(&wg->encrypt_queue);
 err_destroy_packet_crypt:
        destroy_workqueue(wg->packet_crypt_wq);
 err_destroy_handshake_send:
 
 
 struct crypt_queue {
        struct ptr_ring ring;
-       union {
-               struct {
-                       struct multicore_worker __percpu *worker;
-                       int last_cpu;
-               };
-               struct work_struct work;
-       };
+       struct multicore_worker __percpu *worker;
+       int last_cpu;
+};
+
+struct prev_queue {
+       struct sk_buff *head, *tail, *peeked;
+       struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
+       atomic_t count;
 };
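
The `empty` member of struct prev_queue is laid out to overlay the first two fields of struct sk_buff, so the queue can treat &queue->empty as a stub (sentinel) sk_buff for its intrusive list without ever allocating one. Purely as an illustration, the same layout assumption could be written as standalone C11 static asserts (the change itself enforces it with the BUILD_BUG_ON() in wg_prev_queue_init() further down):

/* Illustration only: the aliasing assumption behind casting &queue->empty to
 * a struct sk_buff *; needs <stddef.h> for offsetof().
 */
_Static_assert(offsetof(struct sk_buff, next) ==
	       offsetof(struct prev_queue, empty.next) - offsetof(struct prev_queue, empty),
	       "empty.next must overlay skb->next");
_Static_assert(offsetof(struct sk_buff, prev) ==
	       offsetof(struct prev_queue, empty.prev) - offsetof(struct prev_queue, empty),
	       "empty.prev must overlay skb->prev");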
 
 struct wg_device {
 
        peer = kzalloc(sizeof(*peer), GFP_KERNEL);
        if (unlikely(!peer))
                return ERR_PTR(ret);
-       peer->device = wg;
+       if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
+               goto err;
 
+       peer->device = wg;
        wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
                                public_key, preshared_key, peer);
-       if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
-               goto err_1;
-       if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
-                                MAX_QUEUED_PACKETS))
-               goto err_2;
-       if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
-                                MAX_QUEUED_PACKETS))
-               goto err_3;
-
        peer->internal_id = atomic64_inc_return(&peer_counter);
        peer->serial_work_cpu = nr_cpumask_bits;
        wg_cookie_init(&peer->latest_cookie);
        wg_timers_init(peer);
        wg_cookie_checker_precompute_peer_keys(peer);
        spin_lock_init(&peer->keypairs.keypair_update_lock);
-       INIT_WORK(&peer->transmit_handshake_work,
-                 wg_packet_handshake_send_worker);
+       INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
+       INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
+       wg_prev_queue_init(&peer->tx_queue);
+       wg_prev_queue_init(&peer->rx_queue);
        rwlock_init(&peer->endpoint_lock);
        kref_init(&peer->refcount);
        skb_queue_head_init(&peer->staged_packet_queue);
        pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
        return peer;
 
-err_3:
-       wg_packet_queue_free(&peer->tx_queue, false);
-err_2:
-       dst_cache_destroy(&peer->endpoint_cache);
-err_1:
+err:
        kfree(peer);
        return ERR_PTR(ret);
 }
        struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);
 
        dst_cache_destroy(&peer->endpoint_cache);
-       wg_packet_queue_free(&peer->rx_queue, false);
-       wg_packet_queue_free(&peer->tx_queue, false);
+       WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));
 
        /* The final zeroing takes care of clearing any remaining handshake key
         * material and other potentially sensitive information.
 
 
 struct wg_peer {
        struct wg_device *device;
-       struct crypt_queue tx_queue, rx_queue;
+       struct prev_queue tx_queue, rx_queue;
        struct sk_buff_head staged_packet_queue;
        int serial_work_cpu;
        bool is_dead;
        rwlock_t endpoint_lock;
        struct noise_handshake handshake;
        atomic64_t last_sent_handshake;
-       struct work_struct transmit_handshake_work, clear_peer_work;
+       struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
        struct cookie latest_cookie;
        struct hlist_node pubkey_hash;
        u64 rx_bytes, tx_bytes;
 
 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
 {
        int cpu;
-       struct multicore_worker __percpu *worker =
-               alloc_percpu(struct multicore_worker);
+       struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);
 
        if (!worker)
                return NULL;
 }
 
 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-                        bool multicore, unsigned int len)
+                        unsigned int len)
 {
        int ret;
 
        ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
        if (ret)
                return ret;
-       if (function) {
-               if (multicore) {
-                       queue->worker = wg_packet_percpu_multicore_worker_alloc(
-                               function, queue);
-                       if (!queue->worker) {
-                               ptr_ring_cleanup(&queue->ring, NULL);
-                               return -ENOMEM;
-                       }
-               } else {
-                       INIT_WORK(&queue->work, function);
-               }
+       queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
+       if (!queue->worker) {
+               ptr_ring_cleanup(&queue->ring, NULL);
+               return -ENOMEM;
        }
        return 0;
 }
 
-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
+void wg_packet_queue_free(struct crypt_queue *queue)
 {
-       if (multicore)
-               free_percpu(queue->worker);
+       free_percpu(queue->worker);
        WARN_ON(!__ptr_ring_empty(&queue->ring));
        ptr_ring_cleanup(&queue->ring, NULL);
 }
+
+#define NEXT(skb) ((skb)->prev)
+#define STUB(queue) ((struct sk_buff *)&queue->empty)
+
+void wg_prev_queue_init(struct prev_queue *queue)
+{
+       NEXT(STUB(queue)) = NULL;
+       queue->head = queue->tail = STUB(queue);
+       queue->peeked = NULL;
+       atomic_set(&queue->count, 0);
+       BUILD_BUG_ON(
+               offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
+                                                       offsetof(struct prev_queue, empty) ||
+               offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
+                                                        offsetof(struct prev_queue, empty));
+}
+
+static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
+{
+       WRITE_ONCE(NEXT(skb), NULL);
+       WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
+}
+
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
+{
+       if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
+               return false;
+       __wg_prev_queue_enqueue(queue, skb);
+       return true;
+}
+
+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
+{
+       struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
+
+       if (tail == STUB(queue)) {
+               if (!next)
+                       return NULL;
+               queue->tail = next;
+               tail = next;
+               next = smp_load_acquire(&NEXT(next));
+       }
+       if (next) {
+               queue->tail = next;
+               atomic_dec(&queue->count);
+               return tail;
+       }
+       if (tail != READ_ONCE(queue->head))
+               return NULL;
+       __wg_prev_queue_enqueue(queue, STUB(queue));
+       next = smp_load_acquire(&NEXT(tail));
+       if (next) {
+               queue->tail = next;
+               atomic_dec(&queue->count);
+               return tail;
+       }
+       return NULL;
+}
+
+#undef NEXT
+#undef STUB
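
The functions above implement an intrusive MPSC (multi-producer, single-consumer) queue in the style of Dmitry Vyukov's design: a producer publishes with one xchg_release() on head plus one store linking the previous head, the lone consumer privately advances tail, and skb->prev is repurposed as the forward link via NEXT(). Below is a condensed user-space model of the same control flow, meant only as an illustration: "struct node" and the mpsc_* names are hypothetical, the bounded atomic count is omitted, and the kernel's xchg_release()/smp_load_acquire() are approximated with C11 release/acquire atomics.

#include <stdatomic.h>
#include <stdio.h>

struct node {
	_Atomic(struct node *) next;
	int value;
};

struct mpsc {
	_Atomic(struct node *) head;	/* producers swap this in */
	struct node *tail;		/* touched only by the single consumer */
	struct node stub;		/* sentinel; never carries data */
};

static void mpsc_init(struct mpsc *q)
{
	atomic_store(&q->stub.next, NULL);
	atomic_store(&q->head, &q->stub);
	q->tail = &q->stub;
}

/* Multi-producer: one exchange plus one store; producers never block. */
static void mpsc_enqueue(struct mpsc *q, struct node *n)
{
	atomic_store(&n->next, NULL);
	struct node *prev = atomic_exchange_explicit(&q->head, n, memory_order_release);
	atomic_store(&prev->next, n);
}

/* Single consumer: returns NULL when empty or while a producer is mid-link. */
static struct node *mpsc_dequeue(struct mpsc *q)
{
	struct node *tail = q->tail;
	struct node *next = atomic_load_explicit(&tail->next, memory_order_acquire);

	if (tail == &q->stub) {			/* skip over the sentinel */
		if (!next)
			return NULL;		/* queue is empty */
		q->tail = next;
		tail = next;
		next = atomic_load_explicit(&tail->next, memory_order_acquire);
	}
	if (next) {
		q->tail = next;
		return tail;
	}
	if (tail != atomic_load(&q->head))
		return NULL;			/* a producer swapped head but has not linked yet */
	mpsc_enqueue(q, &q->stub);		/* push the sentinel back behind tail */
	next = atomic_load_explicit(&tail->next, memory_order_acquire);
	if (next) {
		q->tail = next;
		return tail;
	}
	return NULL;
}

int main(void)
{
	struct mpsc q;
	struct node a = { .value = 1 }, b = { .value = 2 };
	struct node *n;

	mpsc_init(&q);
	mpsc_enqueue(&q, &a);
	mpsc_enqueue(&q, &b);
	while ((n = mpsc_dequeue(&q)))
		printf("dequeued %d\n", n->value);	/* prints 1 then 2 */
	return 0;
}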
 
 struct wg_peer;
 struct multicore_worker;
 struct crypt_queue;
+struct prev_queue;
 struct sk_buff;
 
 /* queueing.c APIs: */
 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-                        bool multicore, unsigned int len);
-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
+                        unsigned int len);
+void wg_packet_queue_free(struct crypt_queue *queue);
 struct multicore_worker __percpu *
 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
 
        return cpu;
 }
 
+void wg_prev_queue_init(struct prev_queue *queue);
+
+/* Multi producer */
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
+
+/* Single consumer */
+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
+
+/* Single consumer */
+static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
+{
+       if (queue->peeked)
+               return queue->peeked;
+       queue->peeked = wg_prev_queue_dequeue(queue);
+       return queue->peeked;
+}
+
+/* Single consumer */
+static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
+{
+       queue->peeked = NULL;
+}
+
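Both single-consumer call sites use the same shape: peek the oldest entry, read its state with acquire semantics, and only then commit the dequeue with wg_prev_queue_drop_peeked(). A condensed sketch of that pattern (not verbatim; the concrete loops are in wg_packet_rx_poll() and wg_packet_tx_worker() further down):

	struct sk_buff *skb;
	enum packet_state state;

	while ((skb = wg_prev_queue_peek(queue)) != NULL &&
	       (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
		       PACKET_STATE_UNCRYPTED) {
		wg_prev_queue_drop_peeked(queue);	/* commit: the skb is now ours */
		/* ... consume skb according to state ... */
	}
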
 static inline int wg_queue_enqueue_per_device_and_peer(
-       struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
+       struct crypt_queue *device_queue, struct prev_queue *peer_queue,
        struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
 {
        int cpu;
        /* We first queue this up for the peer ingestion, but the consumer
         * will wait for the state to change to CRYPTED or DEAD before.
         */
-       if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
+       if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
                return -ENOSPC;
+
        /* Then we queue it up in the device queue, which consumes the
         * packet as soon as it can.
         */
        return 0;
 }
 
-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
-                                            struct sk_buff *skb,
-                                            enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
 {
        /* We take a reference, because as soon as we call atomic_set, the
         * peer can be freed from below us.
        struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
 
        atomic_set_release(&PACKET_CB(skb)->state, state);
-       queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
-                                              peer->internal_id),
-                     peer->device->packet_crypt_wq, &queue->work);
+       queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
+                     peer->device->packet_crypt_wq, &peer->transmit_packet_work);
        wg_peer_put(peer);
 }
 
-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
-                                                 enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
 {
        /* We take a reference, because as soon as we call atomic_set, the
         * peer can be freed from below us.
 
 int wg_packet_rx_poll(struct napi_struct *napi, int budget)
 {
        struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
-       struct crypt_queue *queue = &peer->rx_queue;
        struct noise_keypair *keypair;
        struct endpoint endpoint;
        enum packet_state state;
        if (unlikely(budget <= 0))
                return 0;
 
-       while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
+       while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL &&
               (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
                       PACKET_STATE_UNCRYPTED) {
-               __ptr_ring_discard_one(&queue->ring);
-               peer = PACKET_PEER(skb);
+               wg_prev_queue_drop_peeked(&peer->rx_queue);
                keypair = PACKET_CB(skb)->keypair;
                free = true;
 
                enum packet_state state =
                        likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
                                PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
-               wg_queue_enqueue_per_peer_napi(skb, state);
+               wg_queue_enqueue_per_peer_rx(skb, state);
                if (need_resched())
                        cond_resched();
        }
        if (unlikely(READ_ONCE(peer->is_dead)))
                goto err;
 
-       ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
-                                                  &peer->rx_queue, skb,
-                                                  wg->packet_crypt_wq,
-                                                  &wg->decrypt_queue.last_cpu);
+       ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb,
+                                                  wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu);
        if (unlikely(ret == -EPIPE))
-               wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
+               wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD);
        if (likely(!ret || ret == -EPIPE)) {
                rcu_read_unlock_bh();
                return;
 
        wg_packet_send_staged_packets(peer);
 }
 
-static void wg_packet_create_data_done(struct sk_buff *first,
-                                      struct wg_peer *peer)
+static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first)
 {
        struct sk_buff *skb, *next;
        bool is_keepalive, data_sent = false;
 
 void wg_packet_tx_worker(struct work_struct *work)
 {
-       struct crypt_queue *queue = container_of(work, struct crypt_queue,
-                                                work);
+       struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work);
        struct noise_keypair *keypair;
        enum packet_state state;
        struct sk_buff *first;
-       struct wg_peer *peer;
 
-       while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
+       while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL &&
               (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
                       PACKET_STATE_UNCRYPTED) {
-               __ptr_ring_discard_one(&queue->ring);
-               peer = PACKET_PEER(first);
+               wg_prev_queue_drop_peeked(&peer->tx_queue);
                keypair = PACKET_CB(first)->keypair;
 
                if (likely(state == PACKET_STATE_CRYPTED))
-                       wg_packet_create_data_done(first, peer);
+                       wg_packet_create_data_done(peer, first);
                else
                        kfree_skb_list(first);
 
                                break;
                        }
                }
-               wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
-                                         state);
+               wg_queue_enqueue_per_peer_tx(first, state);
                if (need_resched())
                        cond_resched();
        }
 }
 
-static void wg_packet_create_data(struct sk_buff *first)
+static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first)
 {
-       struct wg_peer *peer = PACKET_PEER(first);
        struct wg_device *wg = peer->device;
        int ret = -EINVAL;
 
        if (unlikely(READ_ONCE(peer->is_dead)))
                goto err;
 
-       ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
-                                                  &peer->tx_queue, first,
-                                                  wg->packet_crypt_wq,
-                                                  &wg->encrypt_queue.last_cpu);
+       ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first,
+                                                  wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu);
        if (unlikely(ret == -EPIPE))
-               wg_queue_enqueue_per_peer(&peer->tx_queue, first,
-                                         PACKET_STATE_DEAD);
+               wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD);
 err:
        rcu_read_unlock_bh();
        if (likely(!ret || ret == -EPIPE))
        packets.prev->next = NULL;
        wg_peer_get(keypair->entry.peer);
        PACKET_CB(packets.next)->keypair = keypair;
-       wg_packet_create_data(packets.next);
+       wg_packet_create_data(peer, packets.next);
        return;
 
 out_invalid: