xfrm: iptfs: add user packet (tunnel ingress) handling
author    Christian Hopps <chopps@labn.net>
          Thu, 14 Nov 2024 07:07:04 +0000 (02:07 -0500)
committer Steffen Klassert <steffen.klassert@secunet.com>
          Thu, 5 Dec 2024 09:01:44 +0000 (10:01 +0100)
Add tunnel packet output functionality. This code handles
the ingress to the tunnel.

Signed-off-by: Christian Hopps <chopps@labn.net>
Tested-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
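
[Editor's note] The patch also wires up two new netlink attributes, XFRMA_IPTFS_MAX_QSIZE (octets) and XFRMA_IPTFS_INIT_DELAY (microseconds), both u32, parsed in iptfs_user_init() below. As a rough sketch of the user-space side, the attributes could be appended to an XFRM_MSG_NEWSA request with libmnl as follows; the helper name is hypothetical and building/sending the message itself is omitted:

#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/xfrm.h>

/* Hypothetical helper: append the IP-TFS ingress options to an
 * already-built XFRM_MSG_NEWSA request. The attribute names come from
 * this series' UAPI; everything else here is illustrative. */
static void iptfs_put_ingress_opts(struct nlmsghdr *nlh,
				   uint32_t max_qsize_octets,
				   uint32_t init_delay_usecs)
{
	mnl_attr_put_u32(nlh, XFRMA_IPTFS_MAX_QSIZE, max_qsize_octets);
	mnl_attr_put_u32(nlh, XFRMA_IPTFS_INIT_DELAY, init_delay_usecs);
}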
net/xfrm/xfrm_iptfs.c

index e7cb8734fc0fdd07870a1d380acb85a208d8da04..c4cff005ea9a79b1bfb155299f355c2bab15facc 100644
 
 #include "xfrm_inout.h"
 
+/* ------------------------------------------------ */
+/* IPTFS default SA values (tunnel ingress/dir-out) */
+/* ------------------------------------------------ */
+
+/**
+ * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay
+ *
+ * The initial output delay is the amount of time prior to servicing the output
+ * queue after queueing the first packet on said queue. This applies anytime the
+ * output queue was previously empty.
+ *
+ * Default 0.
+ */
+#define IPTFS_DEFAULT_INIT_DELAY_USECS 0
+
+/**
+ * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size.
+ *
+ * The default IPTFS max output queue size in octets. The output queue is where
+ * received packets destined for output over an IPTFS tunnel are stored prior to
+ * being output in aggregated/fragmented form over the IPTFS tunnel.
+ *
+ * Default 10M (1024 * 10240 octets).
+ */
+#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240)
+
+#define NSECS_IN_USEC 1000
+
+#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
+
 /**
  * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
  * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
  *     otherwise the user specified value.
+ * @max_queue_size: The maximum number of octets allowed to be queued to be sent
+ *     over the IPTFS SA. The queue size is measured as the size of all the
+ *     packets enqueued.
  */
 struct xfrm_iptfs_config {
        u32 pkt_size;       /* outer_packet_size or 0 */
+       u32 max_queue_size; /* octets */
 };
 
 /**
  * struct xfrm_iptfs_data - mode specific xfrm state.
  * @cfg: IPTFS tunnel config.
  * @x: owning SA (xfrm_state).
+ * @queue: queued user packets to send.
+ * @queue_size: number of octets on queue (sum of packet sizes).
+ * @ecn_queue_size: octets above which to set ECN CE on queued packets.
+ * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
+ * @iptfs_timer: output timer.
  * @payload_mtu: max payload size.
  */
 struct xfrm_iptfs_data {
        struct xfrm_iptfs_config cfg;
 
        /* Ingress User Input */
-       struct xfrm_state *x;       /* owning state */
+       struct xfrm_state *x;      /* owning state */
+       struct sk_buff_head queue; /* output queue */
+
+       u32 queue_size;             /* octets */
+       u32 ecn_queue_size;         /* octets above which to CE mark */
+       u64 init_delay_ns;          /* nanoseconds */
+       struct hrtimer iptfs_timer; /* output timer */
        u32 payload_mtu;            /* max payload size */
 };
 
+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me);
+
+/* ================================= */
+/* IPTFS Sending (ingress) Functions */
+/* ================================= */
+
+/* ------------------------- */
+/* Enqueue to send functions */
+/* ------------------------- */
+
+/**
+ * iptfs_enqueue() - enqueue packet if ok to send.
+ * @xtfs: xtfs state
+ * @skb: the packet
+ *
+ * Return: true if packet enqueued.
+ */
+static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
+{
+       u64 newsz = xtfs->queue_size + skb->len;
+       struct iphdr *iph;
+
+       assert_spin_locked(&xtfs->x->lock);
+
+       if (newsz > xtfs->cfg.max_queue_size)
+               return false;
+
+       /* Set ECN CE if we are above our ECN queue threshold */
+       if (newsz > xtfs->ecn_queue_size) {
+               iph = ip_hdr(skb);
+               if (iph->version == 4)
+                       IP_ECN_set_ce(iph);
+               else if (iph->version == 6)
+                       IP6_ECN_set_ce(skb, ipv6_hdr(skb));
+       }
+
+       __skb_queue_tail(&xtfs->queue, skb);
+       xtfs->queue_size += skb->len;
+       return true;
+}
+
+static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs,
+                             struct sk_buff *skb)
+{
+       struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
+       u32 payload_mtu = xtfs->payload_mtu;
+       u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
+
+       if (payload_mtu && payload_mtu < pmtu)
+               pmtu = payload_mtu;
+
+       return pmtu;
+}
+
+static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu)
+{
+       if (skb->len <= pmtu)
+               return 0;
+
+       /* We only send ICMP too big if the user has configured us as
+        * dont-fragment.
+        */
+       if (skb->dev)
+               XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR);
+
+       if (sk)
+               xfrm_local_error(skb, pmtu);
+       else if (ip_hdr(skb)->version == 4)
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(pmtu));
+       else
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
+
+       return 1;
+}
+
+/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
+ * (i.e., aggregating or fragmenting as appropriate).
+ * This is set in dst->output for an SA.
+ */
+static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+       struct dst_entry *dst = skb_dst(skb);
+       struct xfrm_state *x = dst->xfrm;
+       struct xfrm_iptfs_data *xtfs = x->mode_data;
+       struct sk_buff *segs, *nskb;
+       u32 pmtu = 0;
+       bool ok = true;
+       bool was_gso;
+
+       /* We have hooked into dst_entry->output which means we have skipped the
+        * protocol specific netfilter (see xfrm4_output, xfrm6_output).
+        * when our timer runs we will end up calling xfrm_output directly on
+        * the encapsulated traffic.
+        *
+        * In both cases this is the NF_INET_POST_ROUTING hook which allows
+        * changing the skb->dst entry, which then may no longer be xfrm-based,
+        * in which case a REROUTED flag is set and dst_output is called.
+        *
+        * For IPv6 we are also skipping fragmentation handling for local
+        * sockets, which may or may not be good depending on our tunnel DF
+        * setting. Normally with fragmentation supported we want to skip this
+        * fragmentation.
+        */
+
+       pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+
+       /* Break apart GSO skbs. If the queue is nearing full then we want
+        * the accounting and queuing to be based on the individual packets,
+        * not on the aggregate GSO buffer.
+        */
+       was_gso = skb_is_gso(skb);
+       if (!was_gso) {
+               segs = skb;
+       } else {
+               segs = skb_gso_segment(skb, 0);
+               if (IS_ERR_OR_NULL(segs)) {
+                       XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+                       kfree_skb(skb);
+                       if (IS_ERR(segs))
+                               return PTR_ERR(segs);
+                       return -EINVAL;
+               }
+               consume_skb(skb);
+               skb = NULL;
+       }
+
+       /* We can be running on multiple cores and from the network softirq or
+        * from user context depending on where the packet is coming from.
+        */
+       spin_lock_bh(&x->lock);
+
+       skb_list_walk_safe(segs, skb, nskb) {
+               skb_mark_not_on_list(skb);
+
+               /* Once we drop due to no queue space we continue to drop the
+                * rest of the packets from that GSO.
+                */
+               if (!ok) {
+nospace:
+                       XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE);
+                       kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
+                       continue;
+               }
+
+               /* Fragmenting handled in following commits. */
+               if (iptfs_is_too_big(sk, skb, pmtu)) {
+                       kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+                       continue;
+               }
+
+               /* Enqueue to send in tunnel */
+               ok = iptfs_enqueue(xtfs, skb);
+               if (!ok)
+                       goto nospace;
+       }
+
+       /* Start a delay timer if we don't have one yet */
+       if (!hrtimer_is_queued(&xtfs->iptfs_timer))
+               hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE);
+
+       spin_unlock_bh(&x->lock);
+       return 0;
+}
+
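
[Editor's note] Taken together with the timer handler below, the ingress data path added by this patch is, in outline:

    dst_output()
      -> iptfs_output_collect()        /* segment GSO, CE-mark or drop, enqueue */
           -> hrtimer fires after init_delay_ns
                -> iptfs_delay_timer()      /* splice queue under x->lock */
                     -> iptfs_output_queued()   /* aggregate into IPTFS skbs */
                          -> xfrm_output()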
+/* -------------------------- */
+/* Dequeue and send functions */
+/* -------------------------- */
+
+static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff)
+{
+       struct ip_iptfs_hdr *h;
+       size_t hsz = sizeof(*h);
+
+       /* now reset values to be pointing at the rest of the packets */
+       h = skb_push(skb, hsz);
+       memset(h, 0, hsz);
+       if (blkoff)
+               h->block_offset = htons(blkoff);
+
+       /* network_header currently points at the inner IP packet;
+        * move it to the iptfs header.
+        */
+       skb->transport_header = skb->network_header;
+       skb->network_header -= hsz;
+
+       IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+}
+
+static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child)
+{
+       u32 fllen = 0;
+
+       /* It might be possible to account for a frag list in addition to page
+        * fragments if it's a valid state to be in. The page fragments' size
+        * should be kept as data_len so only the frag_list size is removed;
+        * this must be done above as well.
+        */
+       *nextp = skb_shinfo(child)->frag_list;
+       while (*nextp) {
+               fllen += (*nextp)->len;
+               nextp = &(*nextp)->next;
+       }
+       skb_frag_list_init(child);
+       child->len -= fllen;
+       child->data_len -= fllen;
+
+       return nextp;
+}
+
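
[Editor's note] The effect of the re-homing is easier to see on a small, purely illustrative example:

/* Before: outer skb A's frag_list holds B, and B itself carries a
 * frag_list:
 *
 *     A -> frag_list: B -> frag_list: [C, D]
 *
 * After iptfs_rehome_fraglist(&B->next, B):
 *
 *     A -> frag_list: B -> C -> D
 *
 * B->len and B->data_len shrink by len(C) + len(D); those octets are
 * still accounted for in A, which is the accounting the comment above
 * refers to.
 */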
+static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+{
+       struct xfrm_iptfs_data *xtfs = x->mode_data;
+       struct sk_buff *skb, *skb2, **nextp;
+       struct skb_shared_info *shi;
+
+       while ((skb = __skb_dequeue(list))) {
+               u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+               int remaining;
+
+               /* protocol comes to us cleared sometimes */
+               skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) :
+                                                                 htons(ETH_P_IPV6);
+
+               if (skb->len > mtu) {
+                       /* We handle this case before enqueueing, so we are
+                        * only here b/c the MTU changed after we enqueued and
+                        * before we dequeued; just drop these.
+                        */
+                       XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR);
+
+                       kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+                       continue;
+               }
+
+               /* If we don't have a cksum in the packet we need to add one
+                * before encapsulation.
+                */
+               if (skb->ip_summed == CHECKSUM_PARTIAL) {
+                       if (skb_checksum_help(skb)) {
+                               XFRM_INC_STATS(dev_net(skb_dst(skb)->dev), LINUX_MIB_XFRMOUTERROR);
+                               kfree_skb(skb);
+                               continue;
+                       }
+               }
+
+               /* Consider the buffer Tx'd and no longer owned */
+               skb_orphan(skb);
+
+               /* Convert first inner packet into an outer IPTFS packet */
+               iptfs_output_prepare_skb(skb, 0);
+
+               /* The space remaining to send more inner packet data is `mtu` -
+                * (skb->len - sizeof iptfs header). This is b/c the `mtu` value
+                * has the basic IPTFS header len accounted for, and we added
+                * that header to the skb so it is a part of skb->len, thus we
+                * subtract it from the skb length.
+                */
+               remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
+
+               /* Re-home (un-nest) nested fragment lists. We need to do this
+                * b/c we will simply be appending any following aggregated
+                * inner packets to the frag list.
+                */
+               shi = skb_shinfo(skb);
+               nextp = &shi->frag_list;
+               while (*nextp) {
+                       if (skb_has_frag_list(*nextp))
+                               nextp = iptfs_rehome_fraglist(&(*nextp)->next, *nextp);
+                       else
+                               nextp = &(*nextp)->next;
+               }
+
+               /* See if we have enough space to simply append.
+                *
+                * NOTE: Maybe do not append if we will be mis-aligned,
+                * SW-based endpoints will probably have to copy in this
+                * case.
+                */
+               while ((skb2 = skb_peek(list))) {
+                       if (skb2->len > remaining)
+                               break;
+
+                       __skb_unlink(skb2, list);
+
+                       /* Consider the buffer Tx'd and no longer owned */
+                       skb_orphan(skb2);
+
+                       /* If we don't have a cksum in the packet we need to add
+                        * one before encapsulation.
+                        */
+                       if (skb2->ip_summed == CHECKSUM_PARTIAL) {
+                               if (skb_checksum_help(skb2)) {
+                                       XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR);
+                                       kfree_skb(skb2);
+                                       continue;
+                               }
+                       }
+
+                       /* Do accounting */
+                       skb->data_len += skb2->len;
+                       skb->len += skb2->len;
+                       remaining -= skb2->len;
+
+                       /* Append to the frag_list */
+                       *nextp = skb2;
+                       nextp = &skb2->next;
+                       if (skb_has_frag_list(skb2))
+                               nextp = iptfs_rehome_fraglist(nextp, skb2);
+                       skb->truesize += skb2->truesize;
+               }
+
+               xfrm_output(NULL, skb);
+       }
+}
+
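
[Editor's note] The space computation in the aggregation loop above can be restated: since `mtu` already includes the budget for the basic IPTFS header, remaining = mtu - (skb->len - hsz) = mtu + hsz - skb->len. A worked instance, assuming the 4-octet basic AGGFRAG header (RFC 9347) and made-up sizes:

/* mtu (inner payload budget)         = 1400 octets
 * first inner packet                 =  100 octets
 * after iptfs_output_prepare_skb():
 *     skb->len  = 100 + 4 = 104
 *     remaining = 1400 - (104 - 4) = 1300
 * Queued inner packets are then appended to the frag_list while they
 * fit in `remaining`, which shrinks by each appended packet's length.
 */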
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
+{
+       struct sk_buff_head list;
+       struct xfrm_iptfs_data *xtfs;
+       struct xfrm_state *x;
+
+       xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
+       x = xtfs->x;
+
+       /* Process all the queued packets
+        *
+        * softirq execution order: timer > tasklet > hrtimer
+        *
+        * Network rx will have run before us giving one last chance to queue
+        * ingress packets for us to process and transmit.
+        */
+
+       spin_lock(&x->lock);
+       __skb_queue_head_init(&list);
+       skb_queue_splice_init(&xtfs->queue, &list);
+       xtfs->queue_size = 0;
+       spin_unlock(&x->lock);
+
+       /* After the above unlock, packets can begin queuing again, and the
+        * timer can be set again, from another CPU either in softirq or user
+        * context (not from this one since we are running at softirq level
+        * already).
+        */
+
+       iptfs_output_queued(x, &list);
+
+       return HRTIMER_NORESTART;
+}
+
+/**
+ * iptfs_encap_add_ipv4() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the TOS/DSCP bits. It also sets the protocol to a different value and
+ * doesn't do anything with inner headers as they aren't pointing into a
+ * normal IP singleton inner packet.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb)
+{
+       struct dst_entry *dst = skb_dst(skb);
+       struct iphdr *top_iph;
+
+       skb_reset_inner_network_header(skb);
+       skb_reset_inner_transport_header(skb);
+
+       skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len));
+       skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol);
+       skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+       top_iph = ip_hdr(skb);
+       top_iph->ihl = 5;
+       top_iph->version = 4;
+       top_iph->protocol = IPPROTO_AGGFRAG;
+
+       /* As we have 0, fractional, 1 or N inner packets there's no obviously
+        * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+        * 3.1.
+        */
+       top_iph->tos = 0;
+
+       top_iph->frag_off = htons(IP_DF);
+       top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
+       top_iph->saddr = x->props.saddr.a4;
+       top_iph->daddr = x->id.daddr.a4;
+       ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+       return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptfs_encap_add_ipv6() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the flow label and TOS/DSCP bits. It also sets the protocol to a different
+ * value and doesn't do anything with inner headers as they aren't pointing into
+ * a normal IP singleton inner packet.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb)
+{
+       struct dst_entry *dst = skb_dst(skb);
+       struct ipv6hdr *top_iph;
+       int dsfield;
+
+       skb_reset_inner_network_header(skb);
+       skb_reset_inner_transport_header(skb);
+
+       skb_set_network_header(skb, -x->props.header_len + x->props.enc_hdr_len);
+       skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr);
+       skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+       top_iph = ipv6_hdr(skb);
+       top_iph->version = 6;
+       top_iph->priority = 0;
+       memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl));
+       top_iph->nexthdr = IPPROTO_AGGFRAG;
+
+       /* As we have 0, fractional, 1 or N inner packets there's no obviously
+        * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+        * 3.1.
+        */
+       dsfield = 0;
+       ipv6_change_dsfield(top_iph, 0, dsfield);
+
+       top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
+       top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+       top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+
+       return 0;
+}
+#endif
+
+/**
+ * iptfs_prepare_output() -  prepare the skb for output
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * Return: Error value; if 0 then skb values should be as follows:
+ *    - transport_header should point at ESP header
+ *    - network_header should point at Outer IP header
+ *    - mac_header should point at protocol/nexthdr of the outer IP
+ */
+static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+       if (x->outer_mode.family == AF_INET)
+               return iptfs_encap_add_ipv4(x, skb);
+       if (x->outer_mode.family == AF_INET6) {
+#if IS_ENABLED(CONFIG_IPV6)
+               return iptfs_encap_add_ipv6(x, skb);
+#else
+               return -EAFNOSUPPORT;
+#endif
+       }
+       return -EOPNOTSUPP;
+}
+
 /* ========================== */
 /* State Management Functions */
 /* ========================== */
@@ -77,8 +589,11 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x,
 {
        struct xfrm_iptfs_data *xtfs = x->mode_data;
        struct xfrm_iptfs_config *xc;
+       u64 q;
 
        xc = &xtfs->cfg;
+       xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
+       xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;
 
        if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
                xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
@@ -92,6 +607,16 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x,
                        return -EINVAL;
                }
        }
+       if (attrs[XFRMA_IPTFS_MAX_QSIZE])
+               xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
+       if (attrs[XFRMA_IPTFS_INIT_DELAY])
+               xtfs->init_delay_ns =
+                       (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC;
+
+       q = (u64)xc->max_queue_size * 95;
+       do_div(q, 100);
+       xtfs->ecn_queue_size = (u32)q;
+
        return 0;
 }
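
[Editor's note] With the defaults above, the thresholds work out as follows (straightforward arithmetic on the values in this patch):

/* IPTFS_DEFAULT_MAX_QUEUE_SIZE = 1024 * 10240 = 10,485,760 octets
 * ecn_queue_size = 10,485,760 * 95 / 100      =  9,961,472 octets
 *
 * So on an otherwise-default SA, iptfs_enqueue() CE-marks inner
 * packets once the queue exceeds ~9.96M octets and drops them once it
 * would exceed 10M octets.
 */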
 
@@ -101,8 +626,11 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x)
        struct xfrm_iptfs_config *xc = &xtfs->cfg;
        unsigned int l = 0;
 
-       if (x->dir == XFRM_SA_DIR_OUT)
+       if (x->dir == XFRM_SA_DIR_OUT) {
+               l += nla_total_size(sizeof(u32)); /* init delay usec */
+               l += nla_total_size(sizeof(xc->max_queue_size));
                l += nla_total_size(sizeof(xc->pkt_size));
+       }
 
        return l;
 }
@@ -112,9 +640,21 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
        struct xfrm_iptfs_data *xtfs = x->mode_data;
        struct xfrm_iptfs_config *xc = &xtfs->cfg;
        int ret = 0;
+       u64 q;
+
+       if (x->dir == XFRM_SA_DIR_OUT) {
+               q = xtfs->init_delay_ns;
+               do_div(q, NSECS_IN_USEC);
+               ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
+               if (ret)
+                       return ret;
+
+               ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size);
+               if (ret)
+                       return ret;
 
-       if (x->dir == XFRM_SA_DIR_OUT)
                ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
+       }
 
        return ret;
 }
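
[Editor's note] Note the unit round-trip for the delay: user space passes XFRMA_IPTFS_INIT_DELAY in microseconds, iptfs_user_init() stores it scaled to nanoseconds, and the dump path above divides it back down. For example:

/* nla_get_u32(XFRMA_IPTFS_INIT_DELAY)        = 1500 usecs
 * xtfs->init_delay_ns = 1500 * NSECS_IN_USEC = 1,500,000 ns
 * dump: 1,500,000 / NSECS_IN_USEC            = 1500 usecs
 */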
@@ -122,6 +662,10 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
 static void __iptfs_init_state(struct xfrm_state *x,
                               struct xfrm_iptfs_data *xtfs)
 {
+       __skb_queue_head_init(&xtfs->queue);
+       hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+       xtfs->iptfs_timer.function = iptfs_delay_timer;
+
        /* Modify type (esp) adjustment values */
 
        if (x->props.family == AF_INET)
@@ -172,10 +716,21 @@ static int iptfs_init_state(struct xfrm_state *x)
 static void iptfs_destroy_state(struct xfrm_state *x)
 {
        struct xfrm_iptfs_data *xtfs = x->mode_data;
+       struct sk_buff_head list;
+       struct sk_buff *skb;
 
        if (!xtfs)
                return;
 
+       spin_lock_bh(&xtfs->x->lock);
+       hrtimer_cancel(&xtfs->iptfs_timer);
+       __skb_queue_head_init(&list);
+       skb_queue_splice_init(&xtfs->queue, &list);
+       spin_unlock_bh(&xtfs->x->lock);
+
+       while ((skb = __skb_dequeue(&list)))
+               kfree_skb(skb);
+
        kfree_sensitive(xtfs);
 
        module_put(x->mode_cbs->owner);
@@ -190,6 +745,8 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = {
        .copy_to_user = iptfs_copy_to_user,
        .sa_len = iptfs_sa_len,
        .get_inner_mtu = iptfs_get_inner_mtu,
+       .output = iptfs_output_collect,
+       .prepare_output = iptfs_prepare_output,
 };
 
 static int __init xfrm_iptfs_init(void)