kcm: Kernel Connection Multiplexor module

author Tom Herbert <tom@herbertland.com>

Mon, 7 Mar 2016 22:11:06 +0000 (14:11 -0800)

committer David S. Miller <davem@davemloft.net>

Wed, 9 Mar 2016 21:36:14 +0000 (16:36 -0500)
author Tom Herbert <tom@herbertland.com>
Mon, 7 Mar 2016 22:11:06 +0000 (14:11 -0800)
committer David S. Miller <davem@davemloft.net>
Wed, 9 Mar 2016 21:36:14 +0000 (16:36 -0500)
diff --git a/include/linux/socket.h b/include/linux/socket.h

index d834af22a4607cd2acfd69490f891e13690c7bae..73bf6c6a833b37b907bc6f8852356d8976b9662d 100644 (file)
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,9 @@ struct ucred {
  #define AF_ALG         38      /* Algorithm sockets            */
  #define AF_NFC         39      /* NFC sockets                  */
  #define AF_VSOCK       40      /* vSockets                     */
-#define AF_MAX         41      /* For now.. */
+#define AF_KCM         41      /* Kernel Connection Multiplexor*/
+
+#define AF_MAX         42      /* For now.. */
  
  /* Protocol families, same as address families. */
  #define PF_UNSPEC      AF_UNSPEC
@@ -246,6 +248,7 @@ struct ucred {
  #define PF_ALG         AF_ALG
  #define PF_NFC         AF_NFC
  #define PF_VSOCK       AF_VSOCK
+#define PF_KCM         AF_KCM
  #define PF_MAX         AF_MAX
  
  /* Maximum queue length specifiable by listen.  */
@@ -323,6 +326,7 @@ struct ucred {
  #define SOL_CAIF       278
  #define SOL_ALG                279
  #define SOL_NFC                280
+#define SOL_KCM                281
  
  /* IPX options */
  #define IPX_TYPE       1
diff --git a/include/net/kcm.h b/include/net/kcm.h

new file mode 100644 (file)

index 0000000..1bcae39
--- /dev/null
+++ b/include/net/kcm.h
@@ -0,0 +1,125 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __NET_KCM_H_
+#define __NET_KCM_H_
+
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <uapi/linux/kcm.h>
+
+extern unsigned int kcm_net_id;
+
+struct kcm_tx_msg {            /* TX state overlaid on skb->cb, see kcm_tx_msg() */
+       unsigned int sent;      /* nonzero: a send of this message is in progress */
+       unsigned int fragidx;   /* saved fragment index to resume from */
+       unsigned int frag_offset; /* saved fragment offset to resume from */
+       unsigned int msg_flags;
+       struct sk_buff *frag_skb; /* saved skb to resume from */
+       struct sk_buff *last_skb;
+};
+
+struct kcm_rx_msg {            /* RX state kept in skb->cb, see kcm_rx_msg() */
+       int full_len;           /* length returned by the filter; 0 = unknown */
+       int accum_len;          /* bytes accumulated for the message so far */
+       int offset;             /* offset of the message start in the head skb */
+};
+
+/* Socket structure for KCM client sockets. The struct sock must stay the
+ * first member because kcm_sk() converts a struct sock pointer to a
+ * struct kcm_sock pointer with a plain cast.
+ */
+struct kcm_sock {
+       struct sock sk;
+       struct kcm_mux *mux;
+       struct list_head kcm_sock_list;
+       int index;
+       u32 done : 1;           /* makes unreserve_rx_kcm schedule done_work */
+       struct work_struct done_work;
+
+       /* Transmit */
+       struct kcm_psock *tx_psock;     /* psock reserved for sending */
+       struct work_struct tx_work;
+       struct list_head wait_psock_list; /* link on mux->kcm_tx_waiters */
+       struct sk_buff *seq_skb;
+
+       /* Don't use bit fields here, these are set under different locks */
+       bool tx_wait;           /* true while on mux->kcm_tx_waiters */
+       bool tx_wait_more;
+
+       /* Receive */
+       struct kcm_psock *rx_psock;     /* psock that reserved this KCM for RX */
+       struct list_head wait_rx_list; /* KCMs waiting for receiving */
+       bool rx_wait;           /* true while on mux->kcm_rx_waiters */
+       u32 rx_disabled : 1;
+};
+
+struct bpf_prog;
+
+/* Structure for an attached lower socket. 'sk' is the lower socket itself;
+ * the lower socket points back here through sk_user_data (that is how the
+ * psock_tcp_* callbacks find their psock).
+ */
+struct kcm_psock {
+       struct sock *sk;
+       struct kcm_mux *mux;
+       int index;
+
+       u32 tx_stopped : 1;     /* set once by kcm_abort_tx_psock */
+       u32 rx_stopped : 1;     /* set once by kcm_abort_rx_psock */
+       u32 done : 1;           /* with tx_stopped: free in unreserve_psock */
+       u32 unattaching : 1;
+
+       /* NOTE(review): presumably the lower socket's original callbacks,
+        * saved at attach time - the attach code is not in view, confirm.
+        */
+       void (*save_state_change)(struct sock *sk);
+       void (*save_data_ready)(struct sock *sk);
+       void (*save_write_space)(struct sock *sk);
+
+       struct list_head psock_list;
+
+       /* Receive */
+       struct sk_buff *rx_skb_head;    /* message currently being assembled */
+       struct sk_buff **rx_skb_nextp;  /* where the next skb gets linked */
+       struct sk_buff *ready_rx_msg;   /* complete message awaiting a KCM */
+       struct list_head psock_ready_list; /* link on mux->psocks_ready */
+       struct work_struct rx_work;
+       struct delayed_work rx_delayed_work;
+       struct bpf_prog *bpf_prog;      /* run to get the message length */
+       struct kcm_sock *rx_kcm;        /* KCM reserved for receive */
+
+       /* Transmit */
+       struct kcm_sock *tx_kcm;        /* KCM that reserved this psock */
+       struct list_head psock_avail_list; /* link on mux->psocks_avail */
+};
+
+/* Per net MUX list.
+ * NOTE(review): the users of this structure are outside the visible part of
+ * the patch; presumably 'mutex' protects 'mux_list' and 'count' - confirm.
+ */
+struct kcm_net {
+       struct mutex mutex;
+       struct list_head mux_list;
+       int count;
+};
+
+/* Structure for a MUX. The receive-side lists are manipulated under
+ * 'rx_lock' and the transmit-side lists under 'lock'; each lock starts its
+ * own cache line.
+ */
+struct kcm_mux {
+       struct list_head kcm_mux_list;
+       struct rcu_head rcu;
+       struct kcm_net *knet;
+
+       struct list_head kcm_socks;     /* All KCM sockets on MUX */
+       int kcm_socks_cnt;              /* Total KCM socket count for MUX */
+       struct list_head psocks;        /* List of all psocks on MUX */
+       int psocks_cnt;         /* Total attached sockets */
+
+       /* Receive */
+       spinlock_t rx_lock ____cacheline_aligned_in_smp;
+       struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
+       struct list_head psocks_ready;  /* List of psocks with a msg ready */
+       struct sk_buff_head rx_hold_queue; /* Requeued msgs with no waiting KCM */
+
+       /* Transmit */
+       spinlock_t  lock ____cacheline_aligned_in_smp;  /* TX and mux locking */
+       struct list_head psocks_avail;  /* List of available psocks */
+       struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
+};
+
+#endif /* __NET_KCM_H_ */
diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h

new file mode 100644 (file)

index 0000000..a5a5309
--- /dev/null
+++ b/include/uapi/linux/kcm.h
@@ -0,0 +1,40 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * User API to clone KCM sockets and attach transport socket to a KCM
+ * multiplexor.
+ */
+
+#ifndef KCM_KERNEL_H
+#define KCM_KERNEL_H
+
+struct kcm_attach {    /* presumably the SIOCKCMATTACH argument - confirm */
+       int fd;         /* presumably the transport socket to attach - confirm */
+       int bpf_fd;     /* presumably the BPF program fd - confirm */
+};
+
+struct kcm_unattach {  /* presumably the SIOCKCMUNATTACH argument - confirm */
+       int fd;         /* presumably the transport socket to detach - confirm */
+};
+
+struct kcm_clone {     /* presumably the SIOCKCMCLONE argument - confirm */
+       int fd;         /* presumably the fd of the cloned KCM socket - confirm */
+};
+
+#define SIOCKCMATTACH  (SIOCPROTOPRIVATE + 0)
+#define SIOCKCMUNATTACH        (SIOCPROTOPRIVATE + 1)
+#define SIOCKCMCLONE   (SIOCPROTOPRIVATE + 2)
+
+#define KCMPROTO_CONNECTED     0
+
+/* Socket options */
+#define KCM_RECV_DISABLE       1
+
+#endif
+
diff --git a/net/Kconfig b/net/Kconfig

index 2760825e53fa4e2404571d34c98c965e2b46b9cb..10640d5f8beefd84d41b8d3eff2758c3757e9f6b 100644 (file)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig"
  source "net/irda/Kconfig"
  source "net/bluetooth/Kconfig"
  source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
  
  config FIB_RULES
         bool
diff --git a/net/Makefile b/net/Makefile

index a5d04098dfce8693c46785f6c0dfa90bd23f4187..81d14119eab5416e8576f1fffa5109d752784143 100644 (file)
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA)            += irda/
  obj-$(CONFIG_BT)               += bluetooth/
  obj-$(CONFIG_SUNRPC)           += sunrpc/
  obj-$(CONFIG_AF_RXRPC)         += rxrpc/
+obj-$(CONFIG_AF_KCM)           += kcm/
  obj-$(CONFIG_ATM)              += atm/
  obj-$(CONFIG_L2TP)             += l2tp/
  obj-$(CONFIG_DECNET)           += decnet/
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig

new file mode 100644 (file)

index 0000000..5db94d9
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+       tristate "KCM sockets"
+       depends on INET
+       select BPF_SYSCALL
+       ---help---
+         KCM (Kernel Connection Multiplexor) sockets provide a method
+         for multiplexing messages of a message based application
+         protocol over kernel connections (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile

new file mode 100644 (file)

index 0000000..cb525f7
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c

new file mode 100644 (file)

index 0000000..30ef69a
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+unsigned int kcm_net_id;
+
+static struct kmem_cache *kcm_psockp __read_mostly;
+static struct kmem_cache *kcm_muxp __read_mostly;
+static struct workqueue_struct *kcm_wq;
+
+/* Convert a struct sock pointer into the KCM socket that embeds it. This is
+ * a plain cast, which works because 'sk' is the first member of
+ * struct kcm_sock.
+ */
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+       struct kcm_sock *kcm = (struct kcm_sock *)sk;
+
+       return kcm;
+}
+
+/* Transmit-side per-message state; it lives at the start of skb->cb. */
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+       void *cb = skb->cb;
+
+       return cb;
+}
+
+/* Receive-side per-message state; it lives in skb->cb at the offset of the
+ * 'data' member of struct qdisc_skb_cb.
+ */
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+       char *cb = (char *)skb->cb;
+
+       cb += offsetof(struct qdisc_skb_cb, data);
+
+       return (struct kcm_rx_msg *)cb;
+}
+
+static void report_csk_error(struct sock *csk, int err)
+{
+       csk->sk_err = EPIPE;            /* NOTE(review): 'err' is not used; the
+                                        * lower socket always gets EPIPE no
+                                        * matter what the caller passed in -
+                                        * confirm this is intended.
+                                        */
+       csk->sk_error_report(csk);
+}
+
+/* Stop receive processing on a psock after an unrecoverable receive error
+ * and report an error on the lower socket. Does nothing if receive was
+ * already stopped. The 'skb' argument is not touched here.
+ * Callback lock held.
+ */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+                              struct sk_buff *skb)
+{
+       struct sock *csk = psock->sk;
+
+       /* Unrecoverable error in receive */
+
+       if (psock->rx_stopped)
+               return;
+
+       psock->rx_stopped = 1;
+
+       /* Report an error on the lower socket */
+       report_csk_error(csk, err);
+}
+
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+                              bool wakeup_kcm)
+{
+       struct sock *csk = psock->sk;
+       struct kcm_mux *mux = psock->mux;
+
+       /* Unrecoverable error in transmit: mark the psock tx_stopped (only
+        * once, under the mux lock), take it out of circulation and then
+        * report an error on the lower socket.
+        */
+
+       spin_lock_bh(&mux->lock);
+
+       if (psock->tx_stopped) {
+               spin_unlock_bh(&mux->lock);
+               return;
+       }
+
+       psock->tx_stopped = 1;
+
+       if (!psock->tx_kcm) {
+               /* Take off psocks_avail list */
+               list_del(&psock->psock_avail_list);
+       } else if (wakeup_kcm) {
+               /* In this case psock is being aborted while outside of
+                * write_msgs and psock is reserved. Schedule tx_work
+                * to handle the failure there. Need to commit tx_stopped
+                * before queuing work.
+                */
+               smp_mb();
+
+               queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+       }
+
+       spin_unlock_bh(&mux->lock);
+
+       /* Report error on lower socket */
+       report_csk_error(csk, err);
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ *
+ * Messages on the mux hold queue are delivered first, then the ready message
+ * of each psock on psocks_ready. If everything could be queued the KCM is
+ * added to the list of KCMs waiting to receive. Nothing is done if the KCM
+ * is already waiting, is reserved by a psock, or has receive disabled.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+       struct kcm_mux *mux = kcm->mux;
+       struct kcm_psock *psock;
+       struct sk_buff *skb;
+
+       if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+               return;
+
+       while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+               if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+                       /* Assuming buffer limit has been reached; put the
+                        * message back at the front of the hold queue.
+                        */
+                       skb_queue_head(&mux->rx_hold_queue, skb);
+                       WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+                       return;
+               }
+       }
+
+       while (!list_empty(&mux->psocks_ready)) {
+               psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+                                        psock_ready_list);
+
+               if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+                       /* Assuming buffer limit has been reached; the psock
+                        * stays on the ready list with its message.
+                        */
+                       WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+                       return;
+               }
+
+               /* Consumed the ready message on the psock. Schedule rx_work to
+                * get more messages.
+                */
+               list_del(&psock->psock_ready_list);
+               psock->ready_rx_msg = NULL;
+
+               /* Commit clearing of ready_rx_msg for queuing work */
+               smp_mb();
+
+               queue_work(kcm_wq, &psock->rx_work);
+       }
+
+       /* Buffer limit is okay now, add to the list of waiting KCMs */
+       list_add_tail(&kcm->wait_rx_list,
+                     &kcm->mux->kcm_rx_waiters);
+       kcm->rx_wait = true;
+}
+
+/* Destructor for skbs on a KCM socket receive queue (installed by
+ * kcm_queue_rcv_skb). Gives the skb's truesize back to the socket's receive
+ * accounting and, if the KCM is neither waiting nor reserved and has dropped
+ * below sk_rcvlowat, makes it ready to receive again.
+ */
+static void kcm_rfree(struct sk_buff *skb)
+{
+       struct sock *sk = skb->sk;
+       struct kcm_sock *kcm = kcm_sk(sk);
+       struct kcm_mux *mux = kcm->mux;
+       unsigned int truesize = skb->truesize;
+
+       sk_mem_uncharge(sk, truesize);
+       atomic_sub(truesize, &sk->sk_rmem_alloc);
+
+       /* Order the accounting update before the lockless reads of rx_wait
+        * and rx_psock below.
+        */
+       smp_mb__after_atomic();
+
+       if (kcm->rx_wait || kcm->rx_psock)
+               return;
+
+       if (sk_rmem_alloc_get(sk) >= sk->sk_rcvlowat)
+               return;
+
+       spin_lock_bh(&mux->rx_lock);
+       kcm_rcv_ready(kcm);
+       spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Queue a message on a socket's receive queue and charge it to that socket.
+ * Returns 0 on success, -ENOMEM if the receive buffer is already full, or
+ * -ENOBUFS if receive memory could not be scheduled. On failure the skb is
+ * left untouched and still belongs to the caller.
+ */
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+       if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+               return -ENOMEM;
+       if (!sk_rmem_schedule(sk, skb, skb->truesize))
+               return -ENOBUFS;
+
+       /* Detach the skb from its previous owner and take over accounting;
+        * kcm_rfree undoes the charge when the skb is released.
+        */
+       skb->dev = NULL;
+       skb_orphan(skb);
+       skb->sk = sk;
+       skb->destructor = kcm_rfree;
+       atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+       sk_mem_charge(sk, skb->truesize);
+
+       skb_queue_tail(&sk->sk_receive_queue, skb);
+
+       if (sock_flag(sk, SOCK_DEAD))
+               return 0;
+
+       sk->sk_data_ready(sk);
+       return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called when a kcm socket is receive disabled. Each message goes to the
+ * first waiting KCM that accepts it; if no KCM is waiting it is put on the
+ * mux hold queue.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+       struct sk_buff *skb;
+       struct kcm_sock *kcm;
+
+       while ((skb = __skb_dequeue(head))) {
+               /* Reset destructor to avoid calling kcm_rcv_ready */
+               skb->destructor = sock_rfree;
+               skb_orphan(skb);
+try_again:
+               if (list_empty(&mux->kcm_rx_waiters)) {
+                       skb_queue_tail(&mux->rx_hold_queue, skb);
+                       continue;
+               }
+
+               kcm = list_first_entry(&mux->kcm_rx_waiters,
+                                      struct kcm_sock, wait_rx_list);
+
+               if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+                       /* Should mean socket buffer full; this KCM stops
+                        * waiting and the next waiter is tried.
+                        */
+                       list_del(&kcm->wait_rx_list);
+                       kcm->rx_wait = false;
+
+                       /* Commit rx_wait to read in kcm_rfree */
+                       smp_wmb();
+
+                       goto try_again;
+               }
+       }
+}
+
+/* Reserve a KCM socket to receive the message 'head' from this psock.
+ * Returns the KCM already reserved by the psock if there is one, otherwise
+ * the first KCM waiting on the mux. If no KCM is waiting, the message is
+ * parked in psock->ready_rx_msg, the psock is put on the mux psocks_ready
+ * list and NULL is returned.
+ * Lower sock lock held.
+ */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+                                      struct sk_buff *head)
+{
+       struct kcm_mux *mux = psock->mux;
+       struct kcm_sock *kcm;
+
+       WARN_ON(psock->ready_rx_msg);
+
+       if (psock->rx_kcm)
+               return psock->rx_kcm;
+
+       spin_lock_bh(&mux->rx_lock);
+
+       /* Check again now that the RX mux lock is held */
+       if (psock->rx_kcm) {
+               spin_unlock_bh(&mux->rx_lock);
+               return psock->rx_kcm;
+       }
+
+       if (list_empty(&mux->kcm_rx_waiters)) {
+               psock->ready_rx_msg = head;
+               list_add_tail(&psock->psock_ready_list,
+                             &mux->psocks_ready);
+               spin_unlock_bh(&mux->rx_lock);
+               return NULL;
+       }
+
+       kcm = list_first_entry(&mux->kcm_rx_waiters,
+                              struct kcm_sock, wait_rx_list);
+       list_del(&kcm->wait_rx_list);
+       kcm->rx_wait = false;
+
+       psock->rx_kcm = kcm;
+       kcm->rx_psock = psock;
+
+       spin_unlock_bh(&mux->rx_lock);
+
+       return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+/* Work handler: run kcm_done() on the KCM socket that embeds this work. */
+static void kcm_done_work(struct work_struct *w)
+{
+       struct kcm_sock *kcm = container_of(w, struct kcm_sock, done_work);
+
+       kcm_done(kcm);
+}
+
+/* Drop the psock's receive reservation on a KCM, if it has one. If the KCM
+ * is marked done, kcm_done is deferred to a work item. Otherwise, pending
+ * messages are requeued to other KCMs when receive is disabled on this KCM,
+ * or the KCM is made ready to receive again when 'rcv_ready' is set or its
+ * receive allocation is zero.
+ * Lower sock held.
+ */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+                            bool rcv_ready)
+{
+       struct kcm_sock *kcm = psock->rx_kcm;
+       struct kcm_mux *mux = psock->mux;
+
+       if (!kcm)
+               return;
+
+       spin_lock_bh(&mux->rx_lock);
+
+       psock->rx_kcm = NULL;
+       kcm->rx_psock = NULL;
+
+       /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+        * kcm_rfree
+        */
+       smp_mb();
+
+       if (unlikely(kcm->done)) {
+               spin_unlock_bh(&mux->rx_lock);
+
+               /* Need to run kcm_done in a task since we need to acquire
+                * callback locks which may already be held here.
+                */
+               INIT_WORK(&kcm->done_work, kcm_done_work);
+               schedule_work(&kcm->done_work);
+               return;
+       }
+
+       if (unlikely(kcm->rx_disabled)) {
+               requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+       } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+               /* Check for degenerative race with rx_wait that all
+                * data was dequeued (accounted for in kcm_rfree).
+                */
+               kcm_rcv_ready(kcm);
+       }
+       spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Macro to invoke filter function: calls prog->bpf_func with the context and
+ * the program's instructions. The arguments are parenthesized so that any
+ * expression can be passed safely; note that 'prog' is evaluated twice.
+ */
+#define KCM_RUN_FILTER(prog, ctx) \
+       (*(prog)->bpf_func)((ctx), (prog)->insnsi)
+
+/* Receive actor passed to tcp_read_sock() for a lower socket. Appends the
+ * incoming data to the message being assembled on the psock, runs the psock's
+ * BPF program to learn the message length, and queues each completed message
+ * to a KCM socket (or leaves it parked on the psock when no KCM is waiting).
+ *
+ * Returns the number of bytes consumed from orig_skb. Errors are reported
+ * through desc->error. Every skb cloned here is either linked into a message
+ * or freed before returning.
+ *
+ * Lower socket lock held.
+ */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+                       unsigned int orig_offset, size_t orig_len)
+{
+       struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+       struct kcm_rx_msg *rxm;
+       struct kcm_sock *kcm;
+       struct sk_buff *head, *skb;
+       size_t eaten = 0, cand_len;
+       ssize_t extra;
+       int err;
+       bool cloned_orig = false;
+
+       /* A completed message is still waiting for a KCM; take nothing */
+       if (psock->ready_rx_msg)
+               return 0;
+
+       head = psock->rx_skb_head;
+       if (head) {
+               /* Message already in progress */
+
+               if (unlikely(orig_offset)) {
+                       /* Getting data with a non-zero offset when a message is
+                        * in progress is not expected. If it does happen, we
+                        * need to clone and pull since we can't deal with
+                        * offsets in the skbs for a message except in the head.
+                        */
+                       orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+                       if (!orig_skb) {
+                               desc->error = -ENOMEM;
+                               return 0;
+                       }
+                       if (!pskb_pull(orig_skb, orig_offset)) {
+                               kfree_skb(orig_skb);
+                               desc->error = -ENOMEM;
+                               return 0;
+                       }
+                       cloned_orig = true;
+                       orig_offset = 0;
+               }
+
+               if (!psock->rx_skb_nextp) {
+                       /* We are going to append to the frags_list of head.
+                        * Need to unshare the frag_list.
+                        */
+                       err = skb_unclone(head, GFP_ATOMIC);
+                       if (err) {
+                               desc->error = err;
+                               /* Leave through 'out' so that a private
+                                * clone of orig_skb is not leaked.
+                                */
+                               goto out;
+                       }
+
+                       if (unlikely(skb_shinfo(head)->frag_list)) {
+                               /* We can't append to an sk_buff that already
+                                * has a frag_list. We create a new head, point
+                                * the frag_list of that to the old head, and
+                                * then are able to use the old head->next for
+                                * appending to the message.
+                                */
+                               if (WARN_ON(head->next)) {
+                                       desc->error = -EINVAL;
+                                       goto out;
+                               }
+
+                               skb = alloc_skb(0, GFP_ATOMIC);
+                               if (!skb) {
+                                       desc->error = -ENOMEM;
+                                       goto out;
+                               }
+                               skb->len = head->len;
+                               skb->data_len = head->len;
+                               skb->truesize = head->truesize;
+                               *kcm_rx_msg(skb) = *kcm_rx_msg(head);
+                               psock->rx_skb_nextp = &head->next;
+                               skb_shinfo(skb)->frag_list = head;
+                               psock->rx_skb_head = skb;
+                               head = skb;
+                       } else {
+                               psock->rx_skb_nextp =
+                                   &skb_shinfo(head)->frag_list;
+                       }
+               }
+       }
+
+       while (eaten < orig_len) {
+               /* Always clone since we will consume something */
+               skb = skb_clone(orig_skb, GFP_ATOMIC);
+               if (!skb) {
+                       desc->error = -ENOMEM;
+                       break;
+               }
+
+               cand_len = orig_len - eaten;
+
+               head = psock->rx_skb_head;
+               if (!head) {
+                       head = skb;
+                       psock->rx_skb_head = head;
+                       /* Will set rx_skb_nextp on next packet if needed */
+                       psock->rx_skb_nextp = NULL;
+                       rxm = kcm_rx_msg(head);
+                       memset(rxm, 0, sizeof(*rxm));
+                       rxm->offset = orig_offset + eaten;
+               } else {
+                       /* Unclone since we may be appending to an skb that we
+                        * already share a frag_list with.
+                        */
+                       err = skb_unclone(skb, GFP_ATOMIC);
+                       if (err) {
+                               /* The clone is not linked into the message
+                                * yet, so it has to be freed here.
+                                */
+                               kfree_skb(skb);
+                               desc->error = err;
+                               break;
+                       }
+
+                       rxm = kcm_rx_msg(head);
+                       *psock->rx_skb_nextp = skb;
+                       psock->rx_skb_nextp = &skb->next;
+                       head->data_len += skb->len;
+                       head->len += skb->len;
+                       head->truesize += skb->truesize;
+               }
+
+               if (!rxm->full_len) {
+                       ssize_t len;
+
+                       len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+                       if (!len) {
+                               /* Need more header to determine length */
+                               rxm->accum_len += cand_len;
+                               eaten += cand_len;
+                               WARN_ON(eaten != orig_len);
+                               break;
+                       } else if (len <= (ssize_t)head->len -
+                                         skb->len - rxm->offset) {
+                               /* Length must be into new skb (and also
+                                * greater than zero)
+                                */
+                               desc->error = -EPROTO;
+                               psock->rx_skb_head = NULL;
+                               kcm_abort_rx_psock(psock, EPROTO, head);
+                               /* The psock no longer references the partial
+                                * message and kcm_abort_rx_psock does not
+                                * free it, so drop it here (this also frees
+                                * the skbs linked to it, including 'skb').
+                                */
+                               kfree_skb(head);
+                               break;
+                       }
+
+                       rxm->full_len = len;
+               }
+
+               extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+               if (extra < 0) {
+                       /* Message not complete yet. */
+                       rxm->accum_len += cand_len;
+                       eaten += cand_len;
+                       WARN_ON(eaten != orig_len);
+                       break;
+               }
+
+               /* Positive extra indicates more bytes than needed for the
+                * message
+                */
+
+               WARN_ON(extra > cand_len);
+
+               eaten += (cand_len - extra);
+
+               /* Hurray, we have a new message! */
+               psock->rx_skb_head = NULL;
+
+try_queue:
+               kcm = reserve_rx_kcm(psock, head);
+               if (!kcm) {
+                       /* Unable to reserve a KCM, message is held in psock. */
+                       break;
+               }
+
+               if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+                       /* Should mean socket buffer full */
+                       unreserve_rx_kcm(psock, false);
+                       goto try_queue;
+               }
+       }
+
+out:
+       if (cloned_orig)
+               kfree_skb(orig_skb);
+
+       return eaten;
+}
+
+/* Read from the lower socket with tcp_read_sock(), using kcm_tcp_recv as the
+ * receive actor, then drop any receive reservation the psock holds on a KCM
+ * (letting that KCM become ready to receive again). Returns desc.error as
+ * left by kcm_tcp_recv, i.e. 0 when no error was recorded.
+ * Called with lock held on lower socket.
+ */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+       read_descriptor_t desc;
+
+       desc.arg.data = psock;
+       desc.error = 0;
+       desc.count = 1; /* give more than one skb per call */
+
+       /* sk should be locked here, so okay to do tcp_read_sock */
+       tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
+
+       unreserve_rx_kcm(psock, true);
+
+       return desc.error;
+}
+
+/* Data-ready handler for a lower socket (presumably installed as its
+ * sk_data_ready at attach time - the attach code is not in view). Looks up
+ * the psock through sk_user_data under the callback read lock and reads the
+ * pending data, unless there is no psock, receive is stopped, or a completed
+ * message is still waiting for a KCM. If reading fails with -ENOMEM the read
+ * is retried from the rx_delayed_work item, queued with no delay.
+ * Lower sock lock held.
+ */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+       struct kcm_psock *psock;
+
+       read_lock_bh(&sk->sk_callback_lock);
+
+       psock = (struct kcm_psock *)sk->sk_user_data;
+       if (unlikely(!psock || psock->rx_stopped))
+               goto out;
+
+       if (psock->ready_rx_msg)
+               goto out;
+
+       if (psock_tcp_read_sock(psock) == -ENOMEM)
+               queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+       read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/* Common body of psock_rx_work and psock_rx_delayed_work: read and process
+ * pending data on the psock's lower socket. Nothing is done if the lower
+ * socket no longer points at this psock, if receive is stopped, or if a
+ * completed message is still waiting for a KCM. If reading fails with
+ * -ENOMEM, the delayed work is queued (with no delay) to try again.
+ */
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+       struct sock *csk = psock->sk;
+
+       /* We need the read lock to synchronize with psock_tcp_data_ready. We
+        * need the socket lock for calling tcp_read_sock.
+        */
+       lock_sock(csk);
+       read_lock_bh(&csk->sk_callback_lock);
+
+       if (unlikely(csk->sk_user_data != psock))
+               goto out;
+
+       if (unlikely(psock->rx_stopped))
+               goto out;
+
+       if (psock->ready_rx_msg)
+               goto out;
+
+       /* psock_tcp_read_sock sets up its own read descriptor */
+       if (psock_tcp_read_sock(psock) == -ENOMEM)
+               queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+       read_unlock_bh(&csk->sk_callback_lock);
+       release_sock(csk);
+}
+
+/* Work handler for psock->rx_work. */
+static void psock_rx_work(struct work_struct *w)
+{
+       struct kcm_psock *psock = container_of(w, struct kcm_psock, rx_work);
+
+       do_psock_rx_work(psock);
+}
+
+/* Work handler for psock->rx_delayed_work. */
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+       struct kcm_psock *psock;
+
+       psock = container_of(w, struct kcm_psock, rx_delayed_work.work);
+       do_psock_rx_work(psock);
+}
+
+static void psock_tcp_state_change(struct sock *sk)
+{
+       /* TCP only does a POLLIN for a half close. Do a POLLHUP here
+        * since application will normally not poll with POLLIN
+        * on the TCP sockets.
+        *
+        * What the code does is report an error on the socket through
+        * report_csk_error(), which sets sk_err to EPIPE and invokes the
+        * socket's sk_error_report callback.
+        */
+
+       report_csk_error(sk, EPIPE);
+}
+
+/* Write-space handler for a lower socket: if a KCM has this psock reserved
+ * for transmit, kick that KCM's tx_work. Does nothing when the socket has
+ * no psock in sk_user_data.
+ */
+static void psock_tcp_write_space(struct sock *sk)
+{
+       struct kcm_psock *psock;
+       struct kcm_sock *sender;
+
+       read_lock_bh(&sk->sk_callback_lock);
+
+       psock = (struct kcm_psock *)sk->sk_user_data;
+       if (likely(psock)) {
+               struct kcm_mux *mux = psock->mux;
+
+               spin_lock_bh(&mux->lock);
+
+               /* A reserved psock means a KCM is waiting to send on it */
+               sender = psock->tx_kcm;
+               if (sender)
+                       queue_work(kcm_wq, &sender->tx_work);
+
+               spin_unlock_bh(&mux->lock);
+       }
+
+       read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* Reserve a psock for transmitting from this KCM. Returns the psock already
+ * reserved by the KCM if it is still usable, otherwise the first psock on
+ * the mux available list. If none is available, the KCM is put on the list
+ * of KCMs waiting for a TX psock and NULL is returned.
+ * kcm sock is locked.
+ */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+       struct kcm_mux *mux = kcm->mux;
+       struct kcm_psock *psock;
+
+       psock = kcm->tx_psock;
+
+       smp_rmb(); /* Must read tx_psock before tx_wait */
+
+       if (psock) {
+               WARN_ON(kcm->tx_wait);
+               if (unlikely(psock->tx_stopped))
+                       unreserve_psock(kcm);
+               else
+                       return kcm->tx_psock;
+       }
+
+       spin_lock_bh(&mux->lock);
+
+       /* Check again under lock to see if a psock was reserved for this
+        * kcm in the meantime via unreserve_psock (psock_now_avail).
+        */
+       psock = kcm->tx_psock;
+       if (unlikely(psock)) {
+               WARN_ON(kcm->tx_wait);
+               spin_unlock_bh(&mux->lock);
+               return kcm->tx_psock;
+       }
+
+       if (!list_empty(&mux->psocks_avail)) {
+               psock = list_first_entry(&mux->psocks_avail,
+                                        struct kcm_psock,
+                                        psock_avail_list);
+               list_del(&psock->psock_avail_list);
+               if (kcm->tx_wait) {
+                       list_del(&kcm->wait_psock_list);
+                       kcm->tx_wait = false;
+               }
+               kcm->tx_psock = psock;
+               psock->tx_kcm = kcm;
+       } else if (!kcm->tx_wait) {
+               list_add_tail(&kcm->wait_psock_list,
+                             &mux->kcm_tx_waiters);
+               kcm->tx_wait = true;
+       }
+
+       spin_unlock_bh(&mux->lock);
+
+       return psock;
+}
+
+/* A psock has become free for transmit. If no KCM is waiting for a TX psock
+ * it goes on the mux available list; otherwise it is handed to the first
+ * waiting KCM and that KCM's tx_work is queued.
+ * mux lock held
+ */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+       struct kcm_mux *mux = psock->mux;
+       struct kcm_sock *kcm;
+
+       if (list_empty(&mux->kcm_tx_waiters)) {
+               list_add_tail(&psock->psock_avail_list,
+                             &mux->psocks_avail);
+       } else {
+               kcm = list_first_entry(&mux->kcm_tx_waiters,
+                                      struct kcm_sock,
+                                      wait_psock_list);
+               list_del(&kcm->wait_psock_list);
+               kcm->tx_wait = false;
+               psock->tx_kcm = kcm;
+
+               /* Commit before changing tx_psock since that is read in
+                * reserve_psock before queuing work.
+                */
+               smp_mb();
+
+               kcm->tx_psock = psock;
+               queue_work(kcm_wq, &kcm->tx_work);
+       }
+}
+
+/* Release the psock this KCM has reserved for transmit. A psock whose
+ * transmit side has been stopped is not made available again, and is freed
+ * here if it was also marked done. Otherwise the psock is passed on through
+ * psock_now_avail(). Warns and returns if no psock is reserved.
+ * kcm sock is locked.
+ */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+       struct kcm_psock *psock;
+       struct kcm_mux *mux = kcm->mux;
+
+       spin_lock_bh(&mux->lock);
+
+       psock = kcm->tx_psock;
+
+       if (WARN_ON(!psock)) {
+               spin_unlock_bh(&mux->lock);
+               return;
+       }
+
+       smp_rmb(); /* Read tx_psock before tx_wait */
+
+       WARN_ON(kcm->tx_wait);
+
+       kcm->tx_psock = NULL;
+       psock->tx_kcm = NULL;
+
+       if (unlikely(psock->tx_stopped)) {
+               if (psock->done) {
+                       /* Deferred free: take the psock off the mux and drop
+                        * the references held on the lower socket and its
+                        * file.
+                        */
+                       list_del(&psock->psock_list);
+                       mux->psocks_cnt--;
+                       sock_put(psock->sk);
+                       fput(psock->sk->sk_socket->file);
+                       kmem_cache_free(kcm_psockp, psock);
+               }
+
+               /* Don't put back on available list */
+
+               spin_unlock_bh(&mux->lock);
+
+               return;
+       }
+
+       psock_now_avail(psock);
+
+       spin_unlock_bh(&mux->lock);
+}
+
+/* Write any messages ready on the kcm socket.  Called with kcm sock lock
+ * held.  Return bytes actually sent or error.
+ *
+ * Messages are taken from the head of sk_write_queue and pushed, fragment
+ * by fragment, to the reserved psock's socket with kernel_sendpage().  On
+ * -EAGAIN the position within the message is saved in its kcm_tx_msg so
+ * that a later call can resume there.  On any other send failure
+ * (including a zero return) the psock is aborted and unreserved and the
+ * message is retried from the beginning on a newly reserved psock.  If no
+ * psock can be reserved the messages are left queued.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+       struct sock *sk = &kcm->sk;
+       struct kcm_psock *psock;
+       struct sk_buff *skb, *head;
+       struct kcm_tx_msg *txm;
+       unsigned short fragidx, frag_offset;
+       unsigned int sent, total_sent = 0;
+       int ret = 0;
+
+       kcm->tx_wait_more = false;
+       psock = kcm->tx_psock;
+       if (unlikely(psock && psock->tx_stopped)) {
+               /* A reserved psock was aborted asynchronously. Unreserve
+                * it and we'll retry the message.  Clearing ->sent on the
+                * head message below makes it restart from the beginning.
+                */
+               unreserve_psock(kcm);
+               if (skb_queue_empty(&sk->sk_write_queue))
+                       return 0;
+
+               kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+       } else if (skb_queue_empty(&sk->sk_write_queue)) {
+               return 0;
+       }
+
+       head = skb_peek(&sk->sk_write_queue);
+       txm = kcm_tx_msg(head);
+
+       if (txm->sent) {
+               /* Send of first skbuff in queue already in progress:
+                * restore the saved position and jump straight back into
+                * the fragment loop.  This requires a reserved psock.
+                */
+               if (WARN_ON(!psock)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               sent = txm->sent;
+               frag_offset = txm->frag_offset;
+               fragidx = txm->fragidx;
+               skb = txm->frag_skb;
+
+               goto do_frag;
+       }
+
+try_again:
+       psock = reserve_psock(kcm);
+       if (!psock)
+               goto out;
+
+       do {
+               skb = head;
+               txm = kcm_tx_msg(head);
+               sent = 0;
+
+do_frag_list:
+               if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+                    fragidx++) {
+                       skb_frag_t *frag;
+
+                       frag_offset = 0;
+do_frag:
+                       frag = &skb_shinfo(skb)->frags[fragidx];
+                       if (WARN_ON(!frag->size)) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       ret = kernel_sendpage(psock->sk->sk_socket,
+                                             frag->page.p,
+                                             frag->page_offset + frag_offset,
+                                             frag->size - frag_offset,
+                                             MSG_DONTWAIT);
+                       if (ret <= 0) {
+                               if (ret == -EAGAIN) {
+                                       /* Save state to try again when there's
+                                        * write space on the socket
+                                        */
+                                       txm->sent = sent;
+                                       txm->frag_offset = frag_offset;
+                                       txm->fragidx = fragidx;
+                                       txm->frag_skb = skb;
+
+                                       ret = 0;
+                                       goto out;
+                               }
+
+                               /* Hard failure in sending message, abort this
+                                * psock since it has lost framing
+                                * synchronization and retry sending the
+                                * message from the beginning.
+                                */
+                               kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+                                                  true);
+                               unreserve_psock(kcm);
+
+                               txm->sent = 0;
+                               ret = 0;
+
+                               goto try_again;
+                       }
+
+                       sent += ret;
+                       frag_offset += ret;
+                       if (frag_offset < frag->size) {
+                               /* Not finished with this frag */
+                               goto do_frag;
+                       }
+               }
+
+               if (skb == head) {
+                       if (skb_has_frag_list(skb)) {
+                               skb = skb_shinfo(skb)->frag_list;
+                               goto do_frag_list;
+                       }
+               } else if (skb->next) {
+                       skb = skb->next;
+                       goto do_frag_list;
+               }
+
+               /* Successfully sent the whole packet, account for it. */
+               skb_dequeue(&sk->sk_write_queue);
+               kfree_skb(head);
+               sk->sk_wmem_queued -= sent;
+               total_sent += sent;
+       } while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+       if (!head) {
+               /* Done with all queued messages.  head is only NULL here
+                * when the loop above drained the write queue, so the
+                * psock is no longer needed.
+                */
+               WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+               unreserve_psock(kcm);
+       }
+
+       /* Check if write space is available */
+       sk->sk_write_space(sk);
+
+       return total_sent ? : ret;
+}
+
+/* Work item that transmits queued messages for a kcm socket.  It takes
+ * the sock lock, writes what it can and, when the write did not fail,
+ * clears a pending SOCK_NOSPACE indication and calls sk_write_space.
+ */
+static void kcm_tx_work(struct work_struct *w)
+{
+       struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+       struct sock *sk = &kcm->sk;
+       int rc;
+
+       lock_sock(sk);
+
+       /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+        * aborts
+        */
+       rc = kcm_write_msgs(kcm);
+       if (rc >= 0) {
+               /* Primarily for SOCK_SEQPACKET sockets */
+               if (likely(sk->sk_socket) &&
+                   test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+                       clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+                       sk->sk_write_space(sk);
+               }
+       } else {
+               /* Hard failure in write, report error on KCM socket */
+               pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", rc);
+               report_csk_error(&kcm->sk, -rc);
+       }
+
+       release_sock(sk);
+}
+
+/* Flush messages that were held back (tx_wait_more set); a no-op
+ * otherwise.  The result of the write is ignored.
+ */
+static void kcm_push(struct kcm_sock *kcm)
+{
+       if (!kcm->tx_wait_more)
+               return;
+
+       kcm_write_msgs(kcm);
+}
+
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+       struct sock *sk = sock->sk;
+       struct kcm_sock *kcm = kcm_sk(sk);
+       struct sk_buff *skb = NULL, *head = NULL;
+       size_t copy, copied = 0;
+       long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+       int eor = (sock->type == SOCK_DGRAM) ?
+                 !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+       int err = -EPIPE;
+
+       lock_sock(sk);
+
+       /* Overview: copy the caller's data into page fragments of a new
+        * message, or of a message left open by an earlier call, and
+        * queue the message for transmit once it is complete.  Returns
+        * the number of bytes copied, or a negative error.
+        *
+        * Per tcp_sendmsg this should be in poll
+        */
+       sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+       if (sk->sk_err)
+               goto out_error;
+
+       if (kcm->seq_skb) {
+               /* Previously opened message: continue filling it, starting
+                * at the last skb the earlier call was using.
+                */
+               head = kcm->seq_skb;
+               skb = kcm_tx_msg(head)->last_skb;
+               goto start;
+       }
+
+       /* Call the sk_stream functions to manage the sndbuf mem.  When no
+        * send buffer space is free, push held messages and wait.
+        */
+       if (!sk_stream_memory_free(sk)) {
+               kcm_push(kcm);
+               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err)
+                       goto out_error;
+       }
+
+       /* New message, alloc head skb.  While the allocation fails, push
+        * held messages and wait for memory before trying again.
+        */
+       head = alloc_skb(0, sk->sk_allocation);
+       while (!head) {
+               kcm_push(kcm);
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err)
+                       goto out_error;
+
+               head = alloc_skb(0, sk->sk_allocation);
+       }
+
+       skb = head;
+
+       /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+        * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+        */
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+       while (msg_data_left(msg)) {
+               bool merge = true;
+               int i = skb_shinfo(skb)->nr_frags;
+               struct page_frag *pfrag = sk_page_frag(sk);
+
+               if (!sk_page_frag_refill(sk, pfrag))
+                       goto wait_for_memory;
+
+               if (!skb_can_coalesce(skb, i, pfrag->page,
+                                     pfrag->offset)) {
+                       if (i == MAX_SKB_FRAGS) {
+                               struct sk_buff *tskb;
+
+                               tskb = alloc_skb(0, sk->sk_allocation);
+                               if (!tskb)
+                                       goto wait_for_memory;
+
+                               if (head == skb)
+                                       skb_shinfo(head)->frag_list = tskb;
+                               else
+                                       skb->next = tskb;
+
+                               skb = tskb;
+                               skb->ip_summed = CHECKSUM_UNNECESSARY;
+                               continue;
+                       }
+                       merge = false;
+               }
+
+               copy = min_t(int, msg_data_left(msg),
+                            pfrag->size - pfrag->offset);
+
+               if (!sk_wmem_schedule(sk, copy))
+                       goto wait_for_memory;
+
+               err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+                                              pfrag->page,
+                                              pfrag->offset,
+                                              copy);
+               if (err)
+                       goto out_error;
+
+               /* Update the skb: grow the last fragment when the new data
+                * could be coalesced with it, otherwise describe a new
+                * fragment and take a reference on its page.
+                */
+               if (merge) {
+                       skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+               } else {
+                       skb_fill_page_desc(skb, i, pfrag->page,
+                                          pfrag->offset, copy);
+                       get_page(pfrag->page);
+               }
+
+               pfrag->offset += copy;
+               copied += copy;
+               if (head != skb) {
+                       head->len += copy;
+                       head->data_len += copy;
+               }
+
+               continue;
+
+wait_for_memory:
+               kcm_push(kcm);
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err)
+                       goto out_error;
+       }
+
+       if (eor) {
+               bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+               /* Message complete, queue it on send buffer.  eor is set
+                * for SOCK_DGRAM unless MSG_MORE was given, and for other
+                * socket types only when MSG_EOR was given.  With
+                * MSG_BATCH the message is only queued and the write is
+                * left to a later push.
+                */
+               __skb_queue_tail(&sk->sk_write_queue, head);
+               kcm->seq_skb = NULL;
+
+               if (msg->msg_flags & MSG_BATCH) {
+                       kcm->tx_wait_more = true;
+               } else if (kcm->tx_wait_more || not_busy) {
+                       err = kcm_write_msgs(kcm);
+                       if (err < 0) {
+                               /* We got a hard error in write_msgs but have
+                                * already queued this message. Report an error
+                                * in the socket, but don't affect return value
+                                * from sendmsg
+                                */
+                               pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+                               report_csk_error(&kcm->sk, -err);
+                       }
+               }
+       } else {
+               /* Message not complete, save state so that the next
+                * sendmsg call continues this message.
+                */
+partial_message:
+               kcm->seq_skb = head;
+               kcm_tx_msg(head)->last_skb = skb;
+       }
+
+       release_sock(sk);
+       return copied;
+
+out_error:
+       kcm_push(kcm);
+
+       if (copied && sock->type == SOCK_SEQPACKET) {
+               /* Wrote some bytes before encountering an
+                * error, return partial success.
+                */
+               goto partial_message;
+       }
+
+       if (head != kcm->seq_skb)
+               kfree_skb(head);
+
+       err = sk_stream_error(sk, msg->msg_flags, err);
+
+       /* make sure we wake any epoll edge trigger waiter */
+       if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+               sk->sk_write_space(sk);
+
+       release_sock(sk);
+       return err;
+}
+
+/* Wait until a message is on the socket's receive queue.
+ *
+ * Returns the skb at the head of sk_receive_queue (it stays queued), or
+ * NULL when none can be returned.  On NULL, *err is set from a pending
+ * socket error, to -EAGAIN when the call may not block, or to the errno
+ * for an interrupting signal; it is left untouched if SOCK_DONE is set.
+ */
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+                                    long timeo, int *err)
+{
+       struct sk_buff *skb;
+
+       for (;;) {
+               skb = skb_peek(&sk->sk_receive_queue);
+               if (skb)
+                       break;
+
+               if (sk->sk_err) {
+                       *err = sock_error(sk);
+                       return NULL;
+               }
+
+               if (sock_flag(sk, SOCK_DONE))
+                       return NULL;
+
+               /* Non-blocking call, or the timeout is used up */
+               if ((flags & MSG_DONTWAIT) || !timeo) {
+                       *err = -EAGAIN;
+                       return NULL;
+               }
+
+               sk_wait_data(sk, &timeo, NULL);
+
+               /* Handle signals */
+               if (signal_pending(current)) {
+                       *err = sock_intr_errno(timeo);
+                       return NULL;
+               }
+       }
+
+       return skb;
+}
+
+/* Receive from the message at the head of the kcm socket's receive queue.
+ *
+ * At most len bytes of the message are copied.  Unless MSG_PEEK is set,
+ * a fully read message is removed from the queue and MSG_EOR is reported.
+ * A partially read message is truncated for SOCK_DGRAM (MSG_TRUNC and
+ * MSG_EOR reported, message dropped); for other socket types the rest is
+ * kept for the next call.
+ *
+ * Returns the number of bytes copied, or the error when nothing was copied.
+ */
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+                      size_t len, int flags)
+{
+       struct sock *sk = sock->sk;
+       struct kcm_rx_msg *rxm;
+       struct sk_buff *skb;
+       long timeo;
+       int copied = 0;
+       int err = 0;
+
+       timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+       lock_sock(sk);
+
+       skb = kcm_wait_data(sk, flags, timeo, &err);
+       if (!skb)
+               goto out;
+
+       /* Okay, have a message on the receive queue */
+       rxm = kcm_rx_msg(skb);
+
+       /* Never copy past the end of the message */
+       if (len > rxm->full_len)
+               len = rxm->full_len;
+
+       err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+       if (err < 0)
+               goto out;
+
+       copied = len;
+
+       /* Peeking leaves the message as it is */
+       if (unlikely(flags & MSG_PEEK))
+               goto out;
+
+       if (copied < rxm->full_len) {
+               if (sock->type != SOCK_DGRAM) {
+                       /* Keep the unread remainder for the next call */
+                       rxm->offset += copied;
+                       rxm->full_len -= copied;
+                       goto out;
+               }
+
+               /* Truncated message */
+               msg->msg_flags |= MSG_TRUNC;
+       }
+
+       /* Finished with message */
+       msg->msg_flags |= MSG_EOR;
+       skb_unlink(skb, &sk->sk_receive_queue);
+       kfree_skb(skb);
+
+out:
+       release_sock(sk);
+
+       return copied ? : err;
+}
+
+/* Disable receive on a kcm socket.  Unless a psock is currently reserved
+ * for receive, the socket is taken off the rx wait list and its queued
+ * messages are requeued through the mux.  The kcm sock lock is held by
+ * the caller.
+ */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+       struct kcm_mux *mux = kcm->mux;
+
+       /* Already disabled, nothing to do */
+       if (kcm->rx_disabled)
+               return;
+
+       spin_lock_bh(&mux->rx_lock);
+
+       kcm->rx_disabled = 1;
+
+       /* If a psock is reserved we'll do cleanup in unreserve */
+       if (kcm->rx_psock)
+               goto unlock;
+
+       if (kcm->rx_wait) {
+               list_del(&kcm->wait_rx_list);
+               kcm->rx_wait = false;
+       }
+
+       requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+
+unlock:
+       spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Re-enable receive on a kcm socket that had it disabled; a no-op when
+ * receive is already enabled.  The kcm sock lock is held by the caller.
+ */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+       struct kcm_mux *mux = kcm->mux;
+
+       if (kcm->rx_disabled) {
+               spin_lock_bh(&mux->rx_lock);
+
+               kcm->rx_disabled = 0;
+               kcm_rcv_ready(kcm);
+
+               spin_unlock_bh(&mux->rx_lock);
+       }
+}
+
+/* Set a SOL_KCM socket option.
+ *
+ * Only KCM_RECV_DISABLE is handled: a non-zero value disables receive on
+ * this kcm socket, zero enables it again.  The value is read as an int.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or an unknown
+ * option, -EINVAL when optlen is smaller than an int and -EFAULT when the
+ * value cannot be read from user space.
+ */
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+                         char __user *optval, unsigned int optlen)
+{
+       struct kcm_sock *kcm = kcm_sk(sock->sk);
+       int val, valbool;
+       int err = 0;
+
+       if (level != SOL_KCM)
+               return -ENOPROTOOPT;
+
+       if (optlen < sizeof(int))
+               return -EINVAL;
+
+       /* A faulting user pointer is an address error, not a bad value */
+       if (get_user(val, (int __user *)optval))
+               return -EFAULT;
+
+       valbool = val ? 1 : 0;
+
+       switch (optname) {
+       case KCM_RECV_DISABLE:
+               lock_sock(&kcm->sk);
+               if (valbool)
+                       kcm_recv_disable(kcm);
+               else
+                       kcm_recv_enable(kcm);
+               release_sock(&kcm->sk);
+               break;
+       default:
+               err = -ENOPROTOOPT;
+               break;
+       }
+
+       return err;
+}
+
+/* Read a SOL_KCM socket option.
+ *
+ * Only KCM_RECV_DISABLE is handled; it reports the socket's rx_disabled
+ * state.  At most sizeof(int) bytes are written to optval and the number
+ * of bytes written is stored back through optlen.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or an unknown
+ * option, -EINVAL for a negative length and -EFAULT when user memory
+ * cannot be accessed.
+ */
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+                         char __user *optval, int __user *optlen)
+{
+       struct kcm_sock *kcm = kcm_sk(sock->sk);
+       int val, len;
+
+       if (level != SOL_KCM)
+               return -ENOPROTOOPT;
+
+       if (get_user(len, optlen))
+               return -EFAULT;
+
+       /* Reject a negative length before clamping.  Clamping with
+        * min_t(unsigned int, ...) first would turn it into sizeof(int)
+        * and this check could never trigger.
+        */
+       if (len < 0)
+               return -EINVAL;
+
+       len = min_t(unsigned int, len, sizeof(int));
+
+       switch (optname) {
+       case KCM_RECV_DISABLE:
+               val = kcm->rx_disabled;
+               break;
+       default:
+               return -ENOPROTOOPT;
+       }
+
+       if (put_user(len, optlen))
+               return -EFAULT;
+       if (copy_to_user(optval, &val, len))
+               return -EFAULT;
+       return 0;
+}
+
+/* Initialise a new kcm socket and attach it to a mux: link it into the
+ * mux's list of kcm sockets, assign its index, set up its tx work and
+ * mark it ready to receive.
+ */
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+       struct list_head *insert_after = &mux->kcm_socks;
+       struct kcm_sock *pos;
+       int free_index = 0;
+
+       /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+        * we set sk_state, otherwise epoll_wait always returns right away with
+        * POLLHUP
+        */
+       kcm->sk.sk_state = TCP_ESTABLISHED;
+
+       /* Add to mux's kcm sockets list */
+       kcm->mux = mux;
+       spin_lock_bh(&mux->lock);
+
+       /* Scan from the front for the first entry whose index differs from
+        * its position.  The new socket takes that index and is linked
+        * right after the last entry that matched.
+        */
+       list_for_each_entry(pos, &mux->kcm_socks, kcm_sock_list) {
+               if (pos->index != free_index)
+                       break;
+               insert_after = &pos->kcm_sock_list;
+               free_index++;
+       }
+
+       list_add(&kcm->kcm_sock_list, insert_after);
+       kcm->index = free_index;
+
+       mux->kcm_socks_cnt++;
+       spin_unlock_bh(&mux->lock);
+
+       INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+       spin_lock_bh(&mux->rx_lock);
+       kcm_rcv_ready(kcm);
+       spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Attach a TCP socket to the mux of a kcm socket.
+ *
+ * A psock is allocated for csock, the TCP sock's data_ready, write_space
+ * and state_change callbacks are saved and replaced by the psock ones, and
+ * the psock is added to the mux and made available for transmit.
+ *
+ * @sock:  kcm socket whose mux the TCP socket joins
+ * @csock: socket to attach; must be PF_INET or PF_INET6 and IPPROTO_TCP
+ * @prog:  BPF program stored in the psock
+ *
+ * Returns 0 on success, -EINVAL for an unsupported socket, -ENOMEM when
+ * the psock cannot be allocated, or -EALREADY when the TCP sock's
+ * sk_user_data is already in use.  The references the caller holds on
+ * csock's file and on prog are not dropped here on failure.
+ */
+static int kcm_attach(struct socket *sock, struct socket *csock,
+                     struct bpf_prog *prog)
+{
+       struct kcm_sock *kcm = kcm_sk(sock->sk);
+       struct kcm_mux *mux = kcm->mux;
+       struct sock *csk;
+       struct kcm_psock *psock = NULL, *tpsock;
+       struct list_head *head;
+       int index = 0;
+
+       if (csock->ops->family != PF_INET &&
+           csock->ops->family != PF_INET6)
+               return -EINVAL;
+
+       csk = csock->sk;
+       if (!csk)
+               return -EINVAL;
+
+       /* Only support TCP for now */
+       if (csk->sk_protocol != IPPROTO_TCP)
+               return -EINVAL;
+
+       psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+       if (!psock)
+               return -ENOMEM;
+
+       psock->mux = mux;
+       psock->sk = csk;
+       psock->bpf_prog = prog;
+       INIT_WORK(&psock->rx_work, psock_rx_work);
+       INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+       sock_hold(csk);
+
+       write_lock_bh(&csk->sk_callback_lock);
+
+       /* Refuse a sock whose sk_user_data is already claimed, by KCM or
+        * by anyone else.  Taking it over would lose the owner's pointer,
+        * and for a sock that is already attached it would save the psock
+        * callbacks themselves as the "original" ones.  Checked under the
+        * callback lock so that two attaches cannot race.
+        */
+       if (csk->sk_user_data) {
+               write_unlock_bh(&csk->sk_callback_lock);
+               sock_put(csk);
+               kmem_cache_free(kcm_psockp, psock);
+               return -EALREADY;
+       }
+
+       psock->save_data_ready = csk->sk_data_ready;
+       psock->save_write_space = csk->sk_write_space;
+       psock->save_state_change = csk->sk_state_change;
+       csk->sk_user_data = psock;
+       csk->sk_data_ready = psock_tcp_data_ready;
+       csk->sk_write_space = psock_tcp_write_space;
+       csk->sk_state_change = psock_tcp_state_change;
+       write_unlock_bh(&csk->sk_callback_lock);
+
+       /* Finished initialization, now add the psock to the MUX.  The
+        * psock takes the first index not matching its list position and
+        * is linked right after the last entry that matched.
+        */
+       spin_lock_bh(&mux->lock);
+       head = &mux->psocks;
+       list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+               if (tpsock->index != index)
+                       break;
+               head = &tpsock->psock_list;
+               index++;
+       }
+
+       list_add(&psock->psock_list, head);
+       psock->index = index;
+
+       mux->psocks_cnt++;
+       psock_now_avail(psock);
+       spin_unlock_bh(&mux->lock);
+
+       /* Schedule RX work in case there are already bytes queued */
+       queue_work(kcm_wq, &psock->rx_work);
+
+       return 0;
+}
+
+/* Handle an attach request: look up the socket and the BPF program named
+ * by the descriptors in info and attach them to the kcm socket's mux.
+ *
+ * On success the references on the socket's file and on the program are
+ * kept; on failure whatever was taken here is dropped again.  Returns 0
+ * or a negative errno (-ENOENT when info->fd is not a socket).
+ */
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+       struct bpf_prog *prog;
+       struct socket *csock;
+       int err;
+
+       csock = sockfd_lookup(info->fd, &err);
+       if (!csock)
+               return -ENOENT;
+
+       prog = bpf_prog_get(info->bpf_fd);
+       if (IS_ERR(prog)) {
+               err = PTR_ERR(prog);
+               goto put_file;
+       }
+
+       /* Only socket filter programs are accepted */
+       if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+               err = -EINVAL;
+               goto put_prog;
+       }
+
+       err = kcm_attach(sock, csock, prog);
+       if (err)
+               goto put_prog;
+
+       /* Keep reference on file also */
+
+       return 0;
+
+put_prog:
+       bpf_prog_put(prog);
+put_file:
+       fput(csock->file);
+       return err;
+}
+
+static void kcm_unattach(struct kcm_psock *psock)
+{
+       struct sock *csk = psock->sk;
+       struct kcm_mux *mux = psock->mux;
+
+       /* Overview: detach a psock from its mux.  The TCP sock gets its
+        * saved callbacks back, receive state is torn down, and then the
+        * psock is either freed here (when it is not reserved for
+        * transmit) or marked done so that it is freed when the kcm
+        * socket holding it unreserves it.
+        *
+        * Stop getting callbacks from TCP socket. After this there should
+        * be no way to reserve a kcm for this psock.
+        */
+       write_lock_bh(&csk->sk_callback_lock);
+       csk->sk_user_data = NULL;
+       csk->sk_data_ready = psock->save_data_ready;
+       csk->sk_write_space = psock->save_write_space;
+       csk->sk_state_change = psock->save_state_change;
+       psock->rx_stopped = 1;
+
+       if (WARN_ON(psock->rx_kcm)) {
+               write_unlock_bh(&csk->sk_callback_lock);
+               return;
+       }
+
+       spin_lock_bh(&mux->rx_lock);
+
+       /* Stop receiver activities. After this point psock should not be
+        * able to get onto ready list either through callbacks or work.
+        * A message that was already waiting on the psock is dropped.
+        */
+       if (psock->ready_rx_msg) {
+               list_del(&psock->psock_ready_list);
+               kfree_skb(psock->ready_rx_msg);
+               psock->ready_rx_msg = NULL;
+       }
+
+       spin_unlock_bh(&mux->rx_lock);
+
+       write_unlock_bh(&csk->sk_callback_lock);
+
+       cancel_work_sync(&psock->rx_work);
+       cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+       bpf_prog_put(psock->bpf_prog);
+
+       kfree_skb(psock->rx_skb_head);
+       psock->rx_skb_head = NULL;
+
+       spin_lock_bh(&mux->lock);
+
+       if (psock->tx_kcm) {
+               /* psock was reserved.  Just mark it finished and we will clean
+                * up in the kcm paths, we need kcm lock which can not be
+                * acquired here.
+                */
+               spin_unlock_bh(&mux->lock);
+
+               /* We are unattaching a socket that is reserved. Abort the
+                * socket since we may be out of sync in sending on it. We need
+                * to do this without the mux lock.
+                */
+               kcm_abort_tx_psock(psock, EPIPE, false);
+
+               spin_lock_bh(&mux->lock);
+               if (!psock->tx_kcm) {
+                       /* psock now unreserved in window mux was unlocked,
+                        * so it can be freed right here after all.
+                        */
+                       goto no_reserved;
+               }
+               psock->done = 1;
+
+               /* Commit done before queuing work to process it */
+               smp_mb();
+
+               /* Queue tx work to make sure psock->done is handled */
+               queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+               spin_unlock_bh(&mux->lock);
+       } else {
+no_reserved:
+               if (!psock->tx_stopped)
+                       list_del(&psock->psock_avail_list);
+               list_del(&psock->psock_list);
+               mux->psocks_cnt--;
+               spin_unlock_bh(&mux->lock);
+
+               sock_put(csk);
+               fput(csk->sk_socket->file);
+               kmem_cache_free(kcm_psockp, psock);
+       }
+}
+
+/* Handle an unattach request: find the psock on this kcm socket's mux
+ * that wraps the socket named by info->fd and detach it.
+ *
+ * Returns 0 on success, -ENOENT when the descriptor is not a socket or no
+ * psock on the mux uses it, -EINVAL when the socket has no sock, and
+ * -EALREADY when the psock is already being unattached.
+ */
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+       struct kcm_sock *kcm = kcm_sk(sock->sk);
+       struct kcm_mux *mux = kcm->mux;
+       struct kcm_psock *psock = NULL, *pos;
+       struct socket *csock;
+       struct sock *csk;
+       int err;
+
+       csock = sockfd_lookup(info->fd, &err);
+       if (!csock)
+               return -ENOENT;
+
+       csk = csock->sk;
+       if (!csk) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       spin_lock_bh(&mux->lock);
+
+       /* Look for the psock that wraps this sock */
+       list_for_each_entry(pos, &mux->psocks, psock_list) {
+               if (pos->sk == csk) {
+                       psock = pos;
+                       break;
+               }
+       }
+
+       if (!psock) {
+               err = -ENOENT;
+       } else if (psock->unattaching || WARN_ON(psock->done)) {
+               err = -EALREADY;
+       } else {
+               /* Claim the unattach while still holding the mux lock */
+               psock->unattaching = 1;
+               err = 0;
+       }
+
+       spin_unlock_bh(&mux->lock);
+
+       if (!err)
+               kcm_unattach(psock);
+
+out:
+       fput(csock->file);
+       return err;
+}
+
+/* Protocol descriptor passed to sk_alloc() when creating kcm sockets */
+static struct proto kcm_proto = {
+       .owner          = THIS_MODULE,
+       .name           = "KCM",
+       .obj_size       = sizeof(struct kcm_sock),
+};
+
+/* Clone a kcm socket.
+ *
+ * Creates a new kcm socket of the same type and ops as osock, attaches it
+ * to osock's mux and installs a new file descriptor for it.
+ *
+ * @osock:    kcm socket to clone
+ * @info:     on success, info->fd receives the new descriptor
+ * @newsockp: on success, receives the new socket
+ *
+ * Returns 0 on success or a negative errno, in which case everything
+ * allocated here has been released again.
+ */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+                    struct socket **newsockp)
+{
+       struct socket *newsock;
+       struct sock *newsk;
+       struct file *newfile;
+       int err, newfd;
+
+       err = -ENFILE;
+       newsock = sock_alloc();
+       if (!newsock)
+               goto out;
+
+       newsock->type = osock->type;
+       newsock->ops = osock->ops;
+
+       __module_get(newsock->ops->owner);
+
+       newfd = get_unused_fd_flags(0);
+       if (unlikely(newfd < 0)) {
+               err = newfd;
+               goto out_fd_fail;
+       }
+
+       newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+       if (unlikely(IS_ERR(newfile))) {
+               err = PTR_ERR(newfile);
+               goto out_sock_alloc_fail;
+       }
+
+       newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+                        &kcm_proto, true);
+       if (!newsk) {
+               err = -ENOMEM;
+               goto out_sk_alloc_fail;
+       }
+
+       sock_init_data(newsock, newsk);
+       init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+       fd_install(newfd, newfile);
+       *newsockp = newsock;
+       info->fd = newfd;
+
+       return 0;
+
+out_sk_alloc_fail:
+       /* The file owns the socket from here on: dropping the file
+        * releases the socket too, so it must not be released a second
+        * time through the labels below.
+        */
+       fput(newfile);
+       put_unused_fd(newfd);
+       return err;
+out_sock_alloc_fail:
+       put_unused_fd(newfd);
+out_fd_fail:
+       sock_release(newsock);
+out:
+       return err;
+}
+
+/* ioctl handler for kcm sockets.
+ *
+ * SIOCKCMATTACH, SIOCKCMUNATTACH and SIOCKCMCLONE each take a pointer to
+ * their argument structure in arg; SIOCKCMCLONE also writes the structure
+ * back with the new descriptor filled in.
+ *
+ * Returns 0 on success, -EFAULT when the argument cannot be copied from
+ * or to user space, -ENOIOCTLCMD for an unknown command, or the error
+ * from the command's handler.
+ */
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+       int err;
+
+       switch (cmd) {
+       case SIOCKCMATTACH: {
+               struct kcm_attach info;
+
+               /* Never act on an argument that could not be copied in */
+               if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+                       return -EFAULT;
+
+               err = kcm_attach_ioctl(sock, &info);
+
+               break;
+       }
+       case SIOCKCMUNATTACH: {
+               struct kcm_unattach info;
+
+               if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+                       return -EFAULT;
+
+               err = kcm_unattach_ioctl(sock, &info);
+
+               break;
+       }
+       case SIOCKCMCLONE: {
+               struct kcm_clone info;
+               struct socket *newsock = NULL;
+
+               if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+                       return -EFAULT;
+
+               err = kcm_clone(sock, &info, &newsock);
+
+               if (!err) {
+                       /* NOTE(review): kcm_clone() has already installed
+                        * the new descriptor at this point, so releasing
+                        * the socket here while the descriptor still
+                        * refers to it looks unsafe; confirm and rework
+                        * together with kcm_clone().
+                        */
+                       if (copy_to_user((void __user *)arg, &info,
+                                        sizeof(info))) {
+                               err = -EFAULT;
+                               sock_release(newsock);
+                       }
+               }
+
+               break;
+       }
+       default:
+               err = -ENOIOCTLCMD;
+               break;
+       }
+
+       return err;
+}
+
+/* RCU callback queued by release_mux(): give the mux embedding this
+ * rcu_head back to the kcm_muxp cache.
+ */
+static void free_mux(struct rcu_head *rcu)
+{
+       kmem_cache_free(kcm_muxp, container_of(rcu, struct kcm_mux, rcu));
+}
+
+/* Tear down a mux: unattach every psock still on it, drop the messages on
+ * its rx hold queue, unlink it from the per-net list of muxes and free it
+ * through RCU.
+ */
+static void release_mux(struct kcm_mux *mux)
+{
+       struct kcm_psock *psock, *next;
+       struct kcm_net *knet = mux->knet;
+
+       /* Release psocks */
+       list_for_each_entry_safe(psock, next, &mux->psocks, psock_list) {
+               if (WARN_ON(psock->unattaching))
+                       continue;
+
+               kcm_unattach(psock);
+       }
+
+       /* The mux is not freed while psocks are still accounted to it */
+       if (WARN_ON(mux->psocks_cnt))
+               return;
+
+       __skb_queue_purge(&mux->rx_hold_queue);
+
+       mutex_lock(&knet->mutex);
+       list_del_rcu(&mux->kcm_mux_list);
+       knet->count--;
+       mutex_unlock(&knet->mutex);
+
+       call_rcu(&mux->rcu, free_mux);
+}
+
+/* Last step of closing a kcm socket: stop receive, move pending receive
+ * messages away, detach the socket from its mux (releasing the mux when
+ * this was its last kcm socket) and drop a reference on the sock.
+ *
+ * When a psock is still reserved for receive the socket is only marked
+ * disabled and done here; cleanup then happens in unreserve_rx_kcm.
+ */
+static void kcm_done(struct kcm_sock *kcm)
+{
+       struct sock *sk = &kcm->sk;
+       struct kcm_mux *mux = kcm->mux;
+       bool last_sock;
+
+       spin_lock_bh(&mux->rx_lock);
+
+       if (kcm->rx_psock) {
+               /* Cleanup in unreserve_rx_kcm */
+               WARN_ON(kcm->done);
+               kcm->rx_disabled = 1;
+               kcm->done = 1;
+               spin_unlock_bh(&mux->rx_lock);
+               return;
+       }
+
+       if (kcm->rx_wait) {
+               list_del(&kcm->wait_rx_list);
+               kcm->rx_wait = false;
+       }
+
+       /* Move any pending receive messages to other kcm sockets */
+       requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+       spin_unlock_bh(&mux->rx_lock);
+
+       /* No receive memory may remain charged to this socket */
+       if (WARN_ON(sk_rmem_alloc_get(sk)))
+               return;
+
+       /* Detach from MUX */
+       spin_lock_bh(&mux->lock);
+       list_del(&kcm->kcm_sock_list);
+       last_sock = (--mux->kcm_socks_cnt == 0);
+       spin_unlock_bh(&mux->lock);
+
+       /* We are done with the mux now. */
+       if (last_sock)
+               release_mux(mux);
+
+       WARN_ON(kcm->rx_wait);
+
+       sock_put(&kcm->sk);
+}
+
+/* Release operation for kcm sockets, called to close a KCM socket.
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ *
+ * The transmit side is shut down first: the write queue is purged, the
+ * socket is taken off the list of psock waiters, its tx work is cancelled
+ * and a psock it still has reserved is aborted and unreserved.  The rest
+ * of the teardown is done by kcm_done().  Always returns 0.
+ */
+static int kcm_release(struct socket *sock)
+{
+       struct sock *sk = sock->sk;
+       struct kcm_sock *kcm;
+       struct kcm_mux *mux;
+       struct kcm_psock *psock;
+
+       if (!sk)
+               return 0;
+
+       kcm = kcm_sk(sk);
+       mux = kcm->mux;
+
+       sock_orphan(sk);
+       kfree_skb(kcm->seq_skb);
+
+       lock_sock(sk);
+       /* Purge queue under lock to avoid race condition with tx_work trying
+        * to act when queue is nonempty. If tx_work runs after this point
+        * it will just return.
+        */
+       __skb_queue_purge(&sk->sk_write_queue);
+       release_sock(sk);
+
+       spin_lock_bh(&mux->lock);
+       if (kcm->tx_wait) {
+               /* Take off tx_wait list, after this point there should be no
+                * way that a psock will be assigned to this kcm.
+                */
+               list_del(&kcm->wait_psock_list);
+               kcm->tx_wait = false;
+       }
+       spin_unlock_bh(&mux->lock);
+
+       /* Cancel work. After this point there should be no outside references
+        * to the kcm socket.
+        */
+       cancel_work_sync(&kcm->tx_work);
+
+       lock_sock(sk);
+       psock = kcm->tx_psock;
+       if (psock) {
+               /* A psock was reserved, so we need to kill it since it
+                * may already have some bytes queued from a message. We
+                * need to do this after removing kcm from tx_wait list.
+                */
+               kcm_abort_tx_psock(psock, EPIPE, false);
+               unreserve_psock(kcm);
+       }
+       release_sock(sk);
+
+       WARN_ON(kcm->tx_wait);
+       WARN_ON(kcm->tx_psock);
+
+       sock->sk = NULL;
+
+       kcm_done(kcm);
+
+       return 0;
+}
+
+/* Socket operations for PF_KCM sockets. Only release, poll, ioctl, the
+ * socket options and send/receive have KCM handlers; every other entry is
+ * a generic sock_no_*() stub.
+ */
+static const struct proto_ops kcm_ops = {
+       .owner =        THIS_MODULE,
+       .family =       PF_KCM,
+
+       /* Operations with real handlers */
+       .release =      kcm_release,
+       .poll =         datagram_poll,
+       .ioctl =        kcm_ioctl,
+       .setsockopt =   kcm_setsockopt,
+       .getsockopt =   kcm_getsockopt,
+       .sendmsg =      kcm_sendmsg,
+       .recvmsg =      kcm_recvmsg,
+
+       /* Operations routed to the generic stubs */
+       .bind =         sock_no_bind,
+       .connect =      sock_no_connect,
+       .socketpair =   sock_no_socketpair,
+       .accept =       sock_no_accept,
+       .getname =      sock_no_getname,
+       .listen =       sock_no_listen,
+       .shutdown =     sock_no_shutdown,
+       .mmap =         sock_no_mmap,
+       .sendpage =     sock_no_sendpage,
+};
+
+/* Create handler for the PF_KCM socket family.
+ *
+ * Accepts SOCK_DGRAM and SOCK_SEQPACKET sockets with protocol
+ * KCMPROTO_CONNECTED. Each new socket gets a freshly allocated mux, which
+ * is fully initialized before it is published on the per-net mux list.
+ *
+ * Returns 0 on success, -ESOCKTNOSUPPORT or -EPROTONOSUPPORT for an
+ * unsupported socket type or protocol, and -ENOMEM if an allocation fails.
+ */
+static int kcm_create(struct net *net, struct socket *sock,
+                     int protocol, int kern)
+{
+       struct kcm_net *knet = net_generic(net, kcm_net_id);
+       struct sock *sk;
+       struct kcm_mux *mux;
+
+       switch (sock->type) {
+       case SOCK_DGRAM:
+       case SOCK_SEQPACKET:
+               sock->ops = &kcm_ops;
+               break;
+       default:
+               return -ESOCKTNOSUPPORT;
+       }
+
+       if (protocol != KCMPROTO_CONNECTED)
+               return -EPROTONOSUPPORT;
+
+       sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+       if (!sk)
+               return -ENOMEM;
+
+       /* Allocate a kcm mux, shared between KCM sockets */
+       mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+       if (!mux) {
+               sk_free(sk);
+               return -ENOMEM;
+       }
+
+       spin_lock_init(&mux->lock);
+       spin_lock_init(&mux->rx_lock);
+       INIT_LIST_HEAD(&mux->kcm_socks);
+       INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+       INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+       INIT_LIST_HEAD(&mux->psocks);
+       INIT_LIST_HEAD(&mux->psocks_ready);
+       INIT_LIST_HEAD(&mux->psocks_avail);
+
+       skb_queue_head_init(&mux->rx_hold_queue);
+
+       mux->knet = knet;
+
+       /* Add new MUX to list. This must come after every field of the mux
+        * has been set up: once it is on the RCU list, anything walking the
+        * list may look at it.
+        */
+       mutex_lock(&knet->mutex);
+       list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+       knet->count++;
+       mutex_unlock(&knet->mutex);
+
+       /* Init KCM socket */
+       sock_init_data(sock, sk);
+       init_kcm_sock(kcm_sk(sk), mux);
+
+       return 0;
+}
+
+/* Address-family descriptor handed to sock_register() in kcm_init(). */
+static struct net_proto_family kcm_family_ops = {
+       .owner  = THIS_MODULE,
+       .family = PF_KCM,
+       .create = kcm_create,
+};
+
+/* Per-namespace setup: start with an empty mux list and a fresh mutex.
+ * Always returns 0.
+ */
+static __net_init int kcm_init_net(struct net *net)
+{
+       struct kcm_net *kcm_netns = net_generic(net, kcm_net_id);
+
+       mutex_init(&kcm_netns->mutex);
+       INIT_LIST_HEAD_RCU(&kcm_netns->mux_list);
+
+       return 0;
+}
+
+/* Per-namespace teardown. Nothing is freed here: every KCM socket should
+ * already be closed by now, which should mean that no multiplexor or psock
+ * is left. A mux still on the list triggers a warning.
+ */
+static __net_exit void kcm_exit_net(struct net *net)
+{
+       struct kcm_net *kcm_netns = net_generic(net, kcm_net_id);
+       bool mux_left = !list_empty(&kcm_netns->mux_list);
+
+       WARN_ON(mux_left);
+}
+
+/* Per-namespace hooks: each namespace gets sizeof(struct kcm_net) bytes of
+ * private data, looked up through kcm_net_id.
+ */
+static struct pernet_operations kcm_net_ops = {
+       .id   = &kcm_net_id,
+       .size = sizeof(struct kcm_net),
+       .init = kcm_init_net,
+       .exit = kcm_exit_net,
+};
+
+/* Module load entry point.
+ *
+ * Creates the mux and psock slab caches and the "kkcmd" workqueue, then
+ * registers the protocol, the per-net operations and finally the PF_KCM
+ * socket family. The family goes last on purpose: as soon as it is
+ * registered kcm_create() can be reached, and kcm_create() reads per-net
+ * state through net_generic(), so that state has to exist first.
+ *
+ * The caches are created without SLAB_PANIC so that an allocation failure
+ * takes the error path below instead of panicking the machine.
+ *
+ * Returns 0 on success or a negative errno after undoing every step that
+ * had succeeded.
+ */
+static int __init kcm_init(void)
+{
+       int err = -ENOMEM;
+
+       kcm_muxp = kmem_cache_create("kcm_mux_cache",
+                                    sizeof(struct kcm_mux), 0,
+                                    SLAB_HWCACHE_ALIGN, NULL);
+       if (!kcm_muxp)
+               goto fail;
+
+       kcm_psockp = kmem_cache_create("kcm_psock_cache",
+                                      sizeof(struct kcm_psock), 0,
+                                      SLAB_HWCACHE_ALIGN, NULL);
+       if (!kcm_psockp)
+               goto fail;
+
+       kcm_wq = create_singlethread_workqueue("kkcmd");
+       if (!kcm_wq)
+               goto fail;
+
+       err = proto_register(&kcm_proto, 1);
+       if (err)
+               goto fail;
+
+       err = register_pernet_device(&kcm_net_ops);
+       if (err)
+               goto net_ops_fail;
+
+       err = sock_register(&kcm_family_ops);
+       if (err)
+               goto sock_register_fail;
+
+       return 0;
+
+sock_register_fail:
+       unregister_pernet_device(&kcm_net_ops);
+
+net_ops_fail:
+       proto_unregister(&kcm_proto);
+
+fail:
+       /* kmem_cache_destroy() accepts NULL, so no guards are needed for
+        * caches that were never created.
+        */
+       kmem_cache_destroy(kcm_muxp);
+       kmem_cache_destroy(kcm_psockp);
+
+       if (kcm_wq)
+               destroy_workqueue(kcm_wq);
+
+       return err;
+}
+
+/* Module unload entry point.
+ *
+ * The socket family is unregistered first so that no new KCM socket can be
+ * created while the per-net state that kcm_create() depends on is being
+ * removed. The protocol, the workqueue and the slab caches follow.
+ */
+static void __exit kcm_exit(void)
+{
+       sock_unregister(PF_KCM);
+       unregister_pernet_device(&kcm_net_ops);
+       proto_unregister(&kcm_proto);
+       destroy_workqueue(kcm_wq);
+
+       kmem_cache_destroy(kcm_muxp);
+       kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);         /* kcm_init() is the module's init hook */
+module_exit(kcm_exit);         /* kcm_exit() is the module's exit hook */
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
author	Tom Herbert <tom@herbertland.com>
	Mon, 7 Mar 2016 22:11:06 +0000 (14:11 -0800)
committer	David S. Miller <davem@davemloft.net>
	Wed, 9 Mar 2016 21:36:14 +0000 (16:36 -0500)
include/linux/socket.h		patch \| blob \| history
include/net/kcm.h	[new file with mode: 0644]	patch \| blob
include/uapi/linux/kcm.h	[new file with mode: 0644]	patch \| blob
net/Kconfig		patch \| blob \| history
net/Makefile		patch \| blob \| history
net/kcm/Kconfig	[new file with mode: 0644]	patch \| blob
net/kcm/Makefile	[new file with mode: 0644]	patch \| blob
net/kcm/kcmsock.c	[new file with mode: 0644]	patch \| blob