Initial vhost-net support
author    David Woodhouse <dwmw2@infradead.org>
Wed, 16 Jun 2021 23:05:14 +0000 (00:05 +0100)
committer David Woodhouse <dwmw2@infradead.org>
Thu, 1 Jul 2021 20:46:06 +0000 (21:46 +0100)
We spend a lot of CPU time copying packets between kernel and userspace.

Eventually we want to implement a completely in-kernel data path. It
isn't even that hard, now that most of the functionality we need from
the kernel is there and it's mostly just a case of fitting it together.

In the meantime, though, there are a few things we can do even on today's
released kernels. For a start, we can use vhost-net to avoid having to
do the read()/write() on the tun device in our mainloop.
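
Roughly, the data path changes from one read()/write() syscall per packet
on the tun fd to publishing buffers into shared virtio rings and poking
eventfds. A sketch only (barriers, index wrapping and error handling
omitted; the real code is in vhost.c below):

    /* Before: one syscall per packet in the mainloop */
    len = read(tun_fd, pkt->data, mtu);                 /* tun -> userspace */
    write(tun_fd, pkt->data, pkt->len);                 /* userspace -> tun */

    /* After: hand buffers to the in-kernel vhost worker via the vrings */
    ring->desc[i].addr = (uint64_t)&pkt->virtio.h;      /* publish the buffer */
    ring->desc[i].len  = pkt->len + sizeof(pkt->virtio.h);
    ring->avail->idx++;                                 /* make it visible */
    write(vhost_kick_fd, &(uint64_t){1}, 8);            /* kick the worker */
    /* ... vhost later signals completion on vhost_call_fd ... */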

Ultimately, the copying ends up being done by a kernel thread instead; it
doesn't really go away. But that should at least give us a performance win
comparable to a decent threading model, while allowing OpenConnect to
remain naïvely single-threaded and lock-free.

We have to carefully pick a configuration for vhost-net which actually
works, since it's fairly hosed for IFF_TUN support:
https://lore.kernel.org/netdev/2433592d2b26deec33336dd3e83acfd273b0cf30.camel@infradead.org/T/

But by limiting the sndbuf (which disables XDP, sadly) and by requesting
a virtio header that we don't actually want, we *can* make it work even
with today's production kernels.
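
In concrete terms the two knobs look roughly like this (a sketch of what
setup_vhost() in vhost.c below actually does):

    /* Accept the virtio-net header we don't want, plus the features
     * the working code path needs. */
    uint64_t features = (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
                        (1ULL << VIRTIO_F_VERSION_1) |
                        (1ULL << VIRTIO_RING_F_EVENT_IDX);
    ioctl(vhost_fd, VHOST_SET_FEATURES, &features);

    /* A finite sndbuf steers tun away from the (broken for us) XDP path */
    int sndbuf = mtu * 2 * max_qlen;
    ioctl(tun_fd, TUNSETSNDBUF, &sndbuf);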

Thanks to Eugenio Pérez Martín <eperezma@redhat.com> for his blog at
https://www.redhat.com/en/blog/virtqueues-and-virtio-ring-how-data-travels
and for lots more help and guidance as I floundered around trying to make
this work.

Although this gives a 10% improvement in the bandwidth we can manage in
my testing (up to 2.75Gb/s with other tricks, on a c5.8xlarge Skylake VM),
it also introduces a small amount of extra latency, so disable it by
default unless the user has bumped the queue length to 16 or more, which
presumably means they prefer bandwidth over latency.
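
The gating and ring sizing follow directly from that trade-off; roughly
what setup_vhost() below does:

    if (vpninfo->max_qlen < 16)
            return -EINVAL;   /* tuned for latency; stick with read()/write() */

    /* Ring size: the -Q value rounded up to a power of two, clamped to [32, 32768] */
    vpninfo->vhost_ring_size = 1 << (32 - __builtin_clz(vpninfo->max_qlen - 1));
    if (vpninfo->vhost_ring_size < 32)
            vpninfo->vhost_ring_size = 32;
    if (vpninfo->vhost_ring_size > 32768)
            vpninfo->vhost_ring_size = 32768;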

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Makefile.am
configure.ac
library.c
mainloop.c
openconnect-internal.h
openconnect.8.in
tun.c
vhost.c [new file with mode: 0644]
www/changelog.xml

index 48ca991fecf2fdf08414d317c90cd1843560e233..2cd27afaf9b4887f245ed23d29fa75c99b4ffaaf 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,13 +65,17 @@ lib_srcs_yubikey = yubikey.c
 lib_srcs_stoken = stoken.c
 lib_srcs_esp = esp.c esp-seqno.c
 lib_srcs_dtls = dtls.c
+lib_srcs_vhost = vhost.c
 
 POTFILES = $(openconnect_SOURCES) gnutls-esp.c gnutls-dtls.c openssl-esp.c openssl-dtls.c \
           $(lib_srcs_esp) $(lib_srcs_dtls) gnutls_tpm2_esys.c gnutls_tpm2_ibm.c \
           $(lib_srcs_openssl) $(lib_srcs_gnutls) $(library_srcs) \
           $(lib_srcs_win32) $(lib_srcs_posix) $(lib_srcs_gssapi) $(lib_srcs_iconv) \
-          $(lib_srcs_yubikey) $(lib_srcs_stoken) $(lib_srcs_oidc)
+          $(lib_srcs_yubikey) $(lib_srcs_stoken) $(lib_srcs_oidc) $(lib_srcs_vhost)
 
+if OPENCONNECT_VHOST
+library_srcs += $(lib_srcs_vhost)
+endif
 if OPENCONNECT_LIBPCSCLITE
 library_srcs += $(lib_srcs_yubikey)
 endif
index e0eb80206588de9107ae036144aabf9b797825f0..f636291b717f02fbcd7d0b72d42dcf0b94f1dc05 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -34,8 +34,14 @@ symver_win32_setenv=
 AC_PROG_CC
 AC_PROG_CC_C99
 
+have_vhost=no
 case $host_os in
  *linux* | *gnu* | *nacl*)
+    case $host_cpu in
+       x86_64|amd64)
+           have_vhost=yes
+           ;;
+    esac
     AC_MSG_NOTICE([Applying feature macros for GNU build])
     AC_DEFINE(_GNU_SOURCE, 1, [_GNU_SOURCE])
     ;;
@@ -1170,6 +1176,40 @@ AC_CHECK_HEADER([if_tun.h],
 AC_CHECK_HEADER([net/if_utun.h], AC_DEFINE([HAVE_NET_UTUN_H], 1, [Have net/if_utun.h]), ,
                [#include <sys/types.h>])
 
+AC_ARG_ENABLE([vhost-net],
+       AS_HELP_STRING([--enable-vhost-net],
+                      [Build vhost-net support for tun device acceleration [default=no]]),
+       [have_vhost=$enableval])
+
+if test "$have_vhost" = "yes"; then
+   AC_MSG_CHECKING([for vhost-net support])
+   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
+               #include <linux/if_tun.h>
+               #include <linux/virtio_net.h>
+               #include <linux/vhost.h>
+               #include <sys/eventfd.h>
+
+               struct foo {
+                       struct vring_desc desc;
+                       struct vring_avail avail;
+                       struct vring_used used;
+                       struct virtio_net_hdr_mrg_rxbuf h;
+               };
+       ],[
+               (void)eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+               (void)VHOST_NET_F_VIRTIO_NET_HDR;
+               (void)VIRTIO_F_VERSION_1;
+               (void)TUNSETSNDBUF;
+               __sync_synchronize();
+       ])],
+       [have_vhost=yes
+        AC_DEFINE([HAVE_VHOST], 1, [Have vhost])
+        AC_MSG_RESULT([yes])],
+       [have_vhost=no
+        AC_MSG_RESULT([no])])
+fi
+AM_CONDITIONAL(OPENCONNECT_VHOST, [test "$have_vhost" = "yes"])
+
 AC_CHECK_HEADER([alloca.h], AC_DEFINE([HAVE_ALLOCA_H], 1, [Have alloca.h]))
 
 AC_CHECK_HEADER([endian.h],
@@ -1318,6 +1358,7 @@ SUMMARY([libproxy support], [$libproxy_pkg])
 SUMMARY([RSA SecurID support], [$libstoken_pkg])
 SUMMARY([PSKC OATH file support], [$libpskc_pkg])
 SUMMARY([GSSAPI support], [$linked_gssapi])
+SUMMARY([vhost-net support], [$have_vhost])
 SUMMARY([Yubikey support], [$libpcsclite_pkg])
 SUMMARY([JSON parser], [$json])
 SUMMARY([LZ4 compression], [$lz4_pkg])
index 8900944dc670252e2a51bfb24ece97a10f9d5dc8..37f2b537f9de60d2e38cb56e72fecaf04caf75fa 100644 (file)
--- a/library.c
+++ b/library.c
@@ -67,6 +67,9 @@ struct openconnect_info *openconnect_vpninfo_new(const char *useragent,
                vpninfo->ic_legacy_to_utf8 = (iconv_t)-1;
        }
 #endif
+#ifdef HAVE_VHOST
+       vpninfo->vhost_fd = vpninfo->vhost_call_fd = vpninfo->vhost_kick_fd = -1;
+#endif
 #ifndef _WIN32
        vpninfo->tun_fd = -1;
 #endif
index 848049faddf9f716b9f91eeae77fd6deed926472..a459dfb95a17138dc29deef42d0aa1034a514d8e 100644 (file)
--- a/mainloop.c
+++ b/mainloop.c
@@ -47,19 +47,11 @@ int queue_new_packet(struct openconnect_info *vpninfo,
 
 /* This is here because it's generic and hence can't live in either of the
    tun*.c files for specific platforms */
-int tun_mainloop(struct openconnect_info *vpninfo, int *timeout, int readable)
+int tun_mainloop(struct openconnect_info *vpninfo, int *timeout, int readable, int did_work)
 {
        struct pkt *this;
        int work_done = 0;
 
-       if (!tun_is_up(vpninfo)) {
-               /* no tun yet; clear any queued packets */
-               while ((this = dequeue_packet(&vpninfo->incoming_queue)))
-                       free_pkt(vpninfo, this);
-
-               return 0;
-       }
-
        if (readable && read_fd_monitored(vpninfo, tun)) {
                struct pkt *out_pkt = vpninfo->tun_pkt;
                while (1) {
@@ -182,6 +174,10 @@ int openconnect_mainloop(struct openconnect_info *vpninfo,
 {
        int ret = 0;
        int tun_r = 1, udp_r = 1, tcp_r = 1;
+#ifdef HAVE_VHOST
+       int vhost_r = 0;
+#endif
+
        vpninfo->reconnect_timeout = reconnect_timeout;
        vpninfo->reconnect_interval = reconnect_interval;
 
@@ -234,9 +230,25 @@ int openconnect_mainloop(struct openconnect_info *vpninfo,
                        break;
                did_work += ret;
 
+
                /* Tun must be last because it will set/clear its bit
                   in the select_rfds according to the queue length */
-               did_work += tun_mainloop(vpninfo, &timeout, tun_r);
+               if (!tun_is_up(vpninfo)) {
+                       struct pkt *this;
+                       /* no tun yet; clear any queued packets */
+                       while ((this = dequeue_packet(&vpninfo->incoming_queue)))
+                               free_pkt(vpninfo, this);
+#ifdef HAVE_VHOST
+               } else if (vpninfo->vhost_fd != -1) {
+                       did_work += vhost_tun_mainloop(vpninfo, &timeout, vhost_r, did_work);
+                       /* If it returns zero *then* it will have read the eventfd
+                        * and there's no need to do so again until we poll again. */
+                       if (!did_work)
+                               vhost_r = 0;
+#endif
+               } else {
+                       did_work += tun_mainloop(vpninfo, &timeout, tun_r, did_work);
+               }
                if (vpninfo->quit_reason)
                        break;
 
@@ -340,9 +352,15 @@ int openconnect_mainloop(struct openconnect_info *vpninfo,
                                update_epoll_fd(vpninfo, ssl);
                                update_epoll_fd(vpninfo, cmd);
                                update_epoll_fd(vpninfo, dtls);
+#ifdef HAVE_VHOST
+                               update_epoll_fd(vpninfo, vhost_call);
+#endif
                        }
 
                        tun_r = udp_r = tcp_r = 0;
+#ifdef HAVE_VHOST
+                       vhost_r = 0;
+#endif
 
                        int nfds = epoll_wait(vpninfo->epoll_fd, evs, 5, timeout);
                        if (nfds < 0) {
@@ -361,6 +379,10 @@ int openconnect_mainloop(struct openconnect_info *vpninfo,
                                                tcp_r = 1;
                                        else if (evs[nfds].data.fd == vpninfo->dtls_fd)
                                                udp_r = 1;
+#ifdef HAVE_VHOST
+                                       else if (evs[nfds].data.fd == vpninfo->vhost_call_fd)
+                                               vhost_r = 1;
+#endif
                                }
                        }
                        continue;
@@ -379,7 +401,10 @@ int openconnect_mainloop(struct openconnect_info *vpninfo,
                        vpn_perror(vpninfo, _("Failed select() in mainloop"));
                        break;
                }
-
+#ifdef HAVE_VHOST
+               if (vpninfo->vhost_call_fd >= 0)
+                       vhost_r = FD_ISSET(vpninfo->vhost_call_fd, &rfds);
+#endif
                if (vpninfo->tun_fd >= 0)
                        tun_r = FD_ISSET(vpninfo->tun_fd, &rfds);
                if (vpninfo->dtls_fd >= 0)
index 023b14be67e50568c67600998b068cbc6ebc61f8..91610d41d36d28fe8d59e4cc9d675794b2b44359 100644 (file)
--- a/openconnect-internal.h
+++ b/openconnect-internal.h
 #define IPPROTO_IPIP 0x04
 #endif
 
+#ifdef HAVE_VHOST
+#include <linux/virtio_net.h>
+#include <linux/vhost.h>
+
+struct oc_vring {
+       struct vring_desc *desc;
+       struct vring_avail *avail;
+       struct vring_used *used;
+       uint16_t seen_used;
+};
+
+#endif
+
+
 /****************************************************************************/
 
 struct pkt {
@@ -190,10 +204,19 @@ struct pkt {
                        uint16_t proto;
                        unsigned char hdr[18];
                } ppp;
+#ifdef HAVE_VHOST
+               struct {
+                       unsigned char pad[12];
+                       struct virtio_net_hdr_mrg_rxbuf h;
+               } virtio;
+#endif
        };
        unsigned char data[];
 };
 
+#define pkt_offset(field) ((intptr_t)&((struct pkt *)NULL)->field)
+#define pkt_from_hdr(addr, field) ((struct pkt *) ((intptr_t)(addr) - pkt_offset(field) ))
+
 #define REKEY_NONE      0
 #define REKEY_TUNNEL    1
 #define REKEY_SSL       2
@@ -653,6 +676,9 @@ struct openconnect_info {
        int epoll_fd;
        int epoll_update;
        uint32_t tun_epoll, ssl_epoll, dtls_epoll, cmd_epoll;
+#ifdef HAVE_VHOST
+       uint32_t vhost_call_epoll;
+#endif
 #endif
 #endif
 
@@ -660,6 +686,11 @@ struct openconnect_info {
        int ip_fd;
        int ip6_fd;
 #endif
+#ifdef HAVE_VHOST
+       int vhost_ring_size;
+       int vhost_fd, vhost_call_fd, vhost_kick_fd;
+       struct oc_vring tx_vring, rx_vring;
+       #endif
 #ifdef _WIN32
        HMODULE wintun;
        wchar_t *ifname_w;
@@ -783,9 +814,10 @@ static inline struct pkt *dequeue_packet(struct pkt_q *q)
        struct pkt *ret = q->head;
 
        if (ret) {
-               q->head = ret->next;
+               struct pkt *next = ret->next;
                if (!--q->count)
                        q->tail = &q->head;
+               q->head = next;
        }
        return ret;
 }
@@ -1119,6 +1151,11 @@ void free_split_routes(struct oc_ip_info *ip_info);
 int install_vpn_opts(struct openconnect_info *vpninfo, struct oc_vpn_option *opt,
                     struct oc_ip_info *ip_info);
 
+/* vhost.h */
+int setup_vhost(struct openconnect_info *vpninfo, int tun_fd);
+void shutdown_vhost(struct openconnect_info *vpninfo);
+int vhost_tun_mainloop(struct openconnect_info *vpninfo, int *timeout, int readable, int did_work);
+
 /* tun.c / tun-win32.c */
 void os_shutdown_tun(struct openconnect_info *vpninfo);
 int os_read_tun(struct openconnect_info *vpninfo, struct pkt *pkt);
@@ -1363,7 +1400,7 @@ int openconnect_install_ctx_verify(struct openconnect_info *vpninfo,
 #endif
 
 /* mainloop.c */
-int tun_mainloop(struct openconnect_info *vpninfo, int *timeout, int readable);
+int tun_mainloop(struct openconnect_info *vpninfo, int *timeout, int readable, int did_work);
 int queue_new_packet(struct openconnect_info *vpninfo,
                     struct pkt_q *q, void *buf, int len);
 int keepalive_action(struct keepalive_info *ka, int *timeout);
index 341ad548df2c8e56e6d4185fd3801af7d94c76eb..497ff3b70793f8ffa8b147500fb85659482ee4c1 100644 (file)
--- a/openconnect.8.in
+++ b/openconnect.8.in
@@ -321,7 +321,31 @@ Less output
 .B \-Q,\-\-queue\-len=LEN
 Set packet queue limit to
 .I LEN
-pkts
+packets. The default is 10. A high value may allow better overall bandwidth
+but at a cost of latency. If you run Voice over IP or other interactive
+traffic over the VPN, you don't want those packets to be queued behind
+thousands of other large packets which are part of a bulk transfer.
+
+This option sets the maximum inbound and outbound packet queue sizes
+in OpenConnect itself, which control how many packets will be sent and
+received in a single batch, as well as affecting other buffering such
+as the socket send buffer (SO_SNDBUF) for network connections and the
+OS tunnel device.
+
+Ultimately, the right size for a queue is "just enough packets that it
+never quite gets empty before more are pushed to it". Any higher than
+that is simply introducing bufferbloat and additional latency with no
+benefit. With the default of 10, we are able to saturate a single
+Gigabit Ethernet from modest hardware, which is more than enough for
+most VPN users.
+
+If OpenConnect is built with vhost-net support, it will only be used
+if the queue length is set to 16 or more. This is because vhost-net
+introduces a small amount of additional latency, but improves total
+bandwidth quite considerably for those operating at high traffic
+rates. Thus it makes sense to use it when the user has indicated a
+preference for bandwidth over latency, by increasing the queue size.
+
 .TP
 .B \-s,\-\-script=SCRIPT
 Invoke
diff --git a/tun.c b/tun.c
index eba766d8859f5fecff18b868b5971cd9456788f4..07099bed6f83910f4429e012bb5785c6d3fc6d43 100644 (file)
--- a/tun.c
+++ b/tun.c
@@ -457,15 +457,20 @@ int openconnect_setup_tun_fd(struct openconnect_info *vpninfo, int tun_fd)
 
        vpninfo->tun_fd = tun_fd;
 
-       monitor_fd_new(vpninfo, tun);
-       monitor_read_fd(vpninfo, tun);
-
        if (set_sock_nonblock(tun_fd)) {
                vpn_progress(vpninfo, PRG_ERR, _("Failed to make tun socket nonblocking: %s\n"),
                             strerror(errno));
                return -EIO;
        }
 
+#ifdef HAVE_VHOST
+       if (!setup_vhost(vpninfo, tun_fd))
+               return 0;
+#endif
+
+       monitor_fd_new(vpninfo, tun);
+       monitor_read_fd(vpninfo, tun);
+
        return 0;
 }
 
@@ -591,6 +596,10 @@ void os_shutdown_tun(struct openconnect_info *vpninfo)
 #endif
        }
 
+#ifdef HAVE_VHOST
+       shutdown_vhost(vpninfo);
+#endif
+
        if (vpninfo->vpnc_script)
                close(vpninfo->tun_fd);
        vpninfo->tun_fd = -1;
diff --git a/vhost.c b/vhost.c
new file mode 100644 (file)
index 0000000..e0593f6
--- /dev/null
+++ b/vhost.c
@@ -0,0 +1,500 @@
+/*
+ * OpenConnect (SSL + DTLS) VPN client
+ *
+ * Copyright © 2021 David Woodhouse.
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <config.h>
+
+#include "openconnect-internal.h"
+
+#include <linux/if_tun.h>
+#include <linux/vhost.h>
+
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define debug_vhost 0
+
+#define barrier() __sync_synchronize()
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define vio16(x) ((uint16_t)(x))
+#define vio32(x) ((uint32_t)(x))
+#define vio64(x) ((uint64_t)(x))
+#else
+#define vio16(x) ((uint16_t)__builtin_bswap16(x))
+#define vio32(x) ((uint32_t)__builtin_bswap32(x))
+#define vio64(x) ((uint64_t)__builtin_bswap64(x))
+#endif
+
+static int setup_vring(struct openconnect_info *vpninfo, int idx)
+{
+       struct oc_vring *vring = idx ? &vpninfo->tx_vring : &vpninfo->rx_vring;
+       int ret;
+
+       if (getenv("NOVHOST"))
+               return -EINVAL;
+
+       vring->desc = calloc(vpninfo->vhost_ring_size, sizeof(*vring->desc));
+       vring->avail = calloc(vpninfo->vhost_ring_size + 3, 2);
+       vring->used = calloc(1 + (vpninfo->vhost_ring_size * 2), 4);
+
+       if (!vring->desc || !vring->avail || !vring->used)
+               return -ENOMEM;
+
+       for (int i = 0; i < vpninfo->vhost_ring_size; i++)
+               vring->avail->ring[i] = i;
+
+       struct vhost_vring_state vs = { };
+       vs.index = idx;
+       vs.num = vpninfo->vhost_ring_size;
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_VRING_NUM, &vs) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vring #%d size: %s\n"),
+                            idx, strerror(-ret));
+               return ret;
+       }
+
+       vs.num = 0;
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_VRING_BASE, &vs) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vring #%d base: %s\n"),
+                            idx, strerror(-ret));
+               return ret;
+       }
+
+       struct vhost_vring_addr va = { };
+       va.index = idx;
+       va.desc_user_addr = (uint64_t)vring->desc;
+       va.avail_user_addr = (uint64_t)vring->avail;
+       va.used_user_addr  = (uint64_t)vring->used;
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_VRING_ADDR, &va) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vring #%d base: %s\n"),
+                            idx, strerror(-ret));
+               return ret;
+       }
+
+       struct vhost_vring_file vf = { };
+       vf.index = idx;
+       vf.fd = vpninfo->tun_fd;
+       if (ioctl(vpninfo->vhost_fd, VHOST_NET_SET_BACKEND, &vf) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vring #%d RX backend: %s\n"),
+                            idx, strerror(-ret));
+               return ret;
+       }
+
+       vf.fd = vpninfo->vhost_call_fd;
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_VRING_CALL, &vf) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vring #%d call eventfd: %s\n"),
+                            idx, strerror(-ret));
+               close(vpninfo->vhost_fd);
+               return ret;
+       }
+
+       vf.fd = vpninfo->vhost_kick_fd;
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_VRING_KICK, &vf) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vring #%d kick eventfd: %s\n"),
+                            idx, strerror(-ret));
+               close(vpninfo->vhost_fd);
+               return ret;
+       }
+
+       return 0;
+}
+#define OC_VHOST_NET_FEATURES ((1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |  \
+                              (1ULL << VIRTIO_F_VERSION_1) |           \
+                              (1ULL << VIRTIO_RING_F_EVENT_IDX))
+
+int setup_vhost(struct openconnect_info *vpninfo, int tun_fd)
+{
+       int ret;
+
+       /* If tuned for latency not bandwidth, that isn't vhost-net */
+       if (vpninfo->max_qlen < 16) {
+               vpn_progress(vpninfo, PRG_DEBUG,
+                            _("Not using vhost-net due to low queue length %d\n"),
+                            vpninfo->max_qlen);
+               return -EINVAL;
+       }
+
+       vpninfo->vhost_ring_size = 1 << (32 - __builtin_clz(vpninfo->max_qlen - 1));
+       if (vpninfo->vhost_ring_size < 32)
+               vpninfo->vhost_ring_size = 32;
+       if (vpninfo->vhost_ring_size > 32768)
+               vpninfo->vhost_ring_size = 32768;
+
+       vpninfo->vhost_fd = open("/dev/vhost-net", O_RDWR);
+       if (vpninfo->vhost_fd == -1) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to open /dev/vhost-net: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_OWNER, NULL) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_DEBUG, _("Failed to set vhost ownership: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+
+       uint64_t features;
+
+       if (ioctl(vpninfo->vhost_fd, VHOST_GET_FEATURES, &features) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_DEBUG, _("Failed to get vhost features: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+       if ((features & OC_VHOST_NET_FEATURES) != OC_VHOST_NET_FEATURES) {
+               vpn_progress(vpninfo, PRG_DEBUG, _("vhost-net lacks required features: %llx\n"),
+                            (unsigned long long)features);
+               return -EOPNOTSUPP;
+       }
+
+       features = OC_VHOST_NET_FEATURES;
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_FEATURES, &features) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to set vhost features: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+
+       vpninfo->vhost_kick_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+       if (vpninfo->vhost_kick_fd == -1) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to open vhost kick eventfd: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+       vpninfo->vhost_call_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+       if (vpninfo->vhost_call_fd == -1) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_ERR, _("Failed to open vhost call eventfd: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+
+       struct vhost_memory *vmem = alloca(sizeof(*vmem) + sizeof(vmem->regions[0]));
+
+       memset(vmem, 0, sizeof(*vmem) + sizeof(vmem->regions[0]));
+       vmem->nregions = 1;
+#ifdef __x86_64__
+       vmem->regions[0].guest_phys_addr = 4096;
+       vmem->regions[0].memory_size = 0x7fffffffe000; /* Why doesn't it allow 0x7fffffff000? */
+       vmem->regions[0].userspace_addr = 4096;
+#else
+#error Need magic vhost numbers for this platform
+#endif
+       if (ioctl(vpninfo->vhost_fd, VHOST_SET_MEM_TABLE, vmem) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_DEBUG, _("Failed to set vhost memory map: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+
+       ret = setup_vring(vpninfo, 0);
+       if (ret)
+               goto err;
+
+       ret = setup_vring(vpninfo, 1);
+       if (ret)
+               goto err;
+
+       /* This isn't just for bufferbloat; there are various issues with the XDP
+        * code path:
+        * https://lore.kernel.org/netdev/2433592d2b26deec33336dd3e83acfd273b0cf30.camel@infradead.org/T/
+        */
+       int sndbuf = vpninfo->ip_info.mtu;
+       if (!sndbuf)
+               sndbuf = 1500;
+       sndbuf *= 2 * vpninfo->max_qlen;
+       if (ioctl(vpninfo->tun_fd, TUNSETSNDBUF, &sndbuf) < 0) {
+               ret = -errno;
+               vpn_progress(vpninfo, PRG_INFO, _("Failed to set tun sndbuf: %s\n"),
+                            strerror(-ret));
+               goto err;
+       }
+
+       vpn_progress(vpninfo, PRG_INFO, _("Using vhost-net for tun acceleration, ring size %d\n"),
+                    vpninfo->vhost_ring_size);
+
+       monitor_fd_new(vpninfo, vhost_call);
+       monitor_read_fd(vpninfo, vhost_call);
+
+       return 0;
+
+ err:
+       shutdown_vhost(vpninfo);
+       return ret;
+}
+
+static void free_vring(struct openconnect_info *vpninfo,
+                      struct oc_vring *vring)
+{
+       if (vring->desc) {
+               for (int i = 0; i < vpninfo->vhost_ring_size; i++) {
+                       if (vring->desc[i].addr)
+                               free_pkt(vpninfo, pkt_from_hdr(vio64(vring->desc[i].addr), virtio.h));
+               }
+
+               free(vring->desc);
+               vring->desc = NULL;
+       }
+
+       free(vring->avail);
+       vring->avail = NULL;
+       free(vring->used);
+       vring->used = NULL;
+}
+
+void shutdown_vhost(struct openconnect_info *vpninfo)
+{
+       if (vpninfo->vhost_fd != -1)
+               close(vpninfo->vhost_fd);
+       if (vpninfo->vhost_kick_fd != -1)
+               close(vpninfo->vhost_kick_fd);
+       if (vpninfo->vhost_call_fd != -1)
+               close(vpninfo->vhost_call_fd);
+
+       vpninfo->vhost_fd = vpninfo->vhost_kick_fd = vpninfo->vhost_call_fd = -1;
+
+       free_vring(vpninfo, &vpninfo->rx_vring);
+       free_vring(vpninfo, &vpninfo->tx_vring);
+}
+
+static void dump_vring(struct openconnect_info *vpninfo, struct oc_vring *ring)
+{
+       vpn_progress(vpninfo, PRG_ERR,
+                    "next_avail 0x%x, used idx 0x%x seen_used 0x%x\n",
+                    vio16(ring->avail->idx), vio16(ring->used->idx),
+                    ring->seen_used);
+
+       vpn_progress(vpninfo, PRG_ERR, "#   ADDR         AVAIL         USED\n");
+
+       for (int i = 0; i < vpninfo->vhost_ring_size + 1; i++)
+               vpn_progress(vpninfo, PRG_ERR,
+                            "%d %p %x %x\n", i,
+                            (void *)vio64(ring->desc[i].addr),
+                            vio16(ring->avail->ring[i]),
+                            vio16(ring->used->ring[i].id));
+}
+
+/* With thanks to Eugenio Pérez Martin <eperezma@redhat.com> for writing
+ * https://www.redhat.com/en/blog/virtqueues-and-virtio-ring-how-data-travels
+ * which saved a lot of time and caffeine in getting this to work. */
+static inline int process_ring(struct openconnect_info *vpninfo, int tx, uint64_t *kick)
+{
+       struct oc_vring *ring = tx ? &vpninfo->tx_vring : &vpninfo->rx_vring;
+       const unsigned int ring_mask = vpninfo->vhost_ring_size - 1;
+       int did_work = 0;
+
+       /* First handle 'used' packets handed back to us from the ring.
+        * For TX packets (incoming from VPN into the tun device) we just
+        * free them now. For RX packets from the tun device we fill in
+        * the length and queue them for sending over the VPN. */
+       uint16_t used_idx = vio16(ring->used->idx);
+       while (used_idx != ring->seen_used) {
+               uint32_t desc = vio32(ring->used->ring[ring->seen_used & ring_mask].id);
+               uint32_t len  = vio32(ring->used->ring[ring->seen_used & ring_mask].len);
+
+               if (desc > ring_mask) {
+               inval:
+                       vpn_progress(vpninfo, PRG_ERR,
+                                    _("Error: vhost gave back invalid descriptor %d, len %d\n"),
+                                    desc, len);
+                       dump_vring(vpninfo, ring);
+                       vpninfo->quit_reason = "vhost error";
+                       return -EIO;
+               }
+
+               uint64_t addr = vio64(ring->desc[desc].addr);
+               if (!addr) {
+                       vpn_progress(vpninfo, PRG_ERR,
+                                    _("vhost gave back empty descriptor %d\n"),
+                                    desc);
+                       dump_vring(vpninfo, ring);
+                       vpninfo->quit_reason = "vhost error";
+                       return -EIO;
+               }
+
+               struct pkt *this = pkt_from_hdr(addr, virtio.h);
+
+               if (tx) {
+                       vpn_progress(vpninfo, PRG_TRACE,
+                                    _("Free TX packet %p [%d] [used %d]\n"),
+                                    this, ring->seen_used, used_idx);
+                       vpninfo->stats.rx_pkts++;
+                       vpninfo->stats.rx_bytes += this->len;
+
+                       free_pkt(vpninfo, this);
+               } else {
+                       if (len < sizeof(this->virtio.h))
+                               goto inval;
+
+                       this->len = len - sizeof(this->virtio.h);
+                       vpn_progress(vpninfo, PRG_TRACE,
+                                    _("RX packet %p(%d) [%d] [used %d]\n"),
+                                    this, this->len, ring->seen_used, used_idx);
+                       if (debug_vhost)
+                               dump_buf_hex(vpninfo, PRG_TRACE, '<',
+                                            (void *) &this->virtio.h,
+                                            this->len + sizeof(this->virtio.h));
+
+                       /* If the incoming queue fills up, pretend we can't see any more
+                        * by contracting our idea of 'used_idx' back to *this* one. */
+                       if (queue_packet(&vpninfo->outgoing_queue, this) >= vpninfo->max_qlen)
+                               used_idx = ring->seen_used + 1;
+
+                       did_work = 1;
+               }
+
+               /* Zero the descriptor and line it up in the next slot in the avail ring. */
+               ring->desc[desc].addr = 0;
+               ring->avail->ring[ring->seen_used++ & ring_mask] = vio32(desc);
+       }
+
+       /* Now handle 'avail' and prime the RX ring full of empty buffers, or
+        * the TX ring with anything we have on the VPN incoming queue. */
+       uint16_t next_avail = vio16(ring->avail->idx);
+       uint32_t desc = ring->avail->ring[next_avail & ring_mask];
+       while (!ring->desc[desc].addr) {
+               struct pkt *this;
+               if (tx) {
+                       this = dequeue_packet(&vpninfo->incoming_queue);
+                       if (!this)
+                               break;
+                       memset(&this->virtio.h, 0, sizeof(this->virtio.h));
+               } else {
+                       int len = vpninfo->ip_info.mtu;
+                       this = alloc_pkt(vpninfo, len + vpninfo->pkt_trailer);
+                       if (!this)
+                               break;
+                       this->len = len;
+               }
+
+               if (!tx)
+                       ring->desc[desc].flags = vio16(VRING_DESC_F_WRITE);
+               ring->desc[desc].addr = vio64((uint64_t)&this->virtio.h);
+               ring->desc[desc].len = vio32(this->len + sizeof(this->virtio.h));
+               barrier();
+
+               if (debug_vhost) {
+                       if (tx) {
+                               vpn_progress(vpninfo, PRG_TRACE,
+                                            _("Queue TX packet %p at desc %d avail %d\n"),
+                                            this, desc, next_avail);
+                               if (debug_vhost)
+                                       dump_buf_hex(vpninfo, PRG_TRACE, '>',
+                                                    (void *)&this->virtio.h,
+                                                    this->len + sizeof(this->virtio.h));
+                       } else
+                               vpn_progress(vpninfo, PRG_TRACE,
+                                            _("Queue RX packet %p at desc %d avail %d\n"),
+                                            this, desc, next_avail);
+               }
+
+
+               ring->avail->idx = vio16(++next_avail);
+               barrier();
+               uint16_t avail_event = (&ring->used->flags)[2 + (vpninfo->vhost_ring_size * 4)];
+               barrier();
+               if (avail_event == vio16(next_avail-1))
+                       *kick = 1;
+
+               desc = ring->avail->ring[next_avail & ring_mask];
+       }
+
+       return did_work;
+}
+
+static int set_ring_wake(struct openconnect_info *vpninfo, int tx)
+{
+       /* No wakeup for tun RX if the queue is already full. */
+       if (!tx && vpninfo->outgoing_queue.count >= vpninfo->max_qlen)
+               return 0;
+
+       struct oc_vring *ring = tx ? &vpninfo->tx_vring : &vpninfo->rx_vring;
+       uint16_t wake_idx = vio16(ring->seen_used);
+
+       /* Ask it to wake us if the used idx moves on. Note: used_event
+        * is at the end of the *avail* ring, and vice versa. */
+       ring->avail->ring[vpninfo->vhost_ring_size] = wake_idx;
+       barrier();
+
+       /* If it already did, loop again immediately */
+       if (ring->used->idx != wake_idx) {
+               vpn_progress(vpninfo, PRG_TRACE,
+                            _("Immediate wake because vhost ring moved on from 0x%x to 0x%x\n"),
+                            ring->used->idx, wake_idx);
+               return 1;
+       }
+
+       return 0;
+}
+
+int vhost_tun_mainloop(struct openconnect_info *vpninfo, int *timeout, int readable, int did_work)
+{
+       uint64_t kick = 0;
+
+       if (vpninfo->outgoing_queue.count < vpninfo->max_qlen) {
+               did_work += process_ring(vpninfo, 0, &kick);
+               if (vpninfo->quit_reason)
+                       return 0;
+       }
+
+       did_work += process_ring(vpninfo, 1, &kick);
+       if (vpninfo->quit_reason)
+               return 0;
+
+       if (kick) {
+               barrier();
+               write(vpninfo->vhost_kick_fd, &kick, sizeof(kick));
+               vpn_progress(vpninfo, PRG_TRACE,
+                            _("Kick vhost ring\n"));
+               did_work = 1;
+       }
+
+       /* We only read from the eventfd when we're done with *actual*
+        * work, which is when !did_work. Except in the cases where
+        * we race with setting the ring wakeup and have to go round
+        * again. */
+       if (!did_work && readable) {
+               uint64_t evt;
+               read(vpninfo->vhost_call_fd, &evt, sizeof(evt));
+       }
+
+       /* If we aren't going to have one more turn around the mainloop,
+        * set the wake event indices. And if we find the rings have
+        * moved on while we're doing that, take one more turn around
+        * the mainloop... */
+       return did_work || set_ring_wake(vpninfo, 1) || set_ring_wake(vpninfo, 0);
+}
index 801f0bc5401e6d2a67088be053f106f0a9d29980..768a94a73c97261f821aea6241972e2998d8f151 100644 (file)
--- a/www/changelog.xml
+++ b/www/changelog.xml
@@ -15,6 +15,8 @@
 <ul>
    <li><b>OpenConnect HEAD</b>
      <ul>
+       <li>When the queue length <i>(<tt>-Q</tt> option)</i> is 16 or more, try using <a
+       href="https://www.redhat.com/en/blog/virtqueues-and-virtio-ring-how-data-travels">vhost-net</a> to accelerate tun device access.</li>
        <li>Use <tt>epoll()</tt> where available.</li>
        <li>Support non-AEAD ciphersuites in DTLSv1.2 with AnyConnect. (<a href="https://gitlab.com/openconnect/openconnect/-/issues/249">#249</a>)</li>
        <li>Make <tt>tncc-emulate.py</tt> work with Python 3.7+. (<a href="https://gitlab.com/openconnect/openconnect/-/issues/152">!152</a>, <a href="https://gitlab.com/openconnect/openconnect/merge_requests/120">!120</a>)</li>