From c6594d64271704b335378e7b74c39fe4d4fcc777 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 6 Feb 2025 19:26:26 +0100 Subject: [PATCH 01/16] unroll: add generic loop unroll helpers There are cases when we need to explicitly unroll loops. For example, cache operations, filling DMA descriptors on very high speeds etc. Add compiler-specific attribute macros to give the compiler a hint that we'd like to unroll a loop. Example usage: #define UNROLL_BATCH 8 unrolled_count(UNROLL_BATCH) for (u32 i = 0; i < UNROLL_BATCH; i++) op(priv, i); Note that sometimes the compilers won't unroll loops if they think this would have worse optimization and perf than without unrolling, and that unroll attributes are available only starting GCC 8. For older compiler versions, no hints/attributes will be applied. For better unrolling/parallelization, don't have any variables that interfere between iterations except for the iterator itself. Co-developed-by: Jose E. Marchesi # pragmas Signed-off-by: Jose E. Marchesi Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20250206182630.3914318-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/unroll.h | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/include/linux/unroll.h b/include/linux/unroll.h index d42fd6366373..863fb69f6a7e 100644 --- a/include/linux/unroll.h +++ b/include/linux/unroll.h @@ -9,6 +9,50 @@ #include +#ifdef CONFIG_CC_IS_CLANG +#define __pick_unrolled(x, y) _Pragma(#x) +#elif CONFIG_GCC_VERSION >= 80000 +#define __pick_unrolled(x, y) _Pragma(#y) +#else +#define __pick_unrolled(x, y) /* not supported */ +#endif + +/** + * unrolled - loop attributes to ask the compiler to unroll it + * + * Usage: + * + * #define BATCH 8 + * + * unrolled_count(BATCH) + * for (u32 i = 0; i < BATCH; i++) + * // loop body without cross-iteration dependencies + * + * This is only a hint and the compiler is free to disable unrolling if it + * thinks the count is suboptimal and may hurt performance and/or hugely + * increase object code size. + * Not having any cross-iteration dependencies (i.e. when iter x + 1 depends + * on what iter x will do with variables) is not a strict requirement, but + * provides best performance and object code size. + * Available only on Clang and GCC 8.x onwards. + */ + +/* Ask the compiler to pick an optimal unroll count, Clang only */ +#define unrolled \ + __pick_unrolled(clang loop unroll(enable), /* nothing */) + +/* Unroll each @n iterations of the loop */ +#define unrolled_count(n) \ + __pick_unrolled(clang loop unroll_count(n), GCC unroll n) + +/* Unroll the whole loop */ +#define unrolled_full \ + __pick_unrolled(clang loop unroll(full), GCC unroll 65534) + +/* Never unroll the loop */ +#define unrolled_none \ + __pick_unrolled(clang loop unroll(disable), GCC unroll 1) + #define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args) #define __UNROLL_0(MACRO, args...) -- 2.50.1 From 9144e6f404da258a7620e66aadea953cf3b114d6 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 6 Feb 2025 19:26:27 +0100 Subject: [PATCH 02/16] i40e: use generic unrolled_count() macro i40e, as well as ice, has a custom loop unrolling macro for unrolling Tx descriptors filling on XSk xmit. Replace i40e defs with generic unrolled_count(), which is also more convenient as it allows passing defines as its argument, not hardcoded values, while the loop declaration will still be a usual for-loop. Signed-off-by: Alexander Lobakin Acked-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250206182630.3914318-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 +++- drivers/net/ethernet/intel/i40e/i40e_xsk.h | 10 +--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index e28f1905a4a0..9f47388eaba5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,6 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include +#include #include #include "i40e_txrx_common.h" #include "i40e_xsk.h" @@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des dma_addr_t dma; u32 i; - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + unrolled_count(PKTS_PER_BATCH) + for (i = 0; i < PKTS_PER_BATCH; i++) { u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]); dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index ef156fad52f2..dd16351a7af8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -6,7 +6,7 @@ #include -/* This value should match the pragma in the loop_unrolled_for +/* This value should match the pragma in the unrolled_count() * macro. Why 4? It is strictly empirical. It seems to be a good * compromise between the advantage of having simultaneous outstanding * reads to the DMA array that can hide each others latency and the @@ -14,14 +14,6 @@ */ #define PKTS_PER_BATCH 4 -#ifdef __clang__ -#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for -#elif __GNUC__ >= 8 -#define loop_unrolled_for _Pragma("GCC unroll 4") for -#else -#define loop_unrolled_for for -#endif - struct i40e_ring; struct i40e_vsi; struct net_device; -- 2.50.1 From 2fc6b26ac8aecd5f53164b00fe5d32417e1fbfc9 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 6 Feb 2025 19:26:28 +0100 Subject: [PATCH 03/16] ice: use generic unrolled_count() macro ice, same as i40e, has a custom loop unrolling macros for unrolling Tx descriptors filling on XSk xmit. Replace ice defs with generic unrolled_count(), which is also more convenient as it allows passing defines as its argument, not hardcoded values, while the loop declaration will still be usual for-loop. Signed-off-by: Alexander Lobakin Acked-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250206182630.3914318-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 +++- drivers/net/ethernet/intel/ice/ice_xsk.h | 8 -------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 8975d2971bc3..a3a4eaa17739 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. */ #include +#include #include #include #include "ice.h" @@ -989,7 +990,8 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct ice_tx_desc *tx_desc; u32 i; - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + unrolled_count(PKTS_PER_BATCH) + for (i = 0; i < PKTS_PER_BATCH; i++) { dma_addr_t dma; dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr); diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h index 45adeb513253..8dc5d55e26c5 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.h +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -7,14 +7,6 @@ #define PKTS_PER_BATCH 8 -#ifdef __clang__ -#define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for -#elif __GNUC__ >= 8 -#define loop_unrolled_for _Pragma("GCC unroll 8") for -#else -#define loop_unrolled_for for -#endif - struct ice_vsi; #ifdef CONFIG_XDP_SOCKETS -- 2.50.1 From 23d9324a27a48858cfdd7f0342f52328e8595c6d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 6 Feb 2025 19:26:29 +0100 Subject: [PATCH 04/16] xsk: add helper to get &xdp_desc's DMA and meta pointer in one go Currently, when your driver supports XSk Tx metadata and you want to send an XSk frame, you need to do the following: * call external xsk_buff_raw_get_dma(); * call inline xsk_buff_get_metadata(), which calls external xsk_buff_raw_get_data() and then do some inline checks. This effectively means that the following piece: addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; is done twice per frame, plus you have 2 external calls per frame, plus this: meta = pool->addrs + addr - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) is always inlined, even if there's no meta or it's invalid. Add xsk_buff_raw_get_ctx() (xp_raw_get_ctx() to be precise) to do that in one go. It returns a small structure with 2 fields: DMA address, filled unconditionally, and metadata pointer, non-NULL only if it's present and valid. The address correction is performed only once and you also have only 1 external call per XSk frame, which does all the calculations and checks outside of your hotpath. You only need to check `if (ctx.meta)` for the metadata presence. To not copy any existing code, derive address correction and getting virtual and DMA address into small helpers. bloat-o-meter reports no object code changes for the existing functionality. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20250206182630.3914318-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 43 +++++++++++++++++++++++++++++++--- include/net/xsk_buff_pool.h | 8 +++++++ net/xdp/xsk_buff_pool.c | 46 +++++++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 7 deletions(-) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 784cd34f5bba..15086dcf51d8 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -196,6 +196,23 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +/** + * xsk_buff_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for + * details. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return xp_raw_get_ctx(pool, addr); +} + #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ @@ -207,20 +224,27 @@ xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } -static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; - meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + meta = data - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -388,12 +412,25 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return (struct xdp_desc_ctx){ }; +} + static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } -static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + return NULL; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 50779406bc2d..1dcd4d71468a 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -141,6 +141,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); + +struct xdp_desc_ctx { + dma_addr_t dma; + struct xsk_tx_metadata *meta; +}; + +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); + static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b49657..c263fb7a68dc 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -699,18 +699,56 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr) +{ + return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; +} + +static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return pool->addrs + addr; } + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_data); -dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return (pool->dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); } + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_dma); + +/** + * xp_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Helper for getting desc's DMA address and metadata pointer, if present. + * Saves one call on hotpath, double calculation of the actual address, + * and inline checks for metadata presence and sanity. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_desc_ctx ret; + + addr = __xp_raw_get_addr(pool, addr); + + ret.dma = __xp_raw_get_dma(pool, addr); + ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr)); + + return ret; +} +EXPORT_SYMBOL(xp_raw_get_ctx); -- 2.50.1 From 848b09d53d923b4caee5491f57a5c5b22d81febc Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 6 Feb 2025 23:40:33 +0100 Subject: [PATCH 05/16] r8152: add vendor/device ID pair for Dell Alienware AW1022z The Dell AW1022z is an RTL8156B based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. Signed-off-by: Aleksander Jan Bajkowski Link: https://patch.msgid.link/20250206224033.980115-1-olek2@wp.pl Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 1 + include/linux/usb/r8152.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 468c73974046..e1021148d3a6 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -10079,6 +10079,7 @@ static const struct usb_device_id rtl8152_table[] = { { USB_DEVICE(VENDOR_ID_NVIDIA, 0x09ff) }, { USB_DEVICE(VENDOR_ID_TPLINK, 0x0601) }, { USB_DEVICE(VENDOR_ID_DLINK, 0xb301) }, + { USB_DEVICE(VENDOR_ID_DELL, 0xb097) }, { USB_DEVICE(VENDOR_ID_ASUS, 0x1976) }, {} }; diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 33a4c146dc19..2ca60828f28b 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -30,6 +30,7 @@ #define VENDOR_ID_NVIDIA 0x0955 #define VENDOR_ID_TPLINK 0x2357 #define VENDOR_ID_DLINK 0x2001 +#define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 #if IS_REACHABLE(CONFIG_USB_RTL8152) -- 2.50.1 From e76d1ea8cb18b6aa389a0d3202ffc63ed278215d Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 6 Feb 2025 15:10:33 -0500 Subject: [PATCH 06/16] net: xilinx: axienet: Combine CR calculation Combine the common parts of the CR calculations for better code reuse. While we're at it, simplify the code a bit. Signed-off-by: Sean Anderson Reviewed-by: Shannon Nelson Link: https://patch.msgid.link/20250206201036.1516800-2-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 3 - .../net/ethernet/xilinx/xilinx_axienet_main.c | 64 ++++++++++--------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index a3f4f3e42587..8fd3b45ef6aa 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -112,9 +112,6 @@ #define XAXIDMA_DELAY_MASK 0xFF000000 /* Delay timeout counter */ #define XAXIDMA_COALESCE_MASK 0x00FF0000 /* Coalesce counter */ -#define XAXIDMA_DELAY_SHIFT 24 -#define XAXIDMA_COALESCE_SHIFT 16 - #define XAXIDMA_IRQ_IOC_MASK 0x00001000 /* Completion intr */ #define XAXIDMA_IRQ_DELAY_MASK 0x00002000 /* Delay interrupt */ #define XAXIDMA_IRQ_ERROR_MASK 0x00004000 /* Error interrupt */ diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 9e7fa012e4fa..bd95f2ba3bae 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -224,22 +224,40 @@ static void axienet_dma_bd_release(struct net_device *ndev) } /** - * axienet_usec_to_timer - Calculate IRQ delay timer value - * @lp: Pointer to the axienet_local structure - * @coalesce_usec: Microseconds to convert into timer value + * axienet_calc_cr() - Calculate control register value + * @lp: Device private data + * @count: Number of completions before an interrupt + * @usec: Microseconds after the last completion before an interrupt + * + * Calculate a control register value based on the coalescing settings. The + * run/stop bit is not set. */ -static u32 axienet_usec_to_timer(struct axienet_local *lp, u32 coalesce_usec) +static u32 axienet_calc_cr(struct axienet_local *lp, u32 count, u32 usec) { - u32 result; - u64 clk_rate = 125000000; /* arbitrary guess if no clock rate set */ + u32 cr; + + cr = FIELD_PREP(XAXIDMA_COALESCE_MASK, count) | XAXIDMA_IRQ_IOC_MASK | + XAXIDMA_IRQ_ERROR_MASK; + /* Only set interrupt delay timer if not generating an interrupt on + * the first packet. Otherwise leave at 0 to disable delay interrupt. + */ + if (count > 1) { + u64 clk_rate = 125000000; /* arbitrary guess if no clock rate set */ + u32 timer; + + if (lp->axi_clk) + clk_rate = clk_get_rate(lp->axi_clk); - if (lp->axi_clk) - clk_rate = clk_get_rate(lp->axi_clk); + /* 1 Timeout Interval = 125 * (clock period of SG clock) */ + timer = DIV64_U64_ROUND_CLOSEST((u64)usec * clk_rate, + XAXIDMA_DELAY_SCALE); - /* 1 Timeout Interval = 125 * (clock period of SG clock) */ - result = DIV64_U64_ROUND_CLOSEST((u64)coalesce_usec * clk_rate, - XAXIDMA_DELAY_SCALE); - return min(result, FIELD_MAX(XAXIDMA_DELAY_MASK)); + timer = min(timer, FIELD_MAX(XAXIDMA_DELAY_MASK)); + cr |= FIELD_PREP(XAXIDMA_DELAY_MASK, timer) | + XAXIDMA_IRQ_DELAY_MASK; + } + + return cr; } /** @@ -249,27 +267,13 @@ static u32 axienet_usec_to_timer(struct axienet_local *lp, u32 coalesce_usec) static void axienet_dma_start(struct axienet_local *lp) { /* Start updating the Rx channel control register */ - lp->rx_dma_cr = (lp->coalesce_count_rx << XAXIDMA_COALESCE_SHIFT) | - XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK; - /* Only set interrupt delay timer if not generating an interrupt on - * the first RX packet. Otherwise leave at 0 to disable delay interrupt. - */ - if (lp->coalesce_count_rx > 1) - lp->rx_dma_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_rx) - << XAXIDMA_DELAY_SHIFT) | - XAXIDMA_IRQ_DELAY_MASK; + lp->rx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_rx, + lp->coalesce_usec_rx); axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr); /* Start updating the Tx channel control register */ - lp->tx_dma_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) | - XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK; - /* Only set interrupt delay timer if not generating an interrupt on - * the first TX packet. Otherwise leave at 0 to disable delay interrupt. - */ - if (lp->coalesce_count_tx > 1) - lp->tx_dma_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx) - << XAXIDMA_DELAY_SHIFT) | - XAXIDMA_IRQ_DELAY_MASK; + lp->tx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_tx, + lp->coalesce_usec_tx); axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr); /* Populate the tail pointer and bring the Rx Axi DMA engine out of -- 2.50.1 From d048c717df33baf337a58b5d74c855a74abf4e04 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 6 Feb 2025 15:10:34 -0500 Subject: [PATCH 07/16] net: xilinx: axienet: Support adjusting coalesce settings while running In preparation for adaptive IRQ coalescing, we first need to support adjusting the settings at runtime. The existing code doesn't require any locking because - dma_start is the only function that modifies rx/tx_dma_cr. It is always called with IRQs and NAPI disabled, so nothing else is touching the hardware. - The IRQs don't race with poll, since the latter is a softirq. - The IRQs don't race with dma_stop since they both just clear the control registers. - dma_stop doesn't race with poll since the former is called with NAPI disabled. However, once we introduce another function that modifies rx/tx_dma_cr, we need to have some locking to prevent races. Introduce two locks to protect these variables and their registers. The control register values are now generated where the coalescing settings are set. Converting coalescing settings to control register values may require sleeping because of clk_get_rate. However, the read/modify/write of the control registers themselves can't sleep because it needs to happen in IRQ context. By pre-calculating the control register values, we avoid introducing an additional mutex. Since axienet_dma_start writes the control settings when it runs, we don't bother updating the CR registers when rx/tx_dma_started is false. This prevents any issues from writing to the control registers in the middle of a reset sequence. Signed-off-by: Sean Anderson Reviewed-by: Shannon Nelson Link: https://patch.msgid.link/20250206201036.1516800-3-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 8 ++ .../net/ethernet/xilinx/xilinx_axienet_main.c | 134 +++++++++++++++--- 2 files changed, 119 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index 8fd3b45ef6aa..6b8e550c2155 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -484,7 +484,9 @@ struct skbuf_dma_descriptor { * @regs: Base address for the axienet_local device address space * @dma_regs: Base address for the axidma device address space * @napi_rx: NAPI RX control structure + * @rx_cr_lock: Lock protecting @rx_dma_cr, its register, and @rx_dma_started * @rx_dma_cr: Nominal content of RX DMA control register + * @rx_dma_started: Set when RX DMA is started * @rx_bd_v: Virtual address of the RX buffer descriptor ring * @rx_bd_p: Physical address(start address) of the RX buffer descr. ring * @rx_bd_num: Size of RX buffer descriptor ring @@ -494,7 +496,9 @@ struct skbuf_dma_descriptor { * @rx_bytes: RX byte count for statistics * @rx_stat_sync: Synchronization object for RX stats * @napi_tx: NAPI TX control structure + * @tx_cr_lock: Lock protecting @tx_dma_cr, its register, and @tx_dma_started * @tx_dma_cr: Nominal content of TX DMA control register + * @tx_dma_started: Set when TX DMA is started * @tx_bd_v: Virtual address of the TX buffer descriptor ring * @tx_bd_p: Physical address(start address) of the TX buffer descr. ring * @tx_bd_num: Size of TX buffer descriptor ring @@ -566,7 +570,9 @@ struct axienet_local { void __iomem *dma_regs; struct napi_struct napi_rx; + spinlock_t rx_cr_lock; u32 rx_dma_cr; + bool rx_dma_started; struct axidma_bd *rx_bd_v; dma_addr_t rx_bd_p; u32 rx_bd_num; @@ -576,7 +582,9 @@ struct axienet_local { struct u64_stats_sync rx_stat_sync; struct napi_struct napi_tx; + spinlock_t tx_cr_lock; u32 tx_dma_cr; + bool tx_dma_started; struct axidma_bd *tx_bd_v; dma_addr_t tx_bd_p; u32 tx_bd_num; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index bd95f2ba3bae..b70e65757454 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -266,16 +266,12 @@ static u32 axienet_calc_cr(struct axienet_local *lp, u32 count, u32 usec) */ static void axienet_dma_start(struct axienet_local *lp) { + spin_lock_irq(&lp->rx_cr_lock); + /* Start updating the Rx channel control register */ - lp->rx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_rx, - lp->coalesce_usec_rx); + lp->rx_dma_cr &= ~XAXIDMA_CR_RUNSTOP_MASK; axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr); - /* Start updating the Tx channel control register */ - lp->tx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_tx, - lp->coalesce_usec_tx); - axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr); - /* Populate the tail pointer and bring the Rx Axi DMA engine out of * halted state. This will make the Rx side ready for reception. */ @@ -284,6 +280,14 @@ static void axienet_dma_start(struct axienet_local *lp) axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr); axienet_dma_out_addr(lp, XAXIDMA_RX_TDESC_OFFSET, lp->rx_bd_p + (sizeof(*lp->rx_bd_v) * (lp->rx_bd_num - 1))); + lp->rx_dma_started = true; + + spin_unlock_irq(&lp->rx_cr_lock); + spin_lock_irq(&lp->tx_cr_lock); + + /* Start updating the Tx channel control register */ + lp->tx_dma_cr &= ~XAXIDMA_CR_RUNSTOP_MASK; + axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr); /* Write to the RS (Run-stop) bit in the Tx channel control register. * Tx channel is now ready to run. But only after we write to the @@ -292,6 +296,9 @@ static void axienet_dma_start(struct axienet_local *lp) axienet_dma_out_addr(lp, XAXIDMA_TX_CDESC_OFFSET, lp->tx_bd_p); lp->tx_dma_cr |= XAXIDMA_CR_RUNSTOP_MASK; axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr); + lp->tx_dma_started = true; + + spin_unlock_irq(&lp->tx_cr_lock); } /** @@ -627,14 +634,22 @@ static void axienet_dma_stop(struct axienet_local *lp) int count; u32 cr, sr; - cr = axienet_dma_in32(lp, XAXIDMA_RX_CR_OFFSET); - cr &= ~(XAXIDMA_CR_RUNSTOP_MASK | XAXIDMA_IRQ_ALL_MASK); + spin_lock_irq(&lp->rx_cr_lock); + + cr = lp->rx_dma_cr & ~(XAXIDMA_CR_RUNSTOP_MASK | XAXIDMA_IRQ_ALL_MASK); axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, cr); + lp->rx_dma_started = false; + + spin_unlock_irq(&lp->rx_cr_lock); synchronize_irq(lp->rx_irq); - cr = axienet_dma_in32(lp, XAXIDMA_TX_CR_OFFSET); - cr &= ~(XAXIDMA_CR_RUNSTOP_MASK | XAXIDMA_IRQ_ALL_MASK); + spin_lock_irq(&lp->tx_cr_lock); + + cr = lp->tx_dma_cr & ~(XAXIDMA_CR_RUNSTOP_MASK | XAXIDMA_IRQ_ALL_MASK); axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr); + lp->tx_dma_started = false; + + spin_unlock_irq(&lp->tx_cr_lock); synchronize_irq(lp->tx_irq); /* Give DMAs a chance to halt gracefully */ @@ -983,7 +998,9 @@ static int axienet_tx_poll(struct napi_struct *napi, int budget) * cause an immediate interrupt if any TX packets are * already pending. */ + spin_lock_irq(&lp->tx_cr_lock); axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr); + spin_unlock_irq(&lp->tx_cr_lock); } return packets; } @@ -1249,7 +1266,9 @@ static int axienet_rx_poll(struct napi_struct *napi, int budget) * cause an immediate interrupt if any RX packets are * already pending. */ + spin_lock_irq(&lp->rx_cr_lock); axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr); + spin_unlock_irq(&lp->rx_cr_lock); } return packets; } @@ -1287,11 +1306,14 @@ static irqreturn_t axienet_tx_irq(int irq, void *_ndev) /* Disable further TX completion interrupts and schedule * NAPI to handle the completions. */ - u32 cr = lp->tx_dma_cr; - - cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK); if (napi_schedule_prep(&lp->napi_tx)) { + u32 cr; + + spin_lock(&lp->tx_cr_lock); + cr = lp->tx_dma_cr; + cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK); axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr); + spin_unlock(&lp->tx_cr_lock); __napi_schedule(&lp->napi_tx); } } @@ -1332,11 +1354,15 @@ static irqreturn_t axienet_rx_irq(int irq, void *_ndev) /* Disable further RX completion interrupts and schedule * NAPI receive. */ - u32 cr = lp->rx_dma_cr; - - cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK); if (napi_schedule_prep(&lp->napi_rx)) { + u32 cr; + + spin_lock(&lp->rx_cr_lock); + cr = lp->rx_dma_cr; + cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK); axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, cr); + spin_unlock(&lp->rx_cr_lock); + __napi_schedule(&lp->napi_rx); } } @@ -2002,6 +2028,62 @@ axienet_ethtools_set_pauseparam(struct net_device *ndev, return phylink_ethtool_set_pauseparam(lp->phylink, epauseparm); } +/** + * axienet_update_coalesce_rx() - Set RX CR + * @lp: Device private data + * @cr: Value to write to the RX CR + * @mask: Bits to set from @cr + */ +static void axienet_update_coalesce_rx(struct axienet_local *lp, u32 cr, + u32 mask) +{ + spin_lock_irq(&lp->rx_cr_lock); + lp->rx_dma_cr &= ~mask; + lp->rx_dma_cr |= cr; + /* If DMA isn't started, then the settings will be applied the next + * time dma_start() is called. + */ + if (lp->rx_dma_started) { + u32 reg = axienet_dma_in32(lp, XAXIDMA_RX_CR_OFFSET); + + /* Don't enable IRQs if they are disabled by NAPI */ + if (reg & XAXIDMA_IRQ_ALL_MASK) + cr = lp->rx_dma_cr; + else + cr = lp->rx_dma_cr & ~XAXIDMA_IRQ_ALL_MASK; + axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, cr); + } + spin_unlock_irq(&lp->rx_cr_lock); +} + +/** + * axienet_update_coalesce_tx() - Set TX CR + * @lp: Device private data + * @cr: Value to write to the TX CR + * @mask: Bits to set from @cr + */ +static void axienet_update_coalesce_tx(struct axienet_local *lp, u32 cr, + u32 mask) +{ + spin_lock_irq(&lp->tx_cr_lock); + lp->tx_dma_cr &= ~mask; + lp->tx_dma_cr |= cr; + /* If DMA isn't started, then the settings will be applied the next + * time dma_start() is called. + */ + if (lp->tx_dma_started) { + u32 reg = axienet_dma_in32(lp, XAXIDMA_TX_CR_OFFSET); + + /* Don't enable IRQs if they are disabled by NAPI */ + if (reg & XAXIDMA_IRQ_ALL_MASK) + cr = lp->tx_dma_cr; + else + cr = lp->tx_dma_cr & ~XAXIDMA_IRQ_ALL_MASK; + axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr); + } + spin_unlock_irq(&lp->tx_cr_lock); +} + /** * axienet_ethtools_get_coalesce - Get DMA interrupt coalescing count. * @ndev: Pointer to net_device structure @@ -2050,12 +2132,7 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, struct netlink_ext_ack *extack) { struct axienet_local *lp = netdev_priv(ndev); - - if (netif_running(ndev)) { - NL_SET_ERR_MSG(extack, - "Please stop netif before applying configuration"); - return -EBUSY; - } + u32 cr; if (ecoalesce->rx_max_coalesced_frames > 255 || ecoalesce->tx_max_coalesced_frames > 255) { @@ -2083,6 +2160,11 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, lp->coalesce_count_tx = ecoalesce->tx_max_coalesced_frames; lp->coalesce_usec_tx = ecoalesce->tx_coalesce_usecs; + cr = axienet_calc_cr(lp, lp->coalesce_count_rx, lp->coalesce_usec_rx); + axienet_update_coalesce_rx(lp, cr, ~XAXIDMA_CR_RUNSTOP_MASK); + + cr = axienet_calc_cr(lp, lp->coalesce_count_tx, lp->coalesce_usec_tx); + axienet_update_coalesce_tx(lp, cr, ~XAXIDMA_CR_RUNSTOP_MASK); return 0; } @@ -2862,10 +2944,16 @@ static int axienet_probe(struct platform_device *pdev) axienet_set_mac_address(ndev, NULL); } + spin_lock_init(&lp->rx_cr_lock); + spin_lock_init(&lp->tx_cr_lock); lp->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD; lp->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD; lp->coalesce_usec_rx = XAXIDMA_DFT_RX_USEC; lp->coalesce_usec_tx = XAXIDMA_DFT_TX_USEC; + lp->rx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_rx, + lp->coalesce_usec_rx); + lp->tx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_tx, + lp->coalesce_usec_tx); ret = axienet_mdio_setup(lp); if (ret) -- 2.50.1 From eb80520e8a5be676cd546c2503aa78653ff4b026 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 6 Feb 2025 15:10:35 -0500 Subject: [PATCH 08/16] net: xilinx: axienet: Get coalesce parameters from driver state The cr variables now contain the same values as the control registers themselves. Extract/calculate the values from the variables instead of saving the user-specified values. This allows us to remove some bookeeping, and also lets the user know what the actual coalesce settings are. Signed-off-by: Sean Anderson Reviewed by: Shannon Nelson Link: https://patch.msgid.link/20250206201036.1516800-4-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 8 --- .../net/ethernet/xilinx/xilinx_axienet_main.c | 70 +++++++++++++------ 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index 6b8e550c2155..45d8d80dbb1a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -533,10 +533,6 @@ struct skbuf_dma_descriptor { * supported, the maximum frame size would be 9k. Else it is * 1522 bytes (assuming support for basic VLAN) * @rxmem: Stores rx memory size for jumbo frame handling. - * @coalesce_count_rx: Store the irq coalesce on RX side. - * @coalesce_usec_rx: IRQ coalesce delay for RX - * @coalesce_count_tx: Store the irq coalesce on TX side. - * @coalesce_usec_tx: IRQ coalesce delay for TX * @use_dmaengine: flag to check dmaengine framework usage. * @tx_chan: TX DMA channel. * @rx_chan: RX DMA channel. @@ -615,10 +611,6 @@ struct axienet_local { u32 max_frm_size; u32 rxmem; - u32 coalesce_count_rx; - u32 coalesce_usec_rx; - u32 coalesce_count_tx; - u32 coalesce_usec_tx; u8 use_dmaengine; struct dma_chan *tx_chan; struct dma_chan *rx_chan; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index b70e65757454..da3d4193674e 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -223,6 +223,13 @@ static void axienet_dma_bd_release(struct net_device *ndev) lp->rx_bd_p); } +static u64 axienet_dma_rate(struct axienet_local *lp) +{ + if (lp->axi_clk) + return clk_get_rate(lp->axi_clk); + return 125000000; /* arbitrary guess if no clock rate set */ +} + /** * axienet_calc_cr() - Calculate control register value * @lp: Device private data @@ -242,12 +249,9 @@ static u32 axienet_calc_cr(struct axienet_local *lp, u32 count, u32 usec) * the first packet. Otherwise leave at 0 to disable delay interrupt. */ if (count > 1) { - u64 clk_rate = 125000000; /* arbitrary guess if no clock rate set */ + u64 clk_rate = axienet_dma_rate(lp); u32 timer; - if (lp->axi_clk) - clk_rate = clk_get_rate(lp->axi_clk); - /* 1 Timeout Interval = 125 * (clock period of SG clock) */ timer = DIV64_U64_ROUND_CLOSEST((u64)usec * clk_rate, XAXIDMA_DELAY_SCALE); @@ -260,6 +264,23 @@ static u32 axienet_calc_cr(struct axienet_local *lp, u32 count, u32 usec) return cr; } +/** + * axienet_coalesce_params() - Extract coalesce parameters from the CR + * @lp: Device private data + * @cr: The control register to parse + * @count: Number of packets before an interrupt + * @usec: Idle time (in usec) before an interrupt + */ +static void axienet_coalesce_params(struct axienet_local *lp, u32 cr, + u32 *count, u32 *usec) +{ + u64 clk_rate = axienet_dma_rate(lp); + u64 timer = FIELD_GET(XAXIDMA_DELAY_MASK, cr); + + *count = FIELD_GET(XAXIDMA_COALESCE_MASK, cr); + *usec = DIV64_U64_ROUND_CLOSEST(timer * XAXIDMA_DELAY_SCALE, clk_rate); +} + /** * axienet_dma_start - Set up DMA registers and start DMA operation * @lp: Pointer to the axienet_local structure @@ -2104,11 +2125,21 @@ axienet_ethtools_get_coalesce(struct net_device *ndev, struct netlink_ext_ack *extack) { struct axienet_local *lp = netdev_priv(ndev); + u32 cr; - ecoalesce->rx_max_coalesced_frames = lp->coalesce_count_rx; - ecoalesce->rx_coalesce_usecs = lp->coalesce_usec_rx; - ecoalesce->tx_max_coalesced_frames = lp->coalesce_count_tx; - ecoalesce->tx_coalesce_usecs = lp->coalesce_usec_tx; + spin_lock_irq(&lp->rx_cr_lock); + cr = lp->rx_dma_cr; + spin_unlock_irq(&lp->rx_cr_lock); + axienet_coalesce_params(lp, cr, + &ecoalesce->rx_max_coalesced_frames, + &ecoalesce->rx_coalesce_usecs); + + spin_lock_irq(&lp->tx_cr_lock); + cr = lp->tx_dma_cr; + spin_unlock_irq(&lp->tx_cr_lock); + axienet_coalesce_params(lp, cr, + &ecoalesce->tx_max_coalesced_frames, + &ecoalesce->tx_coalesce_usecs); return 0; } @@ -2155,15 +2186,12 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, return -EINVAL; } - lp->coalesce_count_rx = ecoalesce->rx_max_coalesced_frames; - lp->coalesce_usec_rx = ecoalesce->rx_coalesce_usecs; - lp->coalesce_count_tx = ecoalesce->tx_max_coalesced_frames; - lp->coalesce_usec_tx = ecoalesce->tx_coalesce_usecs; - - cr = axienet_calc_cr(lp, lp->coalesce_count_rx, lp->coalesce_usec_rx); + cr = axienet_calc_cr(lp, ecoalesce->rx_max_coalesced_frames, + ecoalesce->rx_coalesce_usecs); axienet_update_coalesce_rx(lp, cr, ~XAXIDMA_CR_RUNSTOP_MASK); - cr = axienet_calc_cr(lp, lp->coalesce_count_tx, lp->coalesce_usec_tx); + cr = axienet_calc_cr(lp, ecoalesce->tx_max_coalesced_frames, + ecoalesce->tx_coalesce_usecs); axienet_update_coalesce_tx(lp, cr, ~XAXIDMA_CR_RUNSTOP_MASK); return 0; } @@ -2946,14 +2974,10 @@ static int axienet_probe(struct platform_device *pdev) spin_lock_init(&lp->rx_cr_lock); spin_lock_init(&lp->tx_cr_lock); - lp->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD; - lp->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD; - lp->coalesce_usec_rx = XAXIDMA_DFT_RX_USEC; - lp->coalesce_usec_tx = XAXIDMA_DFT_TX_USEC; - lp->rx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_rx, - lp->coalesce_usec_rx); - lp->tx_dma_cr = axienet_calc_cr(lp, lp->coalesce_count_tx, - lp->coalesce_usec_tx); + lp->rx_dma_cr = axienet_calc_cr(lp, XAXIDMA_DFT_RX_THRESHOLD, + XAXIDMA_DFT_RX_USEC); + lp->tx_dma_cr = axienet_calc_cr(lp, XAXIDMA_DFT_TX_THRESHOLD, + XAXIDMA_DFT_TX_USEC); ret = axienet_mdio_setup(lp); if (ret) -- 2.50.1 From e1d27d29dbe5139cd6b9a5e06bfe6e18149d1bff Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 6 Feb 2025 15:10:36 -0500 Subject: [PATCH 09/16] net: xilinx: axienet: Enable adaptive IRQ coalescing with DIM The default RX IRQ coalescing settings of one IRQ per packet can represent a significant CPU load. However, increasing the coalescing unilaterally can result in undesirable latency under low load. Adaptive IRQ coalescing with DIM offers a way to adjust the coalescing settings based on load. This device only supports "CQE" mode [1], where each packet resets the timer. Therefore, an interrupt is fired either when we receive coalesce_count_rx packets or when the interface is idle for coalesce_usec_rx. With this in mind, consider the following scenarios: Link saturated Here we want to set coalesce_count_rx to a large value, in order to coalesce more packets and reduce CPU load. coalesce_usec_rx should be set to at least the time for one packet. Otherwise the link will be "idle" and we will get an interrupt for each packet anyway. Bursts of packets Each burst should be coalesced into a single interrupt, although it may be prudent to reduce coalesce_count_rx for better latency. coalesce_usec_rx should be set to at least the time for one packet so bursts are coalesced. However, additional time beyond the packet time will just increase latency at the end of a burst. Sporadic packets Due to low load, we can set coalesce_count_rx to 1 in order to reduce latency to the minimum. coalesce_usec_rx does not matter in this case. Based on this analysis, I expected the CQE profiles to look something like usec = 0, pkts = 1 // Low load usec = 16, pkts = 4 usec = 16, pkts = 16 usec = 16, pkts = 64 usec = 16, pkts = 256 // High load Where usec is set to 16 to be a few us greater than the 12.3 us packet time of a 1500 MTU packet at 1 GBit/s. However, the CQE profile is instead usec = 2, pkts = 256 // Low load usec = 8, pkts = 128 usec = 16, pkts = 64 usec = 32, pkts = 64 usec = 64, pkts = 64 // High load I found this very surprising. The number of coalesced packets *decreases* as load increases. But as load increases we have more opportunities to coalesce packets without affecting latency as much. Additionally, the profile *increases* the usec as the load increases. But as load increases, the gaps between packets will tend to become smaller, making it possible to *decrease* usec for better latency at the end of a "burst". I consider the default CQE profile unsuitable for this NIC. Therefore, we use the first profile outlined in this commit instead. coalesce_usec_rx is set to 16 by default, but the user can customize it. This may be necessary if they are using jumbo frames. I think adjusting the profile times based on the link speed/mtu would be good improvement for generic DIM. In addition to the above profile problems, I noticed the following additional issues with DIM while testing: - DIM tends to "wander" when at low load, since the performance gradient is pretty flat. If you only have 10p/ms anyway then adjusting the coalescing settings will not affect throughput very much. - DIM takes a long time to adjust back to low indices when load is decreased following a period of high load. This is because it only re-evaluates its settings once every 64 interrupts. However, at low load 64 interrupts can be several seconds. Finally: performance. This patch increases receive throughput with iperf3 from 840 Mbits/sec to 938 Mbits/sec, decreases interrupts from 69920/sec to 316/sec, and decreases CPU utilization (4x Cortex-A53) from 43% to 9%. [1] Who names this stuff? Signed-off-by: Sean Anderson Reviewed by: Shannon Nelson Link: https://patch.msgid.link/20250206201036.1516800-5-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/Kconfig | 1 + drivers/net/ethernet/xilinx/xilinx_axienet.h | 10 ++- .../net/ethernet/xilinx/xilinx_axienet_main.c | 80 +++++++++++++++++-- 3 files changed, 82 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/xilinx/Kconfig b/drivers/net/ethernet/xilinx/Kconfig index 35d96c633a33..7502214cc7d5 100644 --- a/drivers/net/ethernet/xilinx/Kconfig +++ b/drivers/net/ethernet/xilinx/Kconfig @@ -28,6 +28,7 @@ config XILINX_AXI_EMAC depends on HAS_IOMEM depends on XILINX_DMA select PHYLINK + select DIMLIB help This driver supports the 10/100/1000 Ethernet from Xilinx for the AXI bus interface used in Xilinx Virtex FPGAs and Soc's. diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index 45d8d80dbb1a..5ff742103beb 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -9,6 +9,7 @@ #ifndef XILINX_AXIENET_H #define XILINX_AXIENET_H +#include #include #include #include @@ -123,8 +124,7 @@ /* Default TX/RX Threshold and delay timer values for SGDMA mode */ #define XAXIDMA_DFT_TX_THRESHOLD 24 #define XAXIDMA_DFT_TX_USEC 50 -#define XAXIDMA_DFT_RX_THRESHOLD 1 -#define XAXIDMA_DFT_RX_USEC 50 +#define XAXIDMA_DFT_RX_USEC 16 #define XAXIDMA_BD_CTRL_TXSOF_MASK 0x08000000 /* First tx packet */ #define XAXIDMA_BD_CTRL_TXEOF_MASK 0x04000000 /* Last tx packet */ @@ -484,6 +484,9 @@ struct skbuf_dma_descriptor { * @regs: Base address for the axienet_local device address space * @dma_regs: Base address for the axidma device address space * @napi_rx: NAPI RX control structure + * @rx_dim: DIM state for the receive queue + * @rx_dim_enabled: Whether DIM is enabled or not + * @rx_irqs: Number of interrupts * @rx_cr_lock: Lock protecting @rx_dma_cr, its register, and @rx_dma_started * @rx_dma_cr: Nominal content of RX DMA control register * @rx_dma_started: Set when RX DMA is started @@ -566,6 +569,9 @@ struct axienet_local { void __iomem *dma_regs; struct napi_struct napi_rx; + struct dim rx_dim; + bool rx_dim_enabled; + u16 rx_irqs; spinlock_t rx_cr_lock; u32 rx_dma_cr; bool rx_dma_started; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index da3d4193674e..0673b2694e4c 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1283,6 +1283,18 @@ static int axienet_rx_poll(struct napi_struct *napi, int budget) axienet_dma_out_addr(lp, XAXIDMA_RX_TDESC_OFFSET, tail_p); if (packets < budget && napi_complete_done(napi, packets)) { + if (READ_ONCE(lp->rx_dim_enabled)) { + struct dim_sample sample = { + .time = ktime_get(), + /* Safe because we are the only writer */ + .pkt_ctr = u64_stats_read(&lp->rx_packets), + .byte_ctr = u64_stats_read(&lp->rx_bytes), + .event_ctr = READ_ONCE(lp->rx_irqs), + }; + + net_dim(&lp->rx_dim, &sample); + } + /* Re-enable RX completion interrupts. This should * cause an immediate interrupt if any RX packets are * already pending. @@ -1375,6 +1387,7 @@ static irqreturn_t axienet_rx_irq(int irq, void *_ndev) /* Disable further RX completion interrupts and schedule * NAPI receive. */ + WRITE_ONCE(lp->rx_irqs, READ_ONCE(lp->rx_irqs) + 1); if (napi_schedule_prep(&lp->napi_rx)) { u32 cr; @@ -1676,6 +1689,7 @@ err_free_eth_irq: if (lp->eth_irq > 0) free_irq(lp->eth_irq, ndev); err_phy: + cancel_work_sync(&lp->rx_dim.work); cancel_delayed_work_sync(&lp->stats_work); phylink_stop(lp->phylink); phylink_disconnect_phy(lp->phylink); @@ -1705,6 +1719,7 @@ static int axienet_stop(struct net_device *ndev) napi_disable(&lp->napi_rx); } + cancel_work_sync(&lp->rx_dim.work); cancel_delayed_work_sync(&lp->stats_work); phylink_stop(lp->phylink); @@ -2077,6 +2092,31 @@ static void axienet_update_coalesce_rx(struct axienet_local *lp, u32 cr, spin_unlock_irq(&lp->rx_cr_lock); } +/** + * axienet_dim_coalesce_count_rx() - RX coalesce count for DIM + * @lp: Device private data + */ +static u32 axienet_dim_coalesce_count_rx(struct axienet_local *lp) +{ + return min(1 << (lp->rx_dim.profile_ix << 1), 255); +} + +/** + * axienet_rx_dim_work() - Adjust RX DIM settings + * @work: The work struct + */ +static void axienet_rx_dim_work(struct work_struct *work) +{ + struct axienet_local *lp = + container_of(work, struct axienet_local, rx_dim.work); + u32 cr = axienet_calc_cr(lp, axienet_dim_coalesce_count_rx(lp), 0); + u32 mask = XAXIDMA_COALESCE_MASK | XAXIDMA_IRQ_IOC_MASK | + XAXIDMA_IRQ_ERROR_MASK; + + axienet_update_coalesce_rx(lp, cr, mask); + lp->rx_dim.state = DIM_START_MEASURE; +} + /** * axienet_update_coalesce_tx() - Set TX CR * @lp: Device private data @@ -2127,6 +2167,8 @@ axienet_ethtools_get_coalesce(struct net_device *ndev, struct axienet_local *lp = netdev_priv(ndev); u32 cr; + ecoalesce->use_adaptive_rx_coalesce = lp->rx_dim_enabled; + spin_lock_irq(&lp->rx_cr_lock); cr = lp->rx_dma_cr; spin_unlock_irq(&lp->rx_cr_lock); @@ -2163,7 +2205,9 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, struct netlink_ext_ack *extack) { struct axienet_local *lp = netdev_priv(ndev); - u32 cr; + bool new_dim = ecoalesce->use_adaptive_rx_coalesce; + bool old_dim = lp->rx_dim_enabled; + u32 cr, mask = ~XAXIDMA_CR_RUNSTOP_MASK; if (ecoalesce->rx_max_coalesced_frames > 255 || ecoalesce->tx_max_coalesced_frames > 255) { @@ -2177,7 +2221,7 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, return -EINVAL; } - if ((ecoalesce->rx_max_coalesced_frames > 1 && + if (((ecoalesce->rx_max_coalesced_frames > 1 || new_dim) && !ecoalesce->rx_coalesce_usecs) || (ecoalesce->tx_max_coalesced_frames > 1 && !ecoalesce->tx_coalesce_usecs)) { @@ -2186,9 +2230,27 @@ axienet_ethtools_set_coalesce(struct net_device *ndev, return -EINVAL; } - cr = axienet_calc_cr(lp, ecoalesce->rx_max_coalesced_frames, - ecoalesce->rx_coalesce_usecs); - axienet_update_coalesce_rx(lp, cr, ~XAXIDMA_CR_RUNSTOP_MASK); + if (new_dim && !old_dim) { + cr = axienet_calc_cr(lp, axienet_dim_coalesce_count_rx(lp), + ecoalesce->rx_coalesce_usecs); + } else if (!new_dim) { + if (old_dim) { + WRITE_ONCE(lp->rx_dim_enabled, false); + napi_synchronize(&lp->napi_rx); + flush_work(&lp->rx_dim.work); + } + + cr = axienet_calc_cr(lp, ecoalesce->rx_max_coalesced_frames, + ecoalesce->rx_coalesce_usecs); + } else { + /* Dummy value for count just to calculate timer */ + cr = axienet_calc_cr(lp, 2, ecoalesce->rx_coalesce_usecs); + mask = XAXIDMA_DELAY_MASK | XAXIDMA_IRQ_DELAY_MASK; + } + + axienet_update_coalesce_rx(lp, cr, mask); + if (new_dim && !old_dim) + WRITE_ONCE(lp->rx_dim_enabled, true); cr = axienet_calc_cr(lp, ecoalesce->tx_max_coalesced_frames, ecoalesce->tx_coalesce_usecs); @@ -2430,7 +2492,8 @@ axienet_ethtool_get_rmon_stats(struct net_device *dev, static const struct ethtool_ops axienet_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES | - ETHTOOL_COALESCE_USECS, + ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, .get_drvinfo = axienet_ethtools_get_drvinfo, .get_regs_len = axienet_ethtools_get_regs_len, .get_regs = axienet_ethtools_get_regs, @@ -2974,7 +3037,10 @@ static int axienet_probe(struct platform_device *pdev) spin_lock_init(&lp->rx_cr_lock); spin_lock_init(&lp->tx_cr_lock); - lp->rx_dma_cr = axienet_calc_cr(lp, XAXIDMA_DFT_RX_THRESHOLD, + INIT_WORK(&lp->rx_dim.work, axienet_rx_dim_work); + lp->rx_dim_enabled = true; + lp->rx_dim.profile_ix = 1; + lp->rx_dma_cr = axienet_calc_cr(lp, axienet_dim_coalesce_count_rx(lp), XAXIDMA_DFT_RX_USEC); lp->tx_dma_cr = axienet_calc_cr(lp, XAXIDMA_DFT_TX_THRESHOLD, XAXIDMA_DFT_TX_USEC); -- 2.50.1 From 5a9c5e5d8a1b70837287d03432757d10047c3bbb Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:51 +0900 Subject: [PATCH 10/16] tun: Refactor CONFIG_TUN_VNET_CROSS_LE Check IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) to save some lines and make future changes easier. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-1-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index acf96f262488..8a9a29031bd8 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -298,17 +298,21 @@ static bool tun_napi_frags_enabled(const struct tun_file *tfile) return tfile->napi_frags_enabled; } -#ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { - return tun->flags & TUN_VNET_BE ? false : - virtio_legacy_is_little_endian(); + bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && + (tun->flags & TUN_VNET_BE); + + return !be && virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { int be = !!(tun->flags & TUN_VNET_BE); + if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) + return -EINVAL; + if (put_user(be, argp)) return -EFAULT; @@ -319,6 +323,9 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { int be; + if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) + return -EINVAL; + if (get_user(be, argp)) return -EFAULT; @@ -329,22 +336,6 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) return 0; } -#else -static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) -{ - return virtio_legacy_is_little_endian(); -} - -static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) -{ - return -EINVAL; -} - -static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) -{ - return -EINVAL; -} -#endif /* CONFIG_TUN_VNET_CROSS_LE */ static inline bool tun_is_little_endian(struct tun_struct *tun) { -- 2.50.1 From 07e8b3bae2f8f49e3cd6ea51f899a3d062100822 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:52 +0900 Subject: [PATCH 11/16] tun: Keep hdr_len in tun_get_user() hdr_len is repeatedly used so keep it in a local variable. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-2-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8a9a29031bd8..ecc32421dbb5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1747,6 +1747,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; + int hdr_len = 0; bool zerocopy = false; int err; u32 rxhash = 0; @@ -1773,19 +1774,21 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; - if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) - gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); + hdr_len = tun16_to_cpu(tun, gso.hdr_len); - if (tun16_to_cpu(tun, gso.hdr_len) > len) + if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + hdr_len = max(tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2, hdr_len); + gso.hdr_len = cpu_to_tun16(tun, hdr_len); + } + + if (hdr_len > len) return -EINVAL; iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; - if (unlikely(len < ETH_HLEN || - (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) + if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; } @@ -1798,9 +1801,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ - copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; - if (copylen > good_linear) - copylen = good_linear; + copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear); linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) @@ -1821,10 +1822,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } else { if (!zerocopy) { copylen = len; - if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) - linear = good_linear; - else - linear = tun16_to_cpu(tun, gso.hdr_len); + linear = min(hdr_len, good_linear); } if (frags) { -- 2.50.1 From 60df67b94804b1adca74854db502a72f7aeaa125 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:53 +0900 Subject: [PATCH 12/16] tun: Decouple vnet from tun_struct Decouple vnet-related functions from tun_struct so that we can reuse them for tap in the future. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-3-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 51 ++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ecc32421dbb5..0f86697bca57 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -298,17 +298,17 @@ static bool tun_napi_frags_enabled(const struct tun_file *tfile) return tfile->napi_frags_enabled; } -static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) +static inline bool tun_legacy_is_little_endian(unsigned int flags) { bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && - (tun->flags & TUN_VNET_BE); + (flags & TUN_VNET_BE); return !be && virtio_legacy_is_little_endian(); } -static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) +static long tun_get_vnet_be(unsigned int flags, int __user *argp) { - int be = !!(tun->flags & TUN_VNET_BE); + int be = !!(flags & TUN_VNET_BE); if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) return -EINVAL; @@ -319,7 +319,7 @@ static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) return 0; } -static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) +static long tun_set_vnet_be(unsigned int *flags, int __user *argp) { int be; @@ -330,27 +330,26 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) return -EFAULT; if (be) - tun->flags |= TUN_VNET_BE; + *flags |= TUN_VNET_BE; else - tun->flags &= ~TUN_VNET_BE; + *flags &= ~TUN_VNET_BE; return 0; } -static inline bool tun_is_little_endian(struct tun_struct *tun) +static inline bool tun_is_little_endian(unsigned int flags) { - return tun->flags & TUN_VNET_LE || - tun_legacy_is_little_endian(tun); + return flags & TUN_VNET_LE || tun_legacy_is_little_endian(flags); } -static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) +static inline u16 tun16_to_cpu(unsigned int flags, __virtio16 val) { - return __virtio16_to_cpu(tun_is_little_endian(tun), val); + return __virtio16_to_cpu(tun_is_little_endian(flags), val); } -static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) +static inline __virtio16 cpu_to_tun16(unsigned int flags, u16 val) { - return __cpu_to_virtio16(tun_is_little_endian(tun), val); + return __cpu_to_virtio16(tun_is_little_endian(flags), val); } static inline u32 tun_hashfn(u32 rxhash) @@ -1766,6 +1765,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); + int flags = tun->flags; if (len < vnet_hdr_sz) return -EINVAL; @@ -1774,11 +1774,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; - hdr_len = tun16_to_cpu(tun, gso.hdr_len); + hdr_len = tun16_to_cpu(flags, gso.hdr_len); if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - hdr_len = max(tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2, hdr_len); - gso.hdr_len = cpu_to_tun16(tun, hdr_len); + hdr_len = max(tun16_to_cpu(flags, gso.csum_start) + tun16_to_cpu(flags, gso.csum_offset) + 2, hdr_len); + gso.hdr_len = cpu_to_tun16(flags, hdr_len); } if (hdr_len > len) @@ -1857,7 +1857,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } } - if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { + if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun->flags))) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; @@ -2111,23 +2111,24 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (vnet_hdr_sz) { struct virtio_net_hdr gso; + int flags = tun->flags; if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun), true, + tun_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); + sinfo->gso_type, tun16_to_cpu(flags, gso.gso_size), + tun16_to_cpu(flags, gso.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + min((int)tun16_to_cpu(flags, gso.hdr_len), 64), true); } WARN_ON_ONCE(1); return -EINVAL; @@ -2496,7 +2497,7 @@ build: skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); - if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { + if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun->flags))) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; @@ -3325,11 +3326,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; case TUNGETVNETBE: - ret = tun_get_vnet_be(tun, argp); + ret = tun_get_vnet_be(tun->flags, argp); break; case TUNSETVNETBE: - ret = tun_set_vnet_be(tun, argp); + ret = tun_set_vnet_be(&tun->flags, argp); break; case TUNATTACHFILTER: -- 2.50.1 From 2506251e81d18783885d34a329795f4398488957 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:54 +0900 Subject: [PATCH 13/16] tun: Decouple vnet handling Decouple the vnet handling code so that we can reuse it for tap. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-4-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 237 +++++++++++++++++++++++++++------------------- 1 file changed, 139 insertions(+), 98 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0f86697bca57..ed79a691eee0 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -352,6 +352,127 @@ static inline __virtio16 cpu_to_tun16(unsigned int flags, u16 val) return __cpu_to_virtio16(tun_is_little_endian(flags), val); } +static long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, + unsigned int cmd, int __user *sp) +{ + int s; + + switch (cmd) { + case TUNGETVNETHDRSZ: + s = *vnet_hdr_sz; + if (put_user(s, sp)) + return -EFAULT; + return 0; + + case TUNSETVNETHDRSZ: + if (get_user(s, sp)) + return -EFAULT; + if (s < (int)sizeof(struct virtio_net_hdr)) + return -EINVAL; + + *vnet_hdr_sz = s; + return 0; + + case TUNGETVNETLE: + s = !!(*flags & TUN_VNET_LE); + if (put_user(s, sp)) + return -EFAULT; + return 0; + + case TUNSETVNETLE: + if (get_user(s, sp)) + return -EFAULT; + if (s) + *flags |= TUN_VNET_LE; + else + *flags &= ~TUN_VNET_LE; + return 0; + + case TUNGETVNETBE: + return tun_get_vnet_be(*flags, sp); + + case TUNSETVNETBE: + return tun_set_vnet_be(flags, sp); + + default: + return -EINVAL; + } +} + +static int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, + struct virtio_net_hdr *hdr) +{ + u16 hdr_len; + + if (iov_iter_count(from) < sz) + return -EINVAL; + + if (!copy_from_iter_full(hdr, sizeof(*hdr), from)) + return -EFAULT; + + hdr_len = tun16_to_cpu(flags, hdr->hdr_len); + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + hdr_len = max(tun16_to_cpu(flags, hdr->csum_start) + tun16_to_cpu(flags, hdr->csum_offset) + 2, hdr_len); + hdr->hdr_len = cpu_to_tun16(flags, hdr_len); + } + + if (hdr_len > iov_iter_count(from)) + return -EINVAL; + + iov_iter_advance(from, sz - sizeof(*hdr)); + + return hdr_len; +} + +static int tun_vnet_hdr_put(int sz, struct iov_iter *iter, + const struct virtio_net_hdr *hdr) +{ + if (unlikely(iov_iter_count(iter) < sz)) + return -EINVAL; + + if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) + return -EFAULT; + + iov_iter_advance(iter, sz - sizeof(*hdr)); + + return 0; +} + +static int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + return virtio_net_hdr_to_skb(skb, hdr, tun_is_little_endian(flags)); +} + +static int tun_vnet_hdr_from_skb(unsigned int flags, + const struct net_device *dev, + const struct sk_buff *skb, + struct virtio_net_hdr *hdr) +{ + int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; + + if (virtio_net_hdr_from_skb(skb, hdr, + tun_is_little_endian(flags), true, + vlan_hlen)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + if (net_ratelimit()) { + netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(flags, hdr->gso_size), + tun16_to_cpu(flags, hdr->hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min(tun16_to_cpu(flags, hdr->hdr_len), 64), true); + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} + static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; @@ -1765,25 +1886,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); - int flags = tun->flags; - - if (len < vnet_hdr_sz) - return -EINVAL; - len -= vnet_hdr_sz; - - if (!copy_from_iter_full(&gso, sizeof(gso), from)) - return -EFAULT; - - hdr_len = tun16_to_cpu(flags, gso.hdr_len); - if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - hdr_len = max(tun16_to_cpu(flags, gso.csum_start) + tun16_to_cpu(flags, gso.csum_offset) + 2, hdr_len); - gso.hdr_len = cpu_to_tun16(flags, hdr_len); - } + hdr_len = tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, from, &gso); + if (hdr_len < 0) + return hdr_len; - if (hdr_len > len) - return -EINVAL; - iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); + len -= vnet_hdr_sz; } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { @@ -1857,7 +1965,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } } - if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun->flags))) { + if (tun_vnet_hdr_to_skb(tun->flags, skb, &gso)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; @@ -2052,18 +2160,15 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; - size_t ret; + ssize_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); - if (unlikely(iov_iter_count(iter) < vnet_hdr_sz)) - return -EINVAL; - if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) != - sizeof(gso))) - return -EFAULT; - iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); + ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); + if (ret) + return ret; } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; @@ -2086,6 +2191,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; + int ret; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; @@ -2111,33 +2217,14 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (vnet_hdr_sz) { struct virtio_net_hdr gso; - int flags = tun->flags; - - if (iov_iter_count(iter) < vnet_hdr_sz) - return -EINVAL; - - if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(flags), true, - vlan_hlen)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - if (net_ratelimit()) { - netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(flags, gso.gso_size), - tun16_to_cpu(flags, gso.hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)tun16_to_cpu(flags, gso.hdr_len), 64), true); - } - WARN_ON_ONCE(1); - return -EINVAL; - } - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) - return -EFAULT; + ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); + if (ret) + return ret; - iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); + ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); + if (ret) + return ret; } if (vlan_hlen) { @@ -2497,7 +2584,7 @@ build: skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); - if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun->flags))) { + if (tun_vnet_hdr_to_skb(tun->flags, skb, gso)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; @@ -3081,8 +3168,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, kgid_t group; int ifindex; int sndbuf; - int vnet_hdr_sz; - int le; int ret; bool do_notify = false; @@ -3289,50 +3374,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_set_sndbuf(tun); break; - case TUNGETVNETHDRSZ: - vnet_hdr_sz = tun->vnet_hdr_sz; - if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) - ret = -EFAULT; - break; - - case TUNSETVNETHDRSZ: - if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) { - ret = -EFAULT; - break; - } - if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) { - ret = -EINVAL; - break; - } - - tun->vnet_hdr_sz = vnet_hdr_sz; - break; - - case TUNGETVNETLE: - le = !!(tun->flags & TUN_VNET_LE); - if (put_user(le, (int __user *)argp)) - ret = -EFAULT; - break; - - case TUNSETVNETLE: - if (get_user(le, (int __user *)argp)) { - ret = -EFAULT; - break; - } - if (le) - tun->flags |= TUN_VNET_LE; - else - tun->flags &= ~TUN_VNET_LE; - break; - - case TUNGETVNETBE: - ret = tun_get_vnet_be(tun->flags, argp); - break; - - case TUNSETVNETBE: - ret = tun_set_vnet_be(&tun->flags, argp); - break; - case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; @@ -3388,7 +3429,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; default: - ret = -EINVAL; + ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); break; } -- 2.50.1 From 1d41e2fa93f7d4dce4d05dfa2f980fba0898bea8 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:55 +0900 Subject: [PATCH 14/16] tun: Extract the vnet handling code The vnet handling code will be reused by tap. Functions are renamed to ensure that their names contain "vnet" to clarify that they are part of the decoupled vnet handling code. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-5-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- drivers/net/tun.c | 180 +-------------------------------------- drivers/net/tun_vnet.h | 185 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 188 insertions(+), 179 deletions(-) create mode 100644 drivers/net/tun_vnet.h diff --git a/MAINTAINERS b/MAINTAINERS index 873aa2cce4d7..67665d9dd536 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24152,7 +24152,7 @@ W: http://vtun.sourceforge.net/tun F: Documentation/networking/tuntap.rst F: arch/um/os-Linux/drivers/ F: drivers/net/tap.c -F: drivers/net/tun.c +F: drivers/net/tun* TURBOCHANNEL SUBSYSTEM M: "Maciej W. Rozycki" diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ed79a691eee0..d8f4d3e996a7 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -83,6 +83,8 @@ #include #include +#include "tun_vnet.h" + static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); @@ -94,9 +96,6 @@ static void tun_default_link_ksettings(struct net_device *dev, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE -/* High bits in flags field are unused. */ -#define TUN_VNET_LE 0x80000000 -#define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) @@ -298,181 +297,6 @@ static bool tun_napi_frags_enabled(const struct tun_file *tfile) return tfile->napi_frags_enabled; } -static inline bool tun_legacy_is_little_endian(unsigned int flags) -{ - bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && - (flags & TUN_VNET_BE); - - return !be && virtio_legacy_is_little_endian(); -} - -static long tun_get_vnet_be(unsigned int flags, int __user *argp) -{ - int be = !!(flags & TUN_VNET_BE); - - if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) - return -EINVAL; - - if (put_user(be, argp)) - return -EFAULT; - - return 0; -} - -static long tun_set_vnet_be(unsigned int *flags, int __user *argp) -{ - int be; - - if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) - return -EINVAL; - - if (get_user(be, argp)) - return -EFAULT; - - if (be) - *flags |= TUN_VNET_BE; - else - *flags &= ~TUN_VNET_BE; - - return 0; -} - -static inline bool tun_is_little_endian(unsigned int flags) -{ - return flags & TUN_VNET_LE || tun_legacy_is_little_endian(flags); -} - -static inline u16 tun16_to_cpu(unsigned int flags, __virtio16 val) -{ - return __virtio16_to_cpu(tun_is_little_endian(flags), val); -} - -static inline __virtio16 cpu_to_tun16(unsigned int flags, u16 val) -{ - return __cpu_to_virtio16(tun_is_little_endian(flags), val); -} - -static long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, - unsigned int cmd, int __user *sp) -{ - int s; - - switch (cmd) { - case TUNGETVNETHDRSZ: - s = *vnet_hdr_sz; - if (put_user(s, sp)) - return -EFAULT; - return 0; - - case TUNSETVNETHDRSZ: - if (get_user(s, sp)) - return -EFAULT; - if (s < (int)sizeof(struct virtio_net_hdr)) - return -EINVAL; - - *vnet_hdr_sz = s; - return 0; - - case TUNGETVNETLE: - s = !!(*flags & TUN_VNET_LE); - if (put_user(s, sp)) - return -EFAULT; - return 0; - - case TUNSETVNETLE: - if (get_user(s, sp)) - return -EFAULT; - if (s) - *flags |= TUN_VNET_LE; - else - *flags &= ~TUN_VNET_LE; - return 0; - - case TUNGETVNETBE: - return tun_get_vnet_be(*flags, sp); - - case TUNSETVNETBE: - return tun_set_vnet_be(flags, sp); - - default: - return -EINVAL; - } -} - -static int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, - struct virtio_net_hdr *hdr) -{ - u16 hdr_len; - - if (iov_iter_count(from) < sz) - return -EINVAL; - - if (!copy_from_iter_full(hdr, sizeof(*hdr), from)) - return -EFAULT; - - hdr_len = tun16_to_cpu(flags, hdr->hdr_len); - - if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - hdr_len = max(tun16_to_cpu(flags, hdr->csum_start) + tun16_to_cpu(flags, hdr->csum_offset) + 2, hdr_len); - hdr->hdr_len = cpu_to_tun16(flags, hdr_len); - } - - if (hdr_len > iov_iter_count(from)) - return -EINVAL; - - iov_iter_advance(from, sz - sizeof(*hdr)); - - return hdr_len; -} - -static int tun_vnet_hdr_put(int sz, struct iov_iter *iter, - const struct virtio_net_hdr *hdr) -{ - if (unlikely(iov_iter_count(iter) < sz)) - return -EINVAL; - - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) - return -EFAULT; - - iov_iter_advance(iter, sz - sizeof(*hdr)); - - return 0; -} - -static int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, - const struct virtio_net_hdr *hdr) -{ - return virtio_net_hdr_to_skb(skb, hdr, tun_is_little_endian(flags)); -} - -static int tun_vnet_hdr_from_skb(unsigned int flags, - const struct net_device *dev, - const struct sk_buff *skb, - struct virtio_net_hdr *hdr) -{ - int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; - - if (virtio_net_hdr_from_skb(skb, hdr, - tun_is_little_endian(flags), true, - vlan_hlen)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - if (net_ratelimit()) { - netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(flags, hdr->gso_size), - tun16_to_cpu(flags, hdr->hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min(tun16_to_cpu(flags, hdr->hdr_len), 64), true); - } - WARN_ON_ONCE(1); - return -EINVAL; - } - - return 0; -} - static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h new file mode 100644 index 000000000000..fd7411c4447f --- /dev/null +++ b/drivers/net/tun_vnet.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef TUN_VNET_H +#define TUN_VNET_H + +/* High bits in flags field are unused. */ +#define TUN_VNET_LE 0x80000000 +#define TUN_VNET_BE 0x40000000 + +static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) +{ + bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && + (flags & TUN_VNET_BE); + + return !be && virtio_legacy_is_little_endian(); +} + +static inline long tun_get_vnet_be(unsigned int flags, int __user *argp) +{ + int be = !!(flags & TUN_VNET_BE); + + if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) + return -EINVAL; + + if (put_user(be, argp)) + return -EFAULT; + + return 0; +} + +static inline long tun_set_vnet_be(unsigned int *flags, int __user *argp) +{ + int be; + + if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) + return -EINVAL; + + if (get_user(be, argp)) + return -EFAULT; + + if (be) + *flags |= TUN_VNET_BE; + else + *flags &= ~TUN_VNET_BE; + + return 0; +} + +static inline bool tun_vnet_is_little_endian(unsigned int flags) +{ + return flags & TUN_VNET_LE || tun_vnet_legacy_is_little_endian(flags); +} + +static inline u16 tun_vnet16_to_cpu(unsigned int flags, __virtio16 val) +{ + return __virtio16_to_cpu(tun_vnet_is_little_endian(flags), val); +} + +static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val) +{ + return __cpu_to_virtio16(tun_vnet_is_little_endian(flags), val); +} + +static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, + unsigned int cmd, int __user *sp) +{ + int s; + + switch (cmd) { + case TUNGETVNETHDRSZ: + s = *vnet_hdr_sz; + if (put_user(s, sp)) + return -EFAULT; + return 0; + + case TUNSETVNETHDRSZ: + if (get_user(s, sp)) + return -EFAULT; + if (s < (int)sizeof(struct virtio_net_hdr)) + return -EINVAL; + + *vnet_hdr_sz = s; + return 0; + + case TUNGETVNETLE: + s = !!(*flags & TUN_VNET_LE); + if (put_user(s, sp)) + return -EFAULT; + return 0; + + case TUNSETVNETLE: + if (get_user(s, sp)) + return -EFAULT; + if (s) + *flags |= TUN_VNET_LE; + else + *flags &= ~TUN_VNET_LE; + return 0; + + case TUNGETVNETBE: + return tun_get_vnet_be(*flags, sp); + + case TUNSETVNETBE: + return tun_set_vnet_be(flags, sp); + + default: + return -EINVAL; + } +} + +static inline int tun_vnet_hdr_get(int sz, unsigned int flags, + struct iov_iter *from, + struct virtio_net_hdr *hdr) +{ + u16 hdr_len; + + if (iov_iter_count(from) < sz) + return -EINVAL; + + if (!copy_from_iter_full(hdr, sizeof(*hdr), from)) + return -EFAULT; + + hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len); + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + hdr_len = max(tun_vnet16_to_cpu(flags, hdr->csum_start) + tun_vnet16_to_cpu(flags, hdr->csum_offset) + 2, hdr_len); + hdr->hdr_len = cpu_to_tun_vnet16(flags, hdr_len); + } + + if (hdr_len > iov_iter_count(from)) + return -EINVAL; + + iov_iter_advance(from, sz - sizeof(*hdr)); + + return hdr_len; +} + +static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, + const struct virtio_net_hdr *hdr) +{ + if (unlikely(iov_iter_count(iter) < sz)) + return -EINVAL; + + if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) + return -EFAULT; + + iov_iter_advance(iter, sz - sizeof(*hdr)); + + return 0; +} + +static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); +} + +static inline int tun_vnet_hdr_from_skb(unsigned int flags, + const struct net_device *dev, + const struct sk_buff *skb, + struct virtio_net_hdr *hdr) +{ + int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; + + if (virtio_net_hdr_from_skb(skb, hdr, + tun_vnet_is_little_endian(flags), true, + vlan_hlen)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + if (net_ratelimit()) { + netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), + tun_vnet16_to_cpu(flags, hdr->hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} + +#endif /* TUN_VNET_H */ -- 2.50.1 From 74212f20f366db41dd3148b951284ac38ef9bb10 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:56 +0900 Subject: [PATCH 15/16] tap: Keep hdr_len in tap_get_user() hdr_len is repeatedly used so keep it in a local variable. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-6-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 5ca6ecf0ce5f..21b77c261fea 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -645,6 +645,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; + int hdr_len = 0; int copylen = 0; int depth; bool zerocopy = false; @@ -663,13 +664,13 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) goto err; iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); - if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - tap16_to_cpu(q, vnet_hdr.csum_start) + - tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 > - tap16_to_cpu(q, vnet_hdr.hdr_len)) - vnet_hdr.hdr_len = cpu_to_tap16(q, - tap16_to_cpu(q, vnet_hdr.csum_start) + - tap16_to_cpu(q, vnet_hdr.csum_offset) + 2); + hdr_len = tap16_to_cpu(q, vnet_hdr.hdr_len); + if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + hdr_len = max(tap16_to_cpu(q, vnet_hdr.csum_start) + + tap16_to_cpu(q, vnet_hdr.csum_offset) + 2, + hdr_len); + vnet_hdr.hdr_len = cpu_to_tap16(q, hdr_len); + } err = -EINVAL; if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len) goto err; @@ -682,12 +683,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; - copylen = vnet_hdr.hdr_len ? - tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN; - if (copylen > good_linear) - copylen = good_linear; - else if (copylen < ETH_HLEN) - copylen = ETH_HLEN; + copylen = clamp(hdr_len ?: GOODCOPY_LEN, ETH_HLEN, good_linear); linear = copylen; i = *from; iov_iter_advance(&i, copylen); @@ -697,11 +693,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (!zerocopy) { copylen = len; - linear = tap16_to_cpu(q, vnet_hdr.hdr_len); - if (linear > good_linear) - linear = good_linear; - else if (linear < ETH_HLEN) - linear = ETH_HLEN; + linear = clamp(hdr_len, ETH_HLEN, good_linear); } skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen, -- 2.50.1 From 6a53fc5a877098889b4bce45ec7ea44948819905 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Fri, 7 Feb 2025 15:10:57 +0900 Subject: [PATCH 16/16] tap: Use tun's vnet-related code tun and tap implements the same vnet-related features so reuse the code. Signed-off-by: Akihiko Odaki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250207-tun-v6-7-fb49cf8b103e@daynix.com Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 152 +++++----------------------------------------- 1 file changed, 16 insertions(+), 136 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 21b77c261fea..d4ece538f1b2 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -26,74 +26,9 @@ #include #include -#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) - -#define TAP_VNET_LE 0x80000000 -#define TAP_VNET_BE 0x40000000 - -#ifdef CONFIG_TUN_VNET_CROSS_LE -static inline bool tap_legacy_is_little_endian(struct tap_queue *q) -{ - return q->flags & TAP_VNET_BE ? false : - virtio_legacy_is_little_endian(); -} - -static long tap_get_vnet_be(struct tap_queue *q, int __user *sp) -{ - int s = !!(q->flags & TAP_VNET_BE); - - if (put_user(s, sp)) - return -EFAULT; - - return 0; -} - -static long tap_set_vnet_be(struct tap_queue *q, int __user *sp) -{ - int s; - - if (get_user(s, sp)) - return -EFAULT; - - if (s) - q->flags |= TAP_VNET_BE; - else - q->flags &= ~TAP_VNET_BE; - - return 0; -} -#else -static inline bool tap_legacy_is_little_endian(struct tap_queue *q) -{ - return virtio_legacy_is_little_endian(); -} - -static long tap_get_vnet_be(struct tap_queue *q, int __user *argp) -{ - return -EINVAL; -} - -static long tap_set_vnet_be(struct tap_queue *q, int __user *argp) -{ - return -EINVAL; -} -#endif /* CONFIG_TUN_VNET_CROSS_LE */ - -static inline bool tap_is_little_endian(struct tap_queue *q) -{ - return q->flags & TAP_VNET_LE || - tap_legacy_is_little_endian(q); -} - -static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val) -{ - return __virtio16_to_cpu(tap_is_little_endian(q), val); -} +#include "tun_vnet.h" -static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val) -{ - return __cpu_to_virtio16(tap_is_little_endian(q), val); -} +#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) static struct proto tap_proto = { .name = "tap", @@ -655,25 +590,13 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); - err = -EINVAL; - if (len < vnet_hdr_len) + hdr_len = tun_vnet_hdr_get(vnet_hdr_len, q->flags, from, &vnet_hdr); + if (hdr_len < 0) { + err = hdr_len; goto err; - len -= vnet_hdr_len; - - err = -EFAULT; - if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) - goto err; - iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); - hdr_len = tap16_to_cpu(q, vnet_hdr.hdr_len); - if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - hdr_len = max(tap16_to_cpu(q, vnet_hdr.csum_start) + - tap16_to_cpu(q, vnet_hdr.csum_offset) + 2, - hdr_len); - vnet_hdr.hdr_len = cpu_to_tap16(q, hdr_len); } - err = -EINVAL; - if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len) - goto err; + + len -= vnet_hdr_len; } err = -EINVAL; @@ -725,8 +648,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, skb->dev = tap->dev; if (vnet_hdr_len) { - err = virtio_net_hdr_to_skb(skb, &vnet_hdr, - tap_is_little_endian(q)); + err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr); if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR; @@ -789,23 +711,17 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) { - int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; struct virtio_net_hdr vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); - if (iov_iter_count(iter) < vnet_hdr_len) - return -EINVAL; - - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, - tap_is_little_endian(q), true, - vlan_hlen)) - BUG(); - if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != - sizeof(vnet_hdr)) - return -EFAULT; + ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); + if (ret) + return ret; - iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr)); + ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr); + if (ret) + return ret; } total = vnet_hdr_len; total += skb->len; @@ -1064,42 +980,6 @@ static long tap_ioctl(struct file *file, unsigned int cmd, q->sk.sk_sndbuf = s; return 0; - case TUNGETVNETHDRSZ: - s = q->vnet_hdr_sz; - if (put_user(s, sp)) - return -EFAULT; - return 0; - - case TUNSETVNETHDRSZ: - if (get_user(s, sp)) - return -EFAULT; - if (s < (int)sizeof(struct virtio_net_hdr)) - return -EINVAL; - - q->vnet_hdr_sz = s; - return 0; - - case TUNGETVNETLE: - s = !!(q->flags & TAP_VNET_LE); - if (put_user(s, sp)) - return -EFAULT; - return 0; - - case TUNSETVNETLE: - if (get_user(s, sp)) - return -EFAULT; - if (s) - q->flags |= TAP_VNET_LE; - else - q->flags &= ~TAP_VNET_LE; - return 0; - - case TUNGETVNETBE: - return tap_get_vnet_be(q, sp); - - case TUNSETVNETBE: - return tap_set_vnet_be(q, sp); - case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | @@ -1143,7 +1023,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, return ret; default: - return -EINVAL; + return tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, cmd, sp); } } @@ -1190,7 +1070,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { - err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q)); + err = tun_vnet_hdr_to_skb(q->flags, skb, gso); if (err) goto err_kfree; } -- 2.50.1