i40e, as well as ice, has a custom loop unrolling macro for unrolling
Tx descriptors filling on XSk xmit.
Replace i40e defs with generic unrolled_count(), which is also more
convenient as it allows passing defines as its argument, not hardcoded
values, while the loop declaration will still be a usual for-loop.
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20250206182630.3914318-3-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
+#include <linux/unroll.h>
#include <net/xdp_sock_drv.h>
#include "i40e_txrx_common.h"
#include "i40e_xsk.h"
dma_addr_t dma;
u32 i;
- loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+ unrolled_count(PKTS_PER_BATCH)
+ for (i = 0; i < PKTS_PER_BATCH; i++) {
u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
#include <linux/types.h>
-/* This value should match the pragma in the loop_unrolled_for
+/* This value should match the pragma in the unrolled_count()
* macro. Why 4? It is strictly empirical. It seems to be a good
* compromise between the advantage of having simultaneous outstanding
* reads to the DMA array that can hide each others latency and the
*/
#define PKTS_PER_BATCH 4
-#ifdef __clang__
-#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
-#elif __GNUC__ >= 8
-#define loop_unrolled_for _Pragma("GCC unroll 4") for
-#else
-#define loop_unrolled_for for
-#endif
-
struct i40e_ring;
struct i40e_vsi;
struct net_device;