From 1c7eec4d5f3b39cdea2153abaebf1b7229a47072 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sat, 19 Apr 2025 10:07:41 +0200 Subject: [PATCH 01/16] RDMA/rxe: Fix "trying to register non-static key in rxe_qp_do_cleanup" bug Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 assign_lock_key kernel/locking/lockdep.c:986 [inline] register_lock_class+0x4a3/0x4c0 kernel/locking/lockdep.c:1300 __lock_acquire+0x99/0x1ba0 kernel/locking/lockdep.c:5110 lock_acquire kernel/locking/lockdep.c:5866 [inline] lock_acquire+0x179/0x350 kernel/locking/lockdep.c:5823 __timer_delete_sync+0x152/0x1b0 kernel/time/timer.c:1644 rxe_qp_do_cleanup+0x5c3/0x7e0 drivers/infiniband/sw/rxe/rxe_qp.c:815 execute_in_process_context+0x3a/0x160 kernel/workqueue.c:4596 __rxe_cleanup+0x267/0x3c0 drivers/infiniband/sw/rxe/rxe_pool.c:232 rxe_create_qp+0x3f7/0x5f0 drivers/infiniband/sw/rxe/rxe_verbs.c:604 create_qp+0x62d/0xa80 drivers/infiniband/core/verbs.c:1250 ib_create_qp_kernel+0x9f/0x310 drivers/infiniband/core/verbs.c:1361 ib_create_qp include/rdma/ib_verbs.h:3803 [inline] rdma_create_qp+0x10c/0x340 drivers/infiniband/core/cma.c:1144 rds_ib_setup_qp+0xc86/0x19a0 net/rds/ib_cm.c:600 rds_ib_cm_initiate_connect+0x1e8/0x3d0 net/rds/ib_cm.c:944 rds_rdma_cm_event_handler_cmn+0x61f/0x8c0 net/rds/rdma_transport.c:109 cma_cm_event_handler+0x94/0x300 drivers/infiniband/core/cma.c:2184 cma_work_handler+0x15b/0x230 drivers/infiniband/core/cma.c:3042 process_one_work+0x9cc/0x1b70 kernel/workqueue.c:3238 process_scheduled_works kernel/workqueue.c:3319 [inline] worker_thread+0x6c8/0xf10 kernel/workqueue.c:3400 kthread+0x3c2/0x780 kernel/kthread.c:464 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:153 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 The root cause is as below: In the function rxe_create_qp, the function rxe_qp_from_init is called to create qp, if this function rxe_qp_from_init fails, rxe_cleanup will be called to handle all the allocated resources, including the timers: retrans_timer and rnr_nak_timer. The function rxe_qp_from_init calls the function rxe_qp_init_req to initialize the timers: retrans_timer and rnr_nak_timer. But these timers are initialized in the end of rxe_qp_init_req. If some errors occur before the initialization of these timers, this problem will occur. The solution is to check whether these timers are initialized or not. If these timers are not initialized, ignore these timers. Fixes: 8700e3e7c485 ("Soft RoCE driver") Reported-by: syzbot+4edb496c3cad6e953a31@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4edb496c3cad6e953a31 Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250419080741.1515231-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_qp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 7975fb0e2782..f2af3e0aef35 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -811,7 +811,12 @@ static void rxe_qp_do_cleanup(struct work_struct *work) spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; - if (qp_type(qp) == IB_QPT_RC) { + /* In the function timer_setup, .function is initialized. If .function + * is NULL, it indicates the function timer_setup is not called, the + * timer is not initialized. Or else, the timer is initialized. + */ + if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function && + qp->rnr_nak_timer.function) { timer_delete_sync(&qp->retrans_timer); timer_delete_sync(&qp->rnr_nak_timer); } -- 2.51.0 From d85080df12f33ad42a452a9998b21813b29daf35 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 14:27:25 +0100 Subject: [PATCH 02/16] RDMA/rxe: Remove unused rxe_run_task rxe_run_task() has been unused since 2024's commit 23bc06af547f ("RDMA/rxe: Don't call direct between tasks") Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250419132725.199785-1-linux@treblig.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_task.c | 40 ++++++++-------------------- drivers/infiniband/sw/rxe/rxe_task.h | 2 -- 2 files changed, 11 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 80332638d9e3..6f8f353e9583 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task) /* do_task is a wrapper for the three tasks (requester, * completer, responder) and calls them in a loop until - * they return a non-zero value. It is called either - * directly by rxe_run_task or indirectly if rxe_sched_task - * schedules the task. They must call __reserve_if_idle to - * move the task to busy before calling or scheduling. - * The task can also be moved to drained or invalid - * by calls to rxe_cleanup_task or rxe_disable_task. - * In that case tasks which get here are not executed but - * just flushed. The tasks are designed to look to see if - * there is work to do and then do part of it before returning - * here with a return value of zero until all the work - * has been consumed then it returns a non-zero value. + * they return a non-zero value. It is called indirectly + * when rxe_sched_task schedules the task. They must + * call __reserve_if_idle to move the task to busy before + * calling or scheduling. The task can also be moved to + * drained or invalid by calls to rxe_cleanup_task or + * rxe_disable_task. In that case tasks which get here + * are not executed but just flushed. The tasks are + * designed to look to see if there is work to do and + * then do part of it before returning here with a return + * value of zero until all the work has been consumed then + * it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. * If the limit is hit and work remains the task is rescheduled. @@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); } -/* run the task inline if it is currently idle - * cannot call do_task holding the lock - */ -void rxe_run_task(struct rxe_task *task) -{ - unsigned long flags; - bool run; - - WARN_ON(rxe_read(task->qp) <= 0); - - spin_lock_irqsave(&task->lock, flags); - run = __reserve_if_idle(task); - spin_unlock_irqrestore(&task->lock, flags); - - if (run) - do_task(task); -} - /* schedule the task to run later as a work queue entry. * the queue_work call can be called holding * the lock. diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index a63e258b3d66..a8c9a77b6027 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -void rxe_run_task(struct rxe_task *task); - void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ -- 2.51.0 From 23ea3c70ee426b12ce033cf3107bc0dd312ea1d9 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Mon, 21 Apr 2025 11:51:01 +0900 Subject: [PATCH 03/16] RDMA/rxe: Remove 32-bit architecture support Major linux distibutions have phased out support for 32-bit machines. Since rxe is primarily used for development and testing, the benefit of maintaining 32-bit support is minimal. This change simplifies ATOMIC WRITE implementations and improves maintainability of the driver. Signed-off-by: Daisuke Matsuda Link: https://patch.msgid.link/20250421025101.3588139-1-matsuda-daisuke@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/Kconfig | 2 +- drivers/infiniband/sw/rxe/rxe_loc.h | 6 +----- drivers/infiniband/sw/rxe/rxe_mr.c | 8 -------- drivers/infiniband/sw/rxe/rxe_odp.c | 1 - drivers/infiniband/sw/rxe/rxe_param.h | 5 +---- 5 files changed, 3 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index c180e7ebcfc5..1ed5b63f8afc 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" - depends on INET && PCI && INFINIBAND + depends on INET && PCI && INFINIBAND && 64BIT depends on INFINIBAND_VIRT_DMA select NET_UDP_TUNNEL select CRC32 diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 30fa83c9c846..f7dbb9cddd12 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -196,6 +196,7 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, u64 compare, u64 swap_add, u64 *orig_val); int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length); +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -219,11 +220,6 @@ static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, { return -EOPNOTSUPP; } -#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ - -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); -#else static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 1a74013a14ab..c4936d63e26a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -539,8 +539,6 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, return RESPST_NONE; } -#if defined CONFIG_64BIT -/* only implemented or called for 64 bit architectures */ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { unsigned int page_offset; @@ -580,12 +578,6 @@ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) return RESPST_NONE; } -#else -enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; -} -#endif int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index fa5e8f5017cc..6149d9ffe7f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -380,7 +380,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, return 0; } -/* CONFIG_64BIT=y */ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 003f681e5dc0..767870568372 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -53,12 +53,9 @@ enum rxe_device_param { | IB_DEVICE_MEM_WINDOW | IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT -#ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, -#else - | IB_DEVICE_MEM_WINDOW_TYPE_2B, -#endif /* CONFIG_64BIT */ + RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, -- 2.51.0 From 685f9537a72877693a1ab116d155acc89562c29b Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 18 Apr 2025 14:13:45 +0900 Subject: [PATCH 04/16] RDMA/core: Move ODP capability definitions to uapi The bits are used from both kernel space and userland, so they should be placed in UAPI. Signed-off-by: Daisuke Matsuda Link: https://patch.msgid.link/20250418051345.1022339-2-matsuda-daisuke@fujitsu.com Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 20 ++++++++++---------- include/uapi/rdma/ib_user_verbs.h | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0a7ccd08b365..b06a0ed81bdd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -314,19 +314,19 @@ enum ib_atomic_cap { }; enum ib_odp_general_cap_bits { - IB_ODP_SUPPORT = 1 << 0, - IB_ODP_SUPPORT_IMPLICIT = 1 << 1, + IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT, + IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT, }; enum ib_odp_transport_cap_bits { - IB_ODP_SUPPORT_SEND = 1 << 0, - IB_ODP_SUPPORT_RECV = 1 << 1, - IB_ODP_SUPPORT_WRITE = 1 << 2, - IB_ODP_SUPPORT_READ = 1 << 3, - IB_ODP_SUPPORT_ATOMIC = 1 << 4, - IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, - IB_ODP_SUPPORT_FLUSH = 1 << 6, - IB_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7, + IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND, + IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV, + IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE, + IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ, + IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC, + IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV, + IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH, + IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE, }; struct ib_odp_caps { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index e16650f0c85d..3b7bd99813e9 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -233,6 +233,22 @@ struct ib_uverbs_ex_query_device { __u32 reserved; }; +enum ib_uverbs_odp_general_cap_bits { + IB_UVERBS_ODP_SUPPORT = 1 << 0, + IB_UVERBS_ODP_SUPPORT_IMPLICIT = 1 << 1, +}; + +enum ib_uverbs_odp_transport_cap_bits { + IB_UVERBS_ODP_SUPPORT_SEND = 1 << 0, + IB_UVERBS_ODP_SUPPORT_RECV = 1 << 1, + IB_UVERBS_ODP_SUPPORT_WRITE = 1 << 2, + IB_UVERBS_ODP_SUPPORT_READ = 1 << 3, + IB_UVERBS_ODP_SUPPORT_ATOMIC = 1 << 4, + IB_UVERBS_ODP_SUPPORT_SRQ_RECV = 1 << 5, + IB_UVERBS_ODP_SUPPORT_FLUSH = 1 << 6, + IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7, +}; + struct ib_uverbs_odp_caps { __aligned_u64 general_caps; struct { -- 2.51.0 From 02007e3ddc0790ed5a6a504892d7aa1a917076bc Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 21 Apr 2025 21:27:45 +0800 Subject: [PATCH 05/16] RDMA/hns: Add trace for flush CQE Add trace to print the producer index of QP when triggering flush CQE. Output example: $ cat /sys/kernel/debug/tracing/trace tracer: nop entries-in-buffer/entries-written: 2/2 #P:128 _-----=> irqs-off/BH-disabled / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / _-=> migrate-disable |||| / delay TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | ib_send_bw-11474 [075] d..1. 2393.434738: hns_sq_flush_cqe: SQ 0x2 flush head 0xb5c7. ib_send_bw-11474 [075] d..1. 2393.434739: hns_rq_flush_cqe: RQ 0x2 flush head 0. Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250421132750.1363348-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 17 +++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 +++ drivers/infiniband/hw/hns/hns_roce_trace.h | 50 +++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 drivers/infiniband/hw/hns/hns_roce_trace.h diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 560a1d9de408..080bd049b0f8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1027,6 +1027,23 @@ struct hns_roce_dev { atomic64_t *dfx_cnt; }; +enum hns_roce_trace_type { + TRACE_SQ, + TRACE_RQ, +}; + +static inline const char *trace_type_to_str(enum hns_roce_trace_type type) +{ + switch (type) { + case TRACE_SQ: + return "SQ"; + case TRACE_RQ: + return "RQ"; + default: + return "UNKNOWN"; + } +} + static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) { return container_of(ib_dev, struct hns_roce_dev, ib_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c6399490e3a5..e1c27c9556fd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -50,6 +50,9 @@ #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#define CREATE_TRACE_POINTS +#include "hns_roce_trace.h" + enum { CMD_RST_PRC_OTHERS, CMD_RST_PRC_SUCCESS, @@ -5312,6 +5315,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, return; spin_lock_irqsave(&hr_qp->sq.lock, sq_flag); + trace_hns_sq_flush_cqe(hr_qp->qpn, hr_qp->sq.head, TRACE_SQ); hr_reg_write(context, QPC_SQ_PRODUCER_IDX, hr_qp->sq.head); hr_reg_clear(qpc_mask, QPC_SQ_PRODUCER_IDX); hr_qp->state = IB_QPS_ERR; @@ -5321,6 +5325,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, return; spin_lock_irqsave(&hr_qp->rq.lock, rq_flag); + trace_hns_rq_flush_cqe(hr_qp->qpn, hr_qp->rq.head, TRACE_RQ); hr_reg_write(context, QPC_RQ_PRODUCER_IDX, hr_qp->rq.head); hr_reg_clear(qpc_mask, QPC_RQ_PRODUCER_IDX); spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag); diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h new file mode 100644 index 000000000000..3deace357fb7 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 Hisilicon Limited. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hns_roce + +#if !defined(__HNS_ROCE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HNS_ROCE_TRACE_H + +#include +#include "hns_roce_device.h" + +DECLARE_EVENT_CLASS(flush_head_template, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type), + + TP_STRUCT__entry(__field(unsigned long, qpn) + __field(u32, pi) + __field(enum hns_roce_trace_type, type) + ), + + TP_fast_assign(__entry->qpn = qpn; + __entry->pi = pi; + __entry->type = type; + ), + + TP_printk("%s 0x%lx flush head 0x%x.", + trace_type_to_str(__entry->type), + __entry->qpn, __entry->pi) +); + +DEFINE_EVENT(flush_head_template, hns_sq_flush_cqe, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type)); +DEFINE_EVENT(flush_head_template, hns_rq_flush_cqe, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type)); + +#endif /* __HNS_ROCE_TRACE_H */ + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hns_roce_trace +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include -- 2.51.0 From 6c98c86708063af509b82f35bfac9577b61b4a94 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 21 Apr 2025 21:27:46 +0800 Subject: [PATCH 06/16] RDMA/hns: Add trace for WQE dumping Add trace for WQE dumping, including SQ, RQ and SRQ. Output example: $ cat /sys/kernel/debug/tracing/trace tracer: nop entries-in-buffer/entries-written: 2/2 #P:128 _-----=> irqs-off/BH-disabled / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / _-=> migrate-disable |||| / delay TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | roce_test_main-22730 [074] d..1. 16133.898282: hns_sq_wqe: SQ 0xc wqe (0x0/0xffff0820a6076060): {0x180,0x639c,0x0,0x1000000,0x0,0x0,0x0,0x0, 0x639c,0x300,0xf7e38000,0x0,0x0,0x0,0x0,0x0} Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250421132750.1363348-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 8 ++++ drivers/infiniband/hw/hns/hns_roce_trace.h | 44 +++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 080bd049b0f8..1dcc9cbb4678 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1030,6 +1030,7 @@ struct hns_roce_dev { enum hns_roce_trace_type { TRACE_SQ, TRACE_RQ, + TRACE_SRQ, }; static inline const char *trace_type_to_str(enum hns_roce_trace_type type) @@ -1039,6 +1040,8 @@ static inline const char *trace_type_to_str(enum hns_roce_trace_type type) return "SQ"; case TRACE_RQ: return "RQ"; + case TRACE_SRQ: + return "SRQ"; default: return "UNKNOWN"; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e1c27c9556fd..ecc1e0b1df4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -741,6 +741,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, else ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit); + trace_hns_sq_wqe(qp->qpn, wqe_idx, wqe, 1 << qp->sq.wqe_shift, + wr->wr_id, TRACE_SQ); if (unlikely(ret)) { *bad_wr = wr; goto out; @@ -810,6 +812,9 @@ static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr, wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx); fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge); + + trace_hns_rq_wqe(hr_qp->qpn, wqe_idx, wqe, 1 << hr_qp->rq.wqe_shift, + wr->wr_id, TRACE_RQ); } static int hns_roce_v2_post_recv(struct ib_qp *ibqp, @@ -987,6 +992,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, fill_recv_sge_to_wqe(wr, wqe, max_sge, srq->rsv_sge); fill_wqe_idx(srq, wqe_idx); srq->wrid[wqe_idx] = wr->wr_id; + + trace_hns_srq_wqe(srq->srqn, wqe_idx, wqe, 1 << srq->wqe_shift, + wr->wr_id, TRACE_SRQ); } if (likely(nreq)) { diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h index 3deace357fb7..56e152387db4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_trace.h +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -41,6 +41,50 @@ DEFINE_EVENT(flush_head_template, hns_rq_flush_cqe, enum hns_roce_trace_type type), TP_ARGS(qpn, pi, type)); +#define MAX_SGE_PER_WQE 64 +#define MAX_WQE_SIZE (MAX_SGE_PER_WQE * HNS_ROCE_SGE_SIZE) +DECLARE_EVENT_CLASS(wqe_template, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, + u64 id, enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type), + + TP_STRUCT__entry(__field(unsigned long, qpn) + __field(u32, idx) + __array(__le32, wqe, + MAX_WQE_SIZE / sizeof(__le32)) + __field(u32, len) + __field(u64, id) + __field(enum hns_roce_trace_type, type) + ), + + TP_fast_assign(__entry->qpn = qpn; + __entry->idx = idx; + __entry->id = id; + memcpy(__entry->wqe, wqe, len); + __entry->len = len / sizeof(__le32); + __entry->type = type; + ), + + TP_printk("%s 0x%lx wqe(0x%x/0x%llx): %s", + trace_type_to_str(__entry->type), + __entry->qpn, __entry->idx, __entry->id, + __print_array(__entry->wqe, __entry->len, + sizeof(__le32))) +); + +DEFINE_EVENT(wqe_template, hns_sq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); +DEFINE_EVENT(wqe_template, hns_rq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); +DEFINE_EVENT(wqe_template, hns_srq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); + #endif /* __HNS_ROCE_TRACE_H */ #undef TRACE_INCLUDE_FILE -- 2.51.0 From 1e63e2f96613bc6471e7497e47d1767e3846608e Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 21 Apr 2025 21:27:47 +0800 Subject: [PATCH 07/16] RDMA/hns: Add trace for AEQE dumping Add trace for AEQE dumping. Output example: $ cat /sys/kernel/debug/tracing/trace tracer: nop entries-in-buffer/entries-written: 2/2 #P:128 _-----=> irqs-off/BH-disabled / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / _-=> migrate-disable |||| / delay TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | -0 [120] d.h1. 7995.835587: hns_ae_info: event 19 aeqe: {0x80006013,0x0,0x0,0x10d2c,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0} Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250421132750.1363348-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + drivers/infiniband/hw/hns/hns_roce_trace.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ecc1e0b1df4e..5d3bf84c1aae 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6260,6 +6260,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->sub_type = sub_type; ++eq->cons_index; aeqe_found = IRQ_HANDLED; + trace_hns_ae_info(event_type, aeqe, eq->eqe_size); atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_AEQE_CNT]); diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h index 56e152387db4..75c89aff68c3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_trace.h +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -85,6 +85,25 @@ DEFINE_EVENT(wqe_template, hns_srq_wqe, enum hns_roce_trace_type type), TP_ARGS(qpn, idx, wqe, len, id, type)); +TRACE_EVENT(hns_ae_info, + TP_PROTO(int event_type, void *aeqe, unsigned int len), + TP_ARGS(event_type, aeqe, len), + + TP_STRUCT__entry(__field(int, event_type) + __array(__le32, aeqe, + HNS_ROCE_V3_EQE_SIZE / sizeof(__le32)) + __field(u32, len) + ), + + TP_fast_assign(__entry->event_type = event_type; + __entry->len = len / sizeof(__le32); + memcpy(__entry->aeqe, aeqe, len); + ), + + TP_printk("event %2d aeqe: %s", __entry->event_type, + __print_array(__entry->aeqe, __entry->len, sizeof(__le32))) +); + #endif /* __HNS_ROCE_TRACE_H */ #undef TRACE_INCLUDE_FILE -- 2.51.0 From 48ffc152576d633d05eda4251aaef5dd53970cb8 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 21 Apr 2025 21:27:48 +0800 Subject: [PATCH 08/16] RDMA/hns: Add trace for MR/MTR attribute dumping Add trace for MR/MTR attribute dumping. Output example: $ cat /sys/kernel/debug/tracing/trace tracer: nop entries-in-buffer/entries-written: 2/2 #P:128 _-----=> irqs-off/BH-disabled / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / _-=> migrate-disable |||| / delay TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | ib_send_bw-14751 [111] ..... 8763.823038: hns_buf_attr: rg cnt:1, pg_sft:0xc, mtt_only:no, rg 0 (sz:131072, hop:2), rg 1 (sz:0, hop:0), rg 2 (sz:0, hop:0) ib_send_bw-14751 [111] ..... 8763.823118: hns_mr: iova:0xffffb2968000, size:131072, key:512, pd:1, pbl_hop:1, npages:4, type:0, status:0 Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250421132750.1363348-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_mr.c | 3 + drivers/infiniband/hw/hns/hns_roce_trace.h | 65 ++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 09da3496843b..93a48b41955b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -38,6 +38,7 @@ #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" +#include "hns_roce_trace.h" static u32 hw_index_to_key(int ind) { @@ -159,6 +160,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); + trace_hns_mr(mr); if (mr->type != MR_TYPE_FRMR) ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr); else @@ -1146,6 +1148,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, struct ib_device *ibdev = &hr_dev->ib_dev; int ret; + trace_hns_buf_attr(buf_attr); /* The caller has its own buffer list and invokes the hns_roce_mtr_map() * to finish the MTT configuration. */ diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h index 75c89aff68c3..b0bc22b9cc28 100644 --- a/drivers/infiniband/hw/hns/hns_roce_trace.h +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -10,6 +10,7 @@ #define __HNS_ROCE_TRACE_H #include +#include #include "hns_roce_device.h" DECLARE_EVENT_CLASS(flush_head_template, @@ -104,6 +105,70 @@ TRACE_EVENT(hns_ae_info, __print_array(__entry->aeqe, __entry->len, sizeof(__le32))) ); +TRACE_EVENT(hns_mr, + TP_PROTO(struct hns_roce_mr *mr), + TP_ARGS(mr), + + TP_STRUCT__entry(__field(u64, iova) + __field(u64, size) + __field(u32, key) + __field(u32, pd) + __field(u32, pbl_hop_num) + __field(u32, npages) + __field(int, type) + __field(int, enabled) + ), + + TP_fast_assign(__entry->iova = mr->iova; + __entry->size = mr->size; + __entry->key = mr->key; + __entry->pd = mr->pd; + __entry->pbl_hop_num = mr->pbl_hop_num; + __entry->npages = mr->npages; + __entry->type = mr->type; + __entry->enabled = mr->enabled; + ), + + TP_printk("iova:0x%llx, size:%llu, key:%u, pd:%u, pbl_hop:%u, npages:%u, type:%d, status:%d", + __entry->iova, __entry->size, __entry->key, + __entry->pd, __entry->pbl_hop_num, __entry->npages, + __entry->type, __entry->enabled) +); + +TRACE_EVENT(hns_buf_attr, + TP_PROTO(struct hns_roce_buf_attr *attr), + TP_ARGS(attr), + + TP_STRUCT__entry(__field(unsigned int, region_count) + __field(unsigned int, region0_size) + __field(int, region0_hopnum) + __field(unsigned int, region1_size) + __field(int, region1_hopnum) + __field(unsigned int, region2_size) + __field(int, region2_hopnum) + __field(unsigned int, page_shift) + __field(bool, mtt_only) + ), + + TP_fast_assign(__entry->region_count = attr->region_count; + __entry->region0_size = attr->region[0].size; + __entry->region0_hopnum = attr->region[0].hopnum; + __entry->region1_size = attr->region[1].size; + __entry->region1_hopnum = attr->region[1].hopnum; + __entry->region2_size = attr->region[2].size; + __entry->region2_hopnum = attr->region[2].hopnum; + __entry->page_shift = attr->page_shift; + __entry->mtt_only = attr->mtt_only; + ), + + TP_printk("rg cnt:%u, pg_sft:0x%x, mtt_only:%s, rg 0 (sz:%u, hop:%u), rg 1 (sz:%u, hop:%u), rg 2 (sz:%u, hop:%u)\n", + __entry->region_count, __entry->page_shift, + str_yes_no(__entry->mtt_only), + __entry->region0_size, __entry->region0_hopnum, + __entry->region1_size, __entry->region1_hopnum, + __entry->region2_size, __entry->region2_hopnum) +); + #endif /* __HNS_ROCE_TRACE_H */ #undef TRACE_INCLUDE_FILE -- 2.51.0 From 2b11d33de23262cb20d1dcb24b586dbb8f54d463 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 21 Apr 2025 21:27:49 +0800 Subject: [PATCH 09/16] RDMA/hns: Include hnae3.h in hns_roce_hw_v2.h hns_roce_hw_v2.h has a direct dependency on hnae3.h due to the inline function hns_roce_write64(), but it doesn't include this header currently. This leads to that files including hns_roce_hw_v2.h must also include hnae3.h to avoid compilation errors, even if they themselves don't really rely on hnae3.h. This doesn't make sense, hns_roce_hw_v2.h should include hnae3.h directly. Fixes: d3743fa94ccd ("RDMA/hns: Fix the chip hanging caused by sending doorbell during reset") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250421132750.1363348-6-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_ah.c | 1 - drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 - drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + drivers/infiniband/hw/hns/hns_roce_main.c | 1 - drivers/infiniband/hw/hns/hns_roce_restrack.c | 1 - 5 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 4fc5b9d5fea8..307c35888b30 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -33,7 +33,6 @@ #include #include #include -#include "hnae3.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5d3bf84c1aae..9cb9fe6a0a98 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -43,7 +43,6 @@ #include #include -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 91a5665465ff..bc7466830eaf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -34,6 +34,7 @@ #define _HNS_ROCE_HW_V2_H #include +#include "hnae3.h" #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32 #define HNS_ROCE_V2_MTT_ENTRY_SZ 64 diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index cf89a8db4f64..02f7acb30c5d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -37,7 +37,6 @@ #include #include #include -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 356d98816949..f637b73b946e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -4,7 +4,6 @@ #include #include #include -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" -- 2.51.0 From 6bd18dabf1c94653b9a96402544bb980a7ca7ce1 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 21 Apr 2025 21:27:50 +0800 Subject: [PATCH 10/16] RDMA/hns: Add trace for CMDQ dumping Add trace for CMDQ dumping. Output example: $ cat /sys/kernel/debug/tracing/trace tracer: nop entries-in-buffer/entries-written: 2/2 #P:128 _-----=> irqs-off/BH-disabled / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / _-=> migrate-disable |||| / delay TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | kworker/u512:1-14003 [089] b..1. 50737.238304: hns_cmdq_req: 0000:bd:00.0 cmdq opcode:0x8500, flag:0x1, retval:0x0, data:{0x2,0x0,0x0,0xffff0000,0x32323232,0x0} kworker/u512:1-14003 [089] b..1. 50737.238316: hns_cmdq_resp: 0000:bd:00.0 cmdq opcode:0x8500, flag:0x2, retval:0x0, data:{0x2,0x0,0x0,0xffff0000,0x32323232,0x0} Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250421132750.1363348-7-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 +++ drivers/infiniband/hw/hns/hns_roce_trace.h | 35 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 9cb9fe6a0a98..fa8747656f25 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1321,6 +1321,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, tail = csq->head; for (i = 0; i < num; i++) { + trace_hns_cmdq_req(hr_dev, &desc[i]); + csq->desc[csq->head++] = desc[i]; if (csq->head == csq->desc_num) csq->head = 0; @@ -1335,6 +1337,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { + trace_hns_cmdq_resp(hr_dev, &csq->desc[tail]); + /* check the result of hardware write back */ desc_ret = le16_to_cpu(csq->desc[tail++].retval); if (tail == csq->desc_num) diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h index b0bc22b9cc28..23cbdbaeffaa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_trace.h +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -12,6 +12,7 @@ #include #include #include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" DECLARE_EVENT_CLASS(flush_head_template, TP_PROTO(unsigned long qpn, u32 pi, @@ -169,6 +170,40 @@ TRACE_EVENT(hns_buf_attr, __entry->region2_size, __entry->region2_hopnum) ); +DECLARE_EVENT_CLASS(cmdq, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc), + + TP_STRUCT__entry(__string(dev_name, dev_name(hr_dev->dev)) + __field(u16, opcode) + __field(u16, flag) + __field(u16, retval) + __array(__le32, data, 6) + ), + + TP_fast_assign(__assign_str(dev_name); + __entry->opcode = le16_to_cpu(desc->opcode); + __entry->flag = le16_to_cpu(desc->flag); + __entry->retval = le16_to_cpu(desc->retval); + memcpy(__entry->data, desc->data, 6 * sizeof(__le32)); + ), + + TP_printk("%s cmdq opcode:0x%x, flag:0x%x, retval:0x%x, data:%s\n", + __get_str(dev_name), __entry->opcode, + __entry->flag, __entry->retval, + __print_array(__entry->data, 6, sizeof(__le32))) +); + +DEFINE_EVENT(cmdq, hns_cmdq_req, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc)); +DEFINE_EVENT(cmdq, hns_cmdq_resp, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc)); + #endif /* __HNS_ROCE_TRACE_H */ #undef TRACE_INCLUDE_FILE -- 2.51.0 From 7c04bc97171780e15798433d9e83a241d9956014 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:12:09 -0700 Subject: [PATCH 11/16] IB/mthca: Adjust buddy->bits allocation type In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) The assigned type is "unsigned long **", but the returned type will be "long **". These are the same allocation size (pointer size), but the types do not match. Adjust the allocation type to match the assignment. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250426061208.work.000-kees@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mthca/mthca_mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 192f83fd7c8a..dacb8ceeebe0 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -144,7 +144,7 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) buddy->max_order = max_order; spin_lock_init(&buddy->lock); - buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *), + buddy->bits = kcalloc(buddy->max_order + 1, sizeof(*buddy->bits), GFP_KERNEL); buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, GFP_KERNEL); -- 2.51.0 From 3db60cf9b7da4ae35646c00d9241c44d2dd6caf8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:12:48 -0700 Subject: [PATCH 12/16] IB/hfi1: Adjust fd->entry_to_rb allocation type In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) The assigned type is "struct tid_rb_node **", but the return type will be "struct rb_node **". These are the same allocation size (pointer size), but the types do not match. Adjust the allocation type to match the assignment. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250426061247.work.261-kees@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index cf2d29098406..62b4f16dab27 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -53,7 +53,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, int ret = 0; fd->entry_to_rb = kcalloc(uctxt->expected_count, - sizeof(struct rb_node *), + sizeof(*fd->entry_to_rb), GFP_KERNEL); if (!fd->entry_to_rb) return -ENOMEM; -- 2.51.0 From 7590649ee7af381a9d1153143026dec124c5798e Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Mon, 28 Apr 2025 14:30:18 +0300 Subject: [PATCH 13/16] IB/cm: Drop lockdep assert and WARN when freeing old msg The send completion handler can run after cm_id has advanced to another message. The cm_id lock is not needed in this case, but a recent change re-used cm_free_priv_msg(), which asserts that the lock is held and WARNs if the cm_id's currently outstanding msg is different than the one being freed. Fixes: 1e5159219076 ("IB/cm: Do not hold reference on cm_id unless needed") Signed-off-by: Vlad Dumitrescu Reviewed-by: Sean Hefty Link: https://patch.msgid.link/0c364c29142f72b7875fdeba51f3c9bd6ca863ee.1745839788.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index effa53dd6800..e64cbd034a2a 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3786,7 +3786,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv, spin_lock_irq(&cm_id_priv->lock); if (msg != cm_id_priv->msg) { spin_unlock_irq(&cm_id_priv->lock); - cm_free_priv_msg(msg); + cm_free_msg(msg); + cm_deref_id(cm_id_priv); return; } cm_free_priv_msg(msg); -- 2.51.0 From 5d2ea5aebbb2f3ebde4403f9c55b2b057e5dd2d6 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 28 Apr 2025 14:34:07 +0300 Subject: [PATCH 14/16] RDMA/mlx5: Fix error flow upon firmware failure for RQ destruction Upon RQ destruction if the firmware command fails which is the last resource to be destroyed some SW resources were already cleaned regardless of the failure. Now properly rollback the object to its original state upon such failure. In order to avoid a use-after free in case someone tries to destroy the object again, which results in the following kernel trace: refcount_t: underflow; use-after-free. WARNING: CPU: 0 PID: 37589 at lib/refcount.c:28 refcount_warn_saturate+0xf4/0x148 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) rfkill mlx5_core(OE) mlxdevm(OE) ib_uverbs(OE) ib_core(OE) psample mlxfw(OE) mlx_compat(OE) macsec tls pci_hyperv_intf sunrpc vfat fat virtio_net net_failover failover fuse loop nfnetlink vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vmw_vmci vsock xfs crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_console virtio_gpu virtio_blk virtio_dma_buf virtio_mmio dm_mirror dm_region_hash dm_log dm_mod xpmem(OE) CPU: 0 UID: 0 PID: 37589 Comm: python3 Kdump: loaded Tainted: G OE ------- --- 6.12.0-54.el10.aarch64 #1 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : refcount_warn_saturate+0xf4/0x148 lr : refcount_warn_saturate+0xf4/0x148 sp : ffff80008b81b7e0 x29: ffff80008b81b7e0 x28: ffff000133d51600 x27: 0000000000000001 x26: 0000000000000000 x25: 00000000ffffffea x24: ffff00010ae80f00 x23: ffff00010ae80f80 x22: ffff0000c66e5d08 x21: 0000000000000000 x20: ffff0000c66e0000 x19: ffff00010ae80340 x18: 0000000000000006 x17: 0000000000000000 x16: 0000000000000020 x15: ffff80008b81b37f x14: 0000000000000000 x13: 2e656572662d7265 x12: ffff80008283ef78 x11: ffff80008257efd0 x10: ffff80008283efd0 x9 : ffff80008021ed90 x8 : 0000000000000001 x7 : 00000000000bffe8 x6 : c0000000ffff7fff x5 : ffff0001fb8e3408 x4 : 0000000000000000 x3 : ffff800179993000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000133d51600 Call trace: refcount_warn_saturate+0xf4/0x148 mlx5_core_put_rsc+0x88/0xa0 [mlx5_ib] mlx5_core_destroy_rq_tracked+0x64/0x98 [mlx5_ib] mlx5_ib_destroy_wq+0x34/0x80 [mlx5_ib] ib_destroy_wq_user+0x30/0xc0 [ib_core] uverbs_free_wq+0x28/0x58 [ib_uverbs] destroy_hw_idr_uobject+0x34/0x78 [ib_uverbs] uverbs_destroy_uobject+0x48/0x240 [ib_uverbs] __uverbs_cleanup_ufile+0xd4/0x1a8 [ib_uverbs] uverbs_destroy_ufile_hw+0x48/0x120 [ib_uverbs] ib_uverbs_close+0x2c/0x100 [ib_uverbs] __fput+0xd8/0x2f0 __fput_sync+0x50/0x70 __arm64_sys_close+0x40/0x90 invoke_syscall.constprop.0+0x74/0xd0 do_el0_svc+0x48/0xe8 el0_svc+0x44/0x1d0 el0t_64_sync_handler+0x120/0x130 el0t_64_sync+0x1a4/0x1a8 Fixes: e2013b212f9f ("net/mlx5_core: Add RQ and SQ event handling") Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/3181433ccdd695c63560eeeb3f0c990961732101.1745839855.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qpc.c | 30 ++++++++++++++++++++++++++++-- include/linux/mlx5/driver.h | 1 + 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d3dcc272200a..146d03ae40bd 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) spin_lock_irqsave(&table->lock, flags); common = radix_tree_lookup(&table->tree, rsn); - if (common) + if (common && !common->invalid) refcount_inc(&common->refcount); + else + common = NULL; spin_unlock_irqrestore(&table->lock, flags); @@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev, return 0; } +static void modify_resource_common_state(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, + bool invalid) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + qp->common.invalid = invalid; + spin_unlock_irqrestore(&table->lock, flags); +} + static void destroy_resource_common(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { @@ -609,8 +623,20 @@ err_destroy_rq: int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, struct mlx5_core_qp *rq) { + int ret; + + /* The rq destruction can be called again in case it fails, hence we + * mark the common resource as invalid and only once FW destruction + * is completed successfully we actually destroy the resources. + */ + modify_resource_common_state(dev, rq, true); + ret = destroy_rq_tracked(dev, rq->qpn, rq->uid); + if (ret) { + modify_resource_common_state(dev, rq, false); + return ret; + } destroy_resource_common(dev, rq); - return destroy_rq_tracked(dev, rq->qpn, rq->uid); + return 0; } static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d1dfbad9a447..e6ba8f4f4bd1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -398,6 +398,7 @@ struct mlx5_core_rsc_common { enum mlx5_res_type res; refcount_t refcount; struct completion free; + bool invalid; }; struct mlx5_uars_page { -- 2.51.0 From 7ccc2a0646ecbf2f8676b7ee0dc4aaccaa2e5ce1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 5 May 2025 21:54:19 +0100 Subject: [PATCH 15/16] IB/hfi1: Remove unused sc_drop and sdma_all_idle sc_drop() and sdma_all_idle() were both added in 2015's commit 7724105686e7 ("IB/hfi1: add driver files") but have remained unused. Remove them. Link: https://patch.msgid.link/r/20250505205419.88131-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/pio.c | 10 ---------- drivers/infiniband/hw/hfi1/pio.h | 1 - drivers/infiniband/hw/hfi1/sdma.c | 18 ------------------ drivers/infiniband/hw/hfi1/sdma.h | 1 - 4 files changed, 30 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 5a91cbda4aee..764286da2ce8 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1361,16 +1361,6 @@ void sc_flush(struct send_context *sc) sc_wait_for_packet_egress(sc, 1); } -/* drop all packets on the context, no waiting until they are sent */ -void sc_drop(struct send_context *sc) -{ - if (!sc) - return; - - dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", - __func__, sc->sw_index, sc->hw_context); -} - /* * Start the software reaction to a context halt or SPC freeze: * - mark the context as halted or frozen diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h index d07cc6ea7c63..ab0f9a3a8d12 100644 --- a/drivers/infiniband/hw/hfi1/pio.h +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -246,7 +246,6 @@ void sc_disable(struct send_context *sc); int sc_restart(struct send_context *sc); void sc_return_credits(struct send_context *sc); void sc_flush(struct send_context *sc); -void sc_drop(struct send_context *sc); void sc_stop(struct send_context *sc, int bit); struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, pio_release_cb cb, void *arg); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 0d2b39b7c8b5..16a749d16ee9 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1520,24 +1520,6 @@ void sdma_all_running(struct hfi1_devdata *dd) } } -/** - * sdma_all_idle() - called when the link goes down - * @dd: hfi1_devdata - * - * This routine moves all engines to the idle state. - */ -void sdma_all_idle(struct hfi1_devdata *dd) -{ - struct sdma_engine *sde; - unsigned int i; - - /* idle all engines */ - for (i = 0; i < dd->num_sdma; ++i) { - sde = &dd->per_sdma[i]; - sdma_process_event(sde, sdma_event_e70_go_idle); - } -} - /** * sdma_start() - called to kick off state processing for all engines * @dd: hfi1_devdata diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index d77246b48434..91dfd5d0c419 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -373,7 +373,6 @@ void sdma_start(struct hfi1_devdata *dd); void sdma_exit(struct hfi1_devdata *dd); void sdma_clean(struct hfi1_devdata *dd, size_t num_engines); void sdma_all_running(struct hfi1_devdata *dd); -void sdma_all_idle(struct hfi1_devdata *dd); void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); void sdma_freeze(struct hfi1_devdata *dd); void sdma_unfreeze(struct hfi1_devdata *dd); -- 2.51.0 From e56b4eab9cde7db525bf8dc57b489d157d5a201e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 5 May 2025 22:02:26 +0100 Subject: [PATCH 16/16] RDMA/siw: Remove unused siw_mem_add siw_mem_add() was added in 2019 by commit 2251334dcac9 ("rdma/siw: application buffer management") but has remained unused. Remove it. Link: https://patch.msgid.link/r/20250505210226.88994-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Acked-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_mem.c | 24 ------------------------ drivers/infiniband/sw/siw/siw_mem.h | 1 - 2 files changed, 25 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index b17156995595..d5ddeb17bd22 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -17,30 +17,6 @@ /* Stag lookup is based on its index part only (24 bits). */ #define SIW_STAG_MAX_INDEX 0x00ffffff -/* - * The code avoids special Stag of zero and tries to randomize - * STag values between 1 and SIW_STAG_MAX_INDEX. - */ -int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) -{ - struct xa_limit limit = XA_LIMIT(1, SIW_STAG_MAX_INDEX); - u32 id, next; - - get_random_bytes(&next, 4); - next &= SIW_STAG_MAX_INDEX; - - if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, - GFP_KERNEL) < 0) - return -ENOMEM; - - /* Set the STag index part */ - m->stag = id << 8; - - siw_dbg_mem(m, "new MEM object\n"); - - return 0; -} - /* * siw_mem_id2obj() * diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h index e74cfcd6dbc1..8e769d30e2ac 100644 --- a/drivers/infiniband/sw/siw/siw_mem.h +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -12,7 +12,6 @@ void siw_umem_release(struct siw_umem *umem); struct siw_pbl *siw_pbl_alloc(u32 num_buf); dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); -int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); int siw_invalidate_stag(struct ib_pd *pd, u32 stag); int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, enum ib_access_flags perms, int len); -- 2.51.0