int ret;
DECLARE_SIF_CQE_WITH_SAME_EQ(sdev, lcqe, cq->eq_idx);
- sif_log(sdev, SIF_NCQ, "cq_idx %d, flags 0x%x", cq->index, flags);
-
memset(&wr, 0, sizeof(struct psif_wr));
if (flags & IB_CQ_SOLICITED)
wr.se = 1;
- /* If a CQ is not valid, do not rearm the CQ. */
- if (!get_psif_cq_hw__valid(&cq->d))
+ /* Do not rearm a CQ if it is not valid or is in error */
+ if (unlikely(!get_psif_cq_hw__valid(&cq->d) || READ_ONCE(cq->in_error))) {
+ sif_log(sdev, SIF_NCQ, "cq %d, flags 0x%x (ignored - CQ in error)", cq->index, flags);
return 0;
+ }
+
+ sif_log(sdev, SIF_NCQ, "cq_idx %d, flags 0x%x", cq->index, flags);
/* We should never miss events in psif so we have no need for a separate
* handling of IB_CQ_REPORT_MISSED_EVENTS - ignore it.
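* (With IB_CQ_REPORT_MISSED_EVENTS the verbs API allows ib_req_notify_cq()
* to return > 0 as a hint that completions may already be pending on the
* CQ; since PSIF never misses events, no such hint is needed here.)
*/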
/* lock protects the below data structure and access/freeing of sq elems */
spinlock_t lock ____cacheline_internodealigned_in_smp;
bool user_mode; /* Set if this is a CQ to be mapped to user space */
- bool pd_is_set; /* Whether or not this cq has a pd set in it's descriptor */
+ bool in_error; /* Set if the CQ has received an error event */
bool rcn_sent; /* Set if ib_req_notify_cq() has been called on this cq */
u8 eq_idx; /* Index of the event queue that gets completion events for this cq */
atomic_t error_cnt; /* No. of error completions observed on this cq */
struct sif_rq *rq = get_rq(sdev, qp);
if (rq) {
+ struct sif_cq *cq = get_sif_cq(sdev, qp->rcv_cq_indx);
struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index);
- /* WA #3850:if SRQ, generate LAST_WQE event */
- if (rq->is_srq && ibqp->event_handler) {
+ if (READ_ONCE(cq->in_error))
+ sif_log(sdev, SIF_INTR, "CQ %d already in error - not flushing",
+ cq->index);
+ else if (rq->is_srq && ibqp->event_handler) {
+ /* WA #3850: if SRQ, generate LAST_WQE event */
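+ /* (IB requires a LAST_WQE_REACHED event for a QP attached to an SRQ
+ * when the QP goes to error, so the consumer knows that no further
+ * WQEs will be consumed from the SRQ by this QP.)
+ */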
struct ib_event ibe = {
.device = &sdev->ib_dev,
.event = IB_EVENT_QP_LAST_WQE_REACHED,
ibe.element.port_num = port_num;
nevents += handle_event(eq, &ibe);
}
+ /* Handle CQ errors early, as they may affect what we need to do on QPs */
+ if (leqe.event_status_cq_error) {
+ struct sif_cq *cq = get_sif_cq(sdev, leqe.cqd_id);
+
+ ibe.event = IB_EVENT_CQ_ERR;
+ ibe.element.cq = &get_sif_cq(sdev, leqe.cqd_id)->ibcq;
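+ /* Mark the CQ as broken; the READ_ONCE(cq->in_error) checks in the
+ * req_notify and flush paths use this to stop rearming/flushing it.
+ */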
+ WRITE_ONCE(cq->in_error, true);
+ if (leqe.vendor_error == TSU_CBLD_CQ_FULL_ERR)
+ sif_log(sdev, SIF_INFO, "CQ overrun on CQ %d", cq->index);
+ else if (leqe.vendor_error == TSU_CBLD_CQ_ALREADY_IN_ERR)
+ sif_log(sdev, SIF_INTR, "CQ %d already in error event", cq->index);
+ else
+ dump_eq_entry(SIF_INFO, "Got cq_error", &leqe);
+ nevents += handle_event(eq, &ibe);
+ }
if (leqe.event_status_local_work_queue_catastrophic_error ||
leqe.event_status_xrc_domain_violation ||
leqe.event_status_invalid_xrceth) {
+ struct sif_qp *qp = to_sqp(ibqp);
+
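+ /* Note locally that hardware considers this QP dead; other paths
+ * consult last_set_state before relying on the QP (e.g. the pqp
+ * completion poll loop).
+ */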
+ qp->last_set_state = IB_QPS_ERR;
ibe.event = IB_EVENT_QP_FATAL;
ibe.element.qp = ibqp;
nevents += handle_event(eq, &ibe);
- dump_eq_entry(SIF_INFO, "Got Fatal error", &leqe);
+ dump_eq_entry(SIF_INFO, "Got fatal QP error", &leqe);
}
if (leqe.event_status_srq_catastrophic_error) {
ibe.event = IB_EVENT_SRQ_ERR;
ibe.element.qp = ibqp;
nevents += handle_event(eq, &ibe);
}
- if (leqe.event_status_cq_error) {
- ibe.event = IB_EVENT_CQ_ERR;
- ibe.element.cq = &get_sif_cq(sdev, leqe.cqd_id)->ibcq;
- nevents += handle_event(eq, &ibe);
- dump_eq_entry(SIF_INFO, "Got cq_error", &leqe);
- }
if (leqe.event_status_local_catastrophic_error) {
ibe.event = IB_EVENT_DEVICE_FATAL;
/* psif does not associate this event with a port
/* Allow some pure busy wait before we attempt to reschedule/relax */
if (waitcnt < 10)
continue;
+
if (!irqs_disabled())
cond_resched();
else
cpu_relax();
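+ /* The poll request is outstanding on the privileged QP (pqp); if that
+ * QP is no longer in RTS the completion will never arrive, so give up
+ * with -EINTR instead of spinning until the timeout expires.
+ */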
+ if (unlikely(READ_ONCE(pqp->qp->last_set_state) != IB_QPS_RTS)) {
+ sif_log(sdev, SIF_INFO,
+ "cq %d: poll for cqe %p failed - pqp %d not operational\n",
+ cq->index, lcqe, pqp->qp->qp_idx);
+ ret = -EINTR;
+ break;
+ }
if (sdev->min_resp_ticks != min_resp_ticks) {
/* Give us a quick way out by changing min_resp_ticks */
pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4;
else
memcpy(lqqp, &sqp->qqp, sizeof(*lqqp));
+ /* Capture if the QP went to one of the error states */
+ if (!ret) {
+ if (lqqp->qp.state == PSIF_QP_STATE_ERROR)
+ sqp->last_set_state = IB_QPS_ERR;
+ else if (lqqp->qp.state == PSIF_QP_STATE_SQERR)
+ sqp->last_set_state = IB_QPS_SQE;
+ }
return ret;
}
ret = epsc_query_qp(qp, &lqqp);
- if (!ret)
- qp->last_set_state = sif2ib_qp_state(lqqp.qp.state);
-
if (ret)
return ret;
struct sif_cq *recv_cq = rq ? get_sif_cq(sdev, cq_idx) : NULL;
- /* clean-up the SQ/RQ CQ before reset the SQ */
+ /* Clean up the SQ/RQ CQs before resetting the SQ */
if (send_cq) {
nfixup = sif_fixup_cqes(send_cq, sq, qp);
if (nfixup < 0) {
u16 eps_tag; /* Value to use for the eps_tag field (proxy_qp) */
short port; /* IB port number (= sif port# + 1) */
u32 flags; /* SIF specific flags */
- enum ib_qp_state last_set_state;
+ enum ib_qp_state last_set_state; /* Best-effort last known state of the QP */
enum psif_qp_trans type; /* PSIF transport type set up for this QP */
enum ib_qp_type ib_qp_type; /* IB QP type */
u32 pd_indx;
return 0;
}
-/* QP is in RESET state, its now safe to do a cq_walk and
+/* QP is in RESET or shadow error state, it's now safe to do a cq_walk and
* flush any completions.
*/
int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp)
return -1;
}
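+ /* A CQ that has gone to error cannot be walked for meaningful
+ * completions, so skip the post processing for this QP.
+ */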
+ if (unlikely(READ_ONCE(cq->in_error))) {
+ sif_log(sdev, SIF_WCE_V, "qp %d: cq %d is in error - exiting",
+ qp->qp_idx, cq->index);
+ return ret;
+ }
+
if (qp->flags & SIF_QPF_HW_OWNED) {
sif_log(sdev, SIF_INFO, "qp %d is not in SHADOWED ERR state yet",
qp->qp_idx);
u32 head, tail;
unsigned long flags;
enum sif_mqp_type mqp_type = SIF_MQP_SW;
- struct sif_cq *cq = rq ? get_sif_cq(sdev, rq->cq_idx) : NULL;
+ struct sif_cq *cq = rq ? get_sif_cq(sdev, target_qp->rcv_cq_indx) : NULL;
DECLARE_SIF_CQE_POLL(sdev, lcqe);
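+ /* Some QP/transport types have no RQ (and thus no receive CQ) to
+ * flush; log the type and bail out for those.
+ */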
+ if (unlikely(!rq || !cq)) {
+ sif_log(sdev, SIF_INFO, "rq/cq not defined for qp %d (type %s)",
+ target_qp->qp_idx, string_enum_psif_qp_trans(target_qp->type));
+ goto done;
+ }
+
/* if flush RQ is in progress, set FLUSH_RQ_IN_FLIGHT.
*/
if (test_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) {
}
mutex_unlock(&target_qp->lock);
+ if (unlikely(READ_ONCE(cq->in_error))) {
+ sif_log(sdev, SIF_WCE_V, "qp %d: cq %d is in error - exiting",
+ target_qp->qp_idx, cq->index);
+ goto free_rq_error;
+ }
+
/* Workaround #622 v2 step 2: Invalidate RQ
* Invalidation of an RQ causes PSIF to flush its caches for that RQ.
* If PSIF finds the RQ invalid, it will attempt to fetch it.