From: Wei Lin Guay
Date: Tue, 14 Jun 2016 18:40:14 +0000 (+0200)
Subject: sif: rq: Added synchronization during freeing rq
X-Git-Tag: v4.1.12-92~129^2~18
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=a9900cf980e474fde08582e751e52732137b2d56;p=users%2Fjedix%2Flinux-maple.git

sif: rq: Added synchronization during freeing rq

Added synchronization between free_rq and flush_rq to ensure that the
rq can only be freed after flush_rq has completed.

Signed-off-by: Wei Lin Guay
Reviewed-by: Knut Omang
---

diff --git a/drivers/infiniband/hw/sif/sif_rq.c b/drivers/infiniband/hw/sif/sif_rq.c
index c0e357444990..6d120f22e12a 100644
--- a/drivers/infiniband/hw/sif/sif_rq.c
+++ b/drivers/infiniband/hw/sif/sif_rq.c
@@ -151,6 +151,7 @@ int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd,
 	}
 	rq->sg_entries = sg_entries;
+	init_completion(&rq->can_destroy);
 	atomic_set(&rq->refcnt, 1);
 
 	/* Initialize hw part of descriptor */
@@ -327,209 +328,216 @@ int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq, struct sif_qp *target_
 	if (len == 0)
 		goto error;
 
-	sif_log(sdev, SIF_INFO_V, "flushing %d entries out of %d/%d entries remaining",
-		len, atomic_read(&rq_sw->length), rq->entries);
-
-	/* Workaround #622 v2 step 1: ModifyQP to RESET
-	 * The QP must be in the RESET state to avoid race condition.
-	 * sif_flush_rq will only be called when the QP is
-	 * in ERROR state. As for now, keeping the same coding style to
-	 * check whether the qp flags SIF_QPF_HW_OWNED is clear.
-	 * If it is clear, it means that the QP is in the shadowed
-	 * software error state (actual hw state is in RESET).
-	 *
-	 * TBD - Should we add new PSIF_QP_STATE_SHADOWED_ERROR state,
-	 * at least to me it is more readable?
-	 */
-	mutex_lock(&target_qp->lock);
-	/* qp lock must be held to make sure not other thread is trying to do modify_qp_hw to RESET */
-	mqp_type = sif_modify_qp_is_ok(target_qp, target_qp->last_set_state, IB_QPS_RESET, IB_QP_STATE);
+	if (atomic_add_unless(&rq->refcnt, 1, 0)) {
+		sif_log(sdev, SIF_INFO_V, "flushing %d entries out of %d/%d entries remaining",
+			len, atomic_read(&rq_sw->length), rq->entries);
+
+		/* Workaround #622 v2 step 1: ModifyQP to RESET
+		 * The QP must be in the RESET state to avoid a race condition.
+		 * sif_flush_rq will only be called when the QP is
+		 * in ERROR state. For now, keep the same coding style and
+		 * check whether the qp flag SIF_QPF_HW_OWNED is clear.
+		 * If it is clear, it means that the QP is in the shadowed
+		 * software error state (actual hw state is in RESET).
+		 *
+		 * TBD - Should we add a new PSIF_QP_STATE_SHADOWED_ERROR state?
+		 * At least to me it is more readable.
+		 */
+		mutex_lock(&target_qp->lock);
+		/* qp lock must be held to make sure no other thread is trying to
+		 * do modify_qp_hw to RESET.
+		 */
+		mqp_type = sif_modify_qp_is_ok(target_qp, target_qp->last_set_state,
+			IB_QPS_RESET, IB_QP_STATE);
 
-	if (mqp_type == SIF_MQP_HW) {
-		struct ib_qp_attr attr = {
-			.qp_state = IB_QPS_ERR
-		};
+		if (mqp_type == SIF_MQP_HW) {
+			struct ib_qp_attr attr = {
+				.qp_state = IB_QPS_ERR
+			};
 
-		ret = modify_qp_hw_wa_qp_retry(sdev, target_qp, &attr, IB_QP_STATE);
+			ret = modify_qp_hw_wa_qp_retry(sdev, target_qp, &attr, IB_QP_STATE);
 
-		if (ret)
-			sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d",
-				target_qp->qp_idx, ret);
+			if (ret)
+				sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d",
+					target_qp->qp_idx, ret);
 
-	}
-	mutex_unlock(&target_qp->lock);
-
-	/* Workaround #622 v2 step 2: Invalidate RQ
-	 * Invalidation of an RQ causes PSIF to flush it's caches for that RQ.
-	 * If PSIF finds the RQ invalid, it will attempt to fetch it.
-	 * It is then required to be valid (otherwise it will be interpreted as an error
-	 * by PSIF (see #2134). So software cannot rely upon the completion of the invalidate
-	 * to signal that the descriptor can be re-used, instead it will have to
-	 * verify by checking the final write-back of the descriptor, which will have
-	 * valid set to 0 by PSIF. In the general case we handle this lazy and check before we
-	 * try to re-use. The request is posted with no completion requested as we
-	 * do not need the completion:
-	 */
-	if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) {
-		ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST);
-		if (ret) {
-			sif_log(sdev, SIF_INFO,
-				"Invalidate rq_hw failed, status %d", ret);
-			goto error;
 		}
-		set_bit(RQ_IS_INVALIDATED, &rq_sw->flags);
-	}
+		mutex_unlock(&target_qp->lock);
+
+		/* Workaround #622 v2 step 2: Invalidate RQ
+		 * Invalidation of an RQ causes PSIF to flush its caches for that RQ.
+		 * If PSIF finds the RQ invalid, it will attempt to fetch it.
+		 * It is then required to be valid (otherwise it will be interpreted as an
+		 * error by PSIF, see #2134). So software cannot rely upon the completion of
+		 * the invalidate to signal that the descriptor can be re-used; instead it
+		 * will have to verify by checking the final write-back of the descriptor,
+		 * which will have valid set to 0 by PSIF. In the general case we handle
+		 * this lazily and check before we try to re-use. The request is posted
+		 * with no completion requested as we do not need the completion:
+		 */
+		if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) {
+			ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST);
+			if (ret) {
+				sif_log(sdev, SIF_INFO,
+					"Invalidate rq_hw failed, status %d", ret);
+				goto free_rq_error;
+			}
+			set_bit(RQ_IS_INVALIDATED, &rq_sw->flags);
+		}
 
-	/* Make sure the RQ is sofware owned: */
-	ret = poll_wait_for_rq_writeback(sdev, rq);
-	if (ret)
-		goto error;
+		/* Make sure the RQ is software owned: */
+		ret = poll_wait_for_rq_writeback(sdev, rq);
+		if (ret)
+			goto free_rq_error;
 
-	/* The RQ is now software owned and the (after a successful invalidate) so we
-	 * should be able to trust rq_hw::head_indx - better than scanning the CQ
-	 * for unprocessed elements:
-	 * Note that only the lowest 14 bits of the sequence number in head_indx is
-	 * valid:
-	 */
+		/* The RQ is now software owned (after a successful invalidate), so we
+		 * should be able to trust rq_hw::head_indx - better than scanning the CQ
+		 * for unprocessed elements.
+		 * Note that only the lowest 14 bits of the sequence number in head_indx
+		 * are valid:
+		 */
 flush_rq_again:
-	head = get_psif_rq_hw__head_indx(&rq->d);
-	tail = rq_sw->next_seq;
-	real_len = rq_length(rq, head, tail & ((1 << 14) - 1)) & ((1 << 14) - 1);
-
-	/* Workaround #622 v2 step 3: Check the last completion on the CQ
-	 * The rq_sw->length is used to track the length of a queue
-	 * with #posted - #completed. If the calculated real_len is
-	 * smaller than the len, it means that a completion is missing.
-	 * Instead of loooping RQ to find rqe of the completed wc_id, the
-	 * rq_sw->length represents the #posted - #completed, and nfixup
-	 * represents the remaining completions after the QP moved to RESET.
-	 * Thus, the number of flush-in error that must be generated is
-	 * rq_sw->length - nfixup.
-	 */
-	if (!(test_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags))) {
-		/* need to use a flag to differentiate between the first call of
-		 * sif_flush_rq or the subsequent call. The race condition where
-		 * HW acquired a RWQE but does not generate a completion can
-		 * only happen at the first call of sif_flush_rq. This is because
-		 * the QP state is moved to RESET.
-		 * Besides, if the generated completion arrived later and
-		 * FLUSH_RQ_IN_FLIGHT is set, the test of real_len < len
-		 * might be true.
+		head = get_psif_rq_hw__head_indx(&rq->d);
+		tail = rq_sw->next_seq;
+		real_len = rq_length(rq, head, tail & ((1 << 14) - 1)) & ((1 << 14) - 1);
+
+		/* Workaround #622 v2 step 3: Check the last completion on the CQ
+		 * The rq_sw->length is used to track the length of a queue
+		 * with #posted - #completed. If the calculated real_len is
+		 * smaller than len, it means that a completion is missing.
+		 * Instead of looping over the RQ to find the rqe of the completed
+		 * wc_id, rq_sw->length represents #posted - #completed, and nfixup
+		 * represents the remaining completions after the QP moved to RESET.
+		 * Thus, the number of flush-in-error completions that must be
+		 * generated is rq_sw->length - nfixup.
 	 */
-	len = atomic_read(&rq_sw->length);
-	if (real_len < len) {
-		struct psif_qp lqps;
-
-		copy_conv_to_sw(&lqps, &target_qp->d, sizeof(lqps));
-
-		/* from Brian - This is a scenario where the first packet is received,
-		 * the RQ is claimed but the Last packet is not received after the QP
-		 * is in Error. Then, nothing will come up the pipe to complete the
-		 * Received and it will be dangling.
+		if (!(test_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags))) {
+			/* need to use a flag to differentiate between the first call of
+			 * sif_flush_rq and a subsequent call. The race condition where
+			 * HW has acquired an RWQE but does not generate a completion can
+			 * only happen at the first call of sif_flush_rq. This is because
+			 * the QP state is moved to RESET.
+			 * Besides, if the generated completion arrives later and
+			 * FLUSH_RQ_IN_FLIGHT is set, the test of real_len < len
+			 * might be true.
 		 */
-		if ((lqps.state.expected_opcode != NO_OPERATION_IN_PROGRESS) &&
-		    (lqps.state.committed_received_psn + 1 == lqps.state.expected_psn)) {
-			int entries;
-			struct sif_cq *cq = get_sif_cq(sdev, lqps.state.rcv_cq_indx);
-			struct sif_cq_sw *cq_sw;
-			unsigned long timeout;
-
-			if (!cq) {
+			len = atomic_read(&rq_sw->length);
+			if (real_len < len) {
+				struct psif_qp lqps;
+
+				copy_conv_to_sw(&lqps, &target_qp->d, sizeof(lqps));
+
+				/* from Brian - This is a scenario where the first packet is received,
+				 * the RQ is claimed but the last packet is not received after the QP
+				 * is in Error. Then, nothing will come up the pipe to complete the
+				 * receive and it will be dangling.
+				 */
+				if ((lqps.state.expected_opcode != NO_OPERATION_IN_PROGRESS) &&
+				    (lqps.state.committed_received_psn + 1 == lqps.state.expected_psn)) {
+					int entries;
+					struct sif_cq *cq = get_sif_cq(sdev, lqps.state.rcv_cq_indx);
+					struct sif_cq_sw *cq_sw;
+					unsigned long timeout;
+
+					if (!cq) {
+						sif_log(sdev, SIF_RQ,
+							"received cq is NULL");
+						goto free_rq_error;
+					}
+					cq_sw = get_sif_cq_sw(sdev, cq->index);
+
+					/* wait for 1 second to ensure that all the completions are back */
+					timeout = jiffies + HZ;
+					do {
+						cpu_relax();
+					} while (time_is_after_jiffies(timeout));
+
+					/* use the software counter (rq_sw->length) */
+					entries = find_recv_cqes_in_cq(sdev, cq, target_qp);
+					len = atomic_read(&rq_sw->length);
 				sif_log(sdev, SIF_RQ,
-					"recevied cq is NULL");
-				goto error;
+						"RQ %d: updating calculated entries from %d to %d - %d (%d)",
+						rq->index, real_len, len, entries, len - entries);
+					real_len = real_len < len ? len - entries : real_len;
 			}
-			cq_sw = get_sif_cq_sw(sdev, cq->index);
-
-			/* wait for 1 second to ensure that all the completions are back */
-			timeout = jiffies + HZ;
-			do {
-				cpu_relax();
-			} while (time_is_after_jiffies(timeout));
-
-			/* use the software counter (rq_sw->length) */
-			entries = find_recv_cqes_in_cq(sdev, cq, target_qp);
-			len = atomic_read(&rq_sw->length);
-			sif_log(sdev, SIF_RQ,
-				"RQ %d: updating calculated entries from %d to %d - %d (%d)",
-				rq->index, real_len, len, entries, len - entries);
-			real_len = real_len < len ? len - entries : real_len;
 		}
-		set_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags);
-	}
+			}
+			set_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags);
+		}
 
-	/* Now find the actual 32 bit seq.no */
-	head = tail - real_len;
+		/* Now find the actual 32-bit seq.no */
+		head = tail - real_len;
 
-	sif_log(sdev, SIF_RQ,
-		"RQ %d not empty: sz %d, head %d, next_seq %d, %d/%d entries at exit",
-		rq->index, rq->entries, head, tail, len, real_len);
+		sif_log(sdev, SIF_RQ,
+			"RQ %d not empty: sz %d, head %d, next_seq %d, %d/%d entries at exit",
+			rq->index, rq->entries, head, tail, len, real_len);
 
-	if (!real_len)
-		goto error;
+		if (!real_len)
+			goto free_rq_error;
 
-	/* Workaround #622 v2 step 4: generate flush in error completion
-	 * Generate flushed in error completions:
-	 * these give no pqp completions but may in theory fail
-	 */
-	while (real_len > 0) {
-		sif_log(sdev, SIF_PQP, "rq %d, len %d", rq->index, real_len);
-		ret = sif_gen_rq_flush_cqe(sdev, rq, head, target_qp);
-		if (ret)
-			sif_log(sdev, SIF_INFO, "rq %d, len %d, sif_gen_rq_flush_cqe returned %d",
-				rq->index, real_len, ret);
-		if (ret == -EAGAIN) {
-			ret = gen_pqp_cqe(&lcqe);
-			if (ret < 0)
-				goto error;
-			ret = poll_cq_waitfor(&lcqe);
-			if (ret < 0)
-				goto error;
-			lcqe.written = false;
-			continue;
-		}
-		if (ret < 0)
-			goto error;
-		real_len--;
-		head++;
-	}
+		/* Workaround #622 v2 step 4: generate flush-in-error completions.
+		 * These give no pqp completions but may in theory fail:
+		 */
+		while (real_len > 0) {
+			sif_log(sdev, SIF_PQP, "rq %d, len %d", rq->index, real_len);
+			ret = sif_gen_rq_flush_cqe(sdev, rq, head, target_qp);
+			if (ret)
+				sif_log(sdev, SIF_INFO, "rq %d, len %d, sif_gen_rq_flush_cqe returned %d",
+					rq->index, real_len, ret);
+			if (ret == -EAGAIN) {
+				ret = gen_pqp_cqe(&lcqe);
+				if (ret < 0)
+					goto free_rq_error;
+				ret = poll_cq_waitfor(&lcqe);
+				if (ret < 0)
+					goto free_rq_error;
+				lcqe.written = false;
+				continue;
+			}
+			if (ret < 0)
+				goto free_rq_error;
+			real_len--;
+			head++;
+		}
 
-	/* Finally generate a sync.completion for us on the PQP itself
-	 * to allow us to wait for the whole to complete:
-	 */
-	ret = gen_pqp_cqe(&lcqe);
-	if (ret < 0) {
-		sif_log(sdev, SIF_INFO, "rq %d, cqe %p gen_pqp_cqe returned %d",
-			rq->index, &lcqe, ret);
-		goto error;
-	}
-	ret = poll_cq_waitfor(&lcqe);
-	if (ret < 0) {
-		sif_log(sdev, SIF_INFO, "rq %d, cqe %p poll_cq_waitfor returned %d",
-			rq->index, &lcqe, ret);
-		goto error;
-	}
+		/* Finally generate a sync. completion for us on the PQP itself
+		 * to allow us to wait for the whole to complete:
+		 */
+		ret = gen_pqp_cqe(&lcqe);
+		if (ret < 0) {
+			sif_log(sdev, SIF_INFO, "rq %d, cqe %p gen_pqp_cqe returned %d",
+				rq->index, &lcqe, ret);
+			goto free_rq_error;
+		}
 
-	sif_log(sdev, SIF_INFO_V, "RQ %d: received completion on cq %d seq 0x%x - done",
-		rq->index, rq->cq_idx, lcqe.cqe.seq_num);
+		ret = poll_cq_waitfor(&lcqe);
+		if (ret < 0) {
+			sif_log(sdev, SIF_INFO, "rq %d, cqe %p poll_cq_waitfor returned %d",
+				rq->index, &lcqe, ret);
+			goto free_rq_error;
+		}
 
-	/* Make sure hardware pointer reflects the flushed situation */
-	set_psif_rq_hw__head_indx(&rq->d, head);
-	wmb();
+		sif_log(sdev, SIF_INFO_V, "RQ %d: received completion on cq %d seq 0x%x - done",
+			rq->index, rq->cq_idx, lcqe.cqe.seq_num);
 
-	/* if FLUSH_RQ_IN_FLIGHT is set, it means another party is trying to
-	 * flush the rq at the same time. This should be retried
-	 * once as no more than one asynchronous event will be generated if
-	 * QP is in ERROR state. This is to take care of a scenario where
-	 * QP is modified to ERROR explicitly and at the same time received
-	 * the asynchronous event. Nevertheless, the RQ entry changes in between
-	 * of these two scenario that can trigger flush rq.
-	 */
-	if (test_and_clear_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags))
-		goto flush_rq_again;
+		/* Make sure hardware pointer reflects the flushed situation */
+		set_psif_rq_hw__head_indx(&rq->d, head);
+		wmb();
+
+		/* if FLUSH_RQ_IN_FLIGHT is set, it means another party is trying to
+		 * flush the rq at the same time. This should be retried
+		 * once, as no more than one asynchronous event will be generated if
+		 * the QP is in ERROR state. This takes care of a scenario where
+		 * the QP is modified to ERROR explicitly and at the same time receives
+		 * the asynchronous event, while the RQ entries change in between
+		 * these two triggers of the rq flush.
+		 */
+		if (test_and_clear_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags))
+			goto flush_rq_again;
+free_rq_error:
+		if (atomic_dec_and_test(&rq->refcnt))
+			complete(&rq->can_destroy);
+	}
 error:
 	clear_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags);
 	return ret = ret > 0 ? 0 : ret;
@@ -539,18 +547,24 @@ error:
 int free_rq(struct sif_dev *sdev, int rq_idx)
 {
 	struct sif_rq *rq;
-	int stat;
 
 	rq = get_sif_rq(sdev, rq_idx);
 	sif_log(sdev, SIF_RQ, "entry %d", rq_idx);
 
-	stat = atomic_dec_and_test(&rq->refcnt);
-	if (!stat) {
+	if (!rq->is_srq) {
+		if (atomic_dec_and_test(&rq->refcnt))
+			complete(&rq->can_destroy);
+		wait_for_completion(&rq->can_destroy);
+		goto clean_rq;
+	}
+
+	if (!atomic_dec_and_test(&rq->refcnt)) {
 		sif_log(sdev, SIF_RQ, "rq %d still in use - ref.cnt %d",
 			rq_idx, atomic_read(&rq->refcnt));
 		return -EBUSY;
 	}
 
+clean_rq:
 	sif_release_rq(sdev, rq->index);
 	return 0;
 }
diff --git a/drivers/infiniband/hw/sif/sif_rq.h b/drivers/infiniband/hw/sif/sif_rq.h
index be8bb21fadc2..8dd2fcb37cef 100644
--- a/drivers/infiniband/hw/sif/sif_rq.h
+++ b/drivers/infiniband/hw/sif/sif_rq.h
@@ -26,6 +26,7 @@ struct sif_rq {
 	bool is_srq;      /* Set if this is a shared receive queue */
 	int xrc_domain;   /* If != 0: This is an XRC SRQ member of this domain idx */
 	atomic_t refcnt;  /* Ref.count for usage as a shared receive queue */
+	struct completion can_destroy; /* used with refcnt for synchronization in the !srq case */
 	u16 entries;      /* Allocated entries */
 	u16 entries_user; /* Entries reported to user (entries -1 if max) */
 	u32 sg_entries;   /* Max receive scatter/gather configured for this rq */
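
The patch is an instance of a refcount-plus-completion handshake: alloc_rq takes a
base reference, sif_flush_rq only works on the rq if it can take an extra reference
via atomic_add_unless (which refuses once the count has reached zero), and free_rq
drops the base reference and then blocks on can_destroy until the last reference is
gone. A minimal, self-contained sketch of that pattern, using hypothetical names
(struct obj, obj_init, obj_flush, obj_destroy - an illustration, not the sif
driver's code), might look as follows:

	#include <linux/atomic.h>
	#include <linux/completion.h>

	struct obj {
		atomic_t refcnt;		/* 1 base reference, owned by obj_destroy() */
		struct completion can_destroy;	/* completed when the last reference is dropped */
	};

	static void obj_init(struct obj *o)
	{
		init_completion(&o->can_destroy);
		atomic_set(&o->refcnt, 1);
	}

	static void obj_put(struct obj *o)
	{
		/* the last reference out signals the waiter in obj_destroy() */
		if (atomic_dec_and_test(&o->refcnt))
			complete(&o->can_destroy);
	}

	static void obj_flush(struct obj *o)
	{
		/* take a reference unless teardown has already dropped the count
		 * to 0; atomic_add_unless() never resurrects a dead object
		 */
		if (!atomic_add_unless(&o->refcnt, 1, 0))
			return;

		/* ... flush work goes here ... */

		obj_put(o);
	}

	static void obj_destroy(struct obj *o)
	{
		obj_put(o);		/* drop the base reference */
		wait_for_completion(&o->can_destroy);
		/* no flusher holds a reference now - safe to release resources */
	}

The important property lies in atomic_add_unless: once the free path has dropped the
count to zero, a late flush call can no longer take a reference and backs off instead
of touching an object whose teardown has begun, while the free path in turn cannot
pass wait_for_completion until every in-flight flush has dropped its reference.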