www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
sif: sif_r3: implemented WA#4074 stats counters
author Triviño <francisco.trivino@oracle.com>
Thu, 30 Jun 2016 14:30:07 +0000 (16:30 +0200)
committer Santosh Shilimkar <santosh.shilimkar@oracle.com>
Wed, 27 Jul 2016 17:24:20 +0000 (10:24 -0700)
This commit adds both wa4074 and wa4059 statistics
to help identify potential issues when the
workarounds are applied.

The wa4074 stats implementation is based on:

a) pre_wa4074_cnt == post_wa4074_cnt. This means the
w/a is triggered from the modify_qp_hw.
b) pre_wa4074_cnt != post_wa4074_cnt. post_wa4074 is
triggered from other scenarios too.
c) post_wa4074_err_cnt != 0. It means that post_wa4074
fails.
d) wrs_csum_corr_wa4074_cnt indicates the number of
WRs that were csum corrupted.
e) rcv_snd_gen_wa4074_cnt shows the number of recv
and send cqe's that were generated.

The wa4059 stats indicate the number of keep-alive
events that have been sent.

This commit also improves the wa3714 stats implementation
by using atomic64 counters and enumeration values,
and makes other minor changes such as cleaning up and
fixing typos in comments.

Orabug: 23760170

Signed-off-by: Triviño <francisco.trivino@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
drivers/infiniband/hw/sif/sif_epsc.c
drivers/infiniband/hw/sif/sif_r3.c
drivers/infiniband/hw/sif/sif_r3.h

index 01f79c87114e2ca6e457e5e5b11e1a0c46f41cc2..95a213dbd71126922b4367ddfd77af7eca374236 100644 (file)
@@ -1511,7 +1511,11 @@ static int __sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type e
        int ret = 0;
 
        if (sif_eps_keep_alive_timeout(es) || force) {
-               sif_log(sdev, SIF_INFO, "Sending keep-alive (force=%i)", force);
+               sif_log(sdev, SIF_INTR, "Sending keep-alive (force=%i)", force);
+               if (force)
+                       atomic64_inc(&sdev->wa_stats.wa4059[SND_INTR_KEEP_ALIVE_WA4059_CNT]);
+               else
+                       atomic64_inc(&sdev->wa_stats.wa4059[SND_THREAD_KEEP_ALIVE_WA4059_CNT]);
 
                /* prevent infinite loop with __sif_post_eps_wr */
                es->last_req_posted = jiffies;
index 2446fbc56916968555d0bf452cea1348c3007005..f60e6082eda85d1446b7e61cf9324ddaf5096e22 100644 (file)
@@ -292,7 +292,7 @@ static int sif_hw_allocate_flush_qp(struct sif_dev *sdev, u8 flush_idx)
        }
 
        sdev->flush_qp[flush_idx] = qp->qp_idx;
-       sif_log(sdev, SIF_INFO, "Allocated flush-retry qp port %d, index %d", port, sdev->flush_qp[flush_idx]);
+       sif_log(sdev, SIF_QP, "Allocated flush-retry qp port %d, index %d", port, sdev->flush_qp[flush_idx]);
 
        return ret;
 
@@ -455,18 +455,18 @@ int reset_qp_flush_retry(struct sif_dev *sdev, u8 flush_idx)
                }
        }
 
-       sdev->wa_stats.wa3714[0]++;
+       atomic64_inc(&sdev->wa_stats.wa3714[FLUSH_RETRY_WA3714_CNT]);
        mutex_unlock(&sdev->flush_lock[flush_idx]);
        return ret;
 fail:
-       sdev->wa_stats.wa3714[1]++;
+       atomic64_inc(&sdev->wa_stats.wa3714[FLUSH_RETRY_WA3714_ERR_CNT]);
        sif_hw_free_flush_qp(sdev, flush_idx);
        sif_hw_allocate_flush_qp(sdev, flush_idx);
        mutex_unlock(&sdev->flush_lock[flush_idx]);
        return ret;
 
 err_flush_qp:
-       sdev->wa_stats.wa3714[1]++;
+       atomic64_inc(&sdev->wa_stats.wa3714[FLUSH_RETRY_WA3714_ERR_CNT]);
        mutex_unlock(&sdev->flush_lock[flush_idx]);
        return ret;
 }
@@ -519,10 +519,13 @@ int pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp)
                set_psif_wr__checksum(&sqe->wr, ~get_psif_wr__checksum(&sqe->wr));
                len--;
        }
+       atomic64_add(len, &sdev->wa_stats.wa4074[WRS_CSUM_CORR_WA4074_CNT]);
        spin_unlock_irqrestore(&sq->lock, flags);
        if (cq)
                set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
 
+       atomic64_inc(&sdev->wa_stats.wa4074[PRE_WA4074_CNT]);
+
        return 0;
 }
 
@@ -694,6 +697,8 @@ flush_sq_again:
                                "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d",
                                sq->index, last_seq, ret);
 
+               atomic64_inc(&sdev->wa_stats.wa4074[RCV_SND_GEN_WA4074_CNT]);
+
                if (ret == -EAGAIN) {
                        ret = gen_pqp_cqe(&lcqe);
                        if (ret < 0)
@@ -738,6 +743,12 @@ err_post_wa4074:
        clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
        clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags);
        clear_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags);
+
+       if (ret < 0)
+               atomic64_inc(&sdev->wa_stats.wa4074[POST_WA4074_ERR_CNT]);
+       else
+               atomic64_inc(&sdev->wa_stats.wa4074[POST_WA4074_CNT]);
+
        return ret = ret > 0 ? 0 : ret;
 }
 
@@ -781,7 +792,7 @@ static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 hea
                                if (GREATER_16(updated_seq, end)) {
                                        /* A scenario might be that an additional CQE
                                         * must be generated to flush all the HW
-                                        * generated completions. Thus, igore the polling the cqe.
+                                        * generated completions. Thus, ignore the polling of the cqe.
                                         */
                                        lcqe.seq_num = ~lcqe.seq_num;
                                        sif_log(sdev, SIF_WCE_V, "corrupt: lcqe.seq_num %x",
@@ -799,6 +810,7 @@ static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 hea
                sq->index, cq->index, n);
 
        spin_unlock_irqrestore(&cq->lock, flags);
+
        return updated_seq;
 }
 
@@ -857,12 +869,25 @@ static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_se
 
 void sif_dfs_print_wa_stats(struct sif_dev *sdev, char *buf)
 {
-       /* Header */
-       sprintf(buf, "#%7s %10s %10s %20s\n", "WA", "ok", "err", "desc");
-       /* Content */
-       sprintf(buf + strlen(buf), "#%8s %9llu %10llu %20s\n",
-               "WA3714",
-               sdev->wa_stats.wa3714[0],
-               sdev->wa_stats.wa3714[1],
-               "Destroying QPs with a retry in progress");
+       /* Header WA#3714 */
+       sprintf(buf, "\nWA3714: Destroying QPs with a retry in progress\n");
+       /* Content WA#3714 */
+       sprintf(buf + strlen(buf), "%s: %lu\n%s: %lu\n",
+               "ok", atomic64_read(&sdev->wa_stats.wa3714[FLUSH_RETRY_WA3714_CNT]),
+               "err", atomic64_read(&sdev->wa_stats.wa3714[FLUSH_RETRY_WA3714_ERR_CNT]));
+       /* Header WA#4074 */
+       sprintf(buf + strlen(buf), "\nWA4074: Duplicate flushed in error completions\n");
+       /* Content WA#4074 */
+       sprintf(buf + strlen(buf), "%s: %lu\n%s: %lu\n%s: %lu\n%s: %lu\n%s: %lu\n",
+               "pre-ok", atomic64_read(&sdev->wa_stats.wa4074[PRE_WA4074_CNT]),
+               "post-ok", atomic64_read(&sdev->wa_stats.wa4074[POST_WA4074_CNT]),
+               "post-err", atomic64_read(&sdev->wa_stats.wa4074[POST_WA4074_ERR_CNT]),
+               "wr-csum-corr", atomic64_read(&sdev->wa_stats.wa4074[WRS_CSUM_CORR_WA4074_CNT]),
+               "rcv-snd-gen", atomic64_read(&sdev->wa_stats.wa4074[RCV_SND_GEN_WA4074_CNT]));
+       /* Header WA#4059 */
+       sprintf(buf + strlen(buf), "\nWA4059: Mailbox writes from host to EPS sometimes get misplaced\n");
+       /* Content WA#4059 */
+       sprintf(buf + strlen(buf), "%s: %lu\n%s: %lu\n",
+               "keep-alive-int", atomic64_read(&sdev->wa_stats.wa4059[SND_INTR_KEEP_ALIVE_WA4059_CNT]),
+               "keep-alive-thread", atomic64_read(&sdev->wa_stats.wa4059[SND_THREAD_KEEP_ALIVE_WA4059_CNT]));
 }
index b6cf44c8f81b9894c6270070271015dc18e7b486..bb694b40f5d125e74a1fa565bcbab002f1ed24e3 100644 (file)
 #ifndef _SIF_R3_H
 #define _SIF_R3_H
 
+enum wa4059_stats_counter {
+       SND_INTR_KEEP_ALIVE_WA4059_CNT   = 0,
+       SND_THREAD_KEEP_ALIVE_WA4059_CNT = 1,
+       WA4059_CNT_MAX                   = 2,
+};
+
+enum wa3714_stats_counter {
+       FLUSH_RETRY_WA3714_CNT     = 0,
+       FLUSH_RETRY_WA3714_ERR_CNT = 1,
+       WA3714_CNT_MAX             = 2,
+};
+
+enum wa4074_stats_counter {
+       PRE_WA4074_CNT           = 0,
+       POST_WA4074_CNT          = 1,
+       POST_WA4074_ERR_CNT      = 2,
+       WRS_CSUM_CORR_WA4074_CNT = 3,
+       RCV_SND_GEN_WA4074_CNT   = 4,
+       WA4074_CNT_MAX           = 5,
+};
+
 struct sif_wa_stats {
        /* Destroying QPs with a retry in progress */
-       u64 wa3714[2];
+       atomic64_t wa3714[WA3714_CNT_MAX];
+       /* Duplicate flushed in error completions */
+       atomic64_t wa4074[WA4074_CNT_MAX];
+       /* Mailbox writes from host to EPS sometimes get misplaced */
+       atomic64_t wa4059[WA4059_CNT_MAX];
 };
 
 void sif_r3_pre_init(struct sif_dev *sdev);
@@ -30,7 +55,6 @@ void sif_r3_recreate_flush_qp(struct sif_dev *sdev, u8 flush_idx);
 /* WA for #4074 */
 int pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp);
 int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp);
-int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp);
 
 /* Single file for the wa statistics */
 void sif_dfs_print_wa_stats(struct sif_dev *sdev, char *buf);