www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
RDS: Handle RDMA_CM_EVENT_TIMEWAIT_EXIT event.
author Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Fri, 18 Sep 2015 01:37:01 +0000 (18:37 -0700)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Tue, 6 Oct 2015 04:40:08 +0000 (21:40 -0700)
RDS/IB connection can hang with the log message
   "CQ access violation on CQN ..."
followed by
   "RDS: unknown event 15!".

Event 15 is RDMA_CM_EVENT_TIMEWAIT_EXIT, which RDS was not handling.
With this fix, RDS will now attempt to reconnect when it receives this event.

The fix contains 2 changes.
1) RDS change to handle RDMA_CM_EVENT_TIMEWAIT_EXIT event.
2) Display diagnostic data of "syndrome" and "vendor_error_syndrome" in
   mlx4_core when CQ access violation occurs.

Orabug: 21675221

Acked-by: Chien Yen <chien.yen@oracle.com>
Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
drivers/net/ethernet/mellanox/mlx4/eq.c
net/rds/rdma_transport.c

index 2619c9fbf42dfb952473e4779a2ee8d6ebfd2c65..5677d8a1c51eae9f73f269c8ca3e61c84f169428 100644 (file)
@@ -627,10 +627,13 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
                }
 
                case MLX4_EVENT_TYPE_CQ_ERROR:
-                       mlx4_warn(dev, "CQ %s on CQN %06x\n",
+                       mlx4_warn(dev,
+                "CQ %s on CQN %06x syndrome=0x%x vendor_error_syndrome=0x%x\n",
                                  eqe->event.cq_err.syndrome == 1 ?
                                  "overrun" : "access violation",
-                                 be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+                                 be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff,
+                                 eqe->event.cq_err.syndrome,
+                                 eqe->event.cq_err.reserved2[2]);
                        if (mlx4_is_master(dev)) {
                                ret = mlx4_get_slave_from_resource_id(dev,
                                        RES_CQ,
index b826fcc9c709e48877f3ddd7b16da4167a836a75..42dad98f6cbe48bd5b0f4ac8232627315a054aa8 100644 (file)
@@ -276,6 +276,17 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                rds_conn_drop(conn);
                break;
 
+       case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+               if (conn) {
+                       printk(KERN_INFO "TIMEWAIT_EXIT event - "
+                               "dropping connection "
+                               "%pI4->%pI4\n", &conn->c_laddr,
+                                &conn->c_faddr);
+                       rds_conn_drop(conn);
+               } else
+                       printk(KERN_INFO "TIMEWAIT_EXIT event - conn=NULL\n");
+               break;
+
        default:
                /* things like device disconnect? */
                printk(KERN_ERR "RDS: unknown event %u!\n", event->event);