]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
ib/rds: runtime debuggability enhancement
authorQing Huang <qing.huang@oracle.com>
Wed, 15 Jul 2015 01:36:43 +0000 (18:36 -0700)
committerMukesh Kacker <mukesh.kacker@oracle.com>
Tue, 13 Oct 2015 15:43:25 +0000 (08:43 -0700)
This patch provides the ability to dynamically turn on or off various
types of debug/diag prints inside the RDS module.

The run-time debug prints are controlled by a rds module parameter,
rds_rt_debug_bitmap.

Here is the definition for different bits. We have implemented feature
related bits, such as Connection Management, Active Bonding, Error prints,
Send, Recv.

in net/rds/rds_rt_debug.h
...
enum {
        /* bit 0 ~ 19 are feature related bits */
        RDS_RTD_ERR                     = 1 << 0,       /* 0x1    */
        RDS_RTD_ERR_EXT                 = 1 << 1,       /* 0x2    */

        RDS_RTD_CM                      = 1 << 3,       /* 0x8    */
        RDS_RTD_CM_EXT                  = 1 << 4,       /* 0x10   */
        RDS_RTD_CM_EXT_P                = 1 << 5,       /* 0x20   */

        RDS_RTD_ACT_BND                 = 1 << 7,       /* 0x80   */
        RDS_RTD_ACT_BND_EXT             = 1 << 8,       /* 0x100  */

        RDS_RTD_RCV                     = 1 << 11,      /* 0x800  */
        RDS_RTD_RCV_EXT                 = 1 << 12,      /* 0x1000 */

        RDS_RTD_SND                     = 1 << 14,      /* 0x4000 */
        RDS_RTD_SND_EXT                 = 1 << 15,      /* 0x8000 */
...

In general, *EXTRA bits mean that you will get extra information but
possible flood prints as well. But every bit can be controlled by users
so users can decide how much information they want to see/collect. The
current embedded printk level used for this patch is KERN_INFO. Most
likely all the msgs will only go to /var/log/messages without showing up
on console if we use the default settings for /proc/sys/kernel/printk and
/etc/rsyslog.conf in ol6 environment.

E.g if we want to turn on RDS_RTD_ERR and RDS_RTD_CM bits. What we can
do is

echo 0x9 > /sys/module/rds/parameters/rds_rt_debug_bitmap

To turn on RDS_RTD_ERR(0x1), RDS_RTD_CM(0x8), and RDS_RTD_RCV(0x800) bits

echo 0x809 > /sys/module/rds/parameters/rds_rt_debug_bitmap

Performance penalty: with all the debug flag bits set to 0, there
should be no porformance impact in a kernel with this patch.

Orabug 21314268

Reviewed-by: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
Reviewed-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Qing Huang <qing.huang@oracle.com>
Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
net/rds/Makefile
net/rds/connection.c
net/rds/ib.c
net/rds/ib_cm.c
net/rds/rdma_transport.c
net/rds/rds.h
net/rds/rds_rt_debug.c [new file with mode: 0644]
net/rds/rds_rt_debug.h [new file with mode: 0644]
net/rds/recv.c
net/rds/send.c
net/rds/threads.c

index 56d3f6023ced41adb81510ec0dc8525c9230dea9..60fb4029e0a5e990aa18686ed133c4eefc9d158c 100644 (file)
@@ -1,7 +1,7 @@
 obj-$(CONFIG_RDS) += rds.o
 rds-y :=       af_rds.o bind.o cong.o connection.o info.o message.o   \
                        recv.o send.o stats.o sysctl.o threads.o transport.o \
-                       loop.o page.o rdma.o
+                       loop.o page.o rdma.o rds_rt_debug.o
 
 obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
 rds_rdma-y :=  rdma_transport.o \
index 0dd0cef5ca8d64851769a35343cea6224d7bb9a1..4538d90e2083c203b6dfa1d0a41d89ff201065f2 100644 (file)
@@ -317,6 +317,10 @@ void rds_conn_shutdown(struct rds_connection *conn, int restart)
 {
        /* shut it down unless it's down already */
        if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+               rds_rtd(RDS_RTD_CM_EXT,
+                       "RDS/IB: shutdown init <%u.%u.%u.%u,%u.%u.%u.%u,%d>, cn %p, cn->c_p %p\n",
+                       NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos, conn, conn->c_passive);
                /*
                 * Quiesce the connection mgmt handlers before we start tearing
                 * things down. We don't hold the mutex for the entire
@@ -365,6 +369,10 @@ void rds_conn_shutdown(struct rds_connection *conn, int restart)
        rcu_read_lock();
        if (!hlist_unhashed(&conn->c_hash_node) && restart) {
                rcu_read_unlock();
+               rds_rtd(RDS_RTD_CM_EXT,
+                       "queueing reconnect request... <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                       NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos);
                rds_queue_reconnect(conn);
        } else {
                rcu_read_unlock();
@@ -383,9 +391,9 @@ void rds_conn_destroy(struct rds_connection *conn)
        struct rds_message *rm, *rtmp;
        unsigned long flags;
 
-       rdsdebug("freeing conn %p for %pI4 -> "
-                "%pI4\n", conn, &conn->c_laddr,
-                &conn->c_faddr);
+       rds_rtd(RDS_RTD_CM, "freeing conn %p <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+               conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+               conn->c_tos);
 
        /* Ensure conn will not be scheduled for reconnect */
        spin_lock_irq(&rds_conn_lock);
@@ -676,6 +684,11 @@ void rds_conn_drop(struct rds_connection *conn)
 
        atomic_set(&conn->c_state, RDS_CONN_ERROR);
 
+       rds_rtd(RDS_RTD_CM_EXT,
+               "RDS/IB: queueing shutdown work, conn %p, <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+               conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+               conn->c_tos);
+
        if (conn->c_loopback)
                queue_work(rds_local_wq, &conn->c_down_w);
        else
@@ -692,6 +705,10 @@ void rds_conn_connect_if_down(struct rds_connection *conn)
 {
        if (rds_conn_state(conn) == RDS_CONN_DOWN &&
            !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) {
+               rds_rtd(RDS_RTD_CM_EXT,
+                       "queueing connect work, conn %p, <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                       conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos);
                if (conn->c_loopback)
                        queue_delayed_work(rds_local_wq, &conn->c_conn_w, 0);
                else
index 9291e86d27f52a2214d1a4648cb9e4164e57437c..377d516b2c91a5075acba77e411cfffa4d0195db 100644 (file)
@@ -180,6 +180,8 @@ void rds_ib_nodev_connect(void)
 {
        struct rds_ib_connection *ic;
 
+       rds_rtd(RDS_RTD_CM_EXT, "check & build all connections\n");
+
        spin_lock(&ib_nodev_conns_lock);
        list_for_each_entry(ic, &ib_nodev_conns, ib_node)
                rds_conn_connect_if_down(ic->conn);
@@ -191,6 +193,9 @@ void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
        struct rds_ib_connection *ic;
        unsigned long flags;
 
+       rds_rtd(RDS_RTD_CM_EXT,
+               "calling rds_conn_drop to drop all connections.\n");
+
        spin_lock_irqsave(&rds_ibdev->spinlock, flags);
        list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
                rds_conn_drop(ic->conn);
@@ -296,8 +301,11 @@ void rds_ib_remove_one(struct ib_device *device)
        int i;
 
        rds_ibdev = ib_get_client_data(device, &rds_ib_client);
-       if (!rds_ibdev)
+       if (!rds_ibdev) {
+               rds_rtd(RDS_RTD_ACT_BND, "rds_ibdev is NULL, ib_device %p\n",
+                       device);
                return;
+       }
 
        if (rds_ib_active_bonding_enabled) {
                for (i = 1; i <= ip_port_cnt; i++) {
@@ -307,6 +315,9 @@ void rds_ib_remove_one(struct ib_device *device)
                ib_unregister_event_handler(&rds_ibdev->event_handler);
        }
 
+       rds_rtd(RDS_RTD_ACT_BND,
+               "calling rds_ib_dev_shutdown, ib_device %p, rds_ibdev %p\n",
+               device, rds_ibdev);
        rds_ib_dev_shutdown(rds_ibdev);
 
        /* stop connection attempts from getting a reference to this device. */
@@ -611,6 +622,9 @@ static void rds_ib_conn_drop(struct work_struct *_work)
                container_of(_work, struct rds_ib_conn_drop_work, work.work);
        struct rds_connection   *conn = work->conn;
 
+       rds_rtd(RDS_RTD_CM_EXT,
+               "conn: %p, calling rds_conn_drop\n", conn);
+
        rds_conn_drop(conn);
 
        kfree(work);
@@ -838,6 +852,10 @@ static int rds_ib_move_ip(char                     *from_dev,
                                                        ic->conn->c_faddr &&
                                                        ic2->conn->c_faddr ==
                                                        ic->conn->c_laddr) {
+                                                   rds_rtd(RDS_RTD_CM_EXT_P,
+                                                           "conn:%p, tos %d, calling rds_conn_drop\n",
+                                                           ic2->conn,
+                                                           ic2->conn->c_tos);
                                                        rds_conn_drop(ic2->conn);
                                                }
                                        }
@@ -862,8 +880,12 @@ static int rds_ib_move_ip(char                     *from_dev,
                                        INIT_DELAYED_WORK(&work->work, rds_ib_conn_drop);
                                        queue_delayed_work(rds_aux_wq, &work->work,
                                                msecs_to_jiffies(1000 * rds_ib_active_bonding_reconnect_delay));
-                               } else
+                               } else {
+                                       rds_rtd(RDS_RTD_CM_EXT,
+                                               "conn: %p, tos %d, calling rds_conn_drop\n",
+                                               ic->conn, ic->conn->c_tos);
                                        rds_conn_drop(ic->conn);
+                               }
                        }
                }
                spin_unlock_bh(&rds_ibdev->spinlock);
@@ -1090,11 +1112,10 @@ static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port,
 
                if (!to_port) {
                        /* we tried, but did not get a failover port! */
-                       rdsdebug("RDS/IB: IP %u.%u.%u.%u failed to "
-                                "migrate from %s: no matching "
-                                "destination port available!\n",
-                                NIPQUAD(ip_config[from_port].ip_addr),
-                                ip_config[from_port].if_name);
+                       rds_rtd(RDS_RTD_ERR,
+                               "RDS/IB: IP %u.%u.%u.%u failed to migrate from %s: no matching dest port avail!\n",
+                               NIPQUAD(ip_config[from_port].ip_addr),
+                               ip_config[from_port].if_name);
                        return;
                }
        } else {
@@ -1216,9 +1237,9 @@ static void rds_ib_do_failback(u8 port, int event_type)
                /* Test IP addresses and set them if not already set */
                ret = rds_ib_testset_ip(port);
                if (ret) {
-                       rdsdebug("RDS/IB: failed to ressurrect "
-                                "port index %u devname %s or one if its aliases\n",
-                                port, ip_config[port].dev->name);
+                       rds_rtd(RDS_RTD_ACT_BND,
+                               "RDS/IB: failed to ressrt port idx %u dev %s or one of its aliases\n",
+                               port, ip_config[port].dev->name);
                }
        }
 }
@@ -1336,8 +1357,11 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
        u8      port;
        struct rds_ib_port_ud_work      *work;
 
-       if (!rds_ib_active_bonding_enabled || !ip_port_cnt)
+       if (!rds_ib_active_bonding_enabled || !ip_port_cnt) {
+               rds_rtd(RDS_RTD_ACT_BND, "ip_port_cnt %d, event %d\n",
+                       ip_port_cnt, event->event);
                return;
+       }
 
        if (event->event != IB_EVENT_PORT_ACTIVE &&
                event->event != IB_EVENT_PORT_ERR)
@@ -1526,6 +1550,8 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
 
                if (this_port_transition == RDSIBP_TRANSITION_UP) {
                        if (rds_ib_active_bonding_fallback) {
+                               rds_rtd(RDS_RTD_ACT_BND,
+                                       "active bonding fallback enabled\n");
                                INIT_DELAYED_WORK(&work->work, rds_ib_failback);
                                queue_delayed_work(rds_wq, &work->work, 0);
                        } else
@@ -1658,8 +1684,10 @@ static void rds_ib_dump_ip_config(void)
 {
        int     i, j;
 
-       if (!rds_ib_active_bonding_enabled || !ip_port_cnt)
+       if (!rds_ib_active_bonding_enabled || !ip_port_cnt) {
+               rds_rtd(RDS_RTD_ACT_BND, "ip_port_cnt %d\n", ip_port_cnt);
                return;
+       }
 
        for (i = 1; i <= ip_port_cnt; i++) {
                printk(KERN_INFO "RDS/IB: %s/port_%d/%s: "
@@ -1833,8 +1861,10 @@ static int rds_ib_ip_config_init(void)
        unsigned int            tot_devs = 0;
        unsigned int            tot_ibdevs = 0;
 
-       if (!rds_ib_active_bonding_enabled)
+       if (!rds_ib_active_bonding_enabled) {
+               rds_rtd(RDS_RTD_ACT_BND, "active bonding not enabled\n");
                return 0;
+       }
 
        ip_config = kzalloc(sizeof(struct rds_ib_port) *
                                (ip_port_max + 1), GFP_KERNEL);
@@ -2021,8 +2051,10 @@ void rds_ib_ip_failover_groups_init(void)
        int i;
        struct rds_ib_device *rds_ibdev;
 
-       if (!rds_ib_active_bonding_enabled)
+       if (!rds_ib_active_bonding_enabled) {
+               rds_rtd(RDS_RTD_ACT_BND, "active bonding not enabled\n");
                return;
+       }
 
        if (rds_ib_active_bonding_failover_groups == NULL) {
                list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
@@ -2092,7 +2124,8 @@ void rds_ib_add_one(struct ib_device *device)
                return;
 
        if (ib_query_device(device, dev_attr)) {
-               rdsdebug("Query device failed for %s\n", device->name);
+               rds_rtd(RDS_RTD_ERR, "Query device failed for %s\n",
+                       device->name);
                goto free_attr;
        }
 
@@ -2319,8 +2352,10 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
        struct rds_ib_port_ud_work *work = NULL;
        int port_transition = RDSIBP_TRANSITION_NOOP;
 
-       if (!rds_ib_active_bonding_enabled)
+       if (!rds_ib_active_bonding_enabled) {
+               rds_rtd(RDS_RTD_ACT_BND, "active bonding not enabled\n");
                return NOTIFY_DONE;
+       }
 
        if (event != NETDEV_UP &&
            event != NETDEV_DOWN &&
@@ -2567,6 +2602,8 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
        switch (port_transition) {
        case RDSIBP_TRANSITION_UP:
                if (rds_ib_active_bonding_fallback) {
+                       rds_rtd(RDS_RTD_ACT_BND,
+                               "active bonding fallback enabled\n");
                        INIT_DELAYED_WORK(&work->work, rds_ib_failback);
                        queue_delayed_work(rds_wq, &work->work, 0);
                } else
index a1bbf0467cd90fd8f05ad0e601824a2ae1e8b596..153d603447308ac7899bc91f2b1164fd614dbc7b 100644 (file)
@@ -164,7 +164,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        }
 
        printk(KERN_NOTICE
-               "RDS/IB: connected <%u.%u.%u.%u,%u.%u.%u.%u,%d> version %u.%u%s\n",
+               "RDS/IB: conn %p i_cm_id %p connected <%u.%u.%u.%u,%u.%u.%u.%u,%d> version %u.%u%s\n",
+               conn, ic->i_cm_id,
                NIPQUAD(conn->c_laddr),
                NIPQUAD(conn->c_faddr),
                conn->c_tos,
@@ -174,6 +175,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 
        /* The connection might have been dropped under us*/
        if (!ic->i_cm_id) {
+               rds_rtd(RDS_RTD_CM,
+                       "ic->i_cm_id is NULL, ic: %p, calling rds_conn_drop\n",
+                       ic);
                rds_conn_drop(conn);
                return;
        }
@@ -181,6 +185,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        /* Drop connection if connection state is not CONNECTING.
           Potentially connection drop from some other place like rds_conn_probe_lanes() */
        if (!rds_conn_connecting(conn)) {
+               rds_rtd(RDS_RTD_CM,
+                       "conn is in connecting state, conn: %p, calling rds_conn_drop\n",
+                       conn);
                rds_conn_drop(conn);
                return;
        }
@@ -517,13 +524,13 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 
                break;
        case IB_EVENT_PATH_MIG_ERR:
-               rdsdebug("RDS: Path migration error\n");
+               rds_rtd(RDS_RTD_ERR, "RDS: Path migration error\n");
                break;
        default:
-               rdsdebug("Fatal QP Event %u (%s) "
-                       "- connection %pI4->%pI4, reconnecting\n",
+               rds_rtd(RDS_RTD_ERR,
+                       "Fatal QP Event %u (%s) - connection %pI4->%pI4 tos %d, reconnecting\n",
                        event->event, rds_ib_event_str(event->event),
-                       &conn->c_laddr, &conn->c_faddr);
+                       &conn->c_laddr, &conn->c_faddr, conn->c_tos);
                rds_conn_drop(conn);
                break;
        }
@@ -658,7 +665,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
         */
        ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
        if (ret) {
-               rdsdebug("rdma_create_qp failed: %d\n", ret);
+               rds_rtd(RDS_RTD_ERR, "rdma_create_qp failed: %d\n", ret);
                goto out;
        }
 
@@ -668,7 +675,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           &ic->i_send_hdrs_dma, GFP_KERNEL);
        if (!ic->i_send_hdrs) {
                ret = -ENOMEM;
-               rdsdebug("ib_dma_alloc_coherent send failed\n");
+               rds_rtd(RDS_RTD_ERR, "ib_dma_alloc_coherent send failed\n");
                goto out;
        }
 
@@ -679,7 +686,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                        &ic->i_recv_hdrs_dma, GFP_KERNEL);
                if (!ic->i_recv_hdrs) {
                        ret = -ENOMEM;
-                       rdsdebug("ib_dma_alloc_coherent recv failed\n");
+                       rds_rtd(RDS_RTD_ERR,
+                               "ib_dma_alloc_coherent recv failed\n");
                        goto out;
                }
        }
@@ -688,7 +696,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                       &ic->i_ack_dma, GFP_KERNEL);
        if (!ic->i_ack) {
                ret = -ENOMEM;
-               rdsdebug("ib_dma_alloc_coherent ack failed\n");
+               rds_rtd(RDS_RTD_ERR, "ib_dma_alloc_coherent ack failed\n");
                goto out;
        }
 
@@ -696,7 +704,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                   ibdev_to_node(dev));
        if (!ic->i_sends) {
                ret = -ENOMEM;
-               rdsdebug("send allocation failed\n");
+               rds_rtd(RDS_RTD_ERR, "send allocation failed\n");
                goto out;
        }
        memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
@@ -706,7 +714,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                sizeof(struct rds_ib_recv_work));
                if (!ic->i_recvs) {
                        ret = -ENOMEM;
-                       rdsdebug("recv allocation failed\n");
+                       rds_rtd(RDS_RTD_ERR, "recv allocation failed\n");
                        goto out;
                }
                memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
@@ -786,17 +794,21 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        if (!version)
                goto out;
 
-       rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
-                "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
-                RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
-                (unsigned long long)be64_to_cpu(lguid),
-                (unsigned long long)be64_to_cpu(fguid));
+       rds_rtd(RDS_RTD_CM,
+               "saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid 0x%llx tos %d\n",
+               &dp->dp_saddr, &dp->dp_daddr,
+               RDS_PROTOCOL_MAJOR(version),
+               RDS_PROTOCOL_MINOR(version),
+               (unsigned long long)be64_to_cpu(lguid),
+               (unsigned long long)be64_to_cpu(fguid),
+               dp->dp_tos);
 
        /* RDS/IB is not currently netns aware, thus init_net */
        conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
                               &rds_ib_transport, dp->dp_tos, GFP_KERNEL);
        if (IS_ERR(conn)) {
-               rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+               rds_rtd(RDS_RTD_ERR, "rds_conn_create failed (%ld)\n",
+                       PTR_ERR(conn));
                conn = NULL;
                goto out;
        }
@@ -828,7 +840,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                if (ic && ic->i_cm_id == cm_id)
                        destroy = 0;
                if (rds_conn_state(conn) == RDS_CONN_UP) {
-                       rdsdebug("incoming connect while connecting\n");
+                       rds_rtd(RDS_RTD_CM_EXT_P,
+                               "incoming connect while connecting\n");
                        rds_conn_drop(conn);
                        rds_ib_stats_inc(s_ib_listen_closed_stale);
                } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
@@ -1079,11 +1092,14 @@ int rds_ib_conn_connect(struct rds_connection *conn)
        if (IS_ERR(ic->i_cm_id)) {
                ret = PTR_ERR(ic->i_cm_id);
                ic->i_cm_id = NULL;
-               rdsdebug("rdma_create_id() failed: %d\n", ret);
+               rds_rtd(RDS_RTD_ERR, "rdma_create_id() failed: %d\n", ret);
                goto out;
        }
 
-       rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+       rds_rtd(RDS_RTD_CM_EXT,
+               "RDS/IB: conn init <%u.%u.%u.%u,%u.%u.%u.%u,%d> cm_id %p\n",
+               NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+               conn->c_tos, ic->i_cm_id);
 
        src.sin_family = AF_INET;
        src.sin_addr.s_addr = (__force u32)conn->c_laddr;
@@ -1097,8 +1113,8 @@ int rds_ib_conn_connect(struct rds_connection *conn)
                                (struct sockaddr *)&dest,
                                RDS_RDMA_RESOLVE_TIMEOUT_MS);
        if (ret) {
-               rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
-                        ret);
+               rds_rtd(RDS_RTD_ERR, "addr resolve failed for cm id %p: %d\n",
+                       ic->i_cm_id, ret);
                rdma_destroy_id(ic->i_cm_id);
 
                ic->i_cm_id = NULL;
@@ -1130,7 +1146,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
                        /* Actually this may happen quite frequently, when
                         * an outgoing connect raced with an incoming connect.
                         */
-                       rdsdebug("failed to disconnect, cm: %p err %d\n",
+                       rds_rtd(RDS_RTD_CM_EXT_P,
+                               "failed to disconnect, cm: %p err %d\n",
                                ic->i_cm_id, err);
                } else if (rds_ib_srq_enabled && ic->rds_ibdev) {
                        /*
index 42dad98f6cbe48bd5b0f4ac8232627315a054aa8..bd2cfba0765ec329887921a4f30e1cb104b67c23 100644 (file)
@@ -128,6 +128,10 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                        mutex_unlock(&conn->c_base_conn->c_cm_lock);
 
                        if (ret) {
+                               rds_rtd(RDS_RTD_CM,
+                                       "ADDR_RESOLVED: ret %d, calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                                       ret, NIPQUAD(conn->c_laddr),
+                                       NIPQUAD(conn->c_faddr), conn->c_tos);
                                rds_conn_drop(conn);
                                ret = 0;
                        }
@@ -170,8 +174,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                        ibic = conn->c_transport_data;
                        if (ibic && ibic->i_cm_id == cm_id)
                                ret = trans->cm_initiate_connect(cm_id);
-                       else
+                       else {
+                               rds_rtd(RDS_RTD_CM,
+                                       "ROUTE_RESOLVED: calling rds_conn_drop, conn %p <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                                       conn, NIPQUAD(conn->c_laddr),
+                                       NIPQUAD(conn->c_faddr), conn->c_tos);
                                rds_conn_drop(conn);
+                       }
                }
                break;
 
@@ -208,7 +217,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                        __free_page(page);
                }
 
-               rds_conn_drop(conn);
+               if (conn) {
+                       rds_rtd(RDS_RTD_ERR,
+                               "ROUTE_ERROR: conn %p, calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                               conn, NIPQUAD(conn->c_laddr),
+                               NIPQUAD(conn->c_faddr), conn->c_tos);
+                       rds_conn_drop(conn);
+               }
                break;
 
        case RDMA_CM_EVENT_ESTABLISHED:
@@ -216,15 +231,25 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                break;
 
        case RDMA_CM_EVENT_ADDR_ERROR:
-               if (conn)
+               if (conn) {
+                       rds_rtd(RDS_RTD_ERR,
+                               "ADDR_ERROR: conn %p, calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                               conn, NIPQUAD(conn->c_laddr),
+                               NIPQUAD(conn->c_faddr), conn->c_tos);
                        rds_conn_drop(conn);
+               }
                break;
 
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
-               if (conn)
+               if (conn) {
+                       rds_rtd(RDS_RTD_ERR,
+                               "CONN/UNREACHABLE/RMVAL ERR: conn %p, calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                               conn, NIPQUAD(conn->c_laddr),
+                               NIPQUAD(conn->c_faddr), conn->c_tos);
                        rds_conn_drop(conn);
+               }
                break;
 
        case RDMA_CM_EVENT_REJECTED:
@@ -238,6 +263,9 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                        if (event->status == RDS_REJ_CONSUMER_DEFINED &&
                            (*err) == 0) {
                                /* Rejection from RDSV3.1 */
+                               pr_warn("Rejected: CSR_DEF err 0, calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                                       NIPQUAD(conn->c_laddr),
+                                       NIPQUAD(conn->c_faddr), conn->c_tos);
                                if (!conn->c_tos) {
                                        conn->c_proposed_version =
                                                RDS_PROTOCOL_COMPAT_VERSION;
@@ -252,27 +280,46 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                                                        &conn->c_reject_w,
                                                        msecs_to_jiffies(10));
                                }
-                       } else
+                       } else {
+                               rds_rtd(RDS_RTD_ERR,
+                                       "Rejected: *err %d status %d calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                                       *err, event->status,
+                                       NIPQUAD(conn->c_laddr),
+                                       NIPQUAD(conn->c_faddr),
+                                       conn->c_tos);
                                rds_conn_drop(conn);
+                       }
                }
                break;
 
        case RDMA_CM_EVENT_ADDR_CHANGE:
-               rdsdebug("ADDR_CHANGE event <%u.%u.%u.%u,%u.%u.%u.%u>\n",
-                        NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+               rds_rtd(RDS_RTD_CM_EXT,
+                       "ADDR_CHANGE event <%u.%u.%u.%u,%u.%u.%u.%u>\n",
+                       NIPQUAD(conn->c_laddr),
+                       NIPQUAD(conn->c_faddr));
 #if RDMA_RDS_APM_SUPPORTED
-               if (conn && !rds_ib_apm_enabled)
+               if (conn && !rds_ib_apm_enabled) {
+                       rds_rtd(RDS_RTD_CM,
+                               "ADDR_CHANGE: calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                               NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                               conn->c_tos);
                        rds_conn_drop(conn);
+               }
 #else
-               if (conn)
+               if (conn) {
+                       rds_rtd(RDS_RTD_CM,
+                               "ADDR_CHANGE: calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                               NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                               conn->c_tos);
                        rds_conn_drop(conn);
+               }
 #endif
                break;
 
        case RDMA_CM_EVENT_DISCONNECTED:
-               rdsdebug("DISCONNECT event - dropping connection "
-                       "%pI4->%pI4\n", &conn->c_laddr,
-                        &conn->c_faddr);
+               rds_rtd(RDS_RTD_CM,
+                       "DISCONNECT event - dropping connection %pI4->%pI4 tos %d\n",
+                       &conn->c_laddr, &conn->c_faddr, conn->c_tos);
                rds_conn_drop(conn);
                break;
 
index be8306c35b3748ab841bf43d316fa284171f627e..348324f89cd04618e638af48d586357163fe7f38 100644 (file)
@@ -11,6 +11,7 @@
 #include <uapi/linux/rds.h>
 
 #include "info.h"
+#include "rds_rt_debug.h"
 
 /*
  * RDS Network protocol version
diff --git a/net/rds/rds_rt_debug.c b/net/rds/rds_rt_debug.c
new file mode 100644 (file)
index 0000000..8012c80
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rds_rt_debug.h"
+
+u32 kernel_rds_rt_debug_bitmap;
+EXPORT_SYMBOL(kernel_rds_rt_debug_bitmap);
+
+module_param_named(rds_rt_debug_bitmap, kernel_rds_rt_debug_bitmap, uint, 0644);
+MODULE_PARM_DESC(rds_rt_debug_bitmap, "RDS Runtime Debug Message Enabling Bitmap [default 0]");
+
diff --git a/net/rds/rds_rt_debug.h b/net/rds/rds_rt_debug.h
new file mode 100644 (file)
index 0000000..80afccd
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef _RDS_RT_DEBUG_H
+#define _RDS_RT_DEBUG_H
+
+extern u32 kernel_rds_rt_debug_bitmap;
+
+enum {
+       /* bit 0 ~ 19 are feature related bits */
+       RDS_RTD_ERR                     = 1 << 0,       /* 0x1    */
+       RDS_RTD_ERR_EXT                 = 1 << 1,       /* 0x2    */
+
+       RDS_RTD_CM                      = 1 << 3,       /* 0x8    */
+       RDS_RTD_CM_EXT                  = 1 << 4,       /* 0x10   */
+       RDS_RTD_CM_EXT_P                = 1 << 5,       /* 0x20   */
+
+       RDS_RTD_ACT_BND                 = 1 << 7,       /* 0x80   */
+       RDS_RTD_ACT_BND_EXT             = 1 << 8,       /* 0x100  */
+
+       RDS_RTD_RCV                     = 1 << 11,      /* 0x800  */
+       RDS_RTD_RCV_EXT                 = 1 << 12,      /* 0x1000 */
+
+       RDS_RTD_SND                     = 1 << 14,      /* 0x4000 */
+       RDS_RTD_SND_EXT                 = 1 << 15,      /* 0x8000 */
+
+       /* bit 20 ~ 31 are module specific bits */
+       RDS_RTD_CORE                    = 1 << 20,      /* 0x100000   */
+       RDS_RTD_RDMA_IB                 = 1 << 23,      /* 0x800000   */
+
+       /* the following are placeholders for now */
+       RDS_RTD_RDMA_IW                 = 1 << 26,      /* 0x4000000  */
+       RDS_RTD_TCP                     = 1 << 28,      /* 0x10000000 */
+};
+
+#define rds_rtd_printk(level, format, arg...)          \
+       printk(level "RDS_RTD %s:%d: " format, __func__, __LINE__, ## arg)
+
+#define rds_rtd(enabling_bit, format, arg...)                                 \
+       do {  if (likely(!(enabling_bit & kernel_rds_rt_debug_bitmap))) break; \
+                rds_rtd_printk(KERN_INFO, format, ## arg);                    \
+       } while (0)
+
+#endif /* _RDS_RT_DEBUG_H */
+
index b103976a20903943828bbfd2fce2f96c5984217b..5d5296a00c04660a525cb5aadc5cd0924eb4e5da 100644 (file)
@@ -237,8 +237,9 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
        skb = alloc_skb(sizeof(struct rds_nf_hdr) * 2, gfp);
        if (NULL == skb) {
                /* if we have allocation problems, then we just need to depart */
-               rdsdebug("failure to allocate space for inc %p, %u.%u.%u.%u -> %u.%d.%u.%u\n",
-                        inc, NIPQUAD(saddr), NIPQUAD(daddr));
+               rds_rtd(RDS_RTD_ERR,
+                       "failure to allocate space for inc %p, %u.%u.%u.%u -> %u.%d.%u.%u tos %d\n",
+                       inc, NIPQUAD(saddr), NIPQUAD(daddr), conn->c_tos);
                rds_recv_local(conn, saddr, daddr, inc, gfp);
                return;
        }
@@ -281,8 +282,9 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
        }
        /* if we had a failure to convert, then just assuming to continue as local */
        else {
-               rdsdebug("failed to create skb form, conn %p, inc %p, %u.%u.%u.%u -> %u.%u.%u.%u\n",
-                        conn, inc, NIPQUAD(saddr), NIPQUAD(daddr));
+               rds_rtd(RDS_RTD_RCV_EXT,
+                       "failed to create skb form, conn %p, inc %p, %u.%u.%u.%u -> %u.%u.%u.%u tos %d\n",
+                       conn, inc, NIPQUAD(saddr), NIPQUAD(daddr), conn->c_tos);
                ret = 1;
        }
 
@@ -399,8 +401,10 @@ rds_recv_forward(struct rds_connection *conn, struct rds_incoming *inc,
        /* find the proper output socket - it should be the local one on which we originated */
        rs = rds_find_bound(dst->saddr, dst->sport);
        if (!rs) {
-               rdsdebug("failed to find output rds_socket dst %u.%u.%u.%u : %u, inc %p, conn %p\n",
-                        NIPQUAD(dst->daddr), dst->dport, inc, conn);
+               rds_rtd(RDS_RTD_RCV,
+                       "failed to find output rds_socket dst %u.%u.%u.%u : %u, inc %p, conn %p tos %d\n",
+                       NIPQUAD(dst->daddr), dst->dport, inc, conn,
+                       conn->c_tos);
                rds_stats_inc(s_recv_drop_no_sock);
                goto out;
        }
@@ -411,8 +415,10 @@ rds_recv_forward(struct rds_connection *conn, struct rds_incoming *inc,
        /* now lets see if we can send it all */
        ret = rds_send_internal(conn, rs, inc->i_skb, gfp);
        if (len != ret) {
-               rdsdebug("failed to send rds_data dst %u.%u.%u.%u : %u, inc %p, conn %p, len %d != ret %d\n",
-                        NIPQUAD(dst->daddr), dst->dport, inc, conn, len, ret);
+               rds_rtd(RDS_RTD_RCV,
+                       "failed to send rds_data dst %u.%u.%u.%u : %u, inc %p, conn %p tos %d, len %d != ret %d\n",
+                       NIPQUAD(dst->daddr), dst->dport, inc, conn, conn->c_tos,
+                       len, ret);
                goto out;
        }
 
@@ -443,6 +449,7 @@ rds_recv_local(struct rds_connection *conn, __be32 saddr, __be32 daddr,
        struct rds_sock *rs = NULL;
        struct sock *sk;
        unsigned long flags;
+       u64 inc_hdr_h_sequence = 0;
 
        inc->i_conn = conn;
        inc->i_rx_jiffies = jiffies;
@@ -478,13 +485,22 @@ rds_recv_local(struct rds_connection *conn, __be32 saddr, __be32 daddr,
         * XXX we could spend more on the wire to get more robust failure
         * detection, arguably worth it to avoid data corruption.
         */
+       inc_hdr_h_sequence = be64_to_cpu(inc->i_hdr.h_sequence);
 
-       if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+       if (inc_hdr_h_sequence != conn->c_next_rx_seq) {
+               rds_rtd(RDS_RTD_RCV,
+                       "conn %p <%u.%u.%u.%u,%u.%u.%u.%u,%d> expect seq# %llu, recved seq# %llu, retrans bit %d\n",
+                       conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos, conn->c_next_rx_seq, inc_hdr_h_sequence,
+                       inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED);
+       }
+
+       if (inc_hdr_h_sequence < conn->c_next_rx_seq
         && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
                rds_stats_inc(s_recv_drop_old_seq);
                goto out;
        }
-       conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+       conn->c_next_rx_seq = inc_hdr_h_sequence + 1;
 
        if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
                if (inc->i_hdr.h_flags & RDS_FLAG_HB_PING) {
index cc4835a81afed3908c5a01787e48680ec326cb4f..7efb0642922b4b64e22b5548aca4dc46c928594a 100644 (file)
@@ -1259,6 +1259,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                                ret = PTR_ERR(conn->c_base_conn);
                                goto out;
                        }
+                       rds_rtd(RDS_RTD_CM_EXT, "checking conn %p\n",
+                               conn->c_base_conn);
                        rds_conn_connect_if_down(conn->c_base_conn);
                }
                rs->rs_conn = conn;
@@ -1302,6 +1304,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                goto out;
        }
 
+       if (rds_conn_state(conn) == RDS_CONN_DOWN)
+               rds_rtd(RDS_RTD_CM_EXT, "checking conn in down state %p\n",
+                       conn);
        rds_conn_connect_if_down(conn);
 
        ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
@@ -1412,7 +1417,8 @@ int rds_send_internal(struct rds_connection *conn, struct rds_sock *rs,
        /* create ourselves a new message to send out the data */
        rm = rds_message_alloc(ret, gfp);
        if (!rm) {
-               rdsdebug("failed to allocate response message rs %p", rs);
+               rds_rtd(RDS_RTD_ERR, "failed to allocate resp message rs %p",
+                       rs);
                ret = -ENOMEM;
                goto out;
        }
@@ -1464,6 +1470,7 @@ int rds_send_internal(struct rds_connection *conn, struct rds_sock *rs,
        }
 
        /* retry the connection if it hasn't actually been made */
+       rds_rtd(RDS_RTD_CM_EXT, "checking conn %p\n", conn);
        rds_conn_connect_if_down(conn);
 
        /* simple congestion check */
@@ -1476,7 +1483,7 @@ int rds_send_internal(struct rds_connection *conn, struct rds_sock *rs,
        /* only take a single pass */
        if (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
                               dst->dport, &queued)) {
-               rdsdebug("cannot block on internal send rs %p", rs);
+               rds_rtd(RDS_RTD_SND, "cannot block on internal send rs %p", rs);
                rds_stats_inc(s_send_queue_full);
 
                /* force a requeue of the work for later */
@@ -1534,6 +1541,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
        rm->m_daddr = conn->c_faddr;
        rm->data.op_active = 1;
 
+       rds_rtd(RDS_RTD_CM_EXT, "checking conn %p\n", conn);
        rds_conn_connect_if_down(conn);
 
        ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
index 13eff3b134e5afe4f0ad9746f1fff64c1de51527..015598b1aa85e96dad85383403bda0a076769766 100644 (file)
@@ -92,8 +92,8 @@ void rds_connect_complete(struct rds_connection *conn)
                return;
        }
 
-       rdsdebug("conn %p for %pI4 to %pI4 complete\n",
-         conn, &conn->c_laddr, &conn->c_faddr);
+       rds_rtd(RDS_RTD_CM_EXT, "conn %p for %pI4 to %pI4 tos %d complete\n",
+               conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos);
 
        conn->c_reconnect_jiffies = 0;
        set_bit(0, &conn->c_map_queued);
@@ -131,9 +131,10 @@ void rds_queue_reconnect(struct rds_connection *conn)
 {
        unsigned long rand;
 
-       rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
-         conn, &conn->c_laddr, &conn->c_faddr,
-         conn->c_reconnect_jiffies);
+       rds_rtd(RDS_RTD_CM_EXT,
+               "conn %p for %pI4 to %pI4 tos %d reconnect jiffies %lu\n", conn,
+               &conn->c_laddr, &conn->c_faddr, conn->c_tos,
+               conn->c_reconnect_jiffies);
 
        set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
        if (conn->c_reconnect_jiffies == 0) {
@@ -143,9 +144,10 @@ void rds_queue_reconnect(struct rds_connection *conn)
        }
 
        get_random_bytes(&rand, sizeof(rand));
-       rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
-                rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
-                conn, &conn->c_laddr, &conn->c_faddr);
+       rds_rtd(RDS_RTD_CM_EXT,
+               "%lu delay %lu ceil conn %p for %pI4 -> %pI4 tos %d\n",
+               rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+               conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos);
 
        if (conn->c_loopback) {
                if (conn->c_laddr >= conn->c_faddr)
@@ -179,15 +181,22 @@ void rds_connect_worker(struct work_struct *work)
                conn->c_connection_start = get_seconds();
 
                ret = conn->c_trans->conn_connect(conn);
-               rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
-                       conn, &conn->c_laddr, &conn->c_faddr, ret);
+               rds_rtd(RDS_RTD_CM_EXT,
+                       "conn %p for %pI4 to %pI4 tos %d dispatched, ret %d\n",
+                       conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos, ret);
 
                if (ret) {
                        if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) {
+                               rds_rtd(RDS_RTD_CM_EXT,
+                                       "reconnecting..., conn %p\n", conn);
                                rds_queue_reconnect(conn);
                        } else
                                rds_conn_error(conn, "RDS: connect failed\n");
                }
+       } else {
+               rds_rtd(RDS_RTD_CM,
+                       "conn %p cannot trans from DOWN to CONNECTING state.\n",
+                       conn);
        }
 }
 
@@ -200,7 +209,7 @@ void rds_send_worker(struct work_struct *work)
                clear_bit(RDS_LL_SEND_FULL, &conn->c_flags);
                ret = rds_send_xmit(conn);
                cond_resched();
-               rdsdebug("conn %p ret %d\n", conn, ret);
+               rds_rtd(RDS_RTD_SND_EXT, "conn %p ret %d\n", conn, ret);
                switch (ret) {
                case -EAGAIN:
                        rds_stats_inc(s_send_immediate_retry);
@@ -222,7 +231,7 @@ void rds_recv_worker(struct work_struct *work)
 
        if (rds_conn_state(conn) == RDS_CONN_UP) {
                ret = conn->c_trans->recv(conn);
-               rdsdebug("conn %p ret %d\n", conn, ret);
+               rds_rtd(RDS_RTD_RCV_EXT, "conn %p ret %d\n", conn, ret);
                switch (ret) {
                case -EAGAIN:
                        rds_stats_inc(s_recv_immediate_retry);
@@ -242,6 +251,7 @@ void rds_reject_worker(struct work_struct *work)
        struct rds_connection *conn = container_of(work, struct rds_connection, c_reject_w.work);
 
        atomic_set(&conn->c_state, RDS_CONN_ERROR);
+       rds_rtd(RDS_RTD_CM, "calling rds_conn_shutdown, conn %p:0\n", conn);
        rds_conn_shutdown(conn, 0);
        rds_route_to_base(conn);
 }
@@ -259,17 +269,17 @@ void rds_hb_worker(struct work_struct *work)
                if (!conn->c_hb_start) {
                        ret = rds_send_hb(conn, 0);
                        if (ret) {
-                               rdsdebug("RDS/IB: rds_hb_worker: failed %d\n", ret);
+                               rds_rtd(RDS_RTD_ERR_EXT,
+                                       "RDS/IB: rds_hb_worker: failed %d\n",
+                                       ret);
                                return;
                        }
                        conn->c_hb_start = now;
                } else if (now - conn->c_hb_start > rds_conn_hb_timeout) {
-                       printk(KERN_NOTICE
-                               "RDS/IB: connection <%u.%u.%u.%u,%u.%u.%u.%u,%d> "
-                               "timed out (0x%lx,0x%lx)..disconnecting and reconnecting\n",
-                               NIPQUAD(conn->c_laddr),
-                               NIPQUAD(conn->c_faddr), conn->c_tos,
-                               conn->c_hb_start, now);
+                       rds_rtd(RDS_RTD_CM,
+                               "RDS/IB: connection <%u.%u.%u.%u,%u.%u.%u.%u,%d> timed out (0x%lx,0x%lx)..discon and recon\n",
+                               NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                               conn->c_tos, conn->c_hb_start, now);
                                rds_conn_drop(conn);
                        return;
                }
@@ -286,6 +296,10 @@ void rds_reconnect_timeout(struct work_struct *work)
         * reconnect.
         */
        if (!rds_conn_up(conn)) {
+               rds_rtd(RDS_RTD_CM,
+                       "conn not up, calling rds_conn_drop <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                       NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos);
                rds_conn_drop(conn);
                conn->c_reconnect_racing = 0;
        }
@@ -300,6 +314,10 @@ void rds_shutdown_worker(struct work_struct *work)
         * drives the reconnect (one-sided reconnect)
         */
        if (conn->c_laddr < conn->c_faddr && conn->c_reconnect_racing) {
+               rds_rtd(RDS_RTD_CM,
+                       "calling rds_conn_shutdown, conn %p:0 <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                       conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos);
                rds_conn_shutdown(conn, 0);
                if (conn->c_loopback)
                        queue_delayed_work(rds_local_wq, &conn->c_reconn_w,
@@ -307,8 +325,13 @@ void rds_shutdown_worker(struct work_struct *work)
                else
                        queue_delayed_work(rds_wq, &conn->c_reconn_w,
                                           msecs_to_jiffies(5000));
-       } else
+       } else {
+               rds_rtd(RDS_RTD_CM,
+                       "calling rds_conn_shutdown, conn %p:1 <%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
+                       conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
+                       conn->c_tos);
                rds_conn_shutdown(conn, 1);
+       }
 
 }