www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
rds: congestion updates can be missed when kernel low on memory
authorMukesh Kacker <mukesh.kacker@oracle.com>
Wed, 1 Aug 2018 18:37:01 +0000 (11:37 -0700)
committerBrian Maly <brian.maly@oracle.com>
Wed, 30 Jan 2019 06:17:11 +0000 (01:17 -0500)
Congestion update messages are allocated with GFP_NOWAIT and the
allocation can fail under temporary memory pressure. Previously such
failures were not retried; this change retries the allocation until
the update is successfully sent.

Additionally, when receiving congestion updates, corrupt-packet check
failures were not logged; this change logs them as rate-limited warnings.

Orabug: 28425811

Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
Reviewed-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
Signed-off-by: Brian Maly <brian.maly@oracle.com>
net/rds/cong.c
net/rds/ib_cm.c
net/rds/ib_recv.c
net/rds/rds.h
net/rds/send.c
net/rds/threads.c

index 7fe693f7e08b79982a86e3d8f6c852a087b5407c..7b591bf38158b50e11a7ad792824dc3d527c8478 100644 (file)
@@ -245,7 +245,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
        spin_lock_irqsave(&rds_cong_lock, flags);
 
        list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
-               if (!test_and_set_bit(0, &conn->c_map_queued)) {
+               if (!test_and_set_bit(RCMQ_BITOFF_CONGU_PENDING,
+                                     &conn->c_map_queued)) {
                        rds_stats_inc(s_cong_update_queued);
                        queue_delayed_work(conn->c_path[0].cp_wq,
                                           &conn->c_path[0].cp_send_w, 0);
index a03d4d6cac831d64b9d3ea0f144c622efc00cdc4..5f28eb51aee637f74407bafadc595ca2d4e32fa2 100644 (file)
@@ -571,7 +571,7 @@ void rds_ib_tasklet_fn_send(unsigned long data)
 
        if (rds_conn_up(conn) &&
           (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
-           test_bit(0, &conn->c_map_queued)))
+           test_bit(RCMQ_BITOFF_CONGU_PENDING, &conn->c_map_queued)))
                rds_send_xmit(&ic->conn->c_path[0]);
 }
 
index 4c05ae7e6f97f9204efe791da64c5215180e98ca..7ad94b12da5392293d7337a52c1c62c5980a0ad8 100644 (file)
@@ -1118,11 +1118,18 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
        uint64_t uncongested = 0;
        void *addr;
 
+       map = conn->c_fcong;
+
        /* catch completely corrupt packets */
-       if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+       if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) {
+               pr_warn_ratelimited("RDS: received corrupt congestion update, expected header length: %d, received header length: %d on conn %p <%pI6c, %pI6c, %d> remote map %p remote IP %pI6c\n",
+                                   RDS_CONG_MAP_BYTES,
+                                   be32_to_cpu(ibinc->ii_inc.i_hdr.h_len),
+                                   conn, &conn->c_laddr, &conn->c_faddr,
+                                   conn->c_tos, map, &map->m_addr);
                return;
+       }
 
-       map = conn->c_fcong;
        map_page = 0;
        map_off = 0;
 
index efd4e2532bddc414a86a9d3c234ca793a6549f16..de7243427ac1208a1ed73b60110119b15cee5a10 100644 (file)
@@ -312,8 +312,19 @@ struct rds_connection {
        struct rds_cong_map     *c_fcong;
 
        struct list_head        c_map_item;
+
+       /* c_map_queued: bit map field */
        unsigned long           c_map_queued;
 
+       /**     bit 0: set indicates congestion update
+        *              pending to send to peer.
+        *      bit 1: set indicates last alloc attempt(GFP_NOWAIT)
+        *              for congestion update message failed
+        *              and update was deferred
+        */
+#define        RCMQ_BITOFF_CONGU_PENDING       0
+#define RCMQ_BITOFF_CONGU_ALLOC_DEFER  1
+
        /* Protocol version */
        unsigned int            c_proposed_version;
        unsigned int            c_version;
index 6a753aeafe913bda63df4918e4d87507aba5b3ea..98a0612fa11ca266d6c34f11d3192e40b7bfc955 100644 (file)
@@ -256,13 +256,25 @@ restart:
                }
 
                /*
-                * If between sending messages, we can send a pending congestion
-                * map update.
+                * If between sending messages, we can send a pending
+                * congestion map update.
                 */
-               if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+               if (!rm && test_bit(RCMQ_BITOFF_CONGU_PENDING,
+                                   &conn->c_map_queued)) {
                        rm = rds_cong_update_alloc(conn);
                        if (IS_ERR(rm)) {
+                               pr_warn_ratelimited("RDS: Congestion update allocation deferred: conn %p<%pI6c, %pI6c, %d>\n",
+                                                   conn, &conn->c_laddr,
+                                                   &conn->c_faddr,
+                                                   conn->c_tos);
+                               /* Set bit to mark deferred cong update */
+                               set_bit(RCMQ_BITOFF_CONGU_ALLOC_DEFER,
+                                       &conn->c_map_queued);
                                ret = PTR_ERR(rm);
+
+                               /** Note: pending congestion update
+                                * remains set!
+                                */
                                break;
                        }
                        rm->data.op_active = 1;
@@ -270,6 +282,19 @@ restart:
                        rm->m_inc.i_conn = cp->cp_conn;
 
                        cp->cp_xmit_rm = rm;
+
+                       /* clear deferred alloc if set  */
+                       if (test_and_clear_bit(RCMQ_BITOFF_CONGU_ALLOC_DEFER,
+                                              &conn->c_map_queued)) {
+                               pr_warn_ratelimited("RDS: Deferred congestion update allocated: conn %p<%pI6c, %pI6c, %d>\n",
+                                                   conn, &conn->c_laddr,
+                                                   &conn->c_faddr,
+                                                   conn->c_tos);
+                       }
+
+                       /* clear pending congestion update */
+                       clear_bit(RCMQ_BITOFF_CONGU_PENDING,
+                                 &conn->c_map_queued);
                }
 
                /*
@@ -492,6 +517,12 @@ over_batch:
         * We have an extra generation check here so that if someone manages
         * to jump in after our release_in_xmit, we'll see that they have done
         * some work and we will skip our goto
+        *
+        * (Note: We check not just for more messages on send queue but also
+        *  for congestion update that might still be pending if GFP_NOWAIT
+        *  allocation failed earlier. Retrying for it in this call will also
+        *  be capped at "send_batch_count" attempts as it is for data messages
+        *  before getting rescheduled.)
         */
        if (ret == 0) {
                bool raced;
@@ -499,8 +530,9 @@ over_batch:
                smp_mb();
                raced = send_gen != READ_ONCE(cp->cp_send_gen);
 
-               if ((test_bit(0, &conn->c_map_queued) ||
-                   !list_empty(&cp->cp_send_queue)) && !raced) {
+               if ((test_bit(RCMQ_BITOFF_CONGU_PENDING,
+                             &conn->c_map_queued) ||
+                    !list_empty(&cp->cp_send_queue)) && !raced) {
                        if (batch_count < send_batch_count)
                                goto restart;
                        queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1);
index d828f1be63f7d37c5da165639b0a55788a5dd214..42b1f46b2ea594b5b643d538e5fb9fa5f1651a87 100644 (file)
@@ -94,7 +94,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
                    conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos);
 
        cp->cp_reconnect_jiffies = 0;
-       set_bit(0, &conn->c_map_queued);
+       set_bit(RCMQ_BITOFF_CONGU_PENDING, &conn->c_map_queued);
        queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
        queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
        queue_delayed_work(cp->cp_wq, &cp->cp_hb_w, 0);