www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
rds: congestion updates can be missed when kernel low on memory
authorMukesh Kacker <mukesh.kacker@oracle.com>
Wed, 1 Aug 2018 18:37:01 +0000 (11:37 -0700)
committerBrian Maly <brian.maly@oracle.com>
Wed, 30 Jan 2019 06:17:11 +0000 (01:17 -0500)
Congestion update messages are allocated with GFP_NOWAIT and the
allocation can fail under temporary memory pressure. Previously such
failures were not retried; this change retries the allocation until
the update is successfully sent.

Additionally, when receiving congestion updates, corrupt-packet check
failures were not logged; this change logs them as rate-limited warnings.

Orabug: 28425811

Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
Reviewed-by: Rama Nichanamatlu <rama.nichanamatlu@oracle.com>
Signed-off-by: Brian Maly <brian.maly@oracle.com>
net/rds/cong.c
net/rds/ib_cm.c
net/rds/ib_recv.c
net/rds/rds.h
net/rds/send.c
net/rds/threads.c

index 7fe693f7e08b79982a86e3d8f6c852a087b5407c..7b591bf38158b50e11a7ad792824dc3d527c8478 100644 (file)
@@ -245,7 +245,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
        spin_lock_irqsave(&rds_cong_lock, flags);
 
        list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
-               if (!test_and_set_bit(0, &conn->c_map_queued)) {
+               if (!test_and_set_bit(RCMQ_BITOFF_CONGU_PENDING,
+                                     &conn->c_map_queued)) {
                        rds_stats_inc(s_cong_update_queued);
                        queue_delayed_work(conn->c_path[0].cp_wq,
                                           &conn->c_path[0].cp_send_w, 0);
index a03d4d6cac831d64b9d3ea0f144c622efc00cdc4..5f28eb51aee637f74407bafadc595ca2d4e32fa2 100644 (file)
@@ -571,7 +571,7 @@ void rds_ib_tasklet_fn_send(unsigned long data)
 
        if (rds_conn_up(conn) &&
           (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
-           test_bit(0, &conn->c_map_queued)))
+           test_bit(RCMQ_BITOFF_CONGU_PENDING, &conn->c_map_queued)))
                rds_send_xmit(&ic->conn->c_path[0]);
 }
 
index 4c05ae7e6f97f9204efe791da64c5215180e98ca..7ad94b12da5392293d7337a52c1c62c5980a0ad8 100644 (file)
@@ -1118,11 +1118,18 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
        uint64_t uncongested = 0;
        void *addr;
 
+       map = conn->c_fcong;
+
        /* catch completely corrupt packets */
-       if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+       if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) {
+               pr_warn_ratelimited("RDS: received corrupt congestion update, expected header length: %d, received header length: %d on conn %p <%pI6c, %pI6c, %d> remote map %p remote IP %pI6c\n",
+                                   RDS_CONG_MAP_BYTES,
+                                   be32_to_cpu(ibinc->ii_inc.i_hdr.h_len),
+                                   conn, &conn->c_laddr, &conn->c_faddr,
+                                   conn->c_tos, map, &map->m_addr);
                return;
+       }
 
-       map = conn->c_fcong;
        map_page = 0;
        map_off = 0;
 
index efd4e2532bddc414a86a9d3c234ca793a6549f16..de7243427ac1208a1ed73b60110119b15cee5a10 100644 (file)
@@ -312,8 +312,19 @@ struct rds_connection {
        struct rds_cong_map     *c_fcong;
 
        struct list_head        c_map_item;
+
+       /* c_map_queued: bit map field */
        unsigned long           c_map_queued;
 
+       /**     bit 0: set indicates congestion update
+        *              pending to send to peer.
+        *      bit 1: set indicates last alloc attempt(GFP_NOWAIT)
+        *              for congestion update message failed
+        *              and update was deferred
+        */
+#define        RCMQ_BITOFF_CONGU_PENDING       0
+#define RCMQ_BITOFF_CONGU_ALLOC_DEFER  1
+
        /* Protocol version */
        unsigned int            c_proposed_version;
        unsigned int            c_version;
index 6a753aeafe913bda63df4918e4d87507aba5b3ea..98a0612fa11ca266d6c34f11d3192e40b7bfc955 100644 (file)
@@ -256,13 +256,25 @@ restart:
                }
 
                /*
-                * If between sending messages, we can send a pending congestion
-                * map update.
+                * If between sending messages, we can send a pending
+                * congestion map update.
                 */
-               if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+               if (!rm && test_bit(RCMQ_BITOFF_CONGU_PENDING,
+                                   &conn->c_map_queued)) {
                        rm = rds_cong_update_alloc(conn);
                        if (IS_ERR(rm)) {
+                               pr_warn_ratelimited("RDS: Congestion update allocation deferred: conn %p<%pI6c, %pI6c, %d>\n",
+                                                   conn, &conn->c_laddr,
+                                                   &conn->c_faddr,
+                                                   conn->c_tos);
+                               /* Set bit to mark deferred cong update */
+                               set_bit(RCMQ_BITOFF_CONGU_ALLOC_DEFER,
+                                       &conn->c_map_queued);
                                ret = PTR_ERR(rm);
+
+                               /** Note: pending congestion update
+                                * remains set!
+                                */
                                break;
                        }
                        rm->data.op_active = 1;
@@ -270,6 +282,19 @@ restart:
                        rm->m_inc.i_conn = cp->cp_conn;
 
                        cp->cp_xmit_rm = rm;
+
+                       /* clear deferred alloc if set  */
+                       if (test_and_clear_bit(RCMQ_BITOFF_CONGU_ALLOC_DEFER,
+                                              &conn->c_map_queued)) {
+                               pr_warn_ratelimited("RDS: Deferred congestion update allocated: conn %p<%pI6c, %pI6c, %d>\n",
+                                                   conn, &conn->c_laddr,
+                                                   &conn->c_faddr,
+                                                   conn->c_tos);
+                       }
+
+                       /* clear pending congestion update */
+                       clear_bit(RCMQ_BITOFF_CONGU_PENDING,
+                                 &conn->c_map_queued);
                }
 
                /*
@@ -492,6 +517,12 @@ over_batch:
         * We have an extra generation check here so that if someone manages
         * to jump in after our release_in_xmit, we'll see that they have done
         * some work and we will skip our goto
+        *
+        * (Note: We check not just for more messages on send queue but also
+        *  for congestion update that might still be pending if GFP_NOWAIT
+        *  allocation failed earlier. Retrying for it in this call will also
+        *  be capped at "send_batch_count" attempts as it is for data messages
+        *  before getting rescheduled.)
         */
        if (ret == 0) {
                bool raced;
@@ -499,8 +530,9 @@ over_batch:
                smp_mb();
                raced = send_gen != READ_ONCE(cp->cp_send_gen);
 
-               if ((test_bit(0, &conn->c_map_queued) ||
-                   !list_empty(&cp->cp_send_queue)) && !raced) {
+               if ((test_bit(RCMQ_BITOFF_CONGU_PENDING,
+                             &conn->c_map_queued) ||
+                    !list_empty(&cp->cp_send_queue)) && !raced) {
                        if (batch_count < send_batch_count)
                                goto restart;
                        queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1);
index d828f1be63f7d37c5da165639b0a55788a5dd214..42b1f46b2ea594b5b643d538e5fb9fa5f1651a87 100644 (file)
@@ -94,7 +94,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
                    conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos);
 
        cp->cp_reconnect_jiffies = 0;
-       set_bit(0, &conn->c_map_queued);
+       set_bit(RCMQ_BITOFF_CONGU_PENDING, &conn->c_map_queued);
        queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
        queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
        queue_delayed_work(cp->cp_wq, &cp->cp_hb_w, 0);