From: Mukesh Kacker Date: Wed, 1 Aug 2018 18:37:01 +0000 (-0700) Subject: rds: congestion updates can be missed when kernel low on memory X-Git-Tag: v4.1.12-124.31.3~295 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=9c4d9674bb8079bc0565481064abfdfb1c845bc9;p=users%2Fjedix%2Flinux-maple.git rds: congestion updates can be missed when kernel low on memory The congestion updates are allocated under GFP_NOWAIT and can fail under temporary memory pressure. Previously such failed allocations were not retried, so the congestion update could be missed; this change retries the allocation until the update is sent. Additionally, on the receive side, congestion updates that fail the corrupt-packet length check were previously dropped silently; they are now logged as rate-limited warnings. Orabug: 28425811 Signed-off-by: Mukesh Kacker Reviewed-by: Rama Nichanamatlu Signed-off-by: Brian Maly --- diff --git a/net/rds/cong.c b/net/rds/cong.c index 7fe693f7e08b..7b591bf38158 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -245,7 +245,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map) spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { - if (!test_and_set_bit(0, &conn->c_map_queued)) { + if (!test_and_set_bit(RCMQ_BITOFF_CONGU_PENDING, + &conn->c_map_queued)) { rds_stats_inc(s_cong_update_queued); queue_delayed_work(conn->c_path[0].cp_wq, &conn->c_path[0].cp_send_w, 0); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index a03d4d6cac83..5f28eb51aee6 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -571,7 +571,7 @@ void rds_ib_tasklet_fn_send(unsigned long data) if (rds_conn_up(conn) && (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued))) + test_bit(RCMQ_BITOFF_CONGU_PENDING, &conn->c_map_queued))) rds_send_xmit(&ic->conn->c_path[0]); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 4c05ae7e6f97..7ad94b12da53 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1118,11 +1118,18 @@ static void rds_ib_cong_recv(struct rds_connection *conn, uint64_t uncongested = 0; void *addr; + map = conn->c_fcong; + /* catch completely corrupt 
packets */ - if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) { + pr_warn_ratelimited("RDS: received corrupt congestion update, expected header length: %d, received header length: %d on conn %p <%pI6c, %pI6c, %d> remote map %p remote IP %pI6c\n", + RDS_CONG_MAP_BYTES, + be32_to_cpu(ibinc->ii_inc.i_hdr.h_len), + conn, &conn->c_laddr, &conn->c_faddr, + conn->c_tos, map, &map->m_addr); return; + } - map = conn->c_fcong; map_page = 0; map_off = 0; diff --git a/net/rds/rds.h b/net/rds/rds.h index efd4e2532bdd..de7243427ac1 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -312,8 +312,19 @@ struct rds_connection { struct rds_cong_map *c_fcong; struct list_head c_map_item; + + /* c_map_queued: bit map field */ unsigned long c_map_queued; + /** bit 0: set indicates congestion update + * pending to send to peer. + * bit 1: set indicates last alloc attempt(GFP_NOWAIT) + * for congestion update message failed + * and update was deferred + */ +#define RCMQ_BITOFF_CONGU_PENDING 0 +#define RCMQ_BITOFF_CONGU_ALLOC_DEFER 1 + /* Protocol version */ unsigned int c_proposed_version; unsigned int c_version; diff --git a/net/rds/send.c b/net/rds/send.c index 6a753aeafe91..98a0612fa11c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -256,13 +256,25 @@ restart: } /* - * If between sending messages, we can send a pending congestion - * map update. + * If between sending messages, we can send a pending + * congestion map update. 
*/ - if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { + if (!rm && test_bit(RCMQ_BITOFF_CONGU_PENDING, + &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { + pr_warn_ratelimited("RDS: Congestion update allocation deferred: conn %p<%pI6c, %pI6c, %d>\n", + conn, &conn->c_laddr, + &conn->c_faddr, + conn->c_tos); + /* Set bit to mark deferred cong update */ + set_bit(RCMQ_BITOFF_CONGU_ALLOC_DEFER, + &conn->c_map_queued); ret = PTR_ERR(rm); + + /** Note: pending congestion update + * remains set! + */ break; } rm->data.op_active = 1; @@ -270,6 +282,19 @@ restart: rm->m_inc.i_conn = cp->cp_conn; cp->cp_xmit_rm = rm; + + /* clear deferred alloc if set */ + if (test_and_clear_bit(RCMQ_BITOFF_CONGU_ALLOC_DEFER, + &conn->c_map_queued)) { + pr_warn_ratelimited("RDS: Deferred congestion update allocated: conn %p<%pI6c, %pI6c, %d>\n", + conn, &conn->c_laddr, + &conn->c_faddr, + conn->c_tos); + } + + /* clear pending congestion update */ + clear_bit(RCMQ_BITOFF_CONGU_PENDING, + &conn->c_map_queued); } /* @@ -492,6 +517,12 @@ over_batch: * We have an extra generation check here so that if someone manages * to jump in after our release_in_xmit, we'll see that they have done * some work and we will skip our goto + * + * (Note: We check not just for more messages on send queue but also + * for congestion update that might still be pending if GFP_NOWAIT + * allocation failed earlier. Retrying for it in this call will also + * be capped at "send_batch_count" attempts as it is for data messages + * before getting rescheduled.) 
*/ if (ret == 0) { bool raced; @@ -499,8 +530,9 @@ over_batch: smp_mb(); raced = send_gen != READ_ONCE(cp->cp_send_gen); - if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&cp->cp_send_queue)) && !raced) { + if ((test_bit(RCMQ_BITOFF_CONGU_PENDING, + &conn->c_map_queued) || + !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1); diff --git a/net/rds/threads.c b/net/rds/threads.c index d828f1be63f7..42b1f46b2ea5 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -94,7 +94,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); cp->cp_reconnect_jiffies = 0; - set_bit(0, &conn->c_map_queued); + set_bit(RCMQ_BITOFF_CONGU_PENDING, &conn->c_map_queued); queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); queue_delayed_work(cp->cp_wq, &cp->cp_hb_w, 0);