On hardware port link-up, the multicast join sometimes fails, which
delays the IP layer from bringing the interface up quickly. A
subsequent multicast retry may then succeed, after which the IP layer
is ready for IP migration. This happens only sporadically on bare
metal systems but more often on VM systems, and the number of
multicast queries also grows with the number of VMs.

This creates a lot of RC connection thrashing across the cluster,
because the IP migrations get staggered, which is not ideal for
active-active operation. So we create a sync point so that the entire
cluster gets synced up. This helps reduce the thrashing and the
premature failover attempts. Obviously this applies only to failback.

A sysctl, "active_bonding_failback_ms", is provided in case the sync
point needs to be tuned.
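To illustrate the sync point, here is a minimal userspace sketch (not
part of the patch; the names failback_window_ms and
failback_sync_delay_ms are purely illustrative) of the delay
computation that get_failback_sync_jiffies() performs in jiffies
below: failback is deferred by whatever remains of the window,
measured from the moment the port's link came up, so ports that came
up around the same time fail back together.

  #include <stdio.h>

  /* Sketch of the sync-point math. The kernel keeps the window in
   * jiffies and converts via the sysctl handler; plain milliseconds
   * are used here to keep the example self-contained.
   */
  static unsigned long failback_window_ms = 10000;	/* default 10 s */

  static unsigned long failback_sync_delay_ms(unsigned long port_active_for_ms)
  {
  	if (port_active_for_ms > failback_window_ms)
  		return 0;	/* window already elapsed: fail back now */
  	return failback_window_ms - port_active_for_ms;
  }

  int main(void)
  {
  	/* A port whose link came up 3 s ago waits another 7 s. */
  	printf("delay = %lu ms\n", failback_sync_delay_ms(3000));
  	return 0;
  }

The window itself can be changed at runtime through the
"active_bonding_failback_ms" sysctl added below.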
Orabug: 25026643
Tested-by: Michael Nowak <michael.nowak@oracle.com>
Tested-by: Dib Chatterjee <dib.chatterjee@oracle.com>
Reviewed-by: Avinash Repaka <avinash.repaka@oracle.com>
Reviewed-by: Mukesh Kacker <mukesh.kacker@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
return 0;
}
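+/*
+ * How long (in jiffies) failback for this port should still be delayed:
+ * the remainder of the active_bonding_failback window, measured from
+ * the time the port's link went active, or 0 if that window has
+ * already passed.
+ */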
+static unsigned long get_failback_sync_jiffies(struct rds_ib_port *rds_ibp)
+{
+	unsigned long t = get_jiffies_64() - rds_ibp->port_active_ts;
+
+	if (t > rds_ib_sysctl_active_bonding_failback_jiffies)
+		return 0;
+
+	return rds_ib_sysctl_active_bonding_failback_jiffies - t;
+}
+
void rds_ib_nodev_connect(void)
{
struct rds_ib_connection *ic;
if (event->event == IB_EVENT_PORT_ACTIVE) {
ip_config[port].port_layerflags |=
RDSIBP_STATUS_HWPORTUP;
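+ /* remember when this port's link came up, for the failback sync point */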
+ ip_config[port].port_active_ts = get_jiffies_64();
} else {
/* event->event == IB_EVENT_PORT_ERROR */
ip_config[port].port_layerflags &=
rds_rtd(RDS_RTD_ACT_BND,
"active bonding fallback enabled\n");
INIT_DELAYED_WORK(&work->work, rds_ib_failback);
- queue_delayed_work(rds_wq, &work->work, 0);
+ queue_delayed_work(rds_wq, &work->work,
+ get_failback_sync_jiffies(&ip_config[port]));
} else
kfree(work);
} else {
rds_rtd(RDS_RTD_ACT_BND,
"active bonding fallback enabled\n");
INIT_DELAYED_WORK(&work->work, rds_ib_failback);
- queue_delayed_work(rds_wq, &work->work, 0);
+ queue_delayed_work(rds_wq, &work->work,
+ get_failback_sync_jiffies(&ip_config[port]));
+ ip_config[port].port_active_ts = 0;
} else
kfree(work);
break;
uint16_t pkey;
unsigned int alias_cnt;
struct rds_ib_alias aliases[RDS_IB_MAX_ALIASES];
+ unsigned long port_active_ts;
};
enum {
extern unsigned int rds_ib_sysctl_active_bonding;
extern unsigned int rds_ib_sysctl_trigger_active_bonding;
extern unsigned int rds_ib_sysctl_disable_unmap_fmr_cpu;
+extern unsigned long rds_ib_active_bonding_failback_min_jiffies;
+extern unsigned long rds_ib_active_bonding_failback_max_jiffies;
+extern unsigned long rds_ib_sysctl_active_bonding_failback_jiffies;
#endif
*/
unsigned int rds_ib_sysctl_trigger_active_bonding; /* = 0 */
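+/*
+ * Failback sync window, kept in jiffies; exposed to userspace in
+ * milliseconds as "active_bonding_failback_ms".
+ */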
+unsigned long rds_ib_active_bonding_failback_min_jiffies = HZ;
+unsigned long rds_ib_active_bonding_failback_max_jiffies = HZ * 100;
+unsigned long rds_ib_sysctl_active_bonding_failback_jiffies = HZ * 10;
+
static struct ctl_table rds_ib_sysctl_table[] = {
{
.procname = "max_send_wr",
.mode = 0644,
.proc_handler = &proc_dointvec,
},
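+ /*
+ * Read and written in milliseconds; proc_doulongvec_ms_jiffies_minmax
+ * converts to/from jiffies and bounds the value by extra1/extra2
+ * (1 s to 100 s, default 10 s).
+ */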
+ {
+ .procname = "active_bonding_failback_ms",
+ .data = &rds_ib_sysctl_active_bonding_failback_jiffies,
+ .maxlen = sizeof(rds_ib_sysctl_active_bonding_failback_jiffies),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_ib_active_bonding_failback_min_jiffies,
+ .extra2 = &rds_ib_active_bonding_failback_max_jiffies,
+ },
{
.procname = "disable_unmap_fmr_cpu_assignment",
.data = &rds_ib_sysctl_disable_unmap_fmr_cpu,