From 4c999f5b1cfe917e7f1cfd83e88af785cdeb8a99 Mon Sep 17 00:00:00 2001 From: Bang Nguyen Date: Sun, 13 Jan 2013 21:54:09 -0800 Subject: [PATCH] RDS merge for UEK2 Orabug: 15997083 This is merged code of Mellanox OFED R2, 0080 release; and ofa 4.1 Signed-off-by: Bang Nguyen (cherry picked from commit 26add53cf20e08dfa331ec22d307dab40f0c4d74) --- net/rds/af_rds.c | 130 +++++++++++++++++++++++++++++++++++++++++++++-- net/rds/ib.c | 92 ++++++++++++++++++++++++++++----- net/rds/ib.h | 1 + net/rds/rds.h | 2 + net/rds/send.c | 15 +++++- net/rds/stats.c | 1 + 6 files changed, 221 insertions(+), 20 deletions(-) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index ddbf568da6d4..01af7ddfe917 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -30,6 +30,7 @@ * SOFTWARE. * */ +#include #include #include #include @@ -45,6 +46,21 @@ static unsigned int rds_ib_retry_count = 0xdead; module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, "UNUSED, set param in rds_rdma instead"); +static int rds_qos_enabled = 1; +module_param(rds_qos_enabled, int, 0444); +MODULE_PARM_DESC(rds_qos_enabled, "Set to enable QoS"); + +static char *rds_qos_threshold = NULL; +module_param(rds_qos_threshold, charp, 0444); +MODULE_PARM_DESC(rds_qos_threshold, ":[,:]*"); + +static int rds_qos_threshold_action = 0; +module_param(rds_qos_threshold_action, int, 0444); +MODULE_PARM_DESC(rds_qos_threshold_action, + "0=Ignore,1=Error,2=Statistic,3=Error_Statistic"); + +static unsigned long rds_qos_threshold_tbl[256]; + /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; @@ -204,11 +220,14 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rds_tos_t tos; unsigned long flags; - if (get_user(tos, (rds_tos_t __user *)arg)) - return -EFAULT; - switch (cmd) { case SIOCRDSSETTOS: + if (!rds_qos_enabled) + return -EOPNOTSUPP; + + if (get_user(tos, (rds_tos_t __user *)arg)) + return -EFAULT; 
+
 		spin_lock_irqsave(&rds_sock_lock, flags);
 		if (rs->rs_tos || rs->rs_conn) {
 			spin_unlock_irqrestore(&rds_sock_lock, flags);
@@ -621,6 +640,134 @@ out:
 	spin_unlock_irqrestore(&rds_sock_lock, flags);
 }
 
+/*
+ * Parse an unsigned number with an optional k/K (KiB) or m/M (MiB)
+ * suffix.  Returns the parsed value, or 0 with a warning if the
+ * string is malformed or the value exceeds @max.
+ */
+static unsigned long parse_ul(char *ptr, unsigned long max)
+{
+	unsigned long val;
+	char *endptr;
+
+	val = simple_strtoul(ptr, &endptr, 0);
+	switch (*endptr) {
+	case 'k': case 'K':
+		val <<= 10;
+		endptr++;
+		break;
+	case 'm': case 'M':
+		val <<= 20;
+		endptr++;
+		break;
+	}
+
+	if (*ptr && !*endptr && val <= max)
+		return val;
+
+	printk(KERN_WARNING "RDS: Invalid threshold number\n");
+	return 0;
+}
+
+/*
+ * Check a message of @payload_len bytes against the per-TOS QoS
+ * threshold table.  Returns non-zero if the send must be rejected,
+ * according to rds_qos_threshold_action:
+ *   0 = ignore, 1 = error, 2 = statistic only, 3 = error + statistic.
+ */
+int rds_check_qos_threshold(u8 tos, size_t payload_len)
+{
+	if (rds_qos_threshold_action == 0)
+		return 0;
+
+	if (rds_qos_threshold_tbl[tos] && payload_len &&
+	    rds_qos_threshold_tbl[tos] < payload_len) {
+		switch (rds_qos_threshold_action) {
+		case 1:
+			return 1;
+		case 2:
+			rds_stats_inc(s_qos_threshold_exceeded);
+			return 0;
+		case 3:
+			rds_stats_inc(s_qos_threshold_exceeded);
+			return 1;
+		default:
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Parse the rds_qos_threshold module parameter
+ * ("<tos>:<threshold>[,<tos>:<threshold>]*") into
+ * rds_qos_threshold_tbl[].  Malformed entries are reported and
+ * skipped; they never touch the table.
+ */
+static void rds_qos_threshold_init(void)
+{
+	char *tok, *nxt_tok, *end;
+	char str[1024];
+	int i;
+
+	for (i = 0; i < 256; i++)
+		rds_qos_threshold_tbl[i] = 0;
+
+	if (rds_qos_threshold == NULL)
+		return;
+
+	/* The module parameter is user-controlled: bound the copy. */
+	strlcpy(str, rds_qos_threshold, sizeof(str));
+	nxt_tok = strchr(str, ',');
+	if (nxt_tok) {
+		*nxt_tok = '\0';
+		nxt_tok++;
+	}
+
+	tok = str;
+	while (tok) {
+		char *qos_str, *threshold_str;
+
+		qos_str = tok;
+		threshold_str = strchr(tok, ':');
+		if (threshold_str) {
+			unsigned long qos, threshold;
+
+			*threshold_str = '\0';
+			threshold_str++;
+			qos = simple_strtol(qos_str, &end, 0);
+			if (*end) {
+				printk(KERN_WARNING "RDS: Warning: QoS "
+				       "%s is improperly formatted\n",
+				       qos_str);
+			} else if (qos > 255) {
+				printk(KERN_WARNING "RDS: Warning: QoS "
+				       "%lu out of range\n", qos);
+			} else {
+				/* Only a validated index may touch the
+				 * 256-entry table. */
+				threshold = parse_ul(threshold_str, (u32)~0);
+				rds_qos_threshold_tbl[qos] = threshold;
+			}
+		} else {
+			printk(KERN_WARNING "RDS: Warning: QoS:Threshold "
+			       "%s is improperly formatted\n", tok);
+		}
+
+		/* Advance from the current token, not from the head of
+		 * the already-'\0'-split buffer. */
+		tok = nxt_tok;
+		if (!tok)
+			break;
+		nxt_tok = strchr(tok, ',');
+		if (nxt_tok) {
+			*nxt_tok = '\0';
+			nxt_tok++;
+		}
+	}
+}
+
 static void rds_exit(void)
 {
 	sock_unregister(rds_family_ops.family);
@@ -662,6 +809,8 @@ static int rds_init(void)
 	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
 	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
 
+	rds_qos_threshold_init();
+
 	goto out;
 
 out_proto:
@@ -681,8 +830,8 @@ out:
 }
 module_init(rds_init);
 
-#define DRV_VERSION	"4.0"
-#define DRV_RELDATE	"Feb 12, 2009"
+#define DRV_VERSION	"4.1"
+#define DRV_RELDATE	"Jan 04, 2013"
 
 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 903e9fa847a4..a768633ac627 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -55,10 +55,10 @@ unsigned int rds_ib_apm_enabled = 0;
 unsigned int rds_ib_apm_fallback = 1;
 unsigned int rds_ib_haip_enabled = 0;
 unsigned int rds_ib_haip_fallback = 1;
-unsigned int rds_ib_haip_hca_failover_enabled = 1;
 unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT;
 unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
 unsigned int rds_ib_cq_balance_enabled = 1;
+static char *rds_ib_haip_failover_groups = NULL;
 
 module_param(rds_ib_fmr_1m_pool_size, int, 0444);
 MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1m fmr per HCA");
@@ -78,8 +78,9 @@ module_param(rds_ib_apm_fallback, int, 0444);
 MODULE_PARM_DESC(rds_ib_apm_fallback, " APM failback enabled");
 module_param(rds_ib_haip_fallback, int, 0444);
 MODULE_PARM_DESC(rds_ib_haip_fallback, " HAIP failback Enabled");
-module_param(rds_ib_haip_hca_failover_enabled, int, 0444);
-MODULE_PARM_DESC(rds_ib_haip_hca_failover_enabled, " HAIP HCA failover Enabled");
+module_param(rds_ib_haip_failover_groups, charp, 0444);
+MODULE_PARM_DESC(rds_ib_haip_failover_groups,
+		 "<ifname>[,<ifname>]*[;<ifname>[,<ifname>]*]*");
module_param(rds_ib_cq_balance_enabled, int, 0444); MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled"); @@ -338,19 +339,18 @@ static u8 rds_ib_get_failover_port(u8 port) for (i = 1; i <= ip_port_cnt; i++) { if (i != port && - ip_config[i].rds_ibdev == ip_config[port].rds_ibdev && + ip_config[i].failover_group == + ip_config[port].failover_group && ip_config[i].port_state == RDS_IB_PORT_UP) { return i; } } - if (rds_ib_haip_hca_failover_enabled) { - for (i = 1; i <= ip_port_cnt; i++) { - if (i != port && - ip_config[i].port_state == RDS_IB_PORT_UP) { - return i; - } - } + for (i = 1; i <= ip_port_cnt; i++) { + if (i != port && + ip_config[i].port_state == RDS_IB_PORT_UP) { + return i; + } } return 0; @@ -771,8 +771,8 @@ static void rds_ib_failback(struct work_struct *_work) } else if (ip_config[ip_config[i].ip_active_port].port_state == RDS_IB_PORT_DOWN) { rds_ib_do_failover(i, 0, ip_active_port); - } else if (ip_config[port].rds_ibdev == - ip_config[i].rds_ibdev) { + } else if (ip_config[port].failover_group == + ip_config[i].failover_group) { rds_ib_do_failover(i, port, ip_active_port); } } @@ -980,6 +980,70 @@ out: return ret; } +void rds_ib_ip_failover_groups_init(void) +{ + char *tok, *grp, *nxt_tok, *nxt_grp, *end; + char str[1024]; + unsigned int grp_id = 1; + int i; + struct rds_ib_device *rds_ibdev; + + if (rds_ib_haip_failover_groups == NULL) { + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + for (i = 1; i <= ip_port_cnt; i++) { + if (ip_config[i].rds_ibdev == rds_ibdev) + ip_config[i].failover_group = grp_id; + } + grp_id++; + } + rcu_read_unlock(); + return; + } + + strcpy(str, rds_ib_haip_failover_groups); + nxt_grp = strchr(str, ';'); + if (nxt_grp) { + *nxt_grp = '\0'; + nxt_grp++; + } + grp = str; + while (grp) { + tok = grp; + nxt_tok = strchr(tok, ','); + if (nxt_tok) { + *nxt_tok = '\0'; + nxt_tok++; + } + while (tok) { + for (i = 1; i <= ip_port_cnt; i++) { + if (!strcmp(tok, 
ip_config[i].if_name)) { + if (!ip_config[i].failover_group) + ip_config[i].failover_group = + grp_id; + else + printk(KERN_WARNING "RDS/IB: %s is already part of another failover group\n", tok); + break; + } + } + tok = nxt_tok; + nxt_tok = strchr(str, ','); + if (nxt_tok) { + *nxt_tok = '\0'; + nxt_tok++; + } + } + + grp = nxt_grp; + nxt_grp = strchr(str, ';'); + if (nxt_grp) { + *nxt_grp = '\0'; + nxt_grp++; + } + grp_id++; + } +} + void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; @@ -1224,6 +1288,8 @@ int rds_ib_init(void) goto out_srq; } + rds_ib_ip_failover_groups_init(); + register_netdevice_notifier(&rds_ib_nb); goto out; diff --git a/net/rds/ib.h b/net/rds/ib.h index 3c6675c6663c..944b6e692743 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -272,6 +272,7 @@ enum { #define RDS_IB_MAX_ALIASES 100 struct rds_ib_port { struct rds_ib_device *rds_ibdev; + unsigned int failover_group; struct net_device *dev; unsigned int port_state; u8 port_num; diff --git a/net/rds/rds.h b/net/rds/rds.h index 038e809c1963..dfe88f1a99b6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -598,6 +598,7 @@ struct rds_statistics { uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; + uint64_t s_qos_threshold_exceeded; }; /* af_rds.c */ @@ -611,6 +612,7 @@ static inline void __rds_wake_sk_sleep(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); } +int rds_check_qos_threshold(u8 tos, size_t pauload_len); extern wait_queue_head_t rds_poll_waitq; void debug_sock_hold(struct sock *sock); diff --git a/net/rds/send.c b/net/rds/send.c index a3b5ecc617fb..8d454bd0576e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -581,7 +581,8 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) notifier->n_status = status; if (!ro->op_remote_complete) { - if (rds_async_send_enabled && !status) { + if (!rds_async_send_enabled || + (rds_async_send_enabled && !status)) { spin_lock(&rs->rs_lock); 
list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); @@ -620,7 +621,8 @@ void rds_atomic_send_complete(struct rds_message *rm, int status) debug_sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; - if (rds_async_send_enabled && !status) { + if (!rds_async_send_enabled || + (rds_async_send_enabled && !status)) { spin_lock(&rs->rs_lock); list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); @@ -1185,6 +1187,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); + size_t total_payload_len = payload_len; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1245,6 +1248,14 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; + if (rm->rdma.op_active) + total_payload_len += rm->rdma.op_bytes; + + if (rds_check_qos_threshold(rs->rs_tos, total_payload_len)) { + ret = -EINVAL; + goto out; + } + /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ if (rs->rs_conn && rs->rs_conn->c_faddr == daddr && diff --git a/net/rds/stats.c b/net/rds/stats.c index 01acc9faac24..e341b37c4f78 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -75,6 +75,7 @@ static char *rds_stat_names[] = { "cong_update_received", "cong_send_error", "cong_send_blocked", + "qos_threshold_exceeded", }; void rds_stats_info_copy(struct rds_info_iterator *iter, -- 2.50.1