* SOFTWARE.
*
*/
+#include <linux/string.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, "UNUSED, set param in rds_rdma instead");
+static int rds_qos_enabled = 1;
+module_param(rds_qos_enabled, int, 0444);
+MODULE_PARM_DESC(rds_qos_enabled, "Set to enable QoS");
+
+static char *rds_qos_threshold = NULL;
+module_param(rds_qos_threshold, charp, 0444);
+MODULE_PARM_DESC(rds_qos_threshold, "<tos>:<max_msg_size>[,<tos>:<max_msg_size>]*");
+
+static int rds_qos_threshold_action = 0;
+module_param(rds_qos_threshold_action, int, 0444);
+MODULE_PARM_DESC(rds_qos_threshold_action,
+ "0=Ignore,1=Error,2=Statistic,3=Error_Statistic");
+
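+/* Per-TOS payload ceiling in bytes; 0 means no threshold for that TOS. */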
+static unsigned long rds_qos_threshold_tbl[256];
+
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
rds_tos_t tos;
unsigned long flags;
- if (get_user(tos, (rds_tos_t __user *)arg))
- return -EFAULT;
-
switch (cmd) {
case SIOCRDSSETTOS:
+ if (!rds_qos_enabled)
+ return -EOPNOTSUPP;
+
+ if (get_user(tos, (rds_tos_t __user *)arg))
+ return -EFAULT;
+
spin_lock_irqsave(&rds_sock_lock, flags);
if (rs->rs_tos || rs->rs_conn) {
spin_unlock_irqrestore(&rds_sock_lock, flags);
return -EINVAL;
}
rs->rs_tos = tos;
spin_unlock_irqrestore(&rds_sock_lock, flags);
break;
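+/*
+ * parse_ul - parse a size string with an optional K/M suffix, e.g. "64k"
+ * or "1M". Returns the value in bytes, or 0 if the string is malformed
+ * or the result exceeds @max.
+ */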
+static unsigned long parse_ul(char *ptr, unsigned long max)
+{
+ unsigned long val;
+ char *endptr;
+
+ val = simple_strtoul(ptr, &endptr, 0);
+ switch (*endptr) {
+ case 'k': case 'K':
+ val <<= 10;
+ endptr++;
+ break;
+ case 'm': case 'M':
+ val <<= 20;
+ endptr++;
+ break;
+ }
+
+ if (*ptr && !*endptr && val <= max)
+ return val;
+
+ printk(KERN_WARNING "RDS: Invalid threshold number: %s\n", ptr);
+ return 0;
+}
+
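+/*
+ * Decide whether a send of @payload_len bytes at service level @tos must
+ * be rejected. Returns nonzero to reject; bumps the
+ * qos_threshold_exceeded statistic when the action includes "Statistic".
+ */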
+int rds_check_qos_threshold(u8 tos, size_t payload_len)
+{
+ if (!rds_qos_threshold_tbl[tos] || !payload_len ||
+ payload_len <= rds_qos_threshold_tbl[tos])
+ return 0;
+
+ switch (rds_qos_threshold_action) {
+ case 1: /* Error */
+ return 1;
+ case 2: /* Statistic */
+ rds_stats_inc(s_qos_threshold_exceeded);
+ return 0;
+ case 3: /* Error and statistic */
+ rds_stats_inc(s_qos_threshold_exceeded);
+ return 1;
+ default: /* 0 or unknown: ignore */
+ return 0;
+ }
+}
+
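+/*
+ * Parse the rds_qos_threshold module parameter, a comma-separated list
+ * of <tos>:<max_msg_size> pairs (e.g. "2:64k,5:1m"), into
+ * rds_qos_threshold_tbl[].
+ */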
+static void rds_qos_threshold_init(void)
+{
+ char *tok, *nxt_tok, *end;
+ char str[1024];
+ int i;
+
+ for (i = 0; i < 256; i++)
+ rds_qos_threshold_tbl[i] = 0;
+
+ if (rds_qos_threshold == NULL)
+ return;
+
+ strlcpy(str, rds_qos_threshold, sizeof(str));
+ nxt_tok = strchr(str, ',');
+ if (nxt_tok) {
+ *nxt_tok = '\0';
+ nxt_tok++;
+ }
+
+ tok = str;
+ while (tok) {
+ char *qos_str, *threshold_str;
+
+ qos_str = tok;
+ threshold_str = strchr(tok, ':');
+ if (threshold_str) {
+ unsigned long qos, threshold;
+
+ *threshold_str = '\0';
+ threshold_str++;
+ qos = simple_strtol(qos_str, &end, 0);
+ if (*end) {
+ printk(KERN_WARNING "RDS: Warning: QoS "
+ "%s is improperly formatted\n", qos_str);
+ } else if (qos > 255) {
+ printk(KERN_WARNING "RDS: Warning: QoS "
+ "%lu out of range\n", qos);
+ } else {
+ threshold = parse_ul(threshold_str, (u32)~0);
+ rds_qos_threshold_tbl[qos] = threshold;
+ }
+ } else {
+ printk(KERN_WARNING "RDS: Warning: QoS:Threshold "
+ "%s is improperly formatted\n", tok);
+ }
+
+ tok = nxt_tok;
+ /* Search from the remaining input; str itself was cut at its first ','. */
+ nxt_tok = tok ? strchr(tok, ',') : NULL;
+ if (nxt_tok) {
+ *nxt_tok = '\0';
+ nxt_tok++;
+ }
+ }
+}
+
static void rds_exit(void)
{
sock_unregister(rds_family_ops.family);
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+ rds_qos_threshold_init();
+
goto out;
out_proto:
}
module_init(rds_init);
-#define DRV_VERSION "4.0"
-#define DRV_RELDATE "Feb 12, 2009"
+#define DRV_VERSION "4.1"
+#define DRV_RELDATE "Jan 04, 2013"
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
unsigned int rds_ib_apm_fallback = 1;
unsigned int rds_ib_haip_enabled = 0;
unsigned int rds_ib_haip_fallback = 1;
-unsigned int rds_ib_haip_hca_failover_enabled = 1;
unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT;
unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
unsigned int rds_ib_cq_balance_enabled = 1;
+static char *rds_ib_haip_failover_groups = NULL;
module_param(rds_ib_fmr_1m_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1m fmr per HCA");
MODULE_PARM_DESC(rds_ib_apm_fallback, " APM failback enabled");
module_param(rds_ib_haip_fallback, int, 0444);
MODULE_PARM_DESC(rds_ib_haip_fallback, " HAIP failback Enabled");
-module_param(rds_ib_haip_hca_failover_enabled, int, 0444);
-MODULE_PARM_DESC(rds_ib_haip_hca_failover_enabled, " HAIP HCA failover Enabled");
+module_param(rds_ib_haip_failover_groups, charp, 0444);
+MODULE_PARM_DESC(rds_ib_haip_failover_groups,
+ "<ifname>[,<ifname>]*[;<ifname>[,<ifname>]*]*");
module_param(rds_ib_cq_balance_enabled, int, 0444);
MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled");
for (i = 1; i <= ip_port_cnt; i++) {
if (i != port &&
- ip_config[i].rds_ibdev == ip_config[port].rds_ibdev &&
+ ip_config[i].failover_group ==
+ ip_config[port].failover_group &&
ip_config[i].port_state == RDS_IB_PORT_UP) {
return i;
}
}
- if (rds_ib_haip_hca_failover_enabled) {
- for (i = 1; i <= ip_port_cnt; i++) {
- if (i != port &&
- ip_config[i].port_state == RDS_IB_PORT_UP) {
- return i;
- }
- }
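+ /* No up port in the same failover group; fall back to any up port. */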
+ for (i = 1; i <= ip_port_cnt; i++) {
+ if (i != port &&
+ ip_config[i].port_state == RDS_IB_PORT_UP) {
+ return i;
+ }
}
return 0;
} else if (ip_config[ip_config[i].ip_active_port].port_state ==
RDS_IB_PORT_DOWN) {
rds_ib_do_failover(i, 0, ip_active_port);
- } else if (ip_config[port].rds_ibdev ==
- ip_config[i].rds_ibdev) {
+ } else if (ip_config[port].failover_group ==
+ ip_config[i].failover_group) {
rds_ib_do_failover(i, port, ip_active_port);
}
}
return ret;
}
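+/*
+ * Assign a failover group id to every RDS/IB port. Without the
+ * rds_ib_haip_failover_groups parameter, ports are grouped by HCA; with
+ * it, each ';'-separated list of interface names forms one group.
+ */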
+void rds_ib_ip_failover_groups_init(void)
+{
+ char *tok, *grp, *nxt_tok, *nxt_grp;
+ char str[1024];
+ unsigned int grp_id = 1;
+ int i;
+ struct rds_ib_device *rds_ibdev;
+
+ if (rds_ib_haip_failover_groups == NULL) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+ for (i = 1; i <= ip_port_cnt; i++) {
+ if (ip_config[i].rds_ibdev == rds_ibdev)
+ ip_config[i].failover_group = grp_id;
+ }
+ grp_id++;
+ }
+ rcu_read_unlock();
+ return;
+ }
+
+ strlcpy(str, rds_ib_haip_failover_groups, sizeof(str));
+ nxt_grp = strchr(str, ';');
+ if (nxt_grp) {
+ *nxt_grp = '\0';
+ nxt_grp++;
+ }
+ grp = str;
+ while (grp) {
+ tok = grp;
+ nxt_tok = strchr(tok, ',');
+ if (nxt_tok) {
+ *nxt_tok = '\0';
+ nxt_tok++;
+ }
+ while (tok) {
+ for (i = 1; i <= ip_port_cnt; i++) {
+ if (!strcmp(tok, ip_config[i].if_name)) {
+ if (!ip_config[i].failover_group)
+ ip_config[i].failover_group =
+ grp_id;
+ else
+ printk(KERN_WARNING "RDS/IB: %s is already part of another failover group\n", tok);
+ break;
+ }
+ }
+ tok = nxt_tok;
+ nxt_tok = tok ? strchr(tok, ',') : NULL;
+ if (nxt_tok) {
+ *nxt_tok = '\0';
+ nxt_tok++;
+ }
+ }
+
+ grp = nxt_grp;
+ nxt_grp = grp ? strchr(grp, ';') : NULL;
+ if (nxt_grp) {
+ *nxt_grp = '\0';
+ nxt_grp++;
+ }
+ grp_id++;
+ }
+}
+
void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
goto out_srq;
}
+ rds_ib_ip_failover_groups_init();
+
register_netdevice_notifier(&rds_ib_nb);
goto out;
#define RDS_IB_MAX_ALIASES 100
struct rds_ib_port {
struct rds_ib_device *rds_ibdev;
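+ /* failover group id assigned at init; 0 means unassigned */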
+ unsigned int failover_group;
struct net_device *dev;
unsigned int port_state;
u8 port_num;
uint64_t s_cong_update_received;
uint64_t s_cong_send_error;
uint64_t s_cong_send_blocked;
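+ /* sends that exceeded the per-TOS QoS size threshold */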
+ uint64_t s_qos_threshold_exceeded;
};
/* af_rds.c */
if (!sock_flag(sk, SOCK_DEAD) && waitq)
wake_up(waitq);
}
+int rds_check_qos_threshold(u8 tos, size_t payload_len);
extern wait_queue_head_t rds_poll_waitq;
void debug_sock_hold(struct sock *sock);
notifier->n_status = status;
if (!ro->op_remote_complete) {
- if (rds_async_send_enabled && !status) {
+ if (!rds_async_send_enabled || !status) {
spin_lock(&rs->rs_lock);
list_add_tail(¬ifier->n_list,
&rs->rs_notify_queue);
debug_sock_hold(rds_rs_to_sk(rs));
notifier->n_status = status;
- if (rds_async_send_enabled && !status) {
+ if (!rds_async_send_enabled || !status) {
spin_lock(&rs->rs_lock);
list_add_tail(¬ifier->n_list,
&rs->rs_notify_queue);
int queued = 0, allocated_mr = 0;
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
+ size_t total_payload_len = payload_len;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
if (ret)
goto out;
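+ /* RDMA payload counts toward the per-TOS size threshold too. */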
+ if (rm->rdma.op_active)
+ total_payload_len += rm->rdma.op_bytes;
+
+ if (rds_check_qos_threshold(rs->rs_tos, total_payload_len)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr &&
"cong_update_received",
"cong_send_error",
"cong_send_blocked",
+ "qos_threshold_exceeded",
};
void rds_stats_info_copy(struct rds_info_iterator *iter,