spin_unlock_irqrestore(&ssk->rx_ring.lock, f); \
} while (0)
+/* Declare a module-private (static) int parameter named 'var' with default
+ * 'def_val'; the default is appended to the sysfs description via #def_val. */
+#define SDP_MODPARAM_SINT(var, def_val, msg) \
+ static int var = def_val; \
+ module_param_named(var, var, int, 0644); \
+ MODULE_PARM_DESC(var, msg " [" #def_val "]");
+
+/* Same as SDP_MODPARAM_SINT but with external linkage, so the variable can
+ * be referenced from other translation units (declared extern in sdp.h). */
+#define SDP_MODPARAM_INT(var, def_val, msg) \
+ int var = def_val; \
+ module_param_named(var, var, int, 0644); \
+ MODULE_PARM_DESC(var, msg " [" #def_val "]");
+
#ifdef SDP_PROFILING
struct sk_buff;
struct sdpprf_log {
/* how long (in jiffies) to block sender till tx completion*/
#define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)
+#define SDP_AUTO_CONF 0xffff
+#define AUTO_MOD_DELAY (HZ / 4)
+
#define BZCOPY_STATE(skb) (*(struct bzcopy_state **)(skb->cb))
#ifndef MIN
#define MIN(a, b) (a < b ? a : b)
sdp_do_posts(ssk); \
})
+/*
+ * Per-socket state for adaptive RX interrupt moderation.
+ * Sampled and reprogrammed by sdp_auto_moderation() at most once per
+ * sample_interval; defaults come from sdp_set_default_moderation().
+ */
+struct sdp_moderation {
+ unsigned long last_moder_packets; /* ssk->rx_packets at last sample */
+ unsigned long last_moder_tx_packets; /* ssk->tx_packets at last sample */
+ unsigned long last_moder_bytes; /* ssk->rx_bytes at last sample */
+ unsigned long last_moder_jiffies; /* jiffies when last sample was taken */
+ int last_moder_time; /* last time programmed; SDP_AUTO_CONF = not yet set */
+ u16 rx_usecs; /* NOTE(review): not referenced in visible code — confirm use */
+ u16 rx_frames; /* NOTE(review): not referenced in visible code — confirm use */
+ u16 tx_usecs; /* NOTE(review): not referenced in visible code — confirm use */
+ u32 pkt_rate_low; /* below this rate (pkt/s), use rx_usecs_low */
+ u16 rx_usecs_low; /* moderation time at/below pkt_rate_low */
+ u32 pkt_rate_high; /* above this rate (pkt/s), use rx_usecs_high */
+ u16 rx_usecs_high; /* moderation time at/above pkt_rate_high */
+ u16 sample_interval; /* minimum jiffies between moderation updates */
+ u16 adaptive_rx_coal; /* nonzero: adaptive moderation enabled */
+ u32 msg_enable; /* NOTE(review): not referenced in visible code — confirm use */
+
+ int moder_cnt; /* completion count handed to ib_modify_cq() */
+ int moder_time; /* fallback moderation time when packet rate is low */
+};
+
struct sdp_sock {
 /* sk has to be the first member of inet_sock */
 struct inet_sock isk;
 int recv_frags; /* max skb frags in recv packets */
 int send_frags; /* max skb frags in send packets */
+ unsigned long tx_packets; /* TX packet counter for moderation sampling */
+ unsigned long rx_packets; /* RX packet counter (bumped in the RX path) */
+ unsigned long tx_bytes; /* NOTE(review): not updated in visible code — confirm */
+ unsigned long rx_bytes; /* RX byte counter (bumped in the RX path) */
+ struct sdp_moderation auto_mod; /* adaptive interrupt-moderation state */
+
 /* BZCOPY data */
 int zcopy_thresh;
};
int sdp_init_sock(struct sock *sk);
int __init sdp_proc_init(void);
void sdp_proc_unregister(void);
+/* sdp_main.c */
+void sdp_set_default_moderation(struct sdp_sock *ssk);
/* sdp_tx.c */
int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
MODULE_LICENSE("Dual BSD/GPL");
#ifdef CONFIG_INFINIBAND_SDP_DEBUG
-int sdp_debug_level;
-
-module_param_named(debug_level, sdp_debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0.");
+SDP_MODPARAM_INT(sdp_debug_level, 0, "Enable debug tracing if > 0.");
#endif
#ifdef CONFIG_INFINIBAND_SDP_DEBUG
-int sdp_data_debug_level;
-
-module_param_named(data_debug_level, sdp_data_debug_level, int, 0644);
-MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0.");
+SDP_MODPARAM_INT(sdp_data_debug_level, 0, "Enable data path debug tracing if > 0.");
#endif
-static int recv_poll_hit;
-
-module_param_named(recv_poll_hit, recv_poll_hit, int, 0644);
-MODULE_PARM_DESC(recv_poll_hit, "How many times recv poll helped.");
-
-static int recv_poll_miss;
-
-module_param_named(recv_poll_miss, recv_poll_miss, int, 0644);
-MODULE_PARM_DESC(recv_poll_miss, "How many times recv poll missed.");
-
-static int recv_poll = 1000;
-
-module_param_named(recv_poll, recv_poll, int, 0644);
-MODULE_PARM_DESC(recv_poll, "How many times to poll recv.");
-
-static unsigned int sdp_keepalive_time = SDP_KEEPALIVE_TIME;
-
-module_param_named(sdp_keepalive_time, sdp_keepalive_time, uint, 0644);
-MODULE_PARM_DESC(sdp_keepalive_time, "Default idle time in seconds before keepalive probe sent.");
-
-static int sdp_zcopy_thresh = 65536;
-module_param_named(sdp_zcopy_thresh, sdp_zcopy_thresh, int, 0644);
-MODULE_PARM_DESC(sdp_zcopy_thresh, "Zero copy send threshold; 0=0ff.");
+/* Module parameters; defaults shown in brackets via SDP_MODPARAM_*. */
+SDP_MODPARAM_SINT(recv_poll_hit, -1, "How many times recv poll helped.");
+SDP_MODPARAM_SINT(recv_poll_miss, -1, "How many times recv poll missed.");
+SDP_MODPARAM_SINT(recv_poll, 1000, "How many times to poll recv.");
+SDP_MODPARAM_SINT(sdp_keepalive_time, SDP_KEEPALIVE_TIME,
+ "Default idle time in seconds before keepalive probe sent.");
+/* Fixed description typo: "0=0ff" -> "0=off". */
+SDP_MODPARAM_SINT(sdp_zcopy_thresh, 65536, "Zero copy send threshold; 0=off.");
+
+#define SDP_RX_COAL_TIME_HIGH 128
+SDP_MODPARAM_SINT(sdp_rx_coal_target, 0x50000,
+ "Target number of bytes to coalesce with interrupt moderation (bytes).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time, 0x10, "rx coal time (jiffies).");
+SDP_MODPARAM_SINT(sdp_rx_rate_low, 80000, "rx_rate low (packets/sec).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time_low, 0, "low moderation time val (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_high, 100000, "rx_rate high (packets/sec).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time_high, 128, "high moderation time val (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_thresh, (200000 / SDP_RX_COAL_TIME_HIGH),
+ "rx rate thresh ().");
+SDP_MODPARAM_SINT(sdp_sample_interval, (HZ / 4), "sample interval (jiffies).");
+
+/* -1 means "auto": let sdp_set_default_moderation() pick adaptive moderation. */
+SDP_MODPARAM_INT(hw_int_mod_count, -1, "forced hw int moderation val. -1 for auto (packets).");
+SDP_MODPARAM_INT(hw_int_mod_usec, -1, "forced hw int moderation val. -1 for auto (usec).");
struct workqueue_struct *sdp_wq;
struct workqueue_struct *rx_comp_wq;
sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(sdp_sk(sk)));
}
+/*
+ * sdp_set_default_moderation() - program initial RX interrupt moderation.
+ *
+ * If either hw_int_mod_count or hw_int_mod_usec module parameter is set
+ * (> -1), adaptive moderation is disabled; when both are positive the
+ * fixed values are programmed into the RX CQ.  Otherwise adaptive
+ * moderation is enabled and its thresholds are (re)loaded from the
+ * sdp_rx_* module parameters.
+ */
+void sdp_set_default_moderation(struct sdp_sock *ssk)
+{
+	struct sdp_moderation *mod = &ssk->auto_mod;
+	int rx_buf_size;
+
+	if (hw_int_mod_count > -1 || hw_int_mod_usec > -1) {
+		int err;
+
+		mod->adaptive_rx_coal = 0;
+
+		if (hw_int_mod_count > 0 && hw_int_mod_usec > 0) {
+			err = ib_modify_cq(ssk->rx_ring.cq, hw_int_mod_count,
+					   hw_int_mod_usec);
+			if (err)
+				/* added missing trailing newline */
+				sdp_warn(&ssk->isk.sk,
+					 "Failed modifying moderation for cq\n");
+			else
+				sdp_dbg(&ssk->isk.sk,
+					"Using fixed interrupt moderation\n");
+		}
+		return;
+	}
+
+	mod->adaptive_rx_coal = 1;
+	sdp_dbg(&ssk->isk.sk, "Using adaptive interrupt moderation\n");
+
+	/* If we haven't received a specific coalescing setting
+	 * (module param), we set the moderation parameters as follows:
+	 * - moder_cnt is set to the number of mtu sized packets needed to
+	 *   satisfy our coalescing target.
+	 * - moder_time is set to a fixed value.
+	 */
+	rx_buf_size = (ssk->recv_frags * PAGE_SIZE) + sizeof(struct sdp_bsdh);
+	mod->moder_cnt = sdp_rx_coal_target / rx_buf_size + 1;
+	mod->moder_time = sdp_rx_coal_time;
+	sdp_dbg(&ssk->isk.sk, "Default coalescing params for buf size:%d - "
+		"moder_cnt:%d moder_time:%d\n",
+		rx_buf_size, mod->moder_cnt, mod->moder_time);
+
+	/* Reset auto-moderation params */
+	mod->pkt_rate_low = sdp_rx_rate_low;
+	mod->rx_usecs_low = sdp_rx_coal_time_low;
+	mod->pkt_rate_high = sdp_rx_rate_high;
+	mod->rx_usecs_high = sdp_rx_coal_time_high;
+	mod->sample_interval = sdp_sample_interval;
+
+	mod->last_moder_time = SDP_AUTO_CONF;	/* forces the first update */
+	mod->last_moder_jiffies = 0;
+	mod->last_moder_packets = 0;
+	mod->last_moder_tx_packets = 0;
+	mod->last_moder_bytes = 0;
+}
+
+/*
+ * sdp_auto_moderation() - adapt RX interrupt moderation to traffic.
+ *
+ * Called from the send/receive paths.  At most once per sample_interval,
+ * compute the packet rate since the previous sample and pick a new CQ
+ * moderation time: maximum moderation for unbalanced (BW-bound) traffic,
+ * a rate-proportional value between rx_usecs_low and rx_usecs_high for
+ * balanced traffic, and the default moder_time at low rates (to avoid
+ * interrupt storms if traffic suddenly increases).  Reprograms the CQ
+ * only when the chosen time actually changed.
+ */
+static void sdp_auto_moderation(struct sdp_sock *ssk)
+{
+	struct sdp_moderation *mod = &ssk->auto_mod;
+
+	unsigned long period = (unsigned long) (jiffies - mod->last_moder_jiffies);
+	unsigned long packets;
+	unsigned long rate;
+	unsigned long avg_pkt_size;
+	unsigned long tx_pkt_diff;
+	unsigned long rx_pkt_diff;
+	int moder_time;
+	int err;
+
+	if (!mod->adaptive_rx_coal)
+		return;
+
+	if (period < mod->sample_interval)
+		return;
+
+	/* First call (or zero period): just record the baseline counters. */
+	if (!mod->last_moder_jiffies || !period)
+		goto out;
+
+	tx_pkt_diff = ((unsigned long) (ssk->tx_packets -
+					mod->last_moder_tx_packets));
+	rx_pkt_diff = ((unsigned long) (ssk->rx_packets -
+					mod->last_moder_packets));
+	packets = max(tx_pkt_diff, rx_pkt_diff);
+	rate = packets * HZ / period;
+	avg_pkt_size = packets ? ((unsigned long) (ssk->rx_bytes -
+					mod->last_moder_bytes)) / packets : 0;
+
+	/* Apply auto-moderation only when packet rate exceeds a rate that
+	 * it matters */
+	if (rate > sdp_rx_rate_thresh) {
+		/* If tx and rx packet rates are not balanced, assume that
+		 * traffic is mainly BW bound and apply maximum moderation.
+		 * Otherwise, moderate according to packet rate */
+		if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+		    2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+			moder_time = mod->rx_usecs_high;
+		} else if (rate < mod->pkt_rate_low) {
+			moder_time = mod->rx_usecs_low;
+		} else if (rate > mod->pkt_rate_high ||
+			   mod->pkt_rate_high <= mod->pkt_rate_low) {
+			/* The second condition guards the interpolation
+			 * below against a divide-by-zero when the two
+			 * (module-param driven) rate knobs coincide. */
+			moder_time = mod->rx_usecs_high;
+		} else {
+			moder_time = (rate - mod->pkt_rate_low) *
+				(mod->rx_usecs_high - mod->rx_usecs_low) /
+				(mod->pkt_rate_high - mod->pkt_rate_low) +
+				mod->rx_usecs_low;
+		}
+	} else {
+		/* When packet rate is low, use default moderation rather than
+		 * 0 to prevent interrupt storms if traffic suddenly increases */
+		moder_time = mod->moder_time;
+	}
+
+	sdp_dbg_data(&ssk->isk.sk, "tx rate:%lu rx_rate:%lu\n",
+		     tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
+
+	sdp_dbg_data(&ssk->isk.sk, "Rx moder_time changed from:%d to %d period:%lu "
+		     "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
+		     mod->last_moder_time, moder_time, period, packets,
+		     avg_pkt_size, rate);
+
+	if (moder_time != mod->last_moder_time) {
+		mod->last_moder_time = moder_time;
+		err = ib_modify_cq(ssk->rx_ring.cq, mod->moder_cnt, moder_time);
+		if (err)
+			/* added missing trailing newline */
+			sdp_dbg_data(&ssk->isk.sk,
+				     "Failed modifying moderation for cq\n");
+	}
+
+out:
+	mod->last_moder_packets = ssk->rx_packets;
+	mod->last_moder_tx_packets = ssk->tx_packets;
+	mod->last_moder_bytes = ssk->rx_bytes;
+	mod->last_moder_jiffies = jiffies;
+}
+
void sdp_reset_sk(struct sock *sk, int rc)
{
struct sdp_sock *ssk = sdp_sk(sk);
posts_handler_put(ssk);
+ sdp_auto_moderation(ssk);
+
rdtscll(end);
SDPSTATS_COUNTER_ADD(sendmsg_sum, end - start);
release_sock(sk);
posts_handler_put(ssk);
+ sdp_auto_moderation(ssk);
+
release_sock(sk);
return err;
#include <rdma/rdma_cm.h>
#include "sdp.h"
-static int rcvbuf_scale = 0x10;
-
-int rcvbuf_initial_size = 32 * 1024;
-module_param_named(rcvbuf_initial_size, rcvbuf_initial_size, int, 0644);
-MODULE_PARM_DESC(rcvbuf_initial_size, "Receive buffer initial size in bytes.");
-
-module_param_named(rcvbuf_scale, rcvbuf_scale, int, 0644);
-MODULE_PARM_DESC(rcvbuf_scale, "Receive buffer size scale factor.");
-
-static int top_mem_usage = 0;
-module_param_named(top_mem_usage, top_mem_usage, int, 0644);
-MODULE_PARM_DESC(top_mem_usage, "Top system wide sdp memory usage for recv (in MB).");
-
-static int hw_int_mod_count = 10;
-module_param_named(hw_int_mod_count, hw_int_mod_count, int, 0644);
-MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. int count");
-
-static int hw_int_mod_msec = 200;
-module_param_named(hw_int_mod_msec, hw_int_mod_msec, int, 0644);
-MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. mseq");
+SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, "Receive buffer initial size in bytes.");
+SDP_MODPARAM_SINT(rcvbuf_scale, 0x10, "Receive buffer size scale factor.");
+SDP_MODPARAM_SINT(top_mem_usage, 0, "Top system wide sdp memory usage for recv (in MB).");
#ifdef CONFIG_PPC
-static int max_large_sockets = 100;
+SDP_MODPARAM_SINT(max_large_sockets, 100, "Max number of large sockets (32k buffers).");
#else
-static int max_large_sockets = 1000;
+SDP_MODPARAM_SINT(max_large_sockets, 1000, "Max number of large sockets (32k buffers).");
#endif
-module_param_named(max_large_sockets, max_large_sockets, int, 0644);
-MODULE_PARM_DESC(max_large_sockets, "Max number of large sockets (32k buffers).");
static int curr_large_sockets = 0;
atomic_t sdp_current_mem_usage;
SDP_DUMP_PACKET(&ssk->isk.sk, "RX", skb, h);
skb_reset_transport_header(skb);
+ ssk->rx_packets++;
+ ssk->rx_bytes += skb->len;
+
mseq = ntohl(h->mseq);
atomic_set(&ssk->mseq_ack, mseq);
if (mseq != (int)wc->wr_id)
goto err_cq;
}
- rc = ib_modify_cq(rx_cq, hw_int_mod_count, hw_int_mod_msec);
- if (rc) {
- sdp_warn(&ssk->isk.sk, "Unable to modify RX CQ: %d.\n", rc);
- goto err_mod;
- }
- sdp_warn(&ssk->isk.sk, "Initialized CQ moderation\n");
sdp_sk(&ssk->isk.sk)->rx_ring.cq = rx_cq;
INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
goto out;
-err_mod:
- ib_destroy_cq(rx_cq);
err_cq:
kfree(ssk->rx_ring.buffer);
ssk->rx_ring.buffer = NULL;