From: Amir Vadai Date: Mon, 8 Jun 2009 14:02:20 +0000 (+0300) Subject: sdp: make interrupt moderation adaptive X-Git-Tag: v4.1.12-92~264^2~5^2~270 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=59ba073d670fe4ca94385d85d6ccd10aa6c962ab;p=users%2Fjedix%2Flinux-maple.git sdp: make interrupt moderation adaptive Signed-off-by: Amir Vadai --- diff --git a/drivers/infiniband/ulp/sdp/sdp.h b/drivers/infiniband/ulp/sdp/sdp.h index 5d3e7d727e246..cc38f5b16d381 100644 --- a/drivers/infiniband/ulp/sdp/sdp.h +++ b/drivers/infiniband/ulp/sdp/sdp.h @@ -37,6 +37,16 @@ spin_unlock_irqrestore(&ssk->rx_ring.lock, f); \ } while (0) +#define SDP_MODPARAM_SINT(var, def_val, msg) \ + static int var = def_val; \ + module_param_named(var, var, int, 0644); \ + MODULE_PARM_DESC(var, msg " [" #def_val "]"); \ + +#define SDP_MODPARAM_INT(var, def_val, msg) \ + int var = def_val; \ + module_param_named(var, var, int, 0644); \ + MODULE_PARM_DESC(var, msg " [" #def_val "]"); \ + #ifdef SDP_PROFILING struct sk_buff; struct sdpprf_log { @@ -238,6 +248,9 @@ static inline void sdpstats_hist(u32 *h, u32 val, u32 maxidx, int is_log) /* how long (in jiffies) to block sender till tx completion*/ #define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10) +#define SDP_AUTO_CONF 0xffff +#define AUTO_MOD_DELAY (HZ / 4) + #define BZCOPY_STATE(skb) (*(struct bzcopy_state **)(skb->cb)) #ifndef MIN #define MIN(a, b) (a < b ? 
a : b) @@ -372,6 +385,27 @@ struct sdp_chrecvbuf { sdp_do_posts(ssk); \ }) +struct sdp_moderation { + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; + unsigned long last_moder_jiffies; + int last_moder_time; + u16 rx_usecs; + u16 rx_frames; + u16 tx_usecs; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 pkt_rate_high; + u16 rx_usecs_high; + u16 sample_interval; + u16 adaptive_rx_coal; + u32 msg_enable; + + int moder_cnt; + int moder_time; +}; + struct sdp_sock { /* sk has to be the first member of inet_sock */ struct inet_sock isk; @@ -451,6 +485,12 @@ struct sdp_sock { int recv_frags; /* max skb frags in recv packets */ int send_frags; /* max skb frags in send packets */ + unsigned long tx_packets; + unsigned long rx_packets; + unsigned long tx_bytes; + unsigned long rx_bytes; + struct sdp_moderation auto_mod; + /* BZCOPY data */ int zcopy_thresh; }; @@ -568,6 +608,8 @@ void sdp_start_keepalive_timer(struct sock *sk); int sdp_init_sock(struct sock *sk); int __init sdp_proc_init(void); void sdp_proc_unregister(void); +/* sdp_main.c */ +void sdp_set_default_moderation(struct sdp_sock *ssk); /* sdp_tx.c */ int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device); diff --git a/drivers/infiniband/ulp/sdp/sdp_cma.c b/drivers/infiniband/ulp/sdp/sdp_cma.c index 83e39dc240974..55ade25aa91b5 100644 --- a/drivers/infiniband/ulp/sdp/sdp_cma.c +++ b/drivers/infiniband/ulp/sdp/sdp_cma.c @@ -203,6 +203,7 @@ static int sdp_response_handler(struct sock *sk, struct rdma_cm_id *id, sdp_dbg(sk, "%s\n", __func__); sdp_exch_state(sk, TCPF_SYN_SENT, TCP_ESTABLISHED); + sdp_set_default_moderation(sdp_sk(sk)); if (sock_flag(sk, SOCK_KEEPOPEN)) sdp_start_keepalive_timer(sk); @@ -249,6 +250,8 @@ static int sdp_connected_handler(struct sock *sk, struct rdma_cm_event *event) sdp_exch_state(sk, TCPF_SYN_RECV, TCP_ESTABLISHED); + sdp_set_default_moderation(sdp_sk(sk)); + if (sock_flag(sk, SOCK_KEEPOPEN)) 
sdp_start_keepalive_timer(sk); diff --git a/drivers/infiniband/ulp/sdp/sdp_main.c b/drivers/infiniband/ulp/sdp/sdp_main.c index 64511fea03cb6..b34743da13007 100644 --- a/drivers/infiniband/ulp/sdp/sdp_main.c +++ b/drivers/infiniband/ulp/sdp/sdp_main.c @@ -79,41 +79,33 @@ MODULE_DESCRIPTION("InfiniBand SDP module"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef CONFIG_INFINIBAND_SDP_DEBUG -int sdp_debug_level; - -module_param_named(debug_level, sdp_debug_level, int, 0644); -MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0."); +SDP_MODPARAM_INT(sdp_debug_level, 0, "Enable debug tracing if > 0."); #endif #ifdef CONFIG_INFINIBAND_SDP_DEBUG -int sdp_data_debug_level; - -module_param_named(data_debug_level, sdp_data_debug_level, int, 0644); -MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0."); +SDP_MODPARAM_INT(sdp_data_debug_level, 0, "Enable data path debug tracing if > 0."); #endif -static int recv_poll_hit; - -module_param_named(recv_poll_hit, recv_poll_hit, int, 0644); -MODULE_PARM_DESC(recv_poll_hit, "How many times recv poll helped."); - -static int recv_poll_miss; - -module_param_named(recv_poll_miss, recv_poll_miss, int, 0644); -MODULE_PARM_DESC(recv_poll_miss, "How many times recv poll missed."); - -static int recv_poll = 1000; - -module_param_named(recv_poll, recv_poll, int, 0644); -MODULE_PARM_DESC(recv_poll, "How many times to poll recv."); - -static unsigned int sdp_keepalive_time = SDP_KEEPALIVE_TIME; - -module_param_named(sdp_keepalive_time, sdp_keepalive_time, uint, 0644); -MODULE_PARM_DESC(sdp_keepalive_time, "Default idle time in seconds before keepalive probe sent."); - -static int sdp_zcopy_thresh = 65536; -module_param_named(sdp_zcopy_thresh, sdp_zcopy_thresh, int, 0644); -MODULE_PARM_DESC(sdp_zcopy_thresh, "Zero copy send threshold; 0=0ff."); +SDP_MODPARAM_SINT(recv_poll_hit, -1, "How many times recv poll helped."); +SDP_MODPARAM_SINT(recv_poll_miss, -1, "How many times recv poll missed."); +SDP_MODPARAM_SINT(recv_poll, 
1000, "How many times to poll recv."); +SDP_MODPARAM_SINT(sdp_keepalive_time, SDP_KEEPALIVE_TIME, + "Default idle time in seconds before keepalive probe sent."); +SDP_MODPARAM_SINT(sdp_zcopy_thresh, 65536, "Zero copy send threshold; 0=0ff."); + +#define SDP_RX_COAL_TIME_HIGH 128 +SDP_MODPARAM_SINT(sdp_rx_coal_target, 0x50000, + "Target number of bytes to coalesce with interrupt moderation (bytes)."); +SDP_MODPARAM_SINT(sdp_rx_coal_time, 0x10, "rx coal time (jiffies)."); +SDP_MODPARAM_SINT(sdp_rx_rate_low, 80000, "rx_rate low (packets/sec)."); +SDP_MODPARAM_SINT(sdp_rx_coal_time_low, 0, "low moderation time val (usec)."); +SDP_MODPARAM_SINT(sdp_rx_rate_high, 100000, "rx_rate high (packets/sec)."); +SDP_MODPARAM_SINT(sdp_rx_coal_time_high, 128, "high moderation time val (usec)."); +SDP_MODPARAM_SINT(sdp_rx_rate_thresh, (200000 / SDP_RX_COAL_TIME_HIGH), + "rx rate thresh ()."); +SDP_MODPARAM_SINT(sdp_sample_interval, (HZ / 4), "sample interval (jiffies)."); + +SDP_MODPARAM_INT(hw_int_mod_count, -1, "forced hw int moderation val. -1 for auto (packets)."); +SDP_MODPARAM_INT(hw_int_mod_usec, -1, "forced hw int moderation val. 
-1 for auto (usec)."); struct workqueue_struct *sdp_wq; struct workqueue_struct *rx_comp_wq; @@ -298,6 +290,135 @@ void sdp_start_keepalive_timer(struct sock *sk) sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(sdp_sk(sk))); } +void sdp_set_default_moderation(struct sdp_sock *ssk) +{ + struct sdp_moderation *mod = &ssk->auto_mod; + int rx_buf_size; + + if (hw_int_mod_count > -1 || hw_int_mod_usec > -1) { + int err; + + mod->adaptive_rx_coal = 0; + + if (hw_int_mod_count > 0 && hw_int_mod_usec > 0) { + err = ib_modify_cq(ssk->rx_ring.cq, hw_int_mod_count, hw_int_mod_usec); + if (err) + sdp_warn(&ssk->isk.sk, "Failed modifying moderation for cq"); + else + sdp_dbg(&ssk->isk.sk, "Using fixed interrupt moderation\n"); + } + return; + } + + mod->adaptive_rx_coal = 1; + sdp_dbg(&ssk->isk.sk, "Using adaptive interrupt moderation\n"); + + /* If we haven't received a specific coalescing setting + * (module param), we set the moderation parameters as follows: + * - moder_cnt is set to the number of mtu sized packets to + * satisfy our coalescing target. + * - moder_time is set to a fixed value. 
+ */ + rx_buf_size = (ssk->recv_frags * PAGE_SIZE) + sizeof(struct sdp_bsdh); + mod->moder_cnt = sdp_rx_coal_target / rx_buf_size + 1; + mod->moder_time = sdp_rx_coal_time; + sdp_dbg(&ssk->isk.sk, "Default coalesing params for buf size:%d - " + "moder_cnt:%d moder_time:%d\n", + rx_buf_size, mod->moder_cnt, mod->moder_time); + + /* Reset auto-moderation params */ + mod->pkt_rate_low = sdp_rx_rate_low; + mod->rx_usecs_low = sdp_rx_coal_time_low; + mod->pkt_rate_high = sdp_rx_rate_high; + mod->rx_usecs_high = sdp_rx_coal_time_high; + mod->sample_interval = sdp_sample_interval; + + mod->last_moder_time = SDP_AUTO_CONF; + mod->last_moder_jiffies = 0; + mod->last_moder_packets = 0; + mod->last_moder_tx_packets = 0; + mod->last_moder_bytes = 0; +} + +static void sdp_auto_moderation(struct sdp_sock *ssk) +{ + struct sdp_moderation *mod = &ssk->auto_mod; + + unsigned long period = (unsigned long) (jiffies - mod->last_moder_jiffies); + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + int err; + + if (!mod->adaptive_rx_coal) + return; + + if (period < mod->sample_interval) + return; + + if (!mod->last_moder_jiffies || !period) + goto out; + + tx_pkt_diff = ((unsigned long) (ssk->tx_packets - + mod->last_moder_tx_packets)); + rx_pkt_diff = ((unsigned long) (ssk->rx_packets - + mod->last_moder_packets)); + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? ((unsigned long) (ssk->rx_bytes - + mod->last_moder_bytes)) / packets : 0; + + /* Apply auto-moderation only when packet rate exceeds a rate that + * it matters */ + if (rate > sdp_rx_rate_thresh) { + /* If tx and rx packet rates are not balanced, assume that + * traffic is mainly BW bound and apply maximum moderation. 
+ * Otherwise, moderate according to packet rate */ + if (2 * tx_pkt_diff > 3 * rx_pkt_diff || + 2 * rx_pkt_diff > 3 * tx_pkt_diff) { + moder_time = mod->rx_usecs_high; + } else { + if (rate < mod->pkt_rate_low) { + moder_time = mod->rx_usecs_low; + } else if (rate > mod->pkt_rate_high) + moder_time = mod->rx_usecs_high; + else + moder_time = (rate - mod->pkt_rate_low) * + (mod->rx_usecs_high - mod->rx_usecs_low) / + (mod->pkt_rate_high - mod->pkt_rate_low) + + mod->rx_usecs_low; + } + } else { + /* When packet rate is low, use default moderation rather than + * 0 to prevent interrupt storms if traffic suddenly increases */ + moder_time = mod->moder_time; + } + + sdp_dbg_data(&ssk->isk.sk, "tx rate:%lu rx_rate:%lu\n", + tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period); + + sdp_dbg_data(&ssk->isk.sk, "Rx moder_time changed from:%d to %d period:%lu " + "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n", + mod->last_moder_time, moder_time, period, packets, + avg_pkt_size, rate); + + if (moder_time != mod->last_moder_time) { + mod->last_moder_time = moder_time; + err = ib_modify_cq(ssk->rx_ring.cq, mod->moder_cnt, moder_time); + if (err) + sdp_dbg_data(&ssk->isk.sk, "Failed modifying moderation for cq"); + } + +out: + mod->last_moder_packets = ssk->rx_packets; + mod->last_moder_tx_packets = ssk->tx_packets; + mod->last_moder_bytes = ssk->rx_bytes; + mod->last_moder_jiffies = jiffies; +} + void sdp_reset_sk(struct sock *sk, int rc) { struct sdp_sock *ssk = sdp_sk(sk); @@ -1780,6 +1901,8 @@ out: posts_handler_put(ssk); + sdp_auto_moderation(ssk); + rdtscll(end); SDPSTATS_COUNTER_ADD(sendmsg_sum, end - start); release_sock(sk); @@ -2050,6 +2173,8 @@ out: posts_handler_put(ssk); + sdp_auto_moderation(ssk); + release_sock(sk); return err; diff --git a/drivers/infiniband/ulp/sdp/sdp_rx.c b/drivers/infiniband/ulp/sdp/sdp_rx.c index 1ef28acd02383..2c33af8af2003 100644 --- a/drivers/infiniband/ulp/sdp/sdp_rx.c +++ b/drivers/infiniband/ulp/sdp/sdp_rx.c @@ -37,34 +37,15 
@@ #include #include "sdp.h" -static int rcvbuf_scale = 0x10; - -int rcvbuf_initial_size = 32 * 1024; -module_param_named(rcvbuf_initial_size, rcvbuf_initial_size, int, 0644); -MODULE_PARM_DESC(rcvbuf_initial_size, "Receive buffer initial size in bytes."); - -module_param_named(rcvbuf_scale, rcvbuf_scale, int, 0644); -MODULE_PARM_DESC(rcvbuf_scale, "Receive buffer size scale factor."); - -static int top_mem_usage = 0; -module_param_named(top_mem_usage, top_mem_usage, int, 0644); -MODULE_PARM_DESC(top_mem_usage, "Top system wide sdp memory usage for recv (in MB)."); - -static int hw_int_mod_count = 10; -module_param_named(hw_int_mod_count, hw_int_mod_count, int, 0644); -MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. int count"); - -static int hw_int_mod_msec = 200; -module_param_named(hw_int_mod_msec, hw_int_mod_msec, int, 0644); -MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. mseq"); +SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, "Receive buffer initial size in bytes."); +SDP_MODPARAM_SINT(rcvbuf_scale, 0x10, "Receive buffer size scale factor."); +SDP_MODPARAM_SINT(top_mem_usage, 0, "Top system wide sdp memory usage for recv (in MB)."); #ifdef CONFIG_PPC -static int max_large_sockets = 100; +SDP_MODPARAM_SINT(max_large_sockets, 100, "Max number of large sockets (32k buffers)."); #else -static int max_large_sockets = 1000; +SDP_MODPARAM_SINT(max_large_sockets, 1000, "Max number of large sockets (32k buffers)."); #endif -module_param_named(max_large_sockets, max_large_sockets, int, 0644); -MODULE_PARM_DESC(max_large_sockets, "Max number of large sockets (32k buffers)."); static int curr_large_sockets = 0; atomic_t sdp_current_mem_usage; @@ -580,6 +561,9 @@ static struct sk_buff *sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc) SDP_DUMP_PACKET(&ssk->isk.sk, "RX", skb, h); skb_reset_transport_header(skb); + ssk->rx_packets++; + ssk->rx_bytes += skb->len; + mseq = ntohl(h->mseq); atomic_set(&ssk->mseq_ack, mseq); if (mseq != 
(int)wc->wr_id) @@ -823,12 +807,6 @@ int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device) goto err_cq; } - rc = ib_modify_cq(rx_cq, hw_int_mod_count, hw_int_mod_msec); - if (rc) { - sdp_warn(&ssk->isk.sk, "Unable to modify RX CQ: %d.\n", rc); - goto err_mod; - } - sdp_warn(&ssk->isk.sk, "Initialized CQ moderation\n"); sdp_sk(&ssk->isk.sk)->rx_ring.cq = rx_cq; INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work); @@ -837,8 +815,6 @@ int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device) goto out; -err_mod: - ib_destroy_cq(rx_cq); err_cq: kfree(ssk->rx_ring.buffer); ssk->rx_ring.buffer = NULL; diff --git a/drivers/infiniband/ulp/sdp/sdp_tx.c b/drivers/infiniband/ulp/sdp/sdp_tx.c index 9cc37a341aabb..6fc4746b66967 100644 --- a/drivers/infiniband/ulp/sdp/sdp_tx.c +++ b/drivers/infiniband/ulp/sdp/sdp_tx.c @@ -38,10 +38,8 @@ #include "sdp.h" #define sdp_cnt(var) do { (var)++; } while (0) -static unsigned sdp_keepalive_probes_sent = 0; -module_param_named(sdp_keepalive_probes_sent, sdp_keepalive_probes_sent, uint, 0644); -MODULE_PARM_DESC(sdp_keepalive_probes_sent, "Total number of keepalive probes sent."); +SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0, "Total number of keepalive probes sent."); static int sdp_process_tx_cq(struct sdp_sock *ssk); @@ -83,6 +81,9 @@ void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb, u8 mid) SDPSTATS_COUNTER_MID_INC(post_send, mid); SDPSTATS_HIST(send_size, skb->len); + ssk->tx_packets++; + ssk->tx_bytes += skb->len; + h->mid = mid; if (unlikely(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) h->flags = SDP_OOB_PRES | SDP_OOB_PEND;