From: Wei Lin Guay
Date: Wed, 18 Nov 2015 11:23:08 +0000 (+0100)
Subject: RDS: IB: Run rds_fmr_flush WQ closer to ib_device
X-Git-Tag: v4.1.12-92~175^2~4
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=24163058321e6725c265dd33be5c6c9f42cb6843;p=users%2Fjedix%2Flinux-maple.git

RDS: IB: Run rds_fmr_flush WQ closer to ib_device

Orabug 22269408

The rds_fmr_flush workqueue calls ib_unmap_fmr to invalidate a list of
FMRs. Today, this workqueue can be scheduled on any CPU. In a NUMA-aware
system, scheduling the workqueue on a CPU core closer to the ib_device
can improve performance. For now, we use a "sequential-low" policy,
which selects the two lower CPU cores closest to the HCA. In a non-NUMA
system, scheduling the rds_fmr_flush workqueue on a fixed CPU core also
improves performance.

The CPU mapping of the rds_fmr_flush workqueue can be enabled/disabled
via sysctl and is enabled by default. To disable the feature, set the
sysctl below:

rds_ib_sysctl_disable_unmap_fmr_cpu = 1

Below are rds-stress performance numbers comparing the default and
sequential-low policies on a NUMA system with Oracle M4 QDR and
Mellanox CX3 HCAs.

rds-stress with 4 conns, 32 threads, 16 depths, RDMA write and
unidirectional traffic (higher is better):

(Oracle M4 QDR)
default        : 645591 IOPS
sequential-low : 806196 IOPS

(Mellanox CX3)
default        : 473836 IOPS
sequential-low : 544187 IOPS

Reviewed-by: Håkon Bugge
Reviewed-by: Knut Omang
Reviewed-by: Santosh Shilimkar
Signed-off-by: Wei Lin Guay
---

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 0d0ea322f575..c66eec4a5b0d 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -690,5 +690,6 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
 extern unsigned int rds_ib_sysctl_active_bonding;
 extern unsigned int rds_ib_sysctl_trigger_active_bonding;
+extern unsigned int rds_ib_sysctl_disable_unmap_fmr_cpu;
 
 #endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index d6dd17710f2d..090877d97208 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -85,6 +85,7 @@ struct rds_ib_mr_pool {
 	unsigned long		max_items;
 	atomic_t		max_items_soft;
 	unsigned long		max_free_pinned;
+	unsigned		unmap_fmr_cpu;
 	struct ib_fmr_attr	fmr_attr;
 
 	spinlock_t		busy_lock; /* protect ops on 'busy_list' */
@@ -227,6 +228,22 @@ void rds_ib_destroy_nodev_conns(void)
 		rds_conn_destroy(ic->conn);
 }
 
+static unsigned int get_unmap_fmr_cpu(struct rds_ib_device *rds_ibdev,
+				      int pool_type)
+{
+	int index;
+	int ib_node = rdsibdev_to_node(rds_ibdev);
+
+	/* always returns a CPU core that is closer to
+	 * IB device first if possible. As for now, the
+	 * first two cpu cores are returned. For numa
+	 * or non-numa system, cpumask_local_spread
+	 * will take care of it.
+	 */
+	index = pool_type == RDS_IB_MR_8K_POOL ? 0 : 1;
+	return cpumask_local_spread(index, ib_node);
+}
+
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 					     int pool_type)
 {
@@ -250,9 +267,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 	if (pool_type == RDS_IB_MR_1M_POOL) {
 		pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
 		pool->max_items = rds_ibdev->max_1m_fmrs;
+		pool->unmap_fmr_cpu = get_unmap_fmr_cpu(rds_ibdev, pool_type);
 	} else /* pool_type == RDS_IB_MR_8K_POOL */ {
 		pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
 		pool->max_items = rds_ibdev->max_8k_fmrs;
+		pool->unmap_fmr_cpu = get_unmap_fmr_cpu(rds_ibdev, pool_type);
 	}
 
 	pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
@@ -344,6 +363,7 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 {
 	struct rds_ib_mr_pool *pool;
 	struct rds_ib_mr *ibmr = NULL;
+	unsigned int unmap_fmr_cpu = 0;
 	int err = 0, iter = 0;
 
 	if (npages <= RDS_FMR_8K_MSG_SIZE)
@@ -351,9 +371,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 	else
 		pool = rds_ibdev->mr_1m_pool;
 
+	unmap_fmr_cpu = rds_ib_sysctl_disable_unmap_fmr_cpu ?
+			WORK_CPU_UNBOUND : pool->unmap_fmr_cpu;
 	if (atomic_read(&pool->dirty_count) >=
 	    atomic_read(&pool->max_items_soft) / 10)
-		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+		queue_delayed_work_on(unmap_fmr_cpu,
+				      rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	while (1) {
 		ibmr = rds_ib_reuse_fmr(pool);
@@ -840,6 +863,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	struct rds_ib_mr *ibmr = trans_private;
 	struct rds_ib_device *rds_ibdev = ibmr->device;
 	struct rds_ib_mr_pool *pool = ibmr->pool;
+	unsigned int unmap_fmr_cpu = rds_ib_sysctl_disable_unmap_fmr_cpu ?
+				     WORK_CPU_UNBOUND : pool->unmap_fmr_cpu;
 
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
@@ -863,7 +888,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
 	    atomic_read(&pool->dirty_count) >= atomic_read(&pool->max_items_soft) / 5)
-		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+		queue_delayed_work_on(unmap_fmr_cpu,
+				      rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
@@ -871,8 +897,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 		} else {
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time. */
-			queue_delayed_work(rds_ib_fmr_wq,
-					   &pool->flush_worker, 10);
+			queue_delayed_work_on(unmap_fmr_cpu, rds_ib_fmr_wq,
+					      &pool->flush_worker, 10);
 		}
 	}
 
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index fef8ff916cc1..f6dfb230d5fb 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -62,6 +62,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
 
 unsigned int rds_ib_sysctl_flow_control = 0;
 unsigned int rds_ib_sysctl_active_bonding = 1;
+unsigned int rds_ib_sysctl_disable_unmap_fmr_cpu; /* = 0 */
 
 /*
  * sysctl to trigger active bonding when set to 1
@@ -129,6 +130,13 @@ static struct ctl_table rds_ib_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.procname	= "disable_unmap_fmr_cpu_assignment",
+		.data		= &rds_ib_sysctl_disable_unmap_fmr_cpu,
+		.maxlen		= sizeof(rds_ib_sysctl_disable_unmap_fmr_cpu),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ }
 };
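
For reference, below is a minimal, self-contained kernel-module sketch (not
RDS code, not part of the patch) of the pattern the patch relies on:
cpumask_local_spread() picks a CPU core near the device's NUMA node and
queue_delayed_work_on() pins the delayed flush there, with WORK_CPU_UNBOUND
as the fallback when pinning is disabled. All my_* identifiers are
hypothetical stand-ins for rds_ib_fmr_wq, pool->flush_worker and
rds_ib_sysctl_disable_unmap_fmr_cpu.

/*
 * Illustrative sketch only -- not part of the patch above. It combines
 * the two primitives the patch uses: cpumask_local_spread() to pick a
 * CPU core near a device's NUMA node ("sequential-low" uses indices 0
 * and 1), and queue_delayed_work_on() to pin the delayed flush work to
 * that core, falling back to WORK_CPU_UNBOUND when pinning is disabled.
 * All my_* names are hypothetical.
 */
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_flush_wq;
static struct delayed_work my_flush_work;
static unsigned int my_disable_cpu_pinning;	/* 0 = pin (default), 1 = unbound */

/* Pick the index-th CPU that cpumask_local_spread() enumerates for the
 * device's NUMA node; on a non-NUMA system it still yields a fixed CPU.
 */
static unsigned int my_pick_flush_cpu(int ib_node, unsigned int index)
{
	return cpumask_local_spread(index, ib_node);
}

static void my_flush_fn(struct work_struct *work)
{
	/* The real worker would ib_unmap_fmr() the dirty FMR list here. */
}

/* Queue the flush either on a core close to the HCA or anywhere. */
static void my_kick_flush(int ib_node)
{
	unsigned int cpu = my_disable_cpu_pinning ?
			   WORK_CPU_UNBOUND :
			   my_pick_flush_cpu(ib_node, 0);

	queue_delayed_work_on(cpu, my_flush_wq, &my_flush_work, 10);
}

static int __init my_sketch_init(void)
{
	my_flush_wq = alloc_workqueue("my_fmr_flush", WQ_MEM_RECLAIM, 0);
	if (!my_flush_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&my_flush_work, my_flush_fn);
	my_kick_flush(0);	/* pretend the HCA sits on node 0 */
	return 0;
}

static void __exit my_sketch_exit(void)
{
	cancel_delayed_work_sync(&my_flush_work);
	destroy_workqueue(my_flush_wq);
}

module_init(my_sketch_init);
module_exit(my_sketch_exit);
MODULE_LICENSE("GPL");

With the patch applied, the runtime knob is the
disable_unmap_fmr_cpu_assignment entry added to rds_ib_sysctl_table; since
the RDS IB table is registered under net/rds/ib, the file should appear as
/proc/sys/net/rds/ib/disable_unmap_fmr_cpu_assignment, and writing 1 to it
reverts the flush work to WORK_CPU_UNBOUND.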