RDS: IB: Run rds_fmr_flush WQ closer to ib_device
author    Wei Lin Guay <wei.lin.guay@oracle.com>
          Wed, 18 Nov 2015 11:23:08 +0000 (12:23 +0100)
committer Chuck Anderson <chuck.anderson@oracle.com>
          Thu, 14 Apr 2016 01:01:53 +0000 (18:01 -0700)
Orabug: 22269408

The rds_fmr_flush workqueue calls ib_unmap_fmr to
invalidate a list of FMRs. Today, this work can be
scheduled on any CPU. On a NUMA system, scheduling it on
a CPU core close to the ib_device improves performance.
For now, we use the "sequential-low" policy, which selects
the two lowest-numbered CPU cores closest to the HCA. On a
non-NUMA system, running the rds_fmr_flush workqueue on a
fixed CPU core likewise improves performance.
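
For reference, a minimal kernel-style sketch (not part of
this patch; names are illustrative) of the pattern applied
here: pick a CPU near the device's NUMA node with
cpumask_local_spread() and queue the delayed work there
with queue_delayed_work_on(), falling back to
WORK_CPU_UNBOUND when pinning is disabled:

    #include <linux/workqueue.h>
    #include <linux/cpumask.h>

    static struct workqueue_struct *flush_wq;
    static struct delayed_work flush_work;

    static void queue_flush(int dev_node, bool pin_to_cpu)
    {
            /* index 0 = first core of dev_node;
             * WORK_CPU_UNBOUND lets the scheduler pick any
             * CPU, matching plain queue_delayed_work().
             */
            int cpu = pin_to_cpu ?
                      cpumask_local_spread(0, dev_node) :
                      WORK_CPU_UNBOUND;

            queue_delayed_work_on(cpu, flush_wq,
                                  &flush_work, 10);
    }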

The CPU affinity of the rds_fmr_flush workqueue can be
enabled/disabled via sysctl and is enabled by default.
To disable the feature, set the sysctl below:

rds_ib_sysctl_disable_unmap_fmr_cpu = 1
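
Note that the runtime knob name differs from the variable
name: assuming the table is registered under "net/rds/ib"
as elsewhere in net/rds/ib_sysctl.c, the feature can be
toggled at runtime with, e.g.:

    sysctl -w net.rds.ib.disable_unmap_fmr_cpu_assignment=1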

Below are rds-stress performance numbers comparing the
default and sequential-low policies on a NUMA system with
Oracle M4 QDR and Mellanox CX3 HCAs.

rds-stress with 4 connections, 32 threads, depth 16,
unidirectional RDMA writes (higher is better).

(Oracle M4 QDR)
default        : 645591 IOPS
sequential-low : 806196 IOPS (~25% higher)

(Mellanox CX3)
default        : 473836 IOPS
sequential-low : 544187 IOPS (~15% higher)

Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
Reviewed-by: Knut Omang <knut.omang@oracle.com>
Reviewed-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: Wei Lin Guay <wei.lin.guay@oracle.com>
net/rds/ib.h
net/rds/ib_rdma.c
net/rds/ib_sysctl.c

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 0d0ea322f57596cb0153af3baa56b7912522dc09..c66eec4a5b0d4c2dc6a682f64fac8a7c7bc65f2a 100644
@@ -690,5 +690,6 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
 extern unsigned int rds_ib_sysctl_active_bonding;
 extern unsigned int rds_ib_sysctl_trigger_active_bonding;
+extern unsigned int rds_ib_sysctl_disable_unmap_fmr_cpu;
 
 #endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index d6dd17710f2d00adb408e3b6cf41a2c5dae8e446..090877d97208c54200df8898c4fb855568a13ba1 100644
@@ -85,6 +85,7 @@ struct rds_ib_mr_pool {
        unsigned long           max_items;
        atomic_t                max_items_soft;
        unsigned long           max_free_pinned;
+       unsigned                unmap_fmr_cpu;
        struct ib_fmr_attr      fmr_attr;
 
        spinlock_t              busy_lock; /* protect ops on 'busy_list' */
@@ -227,6 +228,22 @@ void rds_ib_destroy_nodev_conns(void)
                rds_conn_destroy(ic->conn);
 }
 
+static unsigned int get_unmap_fmr_cpu(struct rds_ib_device *rds_ibdev,
+                                     int pool_type)
+{
+       int index;
+       int ib_node = rdsibdev_to_node(rds_ibdev);
+
+       /* Return a CPU core close to the IB device
+        * whenever possible. For now only the first
+        * two cores of the device's node are used;
+        * cpumask_local_spread() handles both NUMA
+        * and non-NUMA systems.
+        */
+       index = pool_type == RDS_IB_MR_8K_POOL ? 0 : 1;
+       return cpumask_local_spread(index, ib_node);
+}
+
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
                                                int pool_type)
 {
@@ -250,9 +267,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
        if (pool_type == RDS_IB_MR_1M_POOL) {
                pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
                pool->max_items = rds_ibdev->max_1m_fmrs;
+               pool->unmap_fmr_cpu = get_unmap_fmr_cpu(rds_ibdev, pool_type);
        } else /* pool_type == RDS_IB_MR_8K_POOL */ {
                pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
                pool->max_items = rds_ibdev->max_8k_fmrs;
+               pool->unmap_fmr_cpu = get_unmap_fmr_cpu(rds_ibdev, pool_type);
        }
        pool->max_free_pinned =
                pool->max_items * pool->fmr_attr.max_pages / 4;
@@ -344,6 +363,7 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 {
        struct rds_ib_mr_pool *pool;
        struct rds_ib_mr *ibmr = NULL;
+       unsigned int unmap_fmr_cpu = 0;
        int err = 0, iter = 0;
 
        if (npages <= RDS_FMR_8K_MSG_SIZE)
@@ -351,9 +371,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
        else
                pool = rds_ibdev->mr_1m_pool;
 
+       unmap_fmr_cpu = rds_ib_sysctl_disable_unmap_fmr_cpu ?
+               WORK_CPU_UNBOUND : pool->unmap_fmr_cpu;
        if (atomic_read(&pool->dirty_count) >=
                atomic_read(&pool->max_items_soft) / 10)
-               queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+               queue_delayed_work_on(unmap_fmr_cpu,
+                                     rds_ib_fmr_wq, &pool->flush_worker, 10);
 
        while (1) {
                ibmr = rds_ib_reuse_fmr(pool);
@@ -840,6 +863,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;
        struct rds_ib_mr_pool *pool = ibmr->pool;
+       unsigned int unmap_fmr_cpu = rds_ib_sysctl_disable_unmap_fmr_cpu ?
+                                    WORK_CPU_UNBOUND : pool->unmap_fmr_cpu;
 
        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
@@ -863,7 +888,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
        if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
         || atomic_read(&pool->dirty_count) >=
                atomic_read(&pool->max_items_soft) / 5)
-               queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+               queue_delayed_work_on(unmap_fmr_cpu,
+                                     rds_ib_fmr_wq, &pool->flush_worker, 10);
 
        if (invalidate) {
                if (likely(!in_interrupt())) {
@@ -871,8 +897,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
                } else {
                        /* We get here if the user created a MR marked
                         * as use_once and invalidate at the same time. */
-                       queue_delayed_work(rds_ib_fmr_wq,
-                                          &pool->flush_worker, 10);
+                       queue_delayed_work_on(unmap_fmr_cpu, rds_ib_fmr_wq,
+                                             &pool->flush_worker, 10);
                }
        }
 
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index fef8ff916cc11f3b99a24b655cb72cd72d593bd1..f6dfb230d5fb62b235ed9e808c077f70ebe6bb17 100644
@@ -62,6 +62,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
 
 unsigned int rds_ib_sysctl_flow_control = 0;
 unsigned int rds_ib_sysctl_active_bonding = 1;
+unsigned int rds_ib_sysctl_disable_unmap_fmr_cpu; /* = 0 */
 
 /*
  * sysctl to trigger active bonding when set to 1
@@ -129,6 +130,13 @@ static struct ctl_table rds_ib_sysctl_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .procname       = "disable_unmap_fmr_cpu_assignment",
+               .data           = &rds_ib_sysctl_disable_unmap_fmr_cpu,
+               .maxlen         = sizeof(rds_ib_sysctl_disable_unmap_fmr_cpu),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        { }
 };