www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
ib/mlx4: Initialize multiple Mellanox HCAs in parallel
author: Qing Huang <qing.huang@oracle.com>
Wed, 10 Aug 2016 17:14:25 +0000 (10:14 -0700)
committer: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Wed, 10 Aug 2016 17:24:05 +0000 (10:24 -0700)
This is a rework of UEK2 commit a8962313e121 ("OFED: Load multiple ...").
The goal of this patch is to reduce the total amount of system boot/kernel
startup time when there are multiple Mellanox HCAs present in the system.
Typically each HCA/PF would require 6~7s to initialize plus extra time for
a certain number of VFs created by each PF. By default, multiple HCAs have
to be probed one by one in a serialized fashion.

The new scheme is to create a work request for the current pci probe/mlx4 init
task and then return -EPROBE_DEFER immediately to the probe caller while
the system thread starts to execute the work request in the background.
The main pci probe thread doesn't have to wait for the current probe
task to finish. The background init task's progress and return err code
will be saved by the sys worker thread and processed from the deferred
queue.

Orabug: 20995222

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Reviewed-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
drivers/base/dd.c
drivers/net/ethernet/mellanox/mlx4/main.c
include/linux/device.h

index e843fdbe492514d83fd1f66cfc8678b10099e877..365a5d2636cd3eccbde0308c1fe823e5bd2aff6e 100644 (file)
@@ -148,7 +148,7 @@ static bool driver_deferred_probe_enable = false;
  * changes in the midst of a probe, then deferred processing should be triggered
  * again.
  */
-static void driver_deferred_probe_trigger(void)
+void driver_deferred_probe_trigger(void)
 {
        if (!driver_deferred_probe_enable)
                return;
@@ -170,6 +170,7 @@ static void driver_deferred_probe_trigger(void)
         */
        queue_work(deferred_wq, &deferred_probe_work);
 }
+EXPORT_SYMBOL_GPL(driver_deferred_probe_trigger);
 
 /**
  * deferred_probe_initcall() - Enable probing of deferred devices
index 9323ef75a13a68ec7d6e489305a06d403e5651cb..95a754594ed0734c04ec591c56c631fe1afe54ea 100644 (file)
@@ -42,6 +42,9 @@
 #include <linux/io-mapping.h>
 #include <linux/delay.h>
 #include <linux/kmod.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
@@ -232,6 +235,28 @@ enum {
        MLX4_IF_STATE_EXTENDED
 };
 
+enum {
+       MLX4_DEV_UNINITIALIZED,
+       MLX4_DEV_INITIALIZING,
+       MLX4_DEV_INITIALIZED
+};
+
+struct mlx4_drv_load_work {
+       struct work_struct work;
+       struct pci_dev *pdev;
+       int pci_dev_data;
+       struct mlx4_priv *priv;
+       u8 state;
+       int err;
+       struct list_head list_node;
+};
+
+static LIST_HEAD(mlx4_dl_work_list);
+
+/* Used for parallel init work list
+ */
+static DEFINE_MUTEX(mlx4_dl_work_list_mutex);
+
 static void process_mod_param_profile(struct mlx4_profile *profile)
 {
        struct sysinfo si;
@@ -3566,11 +3591,121 @@ err_disable_pdev:
        return err;
 }
 
+static void __mlx4_init_parallel_one(struct work_struct *_work)
+{
+       int err = 0;
+       struct mlx4_drv_load_work *work =
+               container_of(_work, struct mlx4_drv_load_work, work);
+
+       err = __mlx4_init_one(work->pdev, work->pci_dev_data, work->priv);
+
+       if (err) {
+               kfree(work->priv->dev.persist);
+               kfree(work->priv);
+       } else {
+               pci_save_state(work->pdev);
+       }
+
+       work->err = err;
+       work->state = MLX4_DEV_INITIALIZED;
+       driver_deferred_probe_trigger();
+}
+
+static int __mlx4_init_create_pwork(struct pci_dev *pdev, int pci_dev_data,
+                                   struct mlx4_priv *priv)
+{
+       int err = 0;
+       int node, cpu;
+       struct mlx4_drv_load_work *mlx4_work;
+       struct cpumask tmp_mask;
+
+       node = dev_to_node(&pdev->dev);
+       if (node >= 0) {
+               cpu = cpumask_next_and(get_cpu(), cpumask_of_node(node),
+                                      cpu_online_mask);
+               if (cpu_to_node(cpu) != node)
+                       cpu = cpumask_any_and(cpumask_of_node(node),
+                                             cpu_online_mask);
+       } else {
+               cpumask_xor(&tmp_mask, cpu_online_mask, cpumask_of(get_cpu()));
+               cpu = cpumask_next_and(prandom_u32_max(nr_cpu_ids),
+                                      cpu_online_mask, &tmp_mask);
+       }
+
+       if (cpu >= nr_cpu_ids)
+               return -EINVAL;
+
+       mlx4_work = kmalloc(sizeof(*mlx4_work), GFP_KERNEL);
+       if (!mlx4_work)
+               return -ENOMEM;
+
+       mlx4_work->pdev = pdev;
+       mlx4_work->pci_dev_data = pci_dev_data;
+       mlx4_work->priv = priv;
+       mlx4_work->err = 0;
+       mlx4_work->state = MLX4_DEV_INITIALIZING;
+
+       INIT_LIST_HEAD(&mlx4_work->list_node);
+       mutex_lock(&mlx4_dl_work_list_mutex);
+       list_add_tail(&mlx4_work->list_node,
+                     &mlx4_dl_work_list);
+       mutex_unlock(&mlx4_dl_work_list_mutex);
+
+       INIT_WORK(&mlx4_work->work, __mlx4_init_parallel_one);
+       schedule_work_on(cpu, &mlx4_work->work);
+
+       return err;
+}
+
+static int mlx4_init_work_in_flight(struct pci_dev *pdev, int *found)
+{
+       int ret = 0;
+       struct mlx4_drv_load_work *mlx4_work;
+       struct list_head *list_itr;
+
+       /* Check to see if parallel Init has been started before */
+       mutex_lock(&mlx4_dl_work_list_mutex);
+       list_for_each(list_itr, &mlx4_dl_work_list) {
+               mlx4_work = list_entry(list_itr, struct mlx4_drv_load_work,
+                                      list_node);
+               if (pdev == mlx4_work->pdev) {
+                       switch (mlx4_work->state) {
+                       case MLX4_DEV_INITIALIZING:
+                               ret = -EPROBE_DEFER;
+                               break;
+                       case MLX4_DEV_INITIALIZED:
+                               ret =  mlx4_work->err;
+                               pci_set_drvdata(pdev,
+                                               mlx4_work->priv->dev.persist);
+                               list_del(list_itr);
+                               kfree(mlx4_work);
+                               break;
+                       case MLX4_DEV_UNINITIALIZED:
+                               WARN_ONCE(1, "incorrectly initialized\n");
+                       default:
+                               ret = -EINVAL;
+                               dev_warn(&pdev->dev, "unsupported state %u\n",
+                                        mlx4_work->state);
+                       }
+                       *found = 1;
+                       mutex_unlock(&mlx4_dl_work_list_mutex);
+                       return ret;
+               }
+       }
+       mutex_unlock(&mlx4_dl_work_list_mutex);
+
+       return ret;
+}
+
 static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 {
        struct mlx4_priv *priv;
        struct mlx4_dev *dev;
-       int ret;
+       int ret, found = 0;
+
+       ret = mlx4_init_work_in_flight(pdev, &found);
+       if (found)
+               return ret;
 
        printk_once(KERN_INFO "%s", mlx4_version);
 
@@ -3591,12 +3726,17 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
        mutex_init(&dev->persist->device_state_mutex);
        mutex_init(&dev->persist->interface_state_mutex);
 
-       ret =  __mlx4_init_one(pdev, id->driver_data, priv);
+       ret = __mlx4_init_create_pwork(pdev, id->driver_data, priv);
        if (ret) {
-               kfree(dev->persist);
-               kfree(priv);
+               ret =  __mlx4_init_one(pdev, id->driver_data, priv);
+               if (ret) {
+                       kfree(dev->persist);
+                       kfree(priv);
+               } else {
+                       pci_save_state(pdev);
+               }
        } else {
-               pci_save_state(pdev);
+               ret = -EPROBE_DEFER;
        }
 
        return ret;
index 6558af90c8fe3b9263441dc401b3689940d3453e..3525aac388ec5b71cd604c24c534d7284c9569a5 100644 (file)
@@ -259,6 +259,7 @@ extern struct device_driver *driver_find(const char *name,
                                         struct bus_type *bus);
 extern int driver_probe_done(void);
 extern void wait_for_device_probe(void);
+extern void driver_deferred_probe_trigger(void);
 
 
 /* sysfs interface for exporting driver attributes */