In the SoundWire probe, we store a pointer from the driver ops into
the 'slave' structure. This can lead to kernel oopses when unbinding
codec drivers, e.g. with the following sequence to remove machine
driver and codec driver.
/sbin/modprobe -r snd_soc_sof_sdw
/sbin/modprobe -r snd_soc_rt711
The full details can be found in the BugLink below, for reference the
two following examples show different cases of driver ops/callbacks
being invoked after the driver .remove().
kernel: BUG: kernel NULL pointer dereference, address: 
0000000000000150
kernel: Workqueue: events cdns_update_slave_status_work [soundwire_cadence]
kernel: RIP: 0010:mutex_lock+0x19/0x30
kernel: Call Trace:
kernel:  ? sdw_handle_slave_status+0x426/0xe00 [soundwire_bus 
94ff184bf398570c3f8ff7efe9e32529f532e4ae]
kernel:  ? newidle_balance+0x26a/0x400
kernel:  ? cdns_update_slave_status_work+0x1e9/0x200 [soundwire_cadence 
1bcf98eebe5ba9833cd433323769ac923c9c6f82]
kernel: BUG: unable to handle page fault for address: 
ffffffffc07654c8
kernel: Workqueue: pm pm_runtime_work
kernel: RIP: 0010:sdw_bus_prep_clk_stop+0x6f/0x160 [soundwire_bus]
kernel: Call Trace:
kernel:  <TASK>
kernel:  sdw_cdns_clock_stop+0xb5/0x1b0 [soundwire_cadence 
1bcf98eebe5ba9833cd433323769ac923c9c6f82]
kernel:  intel_suspend_runtime+0x5f/0x120 [soundwire_intel 
aca858f7c87048d3152a4a41bb68abb9b663a1dd]
kernel:  ? dpm_sysfs_remove+0x60/0x60
This was not detected earlier in Intel tests since the tests first
remove the parent PCI device and shut down the bus. The sequence
above is a corner case which keeps the bus operational but without a
driver bound.
While trying to solve this kernel oopses, it became clear that the
existing SoundWire bus does not deal well with the unbind case.
Commit 
528be501b7d4a ("soundwire: sdw_slave: add probe_complete structure and new fields")
added a 'probed' status variable and a 'probe_complete'
struct completion. This status is however not reset on remove and
likewise the 'probe complete' is not re-initialized, so the
bind/unbind/bind test cases would fail. The timeout used before the
'update_status' callback was also a bad idea in hindsight, there
should really be no timing assumption as to if and when a driver is
bound to a device.
An initial draft was based on device_lock() and device_unlock() was
tested. This proved too complicated, with deadlocks created during the
suspend-resume sequences, which also use the same device_lock/unlock()
as the bind/unbind sequences. On a CometLake device, a bad DSDT/BIOS
caused spurious resumes and the use of device_lock() caused hangs
during suspend. After multiple weeks or testing and painful
reverse-engineering of deadlocks on different devices, we looked for
alternatives that did not interfere with the device core.
A bus notifier was used successfully to keep track of DRIVER_BOUND and
DRIVER_UNBIND events. This solved the bind-unbind-bind case in tests,
but it can still be defeated with a theoretical corner case where the
memory is freed by a .remove while the callback is in use. The
notifier only helps make sure the driver callbacks are valid, but not
that the memory allocated in probe remains valid while the callbacks
are invoked.
This patch suggests the introduction of a new 'sdw_dev_lock' mutex
protecting probe/remove and all driver callbacks. Since this mutex is
'local' to SoundWire only, it does not interfere with existing locks
and does not create deadlocks. In addition, this patch removes the
'probe_complete' completion, instead we directly invoke the
'update_status' from the probe routine. That removes any sort of
timing dependency and a much better support for the device/driver
model, the driver could be bound before the bus started, or eons after
the bus started and the hardware would be properly initialized in all
cases.
BugLink: https://github.com/thesofproject/linux/issues/3531
Fixes: 56d4fe31af77 ("soundwire: Add MIPI DisCo property helpers")
Fixes: 528be501b7d4a ("soundwire: sdw_slave: add probe_complete structure and new fields")
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20220621225641.221170-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
 #include <linux/pm_runtime.h>
 #include <linux/soundwire/sdw_registers.h>
 #include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_type.h>
 #include "bus.h"
 #include "sysfs_local.h"
 
                                       enum sdw_clk_stop_mode mode,
                                       enum sdw_clk_stop_type type)
 {
-       int ret;
+       int ret = 0;
 
-       if (slave->ops && slave->ops->clk_stop) {
-               ret = slave->ops->clk_stop(slave, mode, type);
-               if (ret < 0)
-                       return ret;
+       mutex_lock(&slave->sdw_dev_lock);
+
+       if (slave->probed)  {
+               struct device *dev = &slave->dev;
+               struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+               if (drv->ops && drv->ops->clk_stop)
+                       ret = drv->ops->clk_stop(slave, mode, type);
        }
 
-       return 0;
+       mutex_unlock(&slave->sdw_dev_lock);
+
+       return ret;
 }
 
 static int sdw_slave_clk_stop_prepare(struct sdw_slave *slave,
                }
 
                /* Update the Slave driver */
-               if (slave_notify && slave->ops &&
-                   slave->ops->interrupt_callback) {
-                       slave_intr.sdca_cascade = sdca_cascade;
-                       slave_intr.control_port = clear;
-                       memcpy(slave_intr.port, &port_status,
-                              sizeof(slave_intr.port));
-
-                       slave->ops->interrupt_callback(slave, &slave_intr);
+               if (slave_notify) {
+                       mutex_lock(&slave->sdw_dev_lock);
+
+                       if (slave->probed) {
+                               struct device *dev = &slave->dev;
+                               struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+                               if (drv->ops && drv->ops->interrupt_callback) {
+                                       slave_intr.sdca_cascade = sdca_cascade;
+                                       slave_intr.control_port = clear;
+                                       memcpy(slave_intr.port, &port_status,
+                                              sizeof(slave_intr.port));
+
+                                       drv->ops->interrupt_callback(slave, &slave_intr);
+                               }
+                       }
+
+                       mutex_unlock(&slave->sdw_dev_lock);
                }
 
                /* Ack interrupt */
 static int sdw_update_slave_status(struct sdw_slave *slave,
                                   enum sdw_slave_status status)
 {
-       unsigned long time;
+       int ret = 0;
 
-       if (!slave->probed) {
-               /*
-                * the slave status update is typically handled in an
-                * interrupt thread, which can race with the driver
-                * probe, e.g. when a module needs to be loaded.
-                *
-                * make sure the probe is complete before updating
-                * status.
-                */
-               time = wait_for_completion_timeout(&slave->probe_complete,
-                               msecs_to_jiffies(DEFAULT_PROBE_TIMEOUT));
-               if (!time) {
-                       dev_err(&slave->dev, "Probe not complete, timed out\n");
-                       return -ETIMEDOUT;
-               }
+       mutex_lock(&slave->sdw_dev_lock);
+
+       if (slave->probed) {
+               struct device *dev = &slave->dev;
+               struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+               if (drv->ops && drv->ops->update_status)
+                       ret = drv->ops->update_status(slave, status);
        }
 
-       if (!slave->ops || !slave->ops->update_status)
-               return 0;
+       mutex_unlock(&slave->sdw_dev_lock);
 
-       return slave->ops->update_status(slave, status);
+       return ret;
 }
 
 /**
 
        if (!id)
                return -ENODEV;
 
-       slave->ops = drv->ops;
-
        /*
         * attach to power domain but don't turn on (last arg)
         */
        if (ret)
                return ret;
 
+       mutex_lock(&slave->sdw_dev_lock);
+
        ret = drv->probe(slave, id);
        if (ret) {
                name = drv->name;
                if (!name)
                        name = drv->driver.name;
+               mutex_unlock(&slave->sdw_dev_lock);
+
                dev_err(dev, "Probe of %s failed: %d\n", name, ret);
                dev_pm_domain_detach(dev, false);
                return ret;
        }
 
        /* device is probed so let's read the properties now */
-       if (slave->ops && slave->ops->read_prop)
-               slave->ops->read_prop(slave);
+       if (drv->ops && drv->ops->read_prop)
+               drv->ops->read_prop(slave);
 
        /* init the sysfs as we have properties now */
        ret = sdw_slave_sysfs_init(slave);
                                             slave->prop.clk_stop_timeout);
 
        slave->probed = true;
-       complete(&slave->probe_complete);
+
+       /*
+        * if the probe happened after the bus was started, notify the codec driver
+        * of the current hardware status to e.g. start the initialization.
+        * Errors are only logged as warnings to avoid failing the probe.
+        */
+       if (drv->ops && drv->ops->update_status) {
+               ret = drv->ops->update_status(slave, slave->status);
+               if (ret < 0)
+                       dev_warn(dev, "%s: update_status failed with status %d\n", __func__, ret);
+       }
+
+       mutex_unlock(&slave->sdw_dev_lock);
 
        dev_dbg(dev, "probe complete\n");
 
        struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
        int ret = 0;
 
+       mutex_lock(&slave->sdw_dev_lock);
+
+       slave->probed = false;
+
        if (drv->remove)
                ret = drv->remove(slave);
 
+       mutex_unlock(&slave->sdw_dev_lock);
+
        dev_pm_domain_detach(dev, false);
 
        return ret;
 
 {
        struct sdw_slave *slave = dev_to_sdw_dev(dev);
 
+       mutex_destroy(&slave->sdw_dev_lock);
        kfree(slave);
 }
 
        init_completion(&slave->enumeration_complete);
        init_completion(&slave->initialization_complete);
        slave->dev_num = 0;
-       init_completion(&slave->probe_complete);
        slave->probed = false;
        slave->first_interrupt_done = false;
+       mutex_init(&slave->sdw_dev_lock);
 
        for (i = 0; i < SDW_MAX_PORTS; i++)
                init_completion(&slave->port_ready[i]);
 
 #include <linux/slab.h>
 #include <linux/soundwire/sdw_registers.h>
 #include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_type.h>
 #include <sound/soc.h>
 #include "bus.h"
 
                            struct sdw_prepare_ch prep_ch,
                            enum sdw_port_prep_ops cmd)
 {
-       const struct sdw_slave_ops *ops = s_rt->slave->ops;
-       int ret;
+       int ret = 0;
+       struct sdw_slave *slave = s_rt->slave;
 
-       if (ops->port_prep) {
-               ret = ops->port_prep(s_rt->slave, &prep_ch, cmd);
-               if (ret < 0) {
-                       dev_err(&s_rt->slave->dev,
-                               "Slave Port Prep cmd %d failed: %d\n",
-                               cmd, ret);
-                       return ret;
+       mutex_lock(&slave->sdw_dev_lock);
+
+       if (slave->probed) {
+               struct device *dev = &slave->dev;
+               struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+               if (drv->ops && drv->ops->port_prep) {
+                       ret = drv->ops->port_prep(slave, &prep_ch, cmd);
+                       if (ret < 0)
+                               dev_err(dev, "Slave Port Prep cmd %d failed: %d\n",
+                                       cmd, ret);
                }
        }
 
-       return 0;
+       mutex_unlock(&slave->sdw_dev_lock);
+
+       return ret;
 }
 
 static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
        struct sdw_slave_runtime *s_rt;
        struct sdw_bus *bus = m_rt->bus;
        struct sdw_slave *slave;
-       int ret = 0;
+       int ret;
 
        if (bus->ops->set_bus_conf) {
                ret = bus->ops->set_bus_conf(bus, &bus->params);
        list_for_each_entry(s_rt, &m_rt->slave_rt_list, m_rt_node) {
                slave = s_rt->slave;
 
-               if (slave->ops->bus_config) {
-                       ret = slave->ops->bus_config(slave, &bus->params);
-                       if (ret < 0) {
-                               dev_err(bus->dev, "Notify Slave: %d failed\n",
-                                       slave->dev_num);
-                               return ret;
+               mutex_lock(&slave->sdw_dev_lock);
+
+               if (slave->probed) {
+                       struct device *dev = &slave->dev;
+                       struct sdw_driver *drv = drv_to_sdw_driver(dev->driver);
+
+                       if (drv->ops && drv->ops->bus_config) {
+                               ret = drv->ops->bus_config(slave, &bus->params);
+                               if (ret < 0) {
+                                       dev_err(dev, "Notify Slave: %d failed\n",
+                                               slave->dev_num);
+                                       mutex_unlock(&slave->sdw_dev_lock);
+                                       return ret;
+                               }
                        }
                }
+
+               mutex_unlock(&slave->sdw_dev_lock);
        }
 
-       return ret;
+       return 0;
 }
 
 /**
 
  * @dev_num: Current Device Number, values can be 0 or dev_num_sticky
  * @dev_num_sticky: one-time static Device Number assigned by Bus
  * @probed: boolean tracking driver state
- * @probe_complete: completion utility to control potential races
- * on startup between driver probe/initialization and SoundWire
- * Slave state changes/implementation-defined interrupts
  * @enumeration_complete: completion utility to control potential races
  * on startup between device enumeration and read/write access to the
  * Slave device
  * for a Slave happens for the first time after enumeration
  * @is_mockup_device: status flag used to squelch errors in the command/control
  * protocol for SoundWire mockup devices
+ * @sdw_dev_lock: mutex used to protect callbacks/remove races
  */
 struct sdw_slave {
        struct sdw_slave_id id;
        u16 dev_num;
        u16 dev_num_sticky;
        bool probed;
-       struct completion probe_complete;
        struct completion enumeration_complete;
        struct completion initialization_complete;
        u32 unattach_request;
        bool first_interrupt_done;
        bool is_mockup_device;
+       struct mutex sdw_dev_lock; /* protect callbacks/remove races */
 };
 
 #define dev_to_sdw_dev(_dev) container_of(_dev, struct sdw_slave, dev)