* Copyright (C) 2019 Dell Inc
  * Copyright (C) 2023-2024 Intel Corporation
  *
+ * The PCIe bandwidth controller provides a way to alter PCIe Link Speeds
+ * and notify the operating system when the Link Width or Speed changes. The
+ * notification capability is required for all Root Ports and Downstream
+ * Ports supporting Link Width wider than x1 and/or multiple Link Speeds.
+ *
  * This service port driver hooks into the Bandwidth Notification interrupt
  * watching for changes or links becoming degraded in operation. It updates
  * the cached Current Link Speed that is exposed to user space through sysfs.
 #define dev_fmt(fmt) "bwctrl: " fmt
 
 #include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bits.h>
 #include <linux/cleanup.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/rwsem.h>
 #include <linux/slab.h>
 
 /**
  * struct pcie_bwctrl_data - PCIe bandwidth controller
+ * @set_speed_mutex:   Serializes link speed changes
  * @lbms_count:                Count for LBMS (since last reset)
  */
 struct pcie_bwctrl_data {
+       struct mutex set_speed_mutex;
        atomic_t lbms_count;
 };
 
-/* Prevents port removal during LBMS count accessors */
+/*
+ * Prevent port removal during LBMS count accessors and Link Speed changes.
+ *
+ * These have to be differentiated because pcie_bwctrl_change_speed() calls
+ * pcie_retrain_link() which uses LBMS count reset accessor on success
+ * (using just one rwsem triggers "possible recursive locking detected"
+ * warning).
+ *
+ * Code that sets or clears port->link_bwctrl takes both rwsems for write,
+ * pcie_bwctrl_setspeed_rwsem first with pcie_bwctrl_lbms_rwsem nested
+ * inside it. pcie_set_target_speed() takes pcie_bwctrl_setspeed_rwsem for
+ * read around the speed change.
+ */
 static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
+static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem);
+
+/* Tell whether @speed lies within PCIE_SPEED_2_5GT..PCIE_SPEED_64_0GT. */
+static bool pcie_valid_speed(enum pci_bus_speed speed)
+{
+       if (speed < PCIE_SPEED_2_5GT)
+               return false;
+
+       return speed <= PCIE_SPEED_64_0GT;
+}
+
+/*
+ * Map a PCIe Link Speed to the corresponding PCI_EXP_LNKCTL2_TLS_* value.
+ *
+ * Warns once and returns 0 when @speed fails pcie_valid_speed().
+ */
+static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed)
+{
+       static const u8 tls_by_speed[] = {
+               [PCIE_SPEED_2_5GT] = PCI_EXP_LNKCTL2_TLS_2_5GT,
+               [PCIE_SPEED_5_0GT] = PCI_EXP_LNKCTL2_TLS_5_0GT,
+               [PCIE_SPEED_8_0GT] = PCI_EXP_LNKCTL2_TLS_8_0GT,
+               [PCIE_SPEED_16_0GT] = PCI_EXP_LNKCTL2_TLS_16_0GT,
+               [PCIE_SPEED_32_0GT] = PCI_EXP_LNKCTL2_TLS_32_0GT,
+               [PCIE_SPEED_64_0GT] = PCI_EXP_LNKCTL2_TLS_64_0GT,
+       };
+
+       /* The table is only indexed once the speed is known to be in range */
+       return WARN_ON_ONCE(!pcie_valid_speed(speed)) ? 0 : tls_by_speed[speed];
+}
+
+/*
+ * Convert a Supported Link Speeds bitmask into a Target Link Speed value
+ * by taking __fls() of the mask. @supported_speeds must not be zero.
+ */
+static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds)
+{
+       unsigned long highest_bit = __fls(supported_speeds);
+
+       return highest_bit;
+}
+
+/**
+ * pcie_bwctrl_select_speed - Select Target Link Speed
+ * @port:      PCIe Port
+ * @speed_req: Requested PCIe Link Speed
+ *
+ * Select Target Link Speed by taking into account Supported Link Speeds of
+ * both the Port and the first device on its subordinate bus (if there is
+ * one). Speeds above @speed_req are masked out and the highest remaining
+ * speed is selected. If the Port and the device have no Supported Link
+ * Speed in common, 2.5GT/s is used as the fallback.
+ *
+ * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.)
+ */
+static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req)
+{
+       struct pci_bus *bus = port->subordinate;
+       u8 desired_speeds, supported_speeds;
+       struct pci_dev *dev;
+
+       /* Bitmask covering every speed from 2.5GT/s up to @speed_req */
+       desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req),
+                                __fls(PCI_EXP_LNKCAP2_SLS_2_5GB));
+
+       supported_speeds = port->supported_speeds;
+       if (bus) {
+               /* pci_bus_sem is held while looking at the bus' device list */
+               down_read(&pci_bus_sem);
+               dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list);
+               if (dev)
+                       supported_speeds &= dev->supported_speeds;
+               up_read(&pci_bus_sem);
+       }
+
+       /*
+        * PCI_EXP_LNKCAP2_SLS_2_5GB is a bit in the Supported Link Speeds
+        * bitmask, not a Target Link Speed value. Feed the fallback through
+        * the same conversion as the normal path instead of returning the
+        * bit itself, which would be interpreted as a different speed.
+        */
+       if (!supported_speeds)
+               supported_speeds = PCI_EXP_LNKCAP2_SLS_2_5GB;
+
+       return pcie_supported_speeds2target_speed(supported_speeds & desired_speeds);
+}
+
+/*
+ * Write @target_speed into the PCI_EXP_LNKCTL2_TLS field of Link Control 2
+ * and retrain the Link with pcie_retrain_link(). After a successful
+ * retrain, pcie_update_link_speed() is called for the subordinate bus if
+ * the Port has one.
+ *
+ * Returns 0 on success, otherwise the config access error converted with
+ * pcibios_err_to_errno() or the negative value from pcie_retrain_link().
+ */
+static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt)
+{
+       int err;
+
+       err = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2,
+                                                PCI_EXP_LNKCTL2_TLS, target_speed);
+       if (err != PCIBIOS_SUCCESSFUL)
+               return pcibios_err_to_errno(err);
+
+       err = pcie_retrain_link(port, use_lt);
+       if (err < 0)
+               return err;
+
+       if (!port->subordinate)
+               return 0;
+
+       /*
+        * Update the link speed here as well so that it stays correct on
+        * platforms that have problems with notifications.
+        */
+       pcie_update_link_speed(port->subordinate);
+
+       return 0;
+}
+
+/**
+ * pcie_set_target_speed - Set downstream Link Speed for PCIe Port
+ * @port:      PCIe Port
+ * @speed_req: Requested PCIe Link Speed
+ * @use_lt:    Wait for the LT or DLLLA bit to detect the end of link training
+ *
+ * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be
+ * adjusted downwards to the best speed supported by both the Port and PCIe
+ * Device underneath it.
+ *
+ * Nothing is done if the subordinate bus already reports @speed_req as its
+ * current speed. Otherwise the change is made while holding
+ * pcie_bwctrl_setspeed_rwsem for read and, when bwctrl data is attached to
+ * @port, that data's set_speed_mutex.
+ *
+ * Return:
+ * * 0         - on success
+ * * -EINVAL   - @speed_req is not a PCIe Link Speed
+ * * -ENODEV   - @port is not controllable
+ * * -ETIMEDOUT        - changing Link Speed took too long
+ * * -EAGAIN   - Link Speed was changed but @speed_req was not achieved
+ *               (only reported when the subordinate bus has devices)
+ */
+int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
+                         bool use_lt)
+{
+       struct pci_bus *bus = port->subordinate;
+       u16 target_speed;
+       int ret;
+
+       if (WARN_ON_ONCE(!pcie_valid_speed(speed_req)))
+               return -EINVAL;
+
+       if (bus && bus->cur_bus_speed == speed_req)
+               return 0;
+
+       target_speed = pcie_bwctrl_select_speed(port, speed_req);
+
+       scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) {
+               struct pcie_bwctrl_data *data = port->link_bwctrl;
+
+               /*
+                * port->link_bwctrl is NULL during initial scan when called
+                * e.g. from the Target Speed quirk.
+                *
+                * In that case there is no set_speed_mutex to take and the
+                * speed change runs under the rwsem only.
+                */
+               if (data)
+                       mutex_lock(&data->set_speed_mutex);
+
+               ret = pcie_bwctrl_change_speed(port, target_speed, use_lt);
+
+               if (data)
+                       mutex_unlock(&data->set_speed_mutex);
+       }
+
+       /*
+        * Despite setting higher speed into the Target Link Speed, empty
+        * bus won't train to 5GT+ speeds.
+        *
+        * A speed mismatch is therefore turned into -EAGAIN only when the
+        * subordinate bus has devices on it.
+        */
+       if (!ret && bus && bus->cur_bus_speed != speed_req &&
+           !list_empty(&bus->devices))
+               ret = -EAGAIN;
+
+       return ret;
+}
 
 static void pcie_bwnotif_enable(struct pcie_device *srv)
 {
        if (!data)
                return -ENOMEM;
 
+       ret = devm_mutex_init(&srv->device, &data->set_speed_mutex);
+       if (ret)
+               return ret;
+
        ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq,
                               IRQF_SHARED, "PCIe bwctrl", srv);
        if (ret)
                return ret;
 
-       scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
-               port->link_bwctrl = no_free_ptr(data);
-               pcie_bwnotif_enable(srv);
+       scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
+               scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
+                       port->link_bwctrl = no_free_ptr(data);
+                       pcie_bwnotif_enable(srv);
+               }
        }
 
        pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
 static void pcie_bwnotif_remove(struct pcie_device *srv)
 {
        pcie_bwnotif_disable(srv->port);
-       scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
-               srv->port->link_bwctrl = NULL;
+
+       scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem)
+               scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
+                       srv->port->link_bwctrl = NULL;
 }
 
 static int pcie_bwnotif_suspend(struct pcie_device *srv)
 
 
                pci_info(dev, "broken device, retraining non-functional downstream link at 2.5GT/s\n");
 
-               lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
-               lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT;
-               pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
-               ret = pcie_retrain_link(dev, false);
+               ret = pcie_set_target_speed(dev, PCIE_SPEED_2_5GT, false);
                if (ret) {
                        pci_info(dev, "retraining failed\n");
-                       pcie_capability_write_word(dev, PCI_EXP_LNKCTL2,
-                                                  oldlnkctl2);
-                       pcie_retrain_link(dev, true);
+                       pcie_set_target_speed(dev, PCIE_LNKCTL2_TLS2SPEED(oldlnkctl2),
+                                             true);
                        return ret;
                }
 
 
                pci_info(dev, "removing 2.5GT/s downstream link speed restriction\n");
                pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
-               lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
-               lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS;
-               pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
-               ret = pcie_retrain_link(dev, false);
+               ret = pcie_set_target_speed(dev, PCIE_LNKCAP_SLS2SPEED(lnkcap), false);
                if (ret) {
                        pci_info(dev, "retraining failed\n");
                        return ret;