if (event != ACPI_NOTIFY_DEVICE_WAKE || !pci_dev)
                return;
 
+       if (pci_dev->current_state == PCI_D3cold) {
+               pci_wakeup_event(pci_dev);
+               pm_runtime_resume(&pci_dev->dev);
+               return;
+       }
+
        if (!pci_dev->pm_cap || !pci_dev->pme_support
             || pci_check_pme_status(pci_dev)) {
                if (pci_dev->pme_poll)
 
 static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev)
 {
-       int acpi_state;
+       int acpi_state, d_max;
 
-       acpi_state = acpi_pm_device_sleep_state(&pdev->dev, NULL,
-                                               ACPI_STATE_D3);
+       if (pdev->no_d3cold)
+               d_max = ACPI_STATE_D3_HOT;
+       else
+               d_max = ACPI_STATE_D3_COLD;
+       acpi_state = acpi_pm_device_sleep_state(&pdev->dev, NULL, d_max);
        if (acpi_state < 0)
                return PCI_POWER_ERROR;
 
 
 static int acpi_pci_run_wake(struct pci_dev *dev, bool enable)
 {
-       if (dev->pme_interrupt)
+       /*
+        * Per PCI Express Base Specification Revision 2.0 section
+        * 5.3.3.2 Link Wakeup, platform support is needed to power
+        * the main link back on when waking up from D3cold, even if
+        * the device has PME support for D3cold.
+        */
+       if (dev->pme_interrupt && !dev->runtime_d3cold)
                return 0;
 
        if (!acpi_pm_device_run_wake(&dev->dev, enable))
 
        if (!pm || !pm->runtime_suspend)
                return -ENOSYS;
 
+       pci_dev->no_d3cold = false;
        error = pm->runtime_suspend(dev);
        suspend_report_result(pm->runtime_suspend, error);
        if (error)
                return error;
+       if (!pci_dev->d3cold_allowed)
+               pci_dev->no_d3cold = true;
 
        pci_fixup_device(pci_fixup_suspend, pci_dev);
 
 
 static int pci_pm_runtime_resume(struct device *dev)
 {
+       int rc;
        struct pci_dev *pci_dev = to_pci_dev(dev);
        const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
        __pci_enable_wake(pci_dev, PCI_D0, true, false);
        pci_fixup_device(pci_fixup_resume, pci_dev);
 
-       return pm->runtime_resume(dev);
+       rc = pm->runtime_resume(dev);
+
+       pci_dev->runtime_d3cold = false;
+
+       return rc;
 }
 
 static int pci_pm_runtime_idle(struct device *dev)
 
 #include <linux/pci-aspm.h>
 #include <linux/slab.h>
 #include <linux/vgaarb.h>
+#include <linux/pm_runtime.h>
 #include "pci.h"
 
 static int sysfs_initialized;  /* = 0 */
 
 #endif
 
+#if defined(CONFIG_PM_RUNTIME) && defined(CONFIG_ACPI)
+/*
+ * sysfs store handler for the "d3cold_allowed" attribute.
+ *
+ * Parses @buf with strict_strtoul() (base 0), stores the truth value of
+ * the result in pdev->d3cold_allowed and then calls pm_runtime_resume()
+ * on the device.
+ *
+ * Returns @count on success, or -EINVAL if @buf cannot be parsed.
+ */
+static ssize_t d3cold_allowed_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t count)
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+       unsigned long val;
+
+       if (strict_strtoul(buf, 0, &val) < 0)
+               return -EINVAL;
+
+       pdev->d3cold_allowed = !!val;   /* collapse any non-zero value to 1 */
+       pm_runtime_resume(dev); /* presumably so the next runtime suspend re-evaluates the setting; result is ignored */
+
+       return count;
+}
+
+/*
+ * sysfs show handler for the "d3cold_allowed" attribute.
+ *
+ * Prints pdev->d3cold_allowed followed by a newline into @buf and
+ * returns whatever sprintf() reports.
+ */
+static ssize_t d3cold_allowed_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+       return sprintf (buf, "%u\n", pdev->d3cold_allowed);
+}
+#endif
+
 struct device_attribute pci_dev_attrs[] = {
        __ATTR_RO(resource),
        __ATTR_RO(vendor),
 #ifdef CONFIG_HOTPLUG
        __ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store),
        __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_rescan_store),
+#endif
+#if defined(CONFIG_PM_RUNTIME) && defined(CONFIG_ACPI)
+       __ATTR(d3cold_allowed, 0644, d3cold_allowed_show, d3cold_allowed_store),
 #endif
        __ATTR_NULL,
 };
 
                dev_info(&dev->dev, "Refused to change power state, "
                        "currently in D%d\n", dev->current_state);
 
-       /* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT
+       /*
+        * According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT
         * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning
         * from D3hot to D0 _may_ perform an internal reset, thereby
         * going to "D0 Uninitialized" rather than "D0 Initialized".
        if (dev->pm_cap) {
                u16 pmcsr;
 
+               /*
+                * Configuration space is not accessible for a device
+                * in D3cold, so just keep or set D3cold for safety.
+                */
+               if (dev->current_state == PCI_D3cold)
+                       return;
+               if (state == PCI_D3cold) {
+                       dev->current_state = PCI_D3cold;
+                       return;
+               }
                pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
                dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
        } else {
  */
 static void __pci_start_power_transition(struct pci_dev *dev, pci_power_t state)
 {
-       if (state == PCI_D0)
+       if (state == PCI_D0) {
                pci_platform_power_transition(dev, PCI_D0);
+               /*
+                * Mandatory power management transition delays, see
+                * PCI Express Base Specification Revision 2.0 Section
+                * 6.6.1: Conventional Reset.  Do not delay for
+                * devices powered on/off by the corresponding bridge,
+                * because the delay was already done for the bridge.
+                */
+               if (dev->runtime_d3cold) {
+                       msleep(dev->d3cold_delay);
+                       /*
+                        * When powering on a bridge from D3cold, the
+                        * whole hierarchy may be powered on into
+                        * D0uninitialized state; resume the devices
+                        * below it to give them a chance to suspend
+                        * again.
+                        */
+                       pci_wakeup_bus(dev->subordinate);
+               }
+       }
+}
+
+/**
+ * __pci_dev_set_current_state - Set current state of a PCI device
+ * @dev: Device to handle
+ * @data: pointer to state to be set
+ *
+ * Overwrites dev->current_state with the pci_power_t that @data points
+ * to.  Used as the pci_walk_bus() callback by
+ * __pci_bus_set_current_state().  Always returns 0.
+ */
+static int __pci_dev_set_current_state(struct pci_dev *dev, void *data)
+{
+       pci_power_t state = *(pci_power_t *)data;
+
+       dev->current_state = state;
+       return 0;
+}
+
+/**
+ * __pci_bus_set_current_state - Walk given bus and set current state of devices
+ * @bus: Top bus of the subtree to walk.
+ * @state: state to be set
+ *
+ * Does nothing when @bus is NULL; otherwise every device visited by
+ * pci_walk_bus() gets its current_state set to @state.
+ */
+static void __pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state)
+{
+       if (bus)
+               pci_walk_bus(bus, __pci_dev_set_current_state, &state);
 }
 
 /**
  */
 int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state)
 {
-       return state >= PCI_D0 ?
-                       pci_platform_power_transition(dev, state) : -EINVAL;
+       int ret;
+
+       if (state < PCI_D0)
+               return -EINVAL;
+       ret = pci_platform_power_transition(dev, state);
+       /*
+        * Powering off a bridge may power off the whole hierarchy below
+        * it, so record D3cold for every device on the subordinate bus.
+        */
+       if (!ret && state == PCI_D3cold)
+               __pci_bus_set_current_state(dev->subordinate, PCI_D3cold);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(__pci_complete_power_transition);
 
        int error;
 
        /* bound the state we're entering */
-       if (state > PCI_D3hot)
-               state = PCI_D3hot;
+       if (state > PCI_D3cold)
+               state = PCI_D3cold;
        else if (state < PCI_D0)
                state = PCI_D0;
        else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
 
        /* This device is quirked not to be put into D3, so
           don't put it in D3 */
-       if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
+       if (state >= PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
                return 0;
 
-       error = pci_raw_set_power_state(dev, state);
+       /*
+        * To put a device into D3cold, first put it into D3hot in the
+        * native way, then put it into D3cold with the platform ops.
+        */
+       error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
+                                       PCI_D3hot : state);
 
        if (!__pci_complete_power_transition(dev, state))
                error = 0;
                pci_walk_bus(bus, pci_pme_wakeup, (void *)true);
 }
 
+/**
+ * pci_wakeup - Wake up a PCI device
+ * @pci_dev: Device to handle.
+ * @ign: ignored parameter
+ *
+ * Reports a wakeup event for @pci_dev via pci_wakeup_event() and
+ * requests a runtime resume of it via pm_request_resume().  Used as the
+ * pci_walk_bus() callback by pci_wakeup_bus().  Always returns 0.
+ */
+static int pci_wakeup(struct pci_dev *pci_dev, void *ign)
+{
+       pci_wakeup_event(pci_dev);
+       pm_request_resume(&pci_dev->dev);
+       return 0;
+}
+
+/**
+ * pci_wakeup_bus - Walk given bus and wake up devices on it
+ * @bus: Top bus of the subtree to walk.
+ *
+ * Does nothing when @bus is NULL; otherwise pci_wakeup() is invoked for
+ * every device visited by pci_walk_bus().
+ */
+void pci_wakeup_bus(struct pci_bus *bus)
+{
+       if (bus)
+               pci_walk_bus(bus, pci_wakeup, NULL);
+}
+
 /**
  * pci_pme_capable - check the capability of PCI device to generate PME#
  * @dev: PCI device to handle.
        if (target_state == PCI_POWER_ERROR)
                return -EIO;
 
+       /* D3cold during system suspend/hibernate is not supported */
+       if (target_state > PCI_D3hot)
+               target_state = PCI_D3hot;
+
        pci_enable_wake(dev, target_state, device_may_wakeup(&dev->dev));
 
        error = pci_set_power_state(dev, target_state);
        if (target_state == PCI_POWER_ERROR)
                return -EIO;
 
+       dev->runtime_d3cold = target_state == PCI_D3cold;
+
        __pci_enable_wake(dev, target_state, true, pci_dev_run_wake(dev));
 
        error = pci_set_power_state(dev, target_state);
 
-       if (error)
+       if (error) {
                __pci_enable_wake(dev, target_state, true, false);
+               dev->runtime_d3cold = false;
+       }
 
        return error;
 }
 
        dev->pm_cap = pm;
        dev->d3_delay = PCI_PM_D3_WAIT;
+       dev->d3cold_delay = PCI_PM_D3COLD_WAIT;
 
        dev->d1_support = false;
        dev->d2_support = false;
 
 extern void pci_disable_enabled_device(struct pci_dev *dev);
 extern int pci_finish_runtime_suspend(struct pci_dev *dev);
 extern int __pci_pme_wakeup(struct pci_dev *dev, void *ign);
+extern void pci_wakeup_bus(struct pci_bus *bus);
 extern void pci_pm_init(struct pci_dev *dev);
 extern void platform_pci_wakeup_init(struct pci_dev *dev);
 extern void pci_allocate_cap_save_buffers(struct pci_dev *dev);
 
 }
 
 #ifdef CONFIG_PM_RUNTIME
-static int pcie_port_runtime_pm(struct device *dev)
+/* D3cold constraints accumulated over the devices visited by a bus walk. */
+struct d3cold_info {
+       bool no_d3cold; /* true once any visited device has no_d3cold set */
+       unsigned int d3cold_delay;      /* largest d3cold_delay seen so far */
+};
+
+/*
+ * pci_walk_bus() callback that folds one device into the struct
+ * d3cold_info pointed to by @data: d3cold_delay becomes the larger of
+ * the accumulated value and pdev->d3cold_delay, and no_d3cold is set to
+ * true once any device has no_d3cold set.  Always returns 0.
+ */
+static int pci_dev_d3cold_info(struct pci_dev *pdev, void *data)
+{
+       struct d3cold_info *info = data;
+
+       info->d3cold_delay = max_t(unsigned int, pdev->d3cold_delay,
+                                  info->d3cold_delay);
+       if (pdev->no_d3cold)
+               info->no_d3cold = true;
+       return 0;
+}
+
+/*
+ * Runtime-suspend callback of the PCIe port driver.
+ *
+ * Walks the port's subordinate bus and copies the aggregated result into
+ * the port itself: pdev->no_d3cold and pdev->d3cold_delay.  Always
+ * returns 0.
+ */
+static int pcie_port_runtime_suspend(struct device *dev)
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+       /*
+        * Starting values for the walk.
+        * NOTE(review): the delay is seeded with PCI_PM_D3_WAIT rather
+        * than PCI_PM_D3COLD_WAIT - confirm that this is intended.
+        */
+       struct d3cold_info d3cold_info = {
+               .no_d3cold      = false,
+               .d3cold_delay   = PCI_PM_D3_WAIT,
+       };
+
+       /*
+        * If any subordinate device disables D3cold, we should not put
+        * the port into D3cold.  The D3cold delay of the port should be
+        * the maximum of those of all subordinate devices.
+        */
+       pci_walk_bus(pdev->subordinate, pci_dev_d3cold_info, &d3cold_info);
+       pdev->no_d3cold = d3cold_info.no_d3cold;
+       pdev->d3cold_delay = d3cold_info.d3cold_delay;
+       return 0;
+}
+
+/*
+ * Runtime-resume callback of the PCIe port driver; does no work and
+ * always returns 0.
+ */
+static int pcie_port_runtime_resume(struct device *dev)
 {
        return 0;
 }
 #else
-#define pcie_port_runtime_pm   NULL
+#define pcie_port_runtime_suspend      NULL
+#define pcie_port_runtime_resume       NULL
 #endif
 
 static const struct dev_pm_ops pcie_portdrv_pm_ops = {
        .poweroff       = pcie_port_device_suspend,
        .restore        = pcie_port_device_resume,
        .resume_noirq   = pcie_port_resume_noirq,
-       .runtime_suspend = pcie_port_runtime_pm,
-       .runtime_resume = pcie_port_runtime_pm,
+       .runtime_suspend = pcie_port_runtime_suspend,
+       .runtime_resume = pcie_port_runtime_resume,
 };
 
 #define PCIE_PORTDRV_PM_OPS    (&pcie_portdrv_pm_ops)
 
        return pci_power_names[1 + (int) state];
 }
 
-#define PCI_PM_D2_DELAY        200
-#define PCI_PM_D3_WAIT 10
-#define PCI_PM_BUS_WAIT        50
+#define PCI_PM_D2_DELAY                200
+#define PCI_PM_D3_WAIT         10
+#define PCI_PM_D3COLD_WAIT     100
+#define PCI_PM_BUS_WAIT                50
 
 /** The pci_channel state describes connectivity between the CPU and
  *  the pci device.  If some PCI bus between here and the pci device
        unsigned int    pme_poll:1;     /* Poll device's PME status bit */
        unsigned int    d1_support:1;   /* Low power state D1 is supported */
        unsigned int    d2_support:1;   /* Low power state D2 is supported */
-       unsigned int    no_d1d2:1;      /* Only allow D0 and D3 */
+       unsigned int    no_d1d2:1;      /* D1 and D2 are forbidden */
+       unsigned int    no_d3cold:1;    /* D3cold is forbidden */
+       unsigned int    d3cold_allowed:1;       /* D3cold is allowed by user */
        unsigned int    mmio_always_on:1;       /* disallow turning off io/mem
                                                   decoding during bar sizing */
        unsigned int    wakeup_prepared:1;
+       unsigned int    runtime_d3cold:1;       /* whether go through runtime
+                                                  D3cold, not set for devices
+                                                  powered on/off by the
+                                                  corresponding bridge */
        unsigned int    d3_delay;       /* D3->D0 transition time in ms */
+       unsigned int    d3cold_delay;   /* D3cold->D0 transition time in ms */
 
 #ifdef CONFIG_PCIEASPM
        struct pcie_link_state  *link_state;    /* ASPM link state. */