struct mlx4_priv *priv = mlx4_priv(dev);
 
        if (readl(priv->catas_err.map)) {
-               dump_err_buf(dev);
-
-               mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+               /* If the device is off-line, we cannot try to recover it */
+               if (pci_channel_offline(dev->pdev))
+                       mod_timer(&priv->catas_err.timer,
+                                 round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+               else {
+                       dump_err_buf(dev);
+                       mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
 
-               if (internal_err_reset) {
-                       spin_lock(&catas_lock);
-                       list_add(&priv->catas_err.list, &catas_list);
-                       spin_unlock(&catas_lock);
+                       if (internal_err_reset) {
+                               spin_lock(&catas_lock);
+                               list_add(&priv->catas_err.list, &catas_list);
+                               spin_unlock(&catas_lock);
 
-                       queue_work(mlx4_wq, &catas_work);
+                               queue_work(mlx4_wq, &catas_work);
+                       }
                }
        } else
                mod_timer(&priv->catas_err.timer,
        list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
                struct pci_dev *pdev = priv->dev.pdev;
 
+               /* If the device is off-line, we cannot reset it */
+               if (pci_channel_offline(pdev))
+                       continue;
+
                ret = mlx4_restart_one(priv->dev.pdev);
                /* 'priv' now is not valid */
                if (ret)
 
 
 static int cmd_pending(struct mlx4_dev *dev)
 {
-       u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+       u32 status;
+
+       if (pci_channel_offline(dev->pdev))
+               return -EIO;
+
+       status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
 
        return (status & swab32(1 << HCR_GO_BIT)) ||
                (mlx4_priv(dev)->cmd.toggle ==
 
        mutex_lock(&cmd->hcr_mutex);
 
+       if (pci_channel_offline(dev->pdev)) {
+               /*
+                * Device is going through error recovery
+                * and cannot accept commands.
+                */
+               ret = -EIO;
+               goto out;
+       }
+
        end = jiffies;
        if (event)
                end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
 
        while (cmd_pending(dev)) {
+               if (pci_channel_offline(dev->pdev)) {
+                       /*
+                        * Device is going through error recovery
+                        * and cannot accept commands.
+                        */
+                       ret = -EIO;
+                       goto out;
+               }
+
                if (time_after_eq(jiffies, end)) {
                        mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
                        goto out;
 
        down(&priv->cmd.poll_sem);
 
+       if (pci_channel_offline(dev->pdev)) {
+               /*
+                * Device is going through error recovery
+                * and cannot accept commands.
+                */
+               err = -EIO;
+               goto out;
+       }
+
        err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
                            in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
        if (err)
                goto out;
 
        end = msecs_to_jiffies(timeout) + jiffies;
-       while (cmd_pending(dev) && time_before(jiffies, end))
+       while (cmd_pending(dev) && time_before(jiffies, end)) {
+               if (pci_channel_offline(dev->pdev)) {
+                       /*
+                        * Device is going through error recovery
+                        * and cannot accept commands.
+                        */
+                       err = -EIO;
+                       goto out;
+               }
+
                cond_resched();
+       }
 
        if (cmd_pending(dev)) {
                err = -ETIMEDOUT;
               int out_is_imm, u32 in_modifier, u8 op_modifier,
               u16 op, unsigned long timeout, int native)
 {
+       if (pci_channel_offline(dev->pdev))
+               return -EIO;
+
        if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
                if (mlx4_priv(dev)->cmd.use_events)
                        return mlx4_cmd_wait(dev, in_param, out_param,
 
        void __iomem *owner;
        u32 ret;
 
+       if (pci_channel_offline(dev->pdev))
+               return -EIO;
+
        owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
                        MLX4_OWNER_SIZE);
        if (!owner) {
 {
        void __iomem *owner;
 
+       if (pci_channel_offline(dev->pdev))
+               return;
+
        owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
                        MLX4_OWNER_SIZE);
        if (!owner) {
 
 MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
 
+static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
+                                             pci_channel_state_t state)
+{
+       mlx4_remove_one(pdev);
+       /*
+        * error_detected callback (registered through mlx4_err_handler).
+        * mlx4_remove_one() has already been called above regardless of
+        * 'state'.  A permanent failure is answered with DISCONNECT; every
+        * other channel state answers NEED_RESET, which is expected to lead
+        * to mlx4_pci_slot_reset() -- NOTE(review): confirm against the PCI
+        * error recovery documentation.
+        */
+       return state == pci_channel_io_perm_failure ?
+               PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
+{
+       int ret = __mlx4_init_one(pdev, NULL);
+       /*
+        * slot_reset callback (registered through mlx4_err_handler).
+        * __mlx4_init_one() is invoked with a NULL second argument; a zero
+        * return is reported as RECOVERED, any non-zero return as
+        * DISCONNECT.  Presumably this re-creates what
+        * mlx4_pci_err_detected() removed -- TODO confirm.
+        */
+       return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+static struct pci_error_handlers mlx4_err_handler = {   /* hooked up via mlx4_driver.err_handler */
+       .error_detected = mlx4_pci_err_detected,        /* removes the device, picks DISCONNECT or NEED_RESET */
+       .slot_reset     = mlx4_pci_slot_reset,          /* calls __mlx4_init_one(), reports RECOVERED or DISCONNECT */
+};
+
 static struct pci_driver mlx4_driver = {
        .name           = DRV_NAME,
        .id_table       = mlx4_pci_table,
        .probe          = mlx4_init_one,
-       .remove         = __devexit_p(mlx4_remove_one)
+       .remove         = __devexit_p(mlx4_remove_one),
+       .err_handler    = &mlx4_err_handler,    /* error_detected / slot_reset callbacks */
 };
 
 static int __init mlx4_verify_params(void)