instance->skip_heartbeat_timer_del = 1;
        }
 
+       /*
+        * Create and start watchdog thread which will monitor
+        * controller state every 1 sec and trigger OCR when
+        * it enters fault state
+        */
+       if (instance->adapter_type != MFI_SERIES)
+               if (megasas_fusion_start_watchdog(instance) != SUCCESS)
+                       goto fail_start_watchdog;
+
        return 0;
 
+fail_start_watchdog:
+       if (instance->requestorId && !instance->skip_heartbeat_timer_del)
+               del_timer_sync(&instance->sriov_heartbeat_timer);
 fail_get_ld_pd_list:
        instance->instancet->disable_intr(instance);
 fail_init_adapter:
        instance->disableOnlineCtrlReset = 1;
        instance->UnevenSpanSupport = 0;
 
-       if (instance->adapter_type != MFI_SERIES) {
+       if (instance->adapter_type != MFI_SERIES)
                INIT_WORK(&instance->work_init, megasas_fusion_ocr_wq);
-               INIT_WORK(&instance->crash_init, megasas_fusion_crash_dump_wq);
-       } else {
+       else
                INIT_WORK(&instance->work_init, process_fw_state_change_wq);
-       }
 }
 
 /**
        if (instance->requestorId && !instance->skip_heartbeat_timer_del)
                del_timer_sync(&instance->sriov_heartbeat_timer);
 
+       /* Stop the FW fault detection watchdog */
+       if (instance->adapter_type != MFI_SERIES)
+               megasas_fusion_stop_watchdog(instance);
+
        megasas_flush_cache(instance);
        megasas_shutdown_controller(instance, MR_DCMD_HIBERNATE_SHUTDOWN);
 
        if (megasas_start_aen(instance))
                dev_err(&instance->pdev->dev, "Start AEN failed\n");
 
+       /* Re-launch FW fault watchdog */
+       if (instance->adapter_type != MFI_SERIES)
+               if (megasas_fusion_start_watchdog(instance) != SUCCESS)
+                       goto fail_start_watchdog;
+
        return 0;
 
+fail_start_watchdog:
+       if (instance->requestorId && !instance->skip_heartbeat_timer_del)
+               del_timer_sync(&instance->sriov_heartbeat_timer);
 fail_init_mfi:
        megasas_free_ctrl_dma_buffers(instance);
        megasas_free_ctrl_mem(instance);
        if (instance->requestorId && !instance->skip_heartbeat_timer_del)
                del_timer_sync(&instance->sriov_heartbeat_timer);
 
+       /* Stop the FW fault detection watchdog */
+       if (instance->adapter_type != MFI_SERIES)
+               megasas_fusion_stop_watchdog(instance);
+
        if (instance->fw_crash_state != UNAVAILABLE)
                megasas_free_host_crash_buffer(instance);
        scsi_remove_host(instance->host);
 
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/vmalloc.h>
+#include <linux/workqueue.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
 static void megasas_free_reply_fusion(struct megasas_instance *instance);
 static inline
 void megasas_configure_queue_sizes(struct megasas_instance *instance);
+static void megasas_fusion_crash_dump(struct megasas_instance *instance);
 
 /**
  * megasas_check_same_4gb_region -     check if allocation
        return 1;
 }
 
+/**
+ * megasas_fault_detect_work   -       Periodic FW fault poll worker
+ * @work:      work_struct embedded in the delayed work
+ *             instance->fw_fault_work
+ *
+ * Reads the FW status register. When the state is MFI_STATE_FAULT the
+ * worker either collects the crash dump (driver and application crash
+ * dump support are both enabled and the DMA-done bit is set) or, as
+ * long as instance->unload is 0, resets the controller through
+ * megasas_reset_fusion(). Afterwards the work re-queues itself on
+ * instance->fw_fault_work_q. It is not re-queued when the reset fails
+ * or when that workqueue pointer is NULL.
+ */
+static void
+megasas_fault_detect_work(struct work_struct *work)
+{
+       struct megasas_instance *instance =
+               container_of(work, struct megasas_instance,
+                            fw_fault_work.work);
+       u32 fw_state, dma_done, reset_status;
+
+       fw_state = instance->instancet->read_fw_status_reg(instance->reg_set) &
+                       MFI_STATE_MASK;
+
+       /* Controller is not faulted: only the next poll has to be armed */
+       if (fw_state != MFI_STATE_FAULT)
+               goto rearm;
+
+       dma_done = instance->instancet->read_fw_status_reg(
+                       instance->reg_set) & MFI_STATE_DMADONE;
+
+       if (instance->crash_dump_drv_support &&
+           instance->crash_dump_app_support && dma_done) {
+               /* DMA bit is set, start collecting the crash dump */
+               megasas_fusion_crash_dump(instance);
+       } else if (instance->unload == 0) {
+               reset_status = megasas_reset_fusion(instance->host, 0);
+               if (reset_status != SUCCESS) {
+                       /* Reset failed: leave without queueing another poll */
+                       dev_err(&instance->pdev->dev,
+                               "Failed from %s %d, do not re-arm timer\n",
+                               __func__, __LINE__);
+                       return;
+               }
+       }
+
+rearm:
+       /* The workqueue pointer is NULL once the watchdog has been stopped */
+       if (instance->fw_fault_work_q)
+               queue_delayed_work(instance->fw_fault_work_q,
+                       &instance->fw_fault_work,
+                       msecs_to_jiffies(MEGASAS_WATCHDOG_THREAD_INTERVAL));
+}
+
+/**
+ * megasas_fusion_start_watchdog -     Start the FW fault polling work
+ * @instance:  Adapter soft state
+ *
+ * Creates a single-threaded workqueue named "poll_megasas<host_no>_status"
+ * and queues the first run of megasas_fault_detect_work() on it after
+ * MEGASAS_WATCHDOG_THREAD_INTERVAL milliseconds. Does nothing when the
+ * workqueue already exists.
+ *
+ * Return: SUCCESS if the workqueue already existed or was started,
+ * FAILED if the workqueue could not be created.
+ */
+int
+megasas_fusion_start_watchdog(struct megasas_instance *instance)
+{
+       struct workqueue_struct *wq;
+
+       /* Fault WQ already started, nothing more to do */
+       if (instance->fw_fault_work_q)
+               return SUCCESS;
+
+       snprintf(instance->fault_handler_work_q_name,
+                sizeof(instance->fault_handler_work_q_name),
+                "poll_megasas%d_status", instance->host->host_no);
+
+       INIT_DELAYED_WORK(&instance->fw_fault_work, megasas_fault_detect_work);
+
+       wq = create_singlethread_workqueue(
+                       instance->fault_handler_work_q_name);
+       instance->fw_fault_work_q = wq;
+       if (!wq) {
+               dev_err(&instance->pdev->dev, "Failed from %s %d\n",
+                       __func__, __LINE__);
+               return FAILED;
+       }
+
+       /* Arm the first poll; the worker re-queues itself afterwards */
+       queue_delayed_work(wq, &instance->fw_fault_work,
+                          msecs_to_jiffies(MEGASAS_WATCHDOG_THREAD_INTERVAL));
+
+       return SUCCESS;
+}
+
+/**
+ * megasas_fusion_stop_watchdog -      Stop the FW fault polling work
+ * @instance:  Adapter soft state
+ *
+ * Clears instance->fw_fault_work_q before cancelling, so that
+ * megasas_fault_detect_work() finds a NULL pointer and does not
+ * re-queue itself. The delayed work is then cancelled synchronously
+ * (the queue is flushed if nothing was pending) and the workqueue is
+ * destroyed. Does nothing when the workqueue pointer is already NULL.
+ */
+void
+megasas_fusion_stop_watchdog(struct megasas_instance *instance)
+{
+       struct workqueue_struct *fault_wq = instance->fw_fault_work_q;
+
+       if (!fault_wq)
+               return;
+
+       instance->fw_fault_work_q = NULL;
+
+       if (!cancel_delayed_work_sync(&instance->fw_fault_work))
+               flush_workqueue(fault_wq);
+
+       destroy_workqueue(fault_wq);
+}
+
 /**
  * map_cmd_status -    Maps FW cmd status to OS cmd status
  * @cmd :              Pointer to cmd
 {
        struct megasas_irq_context *irq_context = devp;
        struct megasas_instance *instance = irq_context->instance;
-       u32 mfiStatus, fw_state, dma_state;
+       u32 mfiStatus;
 
        if (instance->mask_interrupts)
                return IRQ_NONE;
                return IRQ_HANDLED;
        }
 
-       if (!complete_cmd_fusion(instance, irq_context->MSIxIndex)) {
-               instance->instancet->clear_intr(instance->reg_set);
-               /* If we didn't complete any commands, check for FW fault */
-               fw_state = instance->instancet->read_fw_status_reg(
-                       instance->reg_set) & MFI_STATE_MASK;
-               dma_state = instance->instancet->read_fw_status_reg
-                       (instance->reg_set) & MFI_STATE_DMADONE;
-               if (instance->crash_dump_drv_support &&
-                       instance->crash_dump_app_support) {
-                       /* Start collecting crash, if DMA bit is done */
-                       if ((fw_state == MFI_STATE_FAULT) && dma_state)
-                               schedule_work(&instance->crash_init);
-                       else if (fw_state == MFI_STATE_FAULT) {
-                               if (instance->unload == 0)
-                                       schedule_work(&instance->work_init);
-                       }
-               } else if (fw_state == MFI_STATE_FAULT) {
-                       dev_warn(&instance->pdev->dev, "Iop2SysDoorbellInt"
-                              "for scsi%d\n", instance->host->host_no);
-                       if (instance->unload == 0)
-                               schedule_work(&instance->work_init);
-               }
-       }
-
-       return IRQ_HANDLED;
+       return complete_cmd_fusion(instance, irq_context->MSIxIndex);
 }
 
 /**
        return retval;
 }
 
-/* Fusion Crash dump collection work queue */
-void  megasas_fusion_crash_dump_wq(struct work_struct *work)
+/* Fusion Crash dump collection */
+void  megasas_fusion_crash_dump(struct megasas_instance *instance)
 {
-       struct megasas_instance *instance =
-               container_of(work, struct megasas_instance, crash_init);
        u32 status_reg;
        u8 partial_copy = 0;
+       int wait = 0;
 
 
        status_reg = instance->instancet->read_fw_status_reg(instance->reg_set);
                        "allocated: %d\n", instance->drv_buf_alloc);
        }
 
-       /*
-        * Driver has allocated max buffers, which can be allocated
-        * and FW has more crash dump data, then driver will
-        * ignore the data.
-        */
-       if (instance->drv_buf_index >= (instance->drv_buf_alloc)) {
-               dev_info(&instance->pdev->dev, "Driver is done copying "
-                       "the buffer: %d\n", instance->drv_buf_alloc);
-               status_reg |= MFI_STATE_CRASH_DUMP_DONE;
-               partial_copy = 1;
-       } else {
-               memcpy(instance->crash_buf[instance->drv_buf_index],
-                       instance->crash_dump_buf, CRASH_DMA_BUF_SIZE);
-               instance->drv_buf_index++;
-               status_reg &= ~MFI_STATE_DMADONE;
+       while (!(status_reg & MFI_STATE_CRASH_DUMP_DONE) &&
+              (wait < MEGASAS_WATCHDOG_WAIT_COUNT)) {
+               if (!(status_reg & MFI_STATE_DMADONE)) {
+                       /*
+                        * Next crash dump buffer is not yet DMA'd by FW
+                        * Check after 10ms. Wait for 1 second for FW to
+                        * post the next buffer. If not bail out.
+                        */
+                       wait++;
+                       msleep(MEGASAS_WAIT_FOR_NEXT_DMA_MSECS);
+                       status_reg = instance->instancet->read_fw_status_reg(
+                                       instance->reg_set);
+                       continue;
+               }
+
+               wait = 0;
+               if (instance->drv_buf_index >= instance->drv_buf_alloc) {
+                       dev_info(&instance->pdev->dev,
+                                "Driver is done copying the buffer: %d\n",
+                                instance->drv_buf_alloc);
+                       status_reg |= MFI_STATE_CRASH_DUMP_DONE;
+                       partial_copy = 1;
+                       break;
+               } else {
+                       memcpy(instance->crash_buf[instance->drv_buf_index],
+                              instance->crash_dump_buf, CRASH_DMA_BUF_SIZE);
+                       instance->drv_buf_index++;
+                       status_reg &= ~MFI_STATE_DMADONE;
+               }
+
+               writel(status_reg, &instance->reg_set->outbound_scratch_pad);
+               readl(&instance->reg_set->outbound_scratch_pad);
+
+               msleep(MEGASAS_WAIT_FOR_NEXT_DMA_MSECS);
+               status_reg = instance->instancet->read_fw_status_reg(
+                               instance->reg_set);
        }
 
        if (status_reg & MFI_STATE_CRASH_DUMP_DONE) {
                readl(&instance->reg_set->outbound_scratch_pad);
                if (!partial_copy)
                        megasas_reset_fusion(instance->host, 0);
-       } else {
-               writel(status_reg, &instance->reg_set->outbound_scratch_pad);
-               readl(&instance->reg_set->outbound_scratch_pad);
        }
 }