return;
 
 reschedule:
+       /*
+        * prev_reset_trigger tracks consecutive fatal h/w errors until first
+        * heartbeat immediately post reset.
+        * If control reached here, then at least one heartbeat work has been
+        * scheduled since last reset/init cycle.
+        * So if the device is not already in reset cycle, reset the flag
+        * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR
+        * status for at least one heartbeat. From this point driver restarts
+        * tracking future consecutive fatal errors.
+        */
+       if (!(atomic_read(&hdev->in_reset)))
+               hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
        schedule_delayed_work(&hdev->work_heartbeat,
                        usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
 }
        mutex_unlock(&hdev->fpriv_list_lock);
 }
 
+static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
+{
+       u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
+       /*
+        * Record why this reset was triggered. Both the reset cause
+        * (curr_reset_cause) and the reset trigger (cur_reset_trigger) are
+        * derived from the first matching flag, checked in the order:
+        * heartbeat, TDR, fatal F/W error. A fatal F/W error is recorded
+        * as HL_RESET_CAUSE_UNKNOWN while still being tracked as its own
+        * trigger. With none of these flags set, the trigger stays
+        * HL_RESET_TRIGGER_DEFAULT.
+        *
+        * 'reset cause' is being updated here, because getting here
+        * means that it's the 1st time and the last time we're here
+        * ('in_reset' makes sure of it). This makes sure that
+        * 'reset_cause' will continue holding its 1st recorded reason!
+        */
+       if (flags & HL_RESET_HEARTBEAT) {
+               hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+               cur_reset_trigger = HL_RESET_HEARTBEAT;
+       } else if (flags & HL_RESET_TDR) {
+               hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
+               cur_reset_trigger = HL_RESET_TDR;
+       } else if (flags & HL_RESET_FW_FATAL_ERR) {
+               hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+               cur_reset_trigger = HL_RESET_FW_FATAL_ERR;
+       } else {
+               hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+       }
+
+       /*
+        * If the reset trigger is the same as the previous one,
+        * reset_trigger_repeated is set; otherwise the new trigger is
+        * remembered in prev_reset_trigger and the flag is cleared.
+        * The caller checks this flag and stops the hard reset when
+        * the repeated trigger is a fatal F/W error.
+        */
+       if (hdev->prev_reset_trigger != cur_reset_trigger) {
+               hdev->prev_reset_trigger = cur_reset_trigger;
+               hdev->reset_trigger_repeated = 0;
+       } else {
+               hdev->reset_trigger_repeated = 1;
+       }
+
+       /* If reset is due to heartbeat, device CPU is not responsive, in
+        * which case there is no point sending a PCI disable message to it.
+        *
+        * If F/W is performing the reset, no need to send it a message to disable
+        * PCI access
+        */
+       if ((flags & HL_RESET_HARD) &&
+                       !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
+               /* Disable PCI access from device F/W so it won't send
+                * us additional interrupts. We disable MSI/MSI-X at
+                * the halt_engines function and we can't have the F/W
+                * sending us interrupts after that. We need to disable
+                * the access here because if the device is marked
+                * as disabled, the message won't be sent. Also, in case
+                * of heartbeat, the device CPU is marked as disabled
+                * so this message won't be sent. A failure to send
+                * the message is only logged as a warning.
+                */
+               if (hl_fw_send_pci_access_msg(hdev,
+                               CPUCP_PACKET_DISABLE_PCI_ACCESS))
+                       dev_warn(hdev->dev,
+                               "Failed to disable PCI access by F/W\n");
+       }
+}
+
 /*
  * hl_device_reset - reset the device
  *
                if (rc)
                        return 0;
 
-               /*
-                * 'reset cause' is being updated here, because getting here
-                * means that it's the 1st time and the last time we're here
-                * ('in_reset' makes sure of it). This makes sure that
-                * 'reset_cause' will continue holding its 1st recorded reason!
-                */
-               if (flags & HL_RESET_HEARTBEAT)
-                       hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
-               else if (flags & HL_RESET_TDR)
-                       hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
-               else
-                       hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-
-               /* If reset is due to heartbeat, device CPU is no responsive in
-                * which case no point sending PCI disable message to it.
-                *
-                * If F/W is performing the reset, no need to send it a message to disable
-                * PCI access
-                */
-               if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
-                       /* Disable PCI access from device F/W so he won't send
-                        * us additional interrupts. We disable MSI/MSI-X at
-                        * the halt_engines function and we can't have the F/W
-                        * sending us interrupts after that. We need to disable
-                        * the access here because if the device is marked
-                        * disable, the message won't be send. Also, in case
-                        * of heartbeat, the device CPU is marked as disable
-                        * so this message won't be sent
-                        */
-                       if (hl_fw_send_pci_access_msg(hdev,
-                                       CPUCP_PACKET_DISABLE_PCI_ACCESS))
-                               dev_warn(hdev->dev,
-                                       "Failed to disable PCI access by F/W\n");
-               }
+               handle_reset_trigger(hdev, flags);
 
                /* This also blocks future CS/VM/JOB completion operations */
                hdev->disabled = true;
                hdev->device_cpu_disabled = false;
                hdev->hard_reset_pending = false;
 
+               if (hdev->reset_trigger_repeated &&
+                               (hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) {
+                       /* if there are 2 back-to-back fatal error resets from FW,
+                        * ensure the driver puts the device in an unusable state
+                        */
+                       dev_crit(hdev->dev,
+                               "Consecutive FW fatal errors received, stopping hard reset\n");
+                       rc = -EIO;
+                       goto out_err;
+               }
+
                if (hdev->kernel_ctx) {
                        dev_crit(hdev->dev,
                                "kernel ctx was alive during hard reset, something is terribly wrong\n");
 
 
 #define HL_STATE_DUMP_HIST_LEN         5
 
+/* Default value for the device reset trigger; not a valid trigger value */
+#define HL_RESET_TRIGGER_DEFAULT       0xFF
+
 #define OBJ_NAMES_HASH_TABLE_BITS      7 /* 1 << 7 buckets */
 #define SYNC_TO_ENGINE_HASH_TABLE_BITS 7 /* 1 << 7 buckets */
 
  * - HL_RESET_FW
  *       F/W will perform the reset. No need to ask it to reset the device. This is relevant
  *       only when running with secured f/w
+ *
+ * - HL_RESET_FW_FATAL_ERR
+ *       Set if reset is due to a fatal error from FW
  */
+
 #define HL_RESET_HARD                  (1 << 0)
 #define HL_RESET_FROM_RESET_THREAD     (1 << 1)
 #define HL_RESET_HEARTBEAT             (1 << 2)
 #define HL_RESET_TDR                   (1 << 3)
 #define HL_RESET_DEVICE_RELEASE                (1 << 4)
 #define HL_RESET_FW                    (1 << 5)
+#define HL_RESET_FW_FATAL_ERR          (1 << 6)
 
 #define HL_MAX_SOBS_PER_MONITOR        8
 
  * @supports_staged_submission: true if staged submissions are supported
  * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
  *                    triggered, and cleared after it is shared with preboot.
+ * @prev_reset_trigger: saves the previous trigger which caused a reset, overridden
+ *                      with a new value on next reset
+ * @reset_trigger_repeated: set if device reset is triggered more than once with
+ *                          same cause.
  * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
  *                         complete instead.
  * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
        u8                              device_fini_pending;
        u8                              supports_staged_submission;
        u8                              curr_reset_cause;
+       u8                              prev_reset_trigger;
+       u8                              reset_trigger_repeated;
        u8                              skip_reset_on_timeout;
        u8                              device_cpu_is_halted;
        u8                              supports_wait_for_multi_cs;
 
 {
        struct gaudi_device *gaudi = hdev->asic_specific;
        u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
+       u32 fw_fatal_err_flag = 0;
        u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
                        >> EQ_CTL_EVENT_TYPE_SHIFT);
        bool reset_required;
        case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
                gaudi_print_irq_info(hdev, event_type, true);
                gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+               fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
                goto reset_device;
 
        case GAUDI_EVENT_GIC500:
        case GAUDI_EVENT_L2_RAM_ECC:
        case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
                gaudi_print_irq_info(hdev, event_type, false);
+               fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
                goto reset_device;
 
        case GAUDI_EVENT_HBM0_SPI_0:
                gaudi_hbm_read_interrupts(hdev,
                                gaudi_hbm_event_to_dev(event_type),
                                &eq_entry->hbm_ecc_data);
+               fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
                goto reset_device;
 
        case GAUDI_EVENT_HBM0_SPI_1:
 
 reset_device:
        if (hdev->asic_prop.fw_security_enabled)
-               hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
+               hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag);
        else if (hdev->hard_reset_on_fw_events)
-               hl_device_reset(hdev, HL_RESET_HARD);
+               hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag);
        else
                hl_fw_unmask_irq(hdev, event_type);
 }