www.infradead.org Git - nvme.git/commitdiff
accel/habanalabs: abort device reset for consecutive heartbeat failures
author: Tomer Tayar <ttayar@habana.ai>
Sun, 24 Dec 2023 22:28:36 +0000 (00:28 +0200)
committer: Oded Gabbay <ogabbay@kernel.org>
Mon, 26 Feb 2024 07:30:40 +0000 (09:30 +0200)
The mechanism for aborting device reset on consecutive fatal errors
currently applies only to fatal errors that are reported by FW.
A non-responsive FW that causes consecutive heartbeat failures is also
considered fatal, so add heartbeat failures to this mechanism as well,
to avoid recurring device resets in such a case.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/accel/habanalabs/common/device.c

index 5c46826e365929e7fb7c11b599d31153ce6052a0..cf004baf5e6213fbe28f784194208d0e9c550759 100644 (file)
@@ -1769,14 +1769,16 @@ kill_processes:
                hdev->device_cpu_disabled = false;
                hdev->reset_info.hard_reset_pending = false;
 
+               /*
+                * Put the device in an unusable state if there are 2 back to back resets due to
+                * fatal errors.
+                */
                if (hdev->reset_info.reset_trigger_repeated &&
-                               (hdev->reset_info.prev_reset_trigger ==
-                                               HL_DRV_RESET_FW_FATAL_ERR)) {
-                       /* if there 2 back to back resets from FW,
-                        * ensure driver puts the driver in a unusable state
-                        */
+                               (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
+                                               hdev->reset_info.prev_reset_trigger ==
+                                                               HL_DRV_RESET_HEARTBEAT)) {
                        dev_crit(hdev->dev,
-                               "%s Consecutive FW fatal errors received, stopping hard reset\n",
+                               "%s Consecutive fatal errors, stopping hard reset\n",
                                dev_name(&(hdev)->pdev->dev));
                        rc = -EIO;
                        goto out_err;