struct mlx5_core_dev *dev = container_of(ent->cmd, struct mlx5_core_dev,
                                                 cmd);
 
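+       /* The completion EQE may have been missed, e.g. due to a lost
+        * interrupt. Poll the command EQ before treating this as a real
+        * timeout.
+        */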
+       mlx5_cmd_eq_recover(dev);
+
+       /* Maybe it was already handled by the EQ recovery flow? */
+       if (!test_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state)) {
+               mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, recovered after timeout\n", ent->idx,
+                              mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in));
+               goto out; /* phew, already handled */
+       }
+
        ent->ret = -ETIMEDOUT;
-       mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
-                      mlx5_command_str(msg_to_opcode(ent->in)),
-                      msg_to_opcode(ent->in));
+       mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, timeout. Will cause a leak of a command resource\n",
+                      ent->idx, mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in));
        mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
+
+out:
        cmd_ent_put(ent); /* for the cmd_ent_get() taken when scheduling the delayed work */
 }
 
        }
 }
 
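+/* Grace period to re-wait for the completion after polling the command EQ. */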
+enum {
+       MLX5_CMD_TIMEOUT_RECOVER_MSEC   = 5 * 1000,
+};
+
+static void wait_func_handle_exec_timeout(struct mlx5_core_dev *dev,
+                                         struct mlx5_cmd_work_ent *ent)
+{
+       unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_RECOVER_MSEC);
+
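+       /* Poll the command EQ in case the completion EQE was missed. */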
+       mlx5_cmd_eq_recover(dev);
+
+       /* Re-wait on the ent->done after executing the recovery flow. If the
+        * recovery flow (or any other recovery flow running simultaneously)
+        * has recovered an EQE, it should cause the entry to be completed by
+        * the command interface.
+        */
+       if (wait_for_completion_timeout(&ent->done, timeout)) {
+               mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) recovered after timeout\n", ent->idx,
+                              mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in));
+               return;
+       }
+
+       mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) No done completion\n", ent->idx,
+                      mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in));
+
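+       /* No completion even after the recovery poll; force-complete the
+        * entry as timed out.
+        */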
+       ent->ret = -ETIMEDOUT;
+       mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
+}
+
 static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
 {
        unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC);
                ent->ret = -ECANCELED;
                goto out_err;
        }
-       if (cmd->mode == CMD_MODE_POLLING || ent->polling) {
+       if (cmd->mode == CMD_MODE_POLLING || ent->polling)
                wait_for_completion(&ent->done);
-       } else if (!wait_for_completion_timeout(&ent->done, timeout)) {
-               ent->ret = -ETIMEDOUT;
-               mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
-       }
+       else if (!wait_for_completion_timeout(&ent->done, timeout))
+               wait_func_handle_exec_timeout(dev, ent);
 
 out_err:
        err = ent->ret;
 
        return count_eqe;
 }
 
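+/* The command EQ is polled from the timeout-recovery flow (process context)
+ * in addition to being served from the IRQ handler, so accesses to the EQ
+ * must be serialized. The irqsave variant is only needed when we are not
+ * already in hard IRQ context.
+ */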
+static void mlx5_eq_async_int_lock(struct mlx5_eq_async *eq, unsigned long *flags)
+       __acquires(&eq->lock)
+{
+       if (in_irq())
+               spin_lock(&eq->lock);
+       else
+               spin_lock_irqsave(&eq->lock, *flags);
+}
+
+static void mlx5_eq_async_int_unlock(struct mlx5_eq_async *eq, unsigned long *flags)
+       __releases(&eq->lock)
+{
+       if (in_irq())
+               spin_unlock(&eq->lock);
+       else
+               spin_unlock_irqrestore(&eq->lock, *flags);
+}
+
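+/* "action" values passed to mlx5_eq_async_int(): the regular IRQ path
+ * discards the polled EQE count, the recovery path asks for it back.
+ */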
+enum async_eq_nb_action {
+       ASYNC_EQ_IRQ_HANDLER = 0,
+       ASYNC_EQ_RECOVER = 1,
+};
+
 static int mlx5_eq_async_int(struct notifier_block *nb,
                             unsigned long action, void *data)
 {
        struct mlx5_eq_table *eqt;
        struct mlx5_core_dev *dev;
        struct mlx5_eqe *eqe;
+       unsigned long flags;
        int num_eqes = 0;
 
        dev = eq->dev;
        eqt = dev->priv.eq_table;
 
+       mlx5_eq_async_int_lock(eq_async, &flags);
+
        eqe = next_eqe_sw(eq);
        if (!eqe)
                goto out;
 
 out:
        eq_update_ci(eq, 1);
+       mlx5_eq_async_int_unlock(eq_async, &flags);
 
-       return 0;
+       return unlikely(action == ASYNC_EQ_RECOVER) ? num_eqes : 0;
+}
+
+void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev)
+{
+       struct mlx5_eq_async *eq = &dev->priv.eq_table->cmd_eq;
+       int eqes;
+
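+       /* Invoke the EQ handler directly, in recovery mode, to reap any
+        * EQEs left behind by a missed interrupt.
+        */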
+       eqes = mlx5_eq_async_int(&eq->irq_nb, ASYNC_EQ_RECOVER, NULL);
+       if (eqes)
+               mlx5_core_warn(dev, "Recovered %d EQEs on cmd_eq\n", eqes);
 }
 
 static void init_eq_buf(struct mlx5_eq *eq)
        int err;
 
        eq->irq_nb.notifier_call = mlx5_eq_async_int;
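+       /* Serializes IRQ-handler and recovery-flow polling of this EQ. */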
+       spin_lock_init(&eq->lock);
 
        err = create_async_eq(dev, &eq->core, param);
        if (err) {