--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/drm_cache.h>
+#include <drm/drm_device.h>
+#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+
+#include "aie2_msg_priv.h"
+#include "aie2_pci.h"
+#include "amdxdna_mailbox.h"
+#include "amdxdna_pci_drv.h"
+
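+/*
+ * Per-column async error event. Each event owns a slice of the shared DMA
+ * buffer that the firmware fills with error details, plus a work item that
+ * decodes the payload once the firmware response arrives.
+ */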
+struct async_event {
+       struct amdxdna_dev_hdl          *ndev;
+       struct async_event_msg_resp     resp;
+       struct workqueue_struct         *wq;
+       struct work_struct              work;
+       u8                              *buf;
+       dma_addr_t                      addr;
+       u32                             size;
+};
+
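+/*
+ * All async error events for a device: one event per column, backed by a
+ * single non-coherent DMA buffer and an ordered workqueue.
+ */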
+struct async_events {
+       struct workqueue_struct         *wq;
+       u8                              *buf;
+       dma_addr_t                      addr;
+       u32                             size;
+       u32                             event_cnt;
+       struct async_event              event[] __counted_by(event_cnt);
+};
+
+/*
+ * The enum, structs and lookup tables below are ported from the XAIE util
+ * header file.
+ *
+ * This data is defined by the AIE device and is used to decode error messages
+ * from the device.
+ */
+
+enum aie_module_type {
+       AIE_MEM_MOD = 0,
+       AIE_CORE_MOD,
+       AIE_PL_MOD,
+};
+
+enum aie_error_category {
+       AIE_ERROR_SATURATION = 0,
+       AIE_ERROR_FP,
+       AIE_ERROR_STREAM,
+       AIE_ERROR_ACCESS,
+       AIE_ERROR_BUS,
+       AIE_ERROR_INSTRUCTION,
+       AIE_ERROR_ECC,
+       AIE_ERROR_LOCK,
+       AIE_ERROR_DMA,
+       AIE_ERROR_MEM_PARITY,
+       /* Unknown is not from XAIE; added for better categorization */
+       AIE_ERROR_UNKNOWN,
+};
+
+/* Don't pack unless the XAIE side changes */
+struct aie_error {
+       __u8                    row;
+       __u8                    col;
+       __u32                   mod_type;
+       __u8                    event_id;
+};
+
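+/* Error payload layout written by firmware into the async event buffer */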
+struct aie_err_info {
+       u32                     err_cnt;
+       u32                     ret_code;
+       u32                     rsvd;
+       struct aie_error        payload[] __counted_by(err_cnt);
+};
+
+struct aie_event_category {
+       u8                      event_id;
+       enum aie_error_category category;
+};
+
+#define EVENT_CATEGORY(id, cat) { id, cat }
+static const struct aie_event_category aie_ml_mem_event_cat[] = {
+       EVENT_CATEGORY(88U,  AIE_ERROR_ECC),
+       EVENT_CATEGORY(90U,  AIE_ERROR_ECC),
+       EVENT_CATEGORY(91U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(92U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(93U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(94U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(95U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(96U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(97U,  AIE_ERROR_DMA),
+       EVENT_CATEGORY(98U,  AIE_ERROR_DMA),
+       EVENT_CATEGORY(99U,  AIE_ERROR_DMA),
+       EVENT_CATEGORY(100U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
+};
+
+static const struct aie_event_category aie_ml_core_event_cat[] = {
+       EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(58U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
+       EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(62U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(64U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
+       EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
+       EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(72U, AIE_ERROR_BUS),
+};
+
+static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
+       EVENT_CATEGORY(130U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(132U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(133U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(134U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(138U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
+};
+
+static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
+       EVENT_CATEGORY(64U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(67U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(68U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(69U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(70U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(71U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(72U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(73U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
+};
+
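+/*
+ * Map a raw AIE event ID to an error category. The lookup table depends on
+ * the module type; for memory modules, row 1 (mem tile) uses a separate
+ * table from the other rows.
+ */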
+static enum aie_error_category
+aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
+{
+       const struct aie_event_category *lut;
+       int num_entry;
+       int i;
+
+       switch (mod_type) {
+       case AIE_PL_MOD:
+               lut = aie_ml_shim_tile_event_cat;
+               num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
+               break;
+       case AIE_CORE_MOD:
+               lut = aie_ml_core_event_cat;
+               num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
+               break;
+       case AIE_MEM_MOD:
+               if (row == 1) {
+                       lut = aie_ml_mem_tile_event_cat;
+                       num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
+               } else {
+                       lut = aie_ml_mem_event_cat;
+                       num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
+               }
+               break;
+       default:
+               return AIE_ERROR_UNKNOWN;
+       }
+
+       for (i = 0; i < num_entry; i++) {
+               if (event_id != lut[i].event_id)
+                       continue;
+
+               return lut[i].category;
+       }
+
+       return AIE_ERROR_UNKNOWN;
+}
+
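+/*
+ * Log every reported error and return a bitmap of the columns that reported
+ * at least one error (bit n set means column n).
+ */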
+static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
+{
+       struct aie_error *errs = err_info;
+       u32 err_col = 0; /* assumes the AIE array has no more than 32 columns */
+       int i;
+
+       /* Get err column bitmap */
+       for (i = 0; i < num_err; i++) {
+               struct aie_error *err = &errs[i];
+               enum aie_error_category cat;
+
+               cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
+               XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
+                        err->row, err->col, err->mod_type,
+                        err->event_id, cat);
+
+               if (err->col >= 32) {
+                       XDNA_WARN(ndev->xdna, "Invalid column number %d", err->col);
+                       break;
+               }
+
+               err_col |= (1 << err->col);
+       }
+
+       return err_col;
+}
+
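+/*
+ * Mailbox callback invoked when the firmware reports an async error. Copy the
+ * response into the event and defer decoding to the ordered workqueue.
+ */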
+static int aie2_error_async_cb(void *handle, const u32 *data, size_t size)
+{
+       struct async_event_msg_resp *resp;
+       struct async_event *e = handle;
+
+       if (data) {
+               resp = (struct async_event_msg_resp *)data;
+               e->resp.type = resp->type;
+               /* Write status last; no locking needed against the worker */
+               wmb();
+               e->resp.status = resp->status;
+       }
+       queue_work(e->wq, &e->work);
+       return 0;
+}
+
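+/*
+ * Flush the event buffer out of CPU caches so the device sees consistent
+ * data, then register the async event message with the firmware.
+ */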
+static int aie2_error_event_send(struct async_event *e)
+{
+       drm_clflush_virt_range(e->buf, e->size); /* device can access */
+       return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
+                                           aie2_error_async_cb);
+}
+
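+/*
+ * Work handler: decode the error payload written by the firmware, log the
+ * affected columns, then re-register the event so future errors keep being
+ * reported.
+ */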
+static void aie2_error_worker(struct work_struct *err_work)
+{
+       struct aie_err_info *info;
+       struct amdxdna_dev *xdna;
+       struct async_event *e;
+       u32 max_err;
+       u32 err_col;
+
+       e = container_of(err_work, struct async_event, work);
+
+       xdna = e->ndev->xdna;
+
+       if (e->resp.status == MAX_AIE2_STATUS_CODE)
+               return;
+
+       e->resp.status = MAX_AIE2_STATUS_CODE;
+
+       print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
+                            e->buf, 0x100, false);
+
+       info = (struct aie_err_info *)e->buf;
+       XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);
+
+       max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
+       if (unlikely(info->err_cnt > max_err)) {
+               WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
+               return;
+       }
+       err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
+       if (!err_col) {
+               XDNA_WARN(xdna, "Did not get error column");
+               return;
+       }
+
+       mutex_lock(&xdna->dev_lock);
+       /* Re-send this event to firmware */
+       if (aie2_error_event_send(e))
+               XDNA_WARN(xdna, "Unable to register async event");
+       mutex_unlock(&xdna->dev_lock);
+}
+
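+/*
+ * Register one async error event per column with the firmware. The caller
+ * must hold dev_lock.
+ */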
+int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
+{
+       struct amdxdna_dev *xdna = ndev->xdna;
+       struct async_event *e;
+       int i, ret;
+
+       drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+       for (i = 0; i < ndev->async_events->event_cnt; i++) {
+               e = &ndev->async_events->event[i];
+               ret = aie2_error_event_send(e);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
+{
+       struct amdxdna_dev *xdna = ndev->xdna;
+       struct async_events *events;
+
+       events = ndev->async_events;
+
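+       /*
+        * The error worker takes dev_lock when re-sending events, so drop the
+        * lock while the workqueue is flushed and destroyed to avoid deadlock.
+        */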
+       mutex_unlock(&xdna->dev_lock);
+       destroy_workqueue(events->wq);
+       mutex_lock(&xdna->dev_lock);
+
+       dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+                            events->addr, DMA_FROM_DEVICE);
+       kfree(events);
+}
+
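+/*
+ * Allocate one async error event per column, plus the shared DMA buffer and
+ * workqueue backing them. The events are registered with the firmware
+ * separately via aie2_error_async_events_send().
+ */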
+int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
+{
+       struct amdxdna_dev *xdna = ndev->xdna;
+       u32 total_col = ndev->total_col;
+       u32 total_size = ASYNC_BUF_SIZE * total_col;
+       struct async_events *events;
+       int i, ret;
+
+       events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
+       if (!events)
+               return -ENOMEM;
+
+       events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
+                                           DMA_FROM_DEVICE, GFP_KERNEL);
+       if (!events->buf) {
+               ret = -ENOMEM;
+               goto free_events;
+       }
+       events->size = total_size;
+       events->event_cnt = total_col;
+
+       events->wq = alloc_ordered_workqueue("async_wq", 0);
+       if (!events->wq) {
+               ret = -ENOMEM;
+               goto free_buf;
+       }
+
+       for (i = 0; i < events->event_cnt; i++) {
+               struct async_event *e = &events->event[i];
+               u32 offset = i * ASYNC_BUF_SIZE;
+
+               e->ndev = ndev;
+               e->wq = events->wq;
+               e->buf = &events->buf[offset];
+               e->addr = events->addr + offset;
+               e->size = ASYNC_BUF_SIZE;
+               e->resp.status = MAX_AIE2_STATUS_CODE;
+               INIT_WORK(&e->work, aie2_error_worker);
+       }
+
+       ndev->async_events = events;
+
+       XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
+                events->event_cnt, events->size);
+       return 0;
+
+free_buf:
+       dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+                            events->addr, DMA_FROM_DEVICE);
+free_events:
+       kfree(events);
+       return ret;
+}