]> www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
rasdaemon: Add support for the CXL AER uncorrectable errors
authorShiju Jose <shiju.jose@huawei.com>
Fri, 17 Mar 2023 12:51:02 +0000 (12:51 +0000)
committerMauro Carvalho Chehab <mchehab@kernel.org>
Sun, 30 Apr 2023 08:43:28 +0000 (09:43 +0100)
Add support to log and record the CXL AER uncorrectable errors.

The corresponding Kernel patches are here:
https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t
https://lore.kernel.org/lkml/63eeb2a8c9e3f_32d612941f@dwillia2-xfh.jf.intel.com.notmuch/T/

It was found that the header log data has to be converted to the
big-endian format to be stored correctly in the SQLite DB, likely
because the SQLite database uses big-endian storage.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
ras-cxl-handler.c
ras-cxl-handler.h
ras-events.c
ras-events.h
ras-record.c
ras-record.h
ras-report.c
ras-report.h

index cb23ba236abd3a9913e5641a1e25de8b6f6de0d3..0f2c9e45460db97a514d55151bbd77deb45fe6e4 100644 (file)
@@ -21,6 +21,7 @@
 #include "ras-record.h"
 #include "ras-logger.h"
 #include "ras-report.h"
+#include <endian.h>
 
 /* Poison List: Payload out flags */
 #define CXL_POISON_FLAG_MORE            BIT(0)
@@ -200,3 +201,153 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
 
        return 0;
 }
+
+/*
+ * CXL AER Errors
+ *
+ * Bit masks for the uncorrectable error status word. They are matched
+ * against the "status" and "first_error" fields of the
+ * cxl:cxl_aer_uncorrectable_error trace event through the cxl_aer_ue[]
+ * table below. No mask is defined here for bits 12 and 13.
+ */
+
+#define CXL_AER_UE_CACHE_DATA_PARITY   BIT(0)
+#define CXL_AER_UE_CACHE_ADDR_PARITY   BIT(1)
+#define CXL_AER_UE_CACHE_BE_PARITY     BIT(2)
+#define CXL_AER_UE_CACHE_DATA_ECC      BIT(3)
+#define CXL_AER_UE_MEM_DATA_PARITY     BIT(4)
+#define CXL_AER_UE_MEM_ADDR_PARITY     BIT(5)
+#define CXL_AER_UE_MEM_BE_PARITY       BIT(6)
+#define CXL_AER_UE_MEM_DATA_ECC                BIT(7)
+#define CXL_AER_UE_REINIT_THRESH       BIT(8)
+#define CXL_AER_UE_RSVD_ENCODE         BIT(9)
+#define CXL_AER_UE_POISON              BIT(10)
+#define CXL_AER_UE_RECV_OVERFLOW       BIT(11)
+#define CXL_AER_UE_INTERNAL_ERR                BIT(14)
+#define CXL_AER_UE_IDE_TX_ERR          BIT(15)
+#define CXL_AER_UE_IDE_RX_ERR          BIT(16)
+
+struct cxl_error_list {        /* one row of an error-bit decode table */
+       uint32_t bit;           /* mask tested against a status word */
+       const char *error;      /* text printed when that mask is set */
+};
+
+/*
+ * Decode table for the CXL AER uncorrectable error bits: each row pairs
+ * one CXL_AER_UE_* mask with the text printed when that bit is set.
+ * Rows are printed in table order by decode_cxl_error_status().
+ */
+static const struct cxl_error_list cxl_aer_ue[] = {
+       { .bit = CXL_AER_UE_CACHE_DATA_PARITY, .error = "Cache Data Parity Error" },
+       { .bit = CXL_AER_UE_CACHE_ADDR_PARITY, .error = "Cache Address Parity Error" },
+       { .bit = CXL_AER_UE_CACHE_BE_PARITY,   .error = "Cache Byte Enable Parity Error" },
+       { .bit = CXL_AER_UE_CACHE_DATA_ECC,    .error = "Cache Data ECC Error" },
+       { .bit = CXL_AER_UE_MEM_DATA_PARITY,   .error = "Memory Data Parity Error" },
+       { .bit = CXL_AER_UE_MEM_ADDR_PARITY,   .error = "Memory Address Parity Error" },
+       { .bit = CXL_AER_UE_MEM_BE_PARITY,     .error = "Memory Byte Enable Parity Error" },
+       { .bit = CXL_AER_UE_MEM_DATA_ECC,      .error = "Memory Data ECC Error" },
+       { .bit = CXL_AER_UE_REINIT_THRESH,     .error = "REINIT Threshold Hit" },
+       { .bit = CXL_AER_UE_RSVD_ENCODE,       .error = "Received Unrecognized Encoding" },
+       { .bit = CXL_AER_UE_POISON,            .error = "Received Poison From Peer" },
+       { .bit = CXL_AER_UE_RECV_OVERFLOW,     .error = "Receiver Overflow" },
+       { .bit = CXL_AER_UE_INTERNAL_ERR,      .error = "Component Specific Error" },
+       { .bit = CXL_AER_UE_IDE_TX_ERR,        .error = "IDE Tx Error" },
+       { .bit = CXL_AER_UE_IDE_RX_ERR,        .error = "IDE Rx Error" },
+};
+
+/*
+ * Print the description of every error whose bit is set in @status.
+ *
+ * @s:              trace sequence receiving the text
+ * @status:         status word to decode
+ * @cxl_error_list: decode table to look the bits up in
+ * @num_elems:      number of rows in @cxl_error_list
+ *
+ * Each match is written as a single-quoted string followed by a space,
+ * in table order. Returns 0 on success and -1 as soon as one write to
+ * @s fails.
+ */
+static int decode_cxl_error_status(struct trace_seq *s, uint32_t status,
+                                  const struct cxl_error_list *cxl_error_list,
+                                  uint8_t num_elems)
+{
+       const struct cxl_error_list *entry = cxl_error_list;
+       const struct cxl_error_list *const end = cxl_error_list + num_elems;
+
+       for (; entry < end; entry++) {
+               /* Skip rows whose bit is clear in the status word */
+               if (!(status & entry->bit))
+                       continue;
+               if (trace_seq_printf(s, "\'%s\' ", entry->error) <= 0)
+                       return -1;
+       }
+
+       return 0;
+}
+
+int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
+                                struct tep_record *record,
+                                struct tep_event *event, void *context)
+{
+       int len, i;
+       unsigned long long val;
+       time_t now;
+       struct tm *tm;
+       struct ras_events *ras = context;
+       struct ras_cxl_aer_ue_event ev;
+
+       memset(&ev, 0, sizeof(ev));
+       now = record->ts / user_hz + ras->uptime_diff;
+       tm = localtime(&now);
+       if (tm)
+               strftime(ev.timestamp, sizeof(ev.timestamp),
+                        "%Y-%m-%d %H:%M:%S %z", tm);
+       else
+               strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
+       if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+               return -1;
+
+       ev.memdev = tep_get_field_raw(s, event, "memdev",
+                                     record, &len, 1);
+       if (!ev.memdev)
+               return -1;
+       if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0)
+               return -1;
+
+       ev.host = tep_get_field_raw(s, event, "host",
+                                   record, &len, 1);
+       if (!ev.host)
+               return -1;
+       if (trace_seq_printf(s, "host:%s ", ev.host) <= 0)
+               return -1;
+
+       if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0)
+               return -1;
+       ev.serial = val;
+       if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0)
+               return -1;
+
+       if (tep_get_field_val(s, event, "status", record, &val, 1) < 0)
+               return -1;
+       ev.error_status = val;
+
+       if (trace_seq_printf(s, "error status:") <= 0)
+               return -1;
+       if (decode_cxl_error_status(s, ev.error_status,
+                                   cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0)
+               return -1;
+
+       if (tep_get_field_val(s,  event, "first_error", record, &val, 1) < 0)
+               return -1;
+       ev.first_error = val;
+
+       if (trace_seq_printf(s, "first error:") <= 0)
+               return -1;
+       if (decode_cxl_error_status(s, ev.first_error,
+                                   cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0)
+               return -1;
+
+       ev.header_log = tep_get_field_raw(s, event, "header_log",
+                                         record, &len, 1);
+       if (!ev.header_log)
+               return -1;
+       if (trace_seq_printf(s, "header log:\n") <= 0)
+               return -1;
+       for (i = 0; i < CXL_HEADERLOG_SIZE_U32; i++) {
+               if (trace_seq_printf(s, "%08x ", ev.header_log[i]) <= 0)
+                       break;
+               if ((i > 0) && ((i % 20) == 0))
+                       if (trace_seq_printf(s, "\n") <= 0)
+                               break;
+               /* Convert header log data to the big-endian format because
+                * the SQLite database seems uses the big-endian storage.
+                */
+               ev.header_log[i] = htobe32(ev.header_log[i]);
+       }
+       if (i < CXL_HEADERLOG_SIZE_U32)
+               return -1;
+
+       /* Insert data into the SGBD */
+#ifdef HAVE_SQLITE3
+       ras_store_cxl_aer_ue_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+       /* Report event to ABRT */
+       ras_report_cxl_aer_ue_event(ras, &ev);
+#endif
+
+       return 0;
+}
index 84d5cc6c8342848e7d312a37d155bd8210f90ff6..35efadd03e469140599de84453195c3c8dc7949d 100644 (file)
@@ -21,4 +21,8 @@
 int ras_cxl_poison_event_handler(struct trace_seq *s,
                                 struct tep_record *record,
                                 struct tep_event *event, void *context);
+
+int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
+                                struct tep_record *record,
+                                struct tep_event *event, void *context);
 #endif
index f95844a53c731b45906c58fe6fff764a16ca3685..5d73df1f9c122281d1722212e44cab315b563ff6 100644 (file)
@@ -246,6 +246,7 @@ int toggle_ras_mc_event(int enable)
 
 #ifdef HAVE_CXL
        rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable);
+       rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable);
 #endif
 
 free_ras:
@@ -992,6 +993,14 @@ int handle_ras_events(int record_events)
        else
                log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
                    "cxl", "cxl_poison");
+
+       rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_uncorrectable_error",
+                              ras_cxl_aer_ue_event_handler, NULL, CXL_AER_UE_EVENT);
+       if (!rc)
+               num_events++;
+       else
+               log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+                   "cxl", "cxl_aer_uncorrectable_error");
 #endif
 
        if (!num_events) {
index 1ef3ecdd207d0864ae09c509f4806e623c53128a..4acbe57c9d5bc16010f10cdedacb37e7270bded1 100644 (file)
@@ -40,6 +40,7 @@ enum {
        DISKERROR_EVENT,
        MF_EVENT,
        CXL_POISON_EVENT,
+       CXL_AER_UE_EVENT,
        NR_EVENTS
 };
 
index c31baa023564d6c31edd4f828934d02700d36f9d..97a2a3741e92362500c46f3b1ea605542630a01a 100644 (file)
@@ -622,6 +622,57 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve
 
        return rc;
 }
+
+/*
+ * Table and functions to handle cxl:cxl_aer_uncorrectable_error
+ *
+ * cxl_aer_ue_event_fields lists the columns of the "cxl_aer_ue_event"
+ * table. ras_store_cxl_aer_ue_event() binds its values by position:
+ * parameters 1 (timestamp) to 7 (header_log) follow the order of the
+ * columns listed after "id", and "id" itself is never bound.
+ */
+static const struct db_fields cxl_aer_ue_event_fields[] = {
+       { .name = "id",                 .type = "INTEGER PRIMARY KEY" },
+       { .name = "timestamp",          .type = "TEXT" },
+       { .name = "memdev",             .type = "TEXT" },
+       { .name = "host",               .type = "TEXT" },
+       { .name = "serial",             .type = "INTEGER" },
+       { .name = "error_status",       .type = "INTEGER" },
+       { .name = "first_error",        .type = "INTEGER" },
+       { .name = "header_log",         .type = "BLOB" },
+};
+
+static const struct db_table_descriptor cxl_aer_ue_event_tab = {       /* passed to ras_mc_create_table() and ras_mc_prepare_stmt() */
+       .name = "cxl_aer_ue_event",
+       .fields = cxl_aer_ue_event_fields,
+       .num_fields = ARRAY_SIZE(cxl_aer_ue_event_fields),
+};
+
+/*
+ * Insert one CXL AER uncorrectable error event into the database.
+ *
+ * @ras: daemon state; its db_priv holds the prepared insert statement
+ * @ev:  event to record
+ *
+ * Returns 0 without doing anything when there is no database handle or
+ * no prepared statement; otherwise returns the result of sqlite3_reset().
+ * Step and reset failures are only logged, and the final "register
+ * inserted" message is logged whether or not they failed.
+ */
+int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev)
+{
+       struct sqlite3_priv *priv = ras->db_priv;
+       sqlite3_stmt *stmt;
+       int rc;
+
+       if (!priv)
+               return 0;
+       stmt = priv->stmt_cxl_aer_ue_event;
+       if (!stmt)
+               return 0;
+       log(TERM, LOG_INFO, "cxl_aer_ue_event store: %p\n", stmt);
+
+       /* Parameter indexes follow the column order of cxl_aer_ue_event_fields */
+       sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+       sqlite3_bind_text(stmt, 2, ev->memdev, -1, NULL);
+       sqlite3_bind_text(stmt, 3, ev->host, -1, NULL);
+       sqlite3_bind_int64(stmt, 4, ev->serial);
+       sqlite3_bind_int(stmt, 5, ev->error_status);
+       sqlite3_bind_int(stmt, 6, ev->first_error);
+       sqlite3_bind_blob(stmt, 7, ev->header_log, CXL_HEADERLOG_SIZE, NULL);
+
+       rc = sqlite3_step(stmt);
+       if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+               log(TERM, LOG_ERR,
+                   "Failed to do cxl_aer_ue_event step on sqlite: error = %d\n", rc);
+
+       /* Reset so the same prepared statement can be reused for the next event */
+       rc = sqlite3_reset(stmt);
+       if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+               log(TERM, LOG_ERR,
+                   "Failed reset cxl_aer_ue_event on sqlite: error = %d\n",
+                   rc);
+       log(TERM, LOG_INFO, "register inserted at db\n");
+
+       return rc;
+}
 #endif
 
 /*
@@ -973,6 +1024,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
                if (rc != SQLITE_OK)
                        goto error;
        }
+
+       rc = ras_mc_create_table(priv, &cxl_aer_ue_event_tab);
+       if (rc == SQLITE_OK) {
+               rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ue_event,
+                                        &cxl_aer_ue_event_tab);
+               if (rc != SQLITE_OK)
+                       goto error;
+       }
 #endif
 
        ras->db_priv = priv;
@@ -1102,6 +1161,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
                            "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n",
                            cpu, rc);
        }
+
+       if (priv->stmt_cxl_aer_ue_event) {
+               rc = sqlite3_finalize(priv->stmt_cxl_aer_ue_event);
+               if (rc != SQLITE_OK)
+                       log(TERM, LOG_ERR,
+                           "cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n",
+                           cpu, rc);
+       }
 #endif
 
        rc = sqlite3_close_v2(db);
index fd152159706cff530929f738ec885e0af2eff53b..f11985fe1a09d04bf3ff98b1aff87b1bfca7608a 100644 (file)
@@ -130,6 +130,20 @@ struct ras_cxl_poison_event {
        char overflow_ts[64];
 };
 
+#define SZ_512                          0x200  /* 512 */
+#define CXL_HEADERLOG_SIZE              SZ_512 /* header log size in bytes; used as the blob length when storing */
+#define CXL_HEADERLOG_SIZE_U32          (SZ_512 / sizeof(uint32_t))    /* the same size counted in 32-bit words */
+
+struct ras_cxl_aer_ue_event {  /* one decoded cxl:cxl_aer_uncorrectable_error event */
+       char timestamp[64];     /* formatted as "%Y-%m-%d %H:%M:%S %z" by the handler */
+       const char *memdev;     /* "memdev" trace field; points into the trace record */
+       const char *host;       /* "host" trace field; points into the trace record */
+       uint64_t serial;        /* "serial" trace field */
+       uint32_t error_status;  /* "status" trace field, CXL_AER_UE_* bits */
+       uint32_t first_error;   /* "first_error" trace field, decoded with the same bit table */
+       uint32_t *header_log;   /* "header_log" trace field; handler reads CXL_HEADERLOG_SIZE_U32 words */
+};
+
 struct ras_mc_event;
 struct ras_aer_event;
 struct ras_extlog_event;
@@ -140,6 +154,7 @@ struct devlink_event;
 struct diskerror_event;
 struct ras_mf_event;
 struct ras_cxl_poison_event;
+struct ras_cxl_aer_ue_event;
 
 #ifdef HAVE_SQLITE3
 
@@ -174,6 +189,7 @@ struct sqlite3_priv {
 #endif
 #ifdef HAVE_CXL
        sqlite3_stmt    *stmt_cxl_poison_event;
+       sqlite3_stmt    *stmt_cxl_aer_ue_event;
 #endif
 };
 
@@ -203,6 +219,7 @@ int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev);
 int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
 int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
 int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
+int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
 
 #else
 static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -217,6 +234,7 @@ static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink
 static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
 static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
 static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
+static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
 
 #endif
 
index 3daecc0698ad3ae2a3e5dff64ddb4f0fd46599d8..2ebdc80d31a07523f2bb9d12c1eae74ff27fdd4d 100644 (file)
@@ -371,6 +371,32 @@ static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event
        return 0;
 }
 
+/*
+ * Append the "BACKTRACE=" section describing a CXL AER uncorrectable
+ * error event to @buf.
+ *
+ * @buf: NUL-terminated report buffer the section is appended to
+ * @ev:  event to describe
+ *
+ * Returns 0 on success, -1 when @buf or @ev is NULL.
+ */
+static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event *ev)
+{
+       char bt_buf[MAX_BACKTRACE_SIZE];
+
+       if (!buf || !ev)
+               return -1;
+
+       /*
+        * serial is a uint64_t, so it is printed with %llx and an explicit
+        * cast: %lx does not match the argument where long is 32 bits wide.
+        * snprintf() keeps the text within bt_buf.
+        */
+       snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+                "timestamp=%s\n"
+                "memdev=%s\n"
+                "host=%s\n"
+                "serial=0x%llx\n"
+                "error_status=%u\n"
+                "first_error=%u\n",
+                ev->timestamp,
+                ev->memdev,
+                ev->host,
+                (unsigned long long)ev->serial,
+                ev->error_status,
+                ev->first_error);
+
+       strcat(buf, bt_buf);
+
+       return 0;
+}
+
 static int commit_report_backtrace(int sockfd, int type, void *ev){
        char buf[MAX_BACKTRACE_SIZE];
        char *pbuf = buf;
@@ -411,6 +437,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
        case CXL_POISON_EVENT:
                rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev);
                break;
+       case CXL_AER_UE_EVENT:
+               rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev);
+               break;
        default:
                return -1;
        }
@@ -863,3 +892,47 @@ cxl_poison_fail:
        else
                return -1;
 }
+
+/*
+ * Report a CXL AER uncorrectable error event over the report socket.
+ *
+ * Sends, in order: the basic report fields, the event backtrace, an
+ * "ANALYZER=" item and a "REASON=" item. Each item is written with its
+ * terminating NUL byte.
+ *
+ * @ras: daemon state (not used here)
+ * @ev:  event to report
+ *
+ * Returns 0 when everything was sent, -1 if the socket could not be set
+ * up or any step failed. The socket is closed before returning.
+ */
+int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev)
+{
+       char buf[MAX_MESSAGE_SIZE];
+       size_t len;
+       int sockfd;
+       int done = 0;
+       int rc;
+
+       memset(buf, 0, sizeof(buf));
+
+       sockfd = setup_report_socket();
+       if (sockfd < 0)
+               return -1;
+
+       rc = commit_report_basic(sockfd);
+       if (rc < 0)
+               goto cxl_aer_ue_fail;
+
+       rc = commit_report_backtrace(sockfd, CXL_AER_UE_EVENT, ev);
+       if (rc < 0)
+               goto cxl_aer_ue_fail;
+
+       /*
+        * write() returns -1 on error. That case is tested on its own,
+        * because comparing a negative int with a size_t converts it to a
+        * huge unsigned value and would hide the failure.
+        */
+       snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl-aer-uncorrectable-error");
+       len = strlen(buf) + 1;
+       rc = write(sockfd, buf, len);
+       if (rc < 0 || (size_t)rc < len)
+               goto cxl_aer_ue_fail;
+
+       snprintf(buf, sizeof(buf), "REASON=%s", "CXL AER uncorrectable error");
+       len = strlen(buf) + 1;
+       rc = write(sockfd, buf, len);
+       if (rc < 0 || (size_t)rc < len)
+               goto cxl_aer_ue_fail;
+
+       done = 1;
+
+cxl_aer_ue_fail:
+
+       /* sockfd is always valid here: the setup failure returned early */
+       close(sockfd);
+
+       if (done)
+               return 0;
+       else
+               return -1;
+}
index d1591ce174a2da41e8e1a25e5a976190b87fd73f..dfe89d1901de283266cc4a6c6d6d803dee41c083 100644 (file)
@@ -40,6 +40,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev);
 int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
 int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
 int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
+int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
 
 #else
 
@@ -52,6 +53,7 @@ static inline int ras_report_devlink_event(struct ras_events *ras, struct devlin
 static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
 static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
 static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
+static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
 
 #endif