]> www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
rasdaemon: Add support for post-processing MCA errors
author Avadhut Naik <avadhut.naik@amd.com>
Mon, 22 May 2023 22:13:17 +0000 (22:13 +0000)
committer Mauro Carvalho Chehab <mchehab@kernel.org>
Mon, 23 Oct 2023 09:27:18 +0000 (11:27 +0200)
Currently, the rasdaemon performs detailed error decoding of received
MCA errors on the system only when it is running, either as a daemon
or in the foreground.

As such, error decoding cannot be undertaken for any MCA errors received
when the rasdaemon wasn't running. Additionally, if the error decoding
modules like edac_mce_amd have not been loaded either, error records in the
dmesg buffer might correspond to raw values in associated MSRs, compelling
users to undertake decoding manually. The scenario seems more plausible on
AMD systems with Scalable MCA (SMCA), given the plans in place to remove SMCA
Extended Error Descriptions from the edac_mce_amd module in an effort to
offload SMCA Error Decoding to the rasdaemon.

As such, add support to post-process and decode MCA Errors received on AMD
SMCA systems from raw MSR values. Support for post-processing and decoding
of MCA Errors received on CPUs of other vendors can be added in the future,
as needed.

Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
README.md
mce-amd-smca.c
ras-events.h
ras-mce-handler.c
ras-mce-handler.h
ras-record.h
rasdaemon.c

index e41cb5d8cad9ea420016b4bfe12986ceae638c2c..c3e0a885945b9b8e051d20b645ccfb3d4d7e3b71 100644 (file)
--- a/README.md
+++ b/README.md
@@ -188,6 +188,16 @@ required):
     # rasdaemon -f -r
 ```
 
+To post-process and decode received MCA errors on AMD SMCA systems, run:
+
+```
+       # rasdaemon -p --status <STATUS_reg> --ipid <IPID_reg> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
+```
+
+Status and IPID Register values (in hex) are mandatory. The `smca` flag,
+together with `family` and `model`, is required if not decoding locally. The
+`bank` parameter is optional.
+
 You may also start it via systemd:
 
 ```
index e81f732c894a8212fc164f3835bf0bbb0793f3dc..7c88a464684fb20d0c2258c56c2172db031e34f4 100644 (file)
@@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[] = {
        [SMCA_GMI_PHY]                  = { "Global Memory Interconnect PHY Unit" },
 };
 
-static void amd_decode_errcode(struct mce_event *e)
+void amd_decode_errcode(struct mce_event *e)
 {
 
        decode_amd_errcode(e);
@@ -782,7 +782,7 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
 }
 
 /* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
+void decode_smca_error(struct mce_event *e, struct mce_priv *m)
 {
        enum smca_bank_types bank_type;
        const char *ip_name;
@@ -827,7 +827,9 @@ static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
        /* Only print the descriptor of valid extended error code */
        if (xec < smca_mce_descs[bank_type].num_descs)
                mce_snprintf(e->mcastatus_msg,
-                            " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+                            "%s. Ext Err Code: %d",
+                            smca_mce_descs[bank_type].descs[xec],
+                            xec);
 
        if (bank_type == SMCA_UMC && xec == 0) {
                channel = find_umc_channel(e);
index c4d54e36abcc0b7c798099a5466c9997eb9551aa..cc03006d8fada9e48f34fc4120209f46c4cb9c33 100644 (file)
@@ -110,6 +110,7 @@ enum ghes_severity {
 
 /* Function prototypes */
 int toggle_ras_mc_event(int enable);
+int ras_offline_mce_event(struct ras_mc_offline_event *event);
 int handle_ras_events(int record_events);
 
 #endif
index 66004153d8a31a831baf50a9134dcef63a258410..d50da295c238932569874645042292473a72d233 100644 (file)
@@ -63,10 +63,8 @@ static char *cputype_name[] = {
        [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
 };
 
-static enum cputype select_intel_cputype(struct ras_events *ras)
+static enum cputype select_intel_cputype(struct mce_priv *mce)
 {
-       struct mce_priv *mce = ras->mce_priv;
-
        if (mce->family == 15) {
                if (mce->model == 6)
                        return CPU_TULSA;
@@ -140,9 +138,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
        return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
 }
 
-static int detect_cpu(struct ras_events *ras)
+static int detect_cpu(struct mce_priv *mce)
 {
-       struct mce_priv *mce = ras->mce_priv;
        FILE *f;
        int ret = 0;
        char *line = NULL;
@@ -221,7 +218,7 @@ static int detect_cpu(struct ras_events *ras)
                }
                goto ret;
        } else if (!strcmp(mce->vendor,"GenuineIntel")) {
-               mce->cputype = select_intel_cputype(ras);
+               mce->cputype = select_intel_cputype(mce);
        } else {
                ret = EINVAL;
        }
@@ -246,7 +243,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus)
 
        mce = ras->mce_priv;
 
-       rc = detect_cpu(ras);
+       rc = detect_cpu(mce);
        if (rc) {
                if (mce->processor_flags)
                        free (mce->processor_flags);
@@ -383,6 +380,105 @@ static void report_mce_event(struct ras_events *ras,
         */
 }
 
+/*
+ * Format a post-processed (offline) MCE record into a trace sequence.
+ *
+ * @s:    trace sequence that receives the formatted record
+ * @mce:  decoded event; its timestamp field is overwritten with the
+ *        current local time before being printed
+ * @priv: not used by this function
+ *
+ * The bank name (or, failing that, the bank number in hex) is always
+ * printed. The remaining fields are printed only when their string is
+ * non-empty. The record is always terminated by a newline, so the output
+ * ends cleanly even when no error message has been decoded.
+ *
+ * Returns 0.
+ */
+static int report_mce_offline(struct trace_seq *s,
+                             struct mce_event *mce,
+                             struct mce_priv *priv)
+{
+       time_t now;
+       struct tm *tm;
+
+       time(&now);
+       tm = localtime(&now);
+
+       /* When the time cannot be broken down, the timestamp is left as is */
+       if (tm)
+               strftime(mce->timestamp, sizeof(mce->timestamp),
+                        "%Y-%m-%d %H:%M:%S %z", tm);
+       trace_seq_printf(s, "%s,", mce->timestamp);
+
+       if (*mce->bank_name)
+               trace_seq_printf(s, " %s,", mce->bank_name);
+       else
+               trace_seq_printf(s, " bank=%x,", mce->bank);
+
+       if (*mce->mcastatus_msg)
+               trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);
+
+       if (*mce->mcistatus_msg)
+               trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);
+
+       if (*mce->mc_location)
+               trace_seq_printf(s, " Locn: %s,", mce->mc_location);
+
+       if (*mce->error_msg)
+               trace_seq_printf(s, " Error Msg: %s", mce->error_msg);
+
+       /* Terminate the record whether or not an error message was printed */
+       trace_seq_printf(s, "\n");
+
+       return 0;
+}
+
+/*
+ * Decode one MCA error from raw register values (post-processing mode) and
+ * print the resulting record on stdout.
+ *
+ * @event: raw values to decode. When event->smca is set, the CPU type is
+ *         forced to CPU_AMD_SMCA with the given family and model; otherwise
+ *         the CPU is identified through detect_cpu().
+ *
+ * Only CPU_AMD_SMCA errors are decoded. For any other CPU type the record
+ * is printed with whatever fields are set, without further decoding.
+ *
+ * Returns 0 on success and a non-zero value on failure: ENOMEM when memory
+ * cannot be allocated, the value returned by detect_cpu() when the CPU
+ * cannot be identified, or -EINVAL when the status or IPID register value
+ * is zero on an SMCA system.
+ */
+int ras_offline_mce_event(struct ras_mc_offline_event *event)
+{
+       int rc = 0;
+       struct trace_seq s;
+       struct mce_event *mce;
+       struct mce_priv *priv = NULL;
+
+       mce = calloc(1, sizeof(*mce));
+       if (!mce) {
+               log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
+               /* Not errno: logging may have changed it by now */
+               return ENOMEM;
+       }
+
+       priv = calloc(1, sizeof(*priv));
+       if (!priv) {
+               log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
+               rc = ENOMEM;
+               goto free_mce;
+       }
+
+       if (event->smca) {
+               /* CPU described on the command line: no local detection */
+               priv->cputype = CPU_AMD_SMCA;
+               priv->family = event->family;
+               priv->model = event->model;
+       } else {
+               rc = detect_cpu(priv);
+               if (rc) {
+                       log(TERM, LOG_ERR, "Failed to detect CPU\n");
+                       goto free_mce;
+               }
+       }
+
+       mce->status = event->status;
+       mce->bank = event->bank;
+
+       switch (priv->cputype) {
+       case CPU_AMD_SMCA:
+               mce->synd = event->synd;
+               mce->ipid = event->ipid;
+               /* Both registers are mandatory for SMCA decoding */
+               if (!mce->ipid || !mce->status) {
+                       log(TERM, LOG_ERR, "%s MSR required.\n",
+                                   mce->ipid ? "Status" : "Ipid");
+                       rc = -EINVAL;
+                       goto free_mce;
+               }
+               decode_smca_error(mce, priv);
+               amd_decode_errcode(mce);
+               break;
+       default:
+               break;
+       }
+
+       trace_seq_init(&s);
+       report_mce_offline(&s, mce, priv);
+       trace_seq_do_printf(&s);
+       fflush(stdout);
+       trace_seq_destroy(&s);
+
+free_mce:
+       /*
+        * detect_cpu() may have set processor_flags; register_mce_handler()
+        * releases it with free() as well. It is NULL when never set, since
+        * priv comes from calloc().
+        */
+       if (priv)
+               free(priv->processor_flags);
+       free(priv);
+       free(mce);
+       return rc;
+}
+
 int ras_mce_event_handler(struct trace_seq *s,
                          struct tep_record *record,
                          struct tep_event *event, void *context)
index 91ff5f7f5b7cd688494ae975e94e9a1633d8c5b0..3022b15efb1f1460c7f8e302b820fc575556152e 100644 (file)
@@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_seq *s,
 /* enables intel iMC logs */
 int set_intel_imc_log(enum cputype cputype, unsigned ncpus);
 
+/* Undertake AMD SMCA Error Decoding */
+void decode_smca_error(struct mce_event *e, struct mce_priv *m);
+void amd_decode_errcode(struct mce_event *e);
+
 /* Per-CPU-type decoders for Intel CPUs */
 void p4_decode_model(struct mce_event *e);
 void core2_decode_model(struct mce_event *e);
index a7b9ab92230259b6b38d35a696dcd656bef87f5e..2b2231c122c0deb1a875eac3f6ce1e6a42a304e4 100644 (file)
@@ -21,6 +21,7 @@
 #define __RAS_RECORD_H
 
 #include <stdint.h>
+#include <stdbool.h>
 #include "config.h"
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
@@ -42,6 +43,15 @@ struct ras_mc_event {
        const char *driver_detail;
 };
 
+struct ras_mc_offline_event {  /* raw MCA register values to post-process */
+       unsigned int family, model;     /* CPU family/model, used when smca is set */
+       bool smca;                      /* decode as AMD SMCA without detecting the CPU */
+       uint8_t bank;                   /* MCA bank number (optional) */
+       uint64_t ipid;                  /* raw IPID register value */
+       uint64_t synd;                  /* raw syndrome register value */
+       uint64_t status;                /* raw status register value */
+};
+
 struct ras_aer_event {
        char timestamp[64];
        const char *error_type;
index 66f4dea90660cb55ba16ddfa40e44f9eccceea18..e9a3a4de76eeda70b324680447c8cba7de05cfa2 100644 (file)
@@ -41,8 +41,21 @@ struct arguments {
        int record_events;
        int enable_ras;
        int foreground;
+       int offline;
 };
 
+enum OFFLINE_ARG_KEYS {        /* argp keys of the post-processing options */
+       SMCA = 0x100,   /* --smca; NOTE(review): presumably 0x100 so that no key is a printable short option - confirm */
+       MODEL,          /* --model */
+       FAMILY,         /* --family */
+       BANK_NUM,       /* --bank */
+       IPID_REG,       /* --ipid */
+       STATUS_REG,     /* --status */
+       SYNDROME_REG    /* --synd */
+};
+
+struct ras_mc_offline_event event;     /* filled by parse_opt_offline(), handed to ras_offline_mce_event() by main() */
+
 static error_t parse_opt(int k, char *arg, struct argp_state *state)
 {
        struct arguments *args = state->input;
@@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state)
        case 'f':
                args->foreground++;
                break;
+#ifdef HAVE_MCE
+       case 'p':
+               if (state->argc < 4)
+                       argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
+               args->offline++;
+               break;
+#endif
        default:
                return ARGP_ERR_UNKNOWN;
        }
        return 0;
 }
 
+#ifdef HAVE_MCE
+/*
+ * Fetch and convert the value of a post-processing option.
+ *
+ * The post-processing options are declared without an argument in the
+ * option table, so argp hands no value to the parser; the value is the
+ * command line word that follows the option, i.e. argv[state->next].
+ *
+ * @state: argp parsing state
+ * @name:  long option name, used in error messages
+ * @max:   largest value accepted
+ * @out:   receives the converted value on success
+ *
+ * The value is converted with base auto-detection (0x prefix for hex).
+ * A missing value, trailing garbage or a value above @max is reported
+ * through argp_error().
+ *
+ * Returns 0 on success, EINVAL otherwise.
+ */
+static error_t offline_opt_value(struct argp_state *state, const char *name,
+                                unsigned long long max,
+                                unsigned long long *out)
+{
+       const char *word;
+       char *end;
+       unsigned long long val;
+
+       /* Option given as the last word: there is nothing to convert */
+       if (state->next >= state->argc || !state->argv[state->next]) {
+               argp_error(state, "option '--%s' requires a value", name);
+               return EINVAL;
+       }
+
+       word = state->argv[state->next];
+       val = strtoull(word, &end, 0);
+       if (end == word || *end != '\0' || val > max) {
+               argp_error(state, "invalid value '%s' for option '--%s'",
+                          word, name);
+               return EINVAL;
+       }
+
+       *out = val;
+       return 0;
+}
+
+/*
+ * argp parser for the post-processing (offline) options; stores the parsed
+ * values into the file-level 'event' object.
+ *
+ * @key:   one of enum OFFLINE_ARG_KEYS
+ * @arg:   unused; see offline_opt_value() for how values are obtained
+ * @state: argp parsing state
+ *
+ * Returns 0 when the option was handled, ARGP_ERR_UNKNOWN for keys that do
+ * not belong to this parser and EINVAL when an option value is missing or
+ * malformed.
+ */
+static error_t parse_opt_offline(int key, char *arg,
+                                struct argp_state *state)
+{
+       unsigned long long val = 0;
+       error_t err;
+
+       switch (key) {
+       case SMCA:
+               event.smca = true;
+               break;
+       case MODEL:
+               err = offline_opt_value(state, "model", ~0U, &val);
+               if (err)
+                       return err;
+               event.model = val;
+               break;
+       case FAMILY:
+               err = offline_opt_value(state, "family", ~0U, &val);
+               if (err)
+                       return err;
+               event.family = val;
+               break;
+       case BANK_NUM:
+               /* The bank is stored in a uint8_t */
+               err = offline_opt_value(state, "bank", 0xff, &val);
+               if (err)
+                       return err;
+               event.bank = val;
+               break;
+       case IPID_REG:
+               err = offline_opt_value(state, "ipid", ~0ULL, &val);
+               if (err)
+                       return err;
+               event.ipid = val;
+               break;
+       case STATUS_REG:
+               err = offline_opt_value(state, "status", ~0ULL, &val);
+               if (err)
+                       return err;
+               event.status = val;
+               break;
+       case SYNDROME_REG:
+               err = offline_opt_value(state, "synd", ~0ULL, &val);
+               if (err)
+                       return err;
+               event.synd = val;
+               break;
+       default:
+               return ARGP_ERR_UNKNOWN;
+       }
+       return 0;
+}
+#endif
+
 long user_hz;
 
 int main(int argc, char *argv[])
 {
        struct arguments args;
        int idx = -1;
+
+#ifdef HAVE_MCE
+       const struct argp_option offline_options[] = {
+               {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+               {"model", MODEL, 0, 0, "CPU Model"},
+               {"family", FAMILY, 0, 0, "CPU Family"},
+               {"bank", BANK_NUM, 0, 0, "Bank Number"},
+               {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
+               {"status", STATUS_REG, 0, 0, "Status Register"},
+               {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
+               {0, 0, 0, 0, 0, 0},
+       };
+
+       struct argp offline_argp = {
+               .options = offline_options,
+               .parser = parse_opt_offline,
+               .doc = TOOL_DESCRIPTION,
+               .args_doc = ARGS_DOC,
+       };
+
+       struct argp_child offline_parser[] = {
+               {&offline_argp, 0, "Post-Processing Options:", 0},
+               {0, 0, 0, 0},
+       };
+#endif
+
        const struct argp_option options[] = {
                {"enable",  'e', 0, 0, "enable RAS events and exit", 0},
                {"disable", 'd', 0, 0, "disable RAS events and exit", 0},
@@ -81,6 +160,10 @@ int main(int argc, char *argv[])
                {"record",  'r', 0, 0, "record events via sqlite3", 0},
 #endif
                {"foreground", 'f', 0, 0, "run foreground, not daemonize"},
+#ifdef HAVE_MCE
+               {"post-processing", 'p', 0, 0,
+               "Post-processing MCE's with raw register values"},
+#endif
 
                { 0, 0, 0, 0, 0, 0 }
        };
@@ -89,7 +172,9 @@ int main(int argc, char *argv[])
                .parser = parse_opt,
                .doc = TOOL_DESCRIPTION,
                .args_doc = ARGS_DOC,
-
+#ifdef HAVE_MCE
+               .children = offline_parser,
+#endif
        };
        memset (&args, 0, sizeof(args));
 
@@ -111,6 +196,13 @@ int main(int argc, char *argv[])
                return 0;
        }
 
+#ifdef HAVE_MCE
+       if (args.offline) {
+               ras_offline_mce_event(&event);
+               return 0;
+       }
+#endif
+
        openlog(TOOL_NAME, 0, LOG_DAEMON);
        if (!args.foreground)
                if (daemon(0,0))