# rasdaemon -f -r
```
+To post-process and decode received MCA errors on AMD SMCA systems, run:
+
+```
+ # rasdaemon -p --status <STATUS_reg> --ipid <IPID_reg> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
+```
+
+Status and IPID register values (in hex) are mandatory. The `smca` flag,
+together with `family` and `model`, is required if not decoding locally.
+The `bank` parameter is optional.
+
You may also start it via systemd:
```
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-static void amd_decode_errcode(struct mce_event *e)
+void amd_decode_errcode(struct mce_event *e) /* exported wrapper around decode_amd_errcode() */
{
decode_amd_errcode(e);
}
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
+void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
/* Only print the descriptor of valid extended error code */
if (xec < smca_mce_descs[bank_type].num_descs)
mce_snprintf(e->mcastatus_msg,
- " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+ "%s. Ext Err Code: %d",
+ smca_mce_descs[bank_type].descs[xec],
+ xec);
if (bank_type == SMCA_UMC && xec == 0) {
channel = find_umc_channel(e);
/* Function prototypes */
int toggle_ras_mc_event(int enable);
+int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);
#endif
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
-static enum cputype select_intel_cputype(struct ras_events *ras)
+static enum cputype select_intel_cputype(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
-
if (mce->family == 15) {
if (mce->model == 6)
return CPU_TULSA;
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
}
-static int detect_cpu(struct ras_events *ras)
+static int detect_cpu(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
FILE *f;
int ret = 0;
char *line = NULL;
}
goto ret;
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
- mce->cputype = select_intel_cputype(ras);
+ mce->cputype = select_intel_cputype(mce);
} else {
ret = EINVAL;
}
mce = ras->mce_priv;
- rc = detect_cpu(ras);
+ rc = detect_cpu(mce);
if (rc) {
if (mce->processor_flags)
free (mce->processor_flags);
*/
}
+/*
+ * Format an offline-decoded MCE into @s as a single comma-separated line.
+ *
+ * The current local time is written to mce->timestamp and emitted first,
+ * followed by the bank (its name when set, otherwise the raw number in hex)
+ * and whichever of the mcastatus, mcistatus, location and error message
+ * strings are non-empty. The line is always newline-terminated, including
+ * when there is no error message to print.
+ *
+ * @priv is not used by this function. Always returns 0.
+ */
+static int report_mce_offline(struct trace_seq *s,
+			      struct mce_event *mce,
+			      struct mce_priv *priv)
+{
+	time_t now;
+	struct tm *tm;
+
+	time(&now);
+	tm = localtime(&now);
+
+	/* If localtime() fails, mce->timestamp is left as the caller set it. */
+	if (tm)
+		strftime(mce->timestamp, sizeof(mce->timestamp),
+			 "%Y-%m-%d %H:%M:%S %z", tm);
+	trace_seq_printf(s, "%s,", mce->timestamp);
+
+	if (*mce->bank_name)
+		trace_seq_printf(s, " %s,", mce->bank_name);
+	else
+		trace_seq_printf(s, " bank=%x,", mce->bank);
+
+	if (*mce->mcastatus_msg)
+		trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);
+
+	if (*mce->mcistatus_msg)
+		trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);
+
+	if (*mce->mc_location)
+		trace_seq_printf(s, " Locn: %s,", mce->mc_location);
+
+	if (*mce->error_msg)
+		trace_seq_printf(s, " Error Msg: %s", mce->error_msg);
+
+	/* Terminate the record even when no error message was decoded. */
+	trace_seq_printf(s, "\n");
+
+	return 0;
+}
+
+/*
+ * Decode one MCE from raw register values in @event rather than from a
+ * kernel trace record, and print the result via trace_seq_do_printf().
+ *
+ * When event->smca is set the CPU is treated as CPU_AMD_SMCA and the family
+ * and model are taken from @event; otherwise the CPU is probed with
+ * detect_cpu(). Only CPU_AMD_SMCA has a decoder here; for any other CPU type
+ * the record is reported without further decoding.
+ *
+ * Returns 0 on success, ENOMEM if an allocation fails, the detect_cpu()
+ * error code if CPU detection fails, or -EINVAL when the SMCA status or
+ * IPID value is zero.
+ */
+int ras_offline_mce_event(struct ras_mc_offline_event *event)
+{
+	int rc = 0;
+	struct trace_seq s;
+	struct mce_event *mce;
+	struct mce_priv *priv = NULL;
+
+	/*
+	 * log() may overwrite errno, so allocation failures are reported with
+	 * a fixed ENOMEM instead of reading errno after the message is logged.
+	 */
+	mce = calloc(1, sizeof(*mce));
+	if (!mce) {
+		log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
+		return ENOMEM;
+	}
+
+	priv = calloc(1, sizeof(*priv));
+	if (!priv) {
+		log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
+		rc = ENOMEM;
+		goto free_mce;
+	}
+
+	if (event->smca) {
+		priv->cputype = CPU_AMD_SMCA;
+		priv->family = event->family;
+		priv->model = event->model;
+	} else {
+		rc = detect_cpu(priv);
+		if (rc) {
+			log(TERM, LOG_ERR, "Failed to detect CPU\n");
+			goto free_mce;
+		}
+	}
+
+	mce->status = event->status;
+	mce->bank = event->bank;
+
+	switch (priv->cputype) {
+	case CPU_AMD_SMCA:
+		mce->synd = event->synd;
+		mce->ipid = event->ipid;
+		/* Both registers are needed; name the one that is missing. */
+		if (!mce->ipid || !mce->status) {
+			log(TERM, LOG_ERR, "%s MSR required.\n",
+			    mce->ipid ? "Status" : "Ipid");
+			rc = -EINVAL;
+			goto free_mce;
+		}
+		decode_smca_error(mce, priv);
+		amd_decode_errcode(mce);
+		break;
+	default:
+		break;
+	}
+
+	trace_seq_init(&s);
+	report_mce_offline(&s, mce, priv);
+	trace_seq_do_printf(&s);
+	fflush(stdout);
+	trace_seq_destroy(&s);
+
+free_mce:
+	/* free(NULL) is a no-op, so this label serves every exit path. */
+	free(priv);
+	free(mce);
+	return rc;
+}
+
int ras_mce_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
/* enables intel iMC logs */
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);
+/* Undertake AMD SMCA Error Decoding */
+void decode_smca_error(struct mce_event *e, struct mce_priv *m);
+void amd_decode_errcode(struct mce_event *e);
+
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);
void core2_decode_model(struct mce_event *e);
#define __RAS_RECORD_H
#include <stdint.h>
+#include <stdbool.h>
#include "config.h"
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
const char *driver_detail;
};
+/* Raw MCA register values and CPU identification for offline decoding. */
+struct ras_mc_offline_event {
+	unsigned int family;	/* CPU family, used when smca is set */
+	unsigned int model;	/* CPU model, used when smca is set */
+	bool smca;		/* true: treat the CPU as AMD SMCA */
+	uint8_t bank;		/* MCA bank number */
+	uint64_t ipid;		/* IPID register value */
+	uint64_t synd;		/* syndrome register value */
+	uint64_t status;	/* status register value */
+};
+
struct ras_aer_event {
char timestamp[64];
const char *error_type;
int record_events;
int enable_ras;
int foreground;
+ int offline;
};
+/*
+ * argp keys for the long-only post-processing options.
+ * The values start at 0x100, above the printable ASCII range that argp
+ * would also interpret as short option characters.
+ */
+enum OFFLINE_ARG_KEYS {
+	SMCA		= 0x100,
+	MODEL		= 0x101,
+	FAMILY		= 0x102,
+	BANK_NUM	= 0x103,
+	IPID_REG	= 0x104,
+	STATUS_REG	= 0x105,
+	SYNDROME_REG	= 0x106
+};
+
+struct ras_mc_offline_event event;
+
static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
case 'f':
args->foreground++;
break;
+#ifdef HAVE_MCE
+ case 'p':
+ if (state->argc < 4)
+ argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
+ args->offline++;
+ break;
+#endif
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
+#ifdef HAVE_MCE
+/*
+ * argp parser for the post-processing options; fills the global `event`.
+ *
+ * Every option except SMCA reads its value from the command-line word that
+ * follows it (state->argv[state->next]); @arg is not used. Keys that do not
+ * belong to this parser yield ARGP_ERR_UNKNOWN. If a value-taking option is
+ * the last word on the command line, an error is reported through
+ * argp_error() (and EINVAL is returned should argp_error() return) instead
+ * of passing a NULL pointer to the conversion functions.
+ */
+static error_t parse_opt_offline(int key, char *arg,
+				 struct argp_state *state)
+{
+	const char *value;
+
+	switch (key) {
+	case SMCA:
+		/* Plain flag: takes no value. */
+		event.smca = true;
+		return 0;
+	case MODEL:
+	case FAMILY:
+	case BANK_NUM:
+	case IPID_REG:
+	case STATUS_REG:
+	case SYNDROME_REG:
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	/* argv[argc] is NULL, so make sure a following word really exists. */
+	if (state->next >= state->argc) {
+		argp_error(state, "missing value for post-processing option");
+		return EINVAL;
+	}
+	value = state->argv[state->next];
+
+	switch (key) {
+	case MODEL:
+		event.model = strtoul(value, NULL, 0);
+		break;
+	case FAMILY:
+		event.family = strtoul(value, NULL, 0);
+		break;
+	case BANK_NUM:
+		event.bank = atoi(value);
+		break;
+	case IPID_REG:
+		event.ipid = strtoull(value, NULL, 0);
+		break;
+	case STATUS_REG:
+		event.status = strtoull(value, NULL, 0);
+		break;
+	case SYNDROME_REG:
+		event.synd = strtoull(value, NULL, 0);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+#endif
+
long user_hz;
int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;
+
+#ifdef HAVE_MCE
+ const struct argp_option offline_options[] = {
+ {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+ {"model", MODEL, 0, 0, "CPU Model"},
+ {"family", FAMILY, 0, 0, "CPU Family"},
+ {"bank", BANK_NUM, 0, 0, "Bank Number"},
+ {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
+ {"status", STATUS_REG, 0, 0, "Status Register"},
+ {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
+ {0, 0, 0, 0, 0, 0},
+ };
+
+ struct argp offline_argp = {
+ .options = offline_options,
+ .parser = parse_opt_offline,
+ .doc = TOOL_DESCRIPTION,
+ .args_doc = ARGS_DOC,
+ };
+
+ struct argp_child offline_parser[] = {
+ {&offline_argp, 0, "Post-Processing Options:", 0},
+ {0, 0, 0, 0},
+ };
+#endif
+
const struct argp_option options[] = {
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
+#ifdef HAVE_MCE
+ {"post-processing", 'p', 0, 0,
+ "Post-processing MCE's with raw register values"},
+#endif
{ 0, 0, 0, 0, 0, 0 }
};
.parser = parse_opt,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
-
+#ifdef HAVE_MCE
+ .children = offline_parser,
+#endif
};
memset (&args, 0, sizeof(args));
return 0;
}
+#ifdef HAVE_MCE
+ if (args.offline) {
+ ras_offline_mce_event(&event);
+ return 0;
+ }
+#endif
+
openlog(TOOL_NAME, 0, LOG_DAEMON);
if (!args.foreground)
if (daemon(0,0))