sbin_PROGRAMS = rasdaemon
rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
- bitfield.c
+ bitfield.c trigger.c
if WITH_SQLITE3
rasdaemon_SOURCES += ras-record.c
endif
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
- non-standard-jaguarmicro.h
+ non-standard-jaguarmicro.h trigger.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
# custom target
install-data-local:
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
-if WITH_MEMORY_CE_PFA
+ $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers"
$(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
-endif
+ $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger"
--- /dev/null
+#!/bin/sh
+# This shell script can be executed by rasdaemon in daemon mode when an
+# mc_event occurs. The environment variables carry all the information
+# reported by the tracepoint.
+#
+# environment:
+# TIMESTAMP      Timestamp when the error occurred
+# COUNT          Number of errors of the same type
+# TYPE           Error type, either Corrected or Uncorrected
+# MESSAGE        Error message
+# LABEL          Label of the affected DIMM(s)
+# MC_INDEX       DIMM identifier from DMI/SMBIOS if available
+# TOP_LAYER      Top layer of the error
+# MIDDLE_LAYER   Middle layer of the error
+# LOWER_LAYER    Low layer of the error
+# ADDRESS        Error address (hexadecimal, no 0x prefix)
+# GRAIN          Minimum granularity for an error report, in bytes
+# SYNDROME       Syndrome of the error, hexadecimal (or 0 if unknown or if
+#                the syndrome is not applicable)
+# DRIVER_DETAIL  Other driver-specific detail about the error
+#
+
+# Source the optional site-specific hook that sits next to this script.
+# Its location is derived from $0 instead of the current directory, because
+# the working directory is whatever the launching process happens to use.
+case $0 in
+*/*) trigger_dir=${0%/*} ;;
+*) trigger_dir=. ;;
+esac
+
+[ -x "$trigger_dir/mc_event_trigger.local" ] && . "$trigger_dir/mc_event_trigger.local"
+
+exit 0
CPU_ISOLATION_CYCLE="24h"
# Prevent excessive isolation from causing an avalanche effect
-CPU_ISOLATION_LIMIT="10"
\ No newline at end of file
+CPU_ISOLATION_LIMIT="10"
+
+# Event Trigger
+
+# Event trigger will be executed when the specified event occurs.
+#
+# Directory that contains the trigger scripts.
+# For example: TRIGGER_DIR=/etc/ras/triggers
+TRIGGER_DIR=
+
+# Triggers to execute when an mc_event occurs: MC_CE_TRIGGER for corrected
+# errors, MC_UE_TRIGGER for uncorrected errors. A trigger that is left
+# unspecified is not executed.
+# For example:
+# MC_CE_TRIGGER=mc_event_trigger
+# MC_UE_TRIGGER=mc_event_trigger
+MC_CE_TRIGGER=
+MC_UE_TRIGGER=
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-cpu-isolation.h"
+#include "trigger.h"
/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
extern char *choices_disable;
+/*
+ * Events that support user-defined triggers. Each entry pairs an event
+ * name with the hook that prepares the trigger(s) of that event; the hook
+ * is invoked by setup_event_trigger() for a matching event name.
+ */
+static const struct event_trigger event_triggers[] = {
+	{ "mc_event", &mc_event_trigger_setup },
+};
+
static int get_debugfs_dir(char *tracing_dir, size_t len)
{
FILE *fp;
return rc;
}
+/*
+ * Invoke the setup hook of every event_triggers[] entry whose name equals
+ * @event. An event without a matching entry is left untouched.
+ */
+static void setup_event_trigger(char *event)
+{
+	const struct event_trigger *entry = event_triggers;
+	const struct event_trigger *end = event_triggers + ARRAY_SIZE(event_triggers);
+
+	for (; entry != end; entry++) {
+		if (strcmp(event, entry->name) != 0)
+			continue;
+
+		entry->setup();
+	}
+}
+
#ifndef HAVE_BLK_RQ_ERROR
/*
* Set kernel filter. libtrace doesn't provide an API for setting filters
return EINVAL;
}
+ setup_event_trigger(event);
+
log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event);
return 0;
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <traceevent/kbuffer.h>
+#include <assert.h>
#include "ras-mc-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-report.h"
+#include "trigger.h"
+
+#define MAX_ENV 30
+static char *mc_ce_trigger;
+static char *mc_ue_trigger;
+
+/*
+ * Look up one mc_event trigger in the environment.
+ *
+ * @env_name: environment variable that names the trigger
+ * @kind:     short tag ("ce" or "ue") used in the log messages
+ *
+ * Returns the trigger name when the variable is set, non-empty and passes
+ * trigger_check(). Returns NULL otherwise, so that the event handler never
+ * tries to run a trigger that is missing or not executable.
+ */
+static char *mc_trigger_from_env(const char *env_name, const char *kind)
+{
+	char *trigger = getenv(env_name);
+
+	/* Unset or empty simply means "no trigger configured": not an error */
+	if (!trigger || !*trigger)
+		return NULL;
+
+	if (trigger_check(trigger) < 0) {
+		log(SYSLOG, LOG_ERR, "Cannot access mc_event %s trigger `%s`\n",
+		    kind, trigger);
+		return NULL;
+	}
+
+	log(SYSLOG, LOG_INFO, "Setup mc_event %s trigger `%s`\n", kind, trigger);
+
+	return trigger;
+}
+
+/*
+ * Read MC_CE_TRIGGER and MC_UE_TRIGGER from the environment and remember
+ * the usable ones in mc_ce_trigger / mc_ue_trigger. A trigger that is not
+ * configured or cannot be accessed is stored as NULL.
+ */
+void mc_event_trigger_setup(void)
+{
+	mc_ce_trigger = mc_trigger_from_env("MC_CE_TRIGGER", "ce");
+	mc_ue_trigger = mc_trigger_from_env("MC_UE_TRIGGER", "ue");
+}
+
+/*
+ * Run the mc_event trigger @mc_trigger for the event @ev.
+ *
+ * The fields of @ev are exported to the trigger as environment variables
+ * (TIMESTAMP, COUNT, TYPE, MESSAGE, LABEL, MC_INDEX, TOP_LAYER,
+ * MIDDLE_LAYER, LOWER_LAYER, ADDRESS, GRAIN, SYNDROME, DRIVER_DETAIL),
+ * together with PATH, which is taken from the current environment or
+ * falls back to a default when unset.
+ *
+ * Nothing happens when @mc_trigger is NULL or empty, or when one of the
+ * environment strings cannot be allocated.
+ */
+static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
+{
+	char *env[MAX_ENV];
+	int ei = 0;
+	int i;
+
+	if (!mc_trigger || !strcmp(mc_trigger, ""))
+		return;
+
+	/*
+	 * ei is advanced only after asprintf() succeeded: on failure the
+	 * content of the target pointer is undefined, so that slot must not
+	 * be handed to free() by the cleanup loop below.
+	 */
+	if (asprintf(&env[ei], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "TIMESTAMP=%s", ev->timestamp) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "COUNT=%d", ev->error_count) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "TYPE=%s", ev->error_type) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "MESSAGE=%s", ev->msg) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "LABEL=%s", ev->label) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "MC_INDEX=%d", ev->mc_index) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "TOP_LAYER=%d", ev->top_layer) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "LOWER_LAYER=%d", ev->lower_layer) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "ADDRESS=%llx", ev->address) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "GRAIN=%lld", ev->grain) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "SYNDROME=%llx", ev->syndrome) < 0)
+		goto free;
+	ei++;
+	if (asprintf(&env[ei], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
+		goto free;
+	ei++;
+
+	/* Verify there is room for the terminator before writing it */
+	assert(ei < MAX_ENV);
+	env[ei] = NULL;
+
+	run_trigger(mc_trigger, NULL, env, "mc_event");
+
+free:
+	for (i = 0; i < ei; i++)
+		free(env[i]);
+}
int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
ras_report_mc_event(ras, &ev);
#endif
+ if (!strcmp(ev.error_type, "Corrected"))
+ run_mc_trigger(&ev, mc_ce_trigger);
+
+ if (!strcmp(ev.error_type, "Uncorrected"))
+ run_mc_trigger(&ev, mc_ue_trigger);
+
return 0;
parse_error:
#include "ras-events.h"
#include <traceevent/event-parse.h>
+void mc_event_trigger_setup(void);
+
int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
--- /dev/null
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include "ras-logger.h"
+#include "trigger.h"
+
+/*
+ * Execute a trigger in a child process and wait for it to terminate.
+ *
+ * @trigger:  trigger file name. It is resolved below the directory named
+ *            by the TRIGGER_DIR environment variable when that variable is
+ *            set and used as given otherwise, which is the same lookup
+ *            trigger_check() performs.
+ * @argv:     argument vector for the trigger, or NULL to pass only the
+ *            trigger path as argv[0]
+ * @env:      NULL-terminated environment handed to the trigger
+ * @reporter: name of the event source, used for logging only
+ *
+ * A non-zero exit status or a terminating signal of the trigger is logged.
+ */
+void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter)
+{
+	char *trigger_dir = getenv("TRIGGER_DIR");
+	char *default_argv[2];
+	char *path;
+	pid_t child;
+	int status;
+	int rc;
+
+	if (!trigger)
+		return;
+
+	log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);
+
+	/* Never feed a NULL directory to "%s"; fall back to the bare name */
+	if (trigger_dir)
+		rc = asprintf(&path, "%s/%s", trigger_dir, trigger);
+	else
+		rc = asprintf(&path, "%s", trigger);
+	if (rc < 0)
+		return;
+
+	/* execve() with a NULL argv is not portable: supply a minimal one */
+	if (!argv) {
+		default_argv[0] = path;
+		default_argv[1] = NULL;
+		argv = default_argv;
+	}
+
+	child = fork();
+	if (child < 0) {
+		log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
+		goto out;
+	}
+
+	if (child == 0) {
+		execve(path, argv, env);
+		/* Only reached when execve() failed */
+		_exit(127);
+	}
+
+	/* status is only meaningful when waitpid() succeeded */
+	if (waitpid(child, &status, 0) < 0) {
+		log(SYSLOG, LOG_ERR, "Cannot wait for trigger %s", trigger);
+		goto out;
+	}
+
+	if (WIFEXITED(status) && WEXITSTATUS(status)) {
+		log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
+		    trigger, WEXITSTATUS(status));
+	} else if (WIFSIGNALED(status)) {
+		log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
+		    trigger, WTERMSIG(status));
+	}
+
+out:
+	/* The parent owns path on every exit route */
+	free(path);
+}
+
+/*
+ * Check that the trigger @s can be read and executed.
+ *
+ * @s is looked up below the directory named by the TRIGGER_DIR environment
+ * variable when that variable is set, and used as given otherwise.
+ *
+ * Returns the result of access(): 0 when the file is readable and
+ * executable, -1 when it is not. Also returns -1 when the path cannot be
+ * allocated.
+ */
+int trigger_check(char *s)
+{
+	char *dir = getenv("TRIGGER_DIR");
+	char *full_path;
+	int ret;
+
+	/* No trigger directory configured: test the name exactly as given */
+	if (!dir)
+		return access(s, R_OK|X_OK);
+
+	if (asprintf(&full_path, "%s/%s", dir, s) < 0)
+		return -1;
+
+	ret = access(full_path, R_OK|X_OK);
+	free(full_path);
+
+	return ret;
+}
--- /dev/null
+#ifndef __TRIGGER_H__
+#define __TRIGGER_H__
+
+/*
+ * Associates an event with the hook that prepares its trigger(s).
+ */
+struct event_trigger {
+ /* Event name, compared against the name of the event being set up */
+ const char *name;
+ /* Hook that prepares the trigger(s) of this event */
+ void (*setup)(void);
+};
+
+/*
+ * Check that trigger @s is readable and executable. @s is looked up below
+ * the directory named by the TRIGGER_DIR environment variable when that
+ * variable is set, and used as given otherwise.
+ * Returns 0 on success and -1 otherwise.
+ */
+int trigger_check(char *s);
+
+/*
+ * Run @trigger in a child process with the arguments @argv and the
+ * environment @env, then wait for it to finish. The trigger path is built
+ * from the TRIGGER_DIR environment variable and @trigger. @reporter is
+ * only used in the log message.
+ */
+void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter);
+
+
+#endif