]> www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
rasdaemon: add mc_event trigger
authorRuidong Tian <tianruidong@linux.alibaba.com>
Thu, 23 Nov 2023 09:47:25 +0000 (17:47 +0800)
committerMauro Carvalho Chehab <mchehab+huawei@kernel.org>
Mon, 15 Jul 2024 11:17:59 +0000 (13:17 +0200)
Allow users to run a trigger when a RAS mc_event occurs. The mc_event
trigger is split into a CE trigger and a UE trigger because CEs are
more frequent than UEs, so a CE trigger causes a larger performance
hit. Users can choose different triggers for CE and UE to reduce this
effect.

Users can configure triggers in /etc/sysconfig/rasdaemon:

    TRIGGER_DIR: The trigger directory
    MC_CE_TRIGGER: The script executed when corrected error occurs.
    MC_UE_TRIGGER: The script executed when uncorrected error occurs.

No script will be executed if MC_CE_TRIGGER/MC_UE_TRIGGER is null.

Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Makefile.am
contrib/mc_event_trigger [new file with mode: 0755]
misc/rasdaemon.env
ras-events.c
ras-mc-handler.c
ras-mc-handler.h
trigger.c [new file with mode: 0644]
trigger.h [new file with mode: 0644]

index 9dd42c9727f55eb74fb92343739b23258224028a..7a18f75b2c076ac5f0e02a79fc1569ab612d3ca6 100644 (file)
@@ -24,7 +24,7 @@ all-local: $(SYSTEMD_SERVICES)
 
 sbin_PROGRAMS = rasdaemon
 rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
-                   bitfield.c
+                   bitfield.c trigger.c
 if WITH_SQLITE3
    rasdaemon_SOURCES += ras-record.c
 endif
@@ -93,7 +93,7 @@ include_HEADERS = config.h  ras-events.h  ras-logger.h  ras-mc-handler.h \
                  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
                  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
                  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
-                 non-standard-jaguarmicro.h
+                 non-standard-jaguarmicro.h trigger.h
 
 # This rule can't be called with more than one Makefile job (like make -j8)
 # I can't figure out a way to fix that
@@ -120,6 +120,6 @@ upload:
 # custom target
 install-data-local:
        $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
-if WITH_MEMORY_CE_PFA
+       $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers"
        $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
-endif
+       $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger"
diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger
new file mode 100755 (executable)
index 0000000..5c6ccfa
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+#  This shell script can be executed by rasdaemon in daemon mode when an
+#  mc_event occurs. The environment variables below carry all information
+#  reported by the tracepoint.
+#
+# environment:
+# TIMESTAMP     Timestamp when error occurred
+# COUNT         Number of errors of the same type
+# TYPE          Error type from Corrected/Uncorrected
+# MESSAGE       Error message
+# LABEL         Label of the affected DIMM(s)
+# MC_INDEX      DIMM identifier from DMI/SMBIOS if available
+# TOP_LAYER     Top layer of the error
+# MIDDLE_LAYER  Middle layer of the error
+# LOWER_LAYER   Low layer of the error
+# ADDRESS       Error address
+# GRAIN         Minimum granularity for an error report, in bytes
+# SYNDROME      Syndrome of the error (or 0 if unknown or if the syndrome is not applicable)
+# DRIVER_DETAIL Other driver-specific detail about the error
+#
+
+# Look for the optional site-local hook next to this script instead of in
+# the current working directory, which is whatever directory rasdaemon
+# happened to be started in.
+local_hook="$(dirname "$0")/mc_event_trigger.local"
+[ -x "$local_hook" ] && . "$local_hook"
+
+exit 0
index 7cb18e8d24651a2bb00b1c21acb7b3f27eac983a..3389a73861ae7fb29f7338dd6fc5eca1aa41c0d0 100644 (file)
@@ -43,4 +43,20 @@ CPU_CE_THRESHOLD="18"
 CPU_ISOLATION_CYCLE="24h"
 
 # Prevent excessive isolation from causing an avalanche effect
-CPU_ISOLATION_LIMIT="10"
\ No newline at end of file
+CPU_ISOLATION_LIMIT="10"
+
+# Event Trigger
+
+# Event trigger will be executed when the specified event occurs.
+#
+# Execute triggers path
+# For example: TRIGGER_DIR=/etc/ras/triggers
+TRIGGER_DIR=
+
+# Execute these triggers when an mc_event occurs; a trigger is not
+# executed if it is not specified.
+# For example:
+#   MC_CE_TRIGGER=mc_event_trigger
+#   MC_UE_TRIGGER=mc_event_trigger
+MC_CE_TRIGGER=
+MC_UE_TRIGGER=
index 9e20ab4e94be67a878081008858328d9c67f62a2..0df1d202fb3981127818f6045347c8a7818327d9 100644 (file)
@@ -44,6 +44,7 @@
 #include "ras-logger.h"
 #include "ras-page-isolation.h"
 #include "ras-cpu-isolation.h"
+#include "trigger.h"
 
 /*
  * Polling time, if read() doesn't block. Currently, trace_pipe_raw never
 
 extern char *choices_disable;
 
+/*
+ * Per-event trigger setup hooks, looked up by tracepoint event name in
+ * setup_event_trigger().
+ */
+static const struct event_trigger event_triggers[] = {
+       { "mc_event", &mc_event_trigger_setup },
+};
+
 static int get_debugfs_dir(char *tracing_dir, size_t len)
 {
        FILE *fp;
@@ -276,6 +281,16 @@ free_ras:
        return rc;
 }
 
+/*
+ * Call the setup hook of every event_triggers[] entry whose name equals
+ * @event. Events without a registered trigger are silently ignored.
+ */
+static void setup_event_trigger(char *event)
+{
+       size_t i;
+
+       for (i = 0; i < ARRAY_SIZE(event_triggers); i++) {
+               /* Point at the table entry instead of copying the struct. */
+               const struct event_trigger *trigger = &event_triggers[i];
+
+               if (!strcmp(event, trigger->name))
+                       trigger->setup();
+       }
+}
+
 #ifndef HAVE_BLK_RQ_ERROR
 /*
  * Set kernel filter. libtrace doesn't provide an API for setting filters
@@ -870,6 +885,8 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
                return EINVAL;
        }
 
+       setup_event_trigger(event);
+
        log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event);
 
        return 0;
index d93ba57c53c19653caf2f3edeecfdc278bfb45ab..2f06a01fcbc5c5ead9813e443d81349f03881e2a 100644 (file)
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
+#define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <traceevent/kbuffer.h>
+#include <assert.h>
 #include "ras-mc-handler.h"
 #include "ras-record.h"
 #include "ras-logger.h"
 #include "ras-page-isolation.h"
 #include "ras-report.h"
+#include "trigger.h"
+
+/* Number of slots in the environment array built by run_mc_trigger() */
+#define MAX_ENV 30
+/* Trigger name taken from $MC_CE_TRIGGER; run for "Corrected" events */
+static char *mc_ce_trigger;
+/* Trigger name taken from $MC_UE_TRIGGER; run for "Uncorrected" events */
+static char *mc_ue_trigger;
+
+/*
+ * Resolve one mc_event trigger from the environment.
+ *
+ * @var:  name of the environment variable holding the trigger name
+ * @kind: "ce" or "ue", used only in the log messages
+ *
+ * Returns the trigger name, or NULL when the variable is unset or empty
+ * (triggers are optional, so that case is not reported as an error) or
+ * when trigger_check() cannot access the script. Returning NULL for an
+ * inaccessible script keeps run_mc_trigger() from attempting to execute
+ * it again on every event.
+ */
+static char *mc_trigger_from_env(const char *var, const char *kind)
+{
+       char *trigger = getenv(var);
+
+       if (!trigger || !strcmp(trigger, ""))
+               return NULL;
+
+       if (trigger_check(trigger) < 0) {
+               log(SYSLOG, LOG_ERR, "Cannot access mc_event %s trigger `%s`\n",
+                       kind, trigger);
+               return NULL;
+       }
+
+       log(SYSLOG, LOG_INFO, "Setup mc_event %s trigger `%s`\n", kind, trigger);
+
+       return trigger;
+}
+
+/*
+ * Load the corrected-error and uncorrected-error triggers named by the
+ * MC_CE_TRIGGER and MC_UE_TRIGGER environment variables. A trigger that
+ * is not configured or not accessible is left disabled (NULL).
+ */
+void mc_event_trigger_setup(void)
+{
+       mc_ce_trigger = mc_trigger_from_env("MC_CE_TRIGGER", "ce");
+       mc_ue_trigger = mc_trigger_from_env("MC_UE_TRIGGER", "ue");
+}
+
+/*
+ * Execute @mc_trigger for the memory-controller event @ev.
+ *
+ * Every field of @ev is exported to the trigger as an environment
+ * variable (TIMESTAMP, COUNT, TYPE, ...), together with PATH. Nothing is
+ * run when @mc_trigger is NULL or empty, or when building the
+ * environment fails.
+ */
+static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
+{
+       char *env[MAX_ENV];
+       const char *path;
+       int ei = 0;
+       int i;
+
+       if (!mc_trigger || !strcmp(mc_trigger, ""))
+               return;
+
+       path = getenv("PATH");
+       if (!path)
+               path = "/sbin:/usr/sbin:/bin:/usr/bin";
+
+       /*
+        * A slot is counted in ei only after asprintf() succeeded: on
+        * failure the pointer it was given is left undefined and must not
+        * reach the free() loop below.
+        */
+       if (asprintf(&env[ei], "PATH=%s", path) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "TIMESTAMP=%s", ev->timestamp) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "COUNT=%d", ev->error_count) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "TYPE=%s", ev->error_type) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "MESSAGE=%s", ev->msg) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "LABEL=%s", ev->label) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "MC_INDEX=%d", ev->mc_index) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "TOP_LAYER=%d", ev->top_layer) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "LOWER_LAYER=%d", ev->lower_layer) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "ADDRESS=%llx", ev->address) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "GRAIN=%lld", ev->grain) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "SYNDROME=%llx", ev->syndrome) < 0)
+               goto free;
+       ei++;
+       if (asprintf(&env[ei], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
+               goto free;
+       ei++;
+
+       /* Check that the terminator still fits before writing it. */
+       assert(ei < MAX_ENV);
+       env[ei] = NULL;
+
+       run_trigger(mc_trigger, NULL, env, "mc_event");
+
+free:
+       for (i = 0; i < ei; i++)
+               free(env[i]);
+}
 
 int ras_mc_event_handler(struct trace_seq *s,
                         struct tep_record *record,
@@ -194,6 +269,12 @@ int ras_mc_event_handler(struct trace_seq *s,
        ras_report_mc_event(ras, &ev);
 #endif
 
+       if (!strcmp(ev.error_type, "Corrected"))
+               run_mc_trigger(&ev, mc_ce_trigger);
+
+       if (!strcmp(ev.error_type, "Uncorrected"))
+               run_mc_trigger(&ev, mc_ue_trigger);
+
        return 0;
 
 parse_error:
index afc000506dd8ea806743859020a9d71dab5e3500..a7637b244d084b1a2ca1907baf2583ca660b29af 100644 (file)
@@ -22,6 +22,8 @@
 #include "ras-events.h"
 #include <traceevent/event-parse.h>
 
+/* Read the MC_CE_TRIGGER and MC_UE_TRIGGER environment variables. */
+void mc_event_trigger_setup(void);
+
 int ras_mc_event_handler(struct trace_seq *s,
                         struct tep_record *record,
                         struct tep_event *event, void *context);
diff --git a/trigger.c b/trigger.c
new file mode 100644 (file)
index 0000000..95fb8ca
--- /dev/null
+++ b/trigger.c
@@ -0,0 +1,60 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include "ras-logger.h"
+#include "trigger.h"
+
+/*
+ * Run a trigger script in a child process and wait for it to finish.
+ *
+ * @trigger:  script name, resolved against $TRIGGER_DIR when that is set
+ * @argv:     argument vector for the script; when NULL, a vector holding
+ *            only the script path as argv[0] is used
+ * @env:      NULL-terminated environment handed to the script
+ * @reporter: name of the event source, used only in the log message
+ *
+ * A non-zero exit status or a terminating signal of the script is logged.
+ */
+void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter)
+{
+       char *trigger_dir = getenv("TRIGGER_DIR");
+       char *default_argv[2];
+       char *path;
+       pid_t child;
+       int status;
+
+       log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);
+
+       /* Same lookup rule as trigger_check(): bare name without TRIGGER_DIR. */
+       if (asprintf(&path, "%s%s%s", trigger_dir ? trigger_dir : "",
+                       trigger_dir ? "/" : "", trigger) < 0)
+               return;
+
+       /* execve() with a NULL argv is non-portable; provide an argv[0]. */
+       if (!argv) {
+               default_argv[0] = path;
+               default_argv[1] = NULL;
+               argv = default_argv;
+       }
+
+       child = fork();
+       if (child < 0) {
+               log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
+               free(path);
+               return;
+       }
+
+       if (child == 0) {
+               execve(path, argv, env);
+               /* Only reached when execve() failed. */
+               _exit(127);
+       }
+
+       /* The parent no longer needs the path once the child exists. */
+       free(path);
+
+       /* status is meaningful only if waitpid() actually reaped the child. */
+       if (waitpid(child, &status, 0) < 0) {
+               log(SYSLOG, LOG_ERR, "Cannot wait for trigger %s", trigger);
+               return;
+       }
+
+       if (WIFEXITED(status) && WEXITSTATUS(status)) {
+               log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
+                       trigger, WEXITSTATUS(status));
+       } else if (WIFSIGNALED(status)) {
+               log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
+                       trigger, WTERMSIG(status));
+       }
+}
+
+/*
+ * Check that a trigger script is readable and executable.
+ *
+ * @s: script name; looked up as "$TRIGGER_DIR/<s>" when TRIGGER_DIR is
+ *     set in the environment, otherwise used as given
+ *
+ * Returns the result of access(): 0 when the script can be read and
+ * executed, -1 otherwise. Also returns -1 when the full path cannot be
+ * allocated.
+ */
+int trigger_check(char *s)
+{
+       const char *dir = getenv("TRIGGER_DIR");
+       char *full_path = NULL;
+       int ret;
+
+       /* No trigger directory configured: test the name as it is. */
+       if (!dir)
+               return access(s, R_OK|X_OK);
+
+       if (asprintf(&full_path, "%s/%s", dir, s) < 0)
+               return -1;
+
+       ret = access(full_path, R_OK|X_OK);
+       free(full_path);
+
+       return ret;
+}
diff --git a/trigger.h b/trigger.h
new file mode 100644 (file)
index 0000000..556a7f2
--- /dev/null
+++ b/trigger.h
@@ -0,0 +1,13 @@
+#ifndef __TRIGGER_H__
+#define __TRIGGER_H__
+
+/* Associates a tracepoint event name with the function that sets up its trigger. */
+struct event_trigger {
+        /* Event name compared against the event being enabled */
+        const char *name;
+        /* Called with no arguments when an event with this name is enabled */
+        void (*setup)(void);
+};
+
+/*
+ * Check whether trigger script s (looked up under $TRIGGER_DIR when that
+ * is set) is readable and executable. Returns 0 if so, -1 otherwise.
+ */
+int trigger_check(char *s);
+/*
+ * Run trigger script `trigger` in a child process with the given argv and
+ * environment, and wait for it to finish. `reporter` is only logged.
+ */
+void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter);
+
+
+#endif