- sudo apt-get install -y sqlite3
install:
- autoreconf -vfi
-- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-abrt-report --enable-hisi-ns-decode
+- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode
script:
- make && sudo make install
if WITH_DEVLINK
rasdaemon_SOURCES += ras-devlink-handler.c
endif
+if WITH_DISKERROR
+ rasdaemon_SOURCES += ras-diskerror-handler.c
+endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
- ras-devlink-handler.h
+ ras-devlink-handler.h ras-diskerror-handler.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
])
AM_CONDITIONAL([WITH_DEVLINK], [test x$enable_devlink = xyes])
+AC_ARG_ENABLE([diskerror],
+ AS_HELP_STRING([--enable-diskerror], [enable disk I/O error events (currently experimental)]))
+
+AS_IF([test "x$enable_diskerror" = "xyes"], [
+ AC_DEFINE(HAVE_DISKERROR,1,"have disk I/O errors collect")
+ AC_SUBST([WITH_DISKERROR])
+])
+AM_CONDITIONAL([WITH_DISKERROR], [test x$enable_diskerror = xyes])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
HIP07 SAS HW errors : $enable_hisi_ns_decode
ARM events : $enable_arm
DEVLINK : $enable_devlink
+ Disk I/O errors : $enable_diskerror
EOF
--- /dev/null
+/*
+ * Copyright (C) 2019 Cong Wang <xiyou.wangcong@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/sysmacros.h>
+#include "libtrace/kbuffer.h"
+#include "ras-diskerror-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+
+static const struct {
+ int error;
+ const char *name;
+} blk_errors[] = {
+ { -EOPNOTSUPP, "operation not supported error" },
+ { -ETIMEDOUT, "timeout error" },
+ { -ENOSPC, "critical space allocation error" },
+ { -ENOLINK, "recoverable transport error" },
+ { -EREMOTEIO, "critical target error" },
+ { -EBADE, "critical nexus error" },
+ { -ENODATA, "critical medium error" },
+ { -EILSEQ, "protection error" },
+ { -ENOMEM, "kernel resource error" },
+ { -EBUSY, "device resource error" },
+ { -EAGAIN, "nonblocking retry error" },
+ { -EREMCHG, "dm internal retry error" },
+ { -EIO, "I/O error" },
+};
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
+
+static const char *get_blk_error(int err)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(blk_errors); i++)
+ if (blk_errors[i].error == err)
+ return blk_errors[i].name;
+ return "unknown block error";
+}
+
+int ras_diskerror_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ unsigned long long val;
+ int len;
+ struct ras_events *ras = context;
+ time_t now;
+ struct tm *tm;
+ struct diskerror_event ev;
+ dev_t dev;
+
+ if (ras->filters[DISKERROR_EVENT] &&
+ pevent_filter_match(ras->filters[DISKERROR_EVENT], record) == FILTER_MATCH)
+ return 0;
+ /*
+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ * On previous kernels, the way to properly generate an event would
+ * be to inject a fake one, measure its timestamp and diff it against
+ * gettimeofday. We won't do it here. Instead, let's use uptime,
+ * falling-back to the event report's time, if "uptime" clock is
+ * not available (legacy kernels).
+ */
+
+ if (ras->use_uptime)
+ now = record->ts/user_hz + ras->uptime_diff;
+ else
+ now = time(NULL);
+
+ tm = localtime(&now);
+ if (tm)
+ strftime(ev.timestamp, sizeof(ev.timestamp),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ trace_seq_printf(s, "%s ", ev.timestamp);
+
+ if (pevent_get_field_val(s, event, "dev", record, &val, 1) < 0)
+ return -1;
+ dev = (dev_t)val;
+ asprintf(&ev.dev, "%u:%u", major(dev), minor(dev));
+
+ if (pevent_get_field_val(s, event, "sector", record, &val, 1) < 0)
+ return -1;
+ ev.sector = val;
+
+ if (pevent_get_field_val(s, event, "nr_sector", record, &val, 1) < 0)
+ return -1;
+ ev.nr_sector = (unsigned int)val;
+
+ if (pevent_get_field_val(s, event, "error", record, &val, 1) < 0)
+ return -1;
+ ev.error = get_blk_error((int)val);
+
+ ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1);
+ if (!ev.rwbs)
+ return -1;
+
+ ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1);
+ if (!ev.cmd)
+ return -1;
+
+ /* Insert data into the SGBD */
+#ifdef HAVE_SQLITE3
+ ras_store_diskerror_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+ /* Report event to ABRT */
+ ras_report_diskerror_event(ras, &ev);
+#endif
+ free(ev.dev);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Cong Wang <xiyou.wangcong@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef __RAS_DISKERROR_HANDLER_H
+#define __RAS_DISKERROR_HANDLER_H
+
+#include "ras-events.h"
+#include "libtrace/event-parse.h"
+
+int ras_diskerror_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context);
+
+#endif
#include "ras-mce-handler.h"
#include "ras-extlog-handler.h"
#include "ras-devlink-handler.h"
+#include "ras-diskerror-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
rc |= __toggle_ras_mc_event(ras, "devlink", "devlink_health_report", enable);
#endif
+#ifdef HAVE_DISKERROR
+ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
+#endif
+
free_ras:
free(ras);
return rc;
"devlink", "devlink_health_report");
#endif
+#ifdef HAVE_DISKERROR
+ rc = add_event_handler(ras, pevent, page_size, "block",
+ "block_rq_complete", ras_diskerror_event_handler,
+ "block/block_rq_complete:error==0", DISKERROR_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "block", "block_rq_complete");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");
ARM_EVENT,
EXTLOG_EVENT,
DEVLINK_EVENT,
+ DISKERROR_EVENT,
NR_EVENTS
};
}
#endif
+/*
+ * Table and functions to handle block:block_rq_complete
+ */
+
+#ifdef HAVE_DISKERROR
+static const struct db_fields diskerror_event_fields[] = {
+ { .name="id", .type="INTEGER PRIMARY KEY" },
+ { .name="timestamp", .type="TEXT" },
+ { .name="dev", .type="TEXT" },
+ { .name="sector", .type="INTEGER" },
+ { .name="nr_sector", .type="INTEGER" },
+ { .name="error", .type="TEXT" },
+ { .name="rwbs", .type="TEXT" },
+ { .name="cmd", .type="TEXT" },
+};
+
+static const struct db_table_descriptor diskerror_event_tab = {
+ .name = "disk_errors",
+ .fields = diskerror_event_fields,
+ .num_fields = ARRAY_SIZE(diskerror_event_fields),
+};
+
+int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev)
+{
+ int rc;
+ struct sqlite3_priv *priv = ras->db_priv;
+
+ if (!priv || !priv->stmt_diskerror_event)
+ return 0;
+ log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event);
+
+ sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL);
+ sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL);
+ sqlite3_bind_int64(priv->stmt_diskerror_event, 3, ev->sector);
+ sqlite3_bind_int(priv->stmt_diskerror_event, 4, ev->nr_sector);
+ sqlite3_bind_text(priv->stmt_diskerror_event, 5, ev->error, -1, NULL);
+ sqlite3_bind_text(priv->stmt_diskerror_event, 6, ev->rwbs, -1, NULL);
+ sqlite3_bind_text(priv->stmt_diskerror_event, 7, ev->cmd, -1, NULL);
+
+ rc = sqlite3_step(priv->stmt_diskerror_event);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed to do diskerror_event step on sqlite: error = %d\n", rc);
+ rc = sqlite3_reset(priv->stmt_diskerror_event);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed reset diskerror_event on sqlite: error = %d\n",
+ rc);
+ log(TERM, LOG_INFO, "register inserted at db\n");
+
+ return rc;
+}
+#endif
+
/*
* Generic code
*/
&devlink_event_tab);
#endif
+#ifdef HAVE_DISKERROR
+ rc = ras_mc_create_table(priv, &diskerror_event_tab);
+ if (rc == SQLITE_OK)
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_diskerror_event,
+ &diskerror_event_tab);
+#endif
+
ras->db_priv = priv;
return 0;
}
char *msg;
};
+struct diskerror_event {
+ char timestamp[64];
+ char *dev;
+ unsigned long long sector;
+ unsigned int nr_sector;
+ const char *error;
+ const char *rwbs;
+ const char *cmd;
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
struct ras_arm_event;
struct mce_event;
struct devlink_event;
+struct diskerror_event;
#ifdef HAVE_SQLITE3
#ifdef HAVE_DEVLINK
sqlite3_stmt *stmt_devlink_event;
#endif
+#ifdef HAVE_DISKERROR
+ sqlite3_stmt *stmt_diskerror_event;
+#endif
};
struct db_fields {
int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev);
+int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
+static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
#endif
return 0;
}
+static int set_diskerror_event_backtrace(char *buf, struct diskerror_event *ev) {
+ char bt_buf[MAX_BACKTRACE_SIZE];
+
+ if(!buf || !ev)
+ return -1;
+
+ sprintf(bt_buf, "BACKTRACE=" \
+ "timestamp=%s\n" \
+ "dev=%s\n" \
+ "sector=%llu\n" \
+ "nr_sector=%u\n" \
+ "error=%s\n" \
+ "rwbs=%s\n" \
+ "cmd=%s\n", \
+ ev->timestamp, \
+ ev->dev, \
+ ev->sector, \
+ ev->nr_sector, \
+ ev->error, \
+ ev->rwbs, \
+ ev->cmd);
+
+ strcat(buf, bt_buf);
+
+ return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
case DEVLINK_EVENT:
rc = set_devlink_event_backtrace(buf, (struct devlink_event *)ev);
break;
+ case DISKERROR_EVENT:
+ rc = set_diskerror_event_backtrace(buf, (struct diskerror_event *)ev);
+ break;
default:
return -1;
}
return -1;
}
}
+
+int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev){
+ char buf[MAX_MESSAGE_SIZE];
+ int sockfd = 0;
+ int done = 0;
+ int rc = -1;
+
+ memset(buf, 0, sizeof(buf));
+
+ sockfd = setup_report_socket();
+ if(sockfd < 0){
+ return -1;
+ }
+
+ rc = commit_report_basic(sockfd);
+ if(rc < 0){
+ goto diskerror_fail;
+ }
+
+ rc = commit_report_backtrace(sockfd, DISKERROR_EVENT, ev);
+ if(rc < 0){
+ goto diskerror_fail;
+ }
+
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-diskerror");
+ rc = write(sockfd, buf, strlen(buf) + 1);
+ if(rc < strlen(buf) + 1){
+ goto diskerror_fail;
+ }
+
+ sprintf(buf, "REASON=%s", "disk I/O error");
+ rc = write(sockfd, buf, strlen(buf) + 1);
+ if(rc < strlen(buf) + 1){
+ goto diskerror_fail;
+ }
+
+ done = 1;
+
+diskerror_fail:
+ if(sockfd > 0){
+ close(sockfd);
+ }
+
+ if(done){
+ return 0;
+ }else{
+ return -1;
+ }
+}
int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev);
+int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
#else
static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
+static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
#endif
my ($query, $query_handle, $out);
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
my ($etype, $severity, $etype_string, $severity_string);
- my $dev_name;
+ my ($dev_name, $dev);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
}
$query_handle->finish;
+ # Disk errors
+ $query = "select dev, count(*) from disk_errors group by dev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($dev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$dev has $count errors\n";
+ }
+ if ($out ne "") {
+ print "Disk errors summary:\n$out";
+ } else {
+ print "No disk errors.\n";
+ }
+ $query_handle->finish;
+
# MCE mce_record errors
$query = "select error_msg, count(*) from mce_record group by error_msg";
$query_handle = $dbh->prepare($query);
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
+ my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
}
$query_handle->finish;
+ # Disk errors
+ $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "dev=$dev, ";
+ $out .= "sector=$sector, ";
+ $out .= "nr_sector=$nr_sector, ";
+ $out .= "error='$error', ";
+ $out .= "rwbs='$rwbs', ";
+ $out .= "cmd='$cmd', ";
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Disk errors\n$out\n";
+ } else {
+ print "No disk errors.\n\n";
+ }
+ $query_handle->finish;
+
# MCE mce_record errors
$query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);