]> www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
Add a decoder for Nehalem-specific types
author Mauro Carvalho Chehab <mchehab@redhat.com>
Sat, 18 May 2013 14:35:55 +0000 (11:35 -0300)
committer Mauro Carvalho Chehab <mchehab@redhat.com>
Sat, 18 May 2013 15:04:20 +0000 (12:04 -0300)
Note: Memory Controller-specific decoding was excluded.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Makefile.am
mce-intel-nehalem.c [new file with mode: 0644]
mce-intel.c
ras-mce-handler.h

index 54ac34528d687db6d49797c9d7df93aeb4552d82..6edf2f2527b951d7f05ed668d330355a02472fe4 100644 (file)
@@ -12,7 +12,7 @@ if WITH_AER
 endif
 if WITH_MCE
    rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \
-                       bitfield.c mce-intel-p4-p6.c
+                       bitfield.c mce-intel-p4-p6.c mce-intel-nehalem.c
 endif
 rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
 
diff --git a/mce-intel-nehalem.c b/mce-intel-nehalem.c
new file mode 100644 (file)
index 0000000..f49eb72
--- /dev/null
@@ -0,0 +1,156 @@
+/*
+ * The code below came from Andi Kleen/Intel/SuSe mcelog code,
+ * released under GNU General Public License, v.2
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <string.h>
+#include <stdio.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+/* See IA32 SDM Vol3B Appendix E.3.2 ff */
+
+/* MC1_STATUS error: QPI flags 16..26; nehalem_decode_model() applies this table to e->status */
+static struct field qpi_status[] = {
+       SBITFIELD(16, "QPI header had bad parity"),
+       SBITFIELD(17, "QPI Data packet had bad parity"),
+       SBITFIELD(18, "Number of QPI retries exceeded"),
+       SBITFIELD(19, "Received QPI data packet that was poisoned by sender"),
+       SBITFIELD(20, "QPI reserved 20"),
+       SBITFIELD(21, "QPI reserved 21"),
+       SBITFIELD(22, "QPI received unsupported message encoding"),
+       SBITFIELD(23, "QPI credit type is not supported"),
+       SBITFIELD(24, "Sender sent too many QPI flits to the receiver"),
+       SBITFIELD(25, "QPI Sender sent a failed response to receiver"),
+       SBITFIELD(26, "Clock jitter detected in internal QPI clocking"),
+       {}
+};
+
+static struct field qpi_misc[] = { /* QPI flags decoded from e->misc, only when status has MCI_STATUS_MISCV set */
+       SBITFIELD(14, "QPI misc reserved 14"),
+       SBITFIELD(15, "QPI misc reserved 15"),
+       SBITFIELD(24, "QPI Interleave/Head Indication Bit (IIB)"),
+       {}
+};
+
+static struct numfield qpi_numbers[] = { /* numeric QPI fields decoded from e->misc, only when status has MCI_STATUS_MISCV set */
+       HEXNUMBER(0, 7, "QPI class and opcode of packet with error"),
+       HEXNUMBER(8, 13, "QPI Request Transaction ID"),
+       NUMBERFORCE(16, 18, "QPI Requestor/Home Node ID (RHNID)"),
+       HEXNUMBER(19, 23, "QPI miscreserved 19-23"),
+       {},
+};
+
+static struct field nhm_memory_status[] = { /* memory controller flags 16..24; applied to e->status by nehalem_decode_model() */
+       SBITFIELD(16, "Memory read ECC error"),
+       SBITFIELD(17, "Memory ECC error occurred during scrub"),
+       SBITFIELD(18, "Memory write parity error"),
+       SBITFIELD(19, "Memory error in half of redundant memory"),
+       SBITFIELD(20, "Memory reserved 20"),
+       SBITFIELD(21, "Memory access out of range"),
+       SBITFIELD(22, "Memory internal RTID invalid"),
+       SBITFIELD(23, "Memory address parity error"),
+       SBITFIELD(24, "Memory byte enable parity error"),
+       {}
+};
+
+static struct numfield nhm_memory_status_numbers[] = { /* numeric memory controller fields; applied to e->status */
+       HEXNUMBER(25, 37, "Memory MISC reserved 25..37"),
+       NUMBERFORCE(38, 52, "Memory corrected error count (CORE_ERR_CNT)"),
+       HEXNUMBER(53, 56, "Memory MISC reserved 53..56"),
+       {}
+};
+
+static struct numfield nhm_memory_misc_numbers[] = { /* numeric memory controller fields; applied to e->misc only when status has MCI_STATUS_MISCV set */
+       HEXNUMBERFORCE(0, 7, "Memory transaction Tracker ID (RTId)"),
+       NUMBERFORCE(16, 17, "Memory DIMM ID of error"),
+       NUMBERFORCE(18, 19, "Memory channel ID of error"),
+       HEXNUMBERFORCE(32, 63, "Memory ECC syndrome"),
+       {}
+};
+
+static char *internal_errors[] = { /* sparse message table referenced by internal_error_status; indexes not listed here are NULL */
+       [0x0]  = "No Error",
+       [0x3]  = "Reset firmware did not complete",
+       [0x8]  = "Received an invalid CMPD",
+       [0xa]  = "Invalid Power Management Request",
+       [0xd]  = "Invalid S-state transition",
+       [0x11] = "VID controller does not match POC controller selected",
+       [0x1a] = "MSID from POC does not match CPU MSID",
+};
+
+static struct field internal_error_status[] = { /* applied to e->status when mca == 0x0001 */
+       FIELD(24, internal_errors), /* 24 is presumably the start bit of the index into internal_errors - confirm against bitfield.h */
+       {}
+};
+
+static struct numfield internal_error_numbers[] = { /* reserved ranges of e->status, decoded when mca == 0x0001 */
+       HEXNUMBER(16, 23, "Internal machine check status reserved 16..23"),
+       HEXNUMBER(32, 56, "Internal machine check status reserved 32..56"),
+       {},
+};
+
+/* Nehalem model-specific decode: the low 16 bits of e->status select which tables are applied. */
+/* Tables for e->misc are used only when status has MCI_STATUS_MISCV set; other codes decode nothing. */
+void nehalem_decode_model(struct mce_event *e)
+{
+       uint64_t status = e->status;
+       uint32_t mca = status & 0xffff; /* low 16 bits of status pick the branch below */
+       uint64_t misc = e->misc;
+
+       if ((mca >> 11) == 1) {         /* bits 15..11 == 00001: bus and interconnect QPI */
+               decode_bitfield(e, status, qpi_status);
+               if (status & MCI_STATUS_MISCV) { /* misc is decoded only with MISCV set */
+                       decode_numfield(e, misc, qpi_numbers);
+                       decode_bitfield(e, misc, qpi_misc);
+               }
+       } else if (mca == 0x0001) { /* internal unspecified */
+               decode_bitfield(e, status, internal_error_status);
+               decode_numfield(e, status, internal_error_numbers);
+       } else if ((mca >> 7) == 1) { /* bits 15..7 == 0x001: memory controller */
+               decode_bitfield(e, status, nhm_memory_status);
+               decode_numfield(e, status, nhm_memory_status_numbers);
+               if (status & MCI_STATUS_MISCV) /* misc is decoded only with MISCV set */
+                       decode_numfield(e, misc, nhm_memory_misc_numbers);
+       }
+}
+
+/* Xeon 75xx decode: only mca == 0x0001 (internal unspecified) is handled, with the same tables as Nehalem. */
+void xeon75xx_decode_model(struct mce_event *e)
+{
+       uint64_t status = e->status;
+       uint32_t mca = status & 0xffff; /* low 16 bits of status */
+       if (mca == 0x0001) { /* internal unspecified */
+               decode_bitfield(e, status, internal_error_status);
+               decode_numfield(e, status, internal_error_numbers);
+       }
+#if 0 /* disabled DIMM decode: m and msize are not declared in this function */
+       xeon75xx_decode_dimm(m, msize);
+#endif
+}
+
+#if 0 /* compiled out: takes struct mce and uses EXTRACT, unlike the mce_event-based decoders above */
+/* Nehalem-EP specific DIMM decoding: writes *channel and *dimm only when MISCV is set in m->status */
+void nehalem_memerr_misc(struct mce *m, int *channel, int *dimm)
+{
+       if (m->status & MCI_STATUS_MISCV) {
+               *channel = EXTRACT(m->misc, 18, 19); /* same range as "Memory channel ID of error" */
+               *dimm = EXTRACT(m->misc, 16, 17); /* same range as "Memory DIMM ID of error" */
+       }
+}
+#endif
\ No newline at end of file
index 4bf1a4237c5e64b2c52b7abe50840c4135eef8d9..6b58d756e2fc2a1e4374356ea0233d57f6a79e81 100644 (file)
@@ -355,20 +355,20 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
                        break;
                }
        }
-#if 0
        switch(mce->cputype) {
        case CPU_NEHALEM:
                nehalem_decode_model(e);
                break;
+       case CPU_XEON75XX:
+               xeon75xx_decode_model(e);
+               break;
+#if 0
        case CPU_DUNNINGTON:
                dunnington_decode_model(e);
                break;
        case CPU_TULSA:
                tulsa_decode_model(e);
                break;
-       case CPU_XEON75XX:
-               xeon75xx_decode_model(e);
-               break;
        case CPU_SANDY_BRIDGE:
        case CPU_SANDY_BRIDGE_EP:
                snb_decode_model(ras, e);
@@ -376,8 +376,8 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
        case CPU_IVY_BRIDGE_EPEX:
                ivb_decode_model(ras, e);
                break;
-       }
 #endif
+       }
 
        return 0;
 }
index 1789e9d2c1d9d9c1aea586d315ac45b7b5e07fe6..4488ddcdcca17e15689eb90b42920a975d956c3a 100644 (file)
@@ -112,7 +112,8 @@ unsigned bitfield_msg(char *buf, size_t len, char **bitarray, unsigned array_len
 void p4_decode_model(struct mce_event *e);
 void core2_decode_model(struct mce_event *e);
 void p6old_decode_model(struct mce_event *e);
-
+void nehalem_decode_model(struct mce_event *e);
+void xeon75xx_decode_model(struct mce_event *e);
 
 /* Software defined banks */
 #define MCE_EXTENDED_BANK      128