www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
mce-intel: Add support to decode MCI/MCA
author Mauro Carvalho Chehab <mchehab@redhat.com>
Sat, 18 May 2013 09:26:01 +0000 (06:26 -0300)
committer Mauro Carvalho Chehab <mchehab@redhat.com>
Sat, 18 May 2013 09:38:10 +0000 (06:38 -0300)
As with almost all of the MCE decoding code, this code came from Andi
Kleen's mcelog application.

While the code added here came from p4.c and nehalem.c, it is
used by all Intel CPUs so far.

Intel CPU-specific code parsing is still not implemented.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
mce-intel.c
ras-mce-handler.c
ras-mce-handler.h

index fc03e8b8f325ffee14a752fbc5af3d54344bd2b3..71543d38ac73e6e215f741a81268d6e435125e63 100644 (file)
 #define MCE_THERMAL_BANK       (MCE_EXTENDED_BANK + 0)
 #define MCE_TIMEOUT_BANK        (MCE_EXTENDED_BANK + 90)
 
+/*
+ * Field masks and shifts applied by decode_mca() to the low 16 bits of
+ * e->status. Each field is extracted as (code & *_MASK) >> *_SHIFT.
+ */
+
+/* Fields read in the TLB branch of decode_mca() */
+#define TLB_LL_MASK      0x3  /*bit 0, bit 1*/
+#define TLB_LL_SHIFT     0x0
+#define TLB_TT_MASK      0xc  /*bit 2, bit 3*/
+#define TLB_TT_SHIFT     0x2
+
+/* Fields read in the CACHE branch of decode_mca() */
+#define CACHE_LL_MASK    0x3  /*bit 0, bit 1*/
+#define CACHE_LL_SHIFT   0x0
+#define CACHE_TT_MASK    0xc  /*bit 2, bit 3*/
+#define CACHE_TT_SHIFT   0x2
+#define CACHE_RRRR_MASK  0xF0 /*bit 4, bit 5, bit 6, bit 7 */
+#define CACHE_RRRR_SHIFT 0x4
+
+/* Fields read in the BUS branch of decode_mca() */
+#define BUS_LL_MASK      0x3  /* bit 0, bit 1*/
+#define BUS_LL_SHIFT     0x0
+#define BUS_II_MASK      0xc  /*bit 2, bit 3*/
+#define BUS_II_SHIFT     0x2
+#define BUS_RRRR_MASK    0xF0 /*bit 4, bit 5, bit 6, bit 7 */
+#define BUS_RRRR_SHIFT   0x4
+#define BUS_T_MASK       0x100 /*bit 8*/
+#define BUS_T_SHIFT      0x8
+#define BUS_PP_MASK      0x600 /*bit 9, bit 10*/
+#define BUS_PP_SHIFT     0x9
+
+/* Tested against e->mcgcap in decode_mci() before decoding the tracking field */
+#define MCG_TES_P       (1ULL<<11)   /* Yellow bit cache threshold supported */
+
+
+/* Names for the 2-bit TT field (bits 2-3), indexed by the field value */
+static char *TT[] = {
+       "Instruction",
+       "Data",
+       "Generic",
+       "Unknown"
+};
+
+/* Names for the 2-bit LL field (bits 0-1), indexed by the field value */
+static char *LL[] = {
+       "Level-0",
+       "Level-1",
+       "Level-2",
+       "Level-3"
+};
+
+/*
+ * Value/name pairs for the 4-bit RRRR field (bits 4-7). Searched by
+ * get_RRRR_str(); values without an entry (9-15) have no name here.
+ */
+static struct {
+       uint8_t value;
+       char* str;
+} RRRR [] = {
+       {0, "Generic"},
+       {1, "Read"},
+       {2, "Write" },
+       {3, "Data-Read"},
+       {4, "Data-Write"},
+       {5, "Instruction-Fetch"},
+       {6, "Prefetch"},
+       {7, "Eviction"},
+       {8, "Snoop"}
+};
+
+/* Names for the 2-bit PP field of a BUS code (bits 9-10), indexed by value */
+static char *PP[] = {
+       "Local-CPU-originated-request",
+       "Responed-to-request",
+       "Observed-error-as-third-party",
+       "Generic"
+};
+
+/* Names for the 1-bit T field of a BUS code (bit 8), indexed by value */
+static char *T[] = {
+       "Request-did-not-timeout",
+       "Request-timed-out"
+};
+
+/* Names for the 2-bit II field of a BUS code (bits 2-3), indexed by value */
+static char *II[] = {
+       "Memory-access",
+       "Reserved",
+       "IO",
+       "Other-transaction"
+};
+
+/*
+ * Messages for the codes decode_mca() reports directly: used whenever the
+ * code is smaller than the number of entries in this table.
+ */
+static char *mca_msg[] = {
+       [0] = "No Error",
+       [1] = "Unclassified",
+       [2] = "Microcode ROM parity error",
+       [3] = "External error",
+       [4] = "FRC error",
+       [5] = "Internal parity error",
+};
+
+/*
+ * Names for the tracking value decoded in decode_mci(). Index 0 has no
+ * entry; decode_tracking() only indexes the table with a non-zero value.
+ */
+static char *tracking_msg[] = {
+       [1] = "green",
+       [2] = "yellow",
+       [3] ="res3"
+};
+
+/* Indexed by (e->status >> 55) & 3 in decode_mci() */
+static const char *arstate[4] = {
+       [0] = "UCNA",
+       [1] = "AR",
+       [2] = "SRAO",
+       [3] = "SRAR"
+};
+
+/*
+ * Short names, indexed by (status >> 4) & 7 in decode_memory_controller().
+ * Entries are parallel to mmm_desc[].
+ */
+static char *mmm_mnemonic[] = {
+       "GEN", "RD", "WR", "AC", "MS", "RES5", "RES6", "RES7"
+};
+
+/* Long descriptions, same index as mmm_mnemonic[] */
+static char *mmm_desc[] = {
+       "Generic undefined request",
+       "Memory read error",
+       "Memory write error",
+       "Address/Command error",
+       "Memory scrubbing error",
+       "Reserved 5",
+       "Reserved 6",
+       "Reserved 7"
+};
+
+/*
+ * Decode a memory-controller error code into e->mc_channel and
+ * e->error_msg.
+ *
+ * @e:      event whose mc_channel and error_msg buffers are written
+ * @status: error code; bits 0-3 select the channel (0xf is reported as
+ *          "unspecified"), bits 4-6 index mmm_mnemonic[] and mmm_desc[]
+ */
+void decode_memory_controller(struct mce_event *e, uint32_t status)
+{
+       char channel[30];
+
+       /*
+        * Format the channel into the local buffer first: it is passed as a
+        * "%s" argument below and therefore must hold a valid string.
+        */
+       if ((status & 0xf) == 0xf)
+               snprintf(channel, sizeof(channel), "unspecified");
+       else
+               snprintf(channel, sizeof(channel), "%u", status & 0xf);
+
+       mce_snprintf(e->mc_channel, "%s", channel);
+       mce_snprintf(e->error_msg, "MEMORY CONTROLLER %s_CHANNEL%s_ERR\n",
+                   mmm_mnemonic[(status >> 4) & 7],
+                   channel);
+       mce_snprintf(e->error_msg, "Transaction: %s\n",
+                   mmm_desc[(status >> 4) & 7]);
+}
+
+/*
+ * Report a thermal event. When bit 0 of e->status is set, a throttling
+ * message is written to e->mcgstatus_msg and advice to e->user_action;
+ * otherwise a "below trip temperature" message is written to e->error_msg.
+ *
+ * NOTE(review): the definition names no return type and returns nothing;
+ * it presumably should be declared "static void" -- confirm.
+ */
 static decode_termal_bank(struct mce_event *e)
 {
        if (e->status & 1) {
                mce_snprintf(e->mcgstatus_msg, "Processor %d heated above trip temperature. Throttling enabled.", e->cpu);
                mce_snprintf(e->user_action, "Please check your system cooling. Performance will be impacted");
        } else {
-               sprintf(e->error_msg, "Processor %d below trip temperature. Throttling disabled", e->cpu);
+               mce_snprintf(e->error_msg, "Processor %d below trip temperature. Throttling disabled", e->cpu);
        }
 }
 
@@ -69,8 +194,144 @@ static void bank_name(struct mce_event *e)
        }
 }
 
+/*
+ * Translate an RRRR field value into its name from the RRRR[] table.
+ * Returns "UNKNOWN" when the value has no entry in the table.
+ */
+static char *get_RRRR_str(uint8_t rrrr)
+{
+       unsigned idx = 0;
+
+       while (idx < ARRAY_SIZE(RRRR)) {
+               if (RRRR[idx].value == rrrr)
+                       return RRRR[idx].str;
+               idx++;
+       }
+
+       return "UNKNOWN";
+}
+
+/*
+ * Yield (arr)[val], or "UNKNOWN" when val is not a valid index of arr.
+ * arr must be a real array, since ARRAY_SIZE() is applied to it; val may
+ * be evaluated more than once.
+ */
+#define decode_attr(arr, val)                                  \
+       ((unsigned)(val) >= ARRAY_SIZE(arr) ? "UNKNOWN" : (arr)[val])
+
+/*
+ * Return 1 when value, with its low nr bits shifted out, equals 1 -- that
+ * is, bit nr is set and no higher bit is -- and 0 otherwise.
+ */
+static int test_prefix(int nr, uint32_t value)
+{
+       uint32_t upper = value >> nr;
+
+       return upper == 1;
+}
+
+/*
+ * Decode the low 16 bits of e->status into e->mcastatus_msg.
+ *
+ * @e:        event being decoded; memory-controller codes are forwarded to
+ *            decode_memory_controller()
+ * @track:    tracking value from decode_mci(); only referenced by the
+ *            disabled (#if 0) yellow-trigger code
+ * @ismemerr: out-parameter, must not be NULL; set to 1 when the code is a
+ *            memory-controller error and to 0 otherwise
+ */
+static void decode_mca(struct mce_event *e, uint64_t track, int *ismemerr)
+{
+       uint32_t mca = e->status & 0xffffL;
+
+       /*
+        * Give the out-parameter a defined value on every path; callers may
+        * pass an uninitialized variable.
+        */
+       *ismemerr = 0;
+
+       /* Bit 12 is reported on its own and then removed from the code */
+       if (mca & (1UL << 12)) {
+               mce_snprintf(e->mcastatus_msg,
+                            "corrected filtering (some unreported errors in same region)");
+               mca &= ~(1UL << 12);
+       }
+
+       /* Small codes map directly to a fixed message */
+       if (mca < ARRAY_SIZE(mca_msg)) {
+               mce_snprintf(e->mcastatus_msg, "%s", mca_msg[mca]);
+               return;
+       }
+
+       /*
+        * The remaining codes are classified by their highest set bit:
+        * test_prefix(n, mca) holds when bit n is set and no higher bit is.
+        */
+       if ((mca >> 2) == 3) {
+               mce_snprintf(e->mcastatus_msg,
+                            "%s Generic memory hierarchy error\n",
+                            decode_attr(LL, mca & 3));
+       } else if (test_prefix(4, mca)) {
+               mce_snprintf(e->mcastatus_msg, "%s TLB %s Error\n",
+                               decode_attr(TT, (mca & TLB_TT_MASK) >> TLB_TT_SHIFT),
+                               decode_attr(LL, (mca & TLB_LL_MASK) >> TLB_LL_SHIFT));
+       } else if (test_prefix(8, mca)) {
+               unsigned typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT;
+               unsigned levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT;
+               char *type = decode_attr(TT, typenum);
+               char *level = decode_attr(LL, levelnum);
+               mce_snprintf(e->mcastatus_msg,
+                            "%s CACHE %s %s Error\n", type, level,
+                            get_RRRR_str((mca & CACHE_RRRR_MASK) >>
+                                             CACHE_RRRR_SHIFT));
+#if 0
+               /* FIXME: We shouldn't mix parsing with actions */
+               if (track == 2)
+                       run_yellow_trigger(e->cpu, typenum, levelnum, type, level, e->socket);
+#endif
+       } else if (test_prefix(10, mca)) {
+               if (mca == 0x400)
+                       mce_snprintf(e->mcastatus_msg,
+                                    "Internal Timer error\n");
+               else
+                       mce_snprintf(e->mcastatus_msg,
+                                    "Internal unclassified error: %x\n",
+                                    mca);
+       } else if (test_prefix(11, mca)) {
+               mce_snprintf(e->mcastatus_msg, "BUS %s %s %s %s %s Error\n",
+                            decode_attr(LL, (mca & BUS_LL_MASK) >> BUS_LL_SHIFT),
+                            decode_attr(PP, (mca & BUS_PP_MASK) >> BUS_PP_SHIFT),
+                            get_RRRR_str((mca & BUS_RRRR_MASK) >> BUS_RRRR_SHIFT),
+                            decode_attr(II, (mca & BUS_II_MASK) >> BUS_II_SHIFT),
+                            decode_attr(T, (mca & BUS_T_MASK) >> BUS_T_SHIFT));
+       } else if (test_prefix(7, mca)) {
+               decode_memory_controller(e, mca);
+               *ismemerr = 1;
+       } else
+               mce_snprintf(e->mcastatus_msg, "Unknown Error %x\n", mca);
+}
+
+/*
+ * Report the threshold-based error status carried in track.
+ *
+ * @e:     event whose user_action and mcistatus_msg buffers are written
+ * @track: tracking value; 0 produces no output, other values index
+ *         tracking_msg[] (values beyond the table are ignored)
+ */
+static void decode_tracking(struct mce_event *e, uint64_t track)
+{
+       /*
+        * NOTE(review): the warning is tied to value 1, which tracking_msg[]
+        * names "green", while the disabled yellow trigger in decode_mca()
+        * keys on 2 -- confirm which value should raise it.
+        */
+       if (track == 1)
+               mce_snprintf(e->user_action,
+                            "Large number of corrected cache errors. System operating, but might lead to uncorrected errors soon");
+
+       if (track && track < ARRAY_SIZE(tracking_msg))
+               mce_snprintf(e->mcistatus_msg, "Threshold based error status: %s",
+                            tracking_msg[track]);
+}
+
+/*
+ * Decode the flag bits of e->status into e->mcistatus_msg, decode the
+ * tracking field when applicable, then pass the event to decode_mca().
+ *
+ * @e:        event being decoded
+ * @ismemerr: forwarded unchanged to decode_mca()
+ */
+static void decode_mci(struct mce_event *e, int *ismemerr)
+{
+       uint64_t track = 0;
+
+       /* An invalid status is flagged, but decoding still continues below */
+       if (!(e->status & MCI_STATUS_VAL))
+               mce_snprintf(e->mcistatus_msg, "MCE_INVALID");
+
+       if (e->status & MCI_STATUS_OVER)
+               mce_snprintf(e->mcistatus_msg, "Error_overflow");
+
+       /* FIXME: convert into severity */
+       if (e->status & MCI_STATUS_UC)
+               mce_snprintf(e->mcistatus_msg, "Uncorrected_error");
+       else
+               mce_snprintf(e->mcistatus_msg, "Corrected_error");
+
+
+       if (e->status & MCI_STATUS_EN)
+               mce_snprintf(e->mcistatus_msg, "Error_enabled");
+
+
+       if (e->status & MCI_STATUS_PCC)
+               mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt");
+
+       /* arstate[] is indexed by status bits 55-56 */
+       if (e->status & (MCI_STATUS_S|MCI_STATUS_AR))
+               mce_snprintf(e->mcistatus_msg, "%s\n",
+                            arstate[(e->status >> 55) & 3]);
+
+       /*
+        * The tracking field (status bits 53-54) is decoded only when the
+        * error is not flagged MCI_STATUS_UC, and only when mcgcap is zero
+        * or has MCG_TES_P set. Otherwise track stays 0.
+        */
+       if ((e->mcgcap == 0 || (e->mcgcap & MCG_TES_P)) &&
+           !(e->status & MCI_STATUS_UC)) {
+               track = (e->status >> 53) & 3;
+               decode_tracking(e, track);
+       }
+
+       decode_mca(e, track, ismemerr);
+}
+
 int parse_intel_event(struct ras_events *ras, struct mce_event *e)
 {
+       int ismemerr;
+
        bank_name(e);
 
        if (e->bank == MCE_THERMAL_BANK) {
@@ -78,7 +339,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
                return 0;
        }
        decode_mcg(e);
+       decode_mci(e, &ismemerr);
+
+       /* FIXME: add per-CPU-type specific handlers */
 
        return 0;
 }
-
index a8b4379bde54fd5354e285cfd069bca9fe2a9d49..ff9931cac1657b7ead0f7ef641698823a51d9a8a 100644 (file)
@@ -271,6 +271,12 @@ static void report_mce_event(struct ras_events *ras,
        trace_seq_printf(s, ", status= %d", e->status);
        if (*e->error_msg)
                trace_seq_printf(s, ", %s", e->error_msg);
+       if (*e->mcistatus_msg)
+               trace_seq_printf(s, ", mci=%s", e->mcistatus_msg);
+       if (*e->mcastatus_msg)
+               trace_seq_printf(s, ", mca=%s", e->mcastatus_msg);
+       if (*e->mc_channel)
+               trace_seq_printf(s, ", mc_channel=%s", e->mc_channel);
 
        if (*e->user_action)
                trace_seq_printf(s, " %s", e->user_action);
index 2327a28950e7c4a681571dda11aff24360092d04..0384fa2bb3c379727fa83be9e23e18a75540aa2c 100644 (file)
@@ -67,7 +67,10 @@ struct mce_event {
        char            bank_name[64];
        char            error_msg[4096];
        char            mcgstatus_msg[256];
+       char            mcistatus_msg[1024];
+       char            mcastatus_msg[1024];
        char            user_action[4096];
+       char            mc_channel[256];
 };
 
 struct mce_priv {