1/* 2 * MCE grading rules. 3 * Copyright 2008, 2009 Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License 7 * as published by the Free Software Foundation; version 2 8 * of the License. 9 * 10 * Author: Andi Kleen 11 */ 12#include <linux/kernel.h> 13#include <linux/seq_file.h> 14#include <linux/init.h> 15#include <linux/debugfs.h> 16#include <asm/mce.h> 17 18#include "mce-internal.h" 19 20/* 21 * Grade an mce by severity. In general the most severe ones are processed 22 * first. Since there are quite a lot of combinations test the bits in a 23 * table-driven way. The rules are simply processed in order, first 24 * match wins. 25 * 26 * Note this is only used for machine check exceptions, the corrected 27 * errors use much simpler rules. The exceptions still check for the corrected 28 * errors, but only to leave them alone for the CMCI handler (except for 29 * panic situations) 30 */ 31 32enum context { IN_KERNEL = 1, IN_USER = 2 }; 33enum ser { SER_REQUIRED = 1, NO_SER = 2 }; 34 35static struct severity { 36 u64 mask; 37 u64 result; 38 unsigned char sev; 39 unsigned char mcgmask; 40 unsigned char mcgres; 41 unsigned char ser; 42 unsigned char context; 43 unsigned char covered; 44 char *msg; 45} severities[] = { 46#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } 47#define KERNEL .context = IN_KERNEL 48#define USER .context = IN_USER 49#define SER .ser = SER_REQUIRED 50#define NOSER .ser = NO_SER 51#define BITCLR(x) .mask = x, .result = 0 52#define BITSET(x) .mask = x, .result = x 53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y 54#define MASK(x, y) .mask = x, .result = y 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 57#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) 58#define MCACOD 0xffff 59/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ 60#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ 61#define MCACOD_SCRUBMSK 0xfff0 62#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ 63#define MCACOD_DATA 0x0134 /* Data Load */ 64#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ 65 66 MCESEV( 67 NO, "Invalid", 68 BITCLR(MCI_STATUS_VAL) 69 ), 70 MCESEV( 71 NO, "Not enabled", 72 BITCLR(MCI_STATUS_EN) 73 ), 74 MCESEV( 75 PANIC, "Processor context corrupt", 76 BITSET(MCI_STATUS_PCC) 77 ), 78 /* When MCIP is not set something is very confused */ 79 MCESEV( 80 PANIC, "MCIP not set in MCA handler", 81 MCGMASK(MCG_STATUS_MCIP, 0) 82 ), 83 /* Neither return not error IP -- no chance to recover -> PANIC */ 84 MCESEV( 85 PANIC, "Neither restart nor error IP", 86 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) 87 ), 88 MCESEV( 89 PANIC, "In kernel and no restart IP", 90 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) 91 ), 92 MCESEV( 93 KEEP, "Corrected error", 94 NOSER, BITCLR(MCI_STATUS_UC) 95 ), 96 97 /* ignore OVER for UCNA */ 98 MCESEV( 99 KEEP, "Uncorrected no action required", 100 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) 101 ), 102 MCESEV( 103 PANIC, "Illegal combination (UCNA with AR=1)", 104 SER, 105 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) 106 ), 107 MCESEV( 108 KEEP, "Non signalled machine check", 109 SER, BITCLR(MCI_STATUS_S) 110 ), 111 112 MCESEV( 113 PANIC, "Action required with lost events", 114 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) 115 ), 116 117 /* known AR MCACODs: */ 118#ifdef CONFIG_MEMORY_FAILURE 119 MCESEV( 120 KEEP, "HT thread notices Action required: data load error", 121 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), 122 MCGMASK(MCG_STATUS_EIPV, 0) 123 ), 124 MCESEV( 125 AR, "Action required: data load error", 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), 127 USER 128 ), 129#endif 130 MCESEV( 131 PANIC, "Action required: unknown MCACOD", 132 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) 133 ), 134 135 /* known AO MCACODs: */ 136 MCESEV( 137 AO, "Action optional: memory scrubbing error", 138 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) 139 ), 140 MCESEV( 141 AO, "Action optional: last level cache writeback error", 142 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) 143 ), 144 MCESEV( 145 SOME, "Action optional: unknown MCACOD", 146 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) 147 ), 148 MCESEV( 149 SOME, "Action optional with lost events", 150 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) 151 ), 152 153 MCESEV( 154 PANIC, "Overflowed uncorrected", 155 BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) 156 ), 157 MCESEV( 158 UC, "Uncorrected", 159 BITSET(MCI_STATUS_UC) 160 ), 161 MCESEV( 162 SOME, "No match", 163 BITSET(0) 164 ) /* always matches. keep at end */ 165}; 166 167/* 168 * If mcgstatus indicated that ip/cs on the stack were 169 * no good, then "m->cs" will be zero and we will have 170 * to assume the worst case (IN_KERNEL) as we actually 171 * have no idea what we were executing when the machine 172 * check hit. 173 * If we do have a good "m->cs" (or a faked one in the 174 * case we were executing in VM86 mode) we can use it to 175 * distinguish an exception taken in user from from one 176 * taken in the kernel. 177 */ 178static int error_context(struct mce *m) 179{ 180 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 181} 182 183int mce_severity(struct mce *m, int tolerant, char **msg) 184{ 185 enum context ctx = error_context(m); 186 struct severity *s; 187 188 for (s = severities;; s++) { 189 if ((m->status & s->mask) != s->result) 190 continue; 191 if ((m->mcgstatus & s->mcgmask) != s->mcgres) 192 continue; 193 if (s->ser == SER_REQUIRED && !mce_ser) 194 continue; 195 if (s->ser == NO_SER && mce_ser) 196 continue; 197 if (s->context && ctx != s->context) 198 continue; 199 if (msg) 200 *msg = s->msg; 201 s->covered = 1; 202 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { 203 if (panic_on_oops || tolerant < 1) 204 return MCE_PANIC_SEVERITY; 205 } 206 return s->sev; 207 } 208} 209 210#ifdef CONFIG_DEBUG_FS 211static void *s_start(struct seq_file *f, loff_t *pos) 212{ 213 if (*pos >= ARRAY_SIZE(severities)) 214 return NULL; 215 return &severities[*pos]; 216} 217 218static void *s_next(struct seq_file *f, void *data, loff_t *pos) 219{ 220 if (++(*pos) >= ARRAY_SIZE(severities)) 221 return NULL; 222 return &severities[*pos]; 223} 224 225static void s_stop(struct seq_file *f, void *data) 226{ 227} 228 229static int s_show(struct seq_file *f, void *data) 230{ 231 struct severity *ser = data; 232 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); 233 return 0; 234} 235 236static const struct seq_operations severities_seq_ops = { 237 .start = s_start, 238 .next = s_next, 239 .stop = s_stop, 240 .show = s_show, 241}; 242 243static int severities_coverage_open(struct inode *inode, struct file *file) 244{ 245 return seq_open(file, &severities_seq_ops); 246} 247 248static ssize_t severities_coverage_write(struct file *file, 249 const char __user *ubuf, 250 size_t count, loff_t *ppos) 251{ 252 int i; 253 for (i = 0; i < ARRAY_SIZE(severities); i++) 254 severities[i].covered = 0; 255 return count; 256} 257 258static const struct file_operations severities_coverage_fops = { 259 .open = severities_coverage_open, 260 .release = seq_release, 261 .read = seq_read, 262 .write = severities_coverage_write, 263 .llseek = seq_lseek, 264}; 265 266static int __init severities_debugfs_init(void) 267{ 268 struct dentry *dmce, *fsev; 269 270 dmce = mce_get_debugfs_dir(); 271 if (!dmce) 272 goto err_out; 273 274 fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, 275 &severities_coverage_fops); 276 if (!fsev) 277 goto err_out; 278 279 return 0; 280 281err_out: 282 return -ENOMEM; 283} 284late_initcall(severities_debugfs_init); 285#endif /* CONFIG_DEBUG_FS */ 286