/* mce_amd.c — snapshot at revision ded506232865e8e932bc21c87f48170d50db4d97 */
1#include <linux/module.h> 2#include <linux/slab.h> 3 4#include "mce_amd.h" 5 6static struct amd_decoder_ops *fam_ops; 7 8static bool report_gart_errors; 9static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); 10 11void amd_report_gart_errors(bool v) 12{ 13 report_gart_errors = v; 14} 15EXPORT_SYMBOL_GPL(amd_report_gart_errors); 16 17void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) 18{ 19 nb_bus_decoder = f; 20} 21EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 22 23void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) 24{ 25 if (nb_bus_decoder) { 26 WARN_ON(nb_bus_decoder != f); 27 28 nb_bus_decoder = NULL; 29 } 30} 31EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 32 33/* 34 * string representation for the different MCA reported error types, see F3x48 35 * or MSR0000_0411. 36 */ 37 38/* transaction type */ 39const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; 40EXPORT_SYMBOL_GPL(tt_msgs); 41 42/* cache level */ 43const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; 44EXPORT_SYMBOL_GPL(ll_msgs); 45 46/* memory transaction type */ 47const char *rrrr_msgs[] = { 48 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" 49}; 50EXPORT_SYMBOL_GPL(rrrr_msgs); 51 52/* participating processor */ 53const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; 54EXPORT_SYMBOL_GPL(pp_msgs); 55 56/* request timeout */ 57const char *to_msgs[] = { "no timeout", "timed out" }; 58EXPORT_SYMBOL_GPL(to_msgs); 59 60/* memory or i/o */ 61const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 62EXPORT_SYMBOL_GPL(ii_msgs); 63 64/* 65 * Map the 4 or 5 (family-specific) bits of Extended Error code to the 66 * string table. 
67 */ 68const char *ext_msgs[] = { 69 "K8 ECC error", /* 0_0000b */ 70 "CRC error on link", /* 0_0001b */ 71 "Sync error packets on link", /* 0_0010b */ 72 "Master Abort during link operation", /* 0_0011b */ 73 "Target Abort during link operation", /* 0_0100b */ 74 "Invalid GART PTE entry during table walk", /* 0_0101b */ 75 "Unsupported atomic RMW command received", /* 0_0110b */ 76 "WDT error: NB transaction timeout", /* 0_0111b */ 77 "ECC/ChipKill ECC error", /* 0_1000b */ 78 "SVM DEV Error", /* 0_1001b */ 79 "Link Data error", /* 0_1010b */ 80 "Link/L3/Probe Filter Protocol error", /* 0_1011b */ 81 "NB Internal Arrays Parity error", /* 0_1100b */ 82 "DRAM Address/Control Parity error", /* 0_1101b */ 83 "Link Transmission error", /* 0_1110b */ 84 "GART/DEV Table Walk Data error" /* 0_1111b */ 85 "Res 0x100 error", /* 1_0000b */ 86 "Res 0x101 error", /* 1_0001b */ 87 "Res 0x102 error", /* 1_0010b */ 88 "Res 0x103 error", /* 1_0011b */ 89 "Res 0x104 error", /* 1_0100b */ 90 "Res 0x105 error", /* 1_0101b */ 91 "Res 0x106 error", /* 1_0110b */ 92 "Res 0x107 error", /* 1_0111b */ 93 "Res 0x108 error", /* 1_1000b */ 94 "Res 0x109 error", /* 1_1001b */ 95 "Res 0x10A error", /* 1_1010b */ 96 "Res 0x10B error", /* 1_1011b */ 97 "ECC error in L3 Cache Data", /* 1_1100b */ 98 "L3 Cache Tag error", /* 1_1101b */ 99 "L3 Cache LRU Parity error", /* 1_1110b */ 100 "Probe Filter error" /* 1_1111b */ 101}; 102EXPORT_SYMBOL_GPL(ext_msgs); 103 104static bool f10h_dc_mce(u16 ec) 105{ 106 u8 r4 = (ec >> 4) & 0xf; 107 bool ret = false; 108 109 if (r4 == R4_GEN) { 110 pr_cont("during data scrub.\n"); 111 return true; 112 } 113 114 if (MEM_ERROR(ec)) { 115 u8 ll = ec & 0x3; 116 ret = true; 117 118 if (ll == LL_L2) 119 pr_cont("during L1 linefill from L2.\n"); 120 else if (ll == LL_L1) 121 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec)); 122 else 123 ret = false; 124 } 125 return ret; 126} 127 128static bool k8_dc_mce(u16 ec) 129{ 130 if (BUS_ERROR(ec)) { 131 pr_cont("during system 
linefill.\n"); 132 return true; 133 } 134 135 return f10h_dc_mce(ec); 136} 137 138static bool f14h_dc_mce(u16 ec) 139{ 140 u8 r4 = (ec >> 4) & 0xf; 141 u8 ll = ec & 0x3; 142 u8 tt = (ec >> 2) & 0x3; 143 u8 ii = tt; 144 bool ret = true; 145 146 if (MEM_ERROR(ec)) { 147 148 if (tt != TT_DATA || ll != LL_L1) 149 return false; 150 151 switch (r4) { 152 case R4_DRD: 153 case R4_DWR: 154 pr_cont("Data/Tag parity error due to %s.\n", 155 (r4 == R4_DRD ? "load/hw prf" : "store")); 156 break; 157 case R4_EVICT: 158 pr_cont("Copyback parity error on a tag miss.\n"); 159 break; 160 case R4_SNOOP: 161 pr_cont("Tag parity error during snoop.\n"); 162 break; 163 default: 164 ret = false; 165 } 166 } else if (BUS_ERROR(ec)) { 167 168 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG) 169 return false; 170 171 pr_cont("System read data error on a "); 172 173 switch (r4) { 174 case R4_RD: 175 pr_cont("TLB reload.\n"); 176 break; 177 case R4_DWR: 178 pr_cont("store.\n"); 179 break; 180 case R4_DRD: 181 pr_cont("load.\n"); 182 break; 183 default: 184 ret = false; 185 } 186 } else { 187 ret = false; 188 } 189 190 return ret; 191} 192 193static void amd_decode_dc_mce(struct mce *m) 194{ 195 u16 ec = m->status & 0xffff; 196 u8 xec = (m->status >> 16) & 0xf; 197 198 pr_emerg(HW_ERR "Data Cache Error: "); 199 200 /* TLB error signatures are the same across families */ 201 if (TLB_ERROR(ec)) { 202 u8 tt = (ec >> 2) & 0x3; 203 204 if (tt == TT_DATA) { 205 pr_cont("%s TLB %s.\n", LL_MSG(ec), 206 (xec ? 
"multimatch" : "parity error")); 207 return; 208 } 209 else 210 goto wrong_dc_mce; 211 } 212 213 if (!fam_ops->dc_mce(ec)) 214 goto wrong_dc_mce; 215 216 return; 217 218wrong_dc_mce: 219 pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); 220} 221 222static bool k8_ic_mce(u16 ec) 223{ 224 u8 ll = ec & 0x3; 225 u8 r4 = (ec >> 4) & 0xf; 226 bool ret = true; 227 228 if (!MEM_ERROR(ec)) 229 return false; 230 231 if (ll == 0x2) 232 pr_cont("during a linefill from L2.\n"); 233 else if (ll == 0x1) { 234 switch (r4) { 235 case R4_IRD: 236 pr_cont("Parity error during data load.\n"); 237 break; 238 239 case R4_EVICT: 240 pr_cont("Copyback Parity/Victim error.\n"); 241 break; 242 243 case R4_SNOOP: 244 pr_cont("Tag Snoop error.\n"); 245 break; 246 247 default: 248 ret = false; 249 break; 250 } 251 } else 252 ret = false; 253 254 return ret; 255} 256 257static bool f14h_ic_mce(u16 ec) 258{ 259 u8 ll = ec & 0x3; 260 u8 tt = (ec >> 2) & 0x3; 261 u8 r4 = (ec >> 4) & 0xf; 262 bool ret = true; 263 264 if (MEM_ERROR(ec)) { 265 if (tt != 0 || ll != 1) 266 ret = false; 267 268 if (r4 == R4_IRD) 269 pr_cont("Data/tag array parity error for a tag hit.\n"); 270 else if (r4 == R4_SNOOP) 271 pr_cont("Tag error during snoop/victimization.\n"); 272 else 273 ret = false; 274 } 275 return ret; 276} 277 278static void amd_decode_ic_mce(struct mce *m) 279{ 280 u16 ec = m->status & 0xffff; 281 u8 xec = (m->status >> 16) & 0xf; 282 283 pr_emerg(HW_ERR "Instruction Cache Error: "); 284 285 if (TLB_ERROR(ec)) 286 pr_cont("%s TLB %s.\n", LL_MSG(ec), 287 (xec ? "multimatch" : "parity error")); 288 else if (BUS_ERROR(ec)) { 289 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58))); 290 291 pr_cont("during %s.\n", (k8 ? 
"system linefill" : "NB data read")); 292 } else if (fam_ops->ic_mce(ec)) 293 ; 294 else 295 pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); 296} 297 298static void amd_decode_bu_mce(struct mce *m) 299{ 300 u32 ec = m->status & 0xffff; 301 u32 xec = (m->status >> 16) & 0xf; 302 303 pr_emerg(HW_ERR "Bus Unit Error"); 304 305 if (xec == 0x1) 306 pr_cont(" in the write data buffers.\n"); 307 else if (xec == 0x3) 308 pr_cont(" in the victim data buffers.\n"); 309 else if (xec == 0x2 && MEM_ERROR(ec)) 310 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); 311 else if (xec == 0x0) { 312 if (TLB_ERROR(ec)) 313 pr_cont(": %s error in a Page Descriptor Cache or " 314 "Guest TLB.\n", TT_MSG(ec)); 315 else if (BUS_ERROR(ec)) 316 pr_cont(": %s/ECC error in data read from NB: %s.\n", 317 RRRR_MSG(ec), PP_MSG(ec)); 318 else if (MEM_ERROR(ec)) { 319 u8 rrrr = (ec >> 4) & 0xf; 320 321 if (rrrr >= 0x7) 322 pr_cont(": %s error during data copyback.\n", 323 RRRR_MSG(ec)); 324 else if (rrrr <= 0x1) 325 pr_cont(": %s parity/ECC error during data " 326 "access from L2.\n", RRRR_MSG(ec)); 327 else 328 goto wrong_bu_mce; 329 } else 330 goto wrong_bu_mce; 331 } else 332 goto wrong_bu_mce; 333 334 return; 335 336wrong_bu_mce: 337 pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); 338} 339 340static void amd_decode_ls_mce(struct mce *m) 341{ 342 u16 ec = m->status & 0xffff; 343 u8 xec = (m->status >> 16) & 0xf; 344 345 if (boot_cpu_data.x86 == 0x14) { 346 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," 347 " please report on LKML.\n"); 348 return; 349 } 350 351 pr_emerg(HW_ERR "Load Store Error"); 352 353 if (xec == 0x0) { 354 u8 r4 = (ec >> 4) & 0xf; 355 356 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 357 goto wrong_ls_mce; 358 359 pr_cont(" during %s.\n", RRRR_MSG(ec)); 360 } else 361 goto wrong_ls_mce; 362 363 return; 364 365wrong_ls_mce: 366 pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); 367} 368 369void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) 
370{ 371 u32 ec = m->status & 0xffff; 372 u32 nbsh = (u32)(m->status >> 32); 373 u32 nbsl = (u32)m->status; 374 375 /* 376 * GART TLB error reporting is disabled by default. Bail out early. 377 */ 378 if (TLB_ERROR(ec) && !report_gart_errors) 379 return; 380 381 pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); 382 383 /* 384 * F10h, revD can disable ErrCpu[3:0] so check that first and also the 385 * value encoding has changed so interpret those differently 386 */ 387 if ((boot_cpu_data.x86 == 0x10) && 388 (boot_cpu_data.x86_model > 7)) { 389 if (nbsh & K8_NBSH_ERR_CPU_VAL) 390 pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); 391 } else { 392 u8 assoc_cpus = nbsh & 0xf; 393 394 if (assoc_cpus > 0) 395 pr_cont(", core: %d", fls(assoc_cpus) - 1); 396 397 pr_cont("\n"); 398 } 399 400 pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); 401 402 if (BUS_ERROR(ec) && nb_bus_decoder) 403 nb_bus_decoder(node_id, m, nbcfg); 404} 405EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 406 407static void amd_decode_fr_mce(struct mce *m) 408{ 409 /* we have only one error signature so match all fields at once. */ 410 if ((m->status & 0xffff) == 0x0f0f) 411 pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); 412 else 413 pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); 414} 415 416static inline void amd_decode_err_code(u16 ec) 417{ 418 if (TLB_ERROR(ec)) { 419 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", 420 TT_MSG(ec), LL_MSG(ec)); 421 } else if (MEM_ERROR(ec)) { 422 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n", 423 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); 424 } else if (BUS_ERROR(ec)) { 425 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " 426 "Participating Processor: %s\n", 427 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), 428 PP_MSG(ec)); 429 } else 430 pr_emerg(HW_ERR "Huh? 
Unknown MCE error 0x%x\n", ec); 431} 432 433int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 434{ 435 struct mce *m = (struct mce *)data; 436 int node, ecc; 437 438 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); 439 440 pr_cont("%sorrected error, other errors lost: %s, " 441 "CPU context corrupt: %s", 442 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), 443 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), 444 ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); 445 446 /* do the two bits[14:13] together */ 447 ecc = (m->status >> 45) & 0x3; 448 if (ecc) 449 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); 450 451 pr_cont("\n"); 452 453 switch (m->bank) { 454 case 0: 455 amd_decode_dc_mce(m); 456 break; 457 458 case 1: 459 amd_decode_ic_mce(m); 460 break; 461 462 case 2: 463 amd_decode_bu_mce(m); 464 break; 465 466 case 3: 467 amd_decode_ls_mce(m); 468 break; 469 470 case 4: 471 node = amd_get_nb_id(m->extcpu); 472 amd_decode_nb_mce(node, m, 0); 473 break; 474 475 case 5: 476 amd_decode_fr_mce(m); 477 break; 478 479 default: 480 break; 481 } 482 483 amd_decode_err_code(m->status & 0xffff); 484 485 return NOTIFY_STOP; 486} 487EXPORT_SYMBOL_GPL(amd_decode_mce); 488 489static struct notifier_block amd_mce_dec_nb = { 490 .notifier_call = amd_decode_mce, 491}; 492 493static int __init mce_amd_init(void) 494{ 495 /* 496 * We can decode MCEs for K8, F10h and F11h CPUs: 497 */ 498 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 499 return 0; 500 501 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 502 return 0; 503 504 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); 505 if (!fam_ops) 506 return -ENOMEM; 507 508 switch (boot_cpu_data.x86) { 509 case 0xf: 510 fam_ops->dc_mce = k8_dc_mce; 511 fam_ops->ic_mce = k8_ic_mce; 512 break; 513 514 case 0x10: 515 fam_ops->dc_mce = f10h_dc_mce; 516 fam_ops->ic_mce = k8_ic_mce; 517 break; 518 519 case 0x14: 520 fam_ops->dc_mce = f14h_dc_mce; 521 fam_ops->ic_mce = f14h_ic_mce; 522 break; 523 524 
default: 525 printk(KERN_WARNING "Huh? What family is that: %d?!\n", 526 boot_cpu_data.x86); 527 kfree(fam_ops); 528 return -EINVAL; 529 } 530 531 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); 532 533 return 0; 534} 535early_initcall(mce_amd_init); 536 537#ifdef MODULE 538static void __exit mce_amd_exit(void) 539{ 540 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); 541 kfree(fam_ops); 542} 543 544MODULE_DESCRIPTION("AMD MCE decoder"); 545MODULE_ALIAS("edac-mce-amd"); 546MODULE_LICENSE("GPL"); 547module_exit(mce_amd_exit); 548#endif 549