mce_amd.c revision 5ce88f6ea6bef929f59f9468413f922c9a486fa4
1#include <linux/module.h> 2#include <linux/slab.h> 3 4#include "mce_amd.h" 5 6static struct amd_decoder_ops *fam_ops; 7 8static u8 nb_err_cpumask = 0xf; 9 10static bool report_gart_errors; 11static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); 12 13void amd_report_gart_errors(bool v) 14{ 15 report_gart_errors = v; 16} 17EXPORT_SYMBOL_GPL(amd_report_gart_errors); 18 19void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) 20{ 21 nb_bus_decoder = f; 22} 23EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 24 25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) 26{ 27 if (nb_bus_decoder) { 28 WARN_ON(nb_bus_decoder != f); 29 30 nb_bus_decoder = NULL; 31 } 32} 33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 34 35/* 36 * string representation for the different MCA reported error types, see F3x48 37 * or MSR0000_0411. 38 */ 39 40/* transaction type */ 41const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; 42EXPORT_SYMBOL_GPL(tt_msgs); 43 44/* cache level */ 45const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; 46EXPORT_SYMBOL_GPL(ll_msgs); 47 48/* memory transaction type */ 49const char *rrrr_msgs[] = { 50 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" 51}; 52EXPORT_SYMBOL_GPL(rrrr_msgs); 53 54/* participating processor */ 55const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; 56EXPORT_SYMBOL_GPL(pp_msgs); 57 58/* request timeout */ 59const char *to_msgs[] = { "no timeout", "timed out" }; 60EXPORT_SYMBOL_GPL(to_msgs); 61 62/* memory or i/o */ 63const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 64EXPORT_SYMBOL_GPL(ii_msgs); 65 66static const char *f10h_nb_mce_desc[] = { 67 "HT link data error", 68 "Protocol error (link, L3, probe filter, etc.)", 69 "Parity error in NB-internal arrays", 70 "Link Retry due to IO link transmission error", 71 "L3 ECC data cache error", 72 "ECC error in L3 cache tag", 73 "L3 LRU parity bits error", 74 "ECC Error in the Probe Filter directory" 75}; 76 77static bool f10h_dc_mce(u16 ec) 78{ 79 u8 r4 = (ec >> 4) & 0xf; 80 bool ret = false; 81 82 if (r4 == R4_GEN) { 83 pr_cont("during data scrub.\n"); 84 return true; 85 } 86 87 if (MEM_ERROR(ec)) { 88 u8 ll = ec & 0x3; 89 ret = true; 90 91 if (ll == LL_L2) 92 pr_cont("during L1 linefill from L2.\n"); 93 else if (ll == LL_L1) 94 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec)); 95 else 96 ret = false; 97 } 98 return ret; 99} 100 101static bool k8_dc_mce(u16 ec) 102{ 103 if (BUS_ERROR(ec)) { 104 pr_cont("during system linefill.\n"); 105 return true; 106 } 107 108 return f10h_dc_mce(ec); 109} 110 111static bool f14h_dc_mce(u16 ec) 112{ 113 u8 r4 = (ec >> 4) & 0xf; 114 u8 ll = ec & 0x3; 115 u8 tt = (ec >> 2) & 0x3; 116 u8 ii = tt; 117 bool ret = true; 118 119 if (MEM_ERROR(ec)) { 120 121 if (tt != TT_DATA || ll != LL_L1) 122 return false; 123 124 switch (r4) { 125 case R4_DRD: 126 case R4_DWR: 127 pr_cont("Data/Tag parity error due to %s.\n", 128 (r4 == R4_DRD ? "load/hw prf" : "store")); 129 break; 130 case R4_EVICT: 131 pr_cont("Copyback parity error on a tag miss.\n"); 132 break; 133 case R4_SNOOP: 134 pr_cont("Tag parity error during snoop.\n"); 135 break; 136 default: 137 ret = false; 138 } 139 } else if (BUS_ERROR(ec)) { 140 141 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG) 142 return false; 143 144 pr_cont("System read data error on a "); 145 146 switch (r4) { 147 case R4_RD: 148 pr_cont("TLB reload.\n"); 149 break; 150 case R4_DWR: 151 pr_cont("store.\n"); 152 break; 153 case R4_DRD: 154 pr_cont("load.\n"); 155 break; 156 default: 157 ret = false; 158 } 159 } else { 160 ret = false; 161 } 162 163 return ret; 164} 165 166static void amd_decode_dc_mce(struct mce *m) 167{ 168 u16 ec = m->status & 0xffff; 169 u8 xec = (m->status >> 16) & 0xf; 170 171 pr_emerg(HW_ERR "Data Cache Error: "); 172 173 /* TLB error signatures are the same across families */ 174 if (TLB_ERROR(ec)) { 175 u8 tt = (ec >> 2) & 0x3; 176 177 if (tt == TT_DATA) { 178 pr_cont("%s TLB %s.\n", LL_MSG(ec), 179 (xec ? "multimatch" : "parity error")); 180 return; 181 } 182 else 183 goto wrong_dc_mce; 184 } 185 186 if (!fam_ops->dc_mce(ec)) 187 goto wrong_dc_mce; 188 189 return; 190 191wrong_dc_mce: 192 pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); 193} 194 195static bool k8_ic_mce(u16 ec) 196{ 197 u8 ll = ec & 0x3; 198 u8 r4 = (ec >> 4) & 0xf; 199 bool ret = true; 200 201 if (!MEM_ERROR(ec)) 202 return false; 203 204 if (ll == 0x2) 205 pr_cont("during a linefill from L2.\n"); 206 else if (ll == 0x1) { 207 switch (r4) { 208 case R4_IRD: 209 pr_cont("Parity error during data load.\n"); 210 break; 211 212 case R4_EVICT: 213 pr_cont("Copyback Parity/Victim error.\n"); 214 break; 215 216 case R4_SNOOP: 217 pr_cont("Tag Snoop error.\n"); 218 break; 219 220 default: 221 ret = false; 222 break; 223 } 224 } else 225 ret = false; 226 227 return ret; 228} 229 230static bool f14h_ic_mce(u16 ec) 231{ 232 u8 ll = ec & 0x3; 233 u8 tt = (ec >> 2) & 0x3; 234 u8 r4 = (ec >> 4) & 0xf; 235 bool ret = true; 236 237 if (MEM_ERROR(ec)) { 238 if (tt != 0 || ll != 1) 239 ret = false; 240 241 if (r4 == R4_IRD) 242 pr_cont("Data/tag array parity error for a tag hit.\n"); 243 else if (r4 == R4_SNOOP) 244 pr_cont("Tag error during snoop/victimization.\n"); 245 else 246 ret = false; 247 } 248 return ret; 249} 250 251static void amd_decode_ic_mce(struct mce *m) 252{ 253 u16 ec = m->status & 0xffff; 254 u8 xec = (m->status >> 16) & 0xf; 255 256 pr_emerg(HW_ERR "Instruction Cache Error: "); 257 258 if (TLB_ERROR(ec)) 259 pr_cont("%s TLB %s.\n", LL_MSG(ec), 260 (xec ? "multimatch" : "parity error")); 261 else if (BUS_ERROR(ec)) { 262 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58))); 263 264 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); 265 } else if (fam_ops->ic_mce(ec)) 266 ; 267 else 268 pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); 269} 270 271static void amd_decode_bu_mce(struct mce *m) 272{ 273 u32 ec = m->status & 0xffff; 274 u32 xec = (m->status >> 16) & 0xf; 275 276 pr_emerg(HW_ERR "Bus Unit Error"); 277 278 if (xec == 0x1) 279 pr_cont(" in the write data buffers.\n"); 280 else if (xec == 0x3) 281 pr_cont(" in the victim data buffers.\n"); 282 else if (xec == 0x2 && MEM_ERROR(ec)) 283 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); 284 else if (xec == 0x0) { 285 if (TLB_ERROR(ec)) 286 pr_cont(": %s error in a Page Descriptor Cache or " 287 "Guest TLB.\n", TT_MSG(ec)); 288 else if (BUS_ERROR(ec)) 289 pr_cont(": %s/ECC error in data read from NB: %s.\n", 290 RRRR_MSG(ec), PP_MSG(ec)); 291 else if (MEM_ERROR(ec)) { 292 u8 rrrr = (ec >> 4) & 0xf; 293 294 if (rrrr >= 0x7) 295 pr_cont(": %s error during data copyback.\n", 296 RRRR_MSG(ec)); 297 else if (rrrr <= 0x1) 298 pr_cont(": %s parity/ECC error during data " 299 "access from L2.\n", RRRR_MSG(ec)); 300 else 301 goto wrong_bu_mce; 302 } else 303 goto wrong_bu_mce; 304 } else 305 goto wrong_bu_mce; 306 307 return; 308 309wrong_bu_mce: 310 pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); 311} 312 313static void amd_decode_ls_mce(struct mce *m) 314{ 315 u16 ec = m->status & 0xffff; 316 u8 xec = (m->status >> 16) & 0xf; 317 318 if (boot_cpu_data.x86 == 0x14) { 319 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," 320 " please report on LKML.\n"); 321 return; 322 } 323 324 pr_emerg(HW_ERR "Load Store Error"); 325 326 if (xec == 0x0) { 327 u8 r4 = (ec >> 4) & 0xf; 328 329 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 330 goto wrong_ls_mce; 331 332 pr_cont(" during %s.\n", RRRR_MSG(ec)); 333 } else 334 goto wrong_ls_mce; 335 336 return; 337 338wrong_ls_mce: 339 pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); 340} 341 342static bool k8_nb_mce(u16 ec, u8 xec) 343{ 344 bool ret = true; 345 346 switch (xec) { 347 case 0x1: 348 pr_cont("CRC error detected on HT link.\n"); 349 break; 350 351 case 0x5: 352 pr_cont("Invalid GART PTE entry during GART table walk.\n"); 353 break; 354 355 case 0x6: 356 pr_cont("Unsupported atomic RMW received from an IO link.\n"); 357 break; 358 359 case 0x0: 360 case 0x8: 361 pr_cont("DRAM ECC error detected on the NB.\n"); 362 break; 363 364 case 0xd: 365 pr_cont("Parity error on the DRAM addr/ctl signals.\n"); 366 break; 367 368 default: 369 ret = false; 370 break; 371 } 372 373 return ret; 374} 375 376static bool f10h_nb_mce(u16 ec, u8 xec) 377{ 378 bool ret = true; 379 u8 offset = 0; 380 381 if (k8_nb_mce(ec, xec)) 382 return true; 383 384 switch(xec) { 385 case 0xa ... 0xc: 386 offset = 10; 387 break; 388 389 case 0xe: 390 offset = 11; 391 break; 392 393 case 0xf: 394 if (TLB_ERROR(ec)) 395 pr_cont("GART Table Walk data error.\n"); 396 else if (BUS_ERROR(ec)) 397 pr_cont("DMA Exclusion Vector Table Walk error.\n"); 398 else 399 ret = false; 400 401 goto out; 402 break; 403 404 case 0x1c ... 0x1f: 405 offset = 24; 406 break; 407 408 default: 409 ret = false; 410 411 goto out; 412 break; 413 } 414 415 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); 416 417out: 418 return ret; 419} 420 421static bool f14h_nb_mce(u16 ec, u8 xec) 422{ 423 return false; 424} 425 426void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) 427{ 428 u8 xec = (m->status >> 16) & 0x1f; 429 u16 ec = m->status & 0xffff; 430 u32 nbsh = (u32)(m->status >> 32); 431 432 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); 433 434 /* 435 * F10h, revD can disable ErrCpu[3:0] so check that first and also the 436 * value encoding has changed so interpret those differently 437 */ 438 if ((boot_cpu_data.x86 == 0x10) && 439 (boot_cpu_data.x86_model > 7)) { 440 if (nbsh & K8_NBSH_ERR_CPU_VAL) 441 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); 442 } else { 443 u8 assoc_cpus = nbsh & nb_err_cpumask; 444 445 if (assoc_cpus > 0) 446 pr_cont(", core: %d", fls(assoc_cpus) - 1); 447 } 448 449 switch (xec) { 450 case 0x2: 451 pr_cont("Sync error (sync packets on HT link detected).\n"); 452 return; 453 454 case 0x3: 455 pr_cont("HT Master abort.\n"); 456 return; 457 458 case 0x4: 459 pr_cont("HT Target abort.\n"); 460 return; 461 462 case 0x7: 463 pr_cont("NB Watchdog timeout.\n"); 464 return; 465 466 case 0x9: 467 pr_cont("SVM DMA Exclusion Vector error.\n"); 468 return; 469 470 default: 471 break; 472 } 473 474 if (!fam_ops->nb_mce(ec, xec)) 475 goto wrong_nb_mce; 476 477 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) 478 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) 479 nb_bus_decoder(node_id, m, nbcfg); 480 481 return; 482 483wrong_nb_mce: 484 pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); 485} 486EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 487 488static void amd_decode_fr_mce(struct mce *m) 489{ 490 /* we have only one error signature so match all fields at once. */ 491 if ((m->status & 0xffff) == 0x0f0f) 492 pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); 493 else 494 pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); 495} 496 497static inline void amd_decode_err_code(u16 ec) 498{ 499 if (TLB_ERROR(ec)) { 500 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", 501 TT_MSG(ec), LL_MSG(ec)); 502 } else if (MEM_ERROR(ec)) { 503 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n", 504 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); 505 } else if (BUS_ERROR(ec)) { 506 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " 507 "Participating Processor: %s\n", 508 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), 509 PP_MSG(ec)); 510 } else 511 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); 512} 513 514/* 515 * Filter out unwanted MCE signatures here. 516 */ 517static bool amd_filter_mce(struct mce *m) 518{ 519 u8 xec = (m->status >> 16) & 0x1f; 520 521 /* 522 * NB GART TLB error reporting is disabled by default. 523 */ 524 if (m->bank == 4 && xec == 0x5 && !report_gart_errors) 525 return true; 526 527 return false; 528} 529 530int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 531{ 532 struct mce *m = (struct mce *)data; 533 int node, ecc; 534 535 if (amd_filter_mce(m)) 536 return NOTIFY_STOP; 537 538 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); 539 540 pr_cont("%sorrected error, other errors lost: %s, " 541 "CPU context corrupt: %s", 542 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), 543 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), 544 ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); 545 546 /* do the two bits[14:13] together */ 547 ecc = (m->status >> 45) & 0x3; 548 if (ecc) 549 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); 550 551 pr_cont("\n"); 552 553 switch (m->bank) { 554 case 0: 555 amd_decode_dc_mce(m); 556 break; 557 558 case 1: 559 amd_decode_ic_mce(m); 560 break; 561 562 case 2: 563 amd_decode_bu_mce(m); 564 break; 565 566 case 3: 567 amd_decode_ls_mce(m); 568 break; 569 570 case 4: 571 node = amd_get_nb_id(m->extcpu); 572 amd_decode_nb_mce(node, m, 0); 573 break; 574 575 case 5: 576 amd_decode_fr_mce(m); 577 break; 578 579 default: 580 break; 581 } 582 583 amd_decode_err_code(m->status & 0xffff); 584 585 return NOTIFY_STOP; 586} 587EXPORT_SYMBOL_GPL(amd_decode_mce); 588 589static struct notifier_block amd_mce_dec_nb = { 590 .notifier_call = amd_decode_mce, 591}; 592 593static int __init mce_amd_init(void) 594{ 595 /* 596 * We can decode MCEs for K8, F10h and F11h CPUs: 597 */ 598 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 599 return 0; 600 601 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 602 return 0; 603 604 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); 605 if (!fam_ops) 606 return -ENOMEM; 607 608 switch (boot_cpu_data.x86) { 609 case 0xf: 610 fam_ops->dc_mce = k8_dc_mce; 611 fam_ops->ic_mce = k8_ic_mce; 612 fam_ops->nb_mce = k8_nb_mce; 613 break; 614 615 case 0x10: 616 fam_ops->dc_mce = f10h_dc_mce; 617 fam_ops->ic_mce = k8_ic_mce; 618 fam_ops->nb_mce = f10h_nb_mce; 619 break; 620 621 case 0x14: 622 nb_err_cpumask = 0x3; 623 fam_ops->dc_mce = f14h_dc_mce; 624 fam_ops->ic_mce = f14h_ic_mce; 625 fam_ops->nb_mce = f14h_nb_mce; 626 break; 627 628 default: 629 printk(KERN_WARNING "Huh? What family is that: %d?!\n", 630 boot_cpu_data.x86); 631 kfree(fam_ops); 632 return -EINVAL; 633 } 634 635 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); 636 637 return 0; 638} 639early_initcall(mce_amd_init); 640 641#ifdef MODULE 642static void __exit mce_amd_exit(void) 643{ 644 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); 645 kfree(fam_ops); 646} 647 648MODULE_DESCRIPTION("AMD MCE decoder"); 649MODULE_ALIAS("edac-mce-amd"); 650MODULE_LICENSE("GPL"); 651module_exit(mce_amd_exit); 652#endif 653