/* mce_amd.c revision 86039cd401e1780573733870f9c0bd458fc96ea2 */
1#include <linux/module.h> 2#include <linux/slab.h> 3 4#include "mce_amd.h" 5 6static struct amd_decoder_ops *fam_ops; 7 8static u8 xec_mask = 0xf; 9static u8 nb_err_cpumask = 0xf; 10 11static bool report_gart_errors; 12static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); 13 14void amd_report_gart_errors(bool v) 15{ 16 report_gart_errors = v; 17} 18EXPORT_SYMBOL_GPL(amd_report_gart_errors); 19 20void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) 21{ 22 nb_bus_decoder = f; 23} 24EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 25 26void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) 27{ 28 if (nb_bus_decoder) { 29 WARN_ON(nb_bus_decoder != f); 30 31 nb_bus_decoder = NULL; 32 } 33} 34EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 35 36/* 37 * string representation for the different MCA reported error types, see F3x48 38 * or MSR0000_0411. 39 */ 40 41/* transaction type */ 42const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; 43EXPORT_SYMBOL_GPL(tt_msgs); 44 45/* cache level */ 46const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; 47EXPORT_SYMBOL_GPL(ll_msgs); 48 49/* memory transaction type */ 50const char *rrrr_msgs[] = { 51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" 52}; 53EXPORT_SYMBOL_GPL(rrrr_msgs); 54 55/* participating processor */ 56const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; 57EXPORT_SYMBOL_GPL(pp_msgs); 58 59/* request timeout */ 60const char *to_msgs[] = { "no timeout", "timed out" }; 61EXPORT_SYMBOL_GPL(to_msgs); 62 63/* memory or i/o */ 64const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 65EXPORT_SYMBOL_GPL(ii_msgs); 66 67static const char *f10h_nb_mce_desc[] = { 68 "HT link data error", 69 "Protocol error (link, L3, probe filter, etc.)", 70 "Parity error in NB-internal arrays", 71 "Link Retry due to IO link transmission error", 72 "L3 ECC data cache error", 73 "ECC error in L3 cache tag", 74 "L3 LRU parity bits error", 75 "ECC Error in the Probe Filter directory" 
76}; 77 78static const char * const f15h_ic_mce_desc[] = { 79 "UC during a demand linefill from L2", 80 "Parity error during data load from IC", 81 "Parity error for IC valid bit", 82 "Main tag parity error", 83 "Parity error in prediction queue", 84 "PFB data/address parity error", 85 "Parity error in the branch status reg", 86 "PFB promotion address error", 87 "Tag error during probe/victimization", 88 "Parity error for IC probe tag valid bit", 89 "PFB non-cacheable bit parity error", 90 "PFB valid bit parity error", /* xec = 0xd */ 91 "patch RAM", /* xec = 010 */ 92 "uop queue", 93 "insn buffer", 94 "predecode buffer", 95 "fetch address FIFO" 96}; 97 98static bool f12h_dc_mce(u16 ec, u8 xec) 99{ 100 bool ret = false; 101 102 if (MEM_ERROR(ec)) { 103 u8 ll = ec & 0x3; 104 ret = true; 105 106 if (ll == LL_L2) 107 pr_cont("during L1 linefill from L2.\n"); 108 else if (ll == LL_L1) 109 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec)); 110 else 111 ret = false; 112 } 113 return ret; 114} 115 116static bool f10h_dc_mce(u16 ec, u8 xec) 117{ 118 u8 r4 = (ec >> 4) & 0xf; 119 u8 ll = ec & 0x3; 120 121 if (r4 == R4_GEN && ll == LL_L1) { 122 pr_cont("during data scrub.\n"); 123 return true; 124 } 125 return f12h_dc_mce(ec, xec); 126} 127 128static bool k8_dc_mce(u16 ec, u8 xec) 129{ 130 if (BUS_ERROR(ec)) { 131 pr_cont("during system linefill.\n"); 132 return true; 133 } 134 135 return f10h_dc_mce(ec, xec); 136} 137 138static bool f14h_dc_mce(u16 ec, u8 xec) 139{ 140 u8 r4 = (ec >> 4) & 0xf; 141 u8 ll = ec & 0x3; 142 u8 tt = (ec >> 2) & 0x3; 143 u8 ii = tt; 144 bool ret = true; 145 146 if (MEM_ERROR(ec)) { 147 148 if (tt != TT_DATA || ll != LL_L1) 149 return false; 150 151 switch (r4) { 152 case R4_DRD: 153 case R4_DWR: 154 pr_cont("Data/Tag parity error due to %s.\n", 155 (r4 == R4_DRD ? 
"load/hw prf" : "store")); 156 break; 157 case R4_EVICT: 158 pr_cont("Copyback parity error on a tag miss.\n"); 159 break; 160 case R4_SNOOP: 161 pr_cont("Tag parity error during snoop.\n"); 162 break; 163 default: 164 ret = false; 165 } 166 } else if (BUS_ERROR(ec)) { 167 168 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG) 169 return false; 170 171 pr_cont("System read data error on a "); 172 173 switch (r4) { 174 case R4_RD: 175 pr_cont("TLB reload.\n"); 176 break; 177 case R4_DWR: 178 pr_cont("store.\n"); 179 break; 180 case R4_DRD: 181 pr_cont("load.\n"); 182 break; 183 default: 184 ret = false; 185 } 186 } else { 187 ret = false; 188 } 189 190 return ret; 191} 192 193static bool f15h_dc_mce(u16 ec, u8 xec) 194{ 195 bool ret = true; 196 197 if (MEM_ERROR(ec)) { 198 199 switch (xec) { 200 case 0x0: 201 pr_cont("Data Array access error.\n"); 202 break; 203 204 case 0x1: 205 pr_cont("UC error during a linefill from L2/NB.\n"); 206 break; 207 208 case 0x2: 209 case 0x11: 210 pr_cont("STQ access error.\n"); 211 break; 212 213 case 0x3: 214 pr_cont("SCB access error.\n"); 215 break; 216 217 case 0x10: 218 pr_cont("Tag error.\n"); 219 break; 220 221 case 0x12: 222 pr_cont("LDQ access error.\n"); 223 break; 224 225 default: 226 ret = false; 227 } 228 } else if (BUS_ERROR(ec)) { 229 230 if (!xec) 231 pr_cont("during system linefill.\n"); 232 else 233 pr_cont(" Internal %s condition.\n", 234 ((xec == 1) ? "livelock" : "deadlock")); 235 } else 236 ret = false; 237 238 return ret; 239} 240 241static void amd_decode_dc_mce(struct mce *m) 242{ 243 u16 ec = m->status & 0xffff; 244 u8 xec = (m->status >> 16) & xec_mask; 245 246 pr_emerg(HW_ERR "Data Cache Error: "); 247 248 /* TLB error signatures are the same across families */ 249 if (TLB_ERROR(ec)) { 250 u8 tt = (ec >> 2) & 0x3; 251 252 if (tt == TT_DATA) { 253 pr_cont("%s TLB %s.\n", LL_MSG(ec), 254 ((xec == 2) ? "locked miss" 255 : (xec ? 
"multimatch" : "parity"))); 256 return; 257 } 258 } else if (fam_ops->dc_mce(ec, xec)) 259 ; 260 else 261 pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); 262} 263 264static bool k8_ic_mce(u16 ec, u8 xec) 265{ 266 u8 ll = ec & 0x3; 267 u8 r4 = (ec >> 4) & 0xf; 268 bool ret = true; 269 270 if (!MEM_ERROR(ec)) 271 return false; 272 273 if (ll == 0x2) 274 pr_cont("during a linefill from L2.\n"); 275 else if (ll == 0x1) { 276 switch (r4) { 277 case R4_IRD: 278 pr_cont("Parity error during data load.\n"); 279 break; 280 281 case R4_EVICT: 282 pr_cont("Copyback Parity/Victim error.\n"); 283 break; 284 285 case R4_SNOOP: 286 pr_cont("Tag Snoop error.\n"); 287 break; 288 289 default: 290 ret = false; 291 break; 292 } 293 } else 294 ret = false; 295 296 return ret; 297} 298 299static bool f14h_ic_mce(u16 ec, u8 xec) 300{ 301 u8 ll = ec & 0x3; 302 u8 tt = (ec >> 2) & 0x3; 303 u8 r4 = (ec >> 4) & 0xf; 304 bool ret = true; 305 306 if (MEM_ERROR(ec)) { 307 if (tt != 0 || ll != 1) 308 ret = false; 309 310 if (r4 == R4_IRD) 311 pr_cont("Data/tag array parity error for a tag hit.\n"); 312 else if (r4 == R4_SNOOP) 313 pr_cont("Tag error during snoop/victimization.\n"); 314 else 315 ret = false; 316 } 317 return ret; 318} 319 320static bool f15h_ic_mce(u16 ec, u8 xec) 321{ 322 bool ret = true; 323 324 if (!MEM_ERROR(ec)) 325 return false; 326 327 switch (xec) { 328 case 0x0 ... 0xa: 329 pr_cont("%s.\n", f15h_ic_mce_desc[xec]); 330 break; 331 332 case 0xd: 333 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]); 334 break; 335 336 case 0x10 ... 0x14: 337 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]); 338 break; 339 340 default: 341 ret = false; 342 } 343 return ret; 344} 345 346static void amd_decode_ic_mce(struct mce *m) 347{ 348 u16 ec = m->status & 0xffff; 349 u8 xec = (m->status >> 16) & xec_mask; 350 351 pr_emerg(HW_ERR "Instruction Cache Error: "); 352 353 if (TLB_ERROR(ec)) 354 pr_cont("%s TLB %s.\n", LL_MSG(ec), 355 (xec ? 
"multimatch" : "parity error")); 356 else if (BUS_ERROR(ec)) { 357 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); 358 359 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); 360 } else if (fam_ops->ic_mce(ec, xec)) 361 ; 362 else 363 pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); 364} 365 366static void amd_decode_bu_mce(struct mce *m) 367{ 368 u32 ec = m->status & 0xffff; 369 u32 xec = (m->status >> 16) & xec_mask; 370 371 pr_emerg(HW_ERR "Bus Unit Error"); 372 373 if (xec == 0x1) 374 pr_cont(" in the write data buffers.\n"); 375 else if (xec == 0x3) 376 pr_cont(" in the victim data buffers.\n"); 377 else if (xec == 0x2 && MEM_ERROR(ec)) 378 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); 379 else if (xec == 0x0) { 380 if (TLB_ERROR(ec)) 381 pr_cont(": %s error in a Page Descriptor Cache or " 382 "Guest TLB.\n", TT_MSG(ec)); 383 else if (BUS_ERROR(ec)) 384 pr_cont(": %s/ECC error in data read from NB: %s.\n", 385 RRRR_MSG(ec), PP_MSG(ec)); 386 else if (MEM_ERROR(ec)) { 387 u8 rrrr = (ec >> 4) & 0xf; 388 389 if (rrrr >= 0x7) 390 pr_cont(": %s error during data copyback.\n", 391 RRRR_MSG(ec)); 392 else if (rrrr <= 0x1) 393 pr_cont(": %s parity/ECC error during data " 394 "access from L2.\n", RRRR_MSG(ec)); 395 else 396 goto wrong_bu_mce; 397 } else 398 goto wrong_bu_mce; 399 } else 400 goto wrong_bu_mce; 401 402 return; 403 404wrong_bu_mce: 405 pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); 406} 407 408static void amd_decode_ls_mce(struct mce *m) 409{ 410 u16 ec = m->status & 0xffff; 411 u8 xec = (m->status >> 16) & xec_mask; 412 413 if (boot_cpu_data.x86 == 0x14) { 414 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," 415 " please report on LKML.\n"); 416 return; 417 } 418 419 pr_emerg(HW_ERR "Load Store Error"); 420 421 if (xec == 0x0) { 422 u8 r4 = (ec >> 4) & 0xf; 423 424 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 425 goto wrong_ls_mce; 426 427 pr_cont(" during %s.\n", RRRR_MSG(ec)); 428 } 
else 429 goto wrong_ls_mce; 430 431 return; 432 433wrong_ls_mce: 434 pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); 435} 436 437static bool k8_nb_mce(u16 ec, u8 xec) 438{ 439 bool ret = true; 440 441 switch (xec) { 442 case 0x1: 443 pr_cont("CRC error detected on HT link.\n"); 444 break; 445 446 case 0x5: 447 pr_cont("Invalid GART PTE entry during GART table walk.\n"); 448 break; 449 450 case 0x6: 451 pr_cont("Unsupported atomic RMW received from an IO link.\n"); 452 break; 453 454 case 0x0: 455 case 0x8: 456 if (boot_cpu_data.x86 == 0x11) 457 return false; 458 459 pr_cont("DRAM ECC error detected on the NB.\n"); 460 break; 461 462 case 0xd: 463 pr_cont("Parity error on the DRAM addr/ctl signals.\n"); 464 break; 465 466 default: 467 ret = false; 468 break; 469 } 470 471 return ret; 472} 473 474static bool f10h_nb_mce(u16 ec, u8 xec) 475{ 476 bool ret = true; 477 u8 offset = 0; 478 479 if (k8_nb_mce(ec, xec)) 480 return true; 481 482 switch(xec) { 483 case 0xa ... 0xc: 484 offset = 10; 485 break; 486 487 case 0xe: 488 offset = 11; 489 break; 490 491 case 0xf: 492 if (TLB_ERROR(ec)) 493 pr_cont("GART Table Walk data error.\n"); 494 else if (BUS_ERROR(ec)) 495 pr_cont("DMA Exclusion Vector Table Walk error.\n"); 496 else 497 ret = false; 498 499 goto out; 500 break; 501 502 case 0x1c ... 
0x1f: 503 offset = 24; 504 break; 505 506 default: 507 ret = false; 508 509 goto out; 510 break; 511 } 512 513 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); 514 515out: 516 return ret; 517} 518 519static bool nb_noop_mce(u16 ec, u8 xec) 520{ 521 return false; 522} 523 524void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) 525{ 526 u8 xec = (m->status >> 16) & 0x1f; 527 u16 ec = m->status & 0xffff; 528 u32 nbsh = (u32)(m->status >> 32); 529 530 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); 531 532 /* 533 * F10h, revD can disable ErrCpu[3:0] so check that first and also the 534 * value encoding has changed so interpret those differently 535 */ 536 if ((boot_cpu_data.x86 == 0x10) && 537 (boot_cpu_data.x86_model > 7)) { 538 if (nbsh & K8_NBSH_ERR_CPU_VAL) 539 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); 540 } else { 541 u8 assoc_cpus = nbsh & nb_err_cpumask; 542 543 if (assoc_cpus > 0) 544 pr_cont(", core: %d", fls(assoc_cpus) - 1); 545 } 546 547 switch (xec) { 548 case 0x2: 549 pr_cont("Sync error (sync packets on HT link detected).\n"); 550 return; 551 552 case 0x3: 553 pr_cont("HT Master abort.\n"); 554 return; 555 556 case 0x4: 557 pr_cont("HT Target abort.\n"); 558 return; 559 560 case 0x7: 561 pr_cont("NB Watchdog timeout.\n"); 562 return; 563 564 case 0x9: 565 pr_cont("SVM DMA Exclusion Vector error.\n"); 566 return; 567 568 default: 569 break; 570 } 571 572 if (!fam_ops->nb_mce(ec, xec)) 573 goto wrong_nb_mce; 574 575 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) 576 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) 577 nb_bus_decoder(node_id, m, nbcfg); 578 579 return; 580 581wrong_nb_mce: 582 pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); 583} 584EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 585 586static void amd_decode_fr_mce(struct mce *m) 587{ 588 if (boot_cpu_data.x86 == 0xf || 589 boot_cpu_data.x86 == 0x11) 590 goto wrong_fr_mce; 591 592 /* we have only one error signature so match all fields at once. 
*/ 593 if ((m->status & 0xffff) == 0x0f0f) { 594 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n"); 595 return; 596 } 597 598wrong_fr_mce: 599 pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); 600} 601 602static inline void amd_decode_err_code(u16 ec) 603{ 604 if (TLB_ERROR(ec)) { 605 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", 606 TT_MSG(ec), LL_MSG(ec)); 607 } else if (MEM_ERROR(ec)) { 608 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n", 609 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); 610 } else if (BUS_ERROR(ec)) { 611 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " 612 "Participating Processor: %s\n", 613 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), 614 PP_MSG(ec)); 615 } else 616 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); 617} 618 619/* 620 * Filter out unwanted MCE signatures here. 621 */ 622static bool amd_filter_mce(struct mce *m) 623{ 624 u8 xec = (m->status >> 16) & 0x1f; 625 626 /* 627 * NB GART TLB error reporting is disabled by default. 628 */ 629 if (m->bank == 4 && xec == 0x5 && !report_gart_errors) 630 return true; 631 632 return false; 633} 634 635int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 636{ 637 struct mce *m = (struct mce *)data; 638 int node, ecc; 639 640 if (amd_filter_mce(m)) 641 return NOTIFY_STOP; 642 643 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); 644 645 pr_cont("%sorrected error, other errors lost: %s, " 646 "CPU context corrupt: %s", 647 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), 648 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), 649 ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); 650 651 /* do the two bits[14:13] together */ 652 ecc = (m->status >> 45) & 0x3; 653 if (ecc) 654 pr_cont(", %sECC Error", ((ecc == 2) ? 
"C" : "U")); 655 656 pr_cont("\n"); 657 658 switch (m->bank) { 659 case 0: 660 amd_decode_dc_mce(m); 661 break; 662 663 case 1: 664 amd_decode_ic_mce(m); 665 break; 666 667 case 2: 668 amd_decode_bu_mce(m); 669 break; 670 671 case 3: 672 amd_decode_ls_mce(m); 673 break; 674 675 case 4: 676 node = amd_get_nb_id(m->extcpu); 677 amd_decode_nb_mce(node, m, 0); 678 break; 679 680 case 5: 681 amd_decode_fr_mce(m); 682 break; 683 684 default: 685 break; 686 } 687 688 amd_decode_err_code(m->status & 0xffff); 689 690 return NOTIFY_STOP; 691} 692EXPORT_SYMBOL_GPL(amd_decode_mce); 693 694static struct notifier_block amd_mce_dec_nb = { 695 .notifier_call = amd_decode_mce, 696}; 697 698static int __init mce_amd_init(void) 699{ 700 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 701 return 0; 702 703 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) && 704 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf)) 705 return 0; 706 707 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); 708 if (!fam_ops) 709 return -ENOMEM; 710 711 switch (boot_cpu_data.x86) { 712 case 0xf: 713 fam_ops->dc_mce = k8_dc_mce; 714 fam_ops->ic_mce = k8_ic_mce; 715 fam_ops->nb_mce = k8_nb_mce; 716 break; 717 718 case 0x10: 719 fam_ops->dc_mce = f10h_dc_mce; 720 fam_ops->ic_mce = k8_ic_mce; 721 fam_ops->nb_mce = f10h_nb_mce; 722 break; 723 724 case 0x11: 725 fam_ops->dc_mce = k8_dc_mce; 726 fam_ops->ic_mce = k8_ic_mce; 727 fam_ops->nb_mce = f10h_nb_mce; 728 break; 729 730 case 0x12: 731 fam_ops->dc_mce = f12h_dc_mce; 732 fam_ops->ic_mce = k8_ic_mce; 733 fam_ops->nb_mce = nb_noop_mce; 734 break; 735 736 case 0x14: 737 nb_err_cpumask = 0x3; 738 fam_ops->dc_mce = f14h_dc_mce; 739 fam_ops->ic_mce = f14h_ic_mce; 740 fam_ops->nb_mce = nb_noop_mce; 741 break; 742 743 case 0x15: 744 xec_mask = 0x1f; 745 fam_ops->dc_mce = f15h_dc_mce; 746 fam_ops->ic_mce = f15h_ic_mce; 747 break; 748 749 default: 750 printk(KERN_WARNING "Huh? 
What family is that: %d?!\n", 751 boot_cpu_data.x86); 752 kfree(fam_ops); 753 return -EINVAL; 754 } 755 756 pr_info("MCE: In-kernel MCE decoding enabled.\n"); 757 758 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); 759 760 return 0; 761} 762early_initcall(mce_amd_init); 763 764#ifdef MODULE 765static void __exit mce_amd_exit(void) 766{ 767 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); 768 kfree(fam_ops); 769} 770 771MODULE_DESCRIPTION("AMD MCE decoder"); 772MODULE_ALIAS("edac-mce-amd"); 773MODULE_LICENSE("GPL"); 774module_exit(mce_amd_exit); 775#endif 776