mce.c revision 29b0f591d678838435fbb3e15ef20266f1a9e01d
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static int monarch_timeout = -1;
static int mce_panic_timeout;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static unsigned long dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}
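/*
 * Note on the bank*4 arithmetic used throughout this file: per the x86
 * MCA architecture, each bank i owns four consecutive MSRs starting at
 * MSR_IA32_MC0_CTL, i.e.
 *
 *	ctl    = MSR_IA32_MC0_CTL    + 4*i;
 *	status = MSR_IA32_MC0_STATUS + 4*i;
 *	addr   = MSR_IA32_MC0_ADDR   + 4*i;
 *	misc   = MSR_IA32_MC0_MISC   + 4*i;
 */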
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &notify_user);
}
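/*
 * How the lockless protocol above works: a writer reserves slot
 * mcelog.next with cmpxchg() and only afterwards sets ->finished on the
 * copied entry, so a reader that sees ->finished == 0 knows the slot is
 * still being written. A writer that dies mid-copy loses only its own
 * slot; nothing here ever takes a lock, which is what makes mce_log()
 * usable from machine check context.
 */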
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_add_return(1, &mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if ((m->status & MCI_STATUS_VAL) &&
		    !(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}
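/*
 * The MCG_STATUS bits consulted above are architectural: RIPV set means
 * the saved CS:IP is a valid restart point, EIPV set means it points at
 * the instruction that took the machine check. With both clear the
 * saved IP is unrelated to the error, so ip/cs are zeroed (and
 * print_mce() flags the address as !INEXACT! when EIPV is off).
 */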
#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts-off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_user();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_user();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self-interrupt
	 * through the APIC to do the notification
	 * after interrupts are reenabled again instead.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
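/*
 * machine_check_poll() is driven from several places: the periodic
 * mcheck_timer below, boot-time harvesting of leftover events in
 * mce_init(), and the Intel corrected-error (CMCI) interrupt handler
 * (see mce_intel_64.c).
 */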
/*
 * Do a quick check if any of the events require a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure all CPUs' errors are examined.
 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a
 * semi-stable state and won't corrupt anything by itself. It's OK to let
 * the others continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;
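/*
 * Rendezvous walk-through (two CPUs, broadcast MCE): both increment
 * mce_callin; the CPU that got 1 becomes Monarch. mce_start() spins
 * until callin == num_online_cpus(); the Monarch then sets
 * mce_executing = 1 and scans its banks while the Subject spins in
 * mce_start() until mce_executing reaches its own order. In mce_end()
 * each CPU increments mce_executing; the Monarch waits for it to pass
 * num_online_cpus(), runs mce_reign() to grade everyone's worst error,
 * and finally resets the counters to release the Subjects.
 */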
/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it lets the CPUs execute
 * the main scanning loop one by one in the entry order.
 *
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int no_way_out, int *order)
{
	int nwo;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout) {
		*order = -1;
		return no_way_out;
	}

	atomic_add(no_way_out, &global_nwo);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * Cache the global no_way_out state.
	 */
	nwo = atomic_read(&global_nwo);

	/*
	 * Monarch starts executing now, the others wait.
	 */
	if (*order == 1) {
		atomic_set(&mce_executing, 1);
		return nwo;
	}

	/*
	 * Now start the scanning loop one by one
	 * in the original callin order.
	 * This way, when there are any shared banks, an event will only
	 * be seen by one CPU before being cleared, avoiding duplicates.
	 */
	while (atomic_read(&mce_executing) < *order) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}
	return nwo;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
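/*
 * The pairing expected by do_machine_check() below: obtain a callin
 * position, bracket the bank scan with mce_start()/mce_end(), and on
 * mce_end() < 0 (timeout or no synchronization) fall back to the purely
 * local no_way_out decision.
 */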
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context, not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	order = atomic_add_return(1, &mce_callin);
	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	no_way_out = mce_start(no_way_out, &order);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are not uncorrected are handled by
		 * machine_check_poll(). Leave them alone, unless this
		 * panics.
		 */
		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC)
				kill_it = 1;
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		severity = mce_severity(&m, tolerant, NULL);
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = final->ip && (final->cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check", final, msg);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
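/*
 * Exit policy of do_machine_check() in short: no_way_out && tolerant < 3
 * panics; an enabled uncorrected error (kill_it) sends SIGBUS when it hit
 * user space, else panics when panic_on_oops || tolerant < 2; everything
 * milder is only logged and reported to /dev/mcelog readers.
 */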
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
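/*
 * Polling cadence example (HZ == 1000, check_interval == 300): the
 * per-CPU delay starts at 300*HZ jiffies; every pass that logged an
 * event halves it, bottoming out at HZ/100 (10ms), and every quiet pass
 * doubles it again, capped at check_interval seconds.
 */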
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
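/*
 * MCG_CAP layout relied on by mce_cap_init()/mce_init() above
 * (architectural): bits 7:0 hold the bank count (MCG_BANKCNT_MASK),
 * bit 8 (MCG_CTL_P) signals a writable MCG_CTL, and MCG_EXT_P plus an
 * extended register count (MCG_EXT_CNT) of at least 9 is taken to mean
 * the extended state includes MCG_EIP, which is then preferred for RIP
 * reporting.
 */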
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */
static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}
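/*
 * Userspace read protocol implied above (mcelog(8) follows it): the
 * buffer must cover the whole log in one read, e.g.
 *
 *	struct mce buf[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf));
 *
 * Anything smaller (or a nonzero offset) gets -EINVAL, and a successful
 * read clears the entries it returned.
 */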
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open		= mce_open,
	.release	= mce_release,
	.read		= mce_read,
	.poll		= mce_poll,
	.unlocked_ioctl	= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
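/*
 * Examples matching the parser above (monarch_timeout is in usecs, per
 * the NSEC_PER_USEC conversion in mce_start()):
 *
 *	mce=off		disable machine checks completely
 *	mce=2,500	tolerant=2, wait 500us for other CPUs
 *	mce=nobootlog	don't log boot-time leftovers
 */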
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	&attr_monarch_timeout.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;
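/*
 * These attributes appear per CPU under the "machinecheck" sysdev class,
 * e.g. (path shape assumed from the class/device names):
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/bank0
 *
 * Writes to check_interval and the bankN files take effect through
 * mce_restart().
 */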
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);