mce.c revision 7fb06fc9672b947424e05871243a4c8e19ec3bce
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/interrupt.h> 14#include <linux/ratelimit.h> 15#include <linux/kallsyms.h> 16#include <linux/rcupdate.h> 17#include <linux/kobject.h> 18#include <linux/uaccess.h> 19#include <linux/kdebug.h> 20#include <linux/kernel.h> 21#include <linux/percpu.h> 22#include <linux/string.h> 23#include <linux/sysdev.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/init.h> 30#include <linux/kmod.h> 31#include <linux/poll.h> 32#include <linux/nmi.h> 33#include <linux/cpu.h> 34#include <linux/smp.h> 35#include <linux/fs.h> 36#include <linux/mm.h> 37 38#include <asm/processor.h> 39#include <asm/hw_irq.h> 40#include <asm/apic.h> 41#include <asm/idle.h> 42#include <asm/ipi.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47#include "mce.h" 48 49/* Handle unconfigured int18 (should never happen) */ 50static void unexpected_machine_check(struct pt_regs *regs, long error_code) 51{ 52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 53 smp_processor_id()); 54} 55 56/* Call the installed machine check handler for this CPU setup. */ 57void (*machine_check_vector)(struct pt_regs *, long error_code) = 58 unexpected_machine_check; 59 60int mce_disabled; 61 62#ifdef CONFIG_X86_NEW_MCE 63 64#define MISC_MCELOG_MINOR 227 65 66#define SPINUNIT 100 /* 100ns */ 67 68atomic_t mce_entry; 69 70DEFINE_PER_CPU(unsigned, mce_exception_count); 71 72/* 73 * Tolerant levels: 74 * 0: always panic on uncorrected errors, log corrected errors 75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 77 * 3: never panic or SIGBUS, log all errors (for testing only) 78 */ 79static int tolerant = 1; 80static int banks; 81static u64 *bank; 82static unsigned long notify_user; 83static int rip_msr; 84static int mce_bootlog = -1; 85static int monarch_timeout = -1; 86static int mce_panic_timeout; 87static int mce_dont_log_ce; 88int mce_cmci_disabled; 89int mce_ignore_ce; 90int mce_ser; 91 92static char trigger[128]; 93static char *trigger_argv[2] = { trigger, NULL }; 94 95static unsigned long dont_init_banks; 96 97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 98static DEFINE_PER_CPU(struct mce, mces_seen); 99static int cpu_missing; 100 101 102/* MCA banks polled by the period polling timer for corrected events */ 103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 105}; 106 107static inline int skip_bank_init(int i) 108{ 109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 110} 111 112static DEFINE_PER_CPU(struct work_struct, mce_work); 113 114/* Do initial initialization of a struct mce */ 115void mce_setup(struct mce *m) 116{ 117 memset(m, 0, sizeof(struct mce)); 118 m->cpu = m->extcpu = smp_processor_id(); 119 rdtscll(m->tsc); 120 /* We hope get_seconds stays lockless */ 121 m->time = get_seconds(); 122 m->cpuvendor = boot_cpu_data.x86_vendor; 123 m->cpuid = cpuid_eax(1); 124#ifdef CONFIG_SMP 125 m->socketid = cpu_data(m->extcpu).phys_proc_id; 126#endif 127 m->apicid = cpu_data(m->extcpu).initial_apicid; 128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 129} 130 131DEFINE_PER_CPU(struct mce, injectm); 132EXPORT_PER_CPU_SYMBOL_GPL(injectm); 133 134/* 135 * Lockless MCE logging infrastructure. 136 * This avoids deadlocks on printk locks without having to break locks. Also 137 * separate MCEs from kernel messages to avoid bogus bug reports. 138 */ 139 140static struct mce_log mcelog = { 141 .signature = MCE_LOG_SIGNATURE, 142 .len = MCE_LOG_LEN, 143 .recordlen = sizeof(struct mce), 144}; 145 146void mce_log(struct mce *mce) 147{ 148 unsigned next, entry; 149 150 mce->finished = 0; 151 wmb(); 152 for (;;) { 153 entry = rcu_dereference(mcelog.next); 154 for (;;) { 155 /* 156 * When the buffer fills up discard new entries. 157 * Assume that the earlier errors are the more 158 * interesting ones: 159 */ 160 if (entry >= MCE_LOG_LEN) { 161 set_bit(MCE_OVERFLOW, 162 (unsigned long *)&mcelog.flags); 163 return; 164 } 165 /* Old left over entry. Skip: */ 166 if (mcelog.entry[entry].finished) { 167 entry++; 168 continue; 169 } 170 break; 171 } 172 smp_rmb(); 173 next = entry + 1; 174 if (cmpxchg(&mcelog.next, entry, next) == entry) 175 break; 176 } 177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 178 wmb(); 179 mcelog.entry[entry].finished = 1; 180 wmb(); 181 182 mce->finished = 1; 183 set_bit(0, ¬ify_user); 184} 185 186static void print_mce(struct mce *m) 187{ 188 printk(KERN_EMERG 189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 190 m->extcpu, m->mcgstatus, m->bank, m->status); 191 if (m->ip) { 192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 194 m->cs, m->ip); 195 if (m->cs == __KERNEL_CS) 196 print_symbol("{%s}", m->ip); 197 printk("\n"); 198 } 199 printk(KERN_EMERG "TSC %llx ", m->tsc); 200 if (m->addr) 201 printk("ADDR %llx ", m->addr); 202 if (m->misc) 203 printk("MISC %llx ", m->misc); 204 printk("\n"); 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 206 m->cpuvendor, m->cpuid, m->time, m->socketid, 207 m->apicid); 208} 209 210static void print_mce_head(void) 211{ 212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 213} 214 215static void print_mce_tail(void) 216{ 217 printk(KERN_EMERG "This is not a software problem!\n" 218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 219} 220 221#define PANIC_TIMEOUT 5 /* 5 seconds */ 222 223static atomic_t mce_paniced; 224 225/* Panic in progress. Enable interrupts and wait for final IPI */ 226static void wait_for_panic(void) 227{ 228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 229 preempt_disable(); 230 local_irq_enable(); 231 while (timeout-- > 0) 232 udelay(1); 233 if (panic_timeout == 0) 234 panic_timeout = mce_panic_timeout; 235 panic("Panicing machine check CPU died"); 236} 237 238static void mce_panic(char *msg, struct mce *final, char *exp) 239{ 240 int i; 241 242 /* 243 * Make sure only one CPU runs in machine check panic 244 */ 245 if (atomic_add_return(1, &mce_paniced) > 1) 246 wait_for_panic(); 247 barrier(); 248 249 bust_spinlocks(1); 250 console_verbose(); 251 print_mce_head(); 252 /* First print corrected ones that are still unlogged */ 253 for (i = 0; i < MCE_LOG_LEN; i++) { 254 struct mce *m = &mcelog.entry[i]; 255 if (!(m->status & MCI_STATUS_VAL)) 256 continue; 257 if (!(m->status & MCI_STATUS_UC)) 258 print_mce(m); 259 } 260 /* Now print uncorrected but with the final one last */ 261 for (i = 0; i < MCE_LOG_LEN; i++) { 262 struct mce *m = &mcelog.entry[i]; 263 if (!(m->status & MCI_STATUS_VAL)) 264 continue; 265 if (!(m->status & MCI_STATUS_UC)) 266 continue; 267 if (!final || memcmp(m, final, sizeof(struct mce))) 268 print_mce(m); 269 } 270 if (final) 271 print_mce(final); 272 if (cpu_missing) 273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 274 print_mce_tail(); 275 if (exp) 276 printk(KERN_EMERG "Machine check: %s\n", exp); 277 if (panic_timeout == 0) 278 panic_timeout = mce_panic_timeout; 279 panic(msg); 280} 281 282/* Support code for software error injection */ 283 284static int msr_to_offset(u32 msr) 285{ 286 unsigned bank = __get_cpu_var(injectm.bank); 287 if (msr == rip_msr) 288 return offsetof(struct mce, ip); 289 if (msr == MSR_IA32_MC0_STATUS + bank*4) 290 return offsetof(struct mce, status); 291 if (msr == MSR_IA32_MC0_ADDR + bank*4) 292 return offsetof(struct mce, addr); 293 if (msr == MSR_IA32_MC0_MISC + bank*4) 294 return offsetof(struct mce, misc); 295 if (msr == MSR_IA32_MCG_STATUS) 296 return offsetof(struct mce, mcgstatus); 297 return -1; 298} 299 300/* MSR access wrappers used for error injection */ 301static u64 mce_rdmsrl(u32 msr) 302{ 303 u64 v; 304 if (__get_cpu_var(injectm).finished) { 305 int offset = msr_to_offset(msr); 306 if (offset < 0) 307 return 0; 308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 309 } 310 rdmsrl(msr, v); 311 return v; 312} 313 314static void mce_wrmsrl(u32 msr, u64 v) 315{ 316 if (__get_cpu_var(injectm).finished) { 317 int offset = msr_to_offset(msr); 318 if (offset >= 0) 319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 320 return; 321 } 322 wrmsrl(msr, v); 323} 324 325/* 326 * Simple lockless ring to communicate PFNs from the exception handler with the 327 * process context work function. This is vastly simplified because there's 328 * only a single reader and a single writer. 329 */ 330#define MCE_RING_SIZE 16 /* we use one entry less */ 331 332struct mce_ring { 333 unsigned short start; 334 unsigned short end; 335 unsigned long ring[MCE_RING_SIZE]; 336}; 337static DEFINE_PER_CPU(struct mce_ring, mce_ring); 338 339/* Runs with CPU affinity in workqueue */ 340static int mce_ring_empty(void) 341{ 342 struct mce_ring *r = &__get_cpu_var(mce_ring); 343 344 return r->start == r->end; 345} 346 347static int mce_ring_get(unsigned long *pfn) 348{ 349 struct mce_ring *r; 350 int ret = 0; 351 352 *pfn = 0; 353 get_cpu(); 354 r = &__get_cpu_var(mce_ring); 355 if (r->start == r->end) 356 goto out; 357 *pfn = r->ring[r->start]; 358 r->start = (r->start + 1) % MCE_RING_SIZE; 359 ret = 1; 360out: 361 put_cpu(); 362 return ret; 363} 364 365/* Always runs in MCE context with preempt off */ 366static int mce_ring_add(unsigned long pfn) 367{ 368 struct mce_ring *r = &__get_cpu_var(mce_ring); 369 unsigned next; 370 371 next = (r->end + 1) % MCE_RING_SIZE; 372 if (next == r->start) 373 return -1; 374 r->ring[r->end] = pfn; 375 wmb(); 376 r->end = next; 377 return 0; 378} 379 380int mce_available(struct cpuinfo_x86 *c) 381{ 382 if (mce_disabled) 383 return 0; 384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 385} 386 387static void mce_schedule_work(void) 388{ 389 if (!mce_ring_empty()) { 390 struct work_struct *work = &__get_cpu_var(mce_work); 391 if (!work_pending(work)) 392 schedule_work(work); 393 } 394} 395 396/* 397 * Get the address of the instruction at the time of the machine check 398 * error. 399 */ 400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 401{ 402 403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { 404 m->ip = regs->ip; 405 m->cs = regs->cs; 406 } else { 407 m->ip = 0; 408 m->cs = 0; 409 } 410 if (rip_msr) 411 m->ip = mce_rdmsrl(rip_msr); 412} 413 414#ifdef CONFIG_X86_LOCAL_APIC 415/* 416 * Called after interrupts have been reenabled again 417 * when a MCE happened during an interrupts off region 418 * in the kernel. 419 */ 420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) 421{ 422 ack_APIC_irq(); 423 exit_idle(); 424 irq_enter(); 425 mce_notify_irq(); 426 mce_schedule_work(); 427 irq_exit(); 428} 429#endif 430 431static void mce_report_event(struct pt_regs *regs) 432{ 433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 434 mce_notify_irq(); 435 /* 436 * Triggering the work queue here is just an insurance 437 * policy in case the syscall exit notify handler 438 * doesn't run soon enough or ends up running on the 439 * wrong CPU (can happen when audit sleeps) 440 */ 441 mce_schedule_work(); 442 return; 443 } 444 445#ifdef CONFIG_X86_LOCAL_APIC 446 /* 447 * Without APIC do not notify. The event will be picked 448 * up eventually. 449 */ 450 if (!cpu_has_apic) 451 return; 452 453 /* 454 * When interrupts are disabled we cannot use 455 * kernel services safely. Trigger an self interrupt 456 * through the APIC to instead do the notification 457 * after interrupts are reenabled again. 458 */ 459 apic->send_IPI_self(MCE_SELF_VECTOR); 460 461 /* 462 * Wait for idle afterwards again so that we don't leave the 463 * APIC in a non idle state because the normal APIC writes 464 * cannot exclude us. 465 */ 466 apic_wait_icr_idle(); 467#endif 468} 469 470DEFINE_PER_CPU(unsigned, mce_poll_count); 471 472/* 473 * Poll for corrected events or events that happened before reset. 474 * Those are just logged through /dev/mcelog. 475 * 476 * This is executed in standard interrupt context. 477 * 478 * Note: spec recommends to panic for fatal unsignalled 479 * errors here. However this would be quite problematic -- 480 * we would need to reimplement the Monarch handling and 481 * it would mess up the exclusion between exception handler 482 * and poll hander -- * so we skip this for now. 483 * These cases should not happen anyways, or only when the CPU 484 * is already totally * confused. In this case it's likely it will 485 * not fully execute the machine check handler either. 486 */ 487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 488{ 489 struct mce m; 490 int i; 491 492 __get_cpu_var(mce_poll_count)++; 493 494 mce_setup(&m); 495 496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 497 for (i = 0; i < banks; i++) { 498 if (!bank[i] || !test_bit(i, *b)) 499 continue; 500 501 m.misc = 0; 502 m.addr = 0; 503 m.bank = i; 504 m.tsc = 0; 505 506 barrier(); 507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 508 if (!(m.status & MCI_STATUS_VAL)) 509 continue; 510 511 /* 512 * Uncorrected or signalled events are handled by the exception 513 * handler when it is enabled, so don't process those here. 514 * 515 * TBD do the same check for MCI_STATUS_EN here? 516 */ 517 if (!(flags & MCP_UC) && 518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 519 continue; 520 521 if (m.status & MCI_STATUS_MISCV) 522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 523 if (m.status & MCI_STATUS_ADDRV) 524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 525 526 if (!(flags & MCP_TIMESTAMP)) 527 m.tsc = 0; 528 /* 529 * Don't get the IP here because it's unlikely to 530 * have anything to do with the actual error location. 531 */ 532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 533 mce_log(&m); 534 add_taint(TAINT_MACHINE_CHECK); 535 } 536 537 /* 538 * Clear state for this bank. 539 */ 540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 541 } 542 543 /* 544 * Don't clear MCG_STATUS here because it's only defined for 545 * exceptions. 546 */ 547 548 sync_core(); 549} 550EXPORT_SYMBOL_GPL(machine_check_poll); 551 552/* 553 * Do a quick check if any of the events requires a panic. 554 * This decides if we keep the events around or clear them. 555 */ 556static int mce_no_way_out(struct mce *m, char **msg) 557{ 558 int i; 559 560 for (i = 0; i < banks; i++) { 561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 563 return 1; 564 } 565 return 0; 566} 567 568/* 569 * Variable to establish order between CPUs while scanning. 570 * Each CPU spins initially until executing is equal its number. 571 */ 572static atomic_t mce_executing; 573 574/* 575 * Defines order of CPUs on entry. First CPU becomes Monarch. 576 */ 577static atomic_t mce_callin; 578 579/* 580 * Check if a timeout waiting for other CPUs happened. 581 */ 582static int mce_timed_out(u64 *t) 583{ 584 /* 585 * The others already did panic for some reason. 586 * Bail out like in a timeout. 587 * rmb() to tell the compiler that system_state 588 * might have been modified by someone else. 589 */ 590 rmb(); 591 if (atomic_read(&mce_paniced)) 592 wait_for_panic(); 593 if (!monarch_timeout) 594 goto out; 595 if ((s64)*t < SPINUNIT) { 596 /* CHECKME: Make panic default for 1 too? */ 597 if (tolerant < 1) 598 mce_panic("Timeout synchronizing machine check over CPUs", 599 NULL, NULL); 600 cpu_missing = 1; 601 return 1; 602 } 603 *t -= SPINUNIT; 604out: 605 touch_nmi_watchdog(); 606 return 0; 607} 608 609/* 610 * The Monarch's reign. The Monarch is the CPU who entered 611 * the machine check handler first. It waits for the others to 612 * raise the exception too and then grades them. When any 613 * error is fatal panic. Only then let the others continue. 614 * 615 * The other CPUs entering the MCE handler will be controlled by the 616 * Monarch. They are called Subjects. 617 * 618 * This way we prevent any potential data corruption in a unrecoverable case 619 * and also makes sure always all CPU's errors are examined. 620 * 621 * Also this detects the case of an machine check event coming from outer 622 * space (not detected by any CPUs) In this case some external agent wants 623 * us to shut down, so panic too. 624 * 625 * The other CPUs might still decide to panic if the handler happens 626 * in a unrecoverable place, but in this case the system is in a semi-stable 627 * state and won't corrupt anything by itself. It's ok to let the others 628 * continue for a bit first. 629 * 630 * All the spin loops have timeouts; when a timeout happens a CPU 631 * typically elects itself to be Monarch. 632 */ 633static void mce_reign(void) 634{ 635 int cpu; 636 struct mce *m = NULL; 637 int global_worst = 0; 638 char *msg = NULL; 639 char *nmsg = NULL; 640 641 /* 642 * This CPU is the Monarch and the other CPUs have run 643 * through their handlers. 644 * Grade the severity of the errors of all the CPUs. 645 */ 646 for_each_possible_cpu(cpu) { 647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 648 &nmsg); 649 if (severity > global_worst) { 650 msg = nmsg; 651 global_worst = severity; 652 m = &per_cpu(mces_seen, cpu); 653 } 654 } 655 656 /* 657 * Cannot recover? Panic here then. 658 * This dumps all the mces in the log buffer and stops the 659 * other CPUs. 660 */ 661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 662 mce_panic("Fatal Machine check", m, msg); 663 664 /* 665 * For UC somewhere we let the CPU who detects it handle it. 666 * Also must let continue the others, otherwise the handling 667 * CPU could deadlock on a lock. 668 */ 669 670 /* 671 * No machine check event found. Must be some external 672 * source or one CPU is hung. Panic. 673 */ 674 if (!m && tolerant < 3) 675 mce_panic("Machine check from unknown source", NULL, NULL); 676 677 /* 678 * Now clear all the mces_seen so that they don't reappear on 679 * the next mce. 680 */ 681 for_each_possible_cpu(cpu) 682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 683} 684 685static atomic_t global_nwo; 686 687/* 688 * Start of Monarch synchronization. This waits until all CPUs have 689 * entered the exception handler and then determines if any of them 690 * saw a fatal event that requires panic. Then it executes them 691 * in the entry order. 692 * TBD double check parallel CPU hotunplug 693 */ 694static int mce_start(int *no_way_out) 695{ 696 int order; 697 int cpus = num_online_cpus(); 698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 699 700 if (!timeout) 701 return -1; 702 703 atomic_add(*no_way_out, &global_nwo); 704 /* 705 * global_nwo should be updated before mce_callin 706 */ 707 smp_wmb(); 708 order = atomic_add_return(1, &mce_callin); 709 710 /* 711 * Wait for everyone. 712 */ 713 while (atomic_read(&mce_callin) != cpus) { 714 if (mce_timed_out(&timeout)) { 715 atomic_set(&global_nwo, 0); 716 return -1; 717 } 718 ndelay(SPINUNIT); 719 } 720 721 /* 722 * mce_callin should be read before global_nwo 723 */ 724 smp_rmb(); 725 726 if (order == 1) { 727 /* 728 * Monarch: Starts executing now, the others wait. 729 */ 730 atomic_set(&mce_executing, 1); 731 } else { 732 /* 733 * Subject: Now start the scanning loop one by one in 734 * the original callin order. 735 * This way when there are any shared banks it will be 736 * only seen by one CPU before cleared, avoiding duplicates. 737 */ 738 while (atomic_read(&mce_executing) < order) { 739 if (mce_timed_out(&timeout)) { 740 atomic_set(&global_nwo, 0); 741 return -1; 742 } 743 ndelay(SPINUNIT); 744 } 745 } 746 747 /* 748 * Cache the global no_way_out state. 749 */ 750 *no_way_out = atomic_read(&global_nwo); 751 752 return order; 753} 754 755/* 756 * Synchronize between CPUs after main scanning loop. 757 * This invokes the bulk of the Monarch processing. 758 */ 759static int mce_end(int order) 760{ 761 int ret = -1; 762 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 763 764 if (!timeout) 765 goto reset; 766 if (order < 0) 767 goto reset; 768 769 /* 770 * Allow others to run. 771 */ 772 atomic_inc(&mce_executing); 773 774 if (order == 1) { 775 /* CHECKME: Can this race with a parallel hotplug? */ 776 int cpus = num_online_cpus(); 777 778 /* 779 * Monarch: Wait for everyone to go through their scanning 780 * loops. 781 */ 782 while (atomic_read(&mce_executing) <= cpus) { 783 if (mce_timed_out(&timeout)) 784 goto reset; 785 ndelay(SPINUNIT); 786 } 787 788 mce_reign(); 789 barrier(); 790 ret = 0; 791 } else { 792 /* 793 * Subject: Wait for Monarch to finish. 794 */ 795 while (atomic_read(&mce_executing) != 0) { 796 if (mce_timed_out(&timeout)) 797 goto reset; 798 ndelay(SPINUNIT); 799 } 800 801 /* 802 * Don't reset anything. That's done by the Monarch. 803 */ 804 return 0; 805 } 806 807 /* 808 * Reset all global state. 809 */ 810reset: 811 atomic_set(&global_nwo, 0); 812 atomic_set(&mce_callin, 0); 813 barrier(); 814 815 /* 816 * Let others run again. 817 */ 818 atomic_set(&mce_executing, 0); 819 return ret; 820} 821 822/* 823 * Check if the address reported by the CPU is in a format we can parse. 824 * It would be possible to add code for most other cases, but all would 825 * be somewhat complicated (e.g. segment offset would require an instruction 826 * parser). So only support physical addresses upto page granuality for now. 827 */ 828static int mce_usable_address(struct mce *m) 829{ 830 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 831 return 0; 832 if ((m->misc & 0x3f) > PAGE_SHIFT) 833 return 0; 834 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 835 return 0; 836 return 1; 837} 838 839static void mce_clear_state(unsigned long *toclear) 840{ 841 int i; 842 843 for (i = 0; i < banks; i++) { 844 if (test_bit(i, toclear)) 845 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 846 } 847} 848 849/* 850 * The actual machine check handler. This only handles real 851 * exceptions when something got corrupted coming in through int 18. 852 * 853 * This is executed in NMI context not subject to normal locking rules. This 854 * implies that most kernel services cannot be safely used. Don't even 855 * think about putting a printk in there! 856 * 857 * On Intel systems this is entered on all CPUs in parallel through 858 * MCE broadcast. However some CPUs might be broken beyond repair, 859 * so be always careful when synchronizing with others. 860 */ 861void do_machine_check(struct pt_regs *regs, long error_code) 862{ 863 struct mce m, *final; 864 int i; 865 int worst = 0; 866 int severity; 867 /* 868 * Establish sequential order between the CPUs entering the machine 869 * check handler. 870 */ 871 int order; 872 /* 873 * If no_way_out gets set, there is no safe way to recover from this 874 * MCE. If tolerant is cranked up, we'll try anyway. 875 */ 876 int no_way_out = 0; 877 /* 878 * If kill_it gets set, there might be a way to recover from this 879 * error. 880 */ 881 int kill_it = 0; 882 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 883 char *msg = "Unknown"; 884 885 atomic_inc(&mce_entry); 886 887 __get_cpu_var(mce_exception_count)++; 888 889 if (notify_die(DIE_NMI, "machine check", regs, error_code, 890 18, SIGKILL) == NOTIFY_STOP) 891 goto out; 892 if (!banks) 893 goto out; 894 895 mce_setup(&m); 896 897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 898 no_way_out = mce_no_way_out(&m, &msg); 899 900 final = &__get_cpu_var(mces_seen); 901 *final = m; 902 903 barrier(); 904 905 /* 906 * When no restart IP must always kill or panic. 907 */ 908 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 909 kill_it = 1; 910 911 /* 912 * Go through all the banks in exclusion of the other CPUs. 913 * This way we don't report duplicated events on shared banks 914 * because the first one to see it will clear it. 915 */ 916 order = mce_start(&no_way_out); 917 for (i = 0; i < banks; i++) { 918 __clear_bit(i, toclear); 919 if (!bank[i]) 920 continue; 921 922 m.misc = 0; 923 m.addr = 0; 924 m.bank = i; 925 926 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 927 if ((m.status & MCI_STATUS_VAL) == 0) 928 continue; 929 930 /* 931 * Non uncorrected or non signaled errors are handled by 932 * machine_check_poll. Leave them alone, unless this panics. 933 */ 934 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 935 !no_way_out) 936 continue; 937 938 /* 939 * Set taint even when machine check was not enabled. 940 */ 941 add_taint(TAINT_MACHINE_CHECK); 942 943 severity = mce_severity(&m, tolerant, NULL); 944 945 /* 946 * When machine check was for corrected handler don't touch, 947 * unless we're panicing. 948 */ 949 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 950 continue; 951 __set_bit(i, toclear); 952 if (severity == MCE_NO_SEVERITY) { 953 /* 954 * Machine check event was not enabled. Clear, but 955 * ignore. 956 */ 957 continue; 958 } 959 960 /* 961 * Kill on action required. 962 */ 963 if (severity == MCE_AR_SEVERITY) 964 kill_it = 1; 965 966 if (m.status & MCI_STATUS_MISCV) 967 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 968 if (m.status & MCI_STATUS_ADDRV) 969 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 970 971 /* 972 * Action optional error. Queue address for later processing. 973 * When the ring overflows we just ignore the AO error. 974 * RED-PEN add some logging mechanism when 975 * usable_address or mce_add_ring fails. 976 * RED-PEN don't ignore overflow for tolerant == 0 977 */ 978 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 979 mce_ring_add(m.addr >> PAGE_SHIFT); 980 981 mce_get_rip(&m, regs); 982 mce_log(&m); 983 984 if (severity > worst) { 985 *final = m; 986 worst = severity; 987 } 988 } 989 990 if (!no_way_out) 991 mce_clear_state(toclear); 992 993 /* 994 * Do most of the synchronization with other CPUs. 995 * When there's any problem use only local no_way_out state. 996 */ 997 if (mce_end(order) < 0) 998 no_way_out = worst >= MCE_PANIC_SEVERITY; 999 1000 /* 1001 * If we have decided that we just CAN'T continue, and the user 1002 * has not set tolerant to an insane level, give up and die. 1003 * 1004 * This is mainly used in the case when the system doesn't 1005 * support MCE broadcasting or it has been disabled. 1006 */ 1007 if (no_way_out && tolerant < 3) 1008 mce_panic("Fatal machine check on current CPU", final, msg); 1009 1010 /* 1011 * If the error seems to be unrecoverable, something should be 1012 * done. Try to kill as little as possible. If we can kill just 1013 * one task, do that. If the user has set the tolerance very 1014 * high, don't try to do anything at all. 1015 */ 1016 1017 if (kill_it && tolerant < 3) 1018 force_sig(SIGBUS, current); 1019 1020 /* notify userspace ASAP */ 1021 set_thread_flag(TIF_MCE_NOTIFY); 1022 1023 if (worst > 0) 1024 mce_report_event(regs); 1025 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1026out: 1027 atomic_dec(&mce_entry); 1028 sync_core(); 1029} 1030EXPORT_SYMBOL_GPL(do_machine_check); 1031 1032/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1033void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1034{ 1035 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1036} 1037 1038/* 1039 * Called after mce notification in process context. This code 1040 * is allowed to sleep. Call the high level VM handler to process 1041 * any corrupted pages. 1042 * Assume that the work queue code only calls this one at a time 1043 * per CPU. 1044 * Note we don't disable preemption, so this code might run on the wrong 1045 * CPU. In this case the event is picked up by the scheduled work queue. 1046 * This is merely a fast path to expedite processing in some common 1047 * cases. 1048 */ 1049void mce_notify_process(void) 1050{ 1051 unsigned long pfn; 1052 mce_notify_irq(); 1053 while (mce_ring_get(&pfn)) 1054 memory_failure(pfn, MCE_VECTOR); 1055} 1056 1057static void mce_process_work(struct work_struct *dummy) 1058{ 1059 mce_notify_process(); 1060} 1061 1062#ifdef CONFIG_X86_MCE_INTEL 1063/*** 1064 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1065 * @cpu: The CPU on which the event occurred. 1066 * @status: Event status information 1067 * 1068 * This function should be called by the thermal interrupt after the 1069 * event has been processed and the decision was made to log the event 1070 * further. 1071 * 1072 * The status parameter will be saved to the 'status' field of 'struct mce' 1073 * and historically has been the register value of the 1074 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1075 */ 1076void mce_log_therm_throt_event(__u64 status) 1077{ 1078 struct mce m; 1079 1080 mce_setup(&m); 1081 m.bank = MCE_THERMAL_BANK; 1082 m.status = status; 1083 mce_log(&m); 1084} 1085#endif /* CONFIG_X86_MCE_INTEL */ 1086 1087/* 1088 * Periodic polling timer for "silent" machine check errors. If the 1089 * poller finds an MCE, poll 2x faster. When the poller finds no more 1090 * errors, poll 2x slower (up to check_interval seconds). 1091 */ 1092static int check_interval = 5 * 60; /* 5 minutes */ 1093 1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1095static DEFINE_PER_CPU(struct timer_list, mce_timer); 1096 1097static void mcheck_timer(unsigned long data) 1098{ 1099 struct timer_list *t = &per_cpu(mce_timer, data); 1100 int *n; 1101 1102 WARN_ON(smp_processor_id() != data); 1103 1104 if (mce_available(¤t_cpu_data)) { 1105 machine_check_poll(MCP_TIMESTAMP, 1106 &__get_cpu_var(mce_poll_banks)); 1107 } 1108 1109 /* 1110 * Alert userspace if needed. If we logged an MCE, reduce the 1111 * polling interval, otherwise increase the polling interval. 1112 */ 1113 n = &__get_cpu_var(next_interval); 1114 if (mce_notify_irq()) 1115 *n = max(*n/2, HZ/100); 1116 else 1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1118 1119 t->expires = jiffies + *n; 1120 add_timer(t); 1121} 1122 1123static void mce_do_trigger(struct work_struct *work) 1124{ 1125 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 1126} 1127 1128static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1129 1130/* 1131 * Notify the user(s) about new machine check events. 1132 * Can be called from interrupt context, but not from machine check/NMI 1133 * context. 1134 */ 1135int mce_notify_irq(void) 1136{ 1137 /* Not more than two messages every minute */ 1138 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1139 1140 clear_thread_flag(TIF_MCE_NOTIFY); 1141 1142 if (test_and_clear_bit(0, ¬ify_user)) { 1143 wake_up_interruptible(&mce_wait); 1144 1145 /* 1146 * There is no risk of missing notifications because 1147 * work_pending is always cleared before the function is 1148 * executed. 1149 */ 1150 if (trigger[0] && !work_pending(&mce_trigger_work)) 1151 schedule_work(&mce_trigger_work); 1152 1153 if (__ratelimit(&ratelimit)) 1154 printk(KERN_INFO "Machine check events logged\n"); 1155 1156 return 1; 1157 } 1158 return 0; 1159} 1160EXPORT_SYMBOL_GPL(mce_notify_irq); 1161 1162/* 1163 * Initialize Machine Checks for a CPU. 1164 */ 1165static int mce_cap_init(void) 1166{ 1167 unsigned b; 1168 u64 cap; 1169 1170 rdmsrl(MSR_IA32_MCG_CAP, cap); 1171 1172 b = cap & MCG_BANKCNT_MASK; 1173 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1174 1175 if (b > MAX_NR_BANKS) { 1176 printk(KERN_WARNING 1177 "MCE: Using only %u machine check banks out of %u\n", 1178 MAX_NR_BANKS, b); 1179 b = MAX_NR_BANKS; 1180 } 1181 1182 /* Don't support asymmetric configurations today */ 1183 WARN_ON(banks != 0 && b != banks); 1184 banks = b; 1185 if (!bank) { 1186 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1187 if (!bank) 1188 return -ENOMEM; 1189 memset(bank, 0xff, banks * sizeof(u64)); 1190 } 1191 1192 /* Use accurate RIP reporting if available. */ 1193 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1194 rip_msr = MSR_IA32_MCG_EIP; 1195 1196 if (cap & MCG_SER_P) 1197 mce_ser = 1; 1198 1199 return 0; 1200} 1201 1202static void mce_init(void) 1203{ 1204 mce_banks_t all_banks; 1205 u64 cap; 1206 int i; 1207 1208 /* 1209 * Log the machine checks left over from the previous reset. 1210 */ 1211 bitmap_fill(all_banks, MAX_NR_BANKS); 1212 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1213 1214 set_in_cr4(X86_CR4_MCE); 1215 1216 rdmsrl(MSR_IA32_MCG_CAP, cap); 1217 if (cap & MCG_CTL_P) 1218 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1219 1220 for (i = 0; i < banks; i++) { 1221 if (skip_bank_init(i)) 1222 continue; 1223 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1224 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1225 } 1226} 1227 1228/* Add per CPU specific workarounds here */ 1229static void mce_cpu_quirks(struct cpuinfo_x86 *c) 1230{ 1231 /* This should be disabled by the BIOS, but isn't always */ 1232 if (c->x86_vendor == X86_VENDOR_AMD) { 1233 if (c->x86 == 15 && banks > 4) { 1234 /* 1235 * disable GART TBL walk error reporting, which 1236 * trips off incorrectly with the IOMMU & 3ware 1237 * & Cerberus: 1238 */ 1239 clear_bit(10, (unsigned long *)&bank[4]); 1240 } 1241 if (c->x86 <= 17 && mce_bootlog < 0) { 1242 /* 1243 * Lots of broken BIOS around that don't clear them 1244 * by default and leave crap in there. Don't log: 1245 */ 1246 mce_bootlog = 0; 1247 } 1248 /* 1249 * Various K7s with broken bank 0 around. Always disable 1250 * by default. 1251 */ 1252 if (c->x86 == 6) 1253 bank[0] = 0; 1254 } 1255 1256 if (c->x86_vendor == X86_VENDOR_INTEL) { 1257 /* 1258 * SDM documents that on family 6 bank 0 should not be written 1259 * because it aliases to another special BIOS controlled 1260 * register. 1261 * But it's not aliased anymore on model 0x1a+ 1262 * Don't ignore bank 0 completely because there could be a 1263 * valid event later, merely don't write CTL0. 1264 */ 1265 1266 if (c->x86 == 6 && c->x86_model < 0x1A) 1267 __set_bit(0, &dont_init_banks); 1268 1269 /* 1270 * All newer Intel systems support MCE broadcasting. Enable 1271 * synchronization with a one second timeout. 1272 */ 1273 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1274 monarch_timeout < 0) 1275 monarch_timeout = USEC_PER_SEC; 1276 } 1277 if (monarch_timeout < 0) 1278 monarch_timeout = 0; 1279 if (mce_bootlog != 0) 1280 mce_panic_timeout = 30; 1281} 1282 1283static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1284{ 1285 if (c->x86 != 5) 1286 return; 1287 switch (c->x86_vendor) { 1288 case X86_VENDOR_INTEL: 1289 if (mce_p5_enabled()) 1290 intel_p5_mcheck_init(c); 1291 break; 1292 case X86_VENDOR_CENTAUR: 1293 winchip_mcheck_init(c); 1294 break; 1295 } 1296} 1297 1298static void mce_cpu_features(struct cpuinfo_x86 *c) 1299{ 1300 switch (c->x86_vendor) { 1301 case X86_VENDOR_INTEL: 1302 mce_intel_feature_init(c); 1303 break; 1304 case X86_VENDOR_AMD: 1305 mce_amd_feature_init(c); 1306 break; 1307 default: 1308 break; 1309 } 1310} 1311 1312static void mce_init_timer(void) 1313{ 1314 struct timer_list *t = &__get_cpu_var(mce_timer); 1315 int *n = &__get_cpu_var(next_interval); 1316 1317 if (mce_ignore_ce) 1318 return; 1319 1320 *n = check_interval * HZ; 1321 if (!*n) 1322 return; 1323 setup_timer(t, mcheck_timer, smp_processor_id()); 1324 t->expires = round_jiffies(jiffies + *n); 1325 add_timer(t); 1326} 1327 1328/* 1329 * Called for each booted CPU to set up machine checks. 1330 * Must be called with preempt off: 1331 */ 1332void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1333{ 1334 if (mce_disabled) 1335 return; 1336 1337 mce_ancient_init(c); 1338 1339 if (!mce_available(c)) 1340 return; 1341 1342 if (mce_cap_init() < 0) { 1343 mce_disabled = 1; 1344 return; 1345 } 1346 mce_cpu_quirks(c); 1347 1348 machine_check_vector = do_machine_check; 1349 1350 mce_init(); 1351 mce_cpu_features(c); 1352 mce_init_timer(); 1353 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1354} 1355 1356/* 1357 * Character device to read and clear the MCE log. 1358 */ 1359 1360static DEFINE_SPINLOCK(mce_state_lock); 1361static int open_count; /* #times opened */ 1362static int open_exclu; /* already open exclusive? */ 1363 1364static int mce_open(struct inode *inode, struct file *file) 1365{ 1366 spin_lock(&mce_state_lock); 1367 1368 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1369 spin_unlock(&mce_state_lock); 1370 1371 return -EBUSY; 1372 } 1373 1374 if (file->f_flags & O_EXCL) 1375 open_exclu = 1; 1376 open_count++; 1377 1378 spin_unlock(&mce_state_lock); 1379 1380 return nonseekable_open(inode, file); 1381} 1382 1383static int mce_release(struct inode *inode, struct file *file) 1384{ 1385 spin_lock(&mce_state_lock); 1386 1387 open_count--; 1388 open_exclu = 0; 1389 1390 spin_unlock(&mce_state_lock); 1391 1392 return 0; 1393} 1394 1395static void collect_tscs(void *data) 1396{ 1397 unsigned long *cpu_tsc = (unsigned long *)data; 1398 1399 rdtscll(cpu_tsc[smp_processor_id()]); 1400} 1401 1402static DEFINE_MUTEX(mce_read_mutex); 1403 1404static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1405 loff_t *off) 1406{ 1407 char __user *buf = ubuf; 1408 unsigned long *cpu_tsc; 1409 unsigned prev, next; 1410 int i, err; 1411 1412 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1413 if (!cpu_tsc) 1414 return -ENOMEM; 1415 1416 mutex_lock(&mce_read_mutex); 1417 next = rcu_dereference(mcelog.next); 1418 1419 /* Only supports full reads right now */ 1420 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1421 mutex_unlock(&mce_read_mutex); 1422 kfree(cpu_tsc); 1423 1424 return -EINVAL; 1425 } 1426 1427 err = 0; 1428 prev = 0; 1429 do { 1430 for (i = prev; i < next; i++) { 1431 unsigned long start = jiffies; 1432 1433 while (!mcelog.entry[i].finished) { 1434 if (time_after_eq(jiffies, start + 2)) { 1435 memset(mcelog.entry + i, 0, 1436 sizeof(struct mce)); 1437 goto timeout; 1438 } 1439 cpu_relax(); 1440 } 1441 smp_rmb(); 1442 err |= copy_to_user(buf, mcelog.entry + i, 1443 sizeof(struct mce)); 1444 buf += sizeof(struct mce); 1445timeout: 1446 ; 1447 } 1448 1449 memset(mcelog.entry + prev, 0, 1450 (next - prev) * sizeof(struct mce)); 1451 prev = next; 1452 next = cmpxchg(&mcelog.next, prev, 0); 1453 } while (next != prev); 1454 1455 synchronize_sched(); 1456 1457 /* 1458 * Collect entries that were still getting written before the 1459 * synchronize. 1460 */ 1461 on_each_cpu(collect_tscs, cpu_tsc, 1); 1462 1463 for (i = next; i < MCE_LOG_LEN; i++) { 1464 if (mcelog.entry[i].finished && 1465 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1466 err |= copy_to_user(buf, mcelog.entry+i, 1467 sizeof(struct mce)); 1468 smp_rmb(); 1469 buf += sizeof(struct mce); 1470 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1471 } 1472 } 1473 mutex_unlock(&mce_read_mutex); 1474 kfree(cpu_tsc); 1475 1476 return err ? -EFAULT : buf - ubuf; 1477} 1478 1479static unsigned int mce_poll(struct file *file, poll_table *wait) 1480{ 1481 poll_wait(file, &mce_wait, wait); 1482 if (rcu_dereference(mcelog.next)) 1483 return POLLIN | POLLRDNORM; 1484 return 0; 1485} 1486 1487static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1488{ 1489 int __user *p = (int __user *)arg; 1490 1491 if (!capable(CAP_SYS_ADMIN)) 1492 return -EPERM; 1493 1494 switch (cmd) { 1495 case MCE_GET_RECORD_LEN: 1496 return put_user(sizeof(struct mce), p); 1497 case MCE_GET_LOG_LEN: 1498 return put_user(MCE_LOG_LEN, p); 1499 case MCE_GETCLEAR_FLAGS: { 1500 unsigned flags; 1501 1502 do { 1503 flags = mcelog.flags; 1504 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1505 1506 return put_user(flags, p); 1507 } 1508 default: 1509 return -ENOTTY; 1510 } 1511} 1512 1513/* Modified in mce-inject.c, so not static or const */ 1514struct file_operations mce_chrdev_ops = { 1515 .open = mce_open, 1516 .release = mce_release, 1517 .read = mce_read, 1518 .poll = mce_poll, 1519 .unlocked_ioctl = mce_ioctl, 1520}; 1521EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1522 1523static struct miscdevice mce_log_device = { 1524 MISC_MCELOG_MINOR, 1525 "mcelog", 1526 &mce_chrdev_ops, 1527}; 1528 1529/* 1530 * mce=off Disables machine check 1531 * mce=no_cmci Disables CMCI 1532 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1533 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1534 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1535 * monarchtimeout is how long to wait for other CPUs on machine 1536 * check, or 0 to not wait 1537 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1538 * mce=nobootlog Don't log MCEs from before booting. 1539 */ 1540static int __init mcheck_enable(char *str) 1541{ 1542 if (*str == 0) 1543 enable_p5_mce(); 1544 if (*str == '=') 1545 str++; 1546 if (!strcmp(str, "off")) 1547 mce_disabled = 1; 1548 else if (!strcmp(str, "no_cmci")) 1549 mce_cmci_disabled = 1; 1550 else if (!strcmp(str, "dont_log_ce")) 1551 mce_dont_log_ce = 1; 1552 else if (!strcmp(str, "ignore_ce")) 1553 mce_ignore_ce = 1; 1554 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1555 mce_bootlog = (str[0] == 'b'); 1556 else if (isdigit(str[0])) { 1557 get_option(&str, &tolerant); 1558 if (*str == ',') { 1559 ++str; 1560 get_option(&str, &monarch_timeout); 1561 } 1562 } else { 1563 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1564 str); 1565 return 0; 1566 } 1567 return 1; 1568} 1569__setup("mce", mcheck_enable); 1570 1571/* 1572 * Sysfs support 1573 */ 1574 1575/* 1576 * Disable machine checks on suspend and shutdown. We can't really handle 1577 * them later. 1578 */ 1579static int mce_disable(void) 1580{ 1581 int i; 1582 1583 for (i = 0; i < banks; i++) { 1584 if (!skip_bank_init(i)) 1585 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1586 } 1587 return 0; 1588} 1589 1590static int mce_suspend(struct sys_device *dev, pm_message_t state) 1591{ 1592 return mce_disable(); 1593} 1594 1595static int mce_shutdown(struct sys_device *dev) 1596{ 1597 return mce_disable(); 1598} 1599 1600/* 1601 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1602 * Only one CPU is active at this time, the others get re-added later using 1603 * CPU hotplug: 1604 */ 1605static int mce_resume(struct sys_device *dev) 1606{ 1607 mce_init(); 1608 mce_cpu_features(¤t_cpu_data); 1609 1610 return 0; 1611} 1612 1613static void mce_cpu_restart(void *data) 1614{ 1615 del_timer_sync(&__get_cpu_var(mce_timer)); 1616 if (!mce_available(¤t_cpu_data)) 1617 return; 1618 mce_init(); 1619 mce_init_timer(); 1620} 1621 1622/* Reinit MCEs after user configuration changes */ 1623static void mce_restart(void) 1624{ 1625 on_each_cpu(mce_cpu_restart, NULL, 1); 1626} 1627 1628static struct sysdev_class mce_sysclass = { 1629 .suspend = mce_suspend, 1630 .shutdown = mce_shutdown, 1631 .resume = mce_resume, 1632 .name = "machinecheck", 1633}; 1634 1635DEFINE_PER_CPU(struct sys_device, mce_dev); 1636 1637__cpuinitdata 1638void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1639 1640static struct sysdev_attribute *bank_attrs; 1641 1642static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1643 char *buf) 1644{ 1645 u64 b = bank[attr - bank_attrs]; 1646 1647 return sprintf(buf, "%llx\n", b); 1648} 1649 1650static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1651 const char *buf, size_t size) 1652{ 1653 u64 new; 1654 1655 if (strict_strtoull(buf, 0, &new) < 0) 1656 return -EINVAL; 1657 1658 bank[attr - bank_attrs] = new; 1659 mce_restart(); 1660 1661 return size; 1662} 1663 1664static ssize_t 1665show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1666{ 1667 strcpy(buf, trigger); 1668 strcat(buf, "\n"); 1669 return strlen(trigger) + 1; 1670} 1671 1672static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1673 const char *buf, size_t siz) 1674{ 1675 char *p; 1676 int len; 1677 1678 strncpy(trigger, buf, sizeof(trigger)); 1679 trigger[sizeof(trigger)-1] = 0; 1680 len = strlen(trigger); 1681 p = strchr(trigger, '\n'); 1682 1683 if (*p) 1684 *p = 0; 1685 1686 return len; 1687} 1688 1689static ssize_t store_int_with_restart(struct sys_device *s, 1690 struct sysdev_attribute *attr, 1691 const char *buf, size_t size) 1692{ 1693 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1694 mce_restart(); 1695 return ret; 1696} 1697 1698static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1699static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1700static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1701 1702static struct sysdev_ext_attribute attr_check_interval = { 1703 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1704 store_int_with_restart), 1705 &check_interval 1706}; 1707 1708static struct sysdev_attribute *mce_attrs[] = { 1709 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1710 &attr_monarch_timeout.attr, 1711 NULL 1712}; 1713 1714static cpumask_var_t mce_dev_initialized; 1715 1716/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1717static __cpuinit int mce_create_device(unsigned int cpu) 1718{ 1719 int err; 1720 int i; 1721 1722 if (!mce_available(&boot_cpu_data)) 1723 return -EIO; 1724 1725 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1726 per_cpu(mce_dev, cpu).id = cpu; 1727 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1728 1729 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1730 if (err) 1731 return err; 1732 1733 for (i = 0; mce_attrs[i]; i++) { 1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1735 if (err) 1736 goto error; 1737 } 1738 for (i = 0; i < banks; i++) { 1739 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1740 &bank_attrs[i]); 1741 if (err) 1742 goto error2; 1743 } 1744 cpumask_set_cpu(cpu, mce_dev_initialized); 1745 1746 return 0; 1747error2: 1748 while (--i >= 0) 1749 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1750error: 1751 while (--i >= 0) 1752 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1753 1754 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1755 1756 return err; 1757} 1758 1759static __cpuinit void mce_remove_device(unsigned int cpu) 1760{ 1761 int i; 1762 1763 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1764 return; 1765 1766 for (i = 0; mce_attrs[i]; i++) 1767 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1768 1769 for (i = 0; i < banks; i++) 1770 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1771 1772 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1773 cpumask_clear_cpu(cpu, mce_dev_initialized); 1774} 1775 1776/* Make sure there are no machine checks on offlined CPUs. */ 1777static void mce_disable_cpu(void *h) 1778{ 1779 unsigned long action = *(unsigned long *)h; 1780 int i; 1781 1782 if (!mce_available(¤t_cpu_data)) 1783 return; 1784 if (!(action & CPU_TASKS_FROZEN)) 1785 cmci_clear(); 1786 for (i = 0; i < banks; i++) { 1787 if (!skip_bank_init(i)) 1788 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1789 } 1790} 1791 1792static void mce_reenable_cpu(void *h) 1793{ 1794 unsigned long action = *(unsigned long *)h; 1795 int i; 1796 1797 if (!mce_available(¤t_cpu_data)) 1798 return; 1799 1800 if (!(action & CPU_TASKS_FROZEN)) 1801 cmci_reenable(); 1802 for (i = 0; i < banks; i++) { 1803 if (!skip_bank_init(i)) 1804 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1805 } 1806} 1807 1808/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1809static int __cpuinit 1810mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1811{ 1812 unsigned int cpu = (unsigned long)hcpu; 1813 struct timer_list *t = &per_cpu(mce_timer, cpu); 1814 1815 switch (action) { 1816 case CPU_ONLINE: 1817 case CPU_ONLINE_FROZEN: 1818 mce_create_device(cpu); 1819 if (threshold_cpu_callback) 1820 threshold_cpu_callback(action, cpu); 1821 break; 1822 case CPU_DEAD: 1823 case CPU_DEAD_FROZEN: 1824 if (threshold_cpu_callback) 1825 threshold_cpu_callback(action, cpu); 1826 mce_remove_device(cpu); 1827 break; 1828 case CPU_DOWN_PREPARE: 1829 case CPU_DOWN_PREPARE_FROZEN: 1830 del_timer_sync(t); 1831 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1832 break; 1833 case CPU_DOWN_FAILED: 1834 case CPU_DOWN_FAILED_FROZEN: 1835 t->expires = round_jiffies(jiffies + 1836 __get_cpu_var(next_interval)); 1837 add_timer_on(t, cpu); 1838 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1839 break; 1840 case CPU_POST_DEAD: 1841 /* intentionally ignoring frozen here */ 1842 cmci_rediscover(cpu); 1843 break; 1844 } 1845 return NOTIFY_OK; 1846} 1847 1848static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1849 .notifier_call = mce_cpu_callback, 1850}; 1851 1852static __init int mce_init_banks(void) 1853{ 1854 int i; 1855 1856 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1857 GFP_KERNEL); 1858 if (!bank_attrs) 1859 return -ENOMEM; 1860 1861 for (i = 0; i < banks; i++) { 1862 struct sysdev_attribute *a = &bank_attrs[i]; 1863 1864 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1865 if (!a->attr.name) 1866 goto nomem; 1867 1868 a->attr.mode = 0644; 1869 a->show = show_bank; 1870 a->store = set_bank; 1871 } 1872 return 0; 1873 1874nomem: 1875 while (--i >= 0) 1876 kfree(bank_attrs[i].attr.name); 1877 kfree(bank_attrs); 1878 bank_attrs = NULL; 1879 1880 return -ENOMEM; 1881} 1882 1883static __init int mce_init_device(void) 1884{ 1885 int err; 1886 int i = 0; 1887 1888 if (!mce_available(&boot_cpu_data)) 1889 return -EIO; 1890 1891 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1892 1893 err = mce_init_banks(); 1894 if (err) 1895 return err; 1896 1897 err = sysdev_class_register(&mce_sysclass); 1898 if (err) 1899 return err; 1900 1901 for_each_online_cpu(i) { 1902 err = mce_create_device(i); 1903 if (err) 1904 return err; 1905 } 1906 1907 register_hotcpu_notifier(&mce_cpu_notifier); 1908 misc_register(&mce_log_device); 1909 1910 return err; 1911} 1912 1913device_initcall(mce_init_device); 1914 1915#else /* CONFIG_X86_OLD_MCE: */ 1916 1917int nr_mce_banks; 1918EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1919 1920/* This has to be run for each processor */ 1921void mcheck_init(struct cpuinfo_x86 *c) 1922{ 1923 if (mce_disabled == 1) 1924 return; 1925 1926 switch (c->x86_vendor) { 1927 case X86_VENDOR_AMD: 1928 amd_mcheck_init(c); 1929 break; 1930 1931 case X86_VENDOR_INTEL: 1932 if (c->x86 == 5) 1933 intel_p5_mcheck_init(c); 1934 if (c->x86 == 6) 1935 intel_p6_mcheck_init(c); 1936 if (c->x86 == 15) 1937 intel_p4_mcheck_init(c); 1938 break; 1939 1940 case X86_VENDOR_CENTAUR: 1941 if (c->x86 == 5) 1942 winchip_mcheck_init(c); 1943 break; 1944 1945 default: 1946 break; 1947 } 1948 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1949} 1950 1951static int __init mcheck_enable(char *str) 1952{ 1953 mce_disabled = -1; 1954 return 1; 1955} 1956 1957__setup("mce", mcheck_enable); 1958 1959#endif /* CONFIG_X86_OLD_MCE */ 1960 1961/* 1962 * Old style boot options parsing. Only for compatibility. 1963 */ 1964static int __init mcheck_disable(char *str) 1965{ 1966 mce_disabled = 1; 1967 return 1; 1968} 1969__setup("nomce", mcheck_disable); 1970