mce.c revision 5bb38adcb54cf7192b154368ad62982caa11ca0b
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int		tolerant	   __read_mostly = 1;
static int		banks		   __read_mostly;
static u64		*bank		   __read_mostly;
static int		rip_msr		   __read_mostly;
static int		mce_bootlog	   __read_mostly = -1;
static int		monarch_timeout	   __read_mostly = -1;
static int		mce_panic_timeout  __read_mostly;
static int		mce_dont_log_ce	   __read_mostly;
int			mce_cmci_disabled  __read_mostly;
int			mce_ignore_ce	   __read_mostly;
int			mce_ser		   __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long	mce_need_notify;
static char		mce_helper[128];
static char		*mce_helper_argv[2] = { mce_helper, NULL };

static unsigned long	dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int		cpu_missing;

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
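/*
 * Banks flagged in dont_init_banks keep their BIOS-programmed MCi_CTL
 * value: mce_init() consults skip_bank_init() before writing the bank
 * control registers (see the Intel family 6 quirk in mce_cpu_quirks()).
 */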
static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
}

static void print_mce_head(void)
{
	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	printk(KERN_EMERG "This is not a software problem!\n"
	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;
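/*
 * mce_paniced serializes machine check panics: the first CPU to
 * increment it performs the actual panic(), every later caller parks
 * in wait_for_panic() below until the final panic IPI arrives.
 */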
/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_inc_return(&mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print the uncorrected ones, with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
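/*
 * The wrappers above let the software error injector substitute fake
 * register contents: once this CPU's injectm.finished is set, MSR reads
 * and writes are redirected into the injected struct mce instead of the
 * real hardware registers (injectm is filled in by mce-inject).
 */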
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);

		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without an APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);
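/*
 * Reminder on the mcp_flags used below (defined in asm/mce.h):
 * MCP_TIMESTAMP keeps the TSC in the logged record, MCP_UC also
 * processes uncorrected errors, and MCP_DONTLOG suppresses logging
 * (used by mce_init() for the boot-time flush when mce_bootlog is off).
 */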
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines the order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for tolerant == 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
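/*
 * Note on units: monarch_timeout is specified in microseconds and
 * converted to nanoseconds in mce_start()/mce_end(); mce_timed_out()
 * burns it down in SPINUNIT (100ns) steps, one per spin iteration.
 */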
/*
 * The Monarch's reign. The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}
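/*
 * Number of entering CPUs that saw a "no way out" (must panic)
 * condition; accumulated in mce_start() and reset in mce_end().
 */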
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
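/*
 * mce_start() and mce_end() bracket the per-bank scanning loop in
 * do_machine_check(): mce_start() rendezvouses all CPUs and hands out
 * the callin order, the loop then runs serialized in that order, and
 * mce_end() lets the Monarch grade everyone via mce_reign().
 */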
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * When there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signaled are
		 * handled by machine_check_poll. Leave them alone, unless
		 * this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was for a corrected error, leave
		 * it to the poll handler -- unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		/*
		 * Action optional error. Queue the address for later
		 * processing. When the ring overflows we just ignore the
		 * AO error.
		 * RED-PEN add some logging mechanism when
		 * mce_usable_address or mce_ring_add fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
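/*
 * Action optional work queued above is drained on the way back to user
 * space: TIF_MCE_NOTIFY makes the exit path call mce_notify_process(),
 * which empties the per-CPU PFN ring into memory_failure(). The work
 * queue (mce_work) is the fallback when that path doesn't run in time
 * or runs on the wrong CPU.
 */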
/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;

	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
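/*
 * To give a concrete feel for the back-off below: with HZ=1000 the
 * per-CPU poll interval ramps between 10ms (HZ/100) and the default
 * check_interval of 5 minutes, halving after a hit and doubling after
 * a miss.
 */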
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
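/*
 * bank[i] is the MCi_CTL mask written for bank i: all ones (the
 * mce_cap_init() default) enables every error type, 0 disables the
 * bank entirely. The quirks below poke individual bits for broken
 * hardware.
 */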
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int		open_count;	/* #times opened */
static int		open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
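/*
 * collect_tscs() snapshots each CPU's TSC so that mce_read() below can
 * tell which entries were fully written before the buffer was reset,
 * and which racing writers it still has to pick up in its second pass.
 */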
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
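/*
 * Example (not from the original source): booting with "mce=2,500"
 * sets tolerant to 2 and monarch_timeout to 500 microseconds, while
 * "mce=nobootlog" suppresses logging of pre-boot machine checks.
 */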
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	len = strlen(mce_helper);
	p = strchr(mce_helper, '\n');

	/* Strip a trailing newline, if any (p is NULL when there is none) */
	if (p)
		*p = 0;

	return len;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}
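/*
 * check_interval writes go through store_int_with_restart() below so
 * that a new value takes effect immediately: mce_restart() reprograms
 * the banks and the polling timer on every CPU.
 */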
static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[j]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}
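/*
 * mce_dev_initialized tracks which CPUs currently own sysfs devices so
 * the hotplug callback below tears each one down exactly once.
 */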
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);