mce.c revision 62fdac5913f71f8f200bd2c9bd59a02e9a1498e9
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/interrupt.h> 14#include <linux/ratelimit.h> 15#include <linux/kallsyms.h> 16#include <linux/rcupdate.h> 17#include <linux/kobject.h> 18#include <linux/uaccess.h> 19#include <linux/kdebug.h> 20#include <linux/kernel.h> 21#include <linux/percpu.h> 22#include <linux/string.h> 23#include <linux/sysdev.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/init.h> 30#include <linux/kmod.h> 31#include <linux/poll.h> 32#include <linux/nmi.h> 33#include <linux/cpu.h> 34#include <linux/smp.h> 35#include <linux/fs.h> 36#include <linux/mm.h> 37 38#include <asm/processor.h> 39#include <asm/hw_irq.h> 40#include <asm/apic.h> 41#include <asm/idle.h> 42#include <asm/ipi.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47#include "mce.h" 48 49/* Handle unconfigured int18 (should never happen) */ 50static void unexpected_machine_check(struct pt_regs *regs, long error_code) 51{ 52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 53 smp_processor_id()); 54} 55 56/* Call the installed machine check handler for this CPU setup. */ 57void (*machine_check_vector)(struct pt_regs *, long error_code) = 58 unexpected_machine_check; 59 60int mce_disabled; 61 62#ifdef CONFIG_X86_NEW_MCE 63 64#define MISC_MCELOG_MINOR 227 65 66#define SPINUNIT 100 /* 100ns */ 67 68atomic_t mce_entry; 69 70DEFINE_PER_CPU(unsigned, mce_exception_count); 71 72/* 73 * Tolerant levels: 74 * 0: always panic on uncorrected errors, log corrected errors 75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 77 * 3: never panic or SIGBUS, log all errors (for testing only) 78 */ 79static int tolerant = 1; 80static int banks; 81static u64 *bank; 82static unsigned long notify_user; 83static int rip_msr; 84static int mce_bootlog = -1; 85static int monarch_timeout = -1; 86static int mce_panic_timeout; 87static int mce_dont_log_ce; 88int mce_cmci_disabled; 89int mce_ignore_ce; 90int mce_ser; 91 92static char trigger[128]; 93static char *trigger_argv[2] = { trigger, NULL }; 94 95static unsigned long dont_init_banks; 96 97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 98static DEFINE_PER_CPU(struct mce, mces_seen); 99static int cpu_missing; 100 101 102/* MCA banks polled by the period polling timer for corrected events */ 103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 105}; 106 107static inline int skip_bank_init(int i) 108{ 109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 110} 111 112static DEFINE_PER_CPU(struct work_struct, mce_work); 113 114/* Do initial initialization of a struct mce */ 115void mce_setup(struct mce *m) 116{ 117 memset(m, 0, sizeof(struct mce)); 118 m->cpu = m->extcpu = smp_processor_id(); 119 rdtscll(m->tsc); 120 /* We hope get_seconds stays lockless */ 121 m->time = get_seconds(); 122 m->cpuvendor = boot_cpu_data.x86_vendor; 123 m->cpuid = cpuid_eax(1); 124#ifdef CONFIG_SMP 125 m->socketid = cpu_data(m->extcpu).phys_proc_id; 126#endif 127 m->apicid = cpu_data(m->extcpu).initial_apicid; 128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 129} 130 131DEFINE_PER_CPU(struct mce, injectm); 132EXPORT_PER_CPU_SYMBOL_GPL(injectm); 133 134/* 135 * Lockless MCE logging infrastructure. 136 * This avoids deadlocks on printk locks without having to break locks. Also 137 * separate MCEs from kernel messages to avoid bogus bug reports. 138 */ 139 140static struct mce_log mcelog = { 141 .signature = MCE_LOG_SIGNATURE, 142 .len = MCE_LOG_LEN, 143 .recordlen = sizeof(struct mce), 144}; 145 146void mce_log(struct mce *mce) 147{ 148 unsigned next, entry; 149 150 mce->finished = 0; 151 wmb(); 152 for (;;) { 153 entry = rcu_dereference(mcelog.next); 154 for (;;) { 155 /* 156 * When the buffer fills up discard new entries. 157 * Assume that the earlier errors are the more 158 * interesting ones: 159 */ 160 if (entry >= MCE_LOG_LEN) { 161 set_bit(MCE_OVERFLOW, 162 (unsigned long *)&mcelog.flags); 163 return; 164 } 165 /* Old left over entry. Skip: */ 166 if (mcelog.entry[entry].finished) { 167 entry++; 168 continue; 169 } 170 break; 171 } 172 smp_rmb(); 173 next = entry + 1; 174 if (cmpxchg(&mcelog.next, entry, next) == entry) 175 break; 176 } 177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 178 wmb(); 179 mcelog.entry[entry].finished = 1; 180 wmb(); 181 182 mce->finished = 1; 183 set_bit(0, ¬ify_user); 184} 185 186static void print_mce(struct mce *m) 187{ 188 printk(KERN_EMERG 189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 190 m->extcpu, m->mcgstatus, m->bank, m->status); 191 if (m->ip) { 192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 194 m->cs, m->ip); 195 if (m->cs == __KERNEL_CS) 196 print_symbol("{%s}", m->ip); 197 printk("\n"); 198 } 199 printk(KERN_EMERG "TSC %llx ", m->tsc); 200 if (m->addr) 201 printk("ADDR %llx ", m->addr); 202 if (m->misc) 203 printk("MISC %llx ", m->misc); 204 printk("\n"); 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 206 m->cpuvendor, m->cpuid, m->time, m->socketid, 207 m->apicid); 208} 209 210static void print_mce_head(void) 211{ 212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 213} 214 215static void print_mce_tail(void) 216{ 217 printk(KERN_EMERG "This is not a software problem!\n" 218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 219} 220 221#define PANIC_TIMEOUT 5 /* 5 seconds */ 222 223static atomic_t mce_paniced; 224 225/* Panic in progress. Enable interrupts and wait for final IPI */ 226static void wait_for_panic(void) 227{ 228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 229 preempt_disable(); 230 local_irq_enable(); 231 while (timeout-- > 0) 232 udelay(1); 233 if (panic_timeout == 0) 234 panic_timeout = mce_panic_timeout; 235 panic("Panicing machine check CPU died"); 236} 237 238static void mce_panic(char *msg, struct mce *final, char *exp) 239{ 240 int i; 241 242 /* 243 * Make sure only one CPU runs in machine check panic 244 */ 245 if (atomic_add_return(1, &mce_paniced) > 1) 246 wait_for_panic(); 247 barrier(); 248 249 bust_spinlocks(1); 250 console_verbose(); 251 print_mce_head(); 252 /* First print corrected ones that are still unlogged */ 253 for (i = 0; i < MCE_LOG_LEN; i++) { 254 struct mce *m = &mcelog.entry[i]; 255 if (!(m->status & MCI_STATUS_VAL)) 256 continue; 257 if (!(m->status & MCI_STATUS_UC)) 258 print_mce(m); 259 } 260 /* Now print uncorrected but with the final one last */ 261 for (i = 0; i < MCE_LOG_LEN; i++) { 262 struct mce *m = &mcelog.entry[i]; 263 if (!(m->status & MCI_STATUS_VAL)) 264 continue; 265 if (!(m->status & MCI_STATUS_UC)) 266 continue; 267 if (!final || memcmp(m, final, sizeof(struct mce))) 268 print_mce(m); 269 } 270 if (final) 271 print_mce(final); 272 if (cpu_missing) 273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 274 print_mce_tail(); 275 if (exp) 276 printk(KERN_EMERG "Machine check: %s\n", exp); 277 if (panic_timeout == 0) 278 panic_timeout = mce_panic_timeout; 279 panic(msg); 280} 281 282/* Support code for software error injection */ 283 284static int msr_to_offset(u32 msr) 285{ 286 unsigned bank = __get_cpu_var(injectm.bank); 287 if (msr == rip_msr) 288 return offsetof(struct mce, ip); 289 if (msr == MSR_IA32_MC0_STATUS + bank*4) 290 return offsetof(struct mce, status); 291 if (msr == MSR_IA32_MC0_ADDR + bank*4) 292 return offsetof(struct mce, addr); 293 if (msr == MSR_IA32_MC0_MISC + bank*4) 294 return offsetof(struct mce, misc); 295 if (msr == MSR_IA32_MCG_STATUS) 296 return offsetof(struct mce, mcgstatus); 297 return -1; 298} 299 300/* MSR access wrappers used for error injection */ 301static u64 mce_rdmsrl(u32 msr) 302{ 303 u64 v; 304 if (__get_cpu_var(injectm).finished) { 305 int offset = msr_to_offset(msr); 306 if (offset < 0) 307 return 0; 308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 309 } 310 rdmsrl(msr, v); 311 return v; 312} 313 314static void mce_wrmsrl(u32 msr, u64 v) 315{ 316 if (__get_cpu_var(injectm).finished) { 317 int offset = msr_to_offset(msr); 318 if (offset >= 0) 319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 320 return; 321 } 322 wrmsrl(msr, v); 323} 324 325/* 326 * Simple lockless ring to communicate PFNs from the exception handler with the 327 * process context work function. This is vastly simplified because there's 328 * only a single reader and a single writer. 329 */ 330#define MCE_RING_SIZE 16 /* we use one entry less */ 331 332struct mce_ring { 333 unsigned short start; 334 unsigned short end; 335 unsigned long ring[MCE_RING_SIZE]; 336}; 337static DEFINE_PER_CPU(struct mce_ring, mce_ring); 338 339/* Runs with CPU affinity in workqueue */ 340static int mce_ring_empty(void) 341{ 342 struct mce_ring *r = &__get_cpu_var(mce_ring); 343 344 return r->start == r->end; 345} 346 347static int mce_ring_get(unsigned long *pfn) 348{ 349 struct mce_ring *r; 350 int ret = 0; 351 352 *pfn = 0; 353 get_cpu(); 354 r = &__get_cpu_var(mce_ring); 355 if (r->start == r->end) 356 goto out; 357 *pfn = r->ring[r->start]; 358 r->start = (r->start + 1) % MCE_RING_SIZE; 359 ret = 1; 360out: 361 put_cpu(); 362 return ret; 363} 364 365/* Always runs in MCE context with preempt off */ 366static int mce_ring_add(unsigned long pfn) 367{ 368 struct mce_ring *r = &__get_cpu_var(mce_ring); 369 unsigned next; 370 371 next = (r->end + 1) % MCE_RING_SIZE; 372 if (next == r->start) 373 return -1; 374 r->ring[r->end] = pfn; 375 wmb(); 376 r->end = next; 377 return 0; 378} 379 380int mce_available(struct cpuinfo_x86 *c) 381{ 382 if (mce_disabled) 383 return 0; 384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 385} 386 387static void mce_schedule_work(void) 388{ 389 if (!mce_ring_empty()) { 390 struct work_struct *work = &__get_cpu_var(mce_work); 391 if (!work_pending(work)) 392 schedule_work(work); 393 } 394} 395 396/* 397 * Get the address of the instruction at the time of the machine check 398 * error. 399 */ 400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 401{ 402 403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { 404 m->ip = regs->ip; 405 m->cs = regs->cs; 406 } else { 407 m->ip = 0; 408 m->cs = 0; 409 } 410 if (rip_msr) 411 m->ip = mce_rdmsrl(rip_msr); 412} 413 414#ifdef CONFIG_X86_LOCAL_APIC 415/* 416 * Called after interrupts have been reenabled again 417 * when a MCE happened during an interrupts off region 418 * in the kernel. 419 */ 420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) 421{ 422 ack_APIC_irq(); 423 exit_idle(); 424 irq_enter(); 425 mce_notify_irq(); 426 mce_schedule_work(); 427 irq_exit(); 428} 429#endif 430 431static void mce_report_event(struct pt_regs *regs) 432{ 433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 434 mce_notify_irq(); 435 /* 436 * Triggering the work queue here is just an insurance 437 * policy in case the syscall exit notify handler 438 * doesn't run soon enough or ends up running on the 439 * wrong CPU (can happen when audit sleeps) 440 */ 441 mce_schedule_work(); 442 return; 443 } 444 445#ifdef CONFIG_X86_LOCAL_APIC 446 /* 447 * Without APIC do not notify. The event will be picked 448 * up eventually. 449 */ 450 if (!cpu_has_apic) 451 return; 452 453 /* 454 * When interrupts are disabled we cannot use 455 * kernel services safely. Trigger an self interrupt 456 * through the APIC to instead do the notification 457 * after interrupts are reenabled again. 458 */ 459 apic->send_IPI_self(MCE_SELF_VECTOR); 460 461 /* 462 * Wait for idle afterwards again so that we don't leave the 463 * APIC in a non idle state because the normal APIC writes 464 * cannot exclude us. 465 */ 466 apic_wait_icr_idle(); 467#endif 468} 469 470DEFINE_PER_CPU(unsigned, mce_poll_count); 471 472/* 473 * Poll for corrected events or events that happened before reset. 474 * Those are just logged through /dev/mcelog. 475 * 476 * This is executed in standard interrupt context. 477 * 478 * Note: spec recommends to panic for fatal unsignalled 479 * errors here. However this would be quite problematic -- 480 * we would need to reimplement the Monarch handling and 481 * it would mess up the exclusion between exception handler 482 * and poll hander -- * so we skip this for now. 483 * These cases should not happen anyways, or only when the CPU 484 * is already totally * confused. In this case it's likely it will 485 * not fully execute the machine check handler either. 486 */ 487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 488{ 489 struct mce m; 490 int i; 491 492 __get_cpu_var(mce_poll_count)++; 493 494 mce_setup(&m); 495 496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 497 for (i = 0; i < banks; i++) { 498 if (!bank[i] || !test_bit(i, *b)) 499 continue; 500 501 m.misc = 0; 502 m.addr = 0; 503 m.bank = i; 504 m.tsc = 0; 505 506 barrier(); 507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 508 if (!(m.status & MCI_STATUS_VAL)) 509 continue; 510 511 /* 512 * Uncorrected or signalled events are handled by the exception 513 * handler when it is enabled, so don't process those here. 514 * 515 * TBD do the same check for MCI_STATUS_EN here? 516 */ 517 if (!(flags & MCP_UC) && 518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 519 continue; 520 521 if (m.status & MCI_STATUS_MISCV) 522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 523 if (m.status & MCI_STATUS_ADDRV) 524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 525 526 if (!(flags & MCP_TIMESTAMP)) 527 m.tsc = 0; 528 /* 529 * Don't get the IP here because it's unlikely to 530 * have anything to do with the actual error location. 531 */ 532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 533 mce_log(&m); 534 add_taint(TAINT_MACHINE_CHECK); 535 } 536 537 /* 538 * Clear state for this bank. 539 */ 540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 541 } 542 543 /* 544 * Don't clear MCG_STATUS here because it's only defined for 545 * exceptions. 546 */ 547 548 sync_core(); 549} 550EXPORT_SYMBOL_GPL(machine_check_poll); 551 552/* 553 * Do a quick check if any of the events requires a panic. 554 * This decides if we keep the events around or clear them. 555 */ 556static int mce_no_way_out(struct mce *m, char **msg) 557{ 558 int i; 559 560 for (i = 0; i < banks; i++) { 561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 563 return 1; 564 } 565 return 0; 566} 567 568/* 569 * Variable to establish order between CPUs while scanning. 570 * Each CPU spins initially until executing is equal its number. 571 */ 572static atomic_t mce_executing; 573 574/* 575 * Defines order of CPUs on entry. First CPU becomes Monarch. 576 */ 577static atomic_t mce_callin; 578 579/* 580 * Check if a timeout waiting for other CPUs happened. 581 */ 582static int mce_timed_out(u64 *t) 583{ 584 /* 585 * The others already did panic for some reason. 586 * Bail out like in a timeout. 587 * rmb() to tell the compiler that system_state 588 * might have been modified by someone else. 589 */ 590 rmb(); 591 if (atomic_read(&mce_paniced)) 592 wait_for_panic(); 593 if (!monarch_timeout) 594 goto out; 595 if ((s64)*t < SPINUNIT) { 596 /* CHECKME: Make panic default for 1 too? */ 597 if (tolerant < 1) 598 mce_panic("Timeout synchronizing machine check over CPUs", 599 NULL, NULL); 600 cpu_missing = 1; 601 return 1; 602 } 603 *t -= SPINUNIT; 604out: 605 touch_nmi_watchdog(); 606 return 0; 607} 608 609/* 610 * The Monarch's reign. The Monarch is the CPU who entered 611 * the machine check handler first. It waits for the others to 612 * raise the exception too and then grades them. When any 613 * error is fatal panic. Only then let the others continue. 614 * 615 * The other CPUs entering the MCE handler will be controlled by the 616 * Monarch. They are called Subjects. 617 * 618 * This way we prevent any potential data corruption in a unrecoverable case 619 * and also makes sure always all CPU's errors are examined. 620 * 621 * Also this detects the case of an machine check event coming from outer 622 * space (not detected by any CPUs) In this case some external agent wants 623 * us to shut down, so panic too. 624 * 625 * The other CPUs might still decide to panic if the handler happens 626 * in a unrecoverable place, but in this case the system is in a semi-stable 627 * state and won't corrupt anything by itself. It's ok to let the others 628 * continue for a bit first. 629 * 630 * All the spin loops have timeouts; when a timeout happens a CPU 631 * typically elects itself to be Monarch. 632 */ 633static void mce_reign(void) 634{ 635 int cpu; 636 struct mce *m = NULL; 637 int global_worst = 0; 638 char *msg = NULL; 639 char *nmsg = NULL; 640 641 /* 642 * This CPU is the Monarch and the other CPUs have run 643 * through their handlers. 644 * Grade the severity of the errors of all the CPUs. 645 */ 646 for_each_possible_cpu(cpu) { 647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 648 &nmsg); 649 if (severity > global_worst) { 650 msg = nmsg; 651 global_worst = severity; 652 m = &per_cpu(mces_seen, cpu); 653 } 654 } 655 656 /* 657 * Cannot recover? Panic here then. 658 * This dumps all the mces in the log buffer and stops the 659 * other CPUs. 660 */ 661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 662 mce_panic("Fatal Machine check", m, msg); 663 664 /* 665 * For UC somewhere we let the CPU who detects it handle it. 666 * Also must let continue the others, otherwise the handling 667 * CPU could deadlock on a lock. 668 */ 669 670 /* 671 * No machine check event found. Must be some external 672 * source or one CPU is hung. Panic. 673 */ 674 if (!m && tolerant < 3) 675 mce_panic("Machine check from unknown source", NULL, NULL); 676 677 /* 678 * Now clear all the mces_seen so that they don't reappear on 679 * the next mce. 680 */ 681 for_each_possible_cpu(cpu) 682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 683} 684 685static atomic_t global_nwo; 686 687/* 688 * Start of Monarch synchronization. This waits until all CPUs have 689 * entered the exception handler and then determines if any of them 690 * saw a fatal event that requires panic. Then it executes them 691 * in the entry order. 692 * TBD double check parallel CPU hotunplug 693 */ 694static int mce_start(int no_way_out, int *order) 695{ 696 int nwo; 697 int cpus = num_online_cpus(); 698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 699 700 if (!timeout) { 701 *order = -1; 702 return no_way_out; 703 } 704 705 atomic_add(no_way_out, &global_nwo); 706 707 /* 708 * Wait for everyone. 709 */ 710 while (atomic_read(&mce_callin) != cpus) { 711 if (mce_timed_out(&timeout)) { 712 atomic_set(&global_nwo, 0); 713 *order = -1; 714 return no_way_out; 715 } 716 ndelay(SPINUNIT); 717 } 718 719 /* 720 * Cache the global no_way_out state. 721 */ 722 nwo = atomic_read(&global_nwo); 723 724 /* 725 * Monarch starts executing now, the others wait. 726 */ 727 if (*order == 1) { 728 atomic_set(&mce_executing, 1); 729 return nwo; 730 } 731 732 /* 733 * Now start the scanning loop one by one 734 * in the original callin order. 735 * This way when there are any shared banks it will 736 * be only seen by one CPU before cleared, avoiding duplicates. 737 */ 738 while (atomic_read(&mce_executing) < *order) { 739 if (mce_timed_out(&timeout)) { 740 atomic_set(&global_nwo, 0); 741 *order = -1; 742 return no_way_out; 743 } 744 ndelay(SPINUNIT); 745 } 746 return nwo; 747} 748 749/* 750 * Synchronize between CPUs after main scanning loop. 751 * This invokes the bulk of the Monarch processing. 752 */ 753static int mce_end(int order) 754{ 755 int ret = -1; 756 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 757 758 if (!timeout) 759 goto reset; 760 if (order < 0) 761 goto reset; 762 763 /* 764 * Allow others to run. 765 */ 766 atomic_inc(&mce_executing); 767 768 if (order == 1) { 769 /* CHECKME: Can this race with a parallel hotplug? */ 770 int cpus = num_online_cpus(); 771 772 /* 773 * Monarch: Wait for everyone to go through their scanning 774 * loops. 775 */ 776 while (atomic_read(&mce_executing) <= cpus) { 777 if (mce_timed_out(&timeout)) 778 goto reset; 779 ndelay(SPINUNIT); 780 } 781 782 mce_reign(); 783 barrier(); 784 ret = 0; 785 } else { 786 /* 787 * Subject: Wait for Monarch to finish. 788 */ 789 while (atomic_read(&mce_executing) != 0) { 790 if (mce_timed_out(&timeout)) 791 goto reset; 792 ndelay(SPINUNIT); 793 } 794 795 /* 796 * Don't reset anything. That's done by the Monarch. 797 */ 798 return 0; 799 } 800 801 /* 802 * Reset all global state. 803 */ 804reset: 805 atomic_set(&global_nwo, 0); 806 atomic_set(&mce_callin, 0); 807 barrier(); 808 809 /* 810 * Let others run again. 811 */ 812 atomic_set(&mce_executing, 0); 813 return ret; 814} 815 816/* 817 * Check if the address reported by the CPU is in a format we can parse. 818 * It would be possible to add code for most other cases, but all would 819 * be somewhat complicated (e.g. segment offset would require an instruction 820 * parser). So only support physical addresses upto page granuality for now. 821 */ 822static int mce_usable_address(struct mce *m) 823{ 824 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 825 return 0; 826 if ((m->misc & 0x3f) > PAGE_SHIFT) 827 return 0; 828 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 829 return 0; 830 return 1; 831} 832 833static void mce_clear_state(unsigned long *toclear) 834{ 835 int i; 836 837 for (i = 0; i < banks; i++) { 838 if (test_bit(i, toclear)) 839 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 840 } 841} 842 843/* 844 * The actual machine check handler. This only handles real 845 * exceptions when something got corrupted coming in through int 18. 846 * 847 * This is executed in NMI context not subject to normal locking rules. This 848 * implies that most kernel services cannot be safely used. Don't even 849 * think about putting a printk in there! 850 * 851 * On Intel systems this is entered on all CPUs in parallel through 852 * MCE broadcast. However some CPUs might be broken beyond repair, 853 * so be always careful when synchronizing with others. 854 */ 855void do_machine_check(struct pt_regs *regs, long error_code) 856{ 857 struct mce m, *final; 858 int i; 859 int worst = 0; 860 int severity; 861 /* 862 * Establish sequential order between the CPUs entering the machine 863 * check handler. 864 */ 865 int order; 866 867 /* 868 * If no_way_out gets set, there is no safe way to recover from this 869 * MCE. If tolerant is cranked up, we'll try anyway. 870 */ 871 int no_way_out = 0; 872 /* 873 * If kill_it gets set, there might be a way to recover from this 874 * error. 875 */ 876 int kill_it = 0; 877 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 878 char *msg = "Unknown"; 879 880 atomic_inc(&mce_entry); 881 882 __get_cpu_var(mce_exception_count)++; 883 884 if (notify_die(DIE_NMI, "machine check", regs, error_code, 885 18, SIGKILL) == NOTIFY_STOP) 886 goto out; 887 if (!banks) 888 goto out; 889 890 order = atomic_add_return(1, &mce_callin); 891 mce_setup(&m); 892 893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 894 no_way_out = mce_no_way_out(&m, &msg); 895 896 final = &__get_cpu_var(mces_seen); 897 *final = m; 898 899 barrier(); 900 901 /* 902 * When no restart IP must always kill or panic. 903 */ 904 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 905 kill_it = 1; 906 907 /* 908 * Go through all the banks in exclusion of the other CPUs. 909 * This way we don't report duplicated events on shared banks 910 * because the first one to see it will clear it. 911 */ 912 no_way_out = mce_start(no_way_out, &order); 913 for (i = 0; i < banks; i++) { 914 __clear_bit(i, toclear); 915 if (!bank[i]) 916 continue; 917 918 m.misc = 0; 919 m.addr = 0; 920 m.bank = i; 921 922 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 923 if ((m.status & MCI_STATUS_VAL) == 0) 924 continue; 925 926 /* 927 * Non uncorrected or non signaled errors are handled by 928 * machine_check_poll. Leave them alone, unless this panics. 929 */ 930 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 931 !no_way_out) 932 continue; 933 934 /* 935 * Set taint even when machine check was not enabled. 936 */ 937 add_taint(TAINT_MACHINE_CHECK); 938 939 severity = mce_severity(&m, tolerant, NULL); 940 941 /* 942 * When machine check was for corrected handler don't touch, 943 * unless we're panicing. 944 */ 945 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 946 continue; 947 __set_bit(i, toclear); 948 if (severity == MCE_NO_SEVERITY) { 949 /* 950 * Machine check event was not enabled. Clear, but 951 * ignore. 952 */ 953 continue; 954 } 955 956 /* 957 * Kill on action required. 958 */ 959 if (severity == MCE_AR_SEVERITY) 960 kill_it = 1; 961 962 if (m.status & MCI_STATUS_MISCV) 963 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 964 if (m.status & MCI_STATUS_ADDRV) 965 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 966 967 /* 968 * Action optional error. Queue address for later processing. 969 * When the ring overflows we just ignore the AO error. 970 * RED-PEN add some logging mechanism when 971 * usable_address or mce_add_ring fails. 972 * RED-PEN don't ignore overflow for tolerant == 0 973 */ 974 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 975 mce_ring_add(m.addr >> PAGE_SHIFT); 976 977 mce_get_rip(&m, regs); 978 mce_log(&m); 979 980 if (severity > worst) { 981 *final = m; 982 worst = severity; 983 } 984 } 985 986 if (!no_way_out) 987 mce_clear_state(toclear); 988 989 /* 990 * Do most of the synchronization with other CPUs. 991 * When there's any problem use only local no_way_out state. 992 */ 993 if (mce_end(order) < 0) 994 no_way_out = worst >= MCE_PANIC_SEVERITY; 995 996 /* 997 * If we have decided that we just CAN'T continue, and the user 998 * has not set tolerant to an insane level, give up and die. 999 * 1000 * This is mainly used in the case when the system doesn't 1001 * support MCE broadcasting or it has been disabled. 1002 */ 1003 if (no_way_out && tolerant < 3) 1004 mce_panic("Fatal machine check on current CPU", final, msg); 1005 1006 /* 1007 * If the error seems to be unrecoverable, something should be 1008 * done. Try to kill as little as possible. If we can kill just 1009 * one task, do that. If the user has set the tolerance very 1010 * high, don't try to do anything at all. 1011 */ 1012 1013 if (kill_it && tolerant < 3) 1014 force_sig(SIGBUS, current); 1015 1016 /* notify userspace ASAP */ 1017 set_thread_flag(TIF_MCE_NOTIFY); 1018 1019 if (worst > 0) 1020 mce_report_event(regs); 1021 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1022out: 1023 atomic_dec(&mce_entry); 1024 sync_core(); 1025} 1026EXPORT_SYMBOL_GPL(do_machine_check); 1027 1028/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1029void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1030{ 1031 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1032} 1033 1034/* 1035 * Called after mce notification in process context. This code 1036 * is allowed to sleep. Call the high level VM handler to process 1037 * any corrupted pages. 1038 * Assume that the work queue code only calls this one at a time 1039 * per CPU. 1040 * Note we don't disable preemption, so this code might run on the wrong 1041 * CPU. In this case the event is picked up by the scheduled work queue. 1042 * This is merely a fast path to expedite processing in some common 1043 * cases. 1044 */ 1045void mce_notify_process(void) 1046{ 1047 unsigned long pfn; 1048 mce_notify_irq(); 1049 while (mce_ring_get(&pfn)) 1050 memory_failure(pfn, MCE_VECTOR); 1051} 1052 1053static void mce_process_work(struct work_struct *dummy) 1054{ 1055 mce_notify_process(); 1056} 1057 1058#ifdef CONFIG_X86_MCE_INTEL 1059/*** 1060 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1061 * @cpu: The CPU on which the event occurred. 1062 * @status: Event status information 1063 * 1064 * This function should be called by the thermal interrupt after the 1065 * event has been processed and the decision was made to log the event 1066 * further. 1067 * 1068 * The status parameter will be saved to the 'status' field of 'struct mce' 1069 * and historically has been the register value of the 1070 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1071 */ 1072void mce_log_therm_throt_event(__u64 status) 1073{ 1074 struct mce m; 1075 1076 mce_setup(&m); 1077 m.bank = MCE_THERMAL_BANK; 1078 m.status = status; 1079 mce_log(&m); 1080} 1081#endif /* CONFIG_X86_MCE_INTEL */ 1082 1083/* 1084 * Periodic polling timer for "silent" machine check errors. If the 1085 * poller finds an MCE, poll 2x faster. When the poller finds no more 1086 * errors, poll 2x slower (up to check_interval seconds). 1087 */ 1088static int check_interval = 5 * 60; /* 5 minutes */ 1089 1090static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1091static DEFINE_PER_CPU(struct timer_list, mce_timer); 1092 1093static void mcheck_timer(unsigned long data) 1094{ 1095 struct timer_list *t = &per_cpu(mce_timer, data); 1096 int *n; 1097 1098 WARN_ON(smp_processor_id() != data); 1099 1100 if (mce_available(¤t_cpu_data)) { 1101 machine_check_poll(MCP_TIMESTAMP, 1102 &__get_cpu_var(mce_poll_banks)); 1103 } 1104 1105 /* 1106 * Alert userspace if needed. If we logged an MCE, reduce the 1107 * polling interval, otherwise increase the polling interval. 1108 */ 1109 n = &__get_cpu_var(next_interval); 1110 if (mce_notify_irq()) 1111 *n = max(*n/2, HZ/100); 1112 else 1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1114 1115 t->expires = jiffies + *n; 1116 add_timer(t); 1117} 1118 1119static void mce_do_trigger(struct work_struct *work) 1120{ 1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 1122} 1123 1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1125 1126/* 1127 * Notify the user(s) about new machine check events. 1128 * Can be called from interrupt context, but not from machine check/NMI 1129 * context. 1130 */ 1131int mce_notify_irq(void) 1132{ 1133 /* Not more than two messages every minute */ 1134 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1135 1136 clear_thread_flag(TIF_MCE_NOTIFY); 1137 1138 if (test_and_clear_bit(0, ¬ify_user)) { 1139 wake_up_interruptible(&mce_wait); 1140 1141 /* 1142 * There is no risk of missing notifications because 1143 * work_pending is always cleared before the function is 1144 * executed. 1145 */ 1146 if (trigger[0] && !work_pending(&mce_trigger_work)) 1147 schedule_work(&mce_trigger_work); 1148 1149 if (__ratelimit(&ratelimit)) 1150 printk(KERN_INFO "Machine check events logged\n"); 1151 1152 return 1; 1153 } 1154 return 0; 1155} 1156EXPORT_SYMBOL_GPL(mce_notify_irq); 1157 1158/* 1159 * Initialize Machine Checks for a CPU. 1160 */ 1161static int mce_cap_init(void) 1162{ 1163 unsigned b; 1164 u64 cap; 1165 1166 rdmsrl(MSR_IA32_MCG_CAP, cap); 1167 1168 b = cap & MCG_BANKCNT_MASK; 1169 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1170 1171 if (b > MAX_NR_BANKS) { 1172 printk(KERN_WARNING 1173 "MCE: Using only %u machine check banks out of %u\n", 1174 MAX_NR_BANKS, b); 1175 b = MAX_NR_BANKS; 1176 } 1177 1178 /* Don't support asymmetric configurations today */ 1179 WARN_ON(banks != 0 && b != banks); 1180 banks = b; 1181 if (!bank) { 1182 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1183 if (!bank) 1184 return -ENOMEM; 1185 memset(bank, 0xff, banks * sizeof(u64)); 1186 } 1187 1188 /* Use accurate RIP reporting if available. */ 1189 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1190 rip_msr = MSR_IA32_MCG_EIP; 1191 1192 if (cap & MCG_SER_P) 1193 mce_ser = 1; 1194 1195 return 0; 1196} 1197 1198static void mce_init(void) 1199{ 1200 mce_banks_t all_banks; 1201 u64 cap; 1202 int i; 1203 1204 /* 1205 * Log the machine checks left over from the previous reset. 1206 */ 1207 bitmap_fill(all_banks, MAX_NR_BANKS); 1208 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1209 1210 set_in_cr4(X86_CR4_MCE); 1211 1212 rdmsrl(MSR_IA32_MCG_CAP, cap); 1213 if (cap & MCG_CTL_P) 1214 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1215 1216 for (i = 0; i < banks; i++) { 1217 if (skip_bank_init(i)) 1218 continue; 1219 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1220 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1221 } 1222} 1223 1224/* Add per CPU specific workarounds here */ 1225static void mce_cpu_quirks(struct cpuinfo_x86 *c) 1226{ 1227 /* This should be disabled by the BIOS, but isn't always */ 1228 if (c->x86_vendor == X86_VENDOR_AMD) { 1229 if (c->x86 == 15 && banks > 4) { 1230 /* 1231 * disable GART TBL walk error reporting, which 1232 * trips off incorrectly with the IOMMU & 3ware 1233 * & Cerberus: 1234 */ 1235 clear_bit(10, (unsigned long *)&bank[4]); 1236 } 1237 if (c->x86 <= 17 && mce_bootlog < 0) { 1238 /* 1239 * Lots of broken BIOS around that don't clear them 1240 * by default and leave crap in there. Don't log: 1241 */ 1242 mce_bootlog = 0; 1243 } 1244 /* 1245 * Various K7s with broken bank 0 around. Always disable 1246 * by default. 1247 */ 1248 if (c->x86 == 6) 1249 bank[0] = 0; 1250 } 1251 1252 if (c->x86_vendor == X86_VENDOR_INTEL) { 1253 /* 1254 * SDM documents that on family 6 bank 0 should not be written 1255 * because it aliases to another special BIOS controlled 1256 * register. 1257 * But it's not aliased anymore on model 0x1a+ 1258 * Don't ignore bank 0 completely because there could be a 1259 * valid event later, merely don't write CTL0. 1260 */ 1261 1262 if (c->x86 == 6 && c->x86_model < 0x1A) 1263 __set_bit(0, &dont_init_banks); 1264 1265 /* 1266 * All newer Intel systems support MCE broadcasting. Enable 1267 * synchronization with a one second timeout. 1268 */ 1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1270 monarch_timeout < 0) 1271 monarch_timeout = USEC_PER_SEC; 1272 } 1273 if (monarch_timeout < 0) 1274 monarch_timeout = 0; 1275 if (mce_bootlog != 0) 1276 mce_panic_timeout = 30; 1277} 1278 1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1280{ 1281 if (c->x86 != 5) 1282 return; 1283 switch (c->x86_vendor) { 1284 case X86_VENDOR_INTEL: 1285 if (mce_p5_enabled()) 1286 intel_p5_mcheck_init(c); 1287 break; 1288 case X86_VENDOR_CENTAUR: 1289 winchip_mcheck_init(c); 1290 break; 1291 } 1292} 1293 1294static void mce_cpu_features(struct cpuinfo_x86 *c) 1295{ 1296 switch (c->x86_vendor) { 1297 case X86_VENDOR_INTEL: 1298 mce_intel_feature_init(c); 1299 break; 1300 case X86_VENDOR_AMD: 1301 mce_amd_feature_init(c); 1302 break; 1303 default: 1304 break; 1305 } 1306} 1307 1308static void mce_init_timer(void) 1309{ 1310 struct timer_list *t = &__get_cpu_var(mce_timer); 1311 int *n = &__get_cpu_var(next_interval); 1312 1313 if (mce_ignore_ce) 1314 return; 1315 1316 *n = check_interval * HZ; 1317 if (!*n) 1318 return; 1319 setup_timer(t, mcheck_timer, smp_processor_id()); 1320 t->expires = round_jiffies(jiffies + *n); 1321 add_timer(t); 1322} 1323 1324/* 1325 * Called for each booted CPU to set up machine checks. 1326 * Must be called with preempt off: 1327 */ 1328void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1329{ 1330 if (mce_disabled) 1331 return; 1332 1333 mce_ancient_init(c); 1334 1335 if (!mce_available(c)) 1336 return; 1337 1338 if (mce_cap_init() < 0) { 1339 mce_disabled = 1; 1340 return; 1341 } 1342 mce_cpu_quirks(c); 1343 1344 machine_check_vector = do_machine_check; 1345 1346 mce_init(); 1347 mce_cpu_features(c); 1348 mce_init_timer(); 1349 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1350} 1351 1352/* 1353 * Character device to read and clear the MCE log. 1354 */ 1355 1356static DEFINE_SPINLOCK(mce_state_lock); 1357static int open_count; /* #times opened */ 1358static int open_exclu; /* already open exclusive? */ 1359 1360static int mce_open(struct inode *inode, struct file *file) 1361{ 1362 spin_lock(&mce_state_lock); 1363 1364 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1365 spin_unlock(&mce_state_lock); 1366 1367 return -EBUSY; 1368 } 1369 1370 if (file->f_flags & O_EXCL) 1371 open_exclu = 1; 1372 open_count++; 1373 1374 spin_unlock(&mce_state_lock); 1375 1376 return nonseekable_open(inode, file); 1377} 1378 1379static int mce_release(struct inode *inode, struct file *file) 1380{ 1381 spin_lock(&mce_state_lock); 1382 1383 open_count--; 1384 open_exclu = 0; 1385 1386 spin_unlock(&mce_state_lock); 1387 1388 return 0; 1389} 1390 1391static void collect_tscs(void *data) 1392{ 1393 unsigned long *cpu_tsc = (unsigned long *)data; 1394 1395 rdtscll(cpu_tsc[smp_processor_id()]); 1396} 1397 1398static DEFINE_MUTEX(mce_read_mutex); 1399 1400static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1401 loff_t *off) 1402{ 1403 char __user *buf = ubuf; 1404 unsigned long *cpu_tsc; 1405 unsigned prev, next; 1406 int i, err; 1407 1408 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1409 if (!cpu_tsc) 1410 return -ENOMEM; 1411 1412 mutex_lock(&mce_read_mutex); 1413 next = rcu_dereference(mcelog.next); 1414 1415 /* Only supports full reads right now */ 1416 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1417 mutex_unlock(&mce_read_mutex); 1418 kfree(cpu_tsc); 1419 1420 return -EINVAL; 1421 } 1422 1423 err = 0; 1424 prev = 0; 1425 do { 1426 for (i = prev; i < next; i++) { 1427 unsigned long start = jiffies; 1428 1429 while (!mcelog.entry[i].finished) { 1430 if (time_after_eq(jiffies, start + 2)) { 1431 memset(mcelog.entry + i, 0, 1432 sizeof(struct mce)); 1433 goto timeout; 1434 } 1435 cpu_relax(); 1436 } 1437 smp_rmb(); 1438 err |= copy_to_user(buf, mcelog.entry + i, 1439 sizeof(struct mce)); 1440 buf += sizeof(struct mce); 1441timeout: 1442 ; 1443 } 1444 1445 memset(mcelog.entry + prev, 0, 1446 (next - prev) * sizeof(struct mce)); 1447 prev = next; 1448 next = cmpxchg(&mcelog.next, prev, 0); 1449 } while (next != prev); 1450 1451 synchronize_sched(); 1452 1453 /* 1454 * Collect entries that were still getting written before the 1455 * synchronize. 1456 */ 1457 on_each_cpu(collect_tscs, cpu_tsc, 1); 1458 1459 for (i = next; i < MCE_LOG_LEN; i++) { 1460 if (mcelog.entry[i].finished && 1461 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1462 err |= copy_to_user(buf, mcelog.entry+i, 1463 sizeof(struct mce)); 1464 smp_rmb(); 1465 buf += sizeof(struct mce); 1466 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1467 } 1468 } 1469 mutex_unlock(&mce_read_mutex); 1470 kfree(cpu_tsc); 1471 1472 return err ? -EFAULT : buf - ubuf; 1473} 1474 1475static unsigned int mce_poll(struct file *file, poll_table *wait) 1476{ 1477 poll_wait(file, &mce_wait, wait); 1478 if (rcu_dereference(mcelog.next)) 1479 return POLLIN | POLLRDNORM; 1480 return 0; 1481} 1482 1483static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1484{ 1485 int __user *p = (int __user *)arg; 1486 1487 if (!capable(CAP_SYS_ADMIN)) 1488 return -EPERM; 1489 1490 switch (cmd) { 1491 case MCE_GET_RECORD_LEN: 1492 return put_user(sizeof(struct mce), p); 1493 case MCE_GET_LOG_LEN: 1494 return put_user(MCE_LOG_LEN, p); 1495 case MCE_GETCLEAR_FLAGS: { 1496 unsigned flags; 1497 1498 do { 1499 flags = mcelog.flags; 1500 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1501 1502 return put_user(flags, p); 1503 } 1504 default: 1505 return -ENOTTY; 1506 } 1507} 1508 1509/* Modified in mce-inject.c, so not static or const */ 1510struct file_operations mce_chrdev_ops = { 1511 .open = mce_open, 1512 .release = mce_release, 1513 .read = mce_read, 1514 .poll = mce_poll, 1515 .unlocked_ioctl = mce_ioctl, 1516}; 1517EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1518 1519static struct miscdevice mce_log_device = { 1520 MISC_MCELOG_MINOR, 1521 "mcelog", 1522 &mce_chrdev_ops, 1523}; 1524 1525/* 1526 * mce=off Disables machine check 1527 * mce=no_cmci Disables CMCI 1528 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1529 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1530 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1531 * monarchtimeout is how long to wait for other CPUs on machine 1532 * check, or 0 to not wait 1533 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1534 * mce=nobootlog Don't log MCEs from before booting. 1535 */ 1536static int __init mcheck_enable(char *str) 1537{ 1538 if (*str == 0) 1539 enable_p5_mce(); 1540 if (*str == '=') 1541 str++; 1542 if (!strcmp(str, "off")) 1543 mce_disabled = 1; 1544 else if (!strcmp(str, "no_cmci")) 1545 mce_cmci_disabled = 1; 1546 else if (!strcmp(str, "dont_log_ce")) 1547 mce_dont_log_ce = 1; 1548 else if (!strcmp(str, "ignore_ce")) 1549 mce_ignore_ce = 1; 1550 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1551 mce_bootlog = (str[0] == 'b'); 1552 else if (isdigit(str[0])) { 1553 get_option(&str, &tolerant); 1554 if (*str == ',') { 1555 ++str; 1556 get_option(&str, &monarch_timeout); 1557 } 1558 } else { 1559 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1560 str); 1561 return 0; 1562 } 1563 return 1; 1564} 1565__setup("mce", mcheck_enable); 1566 1567/* 1568 * Sysfs support 1569 */ 1570 1571/* 1572 * Disable machine checks on suspend and shutdown. We can't really handle 1573 * them later. 1574 */ 1575static int mce_disable(void) 1576{ 1577 int i; 1578 1579 for (i = 0; i < banks; i++) { 1580 if (!skip_bank_init(i)) 1581 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1582 } 1583 return 0; 1584} 1585 1586static int mce_suspend(struct sys_device *dev, pm_message_t state) 1587{ 1588 return mce_disable(); 1589} 1590 1591static int mce_shutdown(struct sys_device *dev) 1592{ 1593 return mce_disable(); 1594} 1595 1596/* 1597 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1598 * Only one CPU is active at this time, the others get re-added later using 1599 * CPU hotplug: 1600 */ 1601static int mce_resume(struct sys_device *dev) 1602{ 1603 mce_init(); 1604 mce_cpu_features(¤t_cpu_data); 1605 1606 return 0; 1607} 1608 1609static void mce_cpu_restart(void *data) 1610{ 1611 del_timer_sync(&__get_cpu_var(mce_timer)); 1612 if (mce_available(¤t_cpu_data)) 1613 mce_init(); 1614 mce_init_timer(); 1615} 1616 1617/* Reinit MCEs after user configuration changes */ 1618static void mce_restart(void) 1619{ 1620 on_each_cpu(mce_cpu_restart, NULL, 1); 1621} 1622 1623static struct sysdev_class mce_sysclass = { 1624 .suspend = mce_suspend, 1625 .shutdown = mce_shutdown, 1626 .resume = mce_resume, 1627 .name = "machinecheck", 1628}; 1629 1630DEFINE_PER_CPU(struct sys_device, mce_dev); 1631 1632__cpuinitdata 1633void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1634 1635static struct sysdev_attribute *bank_attrs; 1636 1637static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1638 char *buf) 1639{ 1640 u64 b = bank[attr - bank_attrs]; 1641 1642 return sprintf(buf, "%llx\n", b); 1643} 1644 1645static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1646 const char *buf, size_t size) 1647{ 1648 u64 new; 1649 1650 if (strict_strtoull(buf, 0, &new) < 0) 1651 return -EINVAL; 1652 1653 bank[attr - bank_attrs] = new; 1654 mce_restart(); 1655 1656 return size; 1657} 1658 1659static ssize_t 1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1661{ 1662 strcpy(buf, trigger); 1663 strcat(buf, "\n"); 1664 return strlen(trigger) + 1; 1665} 1666 1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1668 const char *buf, size_t siz) 1669{ 1670 char *p; 1671 int len; 1672 1673 strncpy(trigger, buf, sizeof(trigger)); 1674 trigger[sizeof(trigger)-1] = 0; 1675 len = strlen(trigger); 1676 p = strchr(trigger, '\n'); 1677 1678 if (*p) 1679 *p = 0; 1680 1681 return len; 1682} 1683 1684static ssize_t store_int_with_restart(struct sys_device *s, 1685 struct sysdev_attribute *attr, 1686 const char *buf, size_t size) 1687{ 1688 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1689 mce_restart(); 1690 return ret; 1691} 1692 1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1696 1697static struct sysdev_ext_attribute attr_check_interval = { 1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1699 store_int_with_restart), 1700 &check_interval 1701}; 1702 1703static struct sysdev_attribute *mce_attrs[] = { 1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1705 &attr_monarch_timeout.attr, 1706 NULL 1707}; 1708 1709static cpumask_var_t mce_dev_initialized; 1710 1711/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1712static __cpuinit int mce_create_device(unsigned int cpu) 1713{ 1714 int err; 1715 int i; 1716 1717 if (!mce_available(&boot_cpu_data)) 1718 return -EIO; 1719 1720 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1721 per_cpu(mce_dev, cpu).id = cpu; 1722 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1723 1724 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1725 if (err) 1726 return err; 1727 1728 for (i = 0; mce_attrs[i]; i++) { 1729 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1730 if (err) 1731 goto error; 1732 } 1733 for (i = 0; i < banks; i++) { 1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1735 &bank_attrs[i]); 1736 if (err) 1737 goto error2; 1738 } 1739 cpumask_set_cpu(cpu, mce_dev_initialized); 1740 1741 return 0; 1742error2: 1743 while (--i >= 0) 1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1745error: 1746 while (--i >= 0) 1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1748 1749 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1750 1751 return err; 1752} 1753 1754static __cpuinit void mce_remove_device(unsigned int cpu) 1755{ 1756 int i; 1757 1758 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1759 return; 1760 1761 for (i = 0; mce_attrs[i]; i++) 1762 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1763 1764 for (i = 0; i < banks; i++) 1765 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1766 1767 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1768 cpumask_clear_cpu(cpu, mce_dev_initialized); 1769} 1770 1771/* Make sure there are no machine checks on offlined CPUs. */ 1772static void mce_disable_cpu(void *h) 1773{ 1774 unsigned long action = *(unsigned long *)h; 1775 int i; 1776 1777 if (!mce_available(¤t_cpu_data)) 1778 return; 1779 if (!(action & CPU_TASKS_FROZEN)) 1780 cmci_clear(); 1781 for (i = 0; i < banks; i++) { 1782 if (!skip_bank_init(i)) 1783 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1784 } 1785} 1786 1787static void mce_reenable_cpu(void *h) 1788{ 1789 unsigned long action = *(unsigned long *)h; 1790 int i; 1791 1792 if (!mce_available(¤t_cpu_data)) 1793 return; 1794 1795 if (!(action & CPU_TASKS_FROZEN)) 1796 cmci_reenable(); 1797 for (i = 0; i < banks; i++) { 1798 if (!skip_bank_init(i)) 1799 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1800 } 1801} 1802 1803/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1804static int __cpuinit 1805mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1806{ 1807 unsigned int cpu = (unsigned long)hcpu; 1808 struct timer_list *t = &per_cpu(mce_timer, cpu); 1809 1810 switch (action) { 1811 case CPU_ONLINE: 1812 case CPU_ONLINE_FROZEN: 1813 mce_create_device(cpu); 1814 if (threshold_cpu_callback) 1815 threshold_cpu_callback(action, cpu); 1816 break; 1817 case CPU_DEAD: 1818 case CPU_DEAD_FROZEN: 1819 if (threshold_cpu_callback) 1820 threshold_cpu_callback(action, cpu); 1821 mce_remove_device(cpu); 1822 break; 1823 case CPU_DOWN_PREPARE: 1824 case CPU_DOWN_PREPARE_FROZEN: 1825 del_timer_sync(t); 1826 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1827 break; 1828 case CPU_DOWN_FAILED: 1829 case CPU_DOWN_FAILED_FROZEN: 1830 t->expires = round_jiffies(jiffies + 1831 __get_cpu_var(next_interval)); 1832 add_timer_on(t, cpu); 1833 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1834 break; 1835 case CPU_POST_DEAD: 1836 /* intentionally ignoring frozen here */ 1837 cmci_rediscover(cpu); 1838 break; 1839 } 1840 return NOTIFY_OK; 1841} 1842 1843static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1844 .notifier_call = mce_cpu_callback, 1845}; 1846 1847static __init int mce_init_banks(void) 1848{ 1849 int i; 1850 1851 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1852 GFP_KERNEL); 1853 if (!bank_attrs) 1854 return -ENOMEM; 1855 1856 for (i = 0; i < banks; i++) { 1857 struct sysdev_attribute *a = &bank_attrs[i]; 1858 1859 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1860 if (!a->attr.name) 1861 goto nomem; 1862 1863 a->attr.mode = 0644; 1864 a->show = show_bank; 1865 a->store = set_bank; 1866 } 1867 return 0; 1868 1869nomem: 1870 while (--i >= 0) 1871 kfree(bank_attrs[i].attr.name); 1872 kfree(bank_attrs); 1873 bank_attrs = NULL; 1874 1875 return -ENOMEM; 1876} 1877 1878static __init int mce_init_device(void) 1879{ 1880 int err; 1881 int i = 0; 1882 1883 if (!mce_available(&boot_cpu_data)) 1884 return -EIO; 1885 1886 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1887 1888 err = mce_init_banks(); 1889 if (err) 1890 return err; 1891 1892 err = sysdev_class_register(&mce_sysclass); 1893 if (err) 1894 return err; 1895 1896 for_each_online_cpu(i) { 1897 err = mce_create_device(i); 1898 if (err) 1899 return err; 1900 } 1901 1902 register_hotcpu_notifier(&mce_cpu_notifier); 1903 misc_register(&mce_log_device); 1904 1905 return err; 1906} 1907 1908device_initcall(mce_init_device); 1909 1910#else /* CONFIG_X86_OLD_MCE: */ 1911 1912int nr_mce_banks; 1913EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1914 1915/* This has to be run for each processor */ 1916void mcheck_init(struct cpuinfo_x86 *c) 1917{ 1918 if (mce_disabled == 1) 1919 return; 1920 1921 switch (c->x86_vendor) { 1922 case X86_VENDOR_AMD: 1923 amd_mcheck_init(c); 1924 break; 1925 1926 case X86_VENDOR_INTEL: 1927 if (c->x86 == 5) 1928 intel_p5_mcheck_init(c); 1929 if (c->x86 == 6) 1930 intel_p6_mcheck_init(c); 1931 if (c->x86 == 15) 1932 intel_p4_mcheck_init(c); 1933 break; 1934 1935 case X86_VENDOR_CENTAUR: 1936 if (c->x86 == 5) 1937 winchip_mcheck_init(c); 1938 break; 1939 1940 default: 1941 break; 1942 } 1943 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1944} 1945 1946static int __init mcheck_enable(char *str) 1947{ 1948 mce_disabled = -1; 1949 return 1; 1950} 1951 1952__setup("mce", mcheck_enable); 1953 1954#endif /* CONFIG_X86_OLD_MCE */ 1955 1956/* 1957 * Old style boot options parsing. Only for compatibility. 1958 */ 1959static int __init mcheck_disable(char *str) 1960{ 1961 mce_disabled = 1; 1962 return 1; 1963} 1964__setup("nomce", mcheck_disable); 1965