mce.c revision c697836985e18d9c34897428ba563b13044a6dcd
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/interrupt.h> 14#include <linux/ratelimit.h> 15#include <linux/kallsyms.h> 16#include <linux/rcupdate.h> 17#include <linux/kobject.h> 18#include <linux/uaccess.h> 19#include <linux/kdebug.h> 20#include <linux/kernel.h> 21#include <linux/percpu.h> 22#include <linux/string.h> 23#include <linux/sysdev.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/init.h> 30#include <linux/kmod.h> 31#include <linux/poll.h> 32#include <linux/nmi.h> 33#include <linux/cpu.h> 34#include <linux/smp.h> 35#include <linux/fs.h> 36#include <linux/mm.h> 37 38#include <asm/processor.h> 39#include <asm/hw_irq.h> 40#include <asm/apic.h> 41#include <asm/idle.h> 42#include <asm/ipi.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47 48/* Handle unconfigured int18 (should never happen) */ 49static void unexpected_machine_check(struct pt_regs *regs, long error_code) 50{ 51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 52 smp_processor_id()); 53} 54 55/* Call the installed machine check handler for this CPU setup. */ 56void (*machine_check_vector)(struct pt_regs *, long error_code) = 57 unexpected_machine_check; 58 59int mce_disabled __read_mostly; 60 61#ifdef CONFIG_X86_NEW_MCE 62 63#define MISC_MCELOG_MINOR 227 64 65#define SPINUNIT 100 /* 100ns */ 66 67atomic_t mce_entry; 68 69DEFINE_PER_CPU(unsigned, mce_exception_count); 70 71/* 72 * Tolerant levels: 73 * 0: always panic on uncorrected errors, log corrected errors 74 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 75 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 76 * 3: never panic or SIGBUS, log all errors (for testing only) 77 */ 78static int tolerant __read_mostly = 1; 79static int banks __read_mostly; 80static u64 *bank __read_mostly; 81static int rip_msr __read_mostly; 82static int mce_bootlog __read_mostly = -1; 83static int monarch_timeout __read_mostly = -1; 84static int mce_panic_timeout __read_mostly; 85static int mce_dont_log_ce __read_mostly; 86int mce_cmci_disabled __read_mostly; 87int mce_ignore_ce __read_mostly; 88int mce_ser __read_mostly; 89 90/* User mode helper program triggered by machine check event */ 91static unsigned long mce_need_notify; 92static char mce_helper[128]; 93static char *mce_helper_argv[2] = { mce_helper, NULL }; 94 95static unsigned long dont_init_banks; 96 97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 98static DEFINE_PER_CPU(struct mce, mces_seen); 99static int cpu_missing; 100 101 102/* MCA banks polled by the period polling timer for corrected events */ 103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 105}; 106 107static inline int skip_bank_init(int i) 108{ 109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 110} 111 112static DEFINE_PER_CPU(struct work_struct, mce_work); 113 114/* Do initial initialization of a struct mce */ 115void mce_setup(struct mce *m) 116{ 117 memset(m, 0, sizeof(struct mce)); 118 m->cpu = m->extcpu = smp_processor_id(); 119 rdtscll(m->tsc); 120 /* We hope get_seconds stays lockless */ 121 m->time = get_seconds(); 122 m->cpuvendor = boot_cpu_data.x86_vendor; 123 m->cpuid = cpuid_eax(1); 124#ifdef CONFIG_SMP 125 m->socketid = cpu_data(m->extcpu).phys_proc_id; 126#endif 127 m->apicid = cpu_data(m->extcpu).initial_apicid; 128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 129} 130 131DEFINE_PER_CPU(struct mce, injectm); 132EXPORT_PER_CPU_SYMBOL_GPL(injectm); 133 134/* 135 * Lockless MCE logging infrastructure. 136 * This avoids deadlocks on printk locks without having to break locks. Also 137 * separate MCEs from kernel messages to avoid bogus bug reports. 138 */ 139 140static struct mce_log mcelog = { 141 .signature = MCE_LOG_SIGNATURE, 142 .len = MCE_LOG_LEN, 143 .recordlen = sizeof(struct mce), 144}; 145 146void mce_log(struct mce *mce) 147{ 148 unsigned next, entry; 149 150 mce->finished = 0; 151 wmb(); 152 for (;;) { 153 entry = rcu_dereference(mcelog.next); 154 for (;;) { 155 /* 156 * When the buffer fills up discard new entries. 157 * Assume that the earlier errors are the more 158 * interesting ones: 159 */ 160 if (entry >= MCE_LOG_LEN) { 161 set_bit(MCE_OVERFLOW, 162 (unsigned long *)&mcelog.flags); 163 return; 164 } 165 /* Old left over entry. Skip: */ 166 if (mcelog.entry[entry].finished) { 167 entry++; 168 continue; 169 } 170 break; 171 } 172 smp_rmb(); 173 next = entry + 1; 174 if (cmpxchg(&mcelog.next, entry, next) == entry) 175 break; 176 } 177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 178 wmb(); 179 mcelog.entry[entry].finished = 1; 180 wmb(); 181 182 mce->finished = 1; 183 set_bit(0, &mce_need_notify); 184} 185 186static void print_mce(struct mce *m) 187{ 188 printk(KERN_EMERG 189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 190 m->extcpu, m->mcgstatus, m->bank, m->status); 191 if (m->ip) { 192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 194 m->cs, m->ip); 195 if (m->cs == __KERNEL_CS) 196 print_symbol("{%s}", m->ip); 197 printk("\n"); 198 } 199 printk(KERN_EMERG "TSC %llx ", m->tsc); 200 if (m->addr) 201 printk("ADDR %llx ", m->addr); 202 if (m->misc) 203 printk("MISC %llx ", m->misc); 204 printk("\n"); 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 206 m->cpuvendor, m->cpuid, m->time, m->socketid, 207 m->apicid); 208} 209 210static void print_mce_head(void) 211{ 212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 213} 214 215static void print_mce_tail(void) 216{ 217 printk(KERN_EMERG "This is not a software problem!\n" 218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 219} 220 221#define PANIC_TIMEOUT 5 /* 5 seconds */ 222 223static atomic_t mce_paniced; 224 225/* Panic in progress. Enable interrupts and wait for final IPI */ 226static void wait_for_panic(void) 227{ 228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 229 preempt_disable(); 230 local_irq_enable(); 231 while (timeout-- > 0) 232 udelay(1); 233 if (panic_timeout == 0) 234 panic_timeout = mce_panic_timeout; 235 panic("Panicing machine check CPU died"); 236} 237 238static void mce_panic(char *msg, struct mce *final, char *exp) 239{ 240 int i; 241 242 /* 243 * Make sure only one CPU runs in machine check panic 244 */ 245 if (atomic_add_return(1, &mce_paniced) > 1) 246 wait_for_panic(); 247 barrier(); 248 249 bust_spinlocks(1); 250 console_verbose(); 251 print_mce_head(); 252 /* First print corrected ones that are still unlogged */ 253 for (i = 0; i < MCE_LOG_LEN; i++) { 254 struct mce *m = &mcelog.entry[i]; 255 if (!(m->status & MCI_STATUS_VAL)) 256 continue; 257 if (!(m->status & MCI_STATUS_UC)) 258 print_mce(m); 259 } 260 /* Now print uncorrected but with the final one last */ 261 for (i = 0; i < MCE_LOG_LEN; i++) { 262 struct mce *m = &mcelog.entry[i]; 263 if (!(m->status & MCI_STATUS_VAL)) 264 continue; 265 if (!(m->status & MCI_STATUS_UC)) 266 continue; 267 if (!final || memcmp(m, final, sizeof(struct mce))) 268 print_mce(m); 269 } 270 if (final) 271 print_mce(final); 272 if (cpu_missing) 273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 274 print_mce_tail(); 275 if (exp) 276 printk(KERN_EMERG "Machine check: %s\n", exp); 277 if (panic_timeout == 0) 278 panic_timeout = mce_panic_timeout; 279 panic(msg); 280} 281 282/* Support code for software error injection */ 283 284static int msr_to_offset(u32 msr) 285{ 286 unsigned bank = __get_cpu_var(injectm.bank); 287 if (msr == rip_msr) 288 return offsetof(struct mce, ip); 289 if (msr == MSR_IA32_MC0_STATUS + bank*4) 290 return offsetof(struct mce, status); 291 if (msr == MSR_IA32_MC0_ADDR + bank*4) 292 return offsetof(struct mce, addr); 293 if (msr == MSR_IA32_MC0_MISC + bank*4) 294 return offsetof(struct mce, misc); 295 if (msr == MSR_IA32_MCG_STATUS) 296 return offsetof(struct mce, mcgstatus); 297 return -1; 298} 299 300/* MSR access wrappers used for error injection */ 301static u64 mce_rdmsrl(u32 msr) 302{ 303 u64 v; 304 if (__get_cpu_var(injectm).finished) { 305 int offset = msr_to_offset(msr); 306 if (offset < 0) 307 return 0; 308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 309 } 310 rdmsrl(msr, v); 311 return v; 312} 313 314static void mce_wrmsrl(u32 msr, u64 v) 315{ 316 if (__get_cpu_var(injectm).finished) { 317 int offset = msr_to_offset(msr); 318 if (offset >= 0) 319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 320 return; 321 } 322 wrmsrl(msr, v); 323} 324 325/* 326 * Simple lockless ring to communicate PFNs from the exception handler with the 327 * process context work function. This is vastly simplified because there's 328 * only a single reader and a single writer. 329 */ 330#define MCE_RING_SIZE 16 /* we use one entry less */ 331 332struct mce_ring { 333 unsigned short start; 334 unsigned short end; 335 unsigned long ring[MCE_RING_SIZE]; 336}; 337static DEFINE_PER_CPU(struct mce_ring, mce_ring); 338 339/* Runs with CPU affinity in workqueue */ 340static int mce_ring_empty(void) 341{ 342 struct mce_ring *r = &__get_cpu_var(mce_ring); 343 344 return r->start == r->end; 345} 346 347static int mce_ring_get(unsigned long *pfn) 348{ 349 struct mce_ring *r; 350 int ret = 0; 351 352 *pfn = 0; 353 get_cpu(); 354 r = &__get_cpu_var(mce_ring); 355 if (r->start == r->end) 356 goto out; 357 *pfn = r->ring[r->start]; 358 r->start = (r->start + 1) % MCE_RING_SIZE; 359 ret = 1; 360out: 361 put_cpu(); 362 return ret; 363} 364 365/* Always runs in MCE context with preempt off */ 366static int mce_ring_add(unsigned long pfn) 367{ 368 struct mce_ring *r = &__get_cpu_var(mce_ring); 369 unsigned next; 370 371 next = (r->end + 1) % MCE_RING_SIZE; 372 if (next == r->start) 373 return -1; 374 r->ring[r->end] = pfn; 375 wmb(); 376 r->end = next; 377 return 0; 378} 379 380int mce_available(struct cpuinfo_x86 *c) 381{ 382 if (mce_disabled) 383 return 0; 384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 385} 386 387static void mce_schedule_work(void) 388{ 389 if (!mce_ring_empty()) { 390 struct work_struct *work = &__get_cpu_var(mce_work); 391 if (!work_pending(work)) 392 schedule_work(work); 393 } 394} 395 396/* 397 * Get the address of the instruction at the time of the machine check 398 * error. 399 */ 400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 401{ 402 403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { 404 m->ip = regs->ip; 405 m->cs = regs->cs; 406 } else { 407 m->ip = 0; 408 m->cs = 0; 409 } 410 if (rip_msr) 411 m->ip = mce_rdmsrl(rip_msr); 412} 413 414#ifdef CONFIG_X86_LOCAL_APIC 415/* 416 * Called after interrupts have been reenabled again 417 * when a MCE happened during an interrupts off region 418 * in the kernel. 419 */ 420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) 421{ 422 ack_APIC_irq(); 423 exit_idle(); 424 irq_enter(); 425 mce_notify_irq(); 426 mce_schedule_work(); 427 irq_exit(); 428} 429#endif 430 431static void mce_report_event(struct pt_regs *regs) 432{ 433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 434 mce_notify_irq(); 435 /* 436 * Triggering the work queue here is just an insurance 437 * policy in case the syscall exit notify handler 438 * doesn't run soon enough or ends up running on the 439 * wrong CPU (can happen when audit sleeps) 440 */ 441 mce_schedule_work(); 442 return; 443 } 444 445#ifdef CONFIG_X86_LOCAL_APIC 446 /* 447 * Without APIC do not notify. The event will be picked 448 * up eventually. 449 */ 450 if (!cpu_has_apic) 451 return; 452 453 /* 454 * When interrupts are disabled we cannot use 455 * kernel services safely. Trigger an self interrupt 456 * through the APIC to instead do the notification 457 * after interrupts are reenabled again. 458 */ 459 apic->send_IPI_self(MCE_SELF_VECTOR); 460 461 /* 462 * Wait for idle afterwards again so that we don't leave the 463 * APIC in a non idle state because the normal APIC writes 464 * cannot exclude us. 465 */ 466 apic_wait_icr_idle(); 467#endif 468} 469 470DEFINE_PER_CPU(unsigned, mce_poll_count); 471 472/* 473 * Poll for corrected events or events that happened before reset. 474 * Those are just logged through /dev/mcelog. 475 * 476 * This is executed in standard interrupt context. 477 * 478 * Note: spec recommends to panic for fatal unsignalled 479 * errors here. However this would be quite problematic -- 480 * we would need to reimplement the Monarch handling and 481 * it would mess up the exclusion between exception handler 482 * and poll hander -- * so we skip this for now. 483 * These cases should not happen anyways, or only when the CPU 484 * is already totally * confused. In this case it's likely it will 485 * not fully execute the machine check handler either. 486 */ 487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 488{ 489 struct mce m; 490 int i; 491 492 __get_cpu_var(mce_poll_count)++; 493 494 mce_setup(&m); 495 496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 497 for (i = 0; i < banks; i++) { 498 if (!bank[i] || !test_bit(i, *b)) 499 continue; 500 501 m.misc = 0; 502 m.addr = 0; 503 m.bank = i; 504 m.tsc = 0; 505 506 barrier(); 507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 508 if (!(m.status & MCI_STATUS_VAL)) 509 continue; 510 511 /* 512 * Uncorrected or signalled events are handled by the exception 513 * handler when it is enabled, so don't process those here. 514 * 515 * TBD do the same check for MCI_STATUS_EN here? 516 */ 517 if (!(flags & MCP_UC) && 518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 519 continue; 520 521 if (m.status & MCI_STATUS_MISCV) 522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 523 if (m.status & MCI_STATUS_ADDRV) 524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 525 526 if (!(flags & MCP_TIMESTAMP)) 527 m.tsc = 0; 528 /* 529 * Don't get the IP here because it's unlikely to 530 * have anything to do with the actual error location. 531 */ 532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 533 mce_log(&m); 534 add_taint(TAINT_MACHINE_CHECK); 535 } 536 537 /* 538 * Clear state for this bank. 539 */ 540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 541 } 542 543 /* 544 * Don't clear MCG_STATUS here because it's only defined for 545 * exceptions. 546 */ 547 548 sync_core(); 549} 550EXPORT_SYMBOL_GPL(machine_check_poll); 551 552/* 553 * Do a quick check if any of the events requires a panic. 554 * This decides if we keep the events around or clear them. 555 */ 556static int mce_no_way_out(struct mce *m, char **msg) 557{ 558 int i; 559 560 for (i = 0; i < banks; i++) { 561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 563 return 1; 564 } 565 return 0; 566} 567 568/* 569 * Variable to establish order between CPUs while scanning. 570 * Each CPU spins initially until executing is equal its number. 571 */ 572static atomic_t mce_executing; 573 574/* 575 * Defines order of CPUs on entry. First CPU becomes Monarch. 576 */ 577static atomic_t mce_callin; 578 579/* 580 * Check if a timeout waiting for other CPUs happened. 581 */ 582static int mce_timed_out(u64 *t) 583{ 584 /* 585 * The others already did panic for some reason. 586 * Bail out like in a timeout. 587 * rmb() to tell the compiler that system_state 588 * might have been modified by someone else. 589 */ 590 rmb(); 591 if (atomic_read(&mce_paniced)) 592 wait_for_panic(); 593 if (!monarch_timeout) 594 goto out; 595 if ((s64)*t < SPINUNIT) { 596 /* CHECKME: Make panic default for 1 too? */ 597 if (tolerant < 1) 598 mce_panic("Timeout synchronizing machine check over CPUs", 599 NULL, NULL); 600 cpu_missing = 1; 601 return 1; 602 } 603 *t -= SPINUNIT; 604out: 605 touch_nmi_watchdog(); 606 return 0; 607} 608 609/* 610 * The Monarch's reign. The Monarch is the CPU who entered 611 * the machine check handler first. It waits for the others to 612 * raise the exception too and then grades them. When any 613 * error is fatal panic. Only then let the others continue. 614 * 615 * The other CPUs entering the MCE handler will be controlled by the 616 * Monarch. They are called Subjects. 617 * 618 * This way we prevent any potential data corruption in a unrecoverable case 619 * and also makes sure always all CPU's errors are examined. 620 * 621 * Also this detects the case of an machine check event coming from outer 622 * space (not detected by any CPUs) In this case some external agent wants 623 * us to shut down, so panic too. 624 * 625 * The other CPUs might still decide to panic if the handler happens 626 * in a unrecoverable place, but in this case the system is in a semi-stable 627 * state and won't corrupt anything by itself. It's ok to let the others 628 * continue for a bit first. 629 * 630 * All the spin loops have timeouts; when a timeout happens a CPU 631 * typically elects itself to be Monarch. 632 */ 633static void mce_reign(void) 634{ 635 int cpu; 636 struct mce *m = NULL; 637 int global_worst = 0; 638 char *msg = NULL; 639 char *nmsg = NULL; 640 641 /* 642 * This CPU is the Monarch and the other CPUs have run 643 * through their handlers. 644 * Grade the severity of the errors of all the CPUs. 645 */ 646 for_each_possible_cpu(cpu) { 647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 648 &nmsg); 649 if (severity > global_worst) { 650 msg = nmsg; 651 global_worst = severity; 652 m = &per_cpu(mces_seen, cpu); 653 } 654 } 655 656 /* 657 * Cannot recover? Panic here then. 658 * This dumps all the mces in the log buffer and stops the 659 * other CPUs. 660 */ 661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 662 mce_panic("Fatal Machine check", m, msg); 663 664 /* 665 * For UC somewhere we let the CPU who detects it handle it. 666 * Also must let continue the others, otherwise the handling 667 * CPU could deadlock on a lock. 668 */ 669 670 /* 671 * No machine check event found. Must be some external 672 * source or one CPU is hung. Panic. 673 */ 674 if (!m && tolerant < 3) 675 mce_panic("Machine check from unknown source", NULL, NULL); 676 677 /* 678 * Now clear all the mces_seen so that they don't reappear on 679 * the next mce. 680 */ 681 for_each_possible_cpu(cpu) 682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 683} 684 685static atomic_t global_nwo; 686 687/* 688 * Start of Monarch synchronization. This waits until all CPUs have 689 * entered the exception handler and then determines if any of them 690 * saw a fatal event that requires panic. Then it executes them 691 * in the entry order. 692 * TBD double check parallel CPU hotunplug 693 */ 694static int mce_start(int *no_way_out) 695{ 696 int order; 697 int cpus = num_online_cpus(); 698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 699 700 if (!timeout) 701 return -1; 702 703 atomic_add(*no_way_out, &global_nwo); 704 /* 705 * global_nwo should be updated before mce_callin 706 */ 707 smp_wmb(); 708 order = atomic_add_return(1, &mce_callin); 709 710 /* 711 * Wait for everyone. 712 */ 713 while (atomic_read(&mce_callin) != cpus) { 714 if (mce_timed_out(&timeout)) { 715 atomic_set(&global_nwo, 0); 716 return -1; 717 } 718 ndelay(SPINUNIT); 719 } 720 721 /* 722 * mce_callin should be read before global_nwo 723 */ 724 smp_rmb(); 725 726 if (order == 1) { 727 /* 728 * Monarch: Starts executing now, the others wait. 729 */ 730 atomic_set(&mce_executing, 1); 731 } else { 732 /* 733 * Subject: Now start the scanning loop one by one in 734 * the original callin order. 735 * This way when there are any shared banks it will be 736 * only seen by one CPU before cleared, avoiding duplicates. 737 */ 738 while (atomic_read(&mce_executing) < order) { 739 if (mce_timed_out(&timeout)) { 740 atomic_set(&global_nwo, 0); 741 return -1; 742 } 743 ndelay(SPINUNIT); 744 } 745 } 746 747 /* 748 * Cache the global no_way_out state. 749 */ 750 *no_way_out = atomic_read(&global_nwo); 751 752 return order; 753} 754 755/* 756 * Synchronize between CPUs after main scanning loop. 757 * This invokes the bulk of the Monarch processing. 758 */ 759static int mce_end(int order) 760{ 761 int ret = -1; 762 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 763 764 if (!timeout) 765 goto reset; 766 if (order < 0) 767 goto reset; 768 769 /* 770 * Allow others to run. 771 */ 772 atomic_inc(&mce_executing); 773 774 if (order == 1) { 775 /* CHECKME: Can this race with a parallel hotplug? */ 776 int cpus = num_online_cpus(); 777 778 /* 779 * Monarch: Wait for everyone to go through their scanning 780 * loops. 781 */ 782 while (atomic_read(&mce_executing) <= cpus) { 783 if (mce_timed_out(&timeout)) 784 goto reset; 785 ndelay(SPINUNIT); 786 } 787 788 mce_reign(); 789 barrier(); 790 ret = 0; 791 } else { 792 /* 793 * Subject: Wait for Monarch to finish. 794 */ 795 while (atomic_read(&mce_executing) != 0) { 796 if (mce_timed_out(&timeout)) 797 goto reset; 798 ndelay(SPINUNIT); 799 } 800 801 /* 802 * Don't reset anything. That's done by the Monarch. 803 */ 804 return 0; 805 } 806 807 /* 808 * Reset all global state. 809 */ 810reset: 811 atomic_set(&global_nwo, 0); 812 atomic_set(&mce_callin, 0); 813 barrier(); 814 815 /* 816 * Let others run again. 817 */ 818 atomic_set(&mce_executing, 0); 819 return ret; 820} 821 822/* 823 * Check if the address reported by the CPU is in a format we can parse. 824 * It would be possible to add code for most other cases, but all would 825 * be somewhat complicated (e.g. segment offset would require an instruction 826 * parser). So only support physical addresses upto page granuality for now. 827 */ 828static int mce_usable_address(struct mce *m) 829{ 830 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 831 return 0; 832 if ((m->misc & 0x3f) > PAGE_SHIFT) 833 return 0; 834 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 835 return 0; 836 return 1; 837} 838 839static void mce_clear_state(unsigned long *toclear) 840{ 841 int i; 842 843 for (i = 0; i < banks; i++) { 844 if (test_bit(i, toclear)) 845 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 846 } 847} 848 849/* 850 * The actual machine check handler. This only handles real 851 * exceptions when something got corrupted coming in through int 18. 852 * 853 * This is executed in NMI context not subject to normal locking rules. This 854 * implies that most kernel services cannot be safely used. Don't even 855 * think about putting a printk in there! 856 * 857 * On Intel systems this is entered on all CPUs in parallel through 858 * MCE broadcast. However some CPUs might be broken beyond repair, 859 * so be always careful when synchronizing with others. 860 */ 861void do_machine_check(struct pt_regs *regs, long error_code) 862{ 863 struct mce m, *final; 864 int i; 865 int worst = 0; 866 int severity; 867 /* 868 * Establish sequential order between the CPUs entering the machine 869 * check handler. 870 */ 871 int order; 872 /* 873 * If no_way_out gets set, there is no safe way to recover from this 874 * MCE. If tolerant is cranked up, we'll try anyway. 875 */ 876 int no_way_out = 0; 877 /* 878 * If kill_it gets set, there might be a way to recover from this 879 * error. 880 */ 881 int kill_it = 0; 882 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 883 char *msg = "Unknown"; 884 885 atomic_inc(&mce_entry); 886 887 __get_cpu_var(mce_exception_count)++; 888 889 if (notify_die(DIE_NMI, "machine check", regs, error_code, 890 18, SIGKILL) == NOTIFY_STOP) 891 goto out; 892 if (!banks) 893 goto out; 894 895 mce_setup(&m); 896 897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 898 no_way_out = mce_no_way_out(&m, &msg); 899 900 final = &__get_cpu_var(mces_seen); 901 *final = m; 902 903 barrier(); 904 905 /* 906 * When no restart IP must always kill or panic. 907 */ 908 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 909 kill_it = 1; 910 911 /* 912 * Go through all the banks in exclusion of the other CPUs. 913 * This way we don't report duplicated events on shared banks 914 * because the first one to see it will clear it. 915 */ 916 order = mce_start(&no_way_out); 917 for (i = 0; i < banks; i++) { 918 __clear_bit(i, toclear); 919 if (!bank[i]) 920 continue; 921 922 m.misc = 0; 923 m.addr = 0; 924 m.bank = i; 925 926 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 927 if ((m.status & MCI_STATUS_VAL) == 0) 928 continue; 929 930 /* 931 * Non uncorrected or non signaled errors are handled by 932 * machine_check_poll. Leave them alone, unless this panics. 933 */ 934 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 935 !no_way_out) 936 continue; 937 938 /* 939 * Set taint even when machine check was not enabled. 940 */ 941 add_taint(TAINT_MACHINE_CHECK); 942 943 severity = mce_severity(&m, tolerant, NULL); 944 945 /* 946 * When machine check was for corrected handler don't touch, 947 * unless we're panicing. 948 */ 949 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 950 continue; 951 __set_bit(i, toclear); 952 if (severity == MCE_NO_SEVERITY) { 953 /* 954 * Machine check event was not enabled. Clear, but 955 * ignore. 956 */ 957 continue; 958 } 959 960 /* 961 * Kill on action required. 962 */ 963 if (severity == MCE_AR_SEVERITY) 964 kill_it = 1; 965 966 if (m.status & MCI_STATUS_MISCV) 967 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 968 if (m.status & MCI_STATUS_ADDRV) 969 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 970 971 /* 972 * Action optional error. Queue address for later processing. 973 * When the ring overflows we just ignore the AO error. 974 * RED-PEN add some logging mechanism when 975 * usable_address or mce_add_ring fails. 976 * RED-PEN don't ignore overflow for tolerant == 0 977 */ 978 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 979 mce_ring_add(m.addr >> PAGE_SHIFT); 980 981 mce_get_rip(&m, regs); 982 mce_log(&m); 983 984 if (severity > worst) { 985 *final = m; 986 worst = severity; 987 } 988 } 989 990 if (!no_way_out) 991 mce_clear_state(toclear); 992 993 /* 994 * Do most of the synchronization with other CPUs. 995 * When there's any problem use only local no_way_out state. 996 */ 997 if (mce_end(order) < 0) 998 no_way_out = worst >= MCE_PANIC_SEVERITY; 999 1000 /* 1001 * If we have decided that we just CAN'T continue, and the user 1002 * has not set tolerant to an insane level, give up and die. 1003 * 1004 * This is mainly used in the case when the system doesn't 1005 * support MCE broadcasting or it has been disabled. 1006 */ 1007 if (no_way_out && tolerant < 3) 1008 mce_panic("Fatal machine check on current CPU", final, msg); 1009 1010 /* 1011 * If the error seems to be unrecoverable, something should be 1012 * done. Try to kill as little as possible. If we can kill just 1013 * one task, do that. If the user has set the tolerance very 1014 * high, don't try to do anything at all. 1015 */ 1016 1017 if (kill_it && tolerant < 3) 1018 force_sig(SIGBUS, current); 1019 1020 /* notify userspace ASAP */ 1021 set_thread_flag(TIF_MCE_NOTIFY); 1022 1023 if (worst > 0) 1024 mce_report_event(regs); 1025 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1026out: 1027 atomic_dec(&mce_entry); 1028 sync_core(); 1029} 1030EXPORT_SYMBOL_GPL(do_machine_check); 1031 1032/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1033void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1034{ 1035 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1036} 1037 1038/* 1039 * Called after mce notification in process context. This code 1040 * is allowed to sleep. Call the high level VM handler to process 1041 * any corrupted pages. 1042 * Assume that the work queue code only calls this one at a time 1043 * per CPU. 1044 * Note we don't disable preemption, so this code might run on the wrong 1045 * CPU. In this case the event is picked up by the scheduled work queue. 1046 * This is merely a fast path to expedite processing in some common 1047 * cases. 1048 */ 1049void mce_notify_process(void) 1050{ 1051 unsigned long pfn; 1052 mce_notify_irq(); 1053 while (mce_ring_get(&pfn)) 1054 memory_failure(pfn, MCE_VECTOR); 1055} 1056 1057static void mce_process_work(struct work_struct *dummy) 1058{ 1059 mce_notify_process(); 1060} 1061 1062#ifdef CONFIG_X86_MCE_INTEL 1063/*** 1064 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1065 * @cpu: The CPU on which the event occurred. 1066 * @status: Event status information 1067 * 1068 * This function should be called by the thermal interrupt after the 1069 * event has been processed and the decision was made to log the event 1070 * further. 1071 * 1072 * The status parameter will be saved to the 'status' field of 'struct mce' 1073 * and historically has been the register value of the 1074 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1075 */ 1076void mce_log_therm_throt_event(__u64 status) 1077{ 1078 struct mce m; 1079 1080 mce_setup(&m); 1081 m.bank = MCE_THERMAL_BANK; 1082 m.status = status; 1083 mce_log(&m); 1084} 1085#endif /* CONFIG_X86_MCE_INTEL */ 1086 1087/* 1088 * Periodic polling timer for "silent" machine check errors. If the 1089 * poller finds an MCE, poll 2x faster. When the poller finds no more 1090 * errors, poll 2x slower (up to check_interval seconds). 1091 */ 1092static int check_interval = 5 * 60; /* 5 minutes */ 1093 1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1095static DEFINE_PER_CPU(struct timer_list, mce_timer); 1096 1097static void mcheck_timer(unsigned long data) 1098{ 1099 struct timer_list *t = &per_cpu(mce_timer, data); 1100 int *n; 1101 1102 WARN_ON(smp_processor_id() != data); 1103 1104 if (mce_available(¤t_cpu_data)) { 1105 machine_check_poll(MCP_TIMESTAMP, 1106 &__get_cpu_var(mce_poll_banks)); 1107 } 1108 1109 /* 1110 * Alert userspace if needed. If we logged an MCE, reduce the 1111 * polling interval, otherwise increase the polling interval. 1112 */ 1113 n = &__get_cpu_var(next_interval); 1114 if (mce_notify_irq()) 1115 *n = max(*n/2, HZ/100); 1116 else 1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1118 1119 t->expires = jiffies + *n; 1120 add_timer(t); 1121} 1122 1123static void mce_do_trigger(struct work_struct *work) 1124{ 1125 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1126} 1127 1128static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1129 1130/* 1131 * Notify the user(s) about new machine check events. 1132 * Can be called from interrupt context, but not from machine check/NMI 1133 * context. 1134 */ 1135int mce_notify_irq(void) 1136{ 1137 /* Not more than two messages every minute */ 1138 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1139 1140 clear_thread_flag(TIF_MCE_NOTIFY); 1141 1142 if (test_and_clear_bit(0, &mce_need_notify)) { 1143 wake_up_interruptible(&mce_wait); 1144 1145 /* 1146 * There is no risk of missing notifications because 1147 * work_pending is always cleared before the function is 1148 * executed. 1149 */ 1150 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1151 schedule_work(&mce_trigger_work); 1152 1153 if (__ratelimit(&ratelimit)) 1154 printk(KERN_INFO "Machine check events logged\n"); 1155 1156 return 1; 1157 } 1158 return 0; 1159} 1160EXPORT_SYMBOL_GPL(mce_notify_irq); 1161 1162/* 1163 * Initialize Machine Checks for a CPU. 1164 */ 1165static int mce_cap_init(void) 1166{ 1167 unsigned b; 1168 u64 cap; 1169 1170 rdmsrl(MSR_IA32_MCG_CAP, cap); 1171 1172 b = cap & MCG_BANKCNT_MASK; 1173 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1174 1175 if (b > MAX_NR_BANKS) { 1176 printk(KERN_WARNING 1177 "MCE: Using only %u machine check banks out of %u\n", 1178 MAX_NR_BANKS, b); 1179 b = MAX_NR_BANKS; 1180 } 1181 1182 /* Don't support asymmetric configurations today */ 1183 WARN_ON(banks != 0 && b != banks); 1184 banks = b; 1185 if (!bank) { 1186 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1187 if (!bank) 1188 return -ENOMEM; 1189 memset(bank, 0xff, banks * sizeof(u64)); 1190 } 1191 1192 /* Use accurate RIP reporting if available. */ 1193 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1194 rip_msr = MSR_IA32_MCG_EIP; 1195 1196 if (cap & MCG_SER_P) 1197 mce_ser = 1; 1198 1199 return 0; 1200} 1201 1202static void mce_init(void) 1203{ 1204 mce_banks_t all_banks; 1205 u64 cap; 1206 int i; 1207 1208 /* 1209 * Log the machine checks left over from the previous reset. 1210 */ 1211 bitmap_fill(all_banks, MAX_NR_BANKS); 1212 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1213 1214 set_in_cr4(X86_CR4_MCE); 1215 1216 rdmsrl(MSR_IA32_MCG_CAP, cap); 1217 if (cap & MCG_CTL_P) 1218 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1219 1220 for (i = 0; i < banks; i++) { 1221 if (skip_bank_init(i)) 1222 continue; 1223 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1224 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1225 } 1226} 1227 1228/* Add per CPU specific workarounds here */ 1229static void mce_cpu_quirks(struct cpuinfo_x86 *c) 1230{ 1231 /* This should be disabled by the BIOS, but isn't always */ 1232 if (c->x86_vendor == X86_VENDOR_AMD) { 1233 if (c->x86 == 15 && banks > 4) { 1234 /* 1235 * disable GART TBL walk error reporting, which 1236 * trips off incorrectly with the IOMMU & 3ware 1237 * & Cerberus: 1238 */ 1239 clear_bit(10, (unsigned long *)&bank[4]); 1240 } 1241 if (c->x86 <= 17 && mce_bootlog < 0) { 1242 /* 1243 * Lots of broken BIOS around that don't clear them 1244 * by default and leave crap in there. Don't log: 1245 */ 1246 mce_bootlog = 0; 1247 } 1248 /* 1249 * Various K7s with broken bank 0 around. Always disable 1250 * by default. 1251 */ 1252 if (c->x86 == 6) 1253 bank[0] = 0; 1254 } 1255 1256 if (c->x86_vendor == X86_VENDOR_INTEL) { 1257 /* 1258 * SDM documents that on family 6 bank 0 should not be written 1259 * because it aliases to another special BIOS controlled 1260 * register. 1261 * But it's not aliased anymore on model 0x1a+ 1262 * Don't ignore bank 0 completely because there could be a 1263 * valid event later, merely don't write CTL0. 1264 */ 1265 1266 if (c->x86 == 6 && c->x86_model < 0x1A) 1267 __set_bit(0, &dont_init_banks); 1268 1269 /* 1270 * All newer Intel systems support MCE broadcasting. Enable 1271 * synchronization with a one second timeout. 1272 */ 1273 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1274 monarch_timeout < 0) 1275 monarch_timeout = USEC_PER_SEC; 1276 } 1277 if (monarch_timeout < 0) 1278 monarch_timeout = 0; 1279 if (mce_bootlog != 0) 1280 mce_panic_timeout = 30; 1281} 1282 1283static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1284{ 1285 if (c->x86 != 5) 1286 return; 1287 switch (c->x86_vendor) { 1288 case X86_VENDOR_INTEL: 1289 intel_p5_mcheck_init(c); 1290 break; 1291 case X86_VENDOR_CENTAUR: 1292 winchip_mcheck_init(c); 1293 break; 1294 } 1295} 1296 1297static void mce_cpu_features(struct cpuinfo_x86 *c) 1298{ 1299 switch (c->x86_vendor) { 1300 case X86_VENDOR_INTEL: 1301 mce_intel_feature_init(c); 1302 break; 1303 case X86_VENDOR_AMD: 1304 mce_amd_feature_init(c); 1305 break; 1306 default: 1307 break; 1308 } 1309} 1310 1311static void mce_init_timer(void) 1312{ 1313 struct timer_list *t = &__get_cpu_var(mce_timer); 1314 int *n = &__get_cpu_var(next_interval); 1315 1316 if (mce_ignore_ce) 1317 return; 1318 1319 *n = check_interval * HZ; 1320 if (!*n) 1321 return; 1322 setup_timer(t, mcheck_timer, smp_processor_id()); 1323 t->expires = round_jiffies(jiffies + *n); 1324 add_timer(t); 1325} 1326 1327/* 1328 * Called for each booted CPU to set up machine checks. 1329 * Must be called with preempt off: 1330 */ 1331void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1332{ 1333 if (mce_disabled) 1334 return; 1335 1336 mce_ancient_init(c); 1337 1338 if (!mce_available(c)) 1339 return; 1340 1341 if (mce_cap_init() < 0) { 1342 mce_disabled = 1; 1343 return; 1344 } 1345 mce_cpu_quirks(c); 1346 1347 machine_check_vector = do_machine_check; 1348 1349 mce_init(); 1350 mce_cpu_features(c); 1351 mce_init_timer(); 1352 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1353} 1354 1355/* 1356 * Character device to read and clear the MCE log. 1357 */ 1358 1359static DEFINE_SPINLOCK(mce_state_lock); 1360static int open_count; /* #times opened */ 1361static int open_exclu; /* already open exclusive? */ 1362 1363static int mce_open(struct inode *inode, struct file *file) 1364{ 1365 spin_lock(&mce_state_lock); 1366 1367 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1368 spin_unlock(&mce_state_lock); 1369 1370 return -EBUSY; 1371 } 1372 1373 if (file->f_flags & O_EXCL) 1374 open_exclu = 1; 1375 open_count++; 1376 1377 spin_unlock(&mce_state_lock); 1378 1379 return nonseekable_open(inode, file); 1380} 1381 1382static int mce_release(struct inode *inode, struct file *file) 1383{ 1384 spin_lock(&mce_state_lock); 1385 1386 open_count--; 1387 open_exclu = 0; 1388 1389 spin_unlock(&mce_state_lock); 1390 1391 return 0; 1392} 1393 1394static void collect_tscs(void *data) 1395{ 1396 unsigned long *cpu_tsc = (unsigned long *)data; 1397 1398 rdtscll(cpu_tsc[smp_processor_id()]); 1399} 1400 1401static DEFINE_MUTEX(mce_read_mutex); 1402 1403static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1404 loff_t *off) 1405{ 1406 char __user *buf = ubuf; 1407 unsigned long *cpu_tsc; 1408 unsigned prev, next; 1409 int i, err; 1410 1411 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1412 if (!cpu_tsc) 1413 return -ENOMEM; 1414 1415 mutex_lock(&mce_read_mutex); 1416 next = rcu_dereference(mcelog.next); 1417 1418 /* Only supports full reads right now */ 1419 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1420 mutex_unlock(&mce_read_mutex); 1421 kfree(cpu_tsc); 1422 1423 return -EINVAL; 1424 } 1425 1426 err = 0; 1427 prev = 0; 1428 do { 1429 for (i = prev; i < next; i++) { 1430 unsigned long start = jiffies; 1431 1432 while (!mcelog.entry[i].finished) { 1433 if (time_after_eq(jiffies, start + 2)) { 1434 memset(mcelog.entry + i, 0, 1435 sizeof(struct mce)); 1436 goto timeout; 1437 } 1438 cpu_relax(); 1439 } 1440 smp_rmb(); 1441 err |= copy_to_user(buf, mcelog.entry + i, 1442 sizeof(struct mce)); 1443 buf += sizeof(struct mce); 1444timeout: 1445 ; 1446 } 1447 1448 memset(mcelog.entry + prev, 0, 1449 (next - prev) * sizeof(struct mce)); 1450 prev = next; 1451 next = cmpxchg(&mcelog.next, prev, 0); 1452 } while (next != prev); 1453 1454 synchronize_sched(); 1455 1456 /* 1457 * Collect entries that were still getting written before the 1458 * synchronize. 1459 */ 1460 on_each_cpu(collect_tscs, cpu_tsc, 1); 1461 1462 for (i = next; i < MCE_LOG_LEN; i++) { 1463 if (mcelog.entry[i].finished && 1464 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1465 err |= copy_to_user(buf, mcelog.entry+i, 1466 sizeof(struct mce)); 1467 smp_rmb(); 1468 buf += sizeof(struct mce); 1469 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1470 } 1471 } 1472 mutex_unlock(&mce_read_mutex); 1473 kfree(cpu_tsc); 1474 1475 return err ? -EFAULT : buf - ubuf; 1476} 1477 1478static unsigned int mce_poll(struct file *file, poll_table *wait) 1479{ 1480 poll_wait(file, &mce_wait, wait); 1481 if (rcu_dereference(mcelog.next)) 1482 return POLLIN | POLLRDNORM; 1483 return 0; 1484} 1485 1486static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1487{ 1488 int __user *p = (int __user *)arg; 1489 1490 if (!capable(CAP_SYS_ADMIN)) 1491 return -EPERM; 1492 1493 switch (cmd) { 1494 case MCE_GET_RECORD_LEN: 1495 return put_user(sizeof(struct mce), p); 1496 case MCE_GET_LOG_LEN: 1497 return put_user(MCE_LOG_LEN, p); 1498 case MCE_GETCLEAR_FLAGS: { 1499 unsigned flags; 1500 1501 do { 1502 flags = mcelog.flags; 1503 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1504 1505 return put_user(flags, p); 1506 } 1507 default: 1508 return -ENOTTY; 1509 } 1510} 1511 1512/* Modified in mce-inject.c, so not static or const */ 1513struct file_operations mce_chrdev_ops = { 1514 .open = mce_open, 1515 .release = mce_release, 1516 .read = mce_read, 1517 .poll = mce_poll, 1518 .unlocked_ioctl = mce_ioctl, 1519}; 1520EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1521 1522static struct miscdevice mce_log_device = { 1523 MISC_MCELOG_MINOR, 1524 "mcelog", 1525 &mce_chrdev_ops, 1526}; 1527 1528/* 1529 * mce=off Disables machine check 1530 * mce=no_cmci Disables CMCI 1531 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1532 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1533 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1534 * monarchtimeout is how long to wait for other CPUs on machine 1535 * check, or 0 to not wait 1536 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1537 * mce=nobootlog Don't log MCEs from before booting. 1538 */ 1539static int __init mcheck_enable(char *str) 1540{ 1541 if (*str == 0) 1542 enable_p5_mce(); 1543 if (*str == '=') 1544 str++; 1545 if (!strcmp(str, "off")) 1546 mce_disabled = 1; 1547 else if (!strcmp(str, "no_cmci")) 1548 mce_cmci_disabled = 1; 1549 else if (!strcmp(str, "dont_log_ce")) 1550 mce_dont_log_ce = 1; 1551 else if (!strcmp(str, "ignore_ce")) 1552 mce_ignore_ce = 1; 1553 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1554 mce_bootlog = (str[0] == 'b'); 1555 else if (isdigit(str[0])) { 1556 get_option(&str, &tolerant); 1557 if (*str == ',') { 1558 ++str; 1559 get_option(&str, &monarch_timeout); 1560 } 1561 } else { 1562 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1563 str); 1564 return 0; 1565 } 1566 return 1; 1567} 1568__setup("mce", mcheck_enable); 1569 1570/* 1571 * Sysfs support 1572 */ 1573 1574/* 1575 * Disable machine checks on suspend and shutdown. We can't really handle 1576 * them later. 1577 */ 1578static int mce_disable(void) 1579{ 1580 int i; 1581 1582 for (i = 0; i < banks; i++) { 1583 if (!skip_bank_init(i)) 1584 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1585 } 1586 return 0; 1587} 1588 1589static int mce_suspend(struct sys_device *dev, pm_message_t state) 1590{ 1591 return mce_disable(); 1592} 1593 1594static int mce_shutdown(struct sys_device *dev) 1595{ 1596 return mce_disable(); 1597} 1598 1599/* 1600 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1601 * Only one CPU is active at this time, the others get re-added later using 1602 * CPU hotplug: 1603 */ 1604static int mce_resume(struct sys_device *dev) 1605{ 1606 mce_init(); 1607 mce_cpu_features(¤t_cpu_data); 1608 1609 return 0; 1610} 1611 1612static void mce_cpu_restart(void *data) 1613{ 1614 del_timer_sync(&__get_cpu_var(mce_timer)); 1615 if (!mce_available(¤t_cpu_data)) 1616 return; 1617 mce_init(); 1618 mce_init_timer(); 1619} 1620 1621/* Reinit MCEs after user configuration changes */ 1622static void mce_restart(void) 1623{ 1624 on_each_cpu(mce_cpu_restart, NULL, 1); 1625} 1626 1627/* Toggle features for corrected errors */ 1628static void mce_disable_ce(void *all) 1629{ 1630 if (!mce_available(¤t_cpu_data)) 1631 return; 1632 if (all) 1633 del_timer_sync(&__get_cpu_var(mce_timer)); 1634 cmci_clear(); 1635} 1636 1637static void mce_enable_ce(void *all) 1638{ 1639 if (!mce_available(¤t_cpu_data)) 1640 return; 1641 cmci_reenable(); 1642 cmci_recheck(); 1643 if (all) 1644 mce_init_timer(); 1645} 1646 1647static struct sysdev_class mce_sysclass = { 1648 .suspend = mce_suspend, 1649 .shutdown = mce_shutdown, 1650 .resume = mce_resume, 1651 .name = "machinecheck", 1652}; 1653 1654DEFINE_PER_CPU(struct sys_device, mce_dev); 1655 1656__cpuinitdata 1657void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1658 1659static struct sysdev_attribute *bank_attrs; 1660 1661static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1662 char *buf) 1663{ 1664 u64 b = bank[attr - bank_attrs]; 1665 1666 return sprintf(buf, "%llx\n", b); 1667} 1668 1669static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1670 const char *buf, size_t size) 1671{ 1672 u64 new; 1673 1674 if (strict_strtoull(buf, 0, &new) < 0) 1675 return -EINVAL; 1676 1677 bank[attr - bank_attrs] = new; 1678 mce_restart(); 1679 1680 return size; 1681} 1682 1683static ssize_t 1684show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1685{ 1686 strcpy(buf, mce_helper); 1687 strcat(buf, "\n"); 1688 return strlen(mce_helper) + 1; 1689} 1690 1691static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1692 const char *buf, size_t siz) 1693{ 1694 char *p; 1695 int len; 1696 1697 strncpy(mce_helper, buf, sizeof(mce_helper)); 1698 mce_helper[sizeof(mce_helper)-1] = 0; 1699 len = strlen(mce_helper); 1700 p = strchr(mce_helper, '\n'); 1701 1702 if (*p) 1703 *p = 0; 1704 1705 return len; 1706} 1707 1708static ssize_t set_ignore_ce(struct sys_device *s, 1709 struct sysdev_attribute *attr, 1710 const char *buf, size_t size) 1711{ 1712 u64 new; 1713 1714 if (strict_strtoull(buf, 0, &new) < 0) 1715 return -EINVAL; 1716 1717 if (mce_ignore_ce ^ !!new) { 1718 if (new) { 1719 /* disable ce features */ 1720 on_each_cpu(mce_disable_ce, (void *)1, 1); 1721 mce_ignore_ce = 1; 1722 } else { 1723 /* enable ce features */ 1724 mce_ignore_ce = 0; 1725 on_each_cpu(mce_enable_ce, (void *)1, 1); 1726 } 1727 } 1728 return size; 1729} 1730 1731static ssize_t set_cmci_disabled(struct sys_device *s, 1732 struct sysdev_attribute *attr, 1733 const char *buf, size_t size) 1734{ 1735 u64 new; 1736 1737 if (strict_strtoull(buf, 0, &new) < 0) 1738 return -EINVAL; 1739 1740 if (mce_cmci_disabled ^ !!new) { 1741 if (new) { 1742 /* disable cmci */ 1743 on_each_cpu(mce_disable_ce, NULL, 1); 1744 mce_cmci_disabled = 1; 1745 } else { 1746 /* enable cmci */ 1747 mce_cmci_disabled = 0; 1748 on_each_cpu(mce_enable_ce, NULL, 1); 1749 } 1750 } 1751 return size; 1752} 1753 1754static ssize_t store_int_with_restart(struct sys_device *s, 1755 struct sysdev_attribute *attr, 1756 const char *buf, size_t size) 1757{ 1758 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1759 mce_restart(); 1760 return ret; 1761} 1762 1763static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1764static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1765static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1766static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 1767 1768static struct sysdev_ext_attribute attr_check_interval = { 1769 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1770 store_int_with_restart), 1771 &check_interval 1772}; 1773 1774static struct sysdev_ext_attribute attr_ignore_ce = { 1775 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), 1776 &mce_ignore_ce 1777}; 1778 1779static struct sysdev_ext_attribute attr_cmci_disabled = { 1780 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), 1781 &mce_cmci_disabled 1782}; 1783 1784static struct sysdev_attribute *mce_attrs[] = { 1785 &attr_tolerant.attr, 1786 &attr_check_interval.attr, 1787 &attr_trigger, 1788 &attr_monarch_timeout.attr, 1789 &attr_dont_log_ce.attr, 1790 &attr_ignore_ce.attr, 1791 &attr_cmci_disabled.attr, 1792 NULL 1793}; 1794 1795static cpumask_var_t mce_dev_initialized; 1796 1797/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1798static __cpuinit int mce_create_device(unsigned int cpu) 1799{ 1800 int err; 1801 int i; 1802 1803 if (!mce_available(&boot_cpu_data)) 1804 return -EIO; 1805 1806 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1807 per_cpu(mce_dev, cpu).id = cpu; 1808 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1809 1810 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1811 if (err) 1812 return err; 1813 1814 for (i = 0; mce_attrs[i]; i++) { 1815 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1816 if (err) 1817 goto error; 1818 } 1819 for (i = 0; i < banks; i++) { 1820 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1821 &bank_attrs[i]); 1822 if (err) 1823 goto error2; 1824 } 1825 cpumask_set_cpu(cpu, mce_dev_initialized); 1826 1827 return 0; 1828error2: 1829 while (--i >= 0) 1830 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1831error: 1832 while (--i >= 0) 1833 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1834 1835 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1836 1837 return err; 1838} 1839 1840static __cpuinit void mce_remove_device(unsigned int cpu) 1841{ 1842 int i; 1843 1844 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1845 return; 1846 1847 for (i = 0; mce_attrs[i]; i++) 1848 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1849 1850 for (i = 0; i < banks; i++) 1851 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1852 1853 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1854 cpumask_clear_cpu(cpu, mce_dev_initialized); 1855} 1856 1857/* Make sure there are no machine checks on offlined CPUs. */ 1858static void mce_disable_cpu(void *h) 1859{ 1860 unsigned long action = *(unsigned long *)h; 1861 int i; 1862 1863 if (!mce_available(¤t_cpu_data)) 1864 return; 1865 if (!(action & CPU_TASKS_FROZEN)) 1866 cmci_clear(); 1867 for (i = 0; i < banks; i++) { 1868 if (!skip_bank_init(i)) 1869 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1870 } 1871} 1872 1873static void mce_reenable_cpu(void *h) 1874{ 1875 unsigned long action = *(unsigned long *)h; 1876 int i; 1877 1878 if (!mce_available(¤t_cpu_data)) 1879 return; 1880 1881 if (!(action & CPU_TASKS_FROZEN)) 1882 cmci_reenable(); 1883 for (i = 0; i < banks; i++) { 1884 if (!skip_bank_init(i)) 1885 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1886 } 1887} 1888 1889/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1890static int __cpuinit 1891mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1892{ 1893 unsigned int cpu = (unsigned long)hcpu; 1894 struct timer_list *t = &per_cpu(mce_timer, cpu); 1895 1896 switch (action) { 1897 case CPU_ONLINE: 1898 case CPU_ONLINE_FROZEN: 1899 mce_create_device(cpu); 1900 if (threshold_cpu_callback) 1901 threshold_cpu_callback(action, cpu); 1902 break; 1903 case CPU_DEAD: 1904 case CPU_DEAD_FROZEN: 1905 if (threshold_cpu_callback) 1906 threshold_cpu_callback(action, cpu); 1907 mce_remove_device(cpu); 1908 break; 1909 case CPU_DOWN_PREPARE: 1910 case CPU_DOWN_PREPARE_FROZEN: 1911 del_timer_sync(t); 1912 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1913 break; 1914 case CPU_DOWN_FAILED: 1915 case CPU_DOWN_FAILED_FROZEN: 1916 t->expires = round_jiffies(jiffies + 1917 __get_cpu_var(next_interval)); 1918 add_timer_on(t, cpu); 1919 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1920 break; 1921 case CPU_POST_DEAD: 1922 /* intentionally ignoring frozen here */ 1923 cmci_rediscover(cpu); 1924 break; 1925 } 1926 return NOTIFY_OK; 1927} 1928 1929static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1930 .notifier_call = mce_cpu_callback, 1931}; 1932 1933static __init int mce_init_banks(void) 1934{ 1935 int i; 1936 1937 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1938 GFP_KERNEL); 1939 if (!bank_attrs) 1940 return -ENOMEM; 1941 1942 for (i = 0; i < banks; i++) { 1943 struct sysdev_attribute *a = &bank_attrs[i]; 1944 1945 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1946 if (!a->attr.name) 1947 goto nomem; 1948 1949 a->attr.mode = 0644; 1950 a->show = show_bank; 1951 a->store = set_bank; 1952 } 1953 return 0; 1954 1955nomem: 1956 while (--i >= 0) 1957 kfree(bank_attrs[i].attr.name); 1958 kfree(bank_attrs); 1959 bank_attrs = NULL; 1960 1961 return -ENOMEM; 1962} 1963 1964static __init int mce_init_device(void) 1965{ 1966 int err; 1967 int i = 0; 1968 1969 if (!mce_available(&boot_cpu_data)) 1970 return -EIO; 1971 1972 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1973 1974 err = mce_init_banks(); 1975 if (err) 1976 return err; 1977 1978 err = sysdev_class_register(&mce_sysclass); 1979 if (err) 1980 return err; 1981 1982 for_each_online_cpu(i) { 1983 err = mce_create_device(i); 1984 if (err) 1985 return err; 1986 } 1987 1988 register_hotcpu_notifier(&mce_cpu_notifier); 1989 misc_register(&mce_log_device); 1990 1991 return err; 1992} 1993 1994device_initcall(mce_init_device); 1995 1996#else /* CONFIG_X86_OLD_MCE: */ 1997 1998int nr_mce_banks; 1999EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 2000 2001/* This has to be run for each processor */ 2002void mcheck_init(struct cpuinfo_x86 *c) 2003{ 2004 if (mce_disabled) 2005 return; 2006 2007 switch (c->x86_vendor) { 2008 case X86_VENDOR_AMD: 2009 amd_mcheck_init(c); 2010 break; 2011 2012 case X86_VENDOR_INTEL: 2013 if (c->x86 == 5) 2014 intel_p5_mcheck_init(c); 2015 if (c->x86 == 6) 2016 intel_p6_mcheck_init(c); 2017 if (c->x86 == 15) 2018 intel_p4_mcheck_init(c); 2019 break; 2020 2021 case X86_VENDOR_CENTAUR: 2022 if (c->x86 == 5) 2023 winchip_mcheck_init(c); 2024 break; 2025 2026 default: 2027 break; 2028 } 2029 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 2030} 2031 2032static int __init mcheck_enable(char *str) 2033{ 2034 mce_p5_enabled = 1; 2035 return 1; 2036} 2037__setup("mce", mcheck_enable); 2038 2039#endif /* CONFIG_X86_OLD_MCE */ 2040 2041/* 2042 * Old style boot options parsing. Only for compatibility. 2043 */ 2044static int __init mcheck_disable(char *str) 2045{ 2046 mce_disabled = 1; 2047 return 1; 2048} 2049__setup("nomce", mcheck_disable); 2050