mce.c revision 33edbf02a92771fa2a81e41084a44ba874e3a5a5
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/interrupt.h> 14#include <linux/ratelimit.h> 15#include <linux/kallsyms.h> 16#include <linux/rcupdate.h> 17#include <linux/kobject.h> 18#include <linux/uaccess.h> 19#include <linux/kdebug.h> 20#include <linux/kernel.h> 21#include <linux/percpu.h> 22#include <linux/string.h> 23#include <linux/sysdev.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/init.h> 30#include <linux/kmod.h> 31#include <linux/poll.h> 32#include <linux/nmi.h> 33#include <linux/cpu.h> 34#include <linux/smp.h> 35#include <linux/fs.h> 36#include <linux/mm.h> 37 38#include <asm/processor.h> 39#include <asm/hw_irq.h> 40#include <asm/apic.h> 41#include <asm/idle.h> 42#include <asm/ipi.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47#include "mce.h" 48 49/* Handle unconfigured int18 (should never happen) */ 50static void unexpected_machine_check(struct pt_regs *regs, long error_code) 51{ 52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 53 smp_processor_id()); 54} 55 56/* Call the installed machine check handler for this CPU setup. */ 57void (*machine_check_vector)(struct pt_regs *, long error_code) = 58 unexpected_machine_check; 59 60int mce_disabled; 61 62#ifdef CONFIG_X86_NEW_MCE 63 64#define MISC_MCELOG_MINOR 227 65 66#define SPINUNIT 100 /* 100ns */ 67 68atomic_t mce_entry; 69 70DEFINE_PER_CPU(unsigned, mce_exception_count); 71 72/* 73 * Tolerant levels: 74 * 0: always panic on uncorrected errors, log corrected errors 75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 77 * 3: never panic or SIGBUS, log all errors (for testing only) 78 */ 79static int tolerant = 1; 80static int banks; 81static u64 *bank; 82static unsigned long notify_user; 83static int rip_msr; 84static int mce_bootlog = -1; 85static int monarch_timeout = -1; 86static int mce_panic_timeout; 87static int mce_dont_log_ce; 88int mce_cmci_disabled; 89int mce_ignore_ce; 90int mce_ser; 91 92static char trigger[128]; 93static char *trigger_argv[2] = { trigger, NULL }; 94 95static unsigned long dont_init_banks; 96 97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 98static DEFINE_PER_CPU(struct mce, mces_seen); 99static int cpu_missing; 100 101 102/* MCA banks polled by the period polling timer for corrected events */ 103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 105}; 106 107static inline int skip_bank_init(int i) 108{ 109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 110} 111 112static DEFINE_PER_CPU(struct work_struct, mce_work); 113 114/* Do initial initialization of a struct mce */ 115void mce_setup(struct mce *m) 116{ 117 memset(m, 0, sizeof(struct mce)); 118 m->cpu = m->extcpu = smp_processor_id(); 119 rdtscll(m->tsc); 120 /* We hope get_seconds stays lockless */ 121 m->time = get_seconds(); 122 m->cpuvendor = boot_cpu_data.x86_vendor; 123 m->cpuid = cpuid_eax(1); 124#ifdef CONFIG_SMP 125 m->socketid = cpu_data(m->extcpu).phys_proc_id; 126#endif 127 m->apicid = cpu_data(m->extcpu).initial_apicid; 128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 129} 130 131DEFINE_PER_CPU(struct mce, injectm); 132EXPORT_PER_CPU_SYMBOL_GPL(injectm); 133 134/* 135 * Lockless MCE logging infrastructure. 136 * This avoids deadlocks on printk locks without having to break locks. Also 137 * separate MCEs from kernel messages to avoid bogus bug reports. 138 */ 139 140static struct mce_log mcelog = { 141 .signature = MCE_LOG_SIGNATURE, 142 .len = MCE_LOG_LEN, 143 .recordlen = sizeof(struct mce), 144}; 145 146void mce_log(struct mce *mce) 147{ 148 unsigned next, entry; 149 150 mce->finished = 0; 151 wmb(); 152 for (;;) { 153 entry = rcu_dereference(mcelog.next); 154 for (;;) { 155 /* 156 * When the buffer fills up discard new entries. 157 * Assume that the earlier errors are the more 158 * interesting ones: 159 */ 160 if (entry >= MCE_LOG_LEN) { 161 set_bit(MCE_OVERFLOW, 162 (unsigned long *)&mcelog.flags); 163 return; 164 } 165 /* Old left over entry. Skip: */ 166 if (mcelog.entry[entry].finished) { 167 entry++; 168 continue; 169 } 170 break; 171 } 172 smp_rmb(); 173 next = entry + 1; 174 if (cmpxchg(&mcelog.next, entry, next) == entry) 175 break; 176 } 177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 178 wmb(); 179 mcelog.entry[entry].finished = 1; 180 wmb(); 181 182 mce->finished = 1; 183 set_bit(0, ¬ify_user); 184} 185 186static void print_mce(struct mce *m) 187{ 188 printk(KERN_EMERG 189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 190 m->extcpu, m->mcgstatus, m->bank, m->status); 191 if (m->ip) { 192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 194 m->cs, m->ip); 195 if (m->cs == __KERNEL_CS) 196 print_symbol("{%s}", m->ip); 197 printk("\n"); 198 } 199 printk(KERN_EMERG "TSC %llx ", m->tsc); 200 if (m->addr) 201 printk("ADDR %llx ", m->addr); 202 if (m->misc) 203 printk("MISC %llx ", m->misc); 204 printk("\n"); 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 206 m->cpuvendor, m->cpuid, m->time, m->socketid, 207 m->apicid); 208} 209 210static void print_mce_head(void) 211{ 212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 213} 214 215static void print_mce_tail(void) 216{ 217 printk(KERN_EMERG "This is not a software problem!\n" 218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 219} 220 221#define PANIC_TIMEOUT 5 /* 5 seconds */ 222 223static atomic_t mce_paniced; 224 225/* Panic in progress. Enable interrupts and wait for final IPI */ 226static void wait_for_panic(void) 227{ 228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 229 preempt_disable(); 230 local_irq_enable(); 231 while (timeout-- > 0) 232 udelay(1); 233 if (panic_timeout == 0) 234 panic_timeout = mce_panic_timeout; 235 panic("Panicing machine check CPU died"); 236} 237 238static void mce_panic(char *msg, struct mce *final, char *exp) 239{ 240 int i; 241 242 /* 243 * Make sure only one CPU runs in machine check panic 244 */ 245 if (atomic_add_return(1, &mce_paniced) > 1) 246 wait_for_panic(); 247 barrier(); 248 249 bust_spinlocks(1); 250 console_verbose(); 251 print_mce_head(); 252 /* First print corrected ones that are still unlogged */ 253 for (i = 0; i < MCE_LOG_LEN; i++) { 254 struct mce *m = &mcelog.entry[i]; 255 if (!(m->status & MCI_STATUS_VAL)) 256 continue; 257 if (!(m->status & MCI_STATUS_UC)) 258 print_mce(m); 259 } 260 /* Now print uncorrected but with the final one last */ 261 for (i = 0; i < MCE_LOG_LEN; i++) { 262 struct mce *m = &mcelog.entry[i]; 263 if (!(m->status & MCI_STATUS_VAL)) 264 continue; 265 if (!(m->status & MCI_STATUS_UC)) 266 continue; 267 if (!final || memcmp(m, final, sizeof(struct mce))) 268 print_mce(m); 269 } 270 if (final) 271 print_mce(final); 272 if (cpu_missing) 273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); 274 print_mce_tail(); 275 if (exp) 276 printk(KERN_EMERG "Machine check: %s\n", exp); 277 if (panic_timeout == 0) 278 panic_timeout = mce_panic_timeout; 279 panic(msg); 280} 281 282/* Support code for software error injection */ 283 284static int msr_to_offset(u32 msr) 285{ 286 unsigned bank = __get_cpu_var(injectm.bank); 287 if (msr == rip_msr) 288 return offsetof(struct mce, ip); 289 if (msr == MSR_IA32_MC0_STATUS + bank*4) 290 return offsetof(struct mce, status); 291 if (msr == MSR_IA32_MC0_ADDR + bank*4) 292 return offsetof(struct mce, addr); 293 if (msr == MSR_IA32_MC0_MISC + bank*4) 294 return offsetof(struct mce, misc); 295 if (msr == MSR_IA32_MCG_STATUS) 296 return offsetof(struct mce, mcgstatus); 297 return -1; 298} 299 300/* MSR access wrappers used for error injection */ 301static u64 mce_rdmsrl(u32 msr) 302{ 303 u64 v; 304 if (__get_cpu_var(injectm).finished) { 305 int offset = msr_to_offset(msr); 306 if (offset < 0) 307 return 0; 308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 309 } 310 rdmsrl(msr, v); 311 return v; 312} 313 314static void mce_wrmsrl(u32 msr, u64 v) 315{ 316 if (__get_cpu_var(injectm).finished) { 317 int offset = msr_to_offset(msr); 318 if (offset >= 0) 319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 320 return; 321 } 322 wrmsrl(msr, v); 323} 324 325/* 326 * Simple lockless ring to communicate PFNs from the exception handler with the 327 * process context work function. This is vastly simplified because there's 328 * only a single reader and a single writer. 329 */ 330#define MCE_RING_SIZE 16 /* we use one entry less */ 331 332struct mce_ring { 333 unsigned short start; 334 unsigned short end; 335 unsigned long ring[MCE_RING_SIZE]; 336}; 337static DEFINE_PER_CPU(struct mce_ring, mce_ring); 338 339/* Runs with CPU affinity in workqueue */ 340static int mce_ring_empty(void) 341{ 342 struct mce_ring *r = &__get_cpu_var(mce_ring); 343 344 return r->start == r->end; 345} 346 347static int mce_ring_get(unsigned long *pfn) 348{ 349 struct mce_ring *r; 350 int ret = 0; 351 352 *pfn = 0; 353 get_cpu(); 354 r = &__get_cpu_var(mce_ring); 355 if (r->start == r->end) 356 goto out; 357 *pfn = r->ring[r->start]; 358 r->start = (r->start + 1) % MCE_RING_SIZE; 359 ret = 1; 360out: 361 put_cpu(); 362 return ret; 363} 364 365/* Always runs in MCE context with preempt off */ 366static int mce_ring_add(unsigned long pfn) 367{ 368 struct mce_ring *r = &__get_cpu_var(mce_ring); 369 unsigned next; 370 371 next = (r->end + 1) % MCE_RING_SIZE; 372 if (next == r->start) 373 return -1; 374 r->ring[r->end] = pfn; 375 wmb(); 376 r->end = next; 377 return 0; 378} 379 380int mce_available(struct cpuinfo_x86 *c) 381{ 382 if (mce_disabled) 383 return 0; 384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 385} 386 387static void mce_schedule_work(void) 388{ 389 if (!mce_ring_empty()) { 390 struct work_struct *work = &__get_cpu_var(mce_work); 391 if (!work_pending(work)) 392 schedule_work(work); 393 } 394} 395 396/* 397 * Get the address of the instruction at the time of the machine check 398 * error. 399 */ 400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 401{ 402 403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { 404 m->ip = regs->ip; 405 m->cs = regs->cs; 406 } else { 407 m->ip = 0; 408 m->cs = 0; 409 } 410 if (rip_msr) 411 m->ip = mce_rdmsrl(rip_msr); 412} 413 414#ifdef CONFIG_X86_LOCAL_APIC 415/* 416 * Called after interrupts have been reenabled again 417 * when a MCE happened during an interrupts off region 418 * in the kernel. 419 */ 420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) 421{ 422 ack_APIC_irq(); 423 exit_idle(); 424 irq_enter(); 425 mce_notify_irq(); 426 mce_schedule_work(); 427 irq_exit(); 428} 429#endif 430 431static void mce_report_event(struct pt_regs *regs) 432{ 433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 434 mce_notify_irq(); 435 /* 436 * Triggering the work queue here is just an insurance 437 * policy in case the syscall exit notify handler 438 * doesn't run soon enough or ends up running on the 439 * wrong CPU (can happen when audit sleeps) 440 */ 441 mce_schedule_work(); 442 return; 443 } 444 445#ifdef CONFIG_X86_LOCAL_APIC 446 /* 447 * Without APIC do not notify. The event will be picked 448 * up eventually. 449 */ 450 if (!cpu_has_apic) 451 return; 452 453 /* 454 * When interrupts are disabled we cannot use 455 * kernel services safely. Trigger an self interrupt 456 * through the APIC to instead do the notification 457 * after interrupts are reenabled again. 458 */ 459 apic->send_IPI_self(MCE_SELF_VECTOR); 460 461 /* 462 * Wait for idle afterwards again so that we don't leave the 463 * APIC in a non idle state because the normal APIC writes 464 * cannot exclude us. 465 */ 466 apic_wait_icr_idle(); 467#endif 468} 469 470DEFINE_PER_CPU(unsigned, mce_poll_count); 471 472/* 473 * Poll for corrected events or events that happened before reset. 474 * Those are just logged through /dev/mcelog. 475 * 476 * This is executed in standard interrupt context. 477 * 478 * Note: spec recommends to panic for fatal unsignalled 479 * errors here. However this would be quite problematic -- 480 * we would need to reimplement the Monarch handling and 481 * it would mess up the exclusion between exception handler 482 * and poll hander -- * so we skip this for now. 483 * These cases should not happen anyways, or only when the CPU 484 * is already totally * confused. In this case it's likely it will 485 * not fully execute the machine check handler either. 486 */ 487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 488{ 489 struct mce m; 490 int i; 491 492 __get_cpu_var(mce_poll_count)++; 493 494 mce_setup(&m); 495 496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 497 for (i = 0; i < banks; i++) { 498 if (!bank[i] || !test_bit(i, *b)) 499 continue; 500 501 m.misc = 0; 502 m.addr = 0; 503 m.bank = i; 504 m.tsc = 0; 505 506 barrier(); 507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 508 if (!(m.status & MCI_STATUS_VAL)) 509 continue; 510 511 /* 512 * Uncorrected or signalled events are handled by the exception 513 * handler when it is enabled, so don't process those here. 514 * 515 * TBD do the same check for MCI_STATUS_EN here? 516 */ 517 if (!(flags & MCP_UC) && 518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 519 continue; 520 521 if (m.status & MCI_STATUS_MISCV) 522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 523 if (m.status & MCI_STATUS_ADDRV) 524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 525 526 if (!(flags & MCP_TIMESTAMP)) 527 m.tsc = 0; 528 /* 529 * Don't get the IP here because it's unlikely to 530 * have anything to do with the actual error location. 531 */ 532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 533 mce_log(&m); 534 add_taint(TAINT_MACHINE_CHECK); 535 } 536 537 /* 538 * Clear state for this bank. 539 */ 540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 541 } 542 543 /* 544 * Don't clear MCG_STATUS here because it's only defined for 545 * exceptions. 546 */ 547 548 sync_core(); 549} 550EXPORT_SYMBOL_GPL(machine_check_poll); 551 552/* 553 * Do a quick check if any of the events requires a panic. 554 * This decides if we keep the events around or clear them. 555 */ 556static int mce_no_way_out(struct mce *m, char **msg) 557{ 558 int i; 559 560 for (i = 0; i < banks; i++) { 561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 563 return 1; 564 } 565 return 0; 566} 567 568/* 569 * Variable to establish order between CPUs while scanning. 570 * Each CPU spins initially until executing is equal its number. 571 */ 572static atomic_t mce_executing; 573 574/* 575 * Defines order of CPUs on entry. First CPU becomes Monarch. 576 */ 577static atomic_t mce_callin; 578 579/* 580 * Check if a timeout waiting for other CPUs happened. 581 */ 582static int mce_timed_out(u64 *t) 583{ 584 /* 585 * The others already did panic for some reason. 586 * Bail out like in a timeout. 587 * rmb() to tell the compiler that system_state 588 * might have been modified by someone else. 589 */ 590 rmb(); 591 if (atomic_read(&mce_paniced)) 592 wait_for_panic(); 593 if (!monarch_timeout) 594 goto out; 595 if ((s64)*t < SPINUNIT) { 596 /* CHECKME: Make panic default for 1 too? */ 597 if (tolerant < 1) 598 mce_panic("Timeout synchronizing machine check over CPUs", 599 NULL, NULL); 600 cpu_missing = 1; 601 return 1; 602 } 603 *t -= SPINUNIT; 604out: 605 touch_nmi_watchdog(); 606 return 0; 607} 608 609/* 610 * The Monarch's reign. The Monarch is the CPU who entered 611 * the machine check handler first. It waits for the others to 612 * raise the exception too and then grades them. When any 613 * error is fatal panic. Only then let the others continue. 614 * 615 * The other CPUs entering the MCE handler will be controlled by the 616 * Monarch. They are called Subjects. 617 * 618 * This way we prevent any potential data corruption in a unrecoverable case 619 * and also makes sure always all CPU's errors are examined. 620 * 621 * Also this detects the case of an machine check event coming from outer 622 * space (not detected by any CPUs) In this case some external agent wants 623 * us to shut down, so panic too. 624 * 625 * The other CPUs might still decide to panic if the handler happens 626 * in a unrecoverable place, but in this case the system is in a semi-stable 627 * state and won't corrupt anything by itself. It's ok to let the others 628 * continue for a bit first. 629 * 630 * All the spin loops have timeouts; when a timeout happens a CPU 631 * typically elects itself to be Monarch. 632 */ 633static void mce_reign(void) 634{ 635 int cpu; 636 struct mce *m = NULL; 637 int global_worst = 0; 638 char *msg = NULL; 639 char *nmsg = NULL; 640 641 /* 642 * This CPU is the Monarch and the other CPUs have run 643 * through their handlers. 644 * Grade the severity of the errors of all the CPUs. 645 */ 646 for_each_possible_cpu(cpu) { 647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 648 &nmsg); 649 if (severity > global_worst) { 650 msg = nmsg; 651 global_worst = severity; 652 m = &per_cpu(mces_seen, cpu); 653 } 654 } 655 656 /* 657 * Cannot recover? Panic here then. 658 * This dumps all the mces in the log buffer and stops the 659 * other CPUs. 660 */ 661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 662 mce_panic("Fatal Machine check", m, msg); 663 664 /* 665 * For UC somewhere we let the CPU who detects it handle it. 666 * Also must let continue the others, otherwise the handling 667 * CPU could deadlock on a lock. 668 */ 669 670 /* 671 * No machine check event found. Must be some external 672 * source or one CPU is hung. Panic. 673 */ 674 if (!m && tolerant < 3) 675 mce_panic("Machine check from unknown source", NULL, NULL); 676 677 /* 678 * Now clear all the mces_seen so that they don't reappear on 679 * the next mce. 680 */ 681 for_each_possible_cpu(cpu) 682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 683} 684 685static atomic_t global_nwo; 686 687/* 688 * Start of Monarch synchronization. This waits until all CPUs have 689 * entered the exception handler and then determines if any of them 690 * saw a fatal event that requires panic. Then it executes them 691 * in the entry order. 692 * TBD double check parallel CPU hotunplug 693 */ 694static int mce_start(int no_way_out, int *order) 695{ 696 int nwo; 697 int cpus = num_online_cpus(); 698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 699 700 if (!timeout) { 701 *order = -1; 702 return no_way_out; 703 } 704 705 atomic_add(no_way_out, &global_nwo); 706 /* 707 * global_nwo should be updated before mce_callin 708 */ 709 smp_wmb(); 710 *order = atomic_add_return(1, &mce_callin); 711 712 /* 713 * Wait for everyone. 714 */ 715 while (atomic_read(&mce_callin) != cpus) { 716 if (mce_timed_out(&timeout)) { 717 atomic_set(&global_nwo, 0); 718 *order = -1; 719 return no_way_out; 720 } 721 ndelay(SPINUNIT); 722 } 723 724 /* 725 * mce_callin should be read before global_nwo 726 */ 727 smp_rmb(); 728 /* 729 * Cache the global no_way_out state. 730 */ 731 nwo = atomic_read(&global_nwo); 732 733 /* 734 * Monarch starts executing now, the others wait. 735 */ 736 if (*order == 1) { 737 atomic_set(&mce_executing, 1); 738 return nwo; 739 } 740 741 /* 742 * Now start the scanning loop one by one 743 * in the original callin order. 744 * This way when there are any shared banks it will 745 * be only seen by one CPU before cleared, avoiding duplicates. 746 */ 747 while (atomic_read(&mce_executing) < *order) { 748 if (mce_timed_out(&timeout)) { 749 atomic_set(&global_nwo, 0); 750 *order = -1; 751 return no_way_out; 752 } 753 ndelay(SPINUNIT); 754 } 755 return nwo; 756} 757 758/* 759 * Synchronize between CPUs after main scanning loop. 760 * This invokes the bulk of the Monarch processing. 761 */ 762static int mce_end(int order) 763{ 764 int ret = -1; 765 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 766 767 if (!timeout) 768 goto reset; 769 if (order < 0) 770 goto reset; 771 772 /* 773 * Allow others to run. 774 */ 775 atomic_inc(&mce_executing); 776 777 if (order == 1) { 778 /* CHECKME: Can this race with a parallel hotplug? */ 779 int cpus = num_online_cpus(); 780 781 /* 782 * Monarch: Wait for everyone to go through their scanning 783 * loops. 784 */ 785 while (atomic_read(&mce_executing) <= cpus) { 786 if (mce_timed_out(&timeout)) 787 goto reset; 788 ndelay(SPINUNIT); 789 } 790 791 mce_reign(); 792 barrier(); 793 ret = 0; 794 } else { 795 /* 796 * Subject: Wait for Monarch to finish. 797 */ 798 while (atomic_read(&mce_executing) != 0) { 799 if (mce_timed_out(&timeout)) 800 goto reset; 801 ndelay(SPINUNIT); 802 } 803 804 /* 805 * Don't reset anything. That's done by the Monarch. 806 */ 807 return 0; 808 } 809 810 /* 811 * Reset all global state. 812 */ 813reset: 814 atomic_set(&global_nwo, 0); 815 atomic_set(&mce_callin, 0); 816 barrier(); 817 818 /* 819 * Let others run again. 820 */ 821 atomic_set(&mce_executing, 0); 822 return ret; 823} 824 825/* 826 * Check if the address reported by the CPU is in a format we can parse. 827 * It would be possible to add code for most other cases, but all would 828 * be somewhat complicated (e.g. segment offset would require an instruction 829 * parser). So only support physical addresses upto page granuality for now. 830 */ 831static int mce_usable_address(struct mce *m) 832{ 833 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 834 return 0; 835 if ((m->misc & 0x3f) > PAGE_SHIFT) 836 return 0; 837 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 838 return 0; 839 return 1; 840} 841 842static void mce_clear_state(unsigned long *toclear) 843{ 844 int i; 845 846 for (i = 0; i < banks; i++) { 847 if (test_bit(i, toclear)) 848 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 849 } 850} 851 852/* 853 * The actual machine check handler. This only handles real 854 * exceptions when something got corrupted coming in through int 18. 855 * 856 * This is executed in NMI context not subject to normal locking rules. This 857 * implies that most kernel services cannot be safely used. Don't even 858 * think about putting a printk in there! 859 * 860 * On Intel systems this is entered on all CPUs in parallel through 861 * MCE broadcast. However some CPUs might be broken beyond repair, 862 * so be always careful when synchronizing with others. 863 */ 864void do_machine_check(struct pt_regs *regs, long error_code) 865{ 866 struct mce m, *final; 867 int i; 868 int worst = 0; 869 int severity; 870 /* 871 * Establish sequential order between the CPUs entering the machine 872 * check handler. 873 */ 874 int order = -1; 875 876 /* 877 * If no_way_out gets set, there is no safe way to recover from this 878 * MCE. If tolerant is cranked up, we'll try anyway. 879 */ 880 int no_way_out = 0; 881 /* 882 * If kill_it gets set, there might be a way to recover from this 883 * error. 884 */ 885 int kill_it = 0; 886 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 887 char *msg = "Unknown"; 888 889 atomic_inc(&mce_entry); 890 891 __get_cpu_var(mce_exception_count)++; 892 893 if (notify_die(DIE_NMI, "machine check", regs, error_code, 894 18, SIGKILL) == NOTIFY_STOP) 895 goto out; 896 if (!banks) 897 goto out; 898 899 mce_setup(&m); 900 901 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 902 no_way_out = mce_no_way_out(&m, &msg); 903 904 final = &__get_cpu_var(mces_seen); 905 *final = m; 906 907 barrier(); 908 909 /* 910 * When no restart IP must always kill or panic. 911 */ 912 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 913 kill_it = 1; 914 915 /* 916 * Go through all the banks in exclusion of the other CPUs. 917 * This way we don't report duplicated events on shared banks 918 * because the first one to see it will clear it. 919 */ 920 no_way_out = mce_start(no_way_out, &order); 921 for (i = 0; i < banks; i++) { 922 __clear_bit(i, toclear); 923 if (!bank[i]) 924 continue; 925 926 m.misc = 0; 927 m.addr = 0; 928 m.bank = i; 929 930 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 931 if ((m.status & MCI_STATUS_VAL) == 0) 932 continue; 933 934 /* 935 * Non uncorrected or non signaled errors are handled by 936 * machine_check_poll. Leave them alone, unless this panics. 937 */ 938 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 939 !no_way_out) 940 continue; 941 942 /* 943 * Set taint even when machine check was not enabled. 944 */ 945 add_taint(TAINT_MACHINE_CHECK); 946 947 severity = mce_severity(&m, tolerant, NULL); 948 949 /* 950 * When machine check was for corrected handler don't touch, 951 * unless we're panicing. 952 */ 953 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 954 continue; 955 __set_bit(i, toclear); 956 if (severity == MCE_NO_SEVERITY) { 957 /* 958 * Machine check event was not enabled. Clear, but 959 * ignore. 960 */ 961 continue; 962 } 963 964 /* 965 * Kill on action required. 966 */ 967 if (severity == MCE_AR_SEVERITY) 968 kill_it = 1; 969 970 if (m.status & MCI_STATUS_MISCV) 971 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 972 if (m.status & MCI_STATUS_ADDRV) 973 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 974 975 /* 976 * Action optional error. Queue address for later processing. 977 * When the ring overflows we just ignore the AO error. 978 * RED-PEN add some logging mechanism when 979 * usable_address or mce_add_ring fails. 980 * RED-PEN don't ignore overflow for tolerant == 0 981 */ 982 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 983 mce_ring_add(m.addr >> PAGE_SHIFT); 984 985 mce_get_rip(&m, regs); 986 mce_log(&m); 987 988 if (severity > worst) { 989 *final = m; 990 worst = severity; 991 } 992 } 993 994 if (!no_way_out) 995 mce_clear_state(toclear); 996 997 /* 998 * Do most of the synchronization with other CPUs. 999 * When there's any problem use only local no_way_out state. 1000 */ 1001 if (mce_end(order) < 0) 1002 no_way_out = worst >= MCE_PANIC_SEVERITY; 1003 1004 /* 1005 * If we have decided that we just CAN'T continue, and the user 1006 * has not set tolerant to an insane level, give up and die. 1007 * 1008 * This is mainly used in the case when the system doesn't 1009 * support MCE broadcasting or it has been disabled. 1010 */ 1011 if (no_way_out && tolerant < 3) 1012 mce_panic("Fatal machine check on current CPU", final, msg); 1013 1014 /* 1015 * If the error seems to be unrecoverable, something should be 1016 * done. Try to kill as little as possible. If we can kill just 1017 * one task, do that. If the user has set the tolerance very 1018 * high, don't try to do anything at all. 1019 */ 1020 1021 if (kill_it && tolerant < 3) 1022 force_sig(SIGBUS, current); 1023 1024 /* notify userspace ASAP */ 1025 set_thread_flag(TIF_MCE_NOTIFY); 1026 1027 if (worst > 0) 1028 mce_report_event(regs); 1029 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1030out: 1031 atomic_dec(&mce_entry); 1032 sync_core(); 1033} 1034EXPORT_SYMBOL_GPL(do_machine_check); 1035 1036/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1037void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1038{ 1039 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1040} 1041 1042/* 1043 * Called after mce notification in process context. This code 1044 * is allowed to sleep. Call the high level VM handler to process 1045 * any corrupted pages. 1046 * Assume that the work queue code only calls this one at a time 1047 * per CPU. 1048 * Note we don't disable preemption, so this code might run on the wrong 1049 * CPU. In this case the event is picked up by the scheduled work queue. 1050 * This is merely a fast path to expedite processing in some common 1051 * cases. 1052 */ 1053void mce_notify_process(void) 1054{ 1055 unsigned long pfn; 1056 mce_notify_irq(); 1057 while (mce_ring_get(&pfn)) 1058 memory_failure(pfn, MCE_VECTOR); 1059} 1060 1061static void mce_process_work(struct work_struct *dummy) 1062{ 1063 mce_notify_process(); 1064} 1065 1066#ifdef CONFIG_X86_MCE_INTEL 1067/*** 1068 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1069 * @cpu: The CPU on which the event occurred. 1070 * @status: Event status information 1071 * 1072 * This function should be called by the thermal interrupt after the 1073 * event has been processed and the decision was made to log the event 1074 * further. 1075 * 1076 * The status parameter will be saved to the 'status' field of 'struct mce' 1077 * and historically has been the register value of the 1078 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1079 */ 1080void mce_log_therm_throt_event(__u64 status) 1081{ 1082 struct mce m; 1083 1084 mce_setup(&m); 1085 m.bank = MCE_THERMAL_BANK; 1086 m.status = status; 1087 mce_log(&m); 1088} 1089#endif /* CONFIG_X86_MCE_INTEL */ 1090 1091/* 1092 * Periodic polling timer for "silent" machine check errors. If the 1093 * poller finds an MCE, poll 2x faster. When the poller finds no more 1094 * errors, poll 2x slower (up to check_interval seconds). 1095 */ 1096static int check_interval = 5 * 60; /* 5 minutes */ 1097 1098static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1099static DEFINE_PER_CPU(struct timer_list, mce_timer); 1100 1101static void mcheck_timer(unsigned long data) 1102{ 1103 struct timer_list *t = &per_cpu(mce_timer, data); 1104 int *n; 1105 1106 WARN_ON(smp_processor_id() != data); 1107 1108 if (mce_available(¤t_cpu_data)) { 1109 machine_check_poll(MCP_TIMESTAMP, 1110 &__get_cpu_var(mce_poll_banks)); 1111 } 1112 1113 /* 1114 * Alert userspace if needed. If we logged an MCE, reduce the 1115 * polling interval, otherwise increase the polling interval. 1116 */ 1117 n = &__get_cpu_var(next_interval); 1118 if (mce_notify_irq()) 1119 *n = max(*n/2, HZ/100); 1120 else 1121 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1122 1123 t->expires = jiffies + *n; 1124 add_timer(t); 1125} 1126 1127static void mce_do_trigger(struct work_struct *work) 1128{ 1129 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 1130} 1131 1132static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1133 1134/* 1135 * Notify the user(s) about new machine check events. 1136 * Can be called from interrupt context, but not from machine check/NMI 1137 * context. 1138 */ 1139int mce_notify_irq(void) 1140{ 1141 /* Not more than two messages every minute */ 1142 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1143 1144 clear_thread_flag(TIF_MCE_NOTIFY); 1145 1146 if (test_and_clear_bit(0, ¬ify_user)) { 1147 wake_up_interruptible(&mce_wait); 1148 1149 /* 1150 * There is no risk of missing notifications because 1151 * work_pending is always cleared before the function is 1152 * executed. 1153 */ 1154 if (trigger[0] && !work_pending(&mce_trigger_work)) 1155 schedule_work(&mce_trigger_work); 1156 1157 if (__ratelimit(&ratelimit)) 1158 printk(KERN_INFO "Machine check events logged\n"); 1159 1160 return 1; 1161 } 1162 return 0; 1163} 1164EXPORT_SYMBOL_GPL(mce_notify_irq); 1165 1166/* 1167 * Initialize Machine Checks for a CPU. 1168 */ 1169static int mce_cap_init(void) 1170{ 1171 unsigned b; 1172 u64 cap; 1173 1174 rdmsrl(MSR_IA32_MCG_CAP, cap); 1175 1176 b = cap & MCG_BANKCNT_MASK; 1177 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1178 1179 if (b > MAX_NR_BANKS) { 1180 printk(KERN_WARNING 1181 "MCE: Using only %u machine check banks out of %u\n", 1182 MAX_NR_BANKS, b); 1183 b = MAX_NR_BANKS; 1184 } 1185 1186 /* Don't support asymmetric configurations today */ 1187 WARN_ON(banks != 0 && b != banks); 1188 banks = b; 1189 if (!bank) { 1190 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1191 if (!bank) 1192 return -ENOMEM; 1193 memset(bank, 0xff, banks * sizeof(u64)); 1194 } 1195 1196 /* Use accurate RIP reporting if available. */ 1197 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1198 rip_msr = MSR_IA32_MCG_EIP; 1199 1200 if (cap & MCG_SER_P) 1201 mce_ser = 1; 1202 1203 return 0; 1204} 1205 1206static void mce_init(void) 1207{ 1208 mce_banks_t all_banks; 1209 u64 cap; 1210 int i; 1211 1212 /* 1213 * Log the machine checks left over from the previous reset. 1214 */ 1215 bitmap_fill(all_banks, MAX_NR_BANKS); 1216 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1217 1218 set_in_cr4(X86_CR4_MCE); 1219 1220 rdmsrl(MSR_IA32_MCG_CAP, cap); 1221 if (cap & MCG_CTL_P) 1222 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1223 1224 for (i = 0; i < banks; i++) { 1225 if (skip_bank_init(i)) 1226 continue; 1227 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1228 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1229 } 1230} 1231 1232/* Add per CPU specific workarounds here */ 1233static void mce_cpu_quirks(struct cpuinfo_x86 *c) 1234{ 1235 /* This should be disabled by the BIOS, but isn't always */ 1236 if (c->x86_vendor == X86_VENDOR_AMD) { 1237 if (c->x86 == 15 && banks > 4) { 1238 /* 1239 * disable GART TBL walk error reporting, which 1240 * trips off incorrectly with the IOMMU & 3ware 1241 * & Cerberus: 1242 */ 1243 clear_bit(10, (unsigned long *)&bank[4]); 1244 } 1245 if (c->x86 <= 17 && mce_bootlog < 0) { 1246 /* 1247 * Lots of broken BIOS around that don't clear them 1248 * by default and leave crap in there. Don't log: 1249 */ 1250 mce_bootlog = 0; 1251 } 1252 /* 1253 * Various K7s with broken bank 0 around. Always disable 1254 * by default. 1255 */ 1256 if (c->x86 == 6) 1257 bank[0] = 0; 1258 } 1259 1260 if (c->x86_vendor == X86_VENDOR_INTEL) { 1261 /* 1262 * SDM documents that on family 6 bank 0 should not be written 1263 * because it aliases to another special BIOS controlled 1264 * register. 1265 * But it's not aliased anymore on model 0x1a+ 1266 * Don't ignore bank 0 completely because there could be a 1267 * valid event later, merely don't write CTL0. 1268 */ 1269 1270 if (c->x86 == 6 && c->x86_model < 0x1A) 1271 __set_bit(0, &dont_init_banks); 1272 1273 /* 1274 * All newer Intel systems support MCE broadcasting. Enable 1275 * synchronization with a one second timeout. 1276 */ 1277 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1278 monarch_timeout < 0) 1279 monarch_timeout = USEC_PER_SEC; 1280 } 1281 if (monarch_timeout < 0) 1282 monarch_timeout = 0; 1283 if (mce_bootlog != 0) 1284 mce_panic_timeout = 30; 1285} 1286 1287static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1288{ 1289 if (c->x86 != 5) 1290 return; 1291 switch (c->x86_vendor) { 1292 case X86_VENDOR_INTEL: 1293 if (mce_p5_enabled()) 1294 intel_p5_mcheck_init(c); 1295 break; 1296 case X86_VENDOR_CENTAUR: 1297 winchip_mcheck_init(c); 1298 break; 1299 } 1300} 1301 1302static void mce_cpu_features(struct cpuinfo_x86 *c) 1303{ 1304 switch (c->x86_vendor) { 1305 case X86_VENDOR_INTEL: 1306 mce_intel_feature_init(c); 1307 break; 1308 case X86_VENDOR_AMD: 1309 mce_amd_feature_init(c); 1310 break; 1311 default: 1312 break; 1313 } 1314} 1315 1316static void mce_init_timer(void) 1317{ 1318 struct timer_list *t = &__get_cpu_var(mce_timer); 1319 int *n = &__get_cpu_var(next_interval); 1320 1321 if (mce_ignore_ce) 1322 return; 1323 1324 *n = check_interval * HZ; 1325 if (!*n) 1326 return; 1327 setup_timer(t, mcheck_timer, smp_processor_id()); 1328 t->expires = round_jiffies(jiffies + *n); 1329 add_timer(t); 1330} 1331 1332/* 1333 * Called for each booted CPU to set up machine checks. 1334 * Must be called with preempt off: 1335 */ 1336void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1337{ 1338 if (mce_disabled) 1339 return; 1340 1341 mce_ancient_init(c); 1342 1343 if (!mce_available(c)) 1344 return; 1345 1346 if (mce_cap_init() < 0) { 1347 mce_disabled = 1; 1348 return; 1349 } 1350 mce_cpu_quirks(c); 1351 1352 machine_check_vector = do_machine_check; 1353 1354 mce_init(); 1355 mce_cpu_features(c); 1356 mce_init_timer(); 1357 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1358} 1359 1360/* 1361 * Character device to read and clear the MCE log. 1362 */ 1363 1364static DEFINE_SPINLOCK(mce_state_lock); 1365static int open_count; /* #times opened */ 1366static int open_exclu; /* already open exclusive? */ 1367 1368static int mce_open(struct inode *inode, struct file *file) 1369{ 1370 spin_lock(&mce_state_lock); 1371 1372 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1373 spin_unlock(&mce_state_lock); 1374 1375 return -EBUSY; 1376 } 1377 1378 if (file->f_flags & O_EXCL) 1379 open_exclu = 1; 1380 open_count++; 1381 1382 spin_unlock(&mce_state_lock); 1383 1384 return nonseekable_open(inode, file); 1385} 1386 1387static int mce_release(struct inode *inode, struct file *file) 1388{ 1389 spin_lock(&mce_state_lock); 1390 1391 open_count--; 1392 open_exclu = 0; 1393 1394 spin_unlock(&mce_state_lock); 1395 1396 return 0; 1397} 1398 1399static void collect_tscs(void *data) 1400{ 1401 unsigned long *cpu_tsc = (unsigned long *)data; 1402 1403 rdtscll(cpu_tsc[smp_processor_id()]); 1404} 1405 1406static DEFINE_MUTEX(mce_read_mutex); 1407 1408static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1409 loff_t *off) 1410{ 1411 char __user *buf = ubuf; 1412 unsigned long *cpu_tsc; 1413 unsigned prev, next; 1414 int i, err; 1415 1416 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1417 if (!cpu_tsc) 1418 return -ENOMEM; 1419 1420 mutex_lock(&mce_read_mutex); 1421 next = rcu_dereference(mcelog.next); 1422 1423 /* Only supports full reads right now */ 1424 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1425 mutex_unlock(&mce_read_mutex); 1426 kfree(cpu_tsc); 1427 1428 return -EINVAL; 1429 } 1430 1431 err = 0; 1432 prev = 0; 1433 do { 1434 for (i = prev; i < next; i++) { 1435 unsigned long start = jiffies; 1436 1437 while (!mcelog.entry[i].finished) { 1438 if (time_after_eq(jiffies, start + 2)) { 1439 memset(mcelog.entry + i, 0, 1440 sizeof(struct mce)); 1441 goto timeout; 1442 } 1443 cpu_relax(); 1444 } 1445 smp_rmb(); 1446 err |= copy_to_user(buf, mcelog.entry + i, 1447 sizeof(struct mce)); 1448 buf += sizeof(struct mce); 1449timeout: 1450 ; 1451 } 1452 1453 memset(mcelog.entry + prev, 0, 1454 (next - prev) * sizeof(struct mce)); 1455 prev = next; 1456 next = cmpxchg(&mcelog.next, prev, 0); 1457 } while (next != prev); 1458 1459 synchronize_sched(); 1460 1461 /* 1462 * Collect entries that were still getting written before the 1463 * synchronize. 1464 */ 1465 on_each_cpu(collect_tscs, cpu_tsc, 1); 1466 1467 for (i = next; i < MCE_LOG_LEN; i++) { 1468 if (mcelog.entry[i].finished && 1469 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1470 err |= copy_to_user(buf, mcelog.entry+i, 1471 sizeof(struct mce)); 1472 smp_rmb(); 1473 buf += sizeof(struct mce); 1474 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1475 } 1476 } 1477 mutex_unlock(&mce_read_mutex); 1478 kfree(cpu_tsc); 1479 1480 return err ? -EFAULT : buf - ubuf; 1481} 1482 1483static unsigned int mce_poll(struct file *file, poll_table *wait) 1484{ 1485 poll_wait(file, &mce_wait, wait); 1486 if (rcu_dereference(mcelog.next)) 1487 return POLLIN | POLLRDNORM; 1488 return 0; 1489} 1490 1491static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1492{ 1493 int __user *p = (int __user *)arg; 1494 1495 if (!capable(CAP_SYS_ADMIN)) 1496 return -EPERM; 1497 1498 switch (cmd) { 1499 case MCE_GET_RECORD_LEN: 1500 return put_user(sizeof(struct mce), p); 1501 case MCE_GET_LOG_LEN: 1502 return put_user(MCE_LOG_LEN, p); 1503 case MCE_GETCLEAR_FLAGS: { 1504 unsigned flags; 1505 1506 do { 1507 flags = mcelog.flags; 1508 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1509 1510 return put_user(flags, p); 1511 } 1512 default: 1513 return -ENOTTY; 1514 } 1515} 1516 1517/* Modified in mce-inject.c, so not static or const */ 1518struct file_operations mce_chrdev_ops = { 1519 .open = mce_open, 1520 .release = mce_release, 1521 .read = mce_read, 1522 .poll = mce_poll, 1523 .unlocked_ioctl = mce_ioctl, 1524}; 1525EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1526 1527static struct miscdevice mce_log_device = { 1528 MISC_MCELOG_MINOR, 1529 "mcelog", 1530 &mce_chrdev_ops, 1531}; 1532 1533/* 1534 * mce=off Disables machine check 1535 * mce=no_cmci Disables CMCI 1536 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1537 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1538 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1539 * monarchtimeout is how long to wait for other CPUs on machine 1540 * check, or 0 to not wait 1541 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1542 * mce=nobootlog Don't log MCEs from before booting. 1543 */ 1544static int __init mcheck_enable(char *str) 1545{ 1546 if (*str == 0) 1547 enable_p5_mce(); 1548 if (*str == '=') 1549 str++; 1550 if (!strcmp(str, "off")) 1551 mce_disabled = 1; 1552 else if (!strcmp(str, "no_cmci")) 1553 mce_cmci_disabled = 1; 1554 else if (!strcmp(str, "dont_log_ce")) 1555 mce_dont_log_ce = 1; 1556 else if (!strcmp(str, "ignore_ce")) 1557 mce_ignore_ce = 1; 1558 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1559 mce_bootlog = (str[0] == 'b'); 1560 else if (isdigit(str[0])) { 1561 get_option(&str, &tolerant); 1562 if (*str == ',') { 1563 ++str; 1564 get_option(&str, &monarch_timeout); 1565 } 1566 } else { 1567 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1568 str); 1569 return 0; 1570 } 1571 return 1; 1572} 1573__setup("mce", mcheck_enable); 1574 1575/* 1576 * Sysfs support 1577 */ 1578 1579/* 1580 * Disable machine checks on suspend and shutdown. We can't really handle 1581 * them later. 1582 */ 1583static int mce_disable(void) 1584{ 1585 int i; 1586 1587 for (i = 0; i < banks; i++) { 1588 if (!skip_bank_init(i)) 1589 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1590 } 1591 return 0; 1592} 1593 1594static int mce_suspend(struct sys_device *dev, pm_message_t state) 1595{ 1596 return mce_disable(); 1597} 1598 1599static int mce_shutdown(struct sys_device *dev) 1600{ 1601 return mce_disable(); 1602} 1603 1604/* 1605 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1606 * Only one CPU is active at this time, the others get re-added later using 1607 * CPU hotplug: 1608 */ 1609static int mce_resume(struct sys_device *dev) 1610{ 1611 mce_init(); 1612 mce_cpu_features(¤t_cpu_data); 1613 1614 return 0; 1615} 1616 1617static void mce_cpu_restart(void *data) 1618{ 1619 del_timer_sync(&__get_cpu_var(mce_timer)); 1620 if (!mce_available(¤t_cpu_data)) 1621 return; 1622 mce_init(); 1623 mce_init_timer(); 1624} 1625 1626/* Reinit MCEs after user configuration changes */ 1627static void mce_restart(void) 1628{ 1629 on_each_cpu(mce_cpu_restart, NULL, 1); 1630} 1631 1632static struct sysdev_class mce_sysclass = { 1633 .suspend = mce_suspend, 1634 .shutdown = mce_shutdown, 1635 .resume = mce_resume, 1636 .name = "machinecheck", 1637}; 1638 1639DEFINE_PER_CPU(struct sys_device, mce_dev); 1640 1641__cpuinitdata 1642void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1643 1644static struct sysdev_attribute *bank_attrs; 1645 1646static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1647 char *buf) 1648{ 1649 u64 b = bank[attr - bank_attrs]; 1650 1651 return sprintf(buf, "%llx\n", b); 1652} 1653 1654static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1655 const char *buf, size_t size) 1656{ 1657 u64 new; 1658 1659 if (strict_strtoull(buf, 0, &new) < 0) 1660 return -EINVAL; 1661 1662 bank[attr - bank_attrs] = new; 1663 mce_restart(); 1664 1665 return size; 1666} 1667 1668static ssize_t 1669show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1670{ 1671 strcpy(buf, trigger); 1672 strcat(buf, "\n"); 1673 return strlen(trigger) + 1; 1674} 1675 1676static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1677 const char *buf, size_t siz) 1678{ 1679 char *p; 1680 int len; 1681 1682 strncpy(trigger, buf, sizeof(trigger)); 1683 trigger[sizeof(trigger)-1] = 0; 1684 len = strlen(trigger); 1685 p = strchr(trigger, '\n'); 1686 1687 if (*p) 1688 *p = 0; 1689 1690 return len; 1691} 1692 1693static ssize_t store_int_with_restart(struct sys_device *s, 1694 struct sysdev_attribute *attr, 1695 const char *buf, size_t size) 1696{ 1697 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1698 mce_restart(); 1699 return ret; 1700} 1701 1702static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1703static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1704static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1705 1706static struct sysdev_ext_attribute attr_check_interval = { 1707 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1708 store_int_with_restart), 1709 &check_interval 1710}; 1711 1712static struct sysdev_attribute *mce_attrs[] = { 1713 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1714 &attr_monarch_timeout.attr, 1715 NULL 1716}; 1717 1718static cpumask_var_t mce_dev_initialized; 1719 1720/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1721static __cpuinit int mce_create_device(unsigned int cpu) 1722{ 1723 int err; 1724 int i; 1725 1726 if (!mce_available(&boot_cpu_data)) 1727 return -EIO; 1728 1729 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1730 per_cpu(mce_dev, cpu).id = cpu; 1731 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1732 1733 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1734 if (err) 1735 return err; 1736 1737 for (i = 0; mce_attrs[i]; i++) { 1738 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1739 if (err) 1740 goto error; 1741 } 1742 for (i = 0; i < banks; i++) { 1743 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1744 &bank_attrs[i]); 1745 if (err) 1746 goto error2; 1747 } 1748 cpumask_set_cpu(cpu, mce_dev_initialized); 1749 1750 return 0; 1751error2: 1752 while (--i >= 0) 1753 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1754error: 1755 while (--i >= 0) 1756 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1757 1758 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1759 1760 return err; 1761} 1762 1763static __cpuinit void mce_remove_device(unsigned int cpu) 1764{ 1765 int i; 1766 1767 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1768 return; 1769 1770 for (i = 0; mce_attrs[i]; i++) 1771 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1772 1773 for (i = 0; i < banks; i++) 1774 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1775 1776 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1777 cpumask_clear_cpu(cpu, mce_dev_initialized); 1778} 1779 1780/* Make sure there are no machine checks on offlined CPUs. */ 1781static void mce_disable_cpu(void *h) 1782{ 1783 unsigned long action = *(unsigned long *)h; 1784 int i; 1785 1786 if (!mce_available(¤t_cpu_data)) 1787 return; 1788 if (!(action & CPU_TASKS_FROZEN)) 1789 cmci_clear(); 1790 for (i = 0; i < banks; i++) { 1791 if (!skip_bank_init(i)) 1792 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1793 } 1794} 1795 1796static void mce_reenable_cpu(void *h) 1797{ 1798 unsigned long action = *(unsigned long *)h; 1799 int i; 1800 1801 if (!mce_available(¤t_cpu_data)) 1802 return; 1803 1804 if (!(action & CPU_TASKS_FROZEN)) 1805 cmci_reenable(); 1806 for (i = 0; i < banks; i++) { 1807 if (!skip_bank_init(i)) 1808 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1809 } 1810} 1811 1812/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1813static int __cpuinit 1814mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1815{ 1816 unsigned int cpu = (unsigned long)hcpu; 1817 struct timer_list *t = &per_cpu(mce_timer, cpu); 1818 1819 switch (action) { 1820 case CPU_ONLINE: 1821 case CPU_ONLINE_FROZEN: 1822 mce_create_device(cpu); 1823 if (threshold_cpu_callback) 1824 threshold_cpu_callback(action, cpu); 1825 break; 1826 case CPU_DEAD: 1827 case CPU_DEAD_FROZEN: 1828 if (threshold_cpu_callback) 1829 threshold_cpu_callback(action, cpu); 1830 mce_remove_device(cpu); 1831 break; 1832 case CPU_DOWN_PREPARE: 1833 case CPU_DOWN_PREPARE_FROZEN: 1834 del_timer_sync(t); 1835 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1836 break; 1837 case CPU_DOWN_FAILED: 1838 case CPU_DOWN_FAILED_FROZEN: 1839 t->expires = round_jiffies(jiffies + 1840 __get_cpu_var(next_interval)); 1841 add_timer_on(t, cpu); 1842 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1843 break; 1844 case CPU_POST_DEAD: 1845 /* intentionally ignoring frozen here */ 1846 cmci_rediscover(cpu); 1847 break; 1848 } 1849 return NOTIFY_OK; 1850} 1851 1852static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1853 .notifier_call = mce_cpu_callback, 1854}; 1855 1856static __init int mce_init_banks(void) 1857{ 1858 int i; 1859 1860 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1861 GFP_KERNEL); 1862 if (!bank_attrs) 1863 return -ENOMEM; 1864 1865 for (i = 0; i < banks; i++) { 1866 struct sysdev_attribute *a = &bank_attrs[i]; 1867 1868 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1869 if (!a->attr.name) 1870 goto nomem; 1871 1872 a->attr.mode = 0644; 1873 a->show = show_bank; 1874 a->store = set_bank; 1875 } 1876 return 0; 1877 1878nomem: 1879 while (--i >= 0) 1880 kfree(bank_attrs[i].attr.name); 1881 kfree(bank_attrs); 1882 bank_attrs = NULL; 1883 1884 return -ENOMEM; 1885} 1886 1887static __init int mce_init_device(void) 1888{ 1889 int err; 1890 int i = 0; 1891 1892 if (!mce_available(&boot_cpu_data)) 1893 return -EIO; 1894 1895 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1896 1897 err = mce_init_banks(); 1898 if (err) 1899 return err; 1900 1901 err = sysdev_class_register(&mce_sysclass); 1902 if (err) 1903 return err; 1904 1905 for_each_online_cpu(i) { 1906 err = mce_create_device(i); 1907 if (err) 1908 return err; 1909 } 1910 1911 register_hotcpu_notifier(&mce_cpu_notifier); 1912 misc_register(&mce_log_device); 1913 1914 return err; 1915} 1916 1917device_initcall(mce_init_device); 1918 1919#else /* CONFIG_X86_OLD_MCE: */ 1920 1921int nr_mce_banks; 1922EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1923 1924/* This has to be run for each processor */ 1925void mcheck_init(struct cpuinfo_x86 *c) 1926{ 1927 if (mce_disabled == 1) 1928 return; 1929 1930 switch (c->x86_vendor) { 1931 case X86_VENDOR_AMD: 1932 amd_mcheck_init(c); 1933 break; 1934 1935 case X86_VENDOR_INTEL: 1936 if (c->x86 == 5) 1937 intel_p5_mcheck_init(c); 1938 if (c->x86 == 6) 1939 intel_p6_mcheck_init(c); 1940 if (c->x86 == 15) 1941 intel_p4_mcheck_init(c); 1942 break; 1943 1944 case X86_VENDOR_CENTAUR: 1945 if (c->x86 == 5) 1946 winchip_mcheck_init(c); 1947 break; 1948 1949 default: 1950 break; 1951 } 1952 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1953} 1954 1955static int __init mcheck_enable(char *str) 1956{ 1957 mce_disabled = -1; 1958 return 1; 1959} 1960 1961__setup("mce", mcheck_enable); 1962 1963#endif /* CONFIG_X86_OLD_MCE */ 1964 1965/* 1966 * Old style boot options parsing. Only for compatibility. 1967 */ 1968static int __init mcheck_disable(char *str) 1969{ 1970 mce_disabled = 1; 1971 return 1; 1972} 1973__setup("nomce", mcheck_disable); 1974