mce.c revision 1020bcbcc7da36001d9226c5d57e999949cb80c5
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled __read_mostly;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
static u64 *bank __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
static int mce_panic_timeout __read_mostly;
static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static unsigned long dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;


/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}
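/*
 * Note on the logging protocol above: a writer claims slot 'entry' by
 * advancing mcelog.next with cmpxchg(); a CPU that loses the race simply
 * rescans from the new 'next'. Readers treat a record as valid only once
 * its 'finished' flag is set, which happens after the memcpy() and is
 * ordered by the surrounding wmb() calls.
 */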
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid,
	       m->apicid);
}

static void print_mce_head(void)
{
	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	printk(KERN_EMERG "This is not a software problem!\n"
	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_add_return(1, &mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);
	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
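/*
 * Note: the MSR_IA32_MC0_* + bank*4 arithmetic above (and throughout this
 * file) relies on the architectural MCA layout, where every bank owns a
 * block of four consecutive MSRs -- CTL, STATUS, ADDR, MISC -- starting
 * at MSR_IA32_MC0_CTL.
 */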
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}
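/*
 * Ring invariants, for the record: start == end means empty, and one
 * slot is always left unused so that (end + 1) % MCE_RING_SIZE == start
 * unambiguously means full. The wmb() in mce_ring_add() publishes the
 * PFN before the new 'end', so the reader never sees an unwritten slot.
 */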
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler, so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * We must also let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_add_return(1, &mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}
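/*
 * Rendezvous summary: mce_callin hands out the entry order (the first
 * caller, order == 1, becomes Monarch), mce_executing serializes the
 * bank scans in that order, and global_nwo accumulates every CPU's
 * local no_way_out vote so that all of them reach the same verdict.
 */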
/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}
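/*
 * Decoding note for the checks above: in the MCi_MISC register bits 5:0
 * give the least significant valid bit of the recorded address (i.e. its
 * granularity) and bits 8:6 give the address mode; MCM_ADDR_PHYS means a
 * physical address, the only mode the recovery code can act on.
 */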
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * When there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_ring_add fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}
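/*
 * Worked example for the adaptive interval above, assuming HZ=1000 and
 * the default check_interval of 300s: the poll period starts at 300000
 * jiffies, halves towards a floor of HZ/100 (10 jiffies) while events
 * keep being found, and doubles back up to the 300000 jiffies ceiling
 * once the machine is quiet again.
 */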
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}
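/*
 * MCG_CAP layout, as consumed above: bits 7:0 (MCG_BANKCNT_MASK) hold the
 * bank count, MCG_CTL_P says the MCG_CTL MSR exists, MCG_EXT_P plus an
 * extended register count >= 9 implies the MCG_EIP MSR is available for
 * accurate RIP reporting, and MCG_SER_P announces software error recovery
 * support.
 */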
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
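/*
 * Example command lines for the parser above: "mce=off" disables machine
 * checks entirely, while "mce=2,500000" sets tolerant to 2 and waits up
 * to 500000 usec for the other CPUs during a machine check rendezvous.
 */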
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	len = strlen(mce_helper);
	p = strchr(mce_helper, '\n');

	/* Check the pointer, not *p: strchr returns NULL if no newline */
	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	&attr_monarch_timeout.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}
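/*
 * Note: mce_disable_cpu/mce_reenable_cpu run via IPI on the target CPU.
 * CPU_TASKS_FROZEN is checked so that suspend/resume, which freezes
 * tasks, does not tear down and rediscover CMCI bank ownership on every
 * transition.
 */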
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);