mce.c revision fb2531953fd8855abdcf458459020fd382c5deca
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant		__read_mostly = 1;
static int banks		__read_mostly;
static int rip_msr		__read_mostly;
static int mce_bootlog		__read_mostly = -1;
static int monarch_timeout	__read_mostly = -1;
static int mce_panic_timeout	__read_mostly;
static int mce_dont_log_ce	__read_mostly;
int mce_cmci_disabled		__read_mostly;
int mce_ignore_ce		__read_mostly;
int mce_ser			__read_mostly;

struct mce_bank *mce_banks	__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);

static int default_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");

	return NOTIFY_STOP;
}

static struct notifier_block mce_dec_nb = {
	.notifier_call = default_decode_mce,
	.priority      = -1,
};

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do the initial setup of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from ordinary kernel messages to avoid bogus bug
 * reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

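/*
 * In short, mce_log() uses a reserve-then-publish scheme: a slot is claimed
 * by advancing mcelog.next with cmpxchg(), data is copied in, and only then
 * is ->finished set, so readers never observe a half-written record:
 *
 *	entry = rcu_dereference(mcelog.next);		(find a free slot)
 *	cmpxchg(&mcelog.next, entry, entry + 1);	(claim it atomically)
 *	memcpy(...); wmb();				(fill it in)
 *	mcelog.entry[entry].finished = 1;		(publish)
 *
 * This stays safe in NMI context because no locks are ever taken.
 */
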
static void print_mce(struct mce *m)
{
	pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg("RIP%s %02x:<%016Lx> ",
			 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			 m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg("TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);

	/*
	 * Print out human-readable details about the MCE error
	 * (if the CPU has an implementation for that):
	 */
	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}

static void print_mce_head(void)
{
	pr_emerg("\nHARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	pr_emerg("This is not a software problem!\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

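/*
 * To illustrate the redirection above: while the per-CPU 'injectm' record is
 * marked finished, every MCE MSR access in this file is served from that
 * struct mce instead of real hardware. An injector (mce-inject.c) can fill
 * injectm and then call into the normal handler paths, roughly (sketch):
 *
 *	__get_cpu_var(injectm) = *fake;		// fake->finished == 1
 *	status = mce_rdmsrl(MSR_IA32_MCx_STATUS(fake->bank));
 *	// returns fake->status, no rdmsr is executed
 *
 * MSRs without a mapping in msr_to_offset() simply read back as 0.
 */
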
/*
 * Simple lockless ring to communicate PFNs from the exception handler to the
 * process-context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

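/*
 * Why "one entry less": the ring counts as full when advancing 'end' would
 * make it equal 'start', so at most MCE_RING_SIZE - 1 = 15 PFNs can be
 * queued. Keeping one slot unused is what lets a single reader and a single
 * writer run without any lock -- 'end' is only written by the producer (MCE
 * context) and 'start' only by the consumer (workqueue):
 *
 *	start == end			-> empty
 *	(end + 1) % 16 == start		-> full, mce_ring_add() returns -1
 */
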
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts-off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without an APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

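/*
 * The mcp_flags steer what the poller touches: MCP_TIMESTAMP keeps the TSC
 * in the logged record, MCP_UC also consumes uncorrected events, and
 * MCP_DONTLOG clears banks without logging them. mce_init() below combines
 * them to flush leftovers from before the last reset, roughly:
 *
 *	machine_check_poll(MCP_UC | (!mce_bootlog ? MCP_DONTLOG : 0),
 *			   &all_banks);
 */
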
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

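/*
 * Timeout bookkeeping for the spin loops below: the budget starts as
 * monarch_timeout (microseconds) converted to nanoseconds, and every
 * ndelay(SPINUNIT) spin subtracts SPINUNIT (100ns). For the one second
 * default set on newer Intel systems that is 10^9 / 100 = 10^7 iterations
 * before mce_timed_out() declares a CPU missing (or panics for
 * tolerant == 0).
 */
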
/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * We must also let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it lets them execute,
 * one by one, in the original callin order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way any shared banks are seen by only one CPU
		 * before being cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

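/*
 * A worked example of the handshake with three CPUs taking the exception:
 *
 *	CPU A: order = atomic_inc_return(&mce_callin) -> 1	(Monarch)
 *	CPU B: order = 2, CPU C: order = 3			(Subjects)
 *
 * All spin until mce_callin == num_online_cpus(). The Monarch then sets
 * mce_executing = 1 and scans its banks; each Subject spins until
 * mce_executing reaches its own order (bumped by atomic_inc() in mce_end()
 * below), so a bank shared between two CPUs is cleared by the first scanner
 * and skipped by the second.
 */
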
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

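/*
 * The MCi_MISC layout this relies on: bits 5:0 give the least significant
 * valid bit of the address (LSB), bits 8:6 the address mode. For example,
 * misc = 0x86 decodes as LSB = 6 (a 64-byte cache line) and mode = 2
 * (MCM_ADDR_PHYS), so the ADDR register holds a usable physical address.
 * Anything coarser than a page (LSB > PAGE_SHIFT) or non-physical is
 * rejected above.
 */
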
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signaled are
		 * handled by machine_check_poll. Leave them alone,
		 * unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was for a corrected error handled
		 * by the poller, don't touch it here, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

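/*
 * Interval adaptation in numbers (HZ = 1000 assumed for the example): the
 * per-CPU interval starts at check_interval * HZ = 300000 jiffies. Each
 * poll that logged something halves it, bounded below by HZ/100 = 10
 * jiffies (10ms); each quiet poll doubles it, bounded above by the
 * (jiffy-rounded) five minute default again.
 */
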
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %u MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);

	if (raw_smp_processor_id() == 0)
		atomic_notifier_chain_register(&x86_mce_decoder_chain,
					       &mce_dec_nb);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

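/*
 * A hypothetical user-space consumer of the interface above (mcelog does
 * essentially this): open /dev/mcelog, ask for the record and log sizes via
 * the ioctls defined below, then issue one full-sized read:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recl, logl;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recl);
 *	ioctl(fd, MCE_GET_LOG_LEN, &logl);
 *	n = read(fd, buf, recl * logl);	/* shorter reads fail with -EINVAL */
 *
 * Reading also clears the consumed records, which is why mce_read() holds
 * mce_read_mutex for the whole pass.
 */
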
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open		= mce_open,
	.release	= mce_release,
	.read		= mce_read,
	.poll		= mce_poll,
	.unlocked_ioctl	= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

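/*
 * Example command lines accepted by the parser above (illustrative only):
 *
 *	mce=off			turn the whole subsystem off
 *	mce=2			tolerant level 2, default monarch timeout
 *	mce=1,500000		tolerant 1, wait 500000us for other CPUs
 *	mce=nobootlog		skip errors left over from before boot
 *
 * monarch_timeout is given in microseconds, matching the USEC_PER_SEC
 * default set in mce_cpu_quirks() for newer Intel CPUs.
 */
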
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

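/*
 * These attributes appear per CPU in sysfs; with the sysdev class name
 * "machinecheck" above, the usual paths look like (assuming a standard
 * /sys mount):
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/bank0 ... bankN
 *
 * e.g. "echo 0 > .../bank4" masks bank 4 and triggers mce_restart().
 * Note that tolerant, check_interval, etc. back global variables, so a
 * write through any one CPU's directory affects all CPUs.
 */
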
static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	/* unwind the plain attribute files created above */
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(mce_next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mce_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mce_debugfs_init);
#endif
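
/*
 * With CONFIG_DEBUG_FS and debugfs mounted in the usual place, the knob
 * above lets the panic paths be exercised without killing the box, e.g.:
 *
 *	echo 1 > /sys/kernel/debug/mce/fake_panic
 *	(then inject an error; mce_panic() only prints "Fake kernel panic")
 *
 * fake_panic_set() also calls mce_reset() so the Monarch/Subject counters
 * start from a clean state before each experiment.
 */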