mce.c revision a07e4156a2ee6359d31a44946d7ee7f85dbf6bca
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant		__read_mostly = 1;
static int banks		__read_mostly;
static int rip_msr		__read_mostly;
static int mce_bootlog		__read_mostly = -1;
static int monarch_timeout	__read_mostly = -1;
static int mce_panic_timeout	__read_mostly;
static int mce_dont_log_ce	__read_mostly;
int mce_cmci_disabled		__read_mostly;
int mce_ignore_ce		__read_mostly;
int mce_ser			__read_mostly;

struct mce_bank *mce_banks	__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);

static int default_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");

	return NOTIFY_STOP;
}

static struct notifier_block mce_dec_nb = {
	.notifier_call = default_decode_mce,
	.priority      = -1,
};

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};
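/*
 * mce_log() can be called from NMI/exception context on several CPUs at
 * once, so the buffer is reserved slot by slot with cmpxchg() on
 * mcelog.next instead of taking a lock:
 *
 *   1. Read mcelog.next and skip over stale entries whose 'finished'
 *      flag is still set.
 *   2. Claim the slot by advancing mcelog.next with cmpxchg(); if that
 *      fails another writer won the race, so retry from step 1.
 *   3. Copy the record, then set entry->finished = 1 behind a write
 *      barrier so readers never see a half-written record.
 *
 * Readers (mce_read() below) only consume entries with 'finished' set.
 */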
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void print_mce(struct mce *m)
{
	pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg("RIP%s %02x:<%016Lx> ",
			 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			 m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg("TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}

static void print_mce_head(void)
{
	pr_emerg("\nHARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	pr_emerg("This is not a software problem!\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */
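/*
 * The error injector (mce-inject) sets injectm.finished on a CPU and then
 * runs the normal handler on it. While injectm.finished is set, the
 * mce_rdmsrl()/mce_wrmsrl() wrappers below redirect the MCA MSR accesses
 * into the per-CPU 'injectm' fields via msr_to_offset(), so a fake error
 * can be fed through the real decoding paths without touching hardware
 * MSRs.
 */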
static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Simple lockless ring to communicate PFNs from the exception handler to the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */
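/*
 * One slot is kept permanently empty so that full and empty are
 * distinguishable without a count: start == end means empty, while
 * (end + 1) % MCE_RING_SIZE == start means full. The writer only
 * advances 'end' (after a wmb() so the PFN is visible first) and the
 * reader only advances 'start', so no further synchronization is
 * needed between the two sides.
 */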
struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts-off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also we must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
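/*
 * The handshake in mce_start()/mce_end() works roughly like this:
 *
 *   1. Every CPU adds its local no_way_out to global_nwo and takes a
 *      ticket from mce_callin; the CPU that draws ticket 1 is Monarch.
 *   2. All CPUs spin until mce_callin reaches num_online_cpus().
 *   3. The Monarch scans its banks first; each Subject spins until
 *      mce_executing reaches its own ticket, so shared banks are
 *      scanned by exactly one CPU before being cleared.
 *   4. In mce_end() the Monarch waits for everyone, runs mce_reign()
 *      to grade all the errors, and finally resets the global state.
 *
 * Any timeout falls back to local decisions (see mce_timed_out()).
 */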
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way, when there are any shared banks, they are
		 * seen by only one CPU before being cleared, avoiding
		 * duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
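/*
 * With MCG_SER_P the MCi_MISC register encodes, per the SDM: bits 5:0
 * hold the position of the least significant valid address bit (the
 * recoverable granularity), and bits 8:6 hold the address mode, where
 * MCM_ADDR_PHYS means MCi_ADDR is a physical address. We only act on
 * page-or-finer physical addresses, hence the two range checks below.
 */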
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non-uncorrected or non-signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was for a corrected error, leave
		 * it to the corrected-error handler, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) MSR.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
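/*
 * Example with HZ=1000 and the default check_interval of 300 seconds:
 * the per-CPU interval halves on each hit down to a floor of HZ/100
 * (10ms) and doubles on each miss back up to the 300s ceiling.
 */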
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mce_start_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	setup_timer(t, mce_start_timer, smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	__mcheck_cpu_ancient_init(c);

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);
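/*
 * Reading drains the buffer in two passes: first all records below
 * mcelog.next are copied out and mcelog.next is swung back to 0 with
 * cmpxchg() (repeating if new records raced in); then, after a
 * synchronize_sched() and a TSC snapshot on every CPU, records that
 * were still being written during the first pass are picked up too.
 */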
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
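/*
 * For example, booting with "mce=2,500" sets tolerant to 2 and waits up
 * to 500 usecs for the other CPUs during a machine check, while "mce=off"
 * disables the machine check code entirely.
 */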
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);

	mcheck_intel_therm_init();

	return 0;
}

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable_error_reporting();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
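/*
 * The attributes below appear under
 * /sys/devices/system/machinecheck/machinecheckN/ for each CPU N.
 * Each bank also gets a bankM file holding its MCi_CTL mask; e.g.
 * writing 0 to bank4 disables reporting from bank 4 after the
 * mce_restart() triggered by the write.
 */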
static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
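/*
 * ONLINE registers the sysdev and the vendor threshold callback, DEAD
 * tears them down, DOWN_PREPARE stops the poll timer and zeroes the bank
 * controls, DOWN_FAILED re-arms both, and POST_DEAD lets CMCI banks
 * owned by the dead CPU be rediscovered by a surviving one.
 */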
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					__get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif