mce.c revision 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant		__read_mostly = 1;
static int banks		__read_mostly;
static int rip_msr		__read_mostly;
static int mce_bootlog		__read_mostly = -1;
static int monarch_timeout	__read_mostly = -1;
static int mce_panic_timeout	__read_mostly;
static int mce_dont_log_ce	__read_mostly;
int mce_cmci_disabled		__read_mostly;
int mce_ignore_ce		__read_mostly;
int mce_ser			__read_mostly;

struct mce_bank *mce_banks __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);

static int default_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");

	return NOTIFY_STOP;
}

static struct notifier_block mce_dec_nb = {
	.notifier_call = default_decode_mce,
	.priority      = -1,
};

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do the initial setup of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {
			/*
			 * If edac_mce is enabled, it will check the error
			 * type and process it if it is a known error.
			 * Otherwise the error will be sent through the
			 * mcelog interface.
			 */
			if (edac_mce_parse(mce))
				return;

			/*
			 * When the buffer fills up, discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}
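/*
 * Illustrative note (added commentary, not from the original source):
 * the loop above is a lockless multi-writer reservation. Each writer,
 * possibly in NMI context, snapshots mcelog.next, skips finished
 * leftovers, and claims a slot with cmpxchg(); losing writers retry.
 * Ordering matters: the record is copied before ->finished is set, so
 * the reader side (mce_read) only trusts entries whose ->finished flag
 * is visible. Roughly, assuming a fixed-size log:
 *
 *	do {
 *		entry = log.next;
 *	} while (cmpxchg(&log.next, entry, entry + 1) != entry);
 *	log.entry[entry] = *record;
 *	wmb();
 *	log.entry[entry].finished = 1;
 */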
static void print_mce(struct mce *m)
{
	pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg("RIP%s %02x:<%016Lx> ",
			 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			 m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg("TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);

	/*
	 * Print out human-readable details about the MCE error
	 * (if the CPU has an implementation for that).
	 */
	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}

static void print_mce_head(void)
{
	pr_emerg("\nHARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	pr_emerg("This is not a software problem!\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}
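/*
 * Illustrative note (added commentary): msr_to_offset() is what lets the
 * software error-injection path replace real MSR accesses. An injector
 * fills the per-CPU 'injectm' record and sets injectm.finished, after
 * which the wrappers below transparently operate on the corresponding
 * 'struct mce' field instead of the hardware register. E.g., assuming a
 * fake record m on the target CPU:
 *
 *	m.status = MCI_STATUS_VAL | MCI_STATUS_UC;
 *	m.finished = 1;
 *	per_cpu(injectm, cpu) = m;
 *	  -> mce_rdmsrl(MSR_IA32_MCx_STATUS(m.bank)) now returns m.status
 */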
/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Simple lockless ring to communicate PFNs between the exception handler
 * and the process context work function. This is vastly simplified because
 * there's only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}
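/*
 * Illustrative note (added commentary): the ring above is a classic
 * single-producer/single-consumer queue. 'end' is only advanced by the
 * MCE-context producer and 'start' only by the work-queue consumer, so
 * no atomics are needed; one slot is sacrificed to tell full from empty:
 *
 *	empty:	start == end
 *	full:	(end + 1) % MCE_RING_SIZE == start
 *
 * The wmb() in mce_ring_add() publishes the PFN before the new 'end'
 * index becomes visible to the consumer.
 */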
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts-off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non-idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it's likely the CPU
 * will not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	percpu_inc(mce_poll_count);

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the
		 * exception handler when it is enabled, so don't process
		 * those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
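/*
 * Illustrative note (added commentary): machine_check_poll() has two
 * callers in this file with different flag sets. The periodic timer
 * passes MCP_TIMESTAMP (keep the TSC in the record):
 *
 *	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));
 *
 * while boot-time leftover collection in __mcheck_cpu_init_generic()
 * passes MCP_UC (also take uncorrected events, since no exception will
 * ever be raised for leftovers), plus MCP_DONTLOG when bootlog is off.
 */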
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until 'mce_executing' equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
/*
 * The Monarch's reign. The Monarch is the CPU which entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. If any of the
 * errors is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a
 * semi-stable state and won't corrupt anything by itself. It's ok to
 * let the others continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU which detected it handle it.
	 * We also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires a panic. The CPUs then execute
 * their scanning loops in the original callin order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way, when there are shared banks, an event is
		 * seen by only one CPU before being cleared, avoiding
		 * duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after the main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for the Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}
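/*
 * Illustrative sketch (added commentary): timeline of the Monarch
 * rendezvous implemented by mce_start()/mce_end(), assuming N online
 * CPUs and no timeouts:
 *
 *	all CPUs:	atomic_add(no_way_out, &global_nwo)
 *			order = atomic_inc_return(&mce_callin)
 *			spin until mce_callin == N
 *	Monarch (1):	mce_executing = 1; scans its own banks
 *	Subject (k):	spins until mce_executing >= k, then scans;
 *			the atomic_inc in mce_end() is what releases
 *			the next Subject in callin order
 *	mce_end():	Monarch spins until mce_executing > N, runs
 *			mce_reign(), then resets global_nwo, mce_callin
 *			and finally mce_executing, which releases the
 *			Subjects spinning on mce_executing != 0
 */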
/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for
 * now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}
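/*
 * Illustrative note (added commentary; field layout per the SDM
 * definitions this kernel uses): bits 5:0 of MCi_MISC are the least
 * significant valid bit of the recorded address (a value <= PAGE_SHIFT
 * means the address is at least page-granular) and bits 8:6 are the
 * address mode, where MCM_ADDR_PHYS denotes a physical address.
 * Example: misc = 0x86 decodes to LSB 6 and mode 2, i.e. a physical
 * address valid to 64-byte granularity.
 */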
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	percpu_inc(mce_exception_count);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non-uncorrected or non-signalled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the event is one the corrected-error poll handler
		 * should deal with, don't touch it here, unless we're
		 * panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only the local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
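/*
 * Illustrative sketch (added commentary): control flow of
 * do_machine_check() in the common broadcast case:
 *
 *	1. mce_start()	- rendezvous with the other CPUs, aggregate the
 *			  preliminary no_way_out votes
 *	2. bank loop	- read MCi_STATUS, grade with mce_severity(),
 *			  queue AO pfns, log, track the worst event
 *	3. mce_end()	- the Monarch grades all mces_seen and panics on
 *			  a fatal result (mce_reign())
 *	4. local policy	- panic if no_way_out, SIGBUS if kill_it, and
 *			  set TIF_MCE_NOTIFY for process-context follow-up
 */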
/* Dummy to break the dependency; the actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) MSR.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mce_start_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}
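/*
 * Illustrative note (added commentary): the interval adaptation above is
 * an exponential backoff in both directions. With HZ=1000 and the default
 * check_interval of 300s, a CPU that keeps logging events halves the
 * interval each round (300s -> 150s -> ...) down to the floor of HZ/100
 * jiffies (10ms); once the machine is quiet the interval doubles back up
 * to the ceiling round_jiffies_relative(check_interval * HZ).
 */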
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}
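/*
 * Illustrative note (added commentary; bit positions per the MCG_CAP
 * definitions this kernel uses): bits 7:0 (MCG_BANKCNT_MASK) give the
 * bank count, bit 9 (MCG_EXT_P) signals extended state registers (with
 * the count in MCG_EXT_CNT; a count >= 9 means MCG_EIP is present), and
 * bit 24 (MCG_SER_P) signals software error recovery support, which
 * switches the handlers above to grading on MCI_STATUS_S/MCI_STATUS_AR.
 */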
static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * Disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * it by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * The SDM documents that on family 6 bank 0 should not be
		 * written because it aliases to another special BIOS
		 * controlled register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	setup_timer(t, mce_start_timer, smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	__mcheck_cpu_ancient_init(c);

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}
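/*
 * Illustrative sketch (added commentary): boot-time setup order for each
 * CPU, as wired up above:
 *
 *	mcheck_cpu_init()
 *	  __mcheck_cpu_ancient_init()	- P5/WinChip handlers, family 5 only
 *	  __mcheck_cpu_cap_init()	- read MCG_CAP, allocate mce_banks
 *	  __mcheck_cpu_apply_quirks()	- vendor/BIOS workarounds
 *	  machine_check_vector = do_machine_check
 *	  __mcheck_cpu_init_generic()	- log leftovers, CR4.MCE, bank CTLs
 *	  __mcheck_cpu_init_vendor()	- CMCI/thresholding features
 *	  __mcheck_cpu_init_timer()	- periodic poll timer
 */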
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE records of the previous boot from persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * Ideally the record would be cleared only after it has been
	 * flushed to disk or sent over the network by /sbin/mcelog, but
	 * we have no interface to support that yet, so just clear it
	 * now to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference_check_mce(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
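/*
 * Illustrative sketch (added commentary): a minimal /dev/mcelog consumer
 * under the ioctl contract above -- roughly what mcelog(8) does:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int reclen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc(reclen * loglen);
 *	int n = read(fd, buf, reclen * loglen);	// must be a full read
 *	for (int i = 0; i < n / reclen; i++)
 *		decode_record(buf + i * reclen);	// hypothetical helper
 *
 * read() returns the number of bytes copied; reads shorter than the full
 * log are rejected with -EINVAL, as enforced in mce_read().
 */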
/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);

	mcheck_intel_therm_init();

	return 0;
}

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable_error_reporting();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. We don't want to see leftovers from the
 * BIOS. Only one CPU is active at this time, the others get re-added
 * later using CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}
static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;
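/*
 * Illustrative note (added commentary; paths assume the sysdev layout of
 * this kernel generation): the attributes above appear per CPU under the
 * machinecheck sysdev class, e.g.:
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/trigger
 *	/sys/devices/system/machinecheck/machinecheck0/bank0 ...
 *
 * Writes that change polling behaviour go through
 * store_int_with_restart(), which re-runs the timer setup on every CPU
 * via mce_restart().
 */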
/* Per-CPU sysdev init. All of the CPUs still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}
/* Get notified when a CPU comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					__get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mcheck_init_device);
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif
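/*
 * Illustrative note (added commentary): the fake_panic knob above makes
 * mce_panic() log everything but skip the actual panic(), which is handy
 * when exercising the handler with error injection, e.g. as root
 * (assuming debugfs is mounted at /sys/kernel/debug):
 *
 *	echo 1 > /sys/kernel/debug/mce/fake_panic
 *
 * mce_reset() also clears the Monarch rendezvous state so that each test
 * run starts clean.
 */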