mce.c revision f3c6ea1b06c71b43f751b36bd99345369fe911af
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/interrupt.h> 14#include <linux/ratelimit.h> 15#include <linux/kallsyms.h> 16#include <linux/rcupdate.h> 17#include <linux/kobject.h> 18#include <linux/uaccess.h> 19#include <linux/kdebug.h> 20#include <linux/kernel.h> 21#include <linux/percpu.h> 22#include <linux/string.h> 23#include <linux/sysdev.h> 24#include <linux/syscore_ops.h> 25#include <linux/delay.h> 26#include <linux/ctype.h> 27#include <linux/sched.h> 28#include <linux/sysfs.h> 29#include <linux/types.h> 30#include <linux/slab.h> 31#include <linux/init.h> 32#include <linux/kmod.h> 33#include <linux/poll.h> 34#include <linux/nmi.h> 35#include <linux/cpu.h> 36#include <linux/smp.h> 37#include <linux/fs.h> 38#include <linux/mm.h> 39#include <linux/debugfs.h> 40#include <linux/edac_mce.h> 41 42#include <asm/processor.h> 43#include <asm/hw_irq.h> 44#include <asm/apic.h> 45#include <asm/idle.h> 46#include <asm/ipi.h> 47#include <asm/mce.h> 48#include <asm/msr.h> 49 50#include "mce-internal.h" 51 52static DEFINE_MUTEX(mce_read_mutex); 53 54#define rcu_dereference_check_mce(p) \ 55 rcu_dereference_index_check((p), \ 56 rcu_read_lock_sched_held() || \ 57 lockdep_is_held(&mce_read_mutex)) 58 59#define CREATE_TRACE_POINTS 60#include <trace/events/mce.h> 61 62int mce_disabled __read_mostly; 63 64#define MISC_MCELOG_MINOR 227 65 66#define SPINUNIT 100 /* 100ns */ 67 68atomic_t mce_entry; 69 70DEFINE_PER_CPU(unsigned, mce_exception_count); 71 72/* 73 * Tolerant levels: 74 * 0: always panic on uncorrected errors, log corrected errors 75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 77 * 
3: never panic or SIGBUS, log all errors (for testing only) 78 */ 79static int tolerant __read_mostly = 1; 80static int banks __read_mostly; 81static int rip_msr __read_mostly; 82static int mce_bootlog __read_mostly = -1; 83static int monarch_timeout __read_mostly = -1; 84static int mce_panic_timeout __read_mostly; 85static int mce_dont_log_ce __read_mostly; 86int mce_cmci_disabled __read_mostly; 87int mce_ignore_ce __read_mostly; 88int mce_ser __read_mostly; 89 90struct mce_bank *mce_banks __read_mostly; 91 92/* User mode helper program triggered by machine check event */ 93static unsigned long mce_need_notify; 94static char mce_helper[128]; 95static char *mce_helper_argv[2] = { mce_helper, NULL }; 96 97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 98static DEFINE_PER_CPU(struct mce, mces_seen); 99static int cpu_missing; 100 101/* 102 * CPU/chipset specific EDAC code can register a notifier call here to print 103 * MCE errors in a human-readable form. 104 */ 105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 107 108static int default_decode_mce(struct notifier_block *nb, unsigned long val, 109 void *data) 110{ 111 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); 112 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); 113 114 return NOTIFY_STOP; 115} 116 117static struct notifier_block mce_dec_nb = { 118 .notifier_call = default_decode_mce, 119 .priority = -1, 120}; 121 122/* MCA banks polled by the period polling timer for corrected events */ 123DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 124 [0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 125}; 126 127static DEFINE_PER_CPU(struct work_struct, mce_work); 128 129/* Do initial initialization of a struct mce */ 130void mce_setup(struct mce *m) 131{ 132 memset(m, 0, sizeof(struct mce)); 133 m->cpu = m->extcpu = smp_processor_id(); 134 rdtscll(m->tsc); 135 /* We hope get_seconds stays lockless */ 136 m->time = get_seconds(); 137 m->cpuvendor = boot_cpu_data.x86_vendor; 138 m->cpuid = cpuid_eax(1); 139#ifdef CONFIG_SMP 140 m->socketid = cpu_data(m->extcpu).phys_proc_id; 141#endif 142 m->apicid = cpu_data(m->extcpu).initial_apicid; 143 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 144} 145 146DEFINE_PER_CPU(struct mce, injectm); 147EXPORT_PER_CPU_SYMBOL_GPL(injectm); 148 149/* 150 * Lockless MCE logging infrastructure. 151 * This avoids deadlocks on printk locks without having to break locks. Also 152 * separate MCEs from kernel messages to avoid bogus bug reports. 153 */ 154 155static struct mce_log mcelog = { 156 .signature = MCE_LOG_SIGNATURE, 157 .len = MCE_LOG_LEN, 158 .recordlen = sizeof(struct mce), 159}; 160 161void mce_log(struct mce *mce) 162{ 163 unsigned next, entry; 164 165 /* Emit the trace record: */ 166 trace_mce_record(mce); 167 168 mce->finished = 0; 169 wmb(); 170 for (;;) { 171 entry = rcu_dereference_check_mce(mcelog.next); 172 for (;;) { 173 /* 174 * If edac_mce is enabled, it will check the error type 175 * and will process it, if it is a known error. 176 * Otherwise, the error will be sent through mcelog 177 * interface 178 */ 179 if (edac_mce_parse(mce)) 180 return; 181 182 /* 183 * When the buffer fills up discard new entries. 184 * Assume that the earlier errors are the more 185 * interesting ones: 186 */ 187 if (entry >= MCE_LOG_LEN) { 188 set_bit(MCE_OVERFLOW, 189 (unsigned long *)&mcelog.flags); 190 return; 191 } 192 /* Old left over entry. 
Skip: */ 193 if (mcelog.entry[entry].finished) { 194 entry++; 195 continue; 196 } 197 break; 198 } 199 smp_rmb(); 200 next = entry + 1; 201 if (cmpxchg(&mcelog.next, entry, next) == entry) 202 break; 203 } 204 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 205 wmb(); 206 mcelog.entry[entry].finished = 1; 207 wmb(); 208 209 mce->finished = 1; 210 set_bit(0, &mce_need_notify); 211} 212 213static void print_mce(struct mce *m) 214{ 215 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 216 m->extcpu, m->mcgstatus, m->bank, m->status); 217 218 if (m->ip) { 219 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 220 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 221 m->cs, m->ip); 222 223 if (m->cs == __KERNEL_CS) 224 print_symbol("{%s}", m->ip); 225 pr_cont("\n"); 226 } 227 228 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 229 if (m->addr) 230 pr_cont("ADDR %llx ", m->addr); 231 if (m->misc) 232 pr_cont("MISC %llx ", m->misc); 233 234 pr_cont("\n"); 235 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 236 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 237 238 /* 239 * Print out human-readable details about the MCE error, 240 * (if the CPU has an implementation for that) 241 */ 242 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 243} 244 245#define PANIC_TIMEOUT 5 /* 5 seconds */ 246 247static atomic_t mce_paniced; 248 249static int fake_panic; 250static atomic_t mce_fake_paniced; 251 252/* Panic in progress. 
Enable interrupts and wait for final IPI */ 253static void wait_for_panic(void) 254{ 255 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 256 257 preempt_disable(); 258 local_irq_enable(); 259 while (timeout-- > 0) 260 udelay(1); 261 if (panic_timeout == 0) 262 panic_timeout = mce_panic_timeout; 263 panic("Panicing machine check CPU died"); 264} 265 266static void mce_panic(char *msg, struct mce *final, char *exp) 267{ 268 int i, apei_err = 0; 269 270 if (!fake_panic) { 271 /* 272 * Make sure only one CPU runs in machine check panic 273 */ 274 if (atomic_inc_return(&mce_paniced) > 1) 275 wait_for_panic(); 276 barrier(); 277 278 bust_spinlocks(1); 279 console_verbose(); 280 } else { 281 /* Don't log too much for fake panic */ 282 if (atomic_inc_return(&mce_fake_paniced) > 1) 283 return; 284 } 285 /* First print corrected ones that are still unlogged */ 286 for (i = 0; i < MCE_LOG_LEN; i++) { 287 struct mce *m = &mcelog.entry[i]; 288 if (!(m->status & MCI_STATUS_VAL)) 289 continue; 290 if (!(m->status & MCI_STATUS_UC)) { 291 print_mce(m); 292 if (!apei_err) 293 apei_err = apei_write_mce(m); 294 } 295 } 296 /* Now print uncorrected but with the final one last */ 297 for (i = 0; i < MCE_LOG_LEN; i++) { 298 struct mce *m = &mcelog.entry[i]; 299 if (!(m->status & MCI_STATUS_VAL)) 300 continue; 301 if (!(m->status & MCI_STATUS_UC)) 302 continue; 303 if (!final || memcmp(m, final, sizeof(struct mce))) { 304 print_mce(m); 305 if (!apei_err) 306 apei_err = apei_write_mce(m); 307 } 308 } 309 if (final) { 310 print_mce(final); 311 if (!apei_err) 312 apei_err = apei_write_mce(final); 313 } 314 if (cpu_missing) 315 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 316 if (exp) 317 pr_emerg(HW_ERR "Machine check: %s\n", exp); 318 if (!fake_panic) { 319 if (panic_timeout == 0) 320 panic_timeout = mce_panic_timeout; 321 panic(msg); 322 } else 323 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 324} 325 326/* Support code for software error injection */ 327 328static int 
msr_to_offset(u32 msr) 329{ 330 unsigned bank = __this_cpu_read(injectm.bank); 331 332 if (msr == rip_msr) 333 return offsetof(struct mce, ip); 334 if (msr == MSR_IA32_MCx_STATUS(bank)) 335 return offsetof(struct mce, status); 336 if (msr == MSR_IA32_MCx_ADDR(bank)) 337 return offsetof(struct mce, addr); 338 if (msr == MSR_IA32_MCx_MISC(bank)) 339 return offsetof(struct mce, misc); 340 if (msr == MSR_IA32_MCG_STATUS) 341 return offsetof(struct mce, mcgstatus); 342 return -1; 343} 344 345/* MSR access wrappers used for error injection */ 346static u64 mce_rdmsrl(u32 msr) 347{ 348 u64 v; 349 350 if (__this_cpu_read(injectm.finished)) { 351 int offset = msr_to_offset(msr); 352 353 if (offset < 0) 354 return 0; 355 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 356 } 357 358 if (rdmsrl_safe(msr, &v)) { 359 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 360 /* 361 * Return zero in case the access faulted. This should 362 * not happen normally but can happen if the CPU does 363 * something weird, or if the code is buggy. 364 */ 365 v = 0; 366 } 367 368 return v; 369} 370 371static void mce_wrmsrl(u32 msr, u64 v) 372{ 373 if (__this_cpu_read(injectm.finished)) { 374 int offset = msr_to_offset(msr); 375 376 if (offset >= 0) 377 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 378 return; 379 } 380 wrmsrl(msr, v); 381} 382 383/* 384 * Simple lockless ring to communicate PFNs from the exception handler with the 385 * process context work function. This is vastly simplified because there's 386 * only a single reader and a single writer. 
387 */ 388#define MCE_RING_SIZE 16 /* we use one entry less */ 389 390struct mce_ring { 391 unsigned short start; 392 unsigned short end; 393 unsigned long ring[MCE_RING_SIZE]; 394}; 395static DEFINE_PER_CPU(struct mce_ring, mce_ring); 396 397/* Runs with CPU affinity in workqueue */ 398static int mce_ring_empty(void) 399{ 400 struct mce_ring *r = &__get_cpu_var(mce_ring); 401 402 return r->start == r->end; 403} 404 405static int mce_ring_get(unsigned long *pfn) 406{ 407 struct mce_ring *r; 408 int ret = 0; 409 410 *pfn = 0; 411 get_cpu(); 412 r = &__get_cpu_var(mce_ring); 413 if (r->start == r->end) 414 goto out; 415 *pfn = r->ring[r->start]; 416 r->start = (r->start + 1) % MCE_RING_SIZE; 417 ret = 1; 418out: 419 put_cpu(); 420 return ret; 421} 422 423/* Always runs in MCE context with preempt off */ 424static int mce_ring_add(unsigned long pfn) 425{ 426 struct mce_ring *r = &__get_cpu_var(mce_ring); 427 unsigned next; 428 429 next = (r->end + 1) % MCE_RING_SIZE; 430 if (next == r->start) 431 return -1; 432 r->ring[r->end] = pfn; 433 wmb(); 434 r->end = next; 435 return 0; 436} 437 438int mce_available(struct cpuinfo_x86 *c) 439{ 440 if (mce_disabled) 441 return 0; 442 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 443} 444 445static void mce_schedule_work(void) 446{ 447 if (!mce_ring_empty()) { 448 struct work_struct *work = &__get_cpu_var(mce_work); 449 if (!work_pending(work)) 450 schedule_work(work); 451 } 452} 453 454/* 455 * Get the address of the instruction at the time of the machine check 456 * error. 
457 */ 458static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 459{ 460 461 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { 462 m->ip = regs->ip; 463 m->cs = regs->cs; 464 } else { 465 m->ip = 0; 466 m->cs = 0; 467 } 468 if (rip_msr) 469 m->ip = mce_rdmsrl(rip_msr); 470} 471 472#ifdef CONFIG_X86_LOCAL_APIC 473/* 474 * Called after interrupts have been reenabled again 475 * when a MCE happened during an interrupts off region 476 * in the kernel. 477 */ 478asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) 479{ 480 ack_APIC_irq(); 481 exit_idle(); 482 irq_enter(); 483 mce_notify_irq(); 484 mce_schedule_work(); 485 irq_exit(); 486} 487#endif 488 489static void mce_report_event(struct pt_regs *regs) 490{ 491 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 492 mce_notify_irq(); 493 /* 494 * Triggering the work queue here is just an insurance 495 * policy in case the syscall exit notify handler 496 * doesn't run soon enough or ends up running on the 497 * wrong CPU (can happen when audit sleeps) 498 */ 499 mce_schedule_work(); 500 return; 501 } 502 503#ifdef CONFIG_X86_LOCAL_APIC 504 /* 505 * Without APIC do not notify. The event will be picked 506 * up eventually. 507 */ 508 if (!cpu_has_apic) 509 return; 510 511 /* 512 * When interrupts are disabled we cannot use 513 * kernel services safely. Trigger an self interrupt 514 * through the APIC to instead do the notification 515 * after interrupts are reenabled again. 516 */ 517 apic->send_IPI_self(MCE_SELF_VECTOR); 518 519 /* 520 * Wait for idle afterwards again so that we don't leave the 521 * APIC in a non idle state because the normal APIC writes 522 * cannot exclude us. 523 */ 524 apic_wait_icr_idle(); 525#endif 526} 527 528DEFINE_PER_CPU(unsigned, mce_poll_count); 529 530/* 531 * Poll for corrected events or events that happened before reset. 532 * Those are just logged through /dev/mcelog. 533 * 534 * This is executed in standard interrupt context. 
535 * 536 * Note: spec recommends to panic for fatal unsignalled 537 * errors here. However this would be quite problematic -- 538 * we would need to reimplement the Monarch handling and 539 * it would mess up the exclusion between exception handler 540 * and poll hander -- * so we skip this for now. 541 * These cases should not happen anyways, or only when the CPU 542 * is already totally * confused. In this case it's likely it will 543 * not fully execute the machine check handler either. 544 */ 545void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 546{ 547 struct mce m; 548 int i; 549 550 percpu_inc(mce_poll_count); 551 552 mce_setup(&m); 553 554 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 555 for (i = 0; i < banks; i++) { 556 if (!mce_banks[i].ctl || !test_bit(i, *b)) 557 continue; 558 559 m.misc = 0; 560 m.addr = 0; 561 m.bank = i; 562 m.tsc = 0; 563 564 barrier(); 565 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 566 if (!(m.status & MCI_STATUS_VAL)) 567 continue; 568 569 /* 570 * Uncorrected or signalled events are handled by the exception 571 * handler when it is enabled, so don't process those here. 572 * 573 * TBD do the same check for MCI_STATUS_EN here? 574 */ 575 if (!(flags & MCP_UC) && 576 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 577 continue; 578 579 if (m.status & MCI_STATUS_MISCV) 580 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 581 if (m.status & MCI_STATUS_ADDRV) 582 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 583 584 if (!(flags & MCP_TIMESTAMP)) 585 m.tsc = 0; 586 /* 587 * Don't get the IP here because it's unlikely to 588 * have anything to do with the actual error location. 589 */ 590 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 591 mce_log(&m); 592 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 593 add_taint(TAINT_MACHINE_CHECK); 594 } 595 596 /* 597 * Clear state for this bank. 
598 */ 599 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 600 } 601 602 /* 603 * Don't clear MCG_STATUS here because it's only defined for 604 * exceptions. 605 */ 606 607 sync_core(); 608} 609EXPORT_SYMBOL_GPL(machine_check_poll); 610 611/* 612 * Do a quick check if any of the events requires a panic. 613 * This decides if we keep the events around or clear them. 614 */ 615static int mce_no_way_out(struct mce *m, char **msg) 616{ 617 int i; 618 619 for (i = 0; i < banks; i++) { 620 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 621 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 622 return 1; 623 } 624 return 0; 625} 626 627/* 628 * Variable to establish order between CPUs while scanning. 629 * Each CPU spins initially until executing is equal its number. 630 */ 631static atomic_t mce_executing; 632 633/* 634 * Defines order of CPUs on entry. First CPU becomes Monarch. 635 */ 636static atomic_t mce_callin; 637 638/* 639 * Check if a timeout waiting for other CPUs happened. 640 */ 641static int mce_timed_out(u64 *t) 642{ 643 /* 644 * The others already did panic for some reason. 645 * Bail out like in a timeout. 646 * rmb() to tell the compiler that system_state 647 * might have been modified by someone else. 648 */ 649 rmb(); 650 if (atomic_read(&mce_paniced)) 651 wait_for_panic(); 652 if (!monarch_timeout) 653 goto out; 654 if ((s64)*t < SPINUNIT) { 655 /* CHECKME: Make panic default for 1 too? */ 656 if (tolerant < 1) 657 mce_panic("Timeout synchronizing machine check over CPUs", 658 NULL, NULL); 659 cpu_missing = 1; 660 return 1; 661 } 662 *t -= SPINUNIT; 663out: 664 touch_nmi_watchdog(); 665 return 0; 666} 667 668/* 669 * The Monarch's reign. The Monarch is the CPU who entered 670 * the machine check handler first. It waits for the others to 671 * raise the exception too and then grades them. When any 672 * error is fatal panic. Only then let the others continue. 673 * 674 * The other CPUs entering the MCE handler will be controlled by the 675 * Monarch. 
They are called Subjects. 676 * 677 * This way we prevent any potential data corruption in a unrecoverable case 678 * and also makes sure always all CPU's errors are examined. 679 * 680 * Also this detects the case of a machine check event coming from outer 681 * space (not detected by any CPUs) In this case some external agent wants 682 * us to shut down, so panic too. 683 * 684 * The other CPUs might still decide to panic if the handler happens 685 * in a unrecoverable place, but in this case the system is in a semi-stable 686 * state and won't corrupt anything by itself. It's ok to let the others 687 * continue for a bit first. 688 * 689 * All the spin loops have timeouts; when a timeout happens a CPU 690 * typically elects itself to be Monarch. 691 */ 692static void mce_reign(void) 693{ 694 int cpu; 695 struct mce *m = NULL; 696 int global_worst = 0; 697 char *msg = NULL; 698 char *nmsg = NULL; 699 700 /* 701 * This CPU is the Monarch and the other CPUs have run 702 * through their handlers. 703 * Grade the severity of the errors of all the CPUs. 704 */ 705 for_each_possible_cpu(cpu) { 706 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 707 &nmsg); 708 if (severity > global_worst) { 709 msg = nmsg; 710 global_worst = severity; 711 m = &per_cpu(mces_seen, cpu); 712 } 713 } 714 715 /* 716 * Cannot recover? Panic here then. 717 * This dumps all the mces in the log buffer and stops the 718 * other CPUs. 719 */ 720 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 721 mce_panic("Fatal Machine check", m, msg); 722 723 /* 724 * For UC somewhere we let the CPU who detects it handle it. 725 * Also must let continue the others, otherwise the handling 726 * CPU could deadlock on a lock. 727 */ 728 729 /* 730 * No machine check event found. Must be some external 731 * source or one CPU is hung. Panic. 
732 */ 733 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) 734 mce_panic("Machine check from unknown source", NULL, NULL); 735 736 /* 737 * Now clear all the mces_seen so that they don't reappear on 738 * the next mce. 739 */ 740 for_each_possible_cpu(cpu) 741 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 742} 743 744static atomic_t global_nwo; 745 746/* 747 * Start of Monarch synchronization. This waits until all CPUs have 748 * entered the exception handler and then determines if any of them 749 * saw a fatal event that requires panic. Then it executes them 750 * in the entry order. 751 * TBD double check parallel CPU hotunplug 752 */ 753static int mce_start(int *no_way_out) 754{ 755 int order; 756 int cpus = num_online_cpus(); 757 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 758 759 if (!timeout) 760 return -1; 761 762 atomic_add(*no_way_out, &global_nwo); 763 /* 764 * global_nwo should be updated before mce_callin 765 */ 766 smp_wmb(); 767 order = atomic_inc_return(&mce_callin); 768 769 /* 770 * Wait for everyone. 771 */ 772 while (atomic_read(&mce_callin) != cpus) { 773 if (mce_timed_out(&timeout)) { 774 atomic_set(&global_nwo, 0); 775 return -1; 776 } 777 ndelay(SPINUNIT); 778 } 779 780 /* 781 * mce_callin should be read before global_nwo 782 */ 783 smp_rmb(); 784 785 if (order == 1) { 786 /* 787 * Monarch: Starts executing now, the others wait. 788 */ 789 atomic_set(&mce_executing, 1); 790 } else { 791 /* 792 * Subject: Now start the scanning loop one by one in 793 * the original callin order. 794 * This way when there are any shared banks it will be 795 * only seen by one CPU before cleared, avoiding duplicates. 796 */ 797 while (atomic_read(&mce_executing) < order) { 798 if (mce_timed_out(&timeout)) { 799 atomic_set(&global_nwo, 0); 800 return -1; 801 } 802 ndelay(SPINUNIT); 803 } 804 } 805 806 /* 807 * Cache the global no_way_out state. 
808 */ 809 *no_way_out = atomic_read(&global_nwo); 810 811 return order; 812} 813 814/* 815 * Synchronize between CPUs after main scanning loop. 816 * This invokes the bulk of the Monarch processing. 817 */ 818static int mce_end(int order) 819{ 820 int ret = -1; 821 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 822 823 if (!timeout) 824 goto reset; 825 if (order < 0) 826 goto reset; 827 828 /* 829 * Allow others to run. 830 */ 831 atomic_inc(&mce_executing); 832 833 if (order == 1) { 834 /* CHECKME: Can this race with a parallel hotplug? */ 835 int cpus = num_online_cpus(); 836 837 /* 838 * Monarch: Wait for everyone to go through their scanning 839 * loops. 840 */ 841 while (atomic_read(&mce_executing) <= cpus) { 842 if (mce_timed_out(&timeout)) 843 goto reset; 844 ndelay(SPINUNIT); 845 } 846 847 mce_reign(); 848 barrier(); 849 ret = 0; 850 } else { 851 /* 852 * Subject: Wait for Monarch to finish. 853 */ 854 while (atomic_read(&mce_executing) != 0) { 855 if (mce_timed_out(&timeout)) 856 goto reset; 857 ndelay(SPINUNIT); 858 } 859 860 /* 861 * Don't reset anything. That's done by the Monarch. 862 */ 863 return 0; 864 } 865 866 /* 867 * Reset all global state. 868 */ 869reset: 870 atomic_set(&global_nwo, 0); 871 atomic_set(&mce_callin, 0); 872 barrier(); 873 874 /* 875 * Let others run again. 876 */ 877 atomic_set(&mce_executing, 0); 878 return ret; 879} 880 881/* 882 * Check if the address reported by the CPU is in a format we can parse. 883 * It would be possible to add code for most other cases, but all would 884 * be somewhat complicated (e.g. segment offset would require an instruction 885 * parser). So only support physical addresses up to page granuality for now. 
886 */ 887static int mce_usable_address(struct mce *m) 888{ 889 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 890 return 0; 891 if ((m->misc & 0x3f) > PAGE_SHIFT) 892 return 0; 893 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 894 return 0; 895 return 1; 896} 897 898static void mce_clear_state(unsigned long *toclear) 899{ 900 int i; 901 902 for (i = 0; i < banks; i++) { 903 if (test_bit(i, toclear)) 904 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 905 } 906} 907 908/* 909 * The actual machine check handler. This only handles real 910 * exceptions when something got corrupted coming in through int 18. 911 * 912 * This is executed in NMI context not subject to normal locking rules. This 913 * implies that most kernel services cannot be safely used. Don't even 914 * think about putting a printk in there! 915 * 916 * On Intel systems this is entered on all CPUs in parallel through 917 * MCE broadcast. However some CPUs might be broken beyond repair, 918 * so be always careful when synchronizing with others. 919 */ 920void do_machine_check(struct pt_regs *regs, long error_code) 921{ 922 struct mce m, *final; 923 int i; 924 int worst = 0; 925 int severity; 926 /* 927 * Establish sequential order between the CPUs entering the machine 928 * check handler. 929 */ 930 int order; 931 /* 932 * If no_way_out gets set, there is no safe way to recover from this 933 * MCE. If tolerant is cranked up, we'll try anyway. 934 */ 935 int no_way_out = 0; 936 /* 937 * If kill_it gets set, there might be a way to recover from this 938 * error. 
939 */ 940 int kill_it = 0; 941 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 942 char *msg = "Unknown"; 943 944 atomic_inc(&mce_entry); 945 946 percpu_inc(mce_exception_count); 947 948 if (notify_die(DIE_NMI, "machine check", regs, error_code, 949 18, SIGKILL) == NOTIFY_STOP) 950 goto out; 951 if (!banks) 952 goto out; 953 954 mce_setup(&m); 955 956 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 957 final = &__get_cpu_var(mces_seen); 958 *final = m; 959 960 no_way_out = mce_no_way_out(&m, &msg); 961 962 barrier(); 963 964 /* 965 * When no restart IP must always kill or panic. 966 */ 967 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 968 kill_it = 1; 969 970 /* 971 * Go through all the banks in exclusion of the other CPUs. 972 * This way we don't report duplicated events on shared banks 973 * because the first one to see it will clear it. 974 */ 975 order = mce_start(&no_way_out); 976 for (i = 0; i < banks; i++) { 977 __clear_bit(i, toclear); 978 if (!mce_banks[i].ctl) 979 continue; 980 981 m.misc = 0; 982 m.addr = 0; 983 m.bank = i; 984 985 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 986 if ((m.status & MCI_STATUS_VAL) == 0) 987 continue; 988 989 /* 990 * Non uncorrected or non signaled errors are handled by 991 * machine_check_poll. Leave them alone, unless this panics. 992 */ 993 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 994 !no_way_out) 995 continue; 996 997 /* 998 * Set taint even when machine check was not enabled. 999 */ 1000 add_taint(TAINT_MACHINE_CHECK); 1001 1002 severity = mce_severity(&m, tolerant, NULL); 1003 1004 /* 1005 * When machine check was for corrected handler don't touch, 1006 * unless we're panicing. 1007 */ 1008 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 1009 continue; 1010 __set_bit(i, toclear); 1011 if (severity == MCE_NO_SEVERITY) { 1012 /* 1013 * Machine check event was not enabled. Clear, but 1014 * ignore. 1015 */ 1016 continue; 1017 } 1018 1019 /* 1020 * Kill on action required. 
1021 */ 1022 if (severity == MCE_AR_SEVERITY) 1023 kill_it = 1; 1024 1025 if (m.status & MCI_STATUS_MISCV) 1026 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 1027 if (m.status & MCI_STATUS_ADDRV) 1028 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 1029 1030 /* 1031 * Action optional error. Queue address for later processing. 1032 * When the ring overflows we just ignore the AO error. 1033 * RED-PEN add some logging mechanism when 1034 * usable_address or mce_add_ring fails. 1035 * RED-PEN don't ignore overflow for tolerant == 0 1036 */ 1037 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1038 mce_ring_add(m.addr >> PAGE_SHIFT); 1039 1040 mce_get_rip(&m, regs); 1041 mce_log(&m); 1042 1043 if (severity > worst) { 1044 *final = m; 1045 worst = severity; 1046 } 1047 } 1048 1049 if (!no_way_out) 1050 mce_clear_state(toclear); 1051 1052 /* 1053 * Do most of the synchronization with other CPUs. 1054 * When there's any problem use only local no_way_out state. 1055 */ 1056 if (mce_end(order) < 0) 1057 no_way_out = worst >= MCE_PANIC_SEVERITY; 1058 1059 /* 1060 * If we have decided that we just CAN'T continue, and the user 1061 * has not set tolerant to an insane level, give up and die. 1062 * 1063 * This is mainly used in the case when the system doesn't 1064 * support MCE broadcasting or it has been disabled. 1065 */ 1066 if (no_way_out && tolerant < 3) 1067 mce_panic("Fatal machine check on current CPU", final, msg); 1068 1069 /* 1070 * If the error seems to be unrecoverable, something should be 1071 * done. Try to kill as little as possible. If we can kill just 1072 * one task, do that. If the user has set the tolerance very 1073 * high, don't try to do anything at all. 
1074 */ 1075 1076 if (kill_it && tolerant < 3) 1077 force_sig(SIGBUS, current); 1078 1079 /* notify userspace ASAP */ 1080 set_thread_flag(TIF_MCE_NOTIFY); 1081 1082 if (worst > 0) 1083 mce_report_event(regs); 1084 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1085out: 1086 atomic_dec(&mce_entry); 1087 sync_core(); 1088} 1089EXPORT_SYMBOL_GPL(do_machine_check); 1090 1091/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1092void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1093{ 1094 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1095} 1096 1097/* 1098 * Called after mce notification in process context. This code 1099 * is allowed to sleep. Call the high level VM handler to process 1100 * any corrupted pages. 1101 * Assume that the work queue code only calls this one at a time 1102 * per CPU. 1103 * Note we don't disable preemption, so this code might run on the wrong 1104 * CPU. In this case the event is picked up by the scheduled work queue. 1105 * This is merely a fast path to expedite processing in some common 1106 * cases. 1107 */ 1108void mce_notify_process(void) 1109{ 1110 unsigned long pfn; 1111 mce_notify_irq(); 1112 while (mce_ring_get(&pfn)) 1113 memory_failure(pfn, MCE_VECTOR); 1114} 1115 1116static void mce_process_work(struct work_struct *dummy) 1117{ 1118 mce_notify_process(); 1119} 1120 1121#ifdef CONFIG_X86_MCE_INTEL 1122/*** 1123 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1124 * @cpu: The CPU on which the event occurred. 1125 * @status: Event status information 1126 * 1127 * This function should be called by the thermal interrupt after the 1128 * event has been processed and the decision was made to log the event 1129 * further. 1130 * 1131 * The status parameter will be saved to the 'status' field of 'struct mce' 1132 * and historically has been the register value of the 1133 * MSR_IA32_THERMAL_STATUS (Intel) msr. 
1134 */ 1135void mce_log_therm_throt_event(__u64 status) 1136{ 1137 struct mce m; 1138 1139 mce_setup(&m); 1140 m.bank = MCE_THERMAL_BANK; 1141 m.status = status; 1142 mce_log(&m); 1143} 1144#endif /* CONFIG_X86_MCE_INTEL */ 1145 1146/* 1147 * Periodic polling timer for "silent" machine check errors. If the 1148 * poller finds an MCE, poll 2x faster. When the poller finds no more 1149 * errors, poll 2x slower (up to check_interval seconds). 1150 */ 1151static int check_interval = 5 * 60; /* 5 minutes */ 1152 1153static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1154static DEFINE_PER_CPU(struct timer_list, mce_timer); 1155 1156static void mce_start_timer(unsigned long data) 1157{ 1158 struct timer_list *t = &per_cpu(mce_timer, data); 1159 int *n; 1160 1161 WARN_ON(smp_processor_id() != data); 1162 1163 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1164 machine_check_poll(MCP_TIMESTAMP, 1165 &__get_cpu_var(mce_poll_banks)); 1166 } 1167 1168 /* 1169 * Alert userspace if needed. If we logged an MCE, reduce the 1170 * polling interval, otherwise increase the polling interval. 1171 */ 1172 n = &__get_cpu_var(mce_next_interval); 1173 if (mce_notify_irq()) 1174 *n = max(*n/2, HZ/100); 1175 else 1176 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1177 1178 t->expires = jiffies + *n; 1179 add_timer_on(t, smp_processor_id()); 1180} 1181 1182static void mce_do_trigger(struct work_struct *work) 1183{ 1184 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1185} 1186 1187static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1188 1189/* 1190 * Notify the user(s) about new machine check events. 1191 * Can be called from interrupt context, but not from machine check/NMI 1192 * context. 
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

/* Allocate the bank control array; every bank starts fully enabled. */
static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		/* all-1s control mask = report every error type */
		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Software error recovery (SER) supported? */
	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

/* Log leftover errors, then enable machine checks in CR4 and the banks. */
static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

/* Family 5 CPUs use their own, vendor-specific MCE implementations. */
static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

/* Dispatch to vendor-specific feature init (CMCI, threshold banks, ...). */
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

/* Arm this CPU's polling timer, unless corrected errors are ignored. */
static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	setup_timer(t, mce_start_timer, smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	__mcheck_cpu_ancient_init(c);

	if (!mce_available(c))
		return;

	/* A failure here disables MCE for the whole system. */
	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);

}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive?
 */

/* Open /dev/mcelog; honor O_EXCL exclusivity under mce_state_lock. */
static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

/* Release /dev/mcelog: drop the open count and any exclusive claim. */
static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

/* IPI callback: record this CPU's TSC into the array passed as @data. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	/* Advance the caller's user-space cursor past the copied record. */
	*ubuf += sizeof(struct mce);

	return 0;
}

/*
 * Read and clear the in-kernel MCE log. Only whole-buffer reads are
 * supported; concurrently written records are waited for briefly and
 * torn ones discarded.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);

	/* Records preserved from the previous boot (APEI/ERST) go first. */
	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/* Wait up to ~2 jiffies for an in-flight logger. */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset mcelog.next to 0 unless new entries arrived. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}

	/*
	 * err accumulated copy_to_user() uncopied-byte counts above;
	 * any nonzero value collapses to -EFAULT here.
	 */
	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

/* Poll support: readable when either the kernel log or APEI has records. */
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference_check_mce(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

/* ioctl interface for mcelog(8); all commands require CAP_SYS_ADMIN. */
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* Atomically fetch-and-clear the flags word. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
	.llseek		= no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears
 corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	/* bare "mce" (no argument): just enable P5-style machine checks */
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

/* Early init: register the MCE decoder notifier and thermal handling. */
int __init mcheck_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);

	mcheck_intel_therm_init();

	return 0;
}

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	/* Clear CTL for every initialized bank to stop error reporting. */
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
};

/* Per-CPU: stop the poll timer, reprogram the banks and re-arm the timer. */
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	/* non-NULL @all additionally stops the polling timer, not just CMCI */
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	/* non-NULL @all additionally re-arms the polling timer */
	if (all)
		__mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

/* Hook for the AMD threshold-bank code; installed by mce_amd.c. */
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Map a sysdev attribute back to its containing bank descriptor. */
static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

/* Store a new bank control mask and reprogram all CPUs. */
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

/* Set the user-mode helper path; a trailing newline is stripped. */
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual 0 <-> 1 transition. */
	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual 0 <-> 1 transition. */
	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

/* Generic int store that also reinitializes MCE on every CPU. */
static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init.
   All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	/* Unwind in reverse order whatever was created before the failure. */
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

/* Tear down a CPU's sysdev files; no-op if the device was never created. */
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	/* Skip CMCI teardown during suspend/resume (frozen) transitions. */
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

/* Re-enable CMCI and restore the bank controls when a CPU comes back. */
static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* Offlining failed: re-arm the poll timer and re-enable. */
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					__get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

/* Set up the sysdev attribute for each bank ("bank0", "bank1", ...). */
static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

/* Register the sysfs class, per-CPU devices, syscore ops and /dev/mcelog. */
static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		/*
		 * NOTE(review): devices already created for earlier CPUs
		 * are not torn down on this failure path.
		 */
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
/* Return (creating on first use) the "mce" debugfs directory. */
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

/* Reset the global rendezvous state used by the MCE handler. */
static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

/* Expose /sys/kernel/debug/mce/fake_panic for testing the panic path. */
static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif