mce.c revision 881e23e56764808e7ab1ed73b5d8a6700042ea38
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/kobject.h> 17#include <linux/uaccess.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/syscore_ops.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/slab.h> 30#include <linux/init.h> 31#include <linux/kmod.h> 32#include <linux/poll.h> 33#include <linux/nmi.h> 34#include <linux/cpu.h> 35#include <linux/smp.h> 36#include <linux/fs.h> 37#include <linux/mm.h> 38#include <linux/debugfs.h> 39#include <linux/edac_mce.h> 40#include <linux/irq_work.h> 41 42#include <asm/processor.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47 48static DEFINE_MUTEX(mce_chrdev_read_mutex); 49 50#define rcu_dereference_check_mce(p) \ 51 rcu_dereference_index_check((p), \ 52 rcu_read_lock_sched_held() || \ 53 lockdep_is_held(&mce_chrdev_read_mutex)) 54 55#define CREATE_TRACE_POINTS 56#include <trace/events/mce.h> 57 58int mce_disabled __read_mostly; 59 60#define MISC_MCELOG_MINOR 227 61 62#define SPINUNIT 100 /* 100ns */ 63 64atomic_t mce_entry; 65 66DEFINE_PER_CPU(unsigned, mce_exception_count); 67 68/* 69 * Tolerant levels: 70 * 0: always panic on uncorrected errors, log corrected errors 71 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 72 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 73 * 3: never panic or SIGBUS, log all errors (for testing only) 74 */ 75static int tolerant __read_mostly = 1; 76static int banks __read_mostly; 77static int rip_msr __read_mostly; 78static int mce_bootlog __read_mostly = -1; 79static int monarch_timeout __read_mostly = -1; 80static int mce_panic_timeout __read_mostly; 81static int mce_dont_log_ce __read_mostly; 82int mce_cmci_disabled __read_mostly; 83int mce_ignore_ce __read_mostly; 84int mce_ser __read_mostly; 85 86struct mce_bank *mce_banks __read_mostly; 87 88/* User mode helper program triggered by machine check event */ 89static unsigned long mce_need_notify; 90static char mce_helper[128]; 91static char *mce_helper_argv[2] = { mce_helper, NULL }; 92 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); 94 95static DEFINE_PER_CPU(struct mce, mces_seen); 96static int cpu_missing; 97 98/* 99 * CPU/chipset specific EDAC code can register a notifier call here to print 100 * MCE errors in a human-readable form. 101 */ 102ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 103EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 104 105/* MCA banks polled by the period polling timer for corrected events */ 106DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 107 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 108}; 109 110static DEFINE_PER_CPU(struct work_struct, mce_work); 111 112/* Do initial initialization of a struct mce */ 113void mce_setup(struct mce *m) 114{ 115 memset(m, 0, sizeof(struct mce)); 116 m->cpu = m->extcpu = smp_processor_id(); 117 rdtscll(m->tsc); 118 /* We hope get_seconds stays lockless */ 119 m->time = get_seconds(); 120 m->cpuvendor = boot_cpu_data.x86_vendor; 121 m->cpuid = cpuid_eax(1); 122#ifdef CONFIG_SMP 123 m->socketid = cpu_data(m->extcpu).phys_proc_id; 124#endif 125 m->apicid = cpu_data(m->extcpu).initial_apicid; 126 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 127} 128 129DEFINE_PER_CPU(struct mce, injectm); 130EXPORT_PER_CPU_SYMBOL_GPL(injectm); 131 132/* 133 * Lockless MCE logging infrastructure. 134 * This avoids deadlocks on printk locks without having to break locks. Also 135 * separate MCEs from kernel messages to avoid bogus bug reports. 136 */ 137 138static struct mce_log mcelog = { 139 .signature = MCE_LOG_SIGNATURE, 140 .len = MCE_LOG_LEN, 141 .recordlen = sizeof(struct mce), 142}; 143 144void mce_log(struct mce *mce) 145{ 146 unsigned next, entry; 147 148 /* Emit the trace record: */ 149 trace_mce_record(mce); 150 151 mce->finished = 0; 152 wmb(); 153 for (;;) { 154 entry = rcu_dereference_check_mce(mcelog.next); 155 for (;;) { 156 /* 157 * If edac_mce is enabled, it will check the error type 158 * and will process it, if it is a known error. 159 * Otherwise, the error will be sent through mcelog 160 * interface 161 */ 162 if (edac_mce_parse(mce)) 163 return; 164 165 /* 166 * When the buffer fills up discard new entries. 167 * Assume that the earlier errors are the more 168 * interesting ones: 169 */ 170 if (entry >= MCE_LOG_LEN) { 171 set_bit(MCE_OVERFLOW, 172 (unsigned long *)&mcelog.flags); 173 return; 174 } 175 /* Old left over entry. Skip: */ 176 if (mcelog.entry[entry].finished) { 177 entry++; 178 continue; 179 } 180 break; 181 } 182 smp_rmb(); 183 next = entry + 1; 184 if (cmpxchg(&mcelog.next, entry, next) == entry) 185 break; 186 } 187 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 188 wmb(); 189 mcelog.entry[entry].finished = 1; 190 wmb(); 191 192 mce->finished = 1; 193 set_bit(0, &mce_need_notify); 194} 195 196static void print_mce(struct mce *m) 197{ 198 int ret = 0; 199 200 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 201 m->extcpu, m->mcgstatus, m->bank, m->status); 202 203 if (m->ip) { 204 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 205 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 206 m->cs, m->ip); 207 208 if (m->cs == __KERNEL_CS) 209 print_symbol("{%s}", m->ip); 210 pr_cont("\n"); 211 } 212 213 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 214 if (m->addr) 215 pr_cont("ADDR %llx ", m->addr); 216 if (m->misc) 217 pr_cont("MISC %llx ", m->misc); 218 219 pr_cont("\n"); 220 /* 221 * Note this output is parsed by external tools and old fields 222 * should not be changed. 223 */ 224 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 225 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 226 cpu_data(m->extcpu).microcode); 227 228 /* 229 * Print out human-readable details about the MCE error, 230 * (if the CPU has an implementation for that) 231 */ 232 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 233 if (ret == NOTIFY_STOP) 234 return; 235 236 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 237} 238 239#define PANIC_TIMEOUT 5 /* 5 seconds */ 240 241static atomic_t mce_paniced; 242 243static int fake_panic; 244static atomic_t mce_fake_paniced; 245 246/* Panic in progress. Enable interrupts and wait for final IPI */ 247static void wait_for_panic(void) 248{ 249 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 250 251 preempt_disable(); 252 local_irq_enable(); 253 while (timeout-- > 0) 254 udelay(1); 255 if (panic_timeout == 0) 256 panic_timeout = mce_panic_timeout; 257 panic("Panicing machine check CPU died"); 258} 259 260static void mce_panic(char *msg, struct mce *final, char *exp) 261{ 262 int i, apei_err = 0; 263 264 if (!fake_panic) { 265 /* 266 * Make sure only one CPU runs in machine check panic 267 */ 268 if (atomic_inc_return(&mce_paniced) > 1) 269 wait_for_panic(); 270 barrier(); 271 272 bust_spinlocks(1); 273 console_verbose(); 274 } else { 275 /* Don't log too much for fake panic */ 276 if (atomic_inc_return(&mce_fake_paniced) > 1) 277 return; 278 } 279 /* First print corrected ones that are still unlogged */ 280 for (i = 0; i < MCE_LOG_LEN; i++) { 281 struct mce *m = &mcelog.entry[i]; 282 if (!(m->status & MCI_STATUS_VAL)) 283 continue; 284 if (!(m->status & MCI_STATUS_UC)) { 285 print_mce(m); 286 if (!apei_err) 287 apei_err = apei_write_mce(m); 288 } 289 } 290 /* Now print uncorrected but with the final one last */ 291 for (i = 0; i < MCE_LOG_LEN; i++) { 292 struct mce *m = &mcelog.entry[i]; 293 if (!(m->status & MCI_STATUS_VAL)) 294 continue; 295 if (!(m->status & MCI_STATUS_UC)) 296 continue; 297 if (!final || memcmp(m, final, sizeof(struct mce))) { 298 print_mce(m); 299 if (!apei_err) 300 apei_err = apei_write_mce(m); 301 } 302 } 303 if (final) { 304 print_mce(final); 305 if (!apei_err) 306 apei_err = apei_write_mce(final); 307 } 308 if (cpu_missing) 309 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 310 if (exp) 311 pr_emerg(HW_ERR "Machine check: %s\n", exp); 312 if (!fake_panic) { 313 if (panic_timeout == 0) 314 panic_timeout = mce_panic_timeout; 315 panic(msg); 316 } else 317 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 318} 319 320/* Support code for software error injection */ 321 322static int msr_to_offset(u32 msr) 323{ 324 unsigned bank = __this_cpu_read(injectm.bank); 325 326 if (msr == rip_msr) 327 return offsetof(struct mce, ip); 328 if (msr == MSR_IA32_MCx_STATUS(bank)) 329 return offsetof(struct mce, status); 330 if (msr == MSR_IA32_MCx_ADDR(bank)) 331 return offsetof(struct mce, addr); 332 if (msr == MSR_IA32_MCx_MISC(bank)) 333 return offsetof(struct mce, misc); 334 if (msr == MSR_IA32_MCG_STATUS) 335 return offsetof(struct mce, mcgstatus); 336 return -1; 337} 338 339/* MSR access wrappers used for error injection */ 340static u64 mce_rdmsrl(u32 msr) 341{ 342 u64 v; 343 344 if (__this_cpu_read(injectm.finished)) { 345 int offset = msr_to_offset(msr); 346 347 if (offset < 0) 348 return 0; 349 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 350 } 351 352 if (rdmsrl_safe(msr, &v)) { 353 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 354 /* 355 * Return zero in case the access faulted. This should 356 * not happen normally but can happen if the CPU does 357 * something weird, or if the code is buggy. 358 */ 359 v = 0; 360 } 361 362 return v; 363} 364 365static void mce_wrmsrl(u32 msr, u64 v) 366{ 367 if (__this_cpu_read(injectm.finished)) { 368 int offset = msr_to_offset(msr); 369 370 if (offset >= 0) 371 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 372 return; 373 } 374 wrmsrl(msr, v); 375} 376 377/* 378 * Collect all global (w.r.t. this processor) status about this machine 379 * check into our "mce" struct so that we can use it later to assess 380 * the severity of the problem as we read per-bank specific details. 381 */ 382static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 383{ 384 mce_setup(m); 385 386 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 387 if (regs) { 388 /* 389 * Get the address of the instruction at the time of 390 * the machine check error. 391 */ 392 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 393 m->ip = regs->ip; 394 m->cs = regs->cs; 395 } 396 /* Use accurate RIP reporting if available. */ 397 if (rip_msr) 398 m->ip = mce_rdmsrl(rip_msr); 399 } 400} 401 402/* 403 * Simple lockless ring to communicate PFNs from the exception handler with the 404 * process context work function. This is vastly simplified because there's 405 * only a single reader and a single writer. 406 */ 407#define MCE_RING_SIZE 16 /* we use one entry less */ 408 409struct mce_ring { 410 unsigned short start; 411 unsigned short end; 412 unsigned long ring[MCE_RING_SIZE]; 413}; 414static DEFINE_PER_CPU(struct mce_ring, mce_ring); 415 416/* Runs with CPU affinity in workqueue */ 417static int mce_ring_empty(void) 418{ 419 struct mce_ring *r = &__get_cpu_var(mce_ring); 420 421 return r->start == r->end; 422} 423 424static int mce_ring_get(unsigned long *pfn) 425{ 426 struct mce_ring *r; 427 int ret = 0; 428 429 *pfn = 0; 430 get_cpu(); 431 r = &__get_cpu_var(mce_ring); 432 if (r->start == r->end) 433 goto out; 434 *pfn = r->ring[r->start]; 435 r->start = (r->start + 1) % MCE_RING_SIZE; 436 ret = 1; 437out: 438 put_cpu(); 439 return ret; 440} 441 442/* Always runs in MCE context with preempt off */ 443static int mce_ring_add(unsigned long pfn) 444{ 445 struct mce_ring *r = &__get_cpu_var(mce_ring); 446 unsigned next; 447 448 next = (r->end + 1) % MCE_RING_SIZE; 449 if (next == r->start) 450 return -1; 451 r->ring[r->end] = pfn; 452 wmb(); 453 r->end = next; 454 return 0; 455} 456 457int mce_available(struct cpuinfo_x86 *c) 458{ 459 if (mce_disabled) 460 return 0; 461 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 462} 463 464static void mce_schedule_work(void) 465{ 466 if (!mce_ring_empty()) { 467 struct work_struct *work = &__get_cpu_var(mce_work); 468 if (!work_pending(work)) 469 schedule_work(work); 470 } 471} 472 473DEFINE_PER_CPU(struct irq_work, mce_irq_work); 474 475static void mce_irq_work_cb(struct irq_work *entry) 476{ 477 mce_notify_irq(); 478 mce_schedule_work(); 479} 480 481static void mce_report_event(struct pt_regs *regs) 482{ 483 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 484 mce_notify_irq(); 485 /* 486 * Triggering the work queue here is just an insurance 487 * policy in case the syscall exit notify handler 488 * doesn't run soon enough or ends up running on the 489 * wrong CPU (can happen when audit sleeps) 490 */ 491 mce_schedule_work(); 492 return; 493 } 494 495 irq_work_queue(&__get_cpu_var(mce_irq_work)); 496} 497 498DEFINE_PER_CPU(unsigned, mce_poll_count); 499 500/* 501 * Poll for corrected events or events that happened before reset. 502 * Those are just logged through /dev/mcelog. 503 * 504 * This is executed in standard interrupt context. 505 * 506 * Note: spec recommends to panic for fatal unsignalled 507 * errors here. However this would be quite problematic -- 508 * we would need to reimplement the Monarch handling and 509 * it would mess up the exclusion between exception handler 510 * and poll hander -- * so we skip this for now. 511 * These cases should not happen anyways, or only when the CPU 512 * is already totally * confused. In this case it's likely it will 513 * not fully execute the machine check handler either. 514 */ 515void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 516{ 517 struct mce m; 518 int i; 519 520 percpu_inc(mce_poll_count); 521 522 mce_gather_info(&m, NULL); 523 524 for (i = 0; i < banks; i++) { 525 if (!mce_banks[i].ctl || !test_bit(i, *b)) 526 continue; 527 528 m.misc = 0; 529 m.addr = 0; 530 m.bank = i; 531 m.tsc = 0; 532 533 barrier(); 534 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 535 if (!(m.status & MCI_STATUS_VAL)) 536 continue; 537 538 /* 539 * Uncorrected or signalled events are handled by the exception 540 * handler when it is enabled, so don't process those here. 541 * 542 * TBD do the same check for MCI_STATUS_EN here? 543 */ 544 if (!(flags & MCP_UC) && 545 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 546 continue; 547 548 if (m.status & MCI_STATUS_MISCV) 549 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 550 if (m.status & MCI_STATUS_ADDRV) 551 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 552 553 if (!(flags & MCP_TIMESTAMP)) 554 m.tsc = 0; 555 /* 556 * Don't get the IP here because it's unlikely to 557 * have anything to do with the actual error location. 558 */ 559 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 560 mce_log(&m); 561 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 562 } 563 564 /* 565 * Clear state for this bank. 566 */ 567 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 568 } 569 570 /* 571 * Don't clear MCG_STATUS here because it's only defined for 572 * exceptions. 573 */ 574 575 sync_core(); 576} 577EXPORT_SYMBOL_GPL(machine_check_poll); 578 579/* 580 * Do a quick check if any of the events requires a panic. 581 * This decides if we keep the events around or clear them. 582 */ 583static int mce_no_way_out(struct mce *m, char **msg) 584{ 585 int i; 586 587 for (i = 0; i < banks; i++) { 588 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 589 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 590 return 1; 591 } 592 return 0; 593} 594 595/* 596 * Variable to establish order between CPUs while scanning. 597 * Each CPU spins initially until executing is equal its number. 598 */ 599static atomic_t mce_executing; 600 601/* 602 * Defines order of CPUs on entry. First CPU becomes Monarch. 603 */ 604static atomic_t mce_callin; 605 606/* 607 * Check if a timeout waiting for other CPUs happened. 608 */ 609static int mce_timed_out(u64 *t) 610{ 611 /* 612 * The others already did panic for some reason. 613 * Bail out like in a timeout. 614 * rmb() to tell the compiler that system_state 615 * might have been modified by someone else. 616 */ 617 rmb(); 618 if (atomic_read(&mce_paniced)) 619 wait_for_panic(); 620 if (!monarch_timeout) 621 goto out; 622 if ((s64)*t < SPINUNIT) { 623 /* CHECKME: Make panic default for 1 too? */ 624 if (tolerant < 1) 625 mce_panic("Timeout synchronizing machine check over CPUs", 626 NULL, NULL); 627 cpu_missing = 1; 628 return 1; 629 } 630 *t -= SPINUNIT; 631out: 632 touch_nmi_watchdog(); 633 return 0; 634} 635 636/* 637 * The Monarch's reign. The Monarch is the CPU who entered 638 * the machine check handler first. It waits for the others to 639 * raise the exception too and then grades them. When any 640 * error is fatal panic. Only then let the others continue. 641 * 642 * The other CPUs entering the MCE handler will be controlled by the 643 * Monarch. They are called Subjects. 644 * 645 * This way we prevent any potential data corruption in a unrecoverable case 646 * and also makes sure always all CPU's errors are examined. 647 * 648 * Also this detects the case of a machine check event coming from outer 649 * space (not detected by any CPUs) In this case some external agent wants 650 * us to shut down, so panic too. 651 * 652 * The other CPUs might still decide to panic if the handler happens 653 * in a unrecoverable place, but in this case the system is in a semi-stable 654 * state and won't corrupt anything by itself. It's ok to let the others 655 * continue for a bit first. 656 * 657 * All the spin loops have timeouts; when a timeout happens a CPU 658 * typically elects itself to be Monarch. 659 */ 660static void mce_reign(void) 661{ 662 int cpu; 663 struct mce *m = NULL; 664 int global_worst = 0; 665 char *msg = NULL; 666 char *nmsg = NULL; 667 668 /* 669 * This CPU is the Monarch and the other CPUs have run 670 * through their handlers. 671 * Grade the severity of the errors of all the CPUs. 672 */ 673 for_each_possible_cpu(cpu) { 674 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 675 &nmsg); 676 if (severity > global_worst) { 677 msg = nmsg; 678 global_worst = severity; 679 m = &per_cpu(mces_seen, cpu); 680 } 681 } 682 683 /* 684 * Cannot recover? Panic here then. 685 * This dumps all the mces in the log buffer and stops the 686 * other CPUs. 687 */ 688 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 689 mce_panic("Fatal Machine check", m, msg); 690 691 /* 692 * For UC somewhere we let the CPU who detects it handle it. 693 * Also must let continue the others, otherwise the handling 694 * CPU could deadlock on a lock. 695 */ 696 697 /* 698 * No machine check event found. Must be some external 699 * source or one CPU is hung. Panic. 700 */ 701 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) 702 mce_panic("Machine check from unknown source", NULL, NULL); 703 704 /* 705 * Now clear all the mces_seen so that they don't reappear on 706 * the next mce. 707 */ 708 for_each_possible_cpu(cpu) 709 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 710} 711 712static atomic_t global_nwo; 713 714/* 715 * Start of Monarch synchronization. This waits until all CPUs have 716 * entered the exception handler and then determines if any of them 717 * saw a fatal event that requires panic. Then it executes them 718 * in the entry order. 719 * TBD double check parallel CPU hotunplug 720 */ 721static int mce_start(int *no_way_out) 722{ 723 int order; 724 int cpus = num_online_cpus(); 725 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 726 727 if (!timeout) 728 return -1; 729 730 atomic_add(*no_way_out, &global_nwo); 731 /* 732 * global_nwo should be updated before mce_callin 733 */ 734 smp_wmb(); 735 order = atomic_inc_return(&mce_callin); 736 737 /* 738 * Wait for everyone. 739 */ 740 while (atomic_read(&mce_callin) != cpus) { 741 if (mce_timed_out(&timeout)) { 742 atomic_set(&global_nwo, 0); 743 return -1; 744 } 745 ndelay(SPINUNIT); 746 } 747 748 /* 749 * mce_callin should be read before global_nwo 750 */ 751 smp_rmb(); 752 753 if (order == 1) { 754 /* 755 * Monarch: Starts executing now, the others wait. 756 */ 757 atomic_set(&mce_executing, 1); 758 } else { 759 /* 760 * Subject: Now start the scanning loop one by one in 761 * the original callin order. 762 * This way when there are any shared banks it will be 763 * only seen by one CPU before cleared, avoiding duplicates. 764 */ 765 while (atomic_read(&mce_executing) < order) { 766 if (mce_timed_out(&timeout)) { 767 atomic_set(&global_nwo, 0); 768 return -1; 769 } 770 ndelay(SPINUNIT); 771 } 772 } 773 774 /* 775 * Cache the global no_way_out state. 776 */ 777 *no_way_out = atomic_read(&global_nwo); 778 779 return order; 780} 781 782/* 783 * Synchronize between CPUs after main scanning loop. 784 * This invokes the bulk of the Monarch processing. 785 */ 786static int mce_end(int order) 787{ 788 int ret = -1; 789 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 790 791 if (!timeout) 792 goto reset; 793 if (order < 0) 794 goto reset; 795 796 /* 797 * Allow others to run. 798 */ 799 atomic_inc(&mce_executing); 800 801 if (order == 1) { 802 /* CHECKME: Can this race with a parallel hotplug? */ 803 int cpus = num_online_cpus(); 804 805 /* 806 * Monarch: Wait for everyone to go through their scanning 807 * loops. 808 */ 809 while (atomic_read(&mce_executing) <= cpus) { 810 if (mce_timed_out(&timeout)) 811 goto reset; 812 ndelay(SPINUNIT); 813 } 814 815 mce_reign(); 816 barrier(); 817 ret = 0; 818 } else { 819 /* 820 * Subject: Wait for Monarch to finish. 821 */ 822 while (atomic_read(&mce_executing) != 0) { 823 if (mce_timed_out(&timeout)) 824 goto reset; 825 ndelay(SPINUNIT); 826 } 827 828 /* 829 * Don't reset anything. That's done by the Monarch. 830 */ 831 return 0; 832 } 833 834 /* 835 * Reset all global state. 836 */ 837reset: 838 atomic_set(&global_nwo, 0); 839 atomic_set(&mce_callin, 0); 840 barrier(); 841 842 /* 843 * Let others run again. 844 */ 845 atomic_set(&mce_executing, 0); 846 return ret; 847} 848 849/* 850 * Check if the address reported by the CPU is in a format we can parse. 851 * It would be possible to add code for most other cases, but all would 852 * be somewhat complicated (e.g. segment offset would require an instruction 853 * parser). So only support physical addresses up to page granuality for now. 854 */ 855static int mce_usable_address(struct mce *m) 856{ 857 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 858 return 0; 859 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) 860 return 0; 861 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) 862 return 0; 863 return 1; 864} 865 866static void mce_clear_state(unsigned long *toclear) 867{ 868 int i; 869 870 for (i = 0; i < banks; i++) { 871 if (test_bit(i, toclear)) 872 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 873 } 874} 875 876/* 877 * The actual machine check handler. This only handles real 878 * exceptions when something got corrupted coming in through int 18. 879 * 880 * This is executed in NMI context not subject to normal locking rules. This 881 * implies that most kernel services cannot be safely used. Don't even 882 * think about putting a printk in there! 883 * 884 * On Intel systems this is entered on all CPUs in parallel through 885 * MCE broadcast. However some CPUs might be broken beyond repair, 886 * so be always careful when synchronizing with others. 887 */ 888void do_machine_check(struct pt_regs *regs, long error_code) 889{ 890 struct mce m, *final; 891 int i; 892 int worst = 0; 893 int severity; 894 /* 895 * Establish sequential order between the CPUs entering the machine 896 * check handler. 897 */ 898 int order; 899 /* 900 * If no_way_out gets set, there is no safe way to recover from this 901 * MCE. If tolerant is cranked up, we'll try anyway. 902 */ 903 int no_way_out = 0; 904 /* 905 * If kill_it gets set, there might be a way to recover from this 906 * error. 907 */ 908 int kill_it = 0; 909 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 910 char *msg = "Unknown"; 911 912 atomic_inc(&mce_entry); 913 914 percpu_inc(mce_exception_count); 915 916 if (notify_die(DIE_NMI, "machine check", regs, error_code, 917 18, SIGKILL) == NOTIFY_STOP) 918 goto out; 919 if (!banks) 920 goto out; 921 922 mce_gather_info(&m, regs); 923 924 final = &__get_cpu_var(mces_seen); 925 *final = m; 926 927 no_way_out = mce_no_way_out(&m, &msg); 928 929 barrier(); 930 931 /* 932 * When no restart IP must always kill or panic. 933 */ 934 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 935 kill_it = 1; 936 937 /* 938 * Go through all the banks in exclusion of the other CPUs. 939 * This way we don't report duplicated events on shared banks 940 * because the first one to see it will clear it. 941 */ 942 order = mce_start(&no_way_out); 943 for (i = 0; i < banks; i++) { 944 __clear_bit(i, toclear); 945 if (!mce_banks[i].ctl) 946 continue; 947 948 m.misc = 0; 949 m.addr = 0; 950 m.bank = i; 951 952 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 953 if ((m.status & MCI_STATUS_VAL) == 0) 954 continue; 955 956 /* 957 * Non uncorrected or non signaled errors are handled by 958 * machine_check_poll. Leave them alone, unless this panics. 959 */ 960 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 961 !no_way_out) 962 continue; 963 964 /* 965 * Set taint even when machine check was not enabled. 966 */ 967 add_taint(TAINT_MACHINE_CHECK); 968 969 severity = mce_severity(&m, tolerant, NULL); 970 971 /* 972 * When machine check was for corrected handler don't touch, 973 * unless we're panicing. 974 */ 975 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 976 continue; 977 __set_bit(i, toclear); 978 if (severity == MCE_NO_SEVERITY) { 979 /* 980 * Machine check event was not enabled. Clear, but 981 * ignore. 982 */ 983 continue; 984 } 985 986 /* 987 * Kill on action required. 988 */ 989 if (severity == MCE_AR_SEVERITY) 990 kill_it = 1; 991 992 if (m.status & MCI_STATUS_MISCV) 993 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 994 if (m.status & MCI_STATUS_ADDRV) 995 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 996 997 /* 998 * Action optional error. Queue address for later processing. 999 * When the ring overflows we just ignore the AO error. 1000 * RED-PEN add some logging mechanism when 1001 * usable_address or mce_add_ring fails. 1002 * RED-PEN don't ignore overflow for tolerant == 0 1003 */ 1004 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1005 mce_ring_add(m.addr >> PAGE_SHIFT); 1006 1007 mce_log(&m); 1008 1009 if (severity > worst) { 1010 *final = m; 1011 worst = severity; 1012 } 1013 } 1014 1015 if (!no_way_out) 1016 mce_clear_state(toclear); 1017 1018 /* 1019 * Do most of the synchronization with other CPUs. 1020 * When there's any problem use only local no_way_out state. 1021 */ 1022 if (mce_end(order) < 0) 1023 no_way_out = worst >= MCE_PANIC_SEVERITY; 1024 1025 /* 1026 * If we have decided that we just CAN'T continue, and the user 1027 * has not set tolerant to an insane level, give up and die. 1028 * 1029 * This is mainly used in the case when the system doesn't 1030 * support MCE broadcasting or it has been disabled. 1031 */ 1032 if (no_way_out && tolerant < 3) 1033 mce_panic("Fatal machine check on current CPU", final, msg); 1034 1035 /* 1036 * If the error seems to be unrecoverable, something should be 1037 * done. Try to kill as little as possible. If we can kill just 1038 * one task, do that. If the user has set the tolerance very 1039 * high, don't try to do anything at all. 1040 */ 1041 1042 if (kill_it && tolerant < 3) 1043 force_sig(SIGBUS, current); 1044 1045 /* notify userspace ASAP */ 1046 set_thread_flag(TIF_MCE_NOTIFY); 1047 1048 if (worst > 0) 1049 mce_report_event(regs); 1050 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1051out: 1052 atomic_dec(&mce_entry); 1053 sync_core(); 1054} 1055EXPORT_SYMBOL_GPL(do_machine_check); 1056 1057/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1058void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1059{ 1060 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1061} 1062 1063/* 1064 * Called after mce notification in process context. This code 1065 * is allowed to sleep. Call the high level VM handler to process 1066 * any corrupted pages. 1067 * Assume that the work queue code only calls this one at a time 1068 * per CPU. 1069 * Note we don't disable preemption, so this code might run on the wrong 1070 * CPU. In this case the event is picked up by the scheduled work queue. 1071 * This is merely a fast path to expedite processing in some common 1072 * cases. 1073 */ 1074void mce_notify_process(void) 1075{ 1076 unsigned long pfn; 1077 mce_notify_irq(); 1078 while (mce_ring_get(&pfn)) 1079 memory_failure(pfn, MCE_VECTOR); 1080} 1081 1082static void mce_process_work(struct work_struct *dummy) 1083{ 1084 mce_notify_process(); 1085} 1086 1087#ifdef CONFIG_X86_MCE_INTEL 1088/*** 1089 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1090 * @cpu: The CPU on which the event occurred. 1091 * @status: Event status information 1092 * 1093 * This function should be called by the thermal interrupt after the 1094 * event has been processed and the decision was made to log the event 1095 * further. 1096 * 1097 * The status parameter will be saved to the 'status' field of 'struct mce' 1098 * and historically has been the register value of the 1099 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1100 */ 1101void mce_log_therm_throt_event(__u64 status) 1102{ 1103 struct mce m; 1104 1105 mce_setup(&m); 1106 m.bank = MCE_THERMAL_BANK; 1107 m.status = status; 1108 mce_log(&m); 1109} 1110#endif /* CONFIG_X86_MCE_INTEL */ 1111 1112/* 1113 * Periodic polling timer for "silent" machine check errors. If the 1114 * poller finds an MCE, poll 2x faster. When the poller finds no more 1115 * errors, poll 2x slower (up to check_interval seconds). 1116 */ 1117static int check_interval = 5 * 60; /* 5 minutes */ 1118 1119static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1120static DEFINE_PER_CPU(struct timer_list, mce_timer); 1121 1122static void mce_start_timer(unsigned long data) 1123{ 1124 struct timer_list *t = &per_cpu(mce_timer, data); 1125 int *n; 1126 1127 WARN_ON(smp_processor_id() != data); 1128 1129 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1130 machine_check_poll(MCP_TIMESTAMP, 1131 &__get_cpu_var(mce_poll_banks)); 1132 } 1133 1134 /* 1135 * Alert userspace if needed. If we logged an MCE, reduce the 1136 * polling interval, otherwise increase the polling interval. 1137 */ 1138 n = &__get_cpu_var(mce_next_interval); 1139 if (mce_notify_irq()) 1140 *n = max(*n/2, HZ/100); 1141 else 1142 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1143 1144 t->expires = jiffies + *n; 1145 add_timer_on(t, smp_processor_id()); 1146} 1147 1148static void mce_do_trigger(struct work_struct *work) 1149{ 1150 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1151} 1152 1153static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1154 1155/* 1156 * Notify the user(s) about new machine check events. 1157 * Can be called from interrupt context, but not from machine check/NMI 1158 * context. 1159 */ 1160int mce_notify_irq(void) 1161{ 1162 /* Not more than two messages every minute */ 1163 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1164 1165 clear_thread_flag(TIF_MCE_NOTIFY); 1166 1167 if (test_and_clear_bit(0, &mce_need_notify)) { 1168 /* wake processes polling /dev/mcelog */ 1169 wake_up_interruptible(&mce_chrdev_wait); 1170 1171 /* 1172 * There is no risk of missing notifications because 1173 * work_pending is always cleared before the function is 1174 * executed. 1175 */ 1176 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1177 schedule_work(&mce_trigger_work); 1178 1179 if (__ratelimit(&ratelimit)) 1180 pr_info(HW_ERR "Machine check events logged\n"); 1181 1182 return 1; 1183 } 1184 return 0; 1185} 1186EXPORT_SYMBOL_GPL(mce_notify_irq); 1187 1188static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1189{ 1190 int i; 1191 1192 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1193 if (!mce_banks) 1194 return -ENOMEM; 1195 for (i = 0; i < banks; i++) { 1196 struct mce_bank *b = &mce_banks[i]; 1197 1198 b->ctl = -1ULL; 1199 b->init = 1; 1200 } 1201 return 0; 1202} 1203 1204/* 1205 * Initialize Machine Checks for a CPU. 1206 */ 1207static int __cpuinit __mcheck_cpu_cap_init(void) 1208{ 1209 unsigned b; 1210 u64 cap; 1211 1212 rdmsrl(MSR_IA32_MCG_CAP, cap); 1213 1214 b = cap & MCG_BANKCNT_MASK; 1215 if (!banks) 1216 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1217 1218 if (b > MAX_NR_BANKS) { 1219 printk(KERN_WARNING 1220 "MCE: Using only %u machine check banks out of %u\n", 1221 MAX_NR_BANKS, b); 1222 b = MAX_NR_BANKS; 1223 } 1224 1225 /* Don't support asymmetric configurations today */ 1226 WARN_ON(banks != 0 && b != banks); 1227 banks = b; 1228 if (!mce_banks) { 1229 int err = __mcheck_cpu_mce_banks_init(); 1230 1231 if (err) 1232 return err; 1233 } 1234 1235 /* Use accurate RIP reporting if available. */ 1236 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1237 rip_msr = MSR_IA32_MCG_EIP; 1238 1239 if (cap & MCG_SER_P) 1240 mce_ser = 1; 1241 1242 return 0; 1243} 1244 1245static void __mcheck_cpu_init_generic(void) 1246{ 1247 mce_banks_t all_banks; 1248 u64 cap; 1249 int i; 1250 1251 /* 1252 * Log the machine checks left over from the previous reset. 1253 */ 1254 bitmap_fill(all_banks, MAX_NR_BANKS); 1255 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1256 1257 set_in_cr4(X86_CR4_MCE); 1258 1259 rdmsrl(MSR_IA32_MCG_CAP, cap); 1260 if (cap & MCG_CTL_P) 1261 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1262 1263 for (i = 0; i < banks; i++) { 1264 struct mce_bank *b = &mce_banks[i]; 1265 1266 if (!b->init) 1267 continue; 1268 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1269 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1270 } 1271} 1272 1273/* Add per CPU specific workarounds here */ 1274static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1275{ 1276 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1277 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1278 return -EOPNOTSUPP; 1279 } 1280 1281 /* This should be disabled by the BIOS, but isn't always */ 1282 if (c->x86_vendor == X86_VENDOR_AMD) { 1283 if (c->x86 == 15 && banks > 4) { 1284 /* 1285 * disable GART TBL walk error reporting, which 1286 * trips off incorrectly with the IOMMU & 3ware 1287 * & Cerberus: 1288 */ 1289 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1290 } 1291 if (c->x86 <= 17 && mce_bootlog < 0) { 1292 /* 1293 * Lots of broken BIOS around that don't clear them 1294 * by default and leave crap in there. Don't log: 1295 */ 1296 mce_bootlog = 0; 1297 } 1298 /* 1299 * Various K7s with broken bank 0 around. Always disable 1300 * by default. 1301 */ 1302 if (c->x86 == 6 && banks > 0) 1303 mce_banks[0].ctl = 0; 1304 } 1305 1306 if (c->x86_vendor == X86_VENDOR_INTEL) { 1307 /* 1308 * SDM documents that on family 6 bank 0 should not be written 1309 * because it aliases to another special BIOS controlled 1310 * register. 1311 * But it's not aliased anymore on model 0x1a+ 1312 * Don't ignore bank 0 completely because there could be a 1313 * valid event later, merely don't write CTL0. 1314 */ 1315 1316 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1317 mce_banks[0].init = 0; 1318 1319 /* 1320 * All newer Intel systems support MCE broadcasting. Enable 1321 * synchronization with a one second timeout. 1322 */ 1323 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1324 monarch_timeout < 0) 1325 monarch_timeout = USEC_PER_SEC; 1326 1327 /* 1328 * There are also broken BIOSes on some Pentium M and 1329 * earlier systems: 1330 */ 1331 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1332 mce_bootlog = 0; 1333 } 1334 if (monarch_timeout < 0) 1335 monarch_timeout = 0; 1336 if (mce_bootlog != 0) 1337 mce_panic_timeout = 30; 1338 1339 return 0; 1340} 1341 1342static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1343{ 1344 if (c->x86 != 5) 1345 return 0; 1346 1347 switch (c->x86_vendor) { 1348 case X86_VENDOR_INTEL: 1349 intel_p5_mcheck_init(c); 1350 return 1; 1351 break; 1352 case X86_VENDOR_CENTAUR: 1353 winchip_mcheck_init(c); 1354 return 1; 1355 break; 1356 } 1357 1358 return 0; 1359} 1360 1361static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1362{ 1363 switch (c->x86_vendor) { 1364 case X86_VENDOR_INTEL: 1365 mce_intel_feature_init(c); 1366 break; 1367 case X86_VENDOR_AMD: 1368 mce_amd_feature_init(c); 1369 break; 1370 default: 1371 break; 1372 } 1373} 1374 1375static void __mcheck_cpu_init_timer(void) 1376{ 1377 struct timer_list *t = &__get_cpu_var(mce_timer); 1378 int *n = &__get_cpu_var(mce_next_interval); 1379 1380 setup_timer(t, mce_start_timer, smp_processor_id()); 1381 1382 if (mce_ignore_ce) 1383 return; 1384 1385 *n = check_interval * HZ; 1386 if (!*n) 1387 return; 1388 t->expires = round_jiffies(jiffies + *n); 1389 add_timer_on(t, smp_processor_id()); 1390} 1391 1392/* Handle unconfigured int18 (should never happen) */ 1393static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1394{ 1395 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1396 smp_processor_id()); 1397} 1398 1399/* Call the installed machine check handler for this CPU setup. */ 1400void (*machine_check_vector)(struct pt_regs *, long error_code) = 1401 unexpected_machine_check; 1402 1403/* 1404 * Called for each booted CPU to set up machine checks. 1405 * Must be called with preempt off: 1406 */ 1407void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1408{ 1409 if (mce_disabled) 1410 return; 1411 1412 if (__mcheck_cpu_ancient_init(c)) 1413 return; 1414 1415 if (!mce_available(c)) 1416 return; 1417 1418 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1419 mce_disabled = 1; 1420 return; 1421 } 1422 1423 machine_check_vector = do_machine_check; 1424 1425 __mcheck_cpu_init_generic(); 1426 __mcheck_cpu_init_vendor(c); 1427 __mcheck_cpu_init_timer(); 1428 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1429 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1430} 1431 1432/* 1433 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. 1434 */ 1435 1436static DEFINE_SPINLOCK(mce_chrdev_state_lock); 1437static int mce_chrdev_open_count; /* #times opened */ 1438static int mce_chrdev_open_exclu; /* already open exclusive? */ 1439 1440static int mce_chrdev_open(struct inode *inode, struct file *file) 1441{ 1442 spin_lock(&mce_chrdev_state_lock); 1443 1444 if (mce_chrdev_open_exclu || 1445 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { 1446 spin_unlock(&mce_chrdev_state_lock); 1447 1448 return -EBUSY; 1449 } 1450 1451 if (file->f_flags & O_EXCL) 1452 mce_chrdev_open_exclu = 1; 1453 mce_chrdev_open_count++; 1454 1455 spin_unlock(&mce_chrdev_state_lock); 1456 1457 return nonseekable_open(inode, file); 1458} 1459 1460static int mce_chrdev_release(struct inode *inode, struct file *file) 1461{ 1462 spin_lock(&mce_chrdev_state_lock); 1463 1464 mce_chrdev_open_count--; 1465 mce_chrdev_open_exclu = 0; 1466 1467 spin_unlock(&mce_chrdev_state_lock); 1468 1469 return 0; 1470} 1471 1472static void collect_tscs(void *data) 1473{ 1474 unsigned long *cpu_tsc = (unsigned long *)data; 1475 1476 rdtscll(cpu_tsc[smp_processor_id()]); 1477} 1478 1479static int mce_apei_read_done; 1480 1481/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ 1482static int __mce_read_apei(char __user **ubuf, size_t usize) 1483{ 1484 int rc; 1485 u64 record_id; 1486 struct mce m; 1487 1488 if (usize < sizeof(struct mce)) 1489 return -EINVAL; 1490 1491 rc = apei_read_mce(&m, &record_id); 1492 /* Error or no more MCE record */ 1493 if (rc <= 0) { 1494 mce_apei_read_done = 1; 1495 return rc; 1496 } 1497 rc = -EFAULT; 1498 if (copy_to_user(*ubuf, &m, sizeof(struct mce))) 1499 return rc; 1500 /* 1501 * In fact, we should have cleared the record after that has 1502 * been flushed to the disk or sent to network in 1503 * /sbin/mcelog, but we have no interface to support that now, 1504 * so just clear it to avoid duplication. 1505 */ 1506 rc = apei_clear_mce(record_id); 1507 if (rc) { 1508 mce_apei_read_done = 1; 1509 return rc; 1510 } 1511 *ubuf += sizeof(struct mce); 1512 1513 return 0; 1514} 1515 1516static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, 1517 size_t usize, loff_t *off) 1518{ 1519 char __user *buf = ubuf; 1520 unsigned long *cpu_tsc; 1521 unsigned prev, next; 1522 int i, err; 1523 1524 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1525 if (!cpu_tsc) 1526 return -ENOMEM; 1527 1528 mutex_lock(&mce_chrdev_read_mutex); 1529 1530 if (!mce_apei_read_done) { 1531 err = __mce_read_apei(&buf, usize); 1532 if (err || buf != ubuf) 1533 goto out; 1534 } 1535 1536 next = rcu_dereference_check_mce(mcelog.next); 1537 1538 /* Only supports full reads right now */ 1539 err = -EINVAL; 1540 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) 1541 goto out; 1542 1543 err = 0; 1544 prev = 0; 1545 do { 1546 for (i = prev; i < next; i++) { 1547 unsigned long start = jiffies; 1548 struct mce *m = &mcelog.entry[i]; 1549 1550 while (!m->finished) { 1551 if (time_after_eq(jiffies, start + 2)) { 1552 memset(m, 0, sizeof(*m)); 1553 goto timeout; 1554 } 1555 cpu_relax(); 1556 } 1557 smp_rmb(); 1558 err |= copy_to_user(buf, m, sizeof(*m)); 1559 buf += sizeof(*m); 1560timeout: 1561 ; 1562 } 1563 1564 memset(mcelog.entry + prev, 0, 1565 (next - prev) * sizeof(struct mce)); 1566 prev = next; 1567 next = cmpxchg(&mcelog.next, prev, 0); 1568 } while (next != prev); 1569 1570 synchronize_sched(); 1571 1572 /* 1573 * Collect entries that were still getting written before the 1574 * synchronize. 1575 */ 1576 on_each_cpu(collect_tscs, cpu_tsc, 1); 1577 1578 for (i = next; i < MCE_LOG_LEN; i++) { 1579 struct mce *m = &mcelog.entry[i]; 1580 1581 if (m->finished && m->tsc < cpu_tsc[m->cpu]) { 1582 err |= copy_to_user(buf, m, sizeof(*m)); 1583 smp_rmb(); 1584 buf += sizeof(*m); 1585 memset(m, 0, sizeof(*m)); 1586 } 1587 } 1588 1589 if (err) 1590 err = -EFAULT; 1591 1592out: 1593 mutex_unlock(&mce_chrdev_read_mutex); 1594 kfree(cpu_tsc); 1595 1596 return err ? err : buf - ubuf; 1597} 1598 1599static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) 1600{ 1601 poll_wait(file, &mce_chrdev_wait, wait); 1602 if (rcu_access_index(mcelog.next)) 1603 return POLLIN | POLLRDNORM; 1604 if (!mce_apei_read_done && apei_check_mce()) 1605 return POLLIN | POLLRDNORM; 1606 return 0; 1607} 1608 1609static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, 1610 unsigned long arg) 1611{ 1612 int __user *p = (int __user *)arg; 1613 1614 if (!capable(CAP_SYS_ADMIN)) 1615 return -EPERM; 1616 1617 switch (cmd) { 1618 case MCE_GET_RECORD_LEN: 1619 return put_user(sizeof(struct mce), p); 1620 case MCE_GET_LOG_LEN: 1621 return put_user(MCE_LOG_LEN, p); 1622 case MCE_GETCLEAR_FLAGS: { 1623 unsigned flags; 1624 1625 do { 1626 flags = mcelog.flags; 1627 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1628 1629 return put_user(flags, p); 1630 } 1631 default: 1632 return -ENOTTY; 1633 } 1634} 1635 1636/* Modified in mce-inject.c, so not static or const */ 1637struct file_operations mce_chrdev_ops = { 1638 .open = mce_chrdev_open, 1639 .release = mce_chrdev_release, 1640 .read = mce_chrdev_read, 1641 .poll = mce_chrdev_poll, 1642 .unlocked_ioctl = mce_chrdev_ioctl, 1643 .llseek = no_llseek, 1644}; 1645EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1646 1647static struct miscdevice mce_chrdev_device = { 1648 MISC_MCELOG_MINOR, 1649 "mcelog", 1650 &mce_chrdev_ops, 1651}; 1652 1653/* 1654 * mce=off Disables machine check 1655 * mce=no_cmci Disables CMCI 1656 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1657 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1658 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1659 * monarchtimeout is how long to wait for other CPUs on machine 1660 * check, or 0 to not wait 1661 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1662 * mce=nobootlog Don't log MCEs from before booting. 1663 */ 1664static int __init mcheck_enable(char *str) 1665{ 1666 if (*str == 0) { 1667 enable_p5_mce(); 1668 return 1; 1669 } 1670 if (*str == '=') 1671 str++; 1672 if (!strcmp(str, "off")) 1673 mce_disabled = 1; 1674 else if (!strcmp(str, "no_cmci")) 1675 mce_cmci_disabled = 1; 1676 else if (!strcmp(str, "dont_log_ce")) 1677 mce_dont_log_ce = 1; 1678 else if (!strcmp(str, "ignore_ce")) 1679 mce_ignore_ce = 1; 1680 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1681 mce_bootlog = (str[0] == 'b'); 1682 else if (isdigit(str[0])) { 1683 get_option(&str, &tolerant); 1684 if (*str == ',') { 1685 ++str; 1686 get_option(&str, &monarch_timeout); 1687 } 1688 } else { 1689 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1690 str); 1691 return 0; 1692 } 1693 return 1; 1694} 1695__setup("mce", mcheck_enable); 1696 1697int __init mcheck_init(void) 1698{ 1699 mcheck_intel_therm_init(); 1700 1701 return 0; 1702} 1703 1704/* 1705 * mce_syscore: PM support 1706 */ 1707 1708/* 1709 * Disable machine checks on suspend and shutdown. We can't really handle 1710 * them later. 1711 */ 1712static int mce_disable_error_reporting(void) 1713{ 1714 int i; 1715 1716 for (i = 0; i < banks; i++) { 1717 struct mce_bank *b = &mce_banks[i]; 1718 1719 if (b->init) 1720 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 1721 } 1722 return 0; 1723} 1724 1725static int mce_syscore_suspend(void) 1726{ 1727 return mce_disable_error_reporting(); 1728} 1729 1730static void mce_syscore_shutdown(void) 1731{ 1732 mce_disable_error_reporting(); 1733} 1734 1735/* 1736 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1737 * Only one CPU is active at this time, the others get re-added later using 1738 * CPU hotplug: 1739 */ 1740static void mce_syscore_resume(void) 1741{ 1742 __mcheck_cpu_init_generic(); 1743 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1744} 1745 1746static struct syscore_ops mce_syscore_ops = { 1747 .suspend = mce_syscore_suspend, 1748 .shutdown = mce_syscore_shutdown, 1749 .resume = mce_syscore_resume, 1750}; 1751 1752/* 1753 * mce_sysdev: Sysfs support 1754 */ 1755 1756static void mce_cpu_restart(void *data) 1757{ 1758 del_timer_sync(&__get_cpu_var(mce_timer)); 1759 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1760 return; 1761 __mcheck_cpu_init_generic(); 1762 __mcheck_cpu_init_timer(); 1763} 1764 1765/* Reinit MCEs after user configuration changes */ 1766static void mce_restart(void) 1767{ 1768 on_each_cpu(mce_cpu_restart, NULL, 1); 1769} 1770 1771/* Toggle features for corrected errors */ 1772static void mce_disable_ce(void *all) 1773{ 1774 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1775 return; 1776 if (all) 1777 del_timer_sync(&__get_cpu_var(mce_timer)); 1778 cmci_clear(); 1779} 1780 1781static void mce_enable_ce(void *all) 1782{ 1783 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1784 return; 1785 cmci_reenable(); 1786 cmci_recheck(); 1787 if (all) 1788 __mcheck_cpu_init_timer(); 1789} 1790 1791static struct sysdev_class mce_sysdev_class = { 1792 .name = "machinecheck", 1793}; 1794 1795DEFINE_PER_CPU(struct sys_device, mce_sysdev); 1796 1797__cpuinitdata 1798void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1799 1800static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) 1801{ 1802 return container_of(attr, struct mce_bank, attr); 1803} 1804 1805static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1806 char *buf) 1807{ 1808 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1809} 1810 1811static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1812 const char *buf, size_t size) 1813{ 1814 u64 new; 1815 1816 if (strict_strtoull(buf, 0, &new) < 0) 1817 return -EINVAL; 1818 1819 attr_to_bank(attr)->ctl = new; 1820 mce_restart(); 1821 1822 return size; 1823} 1824 1825static ssize_t 1826show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1827{ 1828 strcpy(buf, mce_helper); 1829 strcat(buf, "\n"); 1830 return strlen(mce_helper) + 1; 1831} 1832 1833static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1834 const char *buf, size_t siz) 1835{ 1836 char *p; 1837 1838 strncpy(mce_helper, buf, sizeof(mce_helper)); 1839 mce_helper[sizeof(mce_helper)-1] = 0; 1840 p = strchr(mce_helper, '\n'); 1841 1842 if (p) 1843 *p = 0; 1844 1845 return strlen(mce_helper) + !!p; 1846} 1847 1848static ssize_t set_ignore_ce(struct sys_device *s, 1849 struct sysdev_attribute *attr, 1850 const char *buf, size_t size) 1851{ 1852 u64 new; 1853 1854 if (strict_strtoull(buf, 0, &new) < 0) 1855 return -EINVAL; 1856 1857 if (mce_ignore_ce ^ !!new) { 1858 if (new) { 1859 /* disable ce features */ 1860 on_each_cpu(mce_disable_ce, (void *)1, 1); 1861 mce_ignore_ce = 1; 1862 } else { 1863 /* enable ce features */ 1864 mce_ignore_ce = 0; 1865 on_each_cpu(mce_enable_ce, (void *)1, 1); 1866 } 1867 } 1868 return size; 1869} 1870 1871static ssize_t set_cmci_disabled(struct sys_device *s, 1872 struct sysdev_attribute *attr, 1873 const char *buf, size_t size) 1874{ 1875 u64 new; 1876 1877 if (strict_strtoull(buf, 0, &new) < 0) 1878 return -EINVAL; 1879 1880 if (mce_cmci_disabled ^ !!new) { 1881 if (new) { 1882 /* disable cmci */ 1883 on_each_cpu(mce_disable_ce, NULL, 1); 1884 mce_cmci_disabled = 1; 1885 } else { 1886 /* enable cmci */ 1887 mce_cmci_disabled = 0; 1888 on_each_cpu(mce_enable_ce, NULL, 1); 1889 } 1890 } 1891 return size; 1892} 1893 1894static ssize_t store_int_with_restart(struct sys_device *s, 1895 struct sysdev_attribute *attr, 1896 const char *buf, size_t size) 1897{ 1898 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1899 mce_restart(); 1900 return ret; 1901} 1902 1903static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1904static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1905static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1906static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 1907 1908static struct sysdev_ext_attribute attr_check_interval = { 1909 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1910 store_int_with_restart), 1911 &check_interval 1912}; 1913 1914static struct sysdev_ext_attribute attr_ignore_ce = { 1915 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), 1916 &mce_ignore_ce 1917}; 1918 1919static struct sysdev_ext_attribute attr_cmci_disabled = { 1920 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), 1921 &mce_cmci_disabled 1922}; 1923 1924static struct sysdev_attribute *mce_sysdev_attrs[] = { 1925 &attr_tolerant.attr, 1926 &attr_check_interval.attr, 1927 &attr_trigger, 1928 &attr_monarch_timeout.attr, 1929 &attr_dont_log_ce.attr, 1930 &attr_ignore_ce.attr, 1931 &attr_cmci_disabled.attr, 1932 NULL 1933}; 1934 1935static cpumask_var_t mce_sysdev_initialized; 1936 1937/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1938static __cpuinit int mce_sysdev_create(unsigned int cpu) 1939{ 1940 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 1941 int err; 1942 int i, j; 1943 1944 if (!mce_available(&boot_cpu_data)) 1945 return -EIO; 1946 1947 memset(&sysdev->kobj, 0, sizeof(struct kobject)); 1948 sysdev->id = cpu; 1949 sysdev->cls = &mce_sysdev_class; 1950 1951 err = sysdev_register(sysdev); 1952 if (err) 1953 return err; 1954 1955 for (i = 0; mce_sysdev_attrs[i]; i++) { 1956 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); 1957 if (err) 1958 goto error; 1959 } 1960 for (j = 0; j < banks; j++) { 1961 err = sysdev_create_file(sysdev, &mce_banks[j].attr); 1962 if (err) 1963 goto error2; 1964 } 1965 cpumask_set_cpu(cpu, mce_sysdev_initialized); 1966 1967 return 0; 1968error2: 1969 while (--j >= 0) 1970 sysdev_remove_file(sysdev, &mce_banks[j].attr); 1971error: 1972 while (--i >= 0) 1973 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 1974 1975 sysdev_unregister(sysdev); 1976 1977 return err; 1978} 1979 1980static __cpuinit void mce_sysdev_remove(unsigned int cpu) 1981{ 1982 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 1983 int i; 1984 1985 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) 1986 return; 1987 1988 for (i = 0; mce_sysdev_attrs[i]; i++) 1989 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 1990 1991 for (i = 0; i < banks; i++) 1992 sysdev_remove_file(sysdev, &mce_banks[i].attr); 1993 1994 sysdev_unregister(sysdev); 1995 cpumask_clear_cpu(cpu, mce_sysdev_initialized); 1996} 1997 1998/* Make sure there are no machine checks on offlined CPUs. */ 1999static void __cpuinit mce_disable_cpu(void *h) 2000{ 2001 unsigned long action = *(unsigned long *)h; 2002 int i; 2003 2004 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2005 return; 2006 2007 if (!(action & CPU_TASKS_FROZEN)) 2008 cmci_clear(); 2009 for (i = 0; i < banks; i++) { 2010 struct mce_bank *b = &mce_banks[i]; 2011 2012 if (b->init) 2013 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 2014 } 2015} 2016 2017static void __cpuinit mce_reenable_cpu(void *h) 2018{ 2019 unsigned long action = *(unsigned long *)h; 2020 int i; 2021 2022 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2023 return; 2024 2025 if (!(action & CPU_TASKS_FROZEN)) 2026 cmci_reenable(); 2027 for (i = 0; i < banks; i++) { 2028 struct mce_bank *b = &mce_banks[i]; 2029 2030 if (b->init) 2031 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 2032 } 2033} 2034 2035/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 2036static int __cpuinit 2037mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 2038{ 2039 unsigned int cpu = (unsigned long)hcpu; 2040 struct timer_list *t = &per_cpu(mce_timer, cpu); 2041 2042 switch (action) { 2043 case CPU_ONLINE: 2044 case CPU_ONLINE_FROZEN: 2045 mce_sysdev_create(cpu); 2046 if (threshold_cpu_callback) 2047 threshold_cpu_callback(action, cpu); 2048 break; 2049 case CPU_DEAD: 2050 case CPU_DEAD_FROZEN: 2051 if (threshold_cpu_callback) 2052 threshold_cpu_callback(action, cpu); 2053 mce_sysdev_remove(cpu); 2054 break; 2055 case CPU_DOWN_PREPARE: 2056 case CPU_DOWN_PREPARE_FROZEN: 2057 del_timer_sync(t); 2058 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2059 break; 2060 case CPU_DOWN_FAILED: 2061 case CPU_DOWN_FAILED_FROZEN: 2062 if (!mce_ignore_ce && check_interval) { 2063 t->expires = round_jiffies(jiffies + 2064 __get_cpu_var(mce_next_interval)); 2065 add_timer_on(t, cpu); 2066 } 2067 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2068 break; 2069 case CPU_POST_DEAD: 2070 /* intentionally ignoring frozen here */ 2071 cmci_rediscover(cpu); 2072 break; 2073 } 2074 return NOTIFY_OK; 2075} 2076 2077static struct notifier_block mce_cpu_notifier __cpuinitdata = { 2078 .notifier_call = mce_cpu_callback, 2079}; 2080 2081static __init void mce_init_banks(void) 2082{ 2083 int i; 2084 2085 for (i = 0; i < banks; i++) { 2086 struct mce_bank *b = &mce_banks[i]; 2087 struct sysdev_attribute *a = &b->attr; 2088 2089 sysfs_attr_init(&a->attr); 2090 a->attr.name = b->attrname; 2091 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2092 2093 a->attr.mode = 0644; 2094 a->show = show_bank; 2095 a->store = set_bank; 2096 } 2097} 2098 2099static __init int mcheck_init_device(void) 2100{ 2101 int err; 2102 int i = 0; 2103 2104 if (!mce_available(&boot_cpu_data)) 2105 return -EIO; 2106 2107 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); 2108 2109 mce_init_banks(); 2110 2111 err = sysdev_class_register(&mce_sysdev_class); 2112 if (err) 2113 return err; 2114 2115 for_each_online_cpu(i) { 2116 err = mce_sysdev_create(i); 2117 if (err) 2118 return err; 2119 } 2120 2121 register_syscore_ops(&mce_syscore_ops); 2122 register_hotcpu_notifier(&mce_cpu_notifier); 2123 2124 /* register character device /dev/mcelog */ 2125 misc_register(&mce_chrdev_device); 2126 2127 return err; 2128} 2129device_initcall(mcheck_init_device); 2130 2131/* 2132 * Old style boot options parsing. Only for compatibility. 2133 */ 2134static int __init mcheck_disable(char *str) 2135{ 2136 mce_disabled = 1; 2137 return 1; 2138} 2139__setup("nomce", mcheck_disable); 2140 2141#ifdef CONFIG_DEBUG_FS 2142struct dentry *mce_get_debugfs_dir(void) 2143{ 2144 static struct dentry *dmce; 2145 2146 if (!dmce) 2147 dmce = debugfs_create_dir("mce", NULL); 2148 2149 return dmce; 2150} 2151 2152static void mce_reset(void) 2153{ 2154 cpu_missing = 0; 2155 atomic_set(&mce_fake_paniced, 0); 2156 atomic_set(&mce_executing, 0); 2157 atomic_set(&mce_callin, 0); 2158 atomic_set(&global_nwo, 0); 2159} 2160 2161static int fake_panic_get(void *data, u64 *val) 2162{ 2163 *val = fake_panic; 2164 return 0; 2165} 2166 2167static int fake_panic_set(void *data, u64 val) 2168{ 2169 mce_reset(); 2170 fake_panic = val; 2171 return 0; 2172} 2173 2174DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2175 fake_panic_set, "%llu\n"); 2176 2177static int __init mcheck_debugfs_init(void) 2178{ 2179 struct dentry *dmce, *ffake_panic; 2180 2181 dmce = mce_get_debugfs_dir(); 2182 if (!dmce) 2183 return -ENOMEM; 2184 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, 2185 &fake_panic_fops); 2186 if (!ffake_panic) 2187 return -ENOMEM; 2188 2189 return 0; 2190} 2191late_initcall(mcheck_debugfs_init); 2192#endif 2193