/* mce.c — arch/x86/kernel/cpu/mcheck/mce.c, revision 3a97fc34130326da87b20de5d0259c35406707ce */
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/kobject.h> 17#include <linux/uaccess.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/syscore_ops.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/slab.h> 30#include <linux/init.h> 31#include <linux/kmod.h> 32#include <linux/poll.h> 33#include <linux/nmi.h> 34#include <linux/cpu.h> 35#include <linux/smp.h> 36#include <linux/fs.h> 37#include <linux/mm.h> 38#include <linux/debugfs.h> 39#include <linux/edac_mce.h> 40#include <linux/irq_work.h> 41 42#include <asm/processor.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47 48static DEFINE_MUTEX(mce_read_mutex); 49 50#define rcu_dereference_check_mce(p) \ 51 rcu_dereference_index_check((p), \ 52 rcu_read_lock_sched_held() || \ 53 lockdep_is_held(&mce_read_mutex)) 54 55#define CREATE_TRACE_POINTS 56#include <trace/events/mce.h> 57 58int mce_disabled __read_mostly; 59 60#define MISC_MCELOG_MINOR 227 61 62#define SPINUNIT 100 /* 100ns */ 63 64atomic_t mce_entry; 65 66DEFINE_PER_CPU(unsigned, mce_exception_count); 67 68/* 69 * Tolerant levels: 70 * 0: always panic on uncorrected errors, log corrected errors 71 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 72 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 73 * 3: never panic or SIGBUS, log all errors (for testing only) 74 */ 75static int tolerant 
__read_mostly = 1; 76static int banks __read_mostly; 77static int rip_msr __read_mostly; 78static int mce_bootlog __read_mostly = -1; 79static int monarch_timeout __read_mostly = -1; 80static int mce_panic_timeout __read_mostly; 81static int mce_dont_log_ce __read_mostly; 82int mce_cmci_disabled __read_mostly; 83int mce_ignore_ce __read_mostly; 84int mce_ser __read_mostly; 85 86struct mce_bank *mce_banks __read_mostly; 87 88/* User mode helper program triggered by machine check event */ 89static unsigned long mce_need_notify; 90static char mce_helper[128]; 91static char *mce_helper_argv[2] = { mce_helper, NULL }; 92 93static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 94static DEFINE_PER_CPU(struct mce, mces_seen); 95static int cpu_missing; 96 97/* 98 * CPU/chipset specific EDAC code can register a notifier call here to print 99 * MCE errors in a human-readable form. 100 */ 101ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 102EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 103 104/* MCA banks polled by the period polling timer for corrected events */ 105DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 106 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 107}; 108 109static DEFINE_PER_CPU(struct work_struct, mce_work); 110 111/* Do initial initialization of a struct mce */ 112void mce_setup(struct mce *m) 113{ 114 memset(m, 0, sizeof(struct mce)); 115 m->cpu = m->extcpu = smp_processor_id(); 116 rdtscll(m->tsc); 117 /* We hope get_seconds stays lockless */ 118 m->time = get_seconds(); 119 m->cpuvendor = boot_cpu_data.x86_vendor; 120 m->cpuid = cpuid_eax(1); 121#ifdef CONFIG_SMP 122 m->socketid = cpu_data(m->extcpu).phys_proc_id; 123#endif 124 m->apicid = cpu_data(m->extcpu).initial_apicid; 125 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 126} 127 128DEFINE_PER_CPU(struct mce, injectm); 129EXPORT_PER_CPU_SYMBOL_GPL(injectm); 130 131/* 132 * Lockless MCE logging infrastructure. 133 * This avoids deadlocks on printk locks without having to break locks. 
Also 134 * separate MCEs from kernel messages to avoid bogus bug reports. 135 */ 136 137static struct mce_log mcelog = { 138 .signature = MCE_LOG_SIGNATURE, 139 .len = MCE_LOG_LEN, 140 .recordlen = sizeof(struct mce), 141}; 142 143void mce_log(struct mce *mce) 144{ 145 unsigned next, entry; 146 147 /* Emit the trace record: */ 148 trace_mce_record(mce); 149 150 mce->finished = 0; 151 wmb(); 152 for (;;) { 153 entry = rcu_dereference_check_mce(mcelog.next); 154 for (;;) { 155 /* 156 * If edac_mce is enabled, it will check the error type 157 * and will process it, if it is a known error. 158 * Otherwise, the error will be sent through mcelog 159 * interface 160 */ 161 if (edac_mce_parse(mce)) 162 return; 163 164 /* 165 * When the buffer fills up discard new entries. 166 * Assume that the earlier errors are the more 167 * interesting ones: 168 */ 169 if (entry >= MCE_LOG_LEN) { 170 set_bit(MCE_OVERFLOW, 171 (unsigned long *)&mcelog.flags); 172 return; 173 } 174 /* Old left over entry. Skip: */ 175 if (mcelog.entry[entry].finished) { 176 entry++; 177 continue; 178 } 179 break; 180 } 181 smp_rmb(); 182 next = entry + 1; 183 if (cmpxchg(&mcelog.next, entry, next) == entry) 184 break; 185 } 186 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 187 wmb(); 188 mcelog.entry[entry].finished = 1; 189 wmb(); 190 191 mce->finished = 1; 192 set_bit(0, &mce_need_notify); 193} 194 195static void print_mce(struct mce *m) 196{ 197 int ret = 0; 198 199 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 200 m->extcpu, m->mcgstatus, m->bank, m->status); 201 202 if (m->ip) { 203 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 204 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", 205 m->cs, m->ip); 206 207 if (m->cs == __KERNEL_CS) 208 print_symbol("{%s}", m->ip); 209 pr_cont("\n"); 210 } 211 212 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 213 if (m->addr) 214 pr_cont("ADDR %llx ", m->addr); 215 if (m->misc) 216 pr_cont("MISC %llx ", m->misc); 217 218 pr_cont("\n"); 219 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 220 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 221 222 /* 223 * Print out human-readable details about the MCE error, 224 * (if the CPU has an implementation for that) 225 */ 226 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 227 if (ret == NOTIFY_STOP) 228 return; 229 230 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 231} 232 233#define PANIC_TIMEOUT 5 /* 5 seconds */ 234 235static atomic_t mce_paniced; 236 237static int fake_panic; 238static atomic_t mce_fake_paniced; 239 240/* Panic in progress. Enable interrupts and wait for final IPI */ 241static void wait_for_panic(void) 242{ 243 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 244 245 preempt_disable(); 246 local_irq_enable(); 247 while (timeout-- > 0) 248 udelay(1); 249 if (panic_timeout == 0) 250 panic_timeout = mce_panic_timeout; 251 panic("Panicing machine check CPU died"); 252} 253 254static void mce_panic(char *msg, struct mce *final, char *exp) 255{ 256 int i, apei_err = 0; 257 258 if (!fake_panic) { 259 /* 260 * Make sure only one CPU runs in machine check panic 261 */ 262 if (atomic_inc_return(&mce_paniced) > 1) 263 wait_for_panic(); 264 barrier(); 265 266 bust_spinlocks(1); 267 console_verbose(); 268 } else { 269 /* Don't log too much for fake panic */ 270 if (atomic_inc_return(&mce_fake_paniced) > 1) 271 return; 272 } 273 /* First print corrected ones that are still unlogged */ 274 for (i = 0; i < MCE_LOG_LEN; i++) { 275 struct mce *m = &mcelog.entry[i]; 276 if (!(m->status & MCI_STATUS_VAL)) 277 continue; 278 if (!(m->status & MCI_STATUS_UC)) { 279 print_mce(m); 280 if (!apei_err) 281 
apei_err = apei_write_mce(m); 282 } 283 } 284 /* Now print uncorrected but with the final one last */ 285 for (i = 0; i < MCE_LOG_LEN; i++) { 286 struct mce *m = &mcelog.entry[i]; 287 if (!(m->status & MCI_STATUS_VAL)) 288 continue; 289 if (!(m->status & MCI_STATUS_UC)) 290 continue; 291 if (!final || memcmp(m, final, sizeof(struct mce))) { 292 print_mce(m); 293 if (!apei_err) 294 apei_err = apei_write_mce(m); 295 } 296 } 297 if (final) { 298 print_mce(final); 299 if (!apei_err) 300 apei_err = apei_write_mce(final); 301 } 302 if (cpu_missing) 303 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 304 if (exp) 305 pr_emerg(HW_ERR "Machine check: %s\n", exp); 306 if (!fake_panic) { 307 if (panic_timeout == 0) 308 panic_timeout = mce_panic_timeout; 309 panic(msg); 310 } else 311 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 312} 313 314/* Support code for software error injection */ 315 316static int msr_to_offset(u32 msr) 317{ 318 unsigned bank = __this_cpu_read(injectm.bank); 319 320 if (msr == rip_msr) 321 return offsetof(struct mce, ip); 322 if (msr == MSR_IA32_MCx_STATUS(bank)) 323 return offsetof(struct mce, status); 324 if (msr == MSR_IA32_MCx_ADDR(bank)) 325 return offsetof(struct mce, addr); 326 if (msr == MSR_IA32_MCx_MISC(bank)) 327 return offsetof(struct mce, misc); 328 if (msr == MSR_IA32_MCG_STATUS) 329 return offsetof(struct mce, mcgstatus); 330 return -1; 331} 332 333/* MSR access wrappers used for error injection */ 334static u64 mce_rdmsrl(u32 msr) 335{ 336 u64 v; 337 338 if (__this_cpu_read(injectm.finished)) { 339 int offset = msr_to_offset(msr); 340 341 if (offset < 0) 342 return 0; 343 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 344 } 345 346 if (rdmsrl_safe(msr, &v)) { 347 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 348 /* 349 * Return zero in case the access faulted. This should 350 * not happen normally but can happen if the CPU does 351 * something weird, or if the code is buggy. 
352 */ 353 v = 0; 354 } 355 356 return v; 357} 358 359static void mce_wrmsrl(u32 msr, u64 v) 360{ 361 if (__this_cpu_read(injectm.finished)) { 362 int offset = msr_to_offset(msr); 363 364 if (offset >= 0) 365 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 366 return; 367 } 368 wrmsrl(msr, v); 369} 370 371/* 372 * Collect all global (w.r.t. this processor) status about this machine 373 * check into our "mce" struct so that we can use it later to assess 374 * the severity of the problem as we read per-bank specific details. 375 */ 376static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 377{ 378 mce_setup(m); 379 380 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 381 if (regs) { 382 /* 383 * Get the address of the instruction at the time of 384 * the machine check error. 385 */ 386 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 387 m->ip = regs->ip; 388 m->cs = regs->cs; 389 } 390 /* Use accurate RIP reporting if available. */ 391 if (rip_msr) 392 m->ip = mce_rdmsrl(rip_msr); 393 } 394} 395 396/* 397 * Simple lockless ring to communicate PFNs from the exception handler with the 398 * process context work function. This is vastly simplified because there's 399 * only a single reader and a single writer. 
400 */ 401#define MCE_RING_SIZE 16 /* we use one entry less */ 402 403struct mce_ring { 404 unsigned short start; 405 unsigned short end; 406 unsigned long ring[MCE_RING_SIZE]; 407}; 408static DEFINE_PER_CPU(struct mce_ring, mce_ring); 409 410/* Runs with CPU affinity in workqueue */ 411static int mce_ring_empty(void) 412{ 413 struct mce_ring *r = &__get_cpu_var(mce_ring); 414 415 return r->start == r->end; 416} 417 418static int mce_ring_get(unsigned long *pfn) 419{ 420 struct mce_ring *r; 421 int ret = 0; 422 423 *pfn = 0; 424 get_cpu(); 425 r = &__get_cpu_var(mce_ring); 426 if (r->start == r->end) 427 goto out; 428 *pfn = r->ring[r->start]; 429 r->start = (r->start + 1) % MCE_RING_SIZE; 430 ret = 1; 431out: 432 put_cpu(); 433 return ret; 434} 435 436/* Always runs in MCE context with preempt off */ 437static int mce_ring_add(unsigned long pfn) 438{ 439 struct mce_ring *r = &__get_cpu_var(mce_ring); 440 unsigned next; 441 442 next = (r->end + 1) % MCE_RING_SIZE; 443 if (next == r->start) 444 return -1; 445 r->ring[r->end] = pfn; 446 wmb(); 447 r->end = next; 448 return 0; 449} 450 451int mce_available(struct cpuinfo_x86 *c) 452{ 453 if (mce_disabled) 454 return 0; 455 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 456} 457 458static void mce_schedule_work(void) 459{ 460 if (!mce_ring_empty()) { 461 struct work_struct *work = &__get_cpu_var(mce_work); 462 if (!work_pending(work)) 463 schedule_work(work); 464 } 465} 466 467DEFINE_PER_CPU(struct irq_work, mce_irq_work); 468 469static void mce_irq_work_cb(struct irq_work *entry) 470{ 471 mce_notify_irq(); 472 mce_schedule_work(); 473} 474 475static void mce_report_event(struct pt_regs *regs) 476{ 477 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 478 mce_notify_irq(); 479 /* 480 * Triggering the work queue here is just an insurance 481 * policy in case the syscall exit notify handler 482 * doesn't run soon enough or ends up running on the 483 * wrong CPU (can happen when audit sleeps) 484 */ 485 
mce_schedule_work(); 486 return; 487 } 488 489 irq_work_queue(&__get_cpu_var(mce_irq_work)); 490} 491 492DEFINE_PER_CPU(unsigned, mce_poll_count); 493 494/* 495 * Poll for corrected events or events that happened before reset. 496 * Those are just logged through /dev/mcelog. 497 * 498 * This is executed in standard interrupt context. 499 * 500 * Note: spec recommends to panic for fatal unsignalled 501 * errors here. However this would be quite problematic -- 502 * we would need to reimplement the Monarch handling and 503 * it would mess up the exclusion between exception handler 504 * and poll hander -- * so we skip this for now. 505 * These cases should not happen anyways, or only when the CPU 506 * is already totally * confused. In this case it's likely it will 507 * not fully execute the machine check handler either. 508 */ 509void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 510{ 511 struct mce m; 512 int i; 513 514 percpu_inc(mce_poll_count); 515 516 mce_gather_info(&m, NULL); 517 518 for (i = 0; i < banks; i++) { 519 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 continue; 521 522 m.misc = 0; 523 m.addr = 0; 524 m.bank = i; 525 m.tsc = 0; 526 527 barrier(); 528 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 529 if (!(m.status & MCI_STATUS_VAL)) 530 continue; 531 532 /* 533 * Uncorrected or signalled events are handled by the exception 534 * handler when it is enabled, so don't process those here. 535 * 536 * TBD do the same check for MCI_STATUS_EN here? 537 */ 538 if (!(flags & MCP_UC) && 539 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 540 continue; 541 542 if (m.status & MCI_STATUS_MISCV) 543 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 544 if (m.status & MCI_STATUS_ADDRV) 545 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 546 547 if (!(flags & MCP_TIMESTAMP)) 548 m.tsc = 0; 549 /* 550 * Don't get the IP here because it's unlikely to 551 * have anything to do with the actual error location. 
552 */ 553 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 554 mce_log(&m); 555 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 556 } 557 558 /* 559 * Clear state for this bank. 560 */ 561 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 562 } 563 564 /* 565 * Don't clear MCG_STATUS here because it's only defined for 566 * exceptions. 567 */ 568 569 sync_core(); 570} 571EXPORT_SYMBOL_GPL(machine_check_poll); 572 573/* 574 * Do a quick check if any of the events requires a panic. 575 * This decides if we keep the events around or clear them. 576 */ 577static int mce_no_way_out(struct mce *m, char **msg) 578{ 579 int i; 580 581 for (i = 0; i < banks; i++) { 582 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 583 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 584 return 1; 585 } 586 return 0; 587} 588 589/* 590 * Variable to establish order between CPUs while scanning. 591 * Each CPU spins initially until executing is equal its number. 592 */ 593static atomic_t mce_executing; 594 595/* 596 * Defines order of CPUs on entry. First CPU becomes Monarch. 597 */ 598static atomic_t mce_callin; 599 600/* 601 * Check if a timeout waiting for other CPUs happened. 602 */ 603static int mce_timed_out(u64 *t) 604{ 605 /* 606 * The others already did panic for some reason. 607 * Bail out like in a timeout. 608 * rmb() to tell the compiler that system_state 609 * might have been modified by someone else. 610 */ 611 rmb(); 612 if (atomic_read(&mce_paniced)) 613 wait_for_panic(); 614 if (!monarch_timeout) 615 goto out; 616 if ((s64)*t < SPINUNIT) { 617 /* CHECKME: Make panic default for 1 too? */ 618 if (tolerant < 1) 619 mce_panic("Timeout synchronizing machine check over CPUs", 620 NULL, NULL); 621 cpu_missing = 1; 622 return 1; 623 } 624 *t -= SPINUNIT; 625out: 626 touch_nmi_watchdog(); 627 return 0; 628} 629 630/* 631 * The Monarch's reign. The Monarch is the CPU who entered 632 * the machine check handler first. 
It waits for the others to 633 * raise the exception too and then grades them. When any 634 * error is fatal panic. Only then let the others continue. 635 * 636 * The other CPUs entering the MCE handler will be controlled by the 637 * Monarch. They are called Subjects. 638 * 639 * This way we prevent any potential data corruption in a unrecoverable case 640 * and also makes sure always all CPU's errors are examined. 641 * 642 * Also this detects the case of a machine check event coming from outer 643 * space (not detected by any CPUs) In this case some external agent wants 644 * us to shut down, so panic too. 645 * 646 * The other CPUs might still decide to panic if the handler happens 647 * in a unrecoverable place, but in this case the system is in a semi-stable 648 * state and won't corrupt anything by itself. It's ok to let the others 649 * continue for a bit first. 650 * 651 * All the spin loops have timeouts; when a timeout happens a CPU 652 * typically elects itself to be Monarch. 653 */ 654static void mce_reign(void) 655{ 656 int cpu; 657 struct mce *m = NULL; 658 int global_worst = 0; 659 char *msg = NULL; 660 char *nmsg = NULL; 661 662 /* 663 * This CPU is the Monarch and the other CPUs have run 664 * through their handlers. 665 * Grade the severity of the errors of all the CPUs. 666 */ 667 for_each_possible_cpu(cpu) { 668 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 669 &nmsg); 670 if (severity > global_worst) { 671 msg = nmsg; 672 global_worst = severity; 673 m = &per_cpu(mces_seen, cpu); 674 } 675 } 676 677 /* 678 * Cannot recover? Panic here then. 679 * This dumps all the mces in the log buffer and stops the 680 * other CPUs. 681 */ 682 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 683 mce_panic("Fatal Machine check", m, msg); 684 685 /* 686 * For UC somewhere we let the CPU who detects it handle it. 687 * Also must let continue the others, otherwise the handling 688 * CPU could deadlock on a lock. 
689 */ 690 691 /* 692 * No machine check event found. Must be some external 693 * source or one CPU is hung. Panic. 694 */ 695 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) 696 mce_panic("Machine check from unknown source", NULL, NULL); 697 698 /* 699 * Now clear all the mces_seen so that they don't reappear on 700 * the next mce. 701 */ 702 for_each_possible_cpu(cpu) 703 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 704} 705 706static atomic_t global_nwo; 707 708/* 709 * Start of Monarch synchronization. This waits until all CPUs have 710 * entered the exception handler and then determines if any of them 711 * saw a fatal event that requires panic. Then it executes them 712 * in the entry order. 713 * TBD double check parallel CPU hotunplug 714 */ 715static int mce_start(int *no_way_out) 716{ 717 int order; 718 int cpus = num_online_cpus(); 719 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 720 721 if (!timeout) 722 return -1; 723 724 atomic_add(*no_way_out, &global_nwo); 725 /* 726 * global_nwo should be updated before mce_callin 727 */ 728 smp_wmb(); 729 order = atomic_inc_return(&mce_callin); 730 731 /* 732 * Wait for everyone. 733 */ 734 while (atomic_read(&mce_callin) != cpus) { 735 if (mce_timed_out(&timeout)) { 736 atomic_set(&global_nwo, 0); 737 return -1; 738 } 739 ndelay(SPINUNIT); 740 } 741 742 /* 743 * mce_callin should be read before global_nwo 744 */ 745 smp_rmb(); 746 747 if (order == 1) { 748 /* 749 * Monarch: Starts executing now, the others wait. 750 */ 751 atomic_set(&mce_executing, 1); 752 } else { 753 /* 754 * Subject: Now start the scanning loop one by one in 755 * the original callin order. 756 * This way when there are any shared banks it will be 757 * only seen by one CPU before cleared, avoiding duplicates. 
758 */ 759 while (atomic_read(&mce_executing) < order) { 760 if (mce_timed_out(&timeout)) { 761 atomic_set(&global_nwo, 0); 762 return -1; 763 } 764 ndelay(SPINUNIT); 765 } 766 } 767 768 /* 769 * Cache the global no_way_out state. 770 */ 771 *no_way_out = atomic_read(&global_nwo); 772 773 return order; 774} 775 776/* 777 * Synchronize between CPUs after main scanning loop. 778 * This invokes the bulk of the Monarch processing. 779 */ 780static int mce_end(int order) 781{ 782 int ret = -1; 783 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 784 785 if (!timeout) 786 goto reset; 787 if (order < 0) 788 goto reset; 789 790 /* 791 * Allow others to run. 792 */ 793 atomic_inc(&mce_executing); 794 795 if (order == 1) { 796 /* CHECKME: Can this race with a parallel hotplug? */ 797 int cpus = num_online_cpus(); 798 799 /* 800 * Monarch: Wait for everyone to go through their scanning 801 * loops. 802 */ 803 while (atomic_read(&mce_executing) <= cpus) { 804 if (mce_timed_out(&timeout)) 805 goto reset; 806 ndelay(SPINUNIT); 807 } 808 809 mce_reign(); 810 barrier(); 811 ret = 0; 812 } else { 813 /* 814 * Subject: Wait for Monarch to finish. 815 */ 816 while (atomic_read(&mce_executing) != 0) { 817 if (mce_timed_out(&timeout)) 818 goto reset; 819 ndelay(SPINUNIT); 820 } 821 822 /* 823 * Don't reset anything. That's done by the Monarch. 824 */ 825 return 0; 826 } 827 828 /* 829 * Reset all global state. 830 */ 831reset: 832 atomic_set(&global_nwo, 0); 833 atomic_set(&mce_callin, 0); 834 barrier(); 835 836 /* 837 * Let others run again. 838 */ 839 atomic_set(&mce_executing, 0); 840 return ret; 841} 842 843/* 844 * Check if the address reported by the CPU is in a format we can parse. 845 * It would be possible to add code for most other cases, but all would 846 * be somewhat complicated (e.g. segment offset would require an instruction 847 * parser). So only support physical addresses up to page granuality for now. 
848 */ 849static int mce_usable_address(struct mce *m) 850{ 851 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 852 return 0; 853 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) 854 return 0; 855 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) 856 return 0; 857 return 1; 858} 859 860static void mce_clear_state(unsigned long *toclear) 861{ 862 int i; 863 864 for (i = 0; i < banks; i++) { 865 if (test_bit(i, toclear)) 866 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 867 } 868} 869 870/* 871 * The actual machine check handler. This only handles real 872 * exceptions when something got corrupted coming in through int 18. 873 * 874 * This is executed in NMI context not subject to normal locking rules. This 875 * implies that most kernel services cannot be safely used. Don't even 876 * think about putting a printk in there! 877 * 878 * On Intel systems this is entered on all CPUs in parallel through 879 * MCE broadcast. However some CPUs might be broken beyond repair, 880 * so be always careful when synchronizing with others. 881 */ 882void do_machine_check(struct pt_regs *regs, long error_code) 883{ 884 struct mce m, *final; 885 int i; 886 int worst = 0; 887 int severity; 888 /* 889 * Establish sequential order between the CPUs entering the machine 890 * check handler. 891 */ 892 int order; 893 /* 894 * If no_way_out gets set, there is no safe way to recover from this 895 * MCE. If tolerant is cranked up, we'll try anyway. 896 */ 897 int no_way_out = 0; 898 /* 899 * If kill_it gets set, there might be a way to recover from this 900 * error. 
901 */ 902 int kill_it = 0; 903 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 904 char *msg = "Unknown"; 905 906 atomic_inc(&mce_entry); 907 908 percpu_inc(mce_exception_count); 909 910 if (notify_die(DIE_NMI, "machine check", regs, error_code, 911 18, SIGKILL) == NOTIFY_STOP) 912 goto out; 913 if (!banks) 914 goto out; 915 916 mce_gather_info(&m, regs); 917 918 final = &__get_cpu_var(mces_seen); 919 *final = m; 920 921 no_way_out = mce_no_way_out(&m, &msg); 922 923 barrier(); 924 925 /* 926 * When no restart IP must always kill or panic. 927 */ 928 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 929 kill_it = 1; 930 931 /* 932 * Go through all the banks in exclusion of the other CPUs. 933 * This way we don't report duplicated events on shared banks 934 * because the first one to see it will clear it. 935 */ 936 order = mce_start(&no_way_out); 937 for (i = 0; i < banks; i++) { 938 __clear_bit(i, toclear); 939 if (!mce_banks[i].ctl) 940 continue; 941 942 m.misc = 0; 943 m.addr = 0; 944 m.bank = i; 945 946 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 947 if ((m.status & MCI_STATUS_VAL) == 0) 948 continue; 949 950 /* 951 * Non uncorrected or non signaled errors are handled by 952 * machine_check_poll. Leave them alone, unless this panics. 953 */ 954 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 955 !no_way_out) 956 continue; 957 958 /* 959 * Set taint even when machine check was not enabled. 960 */ 961 add_taint(TAINT_MACHINE_CHECK); 962 963 severity = mce_severity(&m, tolerant, NULL); 964 965 /* 966 * When machine check was for corrected handler don't touch, 967 * unless we're panicing. 968 */ 969 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 970 continue; 971 __set_bit(i, toclear); 972 if (severity == MCE_NO_SEVERITY) { 973 /* 974 * Machine check event was not enabled. Clear, but 975 * ignore. 976 */ 977 continue; 978 } 979 980 /* 981 * Kill on action required. 
982 */ 983 if (severity == MCE_AR_SEVERITY) 984 kill_it = 1; 985 986 if (m.status & MCI_STATUS_MISCV) 987 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 988 if (m.status & MCI_STATUS_ADDRV) 989 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 990 991 /* 992 * Action optional error. Queue address for later processing. 993 * When the ring overflows we just ignore the AO error. 994 * RED-PEN add some logging mechanism when 995 * usable_address or mce_add_ring fails. 996 * RED-PEN don't ignore overflow for tolerant == 0 997 */ 998 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 1001 mce_log(&m); 1002 1003 if (severity > worst) { 1004 *final = m; 1005 worst = severity; 1006 } 1007 } 1008 1009 if (!no_way_out) 1010 mce_clear_state(toclear); 1011 1012 /* 1013 * Do most of the synchronization with other CPUs. 1014 * When there's any problem use only local no_way_out state. 1015 */ 1016 if (mce_end(order) < 0) 1017 no_way_out = worst >= MCE_PANIC_SEVERITY; 1018 1019 /* 1020 * If we have decided that we just CAN'T continue, and the user 1021 * has not set tolerant to an insane level, give up and die. 1022 * 1023 * This is mainly used in the case when the system doesn't 1024 * support MCE broadcasting or it has been disabled. 1025 */ 1026 if (no_way_out && tolerant < 3) 1027 mce_panic("Fatal machine check on current CPU", final, msg); 1028 1029 /* 1030 * If the error seems to be unrecoverable, something should be 1031 * done. Try to kill as little as possible. If we can kill just 1032 * one task, do that. If the user has set the tolerance very 1033 * high, don't try to do anything at all. 
1034 */ 1035 1036 if (kill_it && tolerant < 3) 1037 force_sig(SIGBUS, current); 1038 1039 /* notify userspace ASAP */ 1040 set_thread_flag(TIF_MCE_NOTIFY); 1041 1042 if (worst > 0) 1043 mce_report_event(regs); 1044 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1045out: 1046 atomic_dec(&mce_entry); 1047 sync_core(); 1048} 1049EXPORT_SYMBOL_GPL(do_machine_check); 1050 1051/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1052void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1053{ 1054 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1055} 1056 1057/* 1058 * Called after mce notification in process context. This code 1059 * is allowed to sleep. Call the high level VM handler to process 1060 * any corrupted pages. 1061 * Assume that the work queue code only calls this one at a time 1062 * per CPU. 1063 * Note we don't disable preemption, so this code might run on the wrong 1064 * CPU. In this case the event is picked up by the scheduled work queue. 1065 * This is merely a fast path to expedite processing in some common 1066 * cases. 1067 */ 1068void mce_notify_process(void) 1069{ 1070 unsigned long pfn; 1071 mce_notify_irq(); 1072 while (mce_ring_get(&pfn)) 1073 memory_failure(pfn, MCE_VECTOR); 1074} 1075 1076static void mce_process_work(struct work_struct *dummy) 1077{ 1078 mce_notify_process(); 1079} 1080 1081#ifdef CONFIG_X86_MCE_INTEL 1082/*** 1083 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1084 * @cpu: The CPU on which the event occurred. 1085 * @status: Event status information 1086 * 1087 * This function should be called by the thermal interrupt after the 1088 * event has been processed and the decision was made to log the event 1089 * further. 1090 * 1091 * The status parameter will be saved to the 'status' field of 'struct mce' 1092 * and historically has been the register value of the 1093 * MSR_IA32_THERMAL_STATUS (Intel) msr. 
1094 */ 1095void mce_log_therm_throt_event(__u64 status) 1096{ 1097 struct mce m; 1098 1099 mce_setup(&m); 1100 m.bank = MCE_THERMAL_BANK; 1101 m.status = status; 1102 mce_log(&m); 1103} 1104#endif /* CONFIG_X86_MCE_INTEL */ 1105 1106/* 1107 * Periodic polling timer for "silent" machine check errors. If the 1108 * poller finds an MCE, poll 2x faster. When the poller finds no more 1109 * errors, poll 2x slower (up to check_interval seconds). 1110 */ 1111static int check_interval = 5 * 60; /* 5 minutes */ 1112 1113static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1114static DEFINE_PER_CPU(struct timer_list, mce_timer); 1115 1116static void mce_start_timer(unsigned long data) 1117{ 1118 struct timer_list *t = &per_cpu(mce_timer, data); 1119 int *n; 1120 1121 WARN_ON(smp_processor_id() != data); 1122 1123 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1124 machine_check_poll(MCP_TIMESTAMP, 1125 &__get_cpu_var(mce_poll_banks)); 1126 } 1127 1128 /* 1129 * Alert userspace if needed. If we logged an MCE, reduce the 1130 * polling interval, otherwise increase the polling interval. 1131 */ 1132 n = &__get_cpu_var(mce_next_interval); 1133 if (mce_notify_irq()) 1134 *n = max(*n/2, HZ/100); 1135 else 1136 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1137 1138 t->expires = jiffies + *n; 1139 add_timer_on(t, smp_processor_id()); 1140} 1141 1142static void mce_do_trigger(struct work_struct *work) 1143{ 1144 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1145} 1146 1147static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1148 1149/* 1150 * Notify the user(s) about new machine check events. 1151 * Can be called from interrupt context, but not from machine check/NMI 1152 * context. 
1153 */ 1154int mce_notify_irq(void) 1155{ 1156 /* Not more than two messages every minute */ 1157 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1158 1159 clear_thread_flag(TIF_MCE_NOTIFY); 1160 1161 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 wake_up_interruptible(&mce_wait); 1163 1164 /* 1165 * There is no risk of missing notifications because 1166 * work_pending is always cleared before the function is 1167 * executed. 1168 */ 1169 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1170 schedule_work(&mce_trigger_work); 1171 1172 if (__ratelimit(&ratelimit)) 1173 pr_info(HW_ERR "Machine check events logged\n"); 1174 1175 return 1; 1176 } 1177 return 0; 1178} 1179EXPORT_SYMBOL_GPL(mce_notify_irq); 1180 1181static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1182{ 1183 int i; 1184 1185 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1186 if (!mce_banks) 1187 return -ENOMEM; 1188 for (i = 0; i < banks; i++) { 1189 struct mce_bank *b = &mce_banks[i]; 1190 1191 b->ctl = -1ULL; 1192 b->init = 1; 1193 } 1194 return 0; 1195} 1196 1197/* 1198 * Initialize Machine Checks for a CPU. 1199 */ 1200static int __cpuinit __mcheck_cpu_cap_init(void) 1201{ 1202 unsigned b; 1203 u64 cap; 1204 1205 rdmsrl(MSR_IA32_MCG_CAP, cap); 1206 1207 b = cap & MCG_BANKCNT_MASK; 1208 if (!banks) 1209 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1210 1211 if (b > MAX_NR_BANKS) { 1212 printk(KERN_WARNING 1213 "MCE: Using only %u machine check banks out of %u\n", 1214 MAX_NR_BANKS, b); 1215 b = MAX_NR_BANKS; 1216 } 1217 1218 /* Don't support asymmetric configurations today */ 1219 WARN_ON(banks != 0 && b != banks); 1220 banks = b; 1221 if (!mce_banks) { 1222 int err = __mcheck_cpu_mce_banks_init(); 1223 1224 if (err) 1225 return err; 1226 } 1227 1228 /* Use accurate RIP reporting if available. 
*/ 1229 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1230 rip_msr = MSR_IA32_MCG_EIP; 1231 1232 if (cap & MCG_SER_P) 1233 mce_ser = 1; 1234 1235 return 0; 1236} 1237 1238static void __mcheck_cpu_init_generic(void) 1239{ 1240 mce_banks_t all_banks; 1241 u64 cap; 1242 int i; 1243 1244 /* 1245 * Log the machine checks left over from the previous reset. 1246 */ 1247 bitmap_fill(all_banks, MAX_NR_BANKS); 1248 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1249 1250 set_in_cr4(X86_CR4_MCE); 1251 1252 rdmsrl(MSR_IA32_MCG_CAP, cap); 1253 if (cap & MCG_CTL_P) 1254 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1255 1256 for (i = 0; i < banks; i++) { 1257 struct mce_bank *b = &mce_banks[i]; 1258 1259 if (!b->init) 1260 continue; 1261 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1262 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1263 } 1264} 1265 1266/* Add per CPU specific workarounds here */ 1267static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1268{ 1269 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1270 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1271 return -EOPNOTSUPP; 1272 } 1273 1274 /* This should be disabled by the BIOS, but isn't always */ 1275 if (c->x86_vendor == X86_VENDOR_AMD) { 1276 if (c->x86 == 15 && banks > 4) { 1277 /* 1278 * disable GART TBL walk error reporting, which 1279 * trips off incorrectly with the IOMMU & 3ware 1280 * & Cerberus: 1281 */ 1282 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1283 } 1284 if (c->x86 <= 17 && mce_bootlog < 0) { 1285 /* 1286 * Lots of broken BIOS around that don't clear them 1287 * by default and leave crap in there. Don't log: 1288 */ 1289 mce_bootlog = 0; 1290 } 1291 /* 1292 * Various K7s with broken bank 0 around. Always disable 1293 * by default. 
1294 */ 1295 if (c->x86 == 6 && banks > 0) 1296 mce_banks[0].ctl = 0; 1297 } 1298 1299 if (c->x86_vendor == X86_VENDOR_INTEL) { 1300 /* 1301 * SDM documents that on family 6 bank 0 should not be written 1302 * because it aliases to another special BIOS controlled 1303 * register. 1304 * But it's not aliased anymore on model 0x1a+ 1305 * Don't ignore bank 0 completely because there could be a 1306 * valid event later, merely don't write CTL0. 1307 */ 1308 1309 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1310 mce_banks[0].init = 0; 1311 1312 /* 1313 * All newer Intel systems support MCE broadcasting. Enable 1314 * synchronization with a one second timeout. 1315 */ 1316 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1317 monarch_timeout < 0) 1318 monarch_timeout = USEC_PER_SEC; 1319 1320 /* 1321 * There are also broken BIOSes on some Pentium M and 1322 * earlier systems: 1323 */ 1324 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1325 mce_bootlog = 0; 1326 } 1327 if (monarch_timeout < 0) 1328 monarch_timeout = 0; 1329 if (mce_bootlog != 0) 1330 mce_panic_timeout = 30; 1331 1332 return 0; 1333} 1334 1335static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1336{ 1337 if (c->x86 != 5) 1338 return 0; 1339 1340 switch (c->x86_vendor) { 1341 case X86_VENDOR_INTEL: 1342 intel_p5_mcheck_init(c); 1343 return 1; 1344 break; 1345 case X86_VENDOR_CENTAUR: 1346 winchip_mcheck_init(c); 1347 return 1; 1348 break; 1349 } 1350 1351 return 0; 1352} 1353 1354static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1355{ 1356 switch (c->x86_vendor) { 1357 case X86_VENDOR_INTEL: 1358 mce_intel_feature_init(c); 1359 break; 1360 case X86_VENDOR_AMD: 1361 mce_amd_feature_init(c); 1362 break; 1363 default: 1364 break; 1365 } 1366} 1367 1368static void __mcheck_cpu_init_timer(void) 1369{ 1370 struct timer_list *t = &__get_cpu_var(mce_timer); 1371 int *n = &__get_cpu_var(mce_next_interval); 1372 1373 setup_timer(t, mce_start_timer, 
smp_processor_id()); 1374 1375 if (mce_ignore_ce) 1376 return; 1377 1378 *n = check_interval * HZ; 1379 if (!*n) 1380 return; 1381 t->expires = round_jiffies(jiffies + *n); 1382 add_timer_on(t, smp_processor_id()); 1383} 1384 1385/* Handle unconfigured int18 (should never happen) */ 1386static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1387{ 1388 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1389 smp_processor_id()); 1390} 1391 1392/* Call the installed machine check handler for this CPU setup. */ 1393void (*machine_check_vector)(struct pt_regs *, long error_code) = 1394 unexpected_machine_check; 1395 1396/* 1397 * Called for each booted CPU to set up machine checks. 1398 * Must be called with preempt off: 1399 */ 1400void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1401{ 1402 if (mce_disabled) 1403 return; 1404 1405 if (__mcheck_cpu_ancient_init(c)) 1406 return; 1407 1408 if (!mce_available(c)) 1409 return; 1410 1411 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1412 mce_disabled = 1; 1413 return; 1414 } 1415 1416 machine_check_vector = do_machine_check; 1417 1418 __mcheck_cpu_init_generic(); 1419 __mcheck_cpu_init_vendor(c); 1420 __mcheck_cpu_init_timer(); 1421 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1422 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1423} 1424 1425/* 1426 * Character device to read and clear the MCE log. 1427 */ 1428 1429static DEFINE_SPINLOCK(mce_state_lock); 1430static int open_count; /* #times opened */ 1431static int open_exclu; /* already open exclusive? 
 */

/*
 * Open /dev/mcelog.  O_EXCL gives exclusive access; a pending exclusive
 * opener (or any opener when O_EXCL is requested) makes this fail -EBUSY.
 */
static int mce_open(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);

                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);

        return nonseekable_open(inode, file);
}

/* Release /dev/mcelog: drop the open count and any exclusive claim. */
static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}

/*
 * IPI callback: record this CPU's TSC into data[smp_processor_id()].
 * NOTE(review): the buffer element is unsigned long, so on 32-bit the
 * 64-bit TSC value would be truncated — confirm this path is 64-bit only.
 */
static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;

        rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
        int rc;
        u64 record_id;
        struct mce m;

        if (usize < sizeof(struct mce))
                return -EINVAL;

        rc = apei_read_mce(&m, &record_id);
        /* Error or no more MCE record */
        if (rc <= 0) {
                mce_apei_read_done = 1;
                return rc;
        }
        rc = -EFAULT;
        if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
                return rc;
        /*
         * In fact, we should have cleared the record after that has
         * been flushed to the disk or sent to network in
         * /sbin/mcelog, but we have no interface to support that now,
         * so just clear it to avoid duplication.
         */
        rc = apei_clear_mce(record_id);
        if (rc) {
                mce_apei_read_done = 1;
                return rc;
        }
        /* Advance the caller's user buffer past the record just copied. */
        *ubuf += sizeof(struct mce);

        return 0;
}

/*
 * Read and clear the MCE log.  First drains any APEI/ERST records from
 * the previous boot, then copies out and zeroes mcelog ring entries in
 * two phases: entries up to mcelog.next, then (after a grace period)
 * stragglers that were still being written concurrently.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
{
        char __user *buf = ubuf;
        unsigned long *cpu_tsc;
        unsigned prev, next;
        int i, err;

        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_read_mutex);

        if (!mce_apei_read_done) {
                err = __mce_read_apei(&buf, usize);
                /* Stop after APEI data unless it produced nothing. */
                if (err || buf != ubuf)
                        goto out;
        }

        next = rcu_dereference_check_mce(mcelog.next);

        /* Only supports full reads right now */
        err = -EINVAL;
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
                goto out;

        err = 0;
        prev = 0;
        do {
                /* Phase 1: copy out entries published via mcelog.next. */
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;

                        /* Writer sets .finished last; wait briefly for it. */
                        while (!mcelog.entry[i].finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        /* Give up: drop the stuck entry. */
                                        memset(mcelog.entry + i, 0,
                                               sizeof(struct mce));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        smp_rmb();      /* read entry only after .finished */
                        err |= copy_to_user(buf, mcelog.entry + i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                /* Reset .next to 0 only if no new entries appeared. */
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        synchronize_sched();

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);

        for (i = next; i < MCE_LOG_LEN; i++) {
                /* Only take entries older than the snapshot TSC of their CPU. */
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }

        if (err)
                err = -EFAULT;

out:
        mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);

        /* On success, return the number of bytes copied to user space. */
        return err ? err : buf - ubuf;
}

/* Poll support: readable when the ring or APEI has pending records. */
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_access_index(mcelog.next))
                return POLLIN | POLLRDNORM;
        if (!mce_apei_read_done && apei_check_mce())
                return POLLIN | POLLRDNORM;
        return 0;
}

/* ioctl interface: record/log sizes and atomically fetch+clear flags. */
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;

                /* Atomically read and clear mcelog.flags. */
                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);

                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
        .open                   = mce_open,
        .release                = mce_release,
        .read                   = mce_read,
        .poll                   = mce_poll,
        .unlocked_ioctl         = mce_ioctl,
        .llseek         = no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears corrected
events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *      monarchtimeout is how long to wait for other CPUs on machine
 *      check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
/* Parse the "mce=" boot option (see table above for the keywords). */
static int __init mcheck_enable(char *str)
{
        if (*str == 0) {
                /* Bare "mce" re-enables the old P5 machine check. */
                enable_p5_mce();
                return 1;
        }
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                mce_disabled = 1;
        else if (!strcmp(str, "no_cmci"))
                mce_cmci_disabled = 1;
        else if (!strcmp(str, "dont_log_ce"))
                mce_dont_log_ce = 1;
        else if (!strcmp(str, "ignore_ce"))
                mce_ignore_ce = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = (str[0] == 'b');  /* 'b'ootlog vs 'n'obootlog */
        else if (isdigit(str[0])) {
                get_option(&str, &tolerant);
                if (*str == ',') {
                        ++str;
                        get_option(&str, &monarch_timeout);
                }
        } else {
                printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
                       str);
                return 0;
        }
        return 1;
}
__setup("mce", mcheck_enable);

/* Early MCE init: currently only the Intel thermal subsystem. */
int __init mcheck_init(void)
{
        mcheck_intel_therm_init();

        return 0;
}

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
        int i;

        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
        }
        return 0;
}

/* syscore suspend hook: silence all banks before going down. */
static int mce_suspend(void)
{
        return mce_disable_error_reporting();
}

/* syscore shutdown hook: same as suspend, return value unused. */
static void mce_shutdown(void)
{
        mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state.
Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_resume(void)
{
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
        .suspend        = mce_suspend,
        .shutdown       = mce_shutdown,
        .resume         = mce_resume,
};

/* Per-CPU callback: stop the poll timer, then re-init banks and timer. */
static void mce_cpu_restart(void *data)
{
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
/* @all non-NULL means also stop the poll timer, not just CMCI. */
static void mce_disable_ce(void *all)
{
        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;
        if (all)
                del_timer_sync(&__get_cpu_var(mce_timer));
        cmci_clear();
}

/* Counterpart of mce_disable_ce(): re-enable CMCI and (if @all) the timer. */
static void mce_enable_ce(void *all)
{
        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;
        cmci_reenable();
        cmci_recheck();
        if (all)
                __mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysclass = {
        .name           = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

/* Hook for the AMD threshold driver; set from vendor code. */
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Map a bank sysfs attribute back to its containing struct mce_bank. */
static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
        return container_of(attr, struct mce_bank, attr);
}

/* sysfs show: print the bank's CTL value in hex. */
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
                         char *buf)
{
        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

/* sysfs store: set the bank's CTL value and reprogram all CPUs. */
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
                        const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        attr_to_bank(attr)->ctl = new;
        mce_restart();

        return size;
}

/* sysfs show: the user-space helper path, newline-terminated. */
static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
        strcpy(buf, mce_helper);
        strcat(buf, "\n");
        return strlen(mce_helper) + 1;
}

/* sysfs store: set the helper path, stripping a trailing newline. */
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;

        strncpy(mce_helper, buf, sizeof(mce_helper));
        /* strncpy may not terminate; force it. */
        mce_helper[sizeof(mce_helper)-1] = 0;
        p = strchr(mce_helper, '\n');

        if (p)
                *p = 0;

        return strlen(mce_helper) + !!p;
}

/* sysfs store: flip mce_ignore_ce, enabling/disabling CE handling on all CPUs. */
static ssize_t set_ignore_ce(struct sys_device *s,
                             struct sysdev_attribute *attr,
                             const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        /* Only act on an actual state change. */
        if (mce_ignore_ce ^ !!new) {
                if (new) {
                        /* disable ce features */
                        on_each_cpu(mce_disable_ce, (void *)1, 1);
                        mce_ignore_ce = 1;
                } else {
                        /* enable ce features */
                        mce_ignore_ce = 0;
                        on_each_cpu(mce_enable_ce, (void *)1, 1);
                }
        }
        return size;
}

/* sysfs store: flip mce_cmci_disabled, toggling CMCI (but not the timer). */
static ssize_t set_cmci_disabled(struct sys_device *s,
                                 struct sysdev_attribute *attr,
                                 const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        /* Only act on an actual state change. */
        if (mce_cmci_disabled ^ !!new) {
                if (new) {
                        /* disable cmci */
                        on_each_cpu(mce_disable_ce, NULL, 1);
                        mce_cmci_disabled = 1;
                } else {
                        /* enable cmci */
                        mce_cmci_disabled = 0;
                        on_each_cpu(mce_enable_ce, NULL, 1);
                }
        }
        return size;
}

/* sysfs store for int attributes that require a full MCE reinit. */
static ssize_t store_int_with_restart(struct sys_device *s,
                                      struct sysdev_attribute *attr,
                                      const char *buf, size_t size)
{
        ssize_t ret = sysdev_store_int(s, attr, buf, size);

        mce_restart();
        return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
        _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
                     store_int_with_restart),
        &check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
        _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
        &mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
        _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
        &mce_cmci_disabled
};

/* NULL-terminated list of attributes created for every CPU's sysdev. */
static struct sysdev_attribute *mce_attrs[] = {
        &attr_tolerant.attr,
        &attr_check_interval.attr,
        &attr_trigger,
        &attr_monarch_timeout.attr,
        &attr_dont_log_ce.attr,
        &attr_ignore_ce.attr,
        &attr_cmci_disabled.attr,
        NULL
};

/* Tracks which CPUs have had their sysdev fully created. */
static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init.
   All of the cpus still share the same ctrl bank: */
/*
 * Register the per-CPU machinecheck sysdev and all its attribute files.
 * On any failure, everything created so far is rolled back.
 */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i, j;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
        per_cpu(mce_dev, cpu).id        = cpu;
        per_cpu(mce_dev, cpu).cls       = &mce_sysclass;

        err = sysdev_register(&per_cpu(mce_dev, cpu));
        if (err)
                return err;

        for (i = 0; mce_attrs[i]; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
                if (err)
                        goto error;
        }
        /* One bank%d file per machine-check bank. */
        for (j = 0; j < banks; j++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu),
                                         &mce_banks[j].attr);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_dev_initialized);

        return 0;
error2:
        /* Unwind the bank files created before the failure. */
        while (--j >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
        /* Unwind the common attribute files, then the sysdev itself. */
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));

        return err;
}

/* Tear down the per-CPU sysdev created by mce_create_device(); no-op if absent. */
static __cpuinit void mce_remove_device(unsigned int cpu)
{
        int i;

        if (!cpumask_test_cpu(cpu, mce_dev_initialized))
                return;

        for (i = 0; mce_attrs[i]; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        for (i = 0; i < banks; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

        sysdev_unregister(&per_cpu(mce_dev, cpu));
        cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs.
 */
/* IPI callback run on a CPU going offline: turn off CMCI and all bank CTLs. */
static void __cpuinit mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;

        /* Keep CMCI state across suspend/resume (TASKS_FROZEN). */
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
        }
}

/* IPI callback after a failed offline: restore CMCI and bank CTLs. */
static void __cpuinit mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
        }
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                /* Stop polling and silence the banks before the CPU dies. */
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                /* Offline aborted: re-arm the poll timer and re-enable banks. */
                if (!mce_ignore_ce && check_interval) {
                        t->expires = round_jiffies(jiffies +
                                        __get_cpu_var(mce_next_interval));
                        add_timer_on(t, cpu);
                }
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
        case CPU_POST_DEAD:
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};

/* Set up the sysfs attribute (name "bank%d", mode 0644) for each bank. */
static __init void mce_init_banks(void)
{
        int i;

        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];
                struct sysdev_attribute *a = &b->attr;

                sysfs_attr_init(&a->attr);
                a->attr.name    = b->attrname;
                snprintf(b->attrname, ATTR_LEN, "bank%d", i);

                a->attr.mode    = 0644;
                a->show         = show_bank;
                a->store        = set_bank;
        }
}

/*
 * Late init: register the sysfs class and per-CPU devices, the syscore
 * ops, the hotplug notifier and the /dev/mcelog misc device.
 * NOTE(review): on a mid-loop mce_create_device() failure, devices
 * created for earlier CPUs are not unwound here.
 */
static __init int mcheck_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

        mce_init_banks();

        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_create_device(i);
                if (err)
                        return err;
        }

        register_syscore_ops(&mce_syscore_ops);
        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);

        return err;
}

device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
/* "nomce" boot option: equivalent to mce=off. */
static int __init mcheck_disable(char *str)
{
        mce_disabled = 1;
        return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
/* Return (creating on first use) the shared "mce" debugfs directory. */
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (!dmce)
                dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}

/* Reset the global rendezvous state used by the MCE panic machinery. */
static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_paniced, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}

/* debugfs read: current fake_panic setting. */
static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}

/* debugfs write: reset rendezvous state, then set fake_panic. */
static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");

/* Create mce/fake_panic in debugfs for testing the panic path. */
static int __init mcheck_debugfs_init(void)
{
        struct dentry *dmce, *ffake_panic;

        dmce = mce_get_debugfs_dir();
        if (!dmce)
                return -ENOMEM;
        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
                                          &fake_panic_fops);
        if (!ffake_panic)
                return -ENOMEM;

        return 0;
}
late_initcall(mcheck_debugfs_init);
#endif