mce.c revision 8968f9d3dc23d9a1821d97c6f11e72a59382e56c
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
static int mce_panic_timeout __read_mostly;
static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;

struct mce_bank *mce_banks __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

static void default_decode_mce(struct mce *m)
{
	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
}

/*
 * CPU/chipset specific EDAC code can register a callback here to print
 * MCE errors in a human-readable form:
 */
void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
EXPORT_SYMBOL(x86_mce_decode_callback);
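
/*
 * Illustrative sketch (not part of this driver): an EDAC module would
 * hook the decoder above roughly like this; my_decode_mce() is a
 * hypothetical name:
 *
 *	static void my_decode_mce(struct mce *m)
 *	{
 *		pr_emerg("bank %d status %016llx\n", m->bank, m->status);
 *	}
 *
 *	x86_mce_decode_callback = my_decode_mce;
 */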

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do the initial setup of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void print_mce(struct mce *m)
{
	pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg("RIP%s %02x:<%016Lx> ",
			 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			 m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg("TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that):
	 */
	x86_mce_decode_callback(m);
}

static void print_mce_head(void)
{
	pr_emerg("\nHARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	pr_emerg("This is not a software problem!\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;
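
/*
 * fake_panic is a testing knob: it is flipped through the
 * <debugfs>/mce/fake_panic file created by mce_debugfs_init() at the
 * bottom of this file, and makes mce_panic() log everything without
 * actually taking the machine down.
 */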

/* Panic in progress. Enable interrupts and wait for the final IPI. */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
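
/*
 * The wrappers above are what make software error injection work:
 * once the per-CPU injectm record is marked finished, MCA MSR reads
 * and writes are redirected into that struct mce instead of touching
 * hardware, so injected records flow through the same code paths as
 * real machine checks.
 */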

/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 * (One slot is left unused, so the ring holds at most MCE_RING_SIZE-1
 * entries: it is full when end + 1 == start.)
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without an APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);
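
/*
 * machine_check_poll() below is driven from two sides: the periodic
 * mcheck_timer() later in this file, and, on Intel CPUs with CMCI,
 * the corrected-event interrupt handler in the Intel-specific code.
 */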

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However, that would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip it for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it's likely the CPU
 * will not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
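
/*
 * Worked example for the timeout above: the Intel quirk below sets
 * monarch_timeout to USEC_PER_SEC by default. mce_start()/mce_end()
 * scale that to nanoseconds, and mce_timed_out() burns it down in
 * SPINUNIT (100 ns) steps -- up to 10^7 ndelay(SPINUNIT) spins before
 * a CPU is declared missing.
 */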

/*
 * The Monarch's reign. The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. If any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For a UC error somewhere we let the CPU that detected it
	 * handle it. We must also let the others continue, otherwise
	 * the handling CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);
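
	/*
	 * order is 1-based: the first CPU to arrive becomes the Monarch
	 * (order == 1 below); all later CPUs are Subjects and scan
	 * their banks in callin order.
	 */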

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * seen by only one CPU before being cleared, avoiding
		 * duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after the main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for the Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * If there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signaled are
		 * handled by machine_check_poll. Leave them alone, unless
		 * this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was for a corrected error, don't
		 * touch it, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);
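
	/*
	 * With no_way_out set, the bank state is deliberately left in
	 * the MSRs: mce_no_way_out() decided the events must be kept
	 * around for the panic path.
	 */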

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;

	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
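
/*
 * Worked example of the adaptive interval below: with HZ=1000 and the
 * default check_interval of 300 s, polling starts at 300000 jiffies;
 * every poll that logs an event halves the interval (never below
 * HZ/100, i.e. 10 ms) and every quiet poll doubles it, back up to
 * check_interval.
 */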

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}
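
/*
 * MCG_CAP layout relied on above, for reference: bits 7:0 are the
 * bank count, MCG_CTL_P advertises the MCG_CTL register, MCG_EXT_P
 * plus MCG_EXT_CNT advertise the extended state registers (9 or more
 * implies MCG_EIP is present), and MCG_SER_P advertises software
 * error recovery.
 */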

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}
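
	/*
	 * Note on ordering: mcheck_init() runs these quirks after
	 * mce_cap_init() but before mce_init(), so the mce_banks[].ctl
	 * and .init adjustments made here control what mce_init()
	 * later writes into the MCi_CTL MSRs.
	 */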

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
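
	/* A nonzero err means one of the copy_to_user() calls above faulted. */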
	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}
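
/*
 * Illustrative shell usage of the attributes defined here and below
 * (the sysdev class above is named "machinecheck"):
 *
 *	echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/cmci_disabled
 */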

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per CPU sysdev init. All of the CPUs still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	/* Unwind the global attributes created above: */
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}
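
/*
 * Per-bank control is exposed the same way: mce_init_banks() below
 * names one attribute per bank ("bank0" onwards), so e.g.
 *
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * clears that bank's ctl mask via set_bank() and reinitializes
 * through mce_restart().
 */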

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a CPU comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(mce_next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);
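
/*
 * "nomce" below is the historical spelling of mce=off; both simply
 * set mce_disabled.
 */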

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mce_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mce_debugfs_init);
#endif