mce.c revision bf783f9f7d33576815bc89f9f1856a7309ea2f17
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
static int mce_panic_timeout __read_mostly;
static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;

struct mce_bank *mce_banks __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}
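/*
 * Example of the reservation protocol above: if two CPUs race to log,
 * both may read mcelog.next == N, but only one cmpxchg(&mcelog.next,
 * N, N + 1) succeeds; the loser rereads next (now N + 1) and retries,
 * so each record lands in its own slot without any lock. Readers must
 * wait for entry[i].finished before trusting the copied data.
 */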
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
}

static void print_mce_head(void)
{
	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	printk(KERN_EMERG "This is not a software problem!\n"
	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5	/* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/*
 * Panic in progress. Enable interrupts and wait for the final IPI.
 */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
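/*
 * Example: with injectm.finished set and injectm.bank == 3, a read of
 * MSR_IA32_MCx_STATUS(3) through mce_rdmsrl() returns injectm.status
 * from this CPU's struct mce instead of touching hardware, which is
 * how mce-inject feeds fake errors through the normal handler paths.
 */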
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}
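/*
 * Capacity example: MCE_RING_SIZE is 16, so the ring holds at most 15
 * PFNs. mce_ring_add() returns -1 once (end + 1) % MCE_RING_SIZE ==
 * start, because a completely full ring would be indistinguishable
 * from an empty one; one slot is sacrificed for that distinction.
 */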
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);

		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps).
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without an APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non-idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
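/*
 * Filter example: on a machine with MCG_SER_P (mce_ser set) the poller
 * skips any bank whose status has MCI_STATUS_S, since signalled events
 * belong to the exception handler; without MCG_SER_P it skips
 * MCI_STATUS_UC entries instead, so only corrected errors are logged
 * from this path.
 */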
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
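/*
 * Timeout arithmetic: monarch_timeout is in usecs, but the wait loops
 * count down in SPINUNIT (100ns) steps, so the budget handed to
 * mce_timed_out() is monarch_timeout * NSEC_PER_USEC. With the Intel
 * default of USEC_PER_SEC that is one second of spinning before a CPU
 * gives up on its peers.
 */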
/*
 * The Monarch's reign. The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. If any error is
 * fatal it panics; only then does it let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are always examined.
 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}
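/*
 * Entry order example: the first CPU into the handler gets
 * atomic_inc_return(&mce_callin) == 1 and becomes Monarch; a Subject
 * that came in third (order == 3) spins until mce_executing reaches 3.
 * The per-bank scanning therefore runs strictly in callin order, so a
 * shared bank is seen by exactly one CPU before it is cleared.
 */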
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}
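/*
 * MCi_MISC decode used above: bits 5:0 give the least significant
 * valid bit of the reported address (the granularity) and bits 8:6
 * give the address mode. A misc value with (misc & 0x3f) == 12 and
 * mode MCM_ADDR_PHYS therefore describes a physical address valid to
 * 4K granularity, which is exactly what page offlining can use.
 */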
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * Without a restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signalled are
		 * handled by machine_check_poll. Leave them alone, unless
		 * this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was for a corrected error, don't
		 * touch it here, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only the local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
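/*
 * Net effect of the flags above: no_way_out && tolerant < 3 panics the
 * box; otherwise kill_it && tolerant < 3 sends SIGBUS to current; at
 * tolerant == 3 everything is merely logged. This matches the tolerant
 * level table at the top of the file.
 */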
/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;

	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}
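/*
 * Interval adaptation example: with HZ == 1000 and the default
 * check_interval of 300 seconds, each logged event halves the poll
 * interval down to a floor of HZ/100 (10ms); each quiet run doubles it
 * again until it is back at the rounded 300 second ceiling.
 */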
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}
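/*
 * MCG_CAP decode used above: the low byte (MCG_BANKCNT_MASK) is the
 * bank count; MCG_EXT_P plus an extended register count >= 9 means
 * MSR_IA32_MCG_EIP can supply an accurate RIP; and MCG_SER_P switches
 * on the software error recovery paths (mce_ser).
 */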
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * The SDM documents that on family 6 bank 0 should not be
		 * written because it aliases to another special BIOS
		 * controlled register. But it's not aliased anymore on
		 * model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/* There are also broken BIOSes on some Pentium M systems. */
		if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}
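/*
 * Bootlog interaction example: mce_bootlog defaults to -1 (auto). The
 * AMD quirk in mce_cpu_quirks() turns it into 0 on family <= 17
 * because many BIOSes leave stale garbage in the banks, so leftover
 * errors are not logged there unless "mce=bootlog" is given on the
 * command line.
 */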
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/*
 * Call the installed machine check handler for this CPU setup.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
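/*
 * Illustrative sketch of a /dev/mcelog consumer (mcelog(8) is the real
 * one). mce_read() only accepts full reads, so the buffer must cover
 * all MCE_LOG_LEN records, with sizes taken from the ioctls rather
 * than hardcoded:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recordlen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc(recordlen * loglen);
 *	int n = read(fd, buf, recordlen * loglen);
 *	(n / recordlen finished records; the read also clears them)
 *
 * Boot option example: "mce=2,500" goes through the isdigit() branch
 * above and sets tolerant to 2 and monarch_timeout to 500 usecs.
 */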
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	len = strlen(mce_helper);
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return len;
}
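/*
 * Sysfs usage sketch (paths follow from the "machinecheck" sysdev
 * class and the per-bank "bank%d" attribute names; the helper path is
 * just a placeholder):
 *
 *	echo /usr/local/bin/my-mce-helper > \
 *		/sys/devices/system/machinecheck/machinecheck0/trigger
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * The first installs a user mode helper run on new events; the second
 * clears a bank's control word, after which mce_restart() rewrites
 * MCi_CTL on every CPU.
 */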
static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}
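/*
 * The two knobs above differ in scope: cmci_disabled (NULL argument)
 * only shuts off CMCI and leaves the poll timer running, while
 * ignore_ce (the "all" argument) also kills the per-CPU timer, so
 * corrected errors are then neither signalled nor polled.
 */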
static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/*
 * Per CPU sysdev init. All of the CPUs still share the same ctrl bank:
 */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}
/*
 * Get notified when a CPU comes on/off. Be hotplug friendly.
 */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   per_cpu(next_interval, cpu));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mce_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mce_debugfs_init);
#endif
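/*
 * Debugfs example: <debugfs>/mce/fake_panic reads back the flag;
 * setting it nonzero first resets the Monarch synchronization state
 * via mce_reset() and then turns later mce_panic() calls into printed
 * "Fake kernel panic" messages, so the panic paths can be exercised
 * without taking the machine down.
 */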