mce.c revision cd42f4a3b2b1c4cbd997363dc57821953d73fd87
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant		__read_mostly = 1;
static int banks		__read_mostly;
static int rip_msr		__read_mostly;
static int mce_bootlog		__read_mostly = -1;
static int monarch_timeout	__read_mostly = -1;
static int mce_panic_timeout	__read_mostly;
static int mce_dont_log_ce	__read_mostly;
int mce_cmci_disabled		__read_mostly;
int mce_ignore_ce		__read_mostly;
int mce_ser			__read_mostly;

struct mce_bank *mce_banks	__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
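/*
 * A bit set here means "the timer poller owns this bank". On Intel CPUs
 * with CMCI, the CMCI discovery code takes the banks it can get interrupts
 * for out of this mask, so a given bank should be serviced either by
 * polling or by CMCI but not both; see mce_intel.c for that side.
 */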
static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}
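/*
 * The append protocol above, in short: a writer claims slot 'entry' by
 * advancing mcelog.next with cmpxchg() (NMI-safe, several concurrent
 * writers are possible), copies the record in, and only then sets
 * ->finished. Readers treat ->finished as the "record valid" flag, so a
 * slot whose cmpxchg succeeded but whose copy is still in flight is simply
 * skipped or retried; see mce_chrdev_read() below for the consumer side.
 */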
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error
	 * (if the CPU has an implementation for that):
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected ones, but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
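/*
 * A note on the wrappers above: while an injected error is being replayed
 * (injectm.finished set on this CPU, as the mce-inject module does), MSR
 * reads and writes are redirected into the per-CPU 'injectm' record
 * instead of touching real hardware registers, so the handler paths can
 * be exercised without a genuine machine check.
 */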
/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;
		}
		/* Use accurate RIP reporting if available. */
		if (rip_msr)
			m->ip = mce_rdmsrl(rip_msr);
	}
}

/*
 * Simple lockless ring to communicate PFNs from the exception handler to
 * the process context work function. This is vastly simplified because
 * there's only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

DEFINE_PER_CPU(unsigned, mce_poll_count);
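/*
 * The mcp_flags passed in below (defined in asm/mce.h), for reference:
 * MCP_TIMESTAMP asks the poller to record a TSC timestamp, MCP_UC includes
 * uncorrected errors in the scan (used when logging leftovers from before
 * a reset), and MCP_DONTLOG clears banks without writing mcelog records,
 * e.g. when the boot log is disabled.
 */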
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	percpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until 'executing' equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a
 * semi-stable state and won't corrupt anything by itself. It's ok to let
 * the others continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;
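/*
 * Rough timeline of the rendezvous implemented by mce_start()/mce_end()
 * below, shown for three CPUs ('order' is the ticket each CPU draws from
 * mce_callin; order 1 is the Monarch):
 *
 *	Monarch (order 1)           Subject (order 2)      Subject (order 3)
 *	spin until all called in    spin until called in   spin until called in
 *	executing = 1; scan banks   spin: executing < 2    spin: executing < 3
 *	mce_end: executing -> 2     scan banks             (still spinning)
 *	spin: executing <= ncpus    mce_end: executing->3  scan banks
 *	(released at ncpus + 1)     spin: executing != 0   mce_end: executing->4
 *	mce_reign(); executing = 0  resume                 resume
 */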
/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way any shared banks are seen by only one CPU
		 * before being cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for
 * now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}
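/*
 * For reference on the MCi_MISC checks above: ADDR_LSB encodes the least
 * significant valid bit of the recorded address, i.e. its granularity. A
 * value above PAGE_SHIFT means the error cannot be attributed to a single
 * page, so page poisoning can't act on it. Likewise the mode field must
 * report a physical address for the PFN later handed to memory_failure()
 * to make sense.
 */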
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	percpu_inc(mce_exception_count);

	if (!banks)
		goto out;

	mce_gather_info(&m, regs);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * Without a restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signaled are
		 * handled by machine_check_poll. Leave them alone, unless
		 * this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was for a corrected error, leave
		 * it to the corrected-error handler; don't touch unless
		 * we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_ring_add fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
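/*
 * TIF_MCE_NOTIFY set above is intended to make the interrupted task run
 * mce_notify_process() on its way back to user space, draining the per-CPU
 * PFN ring through memory_failure(); mce_report_event() provides the
 * backup path for cases where that exit hook can't run soon enough.
 */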
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int vector, int flags)
{
	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR, 0);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
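/*
 * Worked example of the adaptive interval, with HZ=1000 and the default
 * check_interval of 300s: each poll that logged an event halves the delay
 * (300s -> 150s -> ... bounded below by HZ/100 = 10ms), each quiet poll
 * doubles it again, capped at round_jiffies_relative(check_interval*HZ).
 * See mce_start_timer() below.
 */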
static void mce_start_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(__this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		/* wake processes polling /dev/mcelog */
		wake_up_interruptible(&mce_chrdev_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}
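/*
 * MCG_CAP fields used above, for reference: bits 7:0 hold the bank count
 * (MCG_BANKCNT_MASK), MCG_CTL_P advertises the global MCG_CTL register,
 * MCG_EXT_P/MCG_EXT_CNT describe the extended state registers (nine or
 * more implies MCG_EIP exists for precise RIP reporting), and MCG_SER_P
 * advertises software error recovery support.
 */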
static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}
static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return 0;

	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		return 1;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		return 1;
	}

	return 0;
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	setup_timer(t, mce_start_timer, smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */
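/*
 * Open semantics, as enforced below: an open with O_EXCL claims the device
 * exclusively, and any open fails while an exclusive opener exists (and
 * vice versa). A consumer such as the mcelog daemon can use O_EXCL to make
 * itself the only reader, which matters because reads are destructive.
 */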
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * Ideally we should only clear the record after it has been
	 * flushed to disk or sent over the network by /sbin/mcelog, but
	 * there is no interface to support that yet, so just clear it
	 * here to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
			 size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
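/*
 * For example, "mce=2,500000" on the kernel command line sets tolerant=2
 * and a 500ms Monarch timeout (the value is parsed as microseconds), and
 * "mce=ignore_ce" turns corrected-error polling and CMCI off entirely;
 * see mcheck_enable() below for the parsing.
 */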
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_sysdev: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysdev_class = {
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_sysdev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_sysdev_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_sysdev_initialized;
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_sysdev_create(unsigned int cpu)
{
	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&sysdev->kobj, 0, sizeof(struct kobject));
	sysdev->id  = cpu;
	sysdev->cls = &mce_sysdev_class;

	err = sysdev_register(sysdev);
	if (err)
		return err;

	for (i = 0; mce_sysdev_attrs[i]; i++) {
		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(sysdev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_sysdev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(sysdev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);

	sysdev_unregister(sysdev);

	return err;
}

static __cpuinit void mce_sysdev_remove(unsigned int cpu)
{
	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
		return;

	for (i = 0; mce_sysdev_attrs[i]; i++)
		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(sysdev, &mce_banks[i].attr);

	sysdev_unregister(sysdev);
	cpumask_clear_cpu(cpu, mce_sysdev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}
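/*
 * Offline/online choreography handled by the notifier below: on
 * CPU_DOWN_PREPARE the polling timer is stopped and the banks are disabled
 * on the target CPU via IPI; on CPU_DOWN_FAILED both are restored; after
 * CPU_POST_DEAD the surviving CPUs rediscover CMCI bank ownership so
 * corrected-error interrupts keep flowing. The FROZEN variants skip the
 * CMCI work, presumably because suspend/resume re-initializes it anyway.
 */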
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_sysdev_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_sysdev_remove(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					   __get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysdev_class);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_sysdev_create(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);

	/* register character device /dev/mcelog */
	misc_register(&mce_chrdev_device);

	return err;
}
device_initcall(mcheck_init_device);
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif