arch/x86/kernel/cpu/mcheck/mce.c, revision 69c60c88eeb364ebf58432f9bc38033522d58767
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/kobject.h> 17#include <linux/uaccess.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/syscore_ops.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/slab.h> 30#include <linux/init.h> 31#include <linux/kmod.h> 32#include <linux/poll.h> 33#include <linux/nmi.h> 34#include <linux/cpu.h> 35#include <linux/smp.h> 36#include <linux/fs.h> 37#include <linux/mm.h> 38#include <linux/debugfs.h> 39#include <linux/edac_mce.h> 40#include <linux/irq_work.h> 41#include <linux/export.h> 42 43#include <asm/processor.h> 44#include <asm/mce.h> 45#include <asm/msr.h> 46 47#include "mce-internal.h" 48 49static DEFINE_MUTEX(mce_chrdev_read_mutex); 50 51#define rcu_dereference_check_mce(p) \ 52 rcu_dereference_index_check((p), \ 53 rcu_read_lock_sched_held() || \ 54 lockdep_is_held(&mce_chrdev_read_mutex)) 55 56#define CREATE_TRACE_POINTS 57#include <trace/events/mce.h> 58 59int mce_disabled __read_mostly; 60 61#define MISC_MCELOG_MINOR 227 62 63#define SPINUNIT 100 /* 100ns */ 64 65atomic_t mce_entry; 66 67DEFINE_PER_CPU(unsigned, mce_exception_count); 68 69/* 70 * Tolerant levels: 71 * 0: always panic on uncorrected errors, log corrected errors 72 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 73 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 74 * 3: never panic or SIGBUS, log all errors (for testing only) 75 */ 76static int tolerant __read_mostly = 1; 77static int banks __read_mostly; 78static int rip_msr __read_mostly; 79static int mce_bootlog __read_mostly = -1; 80static int monarch_timeout __read_mostly = -1; 81static int mce_panic_timeout __read_mostly; 82static int mce_dont_log_ce __read_mostly; 83int mce_cmci_disabled __read_mostly; 84int mce_ignore_ce __read_mostly; 85int mce_ser __read_mostly; 86 87struct mce_bank *mce_banks __read_mostly; 88 89/* User mode helper program triggered by machine check event */ 90static unsigned long mce_need_notify; 91static char mce_helper[128]; 92static char *mce_helper_argv[2] = { mce_helper, NULL }; 93 94static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); 95 96static DEFINE_PER_CPU(struct mce, mces_seen); 97static int cpu_missing; 98 99/* 100 * CPU/chipset specific EDAC code can register a notifier call here to print 101 * MCE errors in a human-readable form. 102 */ 103ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 104EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 105 106/* MCA banks polled by the period polling timer for corrected events */ 107DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 108 [0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 109}; 110 111static DEFINE_PER_CPU(struct work_struct, mce_work); 112 113/* Do initial initialization of a struct mce */ 114void mce_setup(struct mce *m) 115{ 116 memset(m, 0, sizeof(struct mce)); 117 m->cpu = m->extcpu = smp_processor_id(); 118 rdtscll(m->tsc); 119 /* We hope get_seconds stays lockless */ 120 m->time = get_seconds(); 121 m->cpuvendor = boot_cpu_data.x86_vendor; 122 m->cpuid = cpuid_eax(1); 123#ifdef CONFIG_SMP 124 m->socketid = cpu_data(m->extcpu).phys_proc_id; 125#endif 126 m->apicid = cpu_data(m->extcpu).initial_apicid; 127 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 128} 129 130DEFINE_PER_CPU(struct mce, injectm); 131EXPORT_PER_CPU_SYMBOL_GPL(injectm); 132 133/* 134 * Lockless MCE logging infrastructure. 135 * This avoids deadlocks on printk locks without having to break locks. Also 136 * separate MCEs from kernel messages to avoid bogus bug reports. 137 */ 138 139static struct mce_log mcelog = { 140 .signature = MCE_LOG_SIGNATURE, 141 .len = MCE_LOG_LEN, 142 .recordlen = sizeof(struct mce), 143}; 144 145void mce_log(struct mce *mce) 146{ 147 unsigned next, entry; 148 149 /* Emit the trace record: */ 150 trace_mce_record(mce); 151 152 mce->finished = 0; 153 wmb(); 154 for (;;) { 155 entry = rcu_dereference_check_mce(mcelog.next); 156 for (;;) { 157 /* 158 * If edac_mce is enabled, it will check the error type 159 * and will process it, if it is a known error. 160 * Otherwise, the error will be sent through mcelog 161 * interface 162 */ 163 if (edac_mce_parse(mce)) 164 return; 165 166 /* 167 * When the buffer fills up discard new entries. 168 * Assume that the earlier errors are the more 169 * interesting ones: 170 */ 171 if (entry >= MCE_LOG_LEN) { 172 set_bit(MCE_OVERFLOW, 173 (unsigned long *)&mcelog.flags); 174 return; 175 } 176 /* Old left over entry. Skip: */ 177 if (mcelog.entry[entry].finished) { 178 entry++; 179 continue; 180 } 181 break; 182 } 183 smp_rmb(); 184 next = entry + 1; 185 if (cmpxchg(&mcelog.next, entry, next) == entry) 186 break; 187 } 188 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 189 wmb(); 190 mcelog.entry[entry].finished = 1; 191 wmb(); 192 193 mce->finished = 1; 194 set_bit(0, &mce_need_notify); 195} 196 197static void print_mce(struct mce *m) 198{ 199 int ret = 0; 200 201 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 202 m->extcpu, m->mcgstatus, m->bank, m->status); 203 204 if (m->ip) { 205 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 206 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 207 m->cs, m->ip); 208 209 if (m->cs == __KERNEL_CS) 210 print_symbol("{%s}", m->ip); 211 pr_cont("\n"); 212 } 213 214 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 215 if (m->addr) 216 pr_cont("ADDR %llx ", m->addr); 217 if (m->misc) 218 pr_cont("MISC %llx ", m->misc); 219 220 pr_cont("\n"); 221 /* 222 * Note this output is parsed by external tools and old fields 223 * should not be changed. 
224 */ 225 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 226 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 227 cpu_data(m->extcpu).microcode); 228 229 /* 230 * Print out human-readable details about the MCE error, 231 * (if the CPU has an implementation for that) 232 */ 233 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 234 if (ret == NOTIFY_STOP) 235 return; 236 237 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 238} 239 240#define PANIC_TIMEOUT 5 /* 5 seconds */ 241 242static atomic_t mce_paniced; 243 244static int fake_panic; 245static atomic_t mce_fake_paniced; 246 247/* Panic in progress. Enable interrupts and wait for final IPI */ 248static void wait_for_panic(void) 249{ 250 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 251 252 preempt_disable(); 253 local_irq_enable(); 254 while (timeout-- > 0) 255 udelay(1); 256 if (panic_timeout == 0) 257 panic_timeout = mce_panic_timeout; 258 panic("Panicing machine check CPU died"); 259} 260 261static void mce_panic(char *msg, struct mce *final, char *exp) 262{ 263 int i, apei_err = 0; 264 265 if (!fake_panic) { 266 /* 267 * Make sure only one CPU runs in machine check panic 268 */ 269 if (atomic_inc_return(&mce_paniced) > 1) 270 wait_for_panic(); 271 barrier(); 272 273 bust_spinlocks(1); 274 console_verbose(); 275 } else { 276 /* Don't log too much for fake panic */ 277 if (atomic_inc_return(&mce_fake_paniced) > 1) 278 return; 279 } 280 /* First print corrected ones that are still unlogged */ 281 for (i = 0; i < MCE_LOG_LEN; i++) { 282 struct mce *m = &mcelog.entry[i]; 283 if (!(m->status & MCI_STATUS_VAL)) 284 continue; 285 if (!(m->status & MCI_STATUS_UC)) { 286 print_mce(m); 287 if (!apei_err) 288 apei_err = apei_write_mce(m); 289 } 290 } 291 /* Now print uncorrected but with the final one last */ 292 for (i = 0; i < MCE_LOG_LEN; i++) { 293 struct mce *m = &mcelog.entry[i]; 294 if (!(m->status & MCI_STATUS_VAL)) 295 continue; 296 if (!(m->status & MCI_STATUS_UC)) 297 continue; 298 if (!final || memcmp(m, final, sizeof(struct mce))) { 299 print_mce(m); 300 if (!apei_err) 301 apei_err = apei_write_mce(m); 302 } 303 } 304 if (final) { 305 print_mce(final); 306 if (!apei_err) 307 apei_err = apei_write_mce(final); 308 } 309 if (cpu_missing) 310 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 311 if (exp) 312 pr_emerg(HW_ERR "Machine check: %s\n", exp); 313 if (!fake_panic) { 314 if (panic_timeout == 0) 315 panic_timeout = mce_panic_timeout; 316 panic(msg); 317 } else 318 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 319} 320 321/* Support code for software error injection */ 322 323static int msr_to_offset(u32 msr) 324{ 325 unsigned bank = __this_cpu_read(injectm.bank); 326 327 if (msr == rip_msr) 328 return offsetof(struct mce, ip); 329 if (msr == MSR_IA32_MCx_STATUS(bank)) 330 return offsetof(struct mce, status); 331 if (msr == MSR_IA32_MCx_ADDR(bank)) 332 return offsetof(struct mce, addr); 333 if (msr == MSR_IA32_MCx_MISC(bank)) 334 return offsetof(struct mce, misc); 335 if (msr == MSR_IA32_MCG_STATUS) 336 return offsetof(struct mce, mcgstatus); 337 return -1; 338} 339 340/* MSR access wrappers used for error injection */ 341static u64 mce_rdmsrl(u32 msr) 342{ 343 u64 v; 344 345 if (__this_cpu_read(injectm.finished)) { 346 int offset = msr_to_offset(msr); 347 348 if (offset < 0) 349 return 0; 350 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 351 } 352 353 if (rdmsrl_safe(msr, &v)) { 354 WARN_ONCE(1, "mce: Unable to read msr 
%d!\n", msr); 355 /* 356 * Return zero in case the access faulted. This should 357 * not happen normally but can happen if the CPU does 358 * something weird, or if the code is buggy. 359 */ 360 v = 0; 361 } 362 363 return v; 364} 365 366static void mce_wrmsrl(u32 msr, u64 v) 367{ 368 if (__this_cpu_read(injectm.finished)) { 369 int offset = msr_to_offset(msr); 370 371 if (offset >= 0) 372 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 373 return; 374 } 375 wrmsrl(msr, v); 376} 377 378/* 379 * Collect all global (w.r.t. this processor) status about this machine 380 * check into our "mce" struct so that we can use it later to assess 381 * the severity of the problem as we read per-bank specific details. 382 */ 383static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 384{ 385 mce_setup(m); 386 387 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 388 if (regs) { 389 /* 390 * Get the address of the instruction at the time of 391 * the machine check error. 392 */ 393 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 394 m->ip = regs->ip; 395 m->cs = regs->cs; 396 } 397 /* Use accurate RIP reporting if available. */ 398 if (rip_msr) 399 m->ip = mce_rdmsrl(rip_msr); 400 } 401} 402 403/* 404 * Simple lockless ring to communicate PFNs from the exception handler with the 405 * process context work function. This is vastly simplified because there's 406 * only a single reader and a single writer. 407 */ 408#define MCE_RING_SIZE 16 /* we use one entry less */ 409 410struct mce_ring { 411 unsigned short start; 412 unsigned short end; 413 unsigned long ring[MCE_RING_SIZE]; 414}; 415static DEFINE_PER_CPU(struct mce_ring, mce_ring); 416 417/* Runs with CPU affinity in workqueue */ 418static int mce_ring_empty(void) 419{ 420 struct mce_ring *r = &__get_cpu_var(mce_ring); 421 422 return r->start == r->end; 423} 424 425static int mce_ring_get(unsigned long *pfn) 426{ 427 struct mce_ring *r; 428 int ret = 0; 429 430 *pfn = 0; 431 get_cpu(); 432 r = &__get_cpu_var(mce_ring); 433 if (r->start == r->end) 434 goto out; 435 *pfn = r->ring[r->start]; 436 r->start = (r->start + 1) % MCE_RING_SIZE; 437 ret = 1; 438out: 439 put_cpu(); 440 return ret; 441} 442 443/* Always runs in MCE context with preempt off */ 444static int mce_ring_add(unsigned long pfn) 445{ 446 struct mce_ring *r = &__get_cpu_var(mce_ring); 447 unsigned next; 448 449 next = (r->end + 1) % MCE_RING_SIZE; 450 if (next == r->start) 451 return -1; 452 r->ring[r->end] = pfn; 453 wmb(); 454 r->end = next; 455 return 0; 456} 457 458int mce_available(struct cpuinfo_x86 *c) 459{ 460 if (mce_disabled) 461 return 0; 462 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 463} 464 465static void mce_schedule_work(void) 466{ 467 if (!mce_ring_empty()) { 468 struct work_struct *work = &__get_cpu_var(mce_work); 469 if (!work_pending(work)) 470 schedule_work(work); 471 } 472} 473 474DEFINE_PER_CPU(struct irq_work, mce_irq_work); 475 476static void mce_irq_work_cb(struct irq_work *entry) 477{ 478 mce_notify_irq(); 479 mce_schedule_work(); 480} 481 482static void mce_report_event(struct pt_regs *regs) 483{ 484 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 485 mce_notify_irq(); 486 /* 487 * Triggering the work queue here is just an insurance 488 * policy in case the syscall exit notify handler 489 * doesn't run soon enough or ends up running on the 490 * wrong CPU (can happen when audit sleeps) 491 */ 492 mce_schedule_work(); 493 return; 494 } 495 496 irq_work_queue(&__get_cpu_var(mce_irq_work)); 497} 498 
499DEFINE_PER_CPU(unsigned, mce_poll_count); 500 501/* 502 * Poll for corrected events or events that happened before reset. 503 * Those are just logged through /dev/mcelog. 504 * 505 * This is executed in standard interrupt context. 506 * 507 * Note: spec recommends to panic for fatal unsignalled 508 * errors here. However this would be quite problematic -- 509 * we would need to reimplement the Monarch handling and 510 * it would mess up the exclusion between exception handler 511 * and poll hander -- * so we skip this for now. 512 * These cases should not happen anyways, or only when the CPU 513 * is already totally * confused. In this case it's likely it will 514 * not fully execute the machine check handler either. 515 */ 516void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 517{ 518 struct mce m; 519 int i; 520 521 percpu_inc(mce_poll_count); 522 523 mce_gather_info(&m, NULL); 524 525 for (i = 0; i < banks; i++) { 526 if (!mce_banks[i].ctl || !test_bit(i, *b)) 527 continue; 528 529 m.misc = 0; 530 m.addr = 0; 531 m.bank = i; 532 m.tsc = 0; 533 534 barrier(); 535 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 536 if (!(m.status & MCI_STATUS_VAL)) 537 continue; 538 539 /* 540 * Uncorrected or signalled events are handled by the exception 541 * handler when it is enabled, so don't process those here. 542 * 543 * TBD do the same check for MCI_STATUS_EN here? 544 */ 545 if (!(flags & MCP_UC) && 546 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 547 continue; 548 549 if (m.status & MCI_STATUS_MISCV) 550 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 551 if (m.status & MCI_STATUS_ADDRV) 552 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 553 554 if (!(flags & MCP_TIMESTAMP)) 555 m.tsc = 0; 556 /* 557 * Don't get the IP here because it's unlikely to 558 * have anything to do with the actual error location. 559 */ 560 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 561 mce_log(&m); 562 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 563 } 564 565 /* 566 * Clear state for this bank. 567 */ 568 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 569 } 570 571 /* 572 * Don't clear MCG_STATUS here because it's only defined for 573 * exceptions. 574 */ 575 576 sync_core(); 577} 578EXPORT_SYMBOL_GPL(machine_check_poll); 579 580/* 581 * Do a quick check if any of the events requires a panic. 582 * This decides if we keep the events around or clear them. 583 */ 584static int mce_no_way_out(struct mce *m, char **msg) 585{ 586 int i; 587 588 for (i = 0; i < banks; i++) { 589 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 590 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 591 return 1; 592 } 593 return 0; 594} 595 596/* 597 * Variable to establish order between CPUs while scanning. 598 * Each CPU spins initially until executing is equal its number. 599 */ 600static atomic_t mce_executing; 601 602/* 603 * Defines order of CPUs on entry. First CPU becomes Monarch. 604 */ 605static atomic_t mce_callin; 606 607/* 608 * Check if a timeout waiting for other CPUs happened. 609 */ 610static int mce_timed_out(u64 *t) 611{ 612 /* 613 * The others already did panic for some reason. 614 * Bail out like in a timeout. 615 * rmb() to tell the compiler that system_state 616 * might have been modified by someone else. 617 */ 618 rmb(); 619 if (atomic_read(&mce_paniced)) 620 wait_for_panic(); 621 if (!monarch_timeout) 622 goto out; 623 if ((s64)*t < SPINUNIT) { 624 /* CHECKME: Make panic default for 1 too? 
*/ 625 if (tolerant < 1) 626 mce_panic("Timeout synchronizing machine check over CPUs", 627 NULL, NULL); 628 cpu_missing = 1; 629 return 1; 630 } 631 *t -= SPINUNIT; 632out: 633 touch_nmi_watchdog(); 634 return 0; 635} 636 637/* 638 * The Monarch's reign. The Monarch is the CPU who entered 639 * the machine check handler first. It waits for the others to 640 * raise the exception too and then grades them. When any 641 * error is fatal panic. Only then let the others continue. 642 * 643 * The other CPUs entering the MCE handler will be controlled by the 644 * Monarch. They are called Subjects. 645 * 646 * This way we prevent any potential data corruption in a unrecoverable case 647 * and also makes sure always all CPU's errors are examined. 648 * 649 * Also this detects the case of a machine check event coming from outer 650 * space (not detected by any CPUs) In this case some external agent wants 651 * us to shut down, so panic too. 652 * 653 * The other CPUs might still decide to panic if the handler happens 654 * in a unrecoverable place, but in this case the system is in a semi-stable 655 * state and won't corrupt anything by itself. It's ok to let the others 656 * continue for a bit first. 657 * 658 * All the spin loops have timeouts; when a timeout happens a CPU 659 * typically elects itself to be Monarch. 660 */ 661static void mce_reign(void) 662{ 663 int cpu; 664 struct mce *m = NULL; 665 int global_worst = 0; 666 char *msg = NULL; 667 char *nmsg = NULL; 668 669 /* 670 * This CPU is the Monarch and the other CPUs have run 671 * through their handlers. 672 * Grade the severity of the errors of all the CPUs. 673 */ 674 for_each_possible_cpu(cpu) { 675 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 676 &nmsg); 677 if (severity > global_worst) { 678 msg = nmsg; 679 global_worst = severity; 680 m = &per_cpu(mces_seen, cpu); 681 } 682 } 683 684 /* 685 * Cannot recover? Panic here then. 686 * This dumps all the mces in the log buffer and stops the 687 * other CPUs. 688 */ 689 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 690 mce_panic("Fatal Machine check", m, msg); 691 692 /* 693 * For UC somewhere we let the CPU who detects it handle it. 694 * Also must let continue the others, otherwise the handling 695 * CPU could deadlock on a lock. 696 */ 697 698 /* 699 * No machine check event found. Must be some external 700 * source or one CPU is hung. Panic. 701 */ 702 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) 703 mce_panic("Machine check from unknown source", NULL, NULL); 704 705 /* 706 * Now clear all the mces_seen so that they don't reappear on 707 * the next mce. 708 */ 709 for_each_possible_cpu(cpu) 710 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 711} 712 713static atomic_t global_nwo; 714 715/* 716 * Start of Monarch synchronization. This waits until all CPUs have 717 * entered the exception handler and then determines if any of them 718 * saw a fatal event that requires panic. Then it executes them 719 * in the entry order. 720 * TBD double check parallel CPU hotunplug 721 */ 722static int mce_start(int *no_way_out) 723{ 724 int order; 725 int cpus = num_online_cpus(); 726 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 727 728 if (!timeout) 729 return -1; 730 731 atomic_add(*no_way_out, &global_nwo); 732 /* 733 * global_nwo should be updated before mce_callin 734 */ 735 smp_wmb(); 736 order = atomic_inc_return(&mce_callin); 737 738 /* 739 * Wait for everyone. 
740 */ 741 while (atomic_read(&mce_callin) != cpus) { 742 if (mce_timed_out(&timeout)) { 743 atomic_set(&global_nwo, 0); 744 return -1; 745 } 746 ndelay(SPINUNIT); 747 } 748 749 /* 750 * mce_callin should be read before global_nwo 751 */ 752 smp_rmb(); 753 754 if (order == 1) { 755 /* 756 * Monarch: Starts executing now, the others wait. 757 */ 758 atomic_set(&mce_executing, 1); 759 } else { 760 /* 761 * Subject: Now start the scanning loop one by one in 762 * the original callin order. 763 * This way when there are any shared banks it will be 764 * only seen by one CPU before cleared, avoiding duplicates. 765 */ 766 while (atomic_read(&mce_executing) < order) { 767 if (mce_timed_out(&timeout)) { 768 atomic_set(&global_nwo, 0); 769 return -1; 770 } 771 ndelay(SPINUNIT); 772 } 773 } 774 775 /* 776 * Cache the global no_way_out state. 777 */ 778 *no_way_out = atomic_read(&global_nwo); 779 780 return order; 781} 782 783/* 784 * Synchronize between CPUs after main scanning loop. 785 * This invokes the bulk of the Monarch processing. 786 */ 787static int mce_end(int order) 788{ 789 int ret = -1; 790 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 791 792 if (!timeout) 793 goto reset; 794 if (order < 0) 795 goto reset; 796 797 /* 798 * Allow others to run. 799 */ 800 atomic_inc(&mce_executing); 801 802 if (order == 1) { 803 /* CHECKME: Can this race with a parallel hotplug? */ 804 int cpus = num_online_cpus(); 805 806 /* 807 * Monarch: Wait for everyone to go through their scanning 808 * loops. 809 */ 810 while (atomic_read(&mce_executing) <= cpus) { 811 if (mce_timed_out(&timeout)) 812 goto reset; 813 ndelay(SPINUNIT); 814 } 815 816 mce_reign(); 817 barrier(); 818 ret = 0; 819 } else { 820 /* 821 * Subject: Wait for Monarch to finish. 822 */ 823 while (atomic_read(&mce_executing) != 0) { 824 if (mce_timed_out(&timeout)) 825 goto reset; 826 ndelay(SPINUNIT); 827 } 828 829 /* 830 * Don't reset anything. That's done by the Monarch. 831 */ 832 return 0; 833 } 834 835 /* 836 * Reset all global state. 837 */ 838reset: 839 atomic_set(&global_nwo, 0); 840 atomic_set(&mce_callin, 0); 841 barrier(); 842 843 /* 844 * Let others run again. 845 */ 846 atomic_set(&mce_executing, 0); 847 return ret; 848} 849 850/* 851 * Check if the address reported by the CPU is in a format we can parse. 852 * It would be possible to add code for most other cases, but all would 853 * be somewhat complicated (e.g. segment offset would require an instruction 854 * parser). So only support physical addresses up to page granuality for now. 855 */ 856static int mce_usable_address(struct mce *m) 857{ 858 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 859 return 0; 860 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) 861 return 0; 862 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) 863 return 0; 864 return 1; 865} 866 867static void mce_clear_state(unsigned long *toclear) 868{ 869 int i; 870 871 for (i = 0; i < banks; i++) { 872 if (test_bit(i, toclear)) 873 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 874 } 875} 876 877/* 878 * The actual machine check handler. This only handles real 879 * exceptions when something got corrupted coming in through int 18. 880 * 881 * This is executed in NMI context not subject to normal locking rules. This 882 * implies that most kernel services cannot be safely used. Don't even 883 * think about putting a printk in there! 884 * 885 * On Intel systems this is entered on all CPUs in parallel through 886 * MCE broadcast. 
However some CPUs might be broken beyond repair, 887 * so be always careful when synchronizing with others. 888 */ 889void do_machine_check(struct pt_regs *regs, long error_code) 890{ 891 struct mce m, *final; 892 int i; 893 int worst = 0; 894 int severity; 895 /* 896 * Establish sequential order between the CPUs entering the machine 897 * check handler. 898 */ 899 int order; 900 /* 901 * If no_way_out gets set, there is no safe way to recover from this 902 * MCE. If tolerant is cranked up, we'll try anyway. 903 */ 904 int no_way_out = 0; 905 /* 906 * If kill_it gets set, there might be a way to recover from this 907 * error. 908 */ 909 int kill_it = 0; 910 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 911 char *msg = "Unknown"; 912 913 atomic_inc(&mce_entry); 914 915 percpu_inc(mce_exception_count); 916 917 if (!banks) 918 goto out; 919 920 mce_gather_info(&m, regs); 921 922 final = &__get_cpu_var(mces_seen); 923 *final = m; 924 925 no_way_out = mce_no_way_out(&m, &msg); 926 927 barrier(); 928 929 /* 930 * When no restart IP must always kill or panic. 931 */ 932 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 933 kill_it = 1; 934 935 /* 936 * Go through all the banks in exclusion of the other CPUs. 937 * This way we don't report duplicated events on shared banks 938 * because the first one to see it will clear it. 939 */ 940 order = mce_start(&no_way_out); 941 for (i = 0; i < banks; i++) { 942 __clear_bit(i, toclear); 943 if (!mce_banks[i].ctl) 944 continue; 945 946 m.misc = 0; 947 m.addr = 0; 948 m.bank = i; 949 950 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 951 if ((m.status & MCI_STATUS_VAL) == 0) 952 continue; 953 954 /* 955 * Non uncorrected or non signaled errors are handled by 956 * machine_check_poll. Leave them alone, unless this panics. 957 */ 958 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 959 !no_way_out) 960 continue; 961 962 /* 963 * Set taint even when machine check was not enabled. 964 */ 965 add_taint(TAINT_MACHINE_CHECK); 966 967 severity = mce_severity(&m, tolerant, NULL); 968 969 /* 970 * When machine check was for corrected handler don't touch, 971 * unless we're panicing. 972 */ 973 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 974 continue; 975 __set_bit(i, toclear); 976 if (severity == MCE_NO_SEVERITY) { 977 /* 978 * Machine check event was not enabled. Clear, but 979 * ignore. 980 */ 981 continue; 982 } 983 984 /* 985 * Kill on action required. 986 */ 987 if (severity == MCE_AR_SEVERITY) 988 kill_it = 1; 989 990 if (m.status & MCI_STATUS_MISCV) 991 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); 992 if (m.status & MCI_STATUS_ADDRV) 993 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); 994 995 /* 996 * Action optional error. Queue address for later processing. 997 * When the ring overflows we just ignore the AO error. 998 * RED-PEN add some logging mechanism when 999 * usable_address or mce_add_ring fails. 1000 * RED-PEN don't ignore overflow for tolerant == 0 1001 */ 1002 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1003 mce_ring_add(m.addr >> PAGE_SHIFT); 1004 1005 mce_log(&m); 1006 1007 if (severity > worst) { 1008 *final = m; 1009 worst = severity; 1010 } 1011 } 1012 1013 if (!no_way_out) 1014 mce_clear_state(toclear); 1015 1016 /* 1017 * Do most of the synchronization with other CPUs. 1018 * When there's any problem use only local no_way_out state. 
1019 */ 1020 if (mce_end(order) < 0) 1021 no_way_out = worst >= MCE_PANIC_SEVERITY; 1022 1023 /* 1024 * If we have decided that we just CAN'T continue, and the user 1025 * has not set tolerant to an insane level, give up and die. 1026 * 1027 * This is mainly used in the case when the system doesn't 1028 * support MCE broadcasting or it has been disabled. 1029 */ 1030 if (no_way_out && tolerant < 3) 1031 mce_panic("Fatal machine check on current CPU", final, msg); 1032 1033 /* 1034 * If the error seems to be unrecoverable, something should be 1035 * done. Try to kill as little as possible. If we can kill just 1036 * one task, do that. If the user has set the tolerance very 1037 * high, don't try to do anything at all. 1038 */ 1039 1040 if (kill_it && tolerant < 3) 1041 force_sig(SIGBUS, current); 1042 1043 /* notify userspace ASAP */ 1044 set_thread_flag(TIF_MCE_NOTIFY); 1045 1046 if (worst > 0) 1047 mce_report_event(regs); 1048 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1049out: 1050 atomic_dec(&mce_entry); 1051 sync_core(); 1052} 1053EXPORT_SYMBOL_GPL(do_machine_check); 1054 1055/* dummy to break dependency. actual code is in mm/memory-failure.c */ 1056void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) 1057{ 1058 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); 1059} 1060 1061/* 1062 * Called after mce notification in process context. This code 1063 * is allowed to sleep. Call the high level VM handler to process 1064 * any corrupted pages. 1065 * Assume that the work queue code only calls this one at a time 1066 * per CPU. 1067 * Note we don't disable preemption, so this code might run on the wrong 1068 * CPU. In this case the event is picked up by the scheduled work queue. 1069 * This is merely a fast path to expedite processing in some common 1070 * cases. 1071 */ 1072void mce_notify_process(void) 1073{ 1074 unsigned long pfn; 1075 mce_notify_irq(); 1076 while (mce_ring_get(&pfn)) 1077 memory_failure(pfn, MCE_VECTOR); 1078} 1079 1080static void mce_process_work(struct work_struct *dummy) 1081{ 1082 mce_notify_process(); 1083} 1084 1085#ifdef CONFIG_X86_MCE_INTEL 1086/*** 1087 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1088 * @cpu: The CPU on which the event occurred. 1089 * @status: Event status information 1090 * 1091 * This function should be called by the thermal interrupt after the 1092 * event has been processed and the decision was made to log the event 1093 * further. 1094 * 1095 * The status parameter will be saved to the 'status' field of 'struct mce' 1096 * and historically has been the register value of the 1097 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1098 */ 1099void mce_log_therm_throt_event(__u64 status) 1100{ 1101 struct mce m; 1102 1103 mce_setup(&m); 1104 m.bank = MCE_THERMAL_BANK; 1105 m.status = status; 1106 mce_log(&m); 1107} 1108#endif /* CONFIG_X86_MCE_INTEL */ 1109 1110/* 1111 * Periodic polling timer for "silent" machine check errors. If the 1112 * poller finds an MCE, poll 2x faster. When the poller finds no more 1113 * errors, poll 2x slower (up to check_interval seconds). 
1114 */ 1115static int check_interval = 5 * 60; /* 5 minutes */ 1116 1117static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1118static DEFINE_PER_CPU(struct timer_list, mce_timer); 1119 1120static void mce_start_timer(unsigned long data) 1121{ 1122 struct timer_list *t = &per_cpu(mce_timer, data); 1123 int *n; 1124 1125 WARN_ON(smp_processor_id() != data); 1126 1127 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1128 machine_check_poll(MCP_TIMESTAMP, 1129 &__get_cpu_var(mce_poll_banks)); 1130 } 1131 1132 /* 1133 * Alert userspace if needed. If we logged an MCE, reduce the 1134 * polling interval, otherwise increase the polling interval. 1135 */ 1136 n = &__get_cpu_var(mce_next_interval); 1137 if (mce_notify_irq()) 1138 *n = max(*n/2, HZ/100); 1139 else 1140 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1141 1142 t->expires = jiffies + *n; 1143 add_timer_on(t, smp_processor_id()); 1144} 1145 1146/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1147static void mce_timer_delete_all(void) 1148{ 1149 int cpu; 1150 1151 for_each_online_cpu(cpu) 1152 del_timer_sync(&per_cpu(mce_timer, cpu)); 1153} 1154 1155static void mce_do_trigger(struct work_struct *work) 1156{ 1157 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1158} 1159 1160static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1161 1162/* 1163 * Notify the user(s) about new machine check events. 1164 * Can be called from interrupt context, but not from machine check/NMI 1165 * context. 1166 */ 1167int mce_notify_irq(void) 1168{ 1169 /* Not more than two messages every minute */ 1170 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1171 1172 clear_thread_flag(TIF_MCE_NOTIFY); 1173 1174 if (test_and_clear_bit(0, &mce_need_notify)) { 1175 /* wake processes polling /dev/mcelog */ 1176 wake_up_interruptible(&mce_chrdev_wait); 1177 1178 /* 1179 * There is no risk of missing notifications because 1180 * work_pending is always cleared before the function is 1181 * executed. 1182 */ 1183 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1184 schedule_work(&mce_trigger_work); 1185 1186 if (__ratelimit(&ratelimit)) 1187 pr_info(HW_ERR "Machine check events logged\n"); 1188 1189 return 1; 1190 } 1191 return 0; 1192} 1193EXPORT_SYMBOL_GPL(mce_notify_irq); 1194 1195static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1196{ 1197 int i; 1198 1199 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1200 if (!mce_banks) 1201 return -ENOMEM; 1202 for (i = 0; i < banks; i++) { 1203 struct mce_bank *b = &mce_banks[i]; 1204 1205 b->ctl = -1ULL; 1206 b->init = 1; 1207 } 1208 return 0; 1209} 1210 1211/* 1212 * Initialize Machine Checks for a CPU. 1213 */ 1214static int __cpuinit __mcheck_cpu_cap_init(void) 1215{ 1216 unsigned b; 1217 u64 cap; 1218 1219 rdmsrl(MSR_IA32_MCG_CAP, cap); 1220 1221 b = cap & MCG_BANKCNT_MASK; 1222 if (!banks) 1223 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1224 1225 if (b > MAX_NR_BANKS) { 1226 printk(KERN_WARNING 1227 "MCE: Using only %u machine check banks out of %u\n", 1228 MAX_NR_BANKS, b); 1229 b = MAX_NR_BANKS; 1230 } 1231 1232 /* Don't support asymmetric configurations today */ 1233 WARN_ON(banks != 0 && b != banks); 1234 banks = b; 1235 if (!mce_banks) { 1236 int err = __mcheck_cpu_mce_banks_init(); 1237 1238 if (err) 1239 return err; 1240 } 1241 1242 /* Use accurate RIP reporting if available. 
*/ 1243 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1244 rip_msr = MSR_IA32_MCG_EIP; 1245 1246 if (cap & MCG_SER_P) 1247 mce_ser = 1; 1248 1249 return 0; 1250} 1251 1252static void __mcheck_cpu_init_generic(void) 1253{ 1254 mce_banks_t all_banks; 1255 u64 cap; 1256 int i; 1257 1258 /* 1259 * Log the machine checks left over from the previous reset. 1260 */ 1261 bitmap_fill(all_banks, MAX_NR_BANKS); 1262 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1263 1264 set_in_cr4(X86_CR4_MCE); 1265 1266 rdmsrl(MSR_IA32_MCG_CAP, cap); 1267 if (cap & MCG_CTL_P) 1268 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1269 1270 for (i = 0; i < banks; i++) { 1271 struct mce_bank *b = &mce_banks[i]; 1272 1273 if (!b->init) 1274 continue; 1275 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1276 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1277 } 1278} 1279 1280/* Add per CPU specific workarounds here */ 1281static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1282{ 1283 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1284 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1285 return -EOPNOTSUPP; 1286 } 1287 1288 /* This should be disabled by the BIOS, but isn't always */ 1289 if (c->x86_vendor == X86_VENDOR_AMD) { 1290 if (c->x86 == 15 && banks > 4) { 1291 /* 1292 * disable GART TBL walk error reporting, which 1293 * trips off incorrectly with the IOMMU & 3ware 1294 * & Cerberus: 1295 */ 1296 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1297 } 1298 if (c->x86 <= 17 && mce_bootlog < 0) { 1299 /* 1300 * Lots of broken BIOS around that don't clear them 1301 * by default and leave crap in there. Don't log: 1302 */ 1303 mce_bootlog = 0; 1304 } 1305 /* 1306 * Various K7s with broken bank 0 around. Always disable 1307 * by default. 1308 */ 1309 if (c->x86 == 6 && banks > 0) 1310 mce_banks[0].ctl = 0; 1311 } 1312 1313 if (c->x86_vendor == X86_VENDOR_INTEL) { 1314 /* 1315 * SDM documents that on family 6 bank 0 should not be written 1316 * because it aliases to another special BIOS controlled 1317 * register. 1318 * But it's not aliased anymore on model 0x1a+ 1319 * Don't ignore bank 0 completely because there could be a 1320 * valid event later, merely don't write CTL0. 1321 */ 1322 1323 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1324 mce_banks[0].init = 0; 1325 1326 /* 1327 * All newer Intel systems support MCE broadcasting. Enable 1328 * synchronization with a one second timeout. 
1329 */ 1330 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1331 monarch_timeout < 0) 1332 monarch_timeout = USEC_PER_SEC; 1333 1334 /* 1335 * There are also broken BIOSes on some Pentium M and 1336 * earlier systems: 1337 */ 1338 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1339 mce_bootlog = 0; 1340 } 1341 if (monarch_timeout < 0) 1342 monarch_timeout = 0; 1343 if (mce_bootlog != 0) 1344 mce_panic_timeout = 30; 1345 1346 return 0; 1347} 1348 1349static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1350{ 1351 if (c->x86 != 5) 1352 return 0; 1353 1354 switch (c->x86_vendor) { 1355 case X86_VENDOR_INTEL: 1356 intel_p5_mcheck_init(c); 1357 return 1; 1358 break; 1359 case X86_VENDOR_CENTAUR: 1360 winchip_mcheck_init(c); 1361 return 1; 1362 break; 1363 } 1364 1365 return 0; 1366} 1367 1368static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1369{ 1370 switch (c->x86_vendor) { 1371 case X86_VENDOR_INTEL: 1372 mce_intel_feature_init(c); 1373 break; 1374 case X86_VENDOR_AMD: 1375 mce_amd_feature_init(c); 1376 break; 1377 default: 1378 break; 1379 } 1380} 1381 1382static void __mcheck_cpu_init_timer(void) 1383{ 1384 struct timer_list *t = &__get_cpu_var(mce_timer); 1385 int *n = &__get_cpu_var(mce_next_interval); 1386 1387 setup_timer(t, mce_start_timer, smp_processor_id()); 1388 1389 if (mce_ignore_ce) 1390 return; 1391 1392 *n = check_interval * HZ; 1393 if (!*n) 1394 return; 1395 t->expires = round_jiffies(jiffies + *n); 1396 add_timer_on(t, smp_processor_id()); 1397} 1398 1399/* Handle unconfigured int18 (should never happen) */ 1400static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1401{ 1402 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1403 smp_processor_id()); 1404} 1405 1406/* Call the installed machine check handler for this CPU setup. */ 1407void (*machine_check_vector)(struct pt_regs *, long error_code) = 1408 unexpected_machine_check; 1409 1410/* 1411 * Called for each booted CPU to set up machine checks. 1412 * Must be called with preempt off: 1413 */ 1414void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1415{ 1416 if (mce_disabled) 1417 return; 1418 1419 if (__mcheck_cpu_ancient_init(c)) 1420 return; 1421 1422 if (!mce_available(c)) 1423 return; 1424 1425 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1426 mce_disabled = 1; 1427 return; 1428 } 1429 1430 machine_check_vector = do_machine_check; 1431 1432 __mcheck_cpu_init_generic(); 1433 __mcheck_cpu_init_vendor(c); 1434 __mcheck_cpu_init_timer(); 1435 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1436 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1437} 1438 1439/* 1440 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. 1441 */ 1442 1443static DEFINE_SPINLOCK(mce_chrdev_state_lock); 1444static int mce_chrdev_open_count; /* #times opened */ 1445static int mce_chrdev_open_exclu; /* already open exclusive? 
*/ 1446 1447static int mce_chrdev_open(struct inode *inode, struct file *file) 1448{ 1449 spin_lock(&mce_chrdev_state_lock); 1450 1451 if (mce_chrdev_open_exclu || 1452 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { 1453 spin_unlock(&mce_chrdev_state_lock); 1454 1455 return -EBUSY; 1456 } 1457 1458 if (file->f_flags & O_EXCL) 1459 mce_chrdev_open_exclu = 1; 1460 mce_chrdev_open_count++; 1461 1462 spin_unlock(&mce_chrdev_state_lock); 1463 1464 return nonseekable_open(inode, file); 1465} 1466 1467static int mce_chrdev_release(struct inode *inode, struct file *file) 1468{ 1469 spin_lock(&mce_chrdev_state_lock); 1470 1471 mce_chrdev_open_count--; 1472 mce_chrdev_open_exclu = 0; 1473 1474 spin_unlock(&mce_chrdev_state_lock); 1475 1476 return 0; 1477} 1478 1479static void collect_tscs(void *data) 1480{ 1481 unsigned long *cpu_tsc = (unsigned long *)data; 1482 1483 rdtscll(cpu_tsc[smp_processor_id()]); 1484} 1485 1486static int mce_apei_read_done; 1487 1488/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ 1489static int __mce_read_apei(char __user **ubuf, size_t usize) 1490{ 1491 int rc; 1492 u64 record_id; 1493 struct mce m; 1494 1495 if (usize < sizeof(struct mce)) 1496 return -EINVAL; 1497 1498 rc = apei_read_mce(&m, &record_id); 1499 /* Error or no more MCE record */ 1500 if (rc <= 0) { 1501 mce_apei_read_done = 1; 1502 return rc; 1503 } 1504 rc = -EFAULT; 1505 if (copy_to_user(*ubuf, &m, sizeof(struct mce))) 1506 return rc; 1507 /* 1508 * In fact, we should have cleared the record after that has 1509 * been flushed to the disk or sent to network in 1510 * /sbin/mcelog, but we have no interface to support that now, 1511 * so just clear it to avoid duplication. 1512 */ 1513 rc = apei_clear_mce(record_id); 1514 if (rc) { 1515 mce_apei_read_done = 1; 1516 return rc; 1517 } 1518 *ubuf += sizeof(struct mce); 1519 1520 return 0; 1521} 1522 1523static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, 1524 size_t usize, loff_t *off) 1525{ 1526 char __user *buf = ubuf; 1527 unsigned long *cpu_tsc; 1528 unsigned prev, next; 1529 int i, err; 1530 1531 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1532 if (!cpu_tsc) 1533 return -ENOMEM; 1534 1535 mutex_lock(&mce_chrdev_read_mutex); 1536 1537 if (!mce_apei_read_done) { 1538 err = __mce_read_apei(&buf, usize); 1539 if (err || buf != ubuf) 1540 goto out; 1541 } 1542 1543 next = rcu_dereference_check_mce(mcelog.next); 1544 1545 /* Only supports full reads right now */ 1546 err = -EINVAL; 1547 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) 1548 goto out; 1549 1550 err = 0; 1551 prev = 0; 1552 do { 1553 for (i = prev; i < next; i++) { 1554 unsigned long start = jiffies; 1555 struct mce *m = &mcelog.entry[i]; 1556 1557 while (!m->finished) { 1558 if (time_after_eq(jiffies, start + 2)) { 1559 memset(m, 0, sizeof(*m)); 1560 goto timeout; 1561 } 1562 cpu_relax(); 1563 } 1564 smp_rmb(); 1565 err |= copy_to_user(buf, m, sizeof(*m)); 1566 buf += sizeof(*m); 1567timeout: 1568 ; 1569 } 1570 1571 memset(mcelog.entry + prev, 0, 1572 (next - prev) * sizeof(struct mce)); 1573 prev = next; 1574 next = cmpxchg(&mcelog.next, prev, 0); 1575 } while (next != prev); 1576 1577 synchronize_sched(); 1578 1579 /* 1580 * Collect entries that were still getting written before the 1581 * synchronize. 
1582 */ 1583 on_each_cpu(collect_tscs, cpu_tsc, 1); 1584 1585 for (i = next; i < MCE_LOG_LEN; i++) { 1586 struct mce *m = &mcelog.entry[i]; 1587 1588 if (m->finished && m->tsc < cpu_tsc[m->cpu]) { 1589 err |= copy_to_user(buf, m, sizeof(*m)); 1590 smp_rmb(); 1591 buf += sizeof(*m); 1592 memset(m, 0, sizeof(*m)); 1593 } 1594 } 1595 1596 if (err) 1597 err = -EFAULT; 1598 1599out: 1600 mutex_unlock(&mce_chrdev_read_mutex); 1601 kfree(cpu_tsc); 1602 1603 return err ? err : buf - ubuf; 1604} 1605 1606static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) 1607{ 1608 poll_wait(file, &mce_chrdev_wait, wait); 1609 if (rcu_access_index(mcelog.next)) 1610 return POLLIN | POLLRDNORM; 1611 if (!mce_apei_read_done && apei_check_mce()) 1612 return POLLIN | POLLRDNORM; 1613 return 0; 1614} 1615 1616static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, 1617 unsigned long arg) 1618{ 1619 int __user *p = (int __user *)arg; 1620 1621 if (!capable(CAP_SYS_ADMIN)) 1622 return -EPERM; 1623 1624 switch (cmd) { 1625 case MCE_GET_RECORD_LEN: 1626 return put_user(sizeof(struct mce), p); 1627 case MCE_GET_LOG_LEN: 1628 return put_user(MCE_LOG_LEN, p); 1629 case MCE_GETCLEAR_FLAGS: { 1630 unsigned flags; 1631 1632 do { 1633 flags = mcelog.flags; 1634 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1635 1636 return put_user(flags, p); 1637 } 1638 default: 1639 return -ENOTTY; 1640 } 1641} 1642 1643/* Modified in mce-inject.c, so not static or const */ 1644struct file_operations mce_chrdev_ops = { 1645 .open = mce_chrdev_open, 1646 .release = mce_chrdev_release, 1647 .read = mce_chrdev_read, 1648 .poll = mce_chrdev_poll, 1649 .unlocked_ioctl = mce_chrdev_ioctl, 1650 .llseek = no_llseek, 1651}; 1652EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1653 1654static struct miscdevice mce_chrdev_device = { 1655 MISC_MCELOG_MINOR, 1656 "mcelog", 1657 &mce_chrdev_ops, 1658}; 1659 1660/* 1661 * mce=off Disables machine check 1662 * mce=no_cmci Disables CMCI 1663 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1664 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1665 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1666 * monarchtimeout is how long to wait for other CPUs on machine 1667 * check, or 0 to not wait 1668 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1669 * mce=nobootlog Don't log MCEs from before booting. 1670 */ 1671static int __init mcheck_enable(char *str) 1672{ 1673 if (*str == 0) { 1674 enable_p5_mce(); 1675 return 1; 1676 } 1677 if (*str == '=') 1678 str++; 1679 if (!strcmp(str, "off")) 1680 mce_disabled = 1; 1681 else if (!strcmp(str, "no_cmci")) 1682 mce_cmci_disabled = 1; 1683 else if (!strcmp(str, "dont_log_ce")) 1684 mce_dont_log_ce = 1; 1685 else if (!strcmp(str, "ignore_ce")) 1686 mce_ignore_ce = 1; 1687 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1688 mce_bootlog = (str[0] == 'b'); 1689 else if (isdigit(str[0])) { 1690 get_option(&str, &tolerant); 1691 if (*str == ',') { 1692 ++str; 1693 get_option(&str, &monarch_timeout); 1694 } 1695 } else { 1696 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1697 str); 1698 return 0; 1699 } 1700 return 1; 1701} 1702__setup("mce", mcheck_enable); 1703 1704int __init mcheck_init(void) 1705{ 1706 mcheck_intel_therm_init(); 1707 1708 return 0; 1709} 1710 1711/* 1712 * mce_syscore: PM support 1713 */ 1714 1715/* 1716 * Disable machine checks on suspend and shutdown. We can't really handle 1717 * them later. 
1718 */ 1719static int mce_disable_error_reporting(void) 1720{ 1721 int i; 1722 1723 for (i = 0; i < banks; i++) { 1724 struct mce_bank *b = &mce_banks[i]; 1725 1726 if (b->init) 1727 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 1728 } 1729 return 0; 1730} 1731 1732static int mce_syscore_suspend(void) 1733{ 1734 return mce_disable_error_reporting(); 1735} 1736 1737static void mce_syscore_shutdown(void) 1738{ 1739 mce_disable_error_reporting(); 1740} 1741 1742/* 1743 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1744 * Only one CPU is active at this time, the others get re-added later using 1745 * CPU hotplug: 1746 */ 1747static void mce_syscore_resume(void) 1748{ 1749 __mcheck_cpu_init_generic(); 1750 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1751} 1752 1753static struct syscore_ops mce_syscore_ops = { 1754 .suspend = mce_syscore_suspend, 1755 .shutdown = mce_syscore_shutdown, 1756 .resume = mce_syscore_resume, 1757}; 1758 1759/* 1760 * mce_sysdev: Sysfs support 1761 */ 1762 1763static void mce_cpu_restart(void *data) 1764{ 1765 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1766 return; 1767 __mcheck_cpu_init_generic(); 1768 __mcheck_cpu_init_timer(); 1769} 1770 1771/* Reinit MCEs after user configuration changes */ 1772static void mce_restart(void) 1773{ 1774 mce_timer_delete_all(); 1775 on_each_cpu(mce_cpu_restart, NULL, 1); 1776} 1777 1778/* Toggle features for corrected errors */ 1779static void mce_disable_cmci(void *data) 1780{ 1781 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1782 return; 1783 cmci_clear(); 1784} 1785 1786static void mce_enable_ce(void *all) 1787{ 1788 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1789 return; 1790 cmci_reenable(); 1791 cmci_recheck(); 1792 if (all) 1793 __mcheck_cpu_init_timer(); 1794} 1795 1796static struct sysdev_class mce_sysdev_class = { 1797 .name = "machinecheck", 1798}; 1799 1800DEFINE_PER_CPU(struct sys_device, mce_sysdev); 1801 1802__cpuinitdata 1803void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1804 1805static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) 1806{ 1807 return container_of(attr, struct mce_bank, attr); 1808} 1809 1810static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1811 char *buf) 1812{ 1813 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1814} 1815 1816static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1817 const char *buf, size_t size) 1818{ 1819 u64 new; 1820 1821 if (strict_strtoull(buf, 0, &new) < 0) 1822 return -EINVAL; 1823 1824 attr_to_bank(attr)->ctl = new; 1825 mce_restart(); 1826 1827 return size; 1828} 1829 1830static ssize_t 1831show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1832{ 1833 strcpy(buf, mce_helper); 1834 strcat(buf, "\n"); 1835 return strlen(mce_helper) + 1; 1836} 1837 1838static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1839 const char *buf, size_t siz) 1840{ 1841 char *p; 1842 1843 strncpy(mce_helper, buf, sizeof(mce_helper)); 1844 mce_helper[sizeof(mce_helper)-1] = 0; 1845 p = strchr(mce_helper, '\n'); 1846 1847 if (p) 1848 *p = 0; 1849 1850 return strlen(mce_helper) + !!p; 1851} 1852 1853static ssize_t set_ignore_ce(struct sys_device *s, 1854 struct sysdev_attribute *attr, 1855 const char *buf, size_t size) 1856{ 1857 u64 new; 1858 1859 if (strict_strtoull(buf, 0, &new) < 0) 1860 return -EINVAL; 1861 1862 if (mce_ignore_ce ^ !!new) { 1863 if (new) { 1864 /* disable ce features */ 1865 
mce_timer_delete_all(); 1866 on_each_cpu(mce_disable_cmci, NULL, 1); 1867 mce_ignore_ce = 1; 1868 } else { 1869 /* enable ce features */ 1870 mce_ignore_ce = 0; 1871 on_each_cpu(mce_enable_ce, (void *)1, 1); 1872 } 1873 } 1874 return size; 1875} 1876 1877static ssize_t set_cmci_disabled(struct sys_device *s, 1878 struct sysdev_attribute *attr, 1879 const char *buf, size_t size) 1880{ 1881 u64 new; 1882 1883 if (strict_strtoull(buf, 0, &new) < 0) 1884 return -EINVAL; 1885 1886 if (mce_cmci_disabled ^ !!new) { 1887 if (new) { 1888 /* disable cmci */ 1889 on_each_cpu(mce_disable_cmci, NULL, 1); 1890 mce_cmci_disabled = 1; 1891 } else { 1892 /* enable cmci */ 1893 mce_cmci_disabled = 0; 1894 on_each_cpu(mce_enable_ce, NULL, 1); 1895 } 1896 } 1897 return size; 1898} 1899 1900static ssize_t store_int_with_restart(struct sys_device *s, 1901 struct sysdev_attribute *attr, 1902 const char *buf, size_t size) 1903{ 1904 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1905 mce_restart(); 1906 return ret; 1907} 1908 1909static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1910static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1911static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1912static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 1913 1914static struct sysdev_ext_attribute attr_check_interval = { 1915 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1916 store_int_with_restart), 1917 &check_interval 1918}; 1919 1920static struct sysdev_ext_attribute attr_ignore_ce = { 1921 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), 1922 &mce_ignore_ce 1923}; 1924 1925static struct sysdev_ext_attribute attr_cmci_disabled = { 1926 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), 1927 &mce_cmci_disabled 1928}; 1929 1930static struct sysdev_attribute *mce_sysdev_attrs[] = { 1931 &attr_tolerant.attr, 1932 &attr_check_interval.attr, 1933 &attr_trigger, 1934 &attr_monarch_timeout.attr, 1935 &attr_dont_log_ce.attr, 1936 &attr_ignore_ce.attr, 1937 &attr_cmci_disabled.attr, 1938 NULL 1939}; 1940 1941static cpumask_var_t mce_sysdev_initialized; 1942 1943/* Per cpu sysdev init. 
All of the cpus still share the same ctrl bank: */ 1944static __cpuinit int mce_sysdev_create(unsigned int cpu) 1945{ 1946 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 1947 int err; 1948 int i, j; 1949 1950 if (!mce_available(&boot_cpu_data)) 1951 return -EIO; 1952 1953 memset(&sysdev->kobj, 0, sizeof(struct kobject)); 1954 sysdev->id = cpu; 1955 sysdev->cls = &mce_sysdev_class; 1956 1957 err = sysdev_register(sysdev); 1958 if (err) 1959 return err; 1960 1961 for (i = 0; mce_sysdev_attrs[i]; i++) { 1962 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); 1963 if (err) 1964 goto error; 1965 } 1966 for (j = 0; j < banks; j++) { 1967 err = sysdev_create_file(sysdev, &mce_banks[j].attr); 1968 if (err) 1969 goto error2; 1970 } 1971 cpumask_set_cpu(cpu, mce_sysdev_initialized); 1972 1973 return 0; 1974error2: 1975 while (--j >= 0) 1976 sysdev_remove_file(sysdev, &mce_banks[j].attr); 1977error: 1978 while (--i >= 0) 1979 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 1980 1981 sysdev_unregister(sysdev); 1982 1983 return err; 1984} 1985 1986static __cpuinit void mce_sysdev_remove(unsigned int cpu) 1987{ 1988 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 1989 int i; 1990 1991 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) 1992 return; 1993 1994 for (i = 0; mce_sysdev_attrs[i]; i++) 1995 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 1996 1997 for (i = 0; i < banks; i++) 1998 sysdev_remove_file(sysdev, &mce_banks[i].attr); 1999 2000 sysdev_unregister(sysdev); 2001 cpumask_clear_cpu(cpu, mce_sysdev_initialized); 2002} 2003 2004/* Make sure there are no machine checks on offlined CPUs. */ 2005static void __cpuinit mce_disable_cpu(void *h) 2006{ 2007 unsigned long action = *(unsigned long *)h; 2008 int i; 2009 2010 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2011 return; 2012 2013 if (!(action & CPU_TASKS_FROZEN)) 2014 cmci_clear(); 2015 for (i = 0; i < banks; i++) { 2016 struct mce_bank *b = &mce_banks[i]; 2017 2018 if (b->init) 2019 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 2020 } 2021} 2022 2023static void __cpuinit mce_reenable_cpu(void *h) 2024{ 2025 unsigned long action = *(unsigned long *)h; 2026 int i; 2027 2028 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2029 return; 2030 2031 if (!(action & CPU_TASKS_FROZEN)) 2032 cmci_reenable(); 2033 for (i = 0; i < banks; i++) { 2034 struct mce_bank *b = &mce_banks[i]; 2035 2036 if (b->init) 2037 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 2038 } 2039} 2040 2041/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ 2042static int __cpuinit 2043mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 2044{ 2045 unsigned int cpu = (unsigned long)hcpu; 2046 struct timer_list *t = &per_cpu(mce_timer, cpu); 2047 2048 switch (action) { 2049 case CPU_ONLINE: 2050 case CPU_ONLINE_FROZEN: 2051 mce_sysdev_create(cpu); 2052 if (threshold_cpu_callback) 2053 threshold_cpu_callback(action, cpu); 2054 break; 2055 case CPU_DEAD: 2056 case CPU_DEAD_FROZEN: 2057 if (threshold_cpu_callback) 2058 threshold_cpu_callback(action, cpu); 2059 mce_sysdev_remove(cpu); 2060 break; 2061 case CPU_DOWN_PREPARE: 2062 case CPU_DOWN_PREPARE_FROZEN: 2063 del_timer_sync(t); 2064 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2065 break; 2066 case CPU_DOWN_FAILED: 2067 case CPU_DOWN_FAILED_FROZEN: 2068 if (!mce_ignore_ce && check_interval) { 2069 t->expires = round_jiffies(jiffies + 2070 __get_cpu_var(mce_next_interval)); 2071 add_timer_on(t, cpu); 2072 } 2073 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2074 break; 2075 case CPU_POST_DEAD: 2076 /* intentionally ignoring frozen here */ 2077 cmci_rediscover(cpu); 2078 break; 2079 } 2080 return NOTIFY_OK; 2081} 2082 2083static struct notifier_block mce_cpu_notifier __cpuinitdata = { 2084 .notifier_call = mce_cpu_callback, 2085}; 2086 2087static __init void mce_init_banks(void) 2088{ 2089 int i; 2090 2091 for (i = 0; i < banks; i++) { 2092 struct mce_bank *b = &mce_banks[i]; 2093 struct sysdev_attribute *a = &b->attr; 2094 2095 sysfs_attr_init(&a->attr); 2096 a->attr.name = b->attrname; 2097 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2098 2099 a->attr.mode = 0644; 2100 a->show = show_bank; 2101 a->store = set_bank; 2102 } 2103} 2104 2105static __init int mcheck_init_device(void) 2106{ 2107 int err; 2108 int i = 0; 2109 2110 if (!mce_available(&boot_cpu_data)) 2111 return -EIO; 2112 2113 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); 2114 2115 mce_init_banks(); 2116 2117 err = sysdev_class_register(&mce_sysdev_class); 2118 if (err) 2119 return err; 2120 2121 for_each_online_cpu(i) { 2122 err = mce_sysdev_create(i); 2123 if (err) 2124 return err; 2125 } 2126 2127 register_syscore_ops(&mce_syscore_ops); 2128 register_hotcpu_notifier(&mce_cpu_notifier); 2129 2130 /* register character device /dev/mcelog */ 2131 misc_register(&mce_chrdev_device); 2132 2133 return err; 2134} 2135device_initcall(mcheck_init_device); 2136 2137/* 2138 * Old style boot options parsing. Only for compatibility. 
2139 */ 2140static int __init mcheck_disable(char *str) 2141{ 2142 mce_disabled = 1; 2143 return 1; 2144} 2145__setup("nomce", mcheck_disable); 2146 2147#ifdef CONFIG_DEBUG_FS 2148struct dentry *mce_get_debugfs_dir(void) 2149{ 2150 static struct dentry *dmce; 2151 2152 if (!dmce) 2153 dmce = debugfs_create_dir("mce", NULL); 2154 2155 return dmce; 2156} 2157 2158static void mce_reset(void) 2159{ 2160 cpu_missing = 0; 2161 atomic_set(&mce_fake_paniced, 0); 2162 atomic_set(&mce_executing, 0); 2163 atomic_set(&mce_callin, 0); 2164 atomic_set(&global_nwo, 0); 2165} 2166 2167static int fake_panic_get(void *data, u64 *val) 2168{ 2169 *val = fake_panic; 2170 return 0; 2171} 2172 2173static int fake_panic_set(void *data, u64 val) 2174{ 2175 mce_reset(); 2176 fake_panic = val; 2177 return 0; 2178} 2179 2180DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2181 fake_panic_set, "%llu\n"); 2182 2183static int __init mcheck_debugfs_init(void) 2184{ 2185 struct dentry *dmce, *ffake_panic; 2186 2187 dmce = mce_get_debugfs_dir(); 2188 if (!dmce) 2189 return -ENOMEM; 2190 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, 2191 &fake_panic_fops); 2192 if (!ffake_panic) 2193 return -ENOMEM; 2194 2195 return 0; 2196} 2197late_initcall(mcheck_debugfs_init); 2198#endif 2199
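/*
 * [Editor's illustrative aside -- not part of mce.c]
 * mce_chrdev_read() above only supports "full" reads: user space must ask
 * for the whole log (MCE_LOG_LEN records of sizeof(struct mce) bytes) in a
 * single read(), otherwise the kernel returns -EINVAL. A minimal consumer
 * might look like the sketch below. It assumes the MCE_GET_RECORD_LEN and
 * MCE_GET_LOG_LEN ioctl numbers are visible to user space via <asm/mce.h>
 * (the mcelog(8) tool carries its own copies of these definitions when the
 * kernel headers do not export them); error handling is kept to a minimum.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int reclen = 0, loglen = 0;
	char *buf;
	ssize_t n;

	if (fd < 0 ||
	    ioctl(fd, MCE_GET_RECORD_LEN, &reclen) < 0 ||
	    ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0)
		return 1;

	/* The buffer must cover the entire log or the read is rejected. */
	buf = malloc((size_t)reclen * loglen);
	if (!buf)
		return 1;

	n = read(fd, buf, (size_t)reclen * loglen);
	if (n > 0)
		printf("fetched %zd machine check record(s)\n", n / reclen);

	free(buf);
	close(fd);
	return 0;
}
/* [End of editor's aside] */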