mce.c revision c767a54ba0657e52e6edaa97cbe0b0a8bf1c1655
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

/* Serializes readers of /dev/mcelog (see rcu_dereference_check_mce below) */
static DEFINE_MUTEX(mce_chrdev_read_mutex);

/*
 * mcelog.next may be read either under rcu_read_lock_sched() or with
 * the chrdev read mutex held; teach lockdep about both.
 */
#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

/* Nesting counter of CPUs currently inside the exception handler */
atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
static int mce_panic_timeout __read_mostly;
static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;

struct mce_bank *mce_banks __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

/* Wait queue for processes polling /dev/mcelog */
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

/* Per-CPU record of the worst event seen in the current exception */
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

/* Fake record written by the error-injection code; consumed by mce_rdmsrl() */
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

/*
 * Append one record to the lockless mcelog buffer.  Runs in machine
 * check (NMI-like) context, so a free slot is claimed with cmpxchg()
 * on mcelog.next rather than with a lock; the finished flag plus the
 * wmb()s order the record contents against its visibility to readers.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry from the top if another CPU won */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Mark the slot valid only after its payload is globally visible */
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

/*
 * Replay records already accumulated in mcelog (e.g. those logged
 * during boot, before any decoder was registered) to the decoder
 * notifier chain, then reset the buffer.  Called from
 * mce_register_decode_chain() so a late-registering decoder still
 * sees early errors.
 */
static void drain_mcelog_buffer(void)
{
	unsigned int next, i, prev = 0;

	next = ACCESS_ONCE(mcelog.next);

	do {
		struct mce *m;

		/* drain what was logged during boot */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			unsigned retries = 1;

			m = &mcelog.entry[i];

			/*
			 * A writer may still be filling this slot; spin
			 * briefly for finished, then give up on it.
			 */
			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2*retries))
					retries++;

				cpu_relax();

				if (!m->finished && retries >= 4) {
					pr_err("skipping error being logged currently!\n");
					break;
				}
			}
			smp_rmb();
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
		}

		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
		prev = next;
		/* Only reset next to 0 if no new records arrived meanwhile */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
}

/* Register an MCE decoder and immediately feed it the backlog */
void mce_register_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
	drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

/*
 * Dump one MCE record to the console in the format expected by
 * external tools (mcelog), then give the decoder chain a chance to
 * print a human-readable version.
 */
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

/* Non-zero once any CPU has committed to (real) machine check panic */
static atomic_t mce_paniced;

/* Fake panic mode for testing (set via debugfs elsewhere in this file) */
static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress.
   Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	/* Busy-wait so the panicking CPU can IPI us; then panic ourselves */
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

/*
 * Panic with machine check context: print all still-unlogged mcelog
 * records (corrected first, the worst/"final" record last), push them
 * to APEI persistent storage, then panic.  In fake-panic mode only the
 * printing happens.
 *
 * @msg:   panic reason string
 * @final: the record judged most severe, printed last (may be NULL)
 * @exp:   optional human-readable explanation from mce_severity()
 */
static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		/* Skip the final record here; it is printed below */
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

/*
 * Map an MSR number onto the offset of the corresponding field in the
 * per-CPU injectm record, so injected "MSR" reads/writes hit the fake
 * record instead of hardware.  Returns -1 for MSRs we do not emulate.
 */
static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	/* Injection in progress on this CPU: read from the fake record */
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	/* Injection in progress on this CPU: write to the fake record */
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (rip_msr)
			m->ip = mce_rdmsrl(rip_msr);
	}
}

/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;	/* read index (consumer) */
	unsigned short end;	/* write index (producer) */
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

/* Dequeue one PFN from this CPU's ring; returns 1 on success, 0 if empty */
static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;	/* ring full: caller drops the entry */
	r->ring[r->end] = pfn;
	wmb();	/* publish the PFN before advancing the write index */
	r->end = next;
	return 0;
}

/* Does this CPU support machine checks at all (and are they enabled)? */
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/* Kick the process-context work that consumes the PFN ring, if needed */
static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

/*
 * Report a logged event to userspace.  If the interrupted context had
 * interrupts enabled (or was vm86) we can notify directly; otherwise
 * defer through irq_work so we never schedule from a hostile context.
 */
static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused.
 * In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < banks; i++) {
		/* Skip disabled banks and banks not selected by the caller */
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 * As a side effect, banks with a valid status are marked in *validp.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
{
	int i, ret = 0;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL)
			__set_bit(i, validp);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			ret = 1;
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 * Decrements *t by SPINUNIT per call; returns 1 once the budget is
 * exhausted (and either panics or flags the missing CPU, depending on
 * tolerant), 0 while it is still worth spinning.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in a unrecoverable case
 * and also makes sure always all CPU's errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs) In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in a unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

/* Number of CPUs that saw a "no way out" condition in this event */
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;	/* rendezvous disabled */

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 * Returns 0 on success, -1 on timeout/disabled rendezvous (callers
 * then fall back to their local no_way_out state).
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

/* Clear the status MSR of every bank flagged in toclear */
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process()
 */
#define MCE_INFO_MAX	16

/*
 * NOTE(review): this array has no 'static' qualifier, so it has
 * external linkage -- verify no other translation unit relies on that
 * before narrowing it.
 */
struct mce_info {
	atomic_t		inuse;		/* slot claimed (cmpxchg 0 -> 1) */
	struct task_struct	*t;		/* task that hit the error */
	__u64			paddr;		/* faulting physical address */
	int			restartable;	/* RIPV was set: task may continue */
} mce_info[MCE_INFO_MAX];

/* Claim a free slot and stash the fault info; called from MCE context */
static void mce_save_info(__u64 addr, int c)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			mi->restartable = c;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

/* Find the slot saved for the current task, or NULL */
static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	this_cpu_inc(mce_exception_count);

	if (!banks)
		goto out;

	mce_gather_info(&m, regs);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks);

	barrier();

	/*
	 * When no restart IP might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicing.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		/* Track the worst event seen on this CPU */
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			/* schedule action before return to userland */
			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
			set_thread_flag(TIF_MCE_NOTIFY);
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
/* Fallback stub when hwpoison support is not built in: log and ignore */
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Called in process context that interrupted by MCE and marked with
 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
 * This code is allowed to sleep.
 * Attempt possible recovery such as calling the high level VM handler to
 * process any corrupted pages, and kill/signal current process if required.
 * Action required errors are handled here.
1188 */ 1189void mce_notify_process(void) 1190{ 1191 unsigned long pfn; 1192 struct mce_info *mi = mce_find_info(); 1193 1194 if (!mi) 1195 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1196 pfn = mi->paddr >> PAGE_SHIFT; 1197 1198 clear_thread_flag(TIF_MCE_NOTIFY); 1199 1200 pr_err("Uncorrected hardware memory error in user-access at %llx", 1201 mi->paddr); 1202 /* 1203 * We must call memory_failure() here even if the current process is 1204 * doomed. We still need to mark the page as poisoned and alert any 1205 * other users of the page. 1206 */ 1207 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || 1208 mi->restartable == 0) { 1209 pr_err("Memory error not recovered"); 1210 force_sig(SIGBUS, current); 1211 } 1212 mce_clear_info(mi); 1213} 1214 1215/* 1216 * Action optional processing happens here (picking up 1217 * from the list of faulting pages that do_machine_check() 1218 * placed into the "ring"). 1219 */ 1220static void mce_process_work(struct work_struct *dummy) 1221{ 1222 unsigned long pfn; 1223 1224 while (mce_ring_get(&pfn)) 1225 memory_failure(pfn, MCE_VECTOR, 0); 1226} 1227 1228#ifdef CONFIG_X86_MCE_INTEL 1229/*** 1230 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1231 * @cpu: The CPU on which the event occurred. 1232 * @status: Event status information 1233 * 1234 * This function should be called by the thermal interrupt after the 1235 * event has been processed and the decision was made to log the event 1236 * further. 1237 * 1238 * The status parameter will be saved to the 'status' field of 'struct mce' 1239 * and historically has been the register value of the 1240 * MSR_IA32_THERMAL_STATUS (Intel) msr. 
1241 */ 1242void mce_log_therm_throt_event(__u64 status) 1243{ 1244 struct mce m; 1245 1246 mce_setup(&m); 1247 m.bank = MCE_THERMAL_BANK; 1248 m.status = status; 1249 mce_log(&m); 1250} 1251#endif /* CONFIG_X86_MCE_INTEL */ 1252 1253/* 1254 * Periodic polling timer for "silent" machine check errors. If the 1255 * poller finds an MCE, poll 2x faster. When the poller finds no more 1256 * errors, poll 2x slower (up to check_interval seconds). 1257 */ 1258static unsigned long check_interval = 5 * 60; /* 5 minutes */ 1259 1260static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1261static DEFINE_PER_CPU(struct timer_list, mce_timer); 1262 1263static void mce_timer_fn(unsigned long data) 1264{ 1265 struct timer_list *t = &__get_cpu_var(mce_timer); 1266 unsigned long iv; 1267 1268 WARN_ON(smp_processor_id() != data); 1269 1270 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1271 machine_check_poll(MCP_TIMESTAMP, 1272 &__get_cpu_var(mce_poll_banks)); 1273 } 1274 1275 /* 1276 * Alert userspace if needed. If we logged an MCE, reduce the 1277 * polling interval, otherwise increase the polling interval. 1278 */ 1279 iv = __this_cpu_read(mce_next_interval); 1280 if (mce_notify_irq()) 1281 iv = max(iv, (unsigned long) HZ/100); 1282 else 1283 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1284 __this_cpu_write(mce_next_interval, iv); 1285 1286 t->expires = jiffies + iv; 1287 add_timer_on(t, smp_processor_id()); 1288} 1289 1290/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1291static void mce_timer_delete_all(void) 1292{ 1293 int cpu; 1294 1295 for_each_online_cpu(cpu) 1296 del_timer_sync(&per_cpu(mce_timer, cpu)); 1297} 1298 1299static void mce_do_trigger(struct work_struct *work) 1300{ 1301 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1302} 1303 1304static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1305 1306/* 1307 * Notify the user(s) about new machine check events. 
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		/* wake processes polling /dev/mcelog */
		wake_up_interruptible(&mce_chrdev_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

/* Allocate the bank control array; every bank starts fully enabled. */
static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		/* -1ULL = enable all error types in MCi_CTL */
		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 * Reads MCG_CAP to discover the bank count and optional features.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		pr_info("CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		pr_warn("Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. 
*/
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Software error recovery supported? */
	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

/* Enable machine checking on this CPU and flush out stale events. */
static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	/* Program each configured bank and clear its status. */
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("unknown CPU type - not enabling MCE support\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;

		/*
		 * Turn off MC4_MISC thresholding banks on those models since
		 * they're not supported there.
		 */
		if (c->x86 == 0x15 &&
		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
			int i;
			u64 val, hwcr;
			bool need_toggle;
			u32 msrs[] = {
				0x00000413, /* MC4_MISC0 */
				0xc0000408, /* MC4_MISC1 */
			};

			rdmsrl(MSR_K7_HWCR, hwcr);

			/* McStatusWrEn has to be set */
			need_toggle = !(hwcr & BIT(18));

			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

			for (i = 0; i < ARRAY_SIZE(msrs); i++) {
				rdmsrl(msrs[i], val);

				/* CntP bit set? */
				if (val & BIT_64(62)) {
					/* Clear CntP to disable thresholding */
					val &= ~BIT_64(62);
					wrmsrl(msrs[i], val);
				}
			}

			/* restore old settings */
			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr);
		}
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
1507 */ 1508 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1509 monarch_timeout < 0) 1510 monarch_timeout = USEC_PER_SEC; 1511 1512 /* 1513 * There are also broken BIOSes on some Pentium M and 1514 * earlier systems: 1515 */ 1516 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1517 mce_bootlog = 0; 1518 } 1519 if (monarch_timeout < 0) 1520 monarch_timeout = 0; 1521 if (mce_bootlog != 0) 1522 mce_panic_timeout = 30; 1523 1524 return 0; 1525} 1526 1527static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1528{ 1529 if (c->x86 != 5) 1530 return 0; 1531 1532 switch (c->x86_vendor) { 1533 case X86_VENDOR_INTEL: 1534 intel_p5_mcheck_init(c); 1535 return 1; 1536 break; 1537 case X86_VENDOR_CENTAUR: 1538 winchip_mcheck_init(c); 1539 return 1; 1540 break; 1541 } 1542 1543 return 0; 1544} 1545 1546static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1547{ 1548 switch (c->x86_vendor) { 1549 case X86_VENDOR_INTEL: 1550 mce_intel_feature_init(c); 1551 break; 1552 case X86_VENDOR_AMD: 1553 mce_amd_feature_init(c); 1554 break; 1555 default: 1556 break; 1557 } 1558} 1559 1560static void __mcheck_cpu_init_timer(void) 1561{ 1562 struct timer_list *t = &__get_cpu_var(mce_timer); 1563 unsigned long iv = __this_cpu_read(mce_next_interval); 1564 1565 setup_timer(t, mce_timer_fn, smp_processor_id()); 1566 1567 if (mce_ignore_ce) 1568 return; 1569 1570 __this_cpu_write(mce_next_interval, iv); 1571 if (!iv) 1572 return; 1573 t->expires = round_jiffies(jiffies + iv); 1574 add_timer_on(t, smp_processor_id()); 1575} 1576 1577/* Handle unconfigured int18 (should never happen) */ 1578static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1579{ 1580 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", 1581 smp_processor_id()); 1582} 1583 1584/* Call the installed machine check handler for this CPU setup. 
*/
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	/* Old non-MCA CPUs are handled entirely by the legacy code. */
	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? 
*/

static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	/* O_EXCL opens conflict with any other open, and vice versa. */
	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

/* IPI callback: store this CPU's TSC into its slot of the passed array. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	/* Advance the caller's user buffer past the copied record. */
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	/* Drain any records persisted via APEI ERST first. */
	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			/*
			 * Writers publish an entry by setting ->finished;
			 * wait briefly for in-flight writers, then give up
			 * and zap the half-written entry.
			 */
			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the index; loop again if new entries raced in. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		/* Only take entries older than the per-cpu TSC snapshot. */
		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* Atomically fetch and clear the flags word. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

/* Let another module (mce-inject) provide the /dev/mcelog write handler. */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
			 size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct 
file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	/* Bare "mce" historically enables the P5 handler. */
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. 
We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	/* Clear every configured bank's MCi_CTL to stop error reporting. */
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

/* Per-cpu callback: reprogram banks and re-arm the poll timer. */
static void mce_cpu_restart(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

/* Re-enable CMCI; non-NULL @all also restarts the poll timer. */
static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

DEFINE_PER_CPU(struct device *, 
mce_device);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Map a sysfs bank attribute back to its containing mce_bank. */
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	/* Apply the new bank control mask on all CPUs. */
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	/* Strip a trailing newline from the sysfs write. */
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual 0<->1 transition. */
	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual 0<->1 transition. */
	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

/* Store an integer attribute, then reprogram MCE on all CPUs. */
static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. 
All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof *dev, GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err)
		return err;

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
	/* Unwind the files created so far, then drop the device. */
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static __cpuinit void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. 
*/
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	/* Keep CMCI state across suspend/resume (TASKS_FROZEN). */
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* Offlining failed: re-arm the poll timer we stopped above. */
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					per_cpu(mce_next_interval, cpu));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

/* Fill in the sysfs attribute for each bank ("bank0", "bank1", ...). */
static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name = b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		return err;

	/*
	 * NOTE(review): the error paths below leave earlier per-cpu
	 * devices and the registered subsystem in place - acceptable
	 * for an initcall, but worth confirming against later kernels.
	 */
	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);

	/* register character device /dev/mcelog */
	misc_register(&mce_chrdev_device);

	return err;
}
device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility. 
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
/* Lazily create and cache the "mce" debugfs directory. */
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

/* Reset the monarch/rendezvous state after a simulated panic. */
static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif