mce.c revision 55babd8f41f122f5f4c7cebf520c766c983282c6
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
				    rcu_read_lock_sched_held() || \
				    lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
static int mce_panic_timeout __read_mostly;
static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;

struct mce_bank *mce_banks __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
109 */ 110ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 111 112/* Do initial initialization of a struct mce */ 113void mce_setup(struct mce *m) 114{ 115 memset(m, 0, sizeof(struct mce)); 116 m->cpu = m->extcpu = smp_processor_id(); 117 rdtscll(m->tsc); 118 /* We hope get_seconds stays lockless */ 119 m->time = get_seconds(); 120 m->cpuvendor = boot_cpu_data.x86_vendor; 121 m->cpuid = cpuid_eax(1); 122 m->socketid = cpu_data(m->extcpu).phys_proc_id; 123 m->apicid = cpu_data(m->extcpu).initial_apicid; 124 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 125} 126 127DEFINE_PER_CPU(struct mce, injectm); 128EXPORT_PER_CPU_SYMBOL_GPL(injectm); 129 130/* 131 * Lockless MCE logging infrastructure. 132 * This avoids deadlocks on printk locks without having to break locks. Also 133 * separate MCEs from kernel messages to avoid bogus bug reports. 134 */ 135 136static struct mce_log mcelog = { 137 .signature = MCE_LOG_SIGNATURE, 138 .len = MCE_LOG_LEN, 139 .recordlen = sizeof(struct mce), 140}; 141 142void mce_log(struct mce *mce) 143{ 144 unsigned next, entry; 145 int ret = 0; 146 147 /* Emit the trace record: */ 148 trace_mce_record(mce); 149 150 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); 151 if (ret == NOTIFY_STOP) 152 return; 153 154 mce->finished = 0; 155 wmb(); 156 for (;;) { 157 entry = rcu_dereference_check_mce(mcelog.next); 158 for (;;) { 159 160 /* 161 * When the buffer fills up discard new entries. 162 * Assume that the earlier errors are the more 163 * interesting ones: 164 */ 165 if (entry >= MCE_LOG_LEN) { 166 set_bit(MCE_OVERFLOW, 167 (unsigned long *)&mcelog.flags); 168 return; 169 } 170 /* Old left over entry. Skip: */ 171 if (mcelog.entry[entry].finished) { 172 entry++; 173 continue; 174 } 175 break; 176 } 177 smp_rmb(); 178 next = entry + 1; 179 if (cmpxchg(&mcelog.next, entry, next) == entry) 180 break; 181 } 182 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 183 wmb(); 184 mcelog.entry[entry].finished = 1; 185 wmb(); 186 187 mce->finished = 1; 188 set_bit(0, &mce_need_notify); 189} 190 191static void drain_mcelog_buffer(void) 192{ 193 unsigned int next, i, prev = 0; 194 195 next = ACCESS_ONCE(mcelog.next); 196 197 do { 198 struct mce *m; 199 200 /* drain what was logged during boot */ 201 for (i = prev; i < next; i++) { 202 unsigned long start = jiffies; 203 unsigned retries = 1; 204 205 m = &mcelog.entry[i]; 206 207 while (!m->finished) { 208 if (time_after_eq(jiffies, start + 2*retries)) 209 retries++; 210 211 cpu_relax(); 212 213 if (!m->finished && retries >= 4) { 214 pr_err("skipping error being logged currently!\n"); 215 break; 216 } 217 } 218 smp_rmb(); 219 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 220 } 221 222 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); 223 prev = next; 224 next = cmpxchg(&mcelog.next, prev, 0); 225 } while (next != prev); 226} 227 228 229void mce_register_decode_chain(struct notifier_block *nb) 230{ 231 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); 232 drain_mcelog_buffer(); 233} 234EXPORT_SYMBOL_GPL(mce_register_decode_chain); 235 236void mce_unregister_decode_chain(struct notifier_block *nb) 237{ 238 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); 239} 240EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); 241 242static void print_mce(struct mce *m) 243{ 244 int ret = 0; 245 246 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 247 m->extcpu, m->mcgstatus, m->bank, m->status); 248 249 if (m->ip) { 250 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 251 
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 252 m->cs, m->ip); 253 254 if (m->cs == __KERNEL_CS) 255 print_symbol("{%s}", m->ip); 256 pr_cont("\n"); 257 } 258 259 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 260 if (m->addr) 261 pr_cont("ADDR %llx ", m->addr); 262 if (m->misc) 263 pr_cont("MISC %llx ", m->misc); 264 265 pr_cont("\n"); 266 /* 267 * Note this output is parsed by external tools and old fields 268 * should not be changed. 269 */ 270 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 271 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 272 cpu_data(m->extcpu).microcode); 273 274 /* 275 * Print out human-readable details about the MCE error, 276 * (if the CPU has an implementation for that) 277 */ 278 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 279 if (ret == NOTIFY_STOP) 280 return; 281 282 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 283} 284 285#define PANIC_TIMEOUT 5 /* 5 seconds */ 286 287static atomic_t mce_paniced; 288 289static int fake_panic; 290static atomic_t mce_fake_paniced; 291 292/* Panic in progress. Enable interrupts and wait for final IPI */ 293static void wait_for_panic(void) 294{ 295 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 296 297 preempt_disable(); 298 local_irq_enable(); 299 while (timeout-- > 0) 300 udelay(1); 301 if (panic_timeout == 0) 302 panic_timeout = mce_panic_timeout; 303 panic("Panicing machine check CPU died"); 304} 305 306static void mce_panic(char *msg, struct mce *final, char *exp) 307{ 308 int i, apei_err = 0; 309 310 if (!fake_panic) { 311 /* 312 * Make sure only one CPU runs in machine check panic 313 */ 314 if (atomic_inc_return(&mce_paniced) > 1) 315 wait_for_panic(); 316 barrier(); 317 318 bust_spinlocks(1); 319 console_verbose(); 320 } else { 321 /* Don't log too much for fake panic */ 322 if (atomic_inc_return(&mce_fake_paniced) > 1) 323 return; 324 } 325 /* First print corrected ones that are still unlogged */ 326 for (i = 0; i < MCE_LOG_LEN; i++) { 327 struct mce *m = &mcelog.entry[i]; 328 if (!(m->status & MCI_STATUS_VAL)) 329 continue; 330 if (!(m->status & MCI_STATUS_UC)) { 331 print_mce(m); 332 if (!apei_err) 333 apei_err = apei_write_mce(m); 334 } 335 } 336 /* Now print uncorrected but with the final one last */ 337 for (i = 0; i < MCE_LOG_LEN; i++) { 338 struct mce *m = &mcelog.entry[i]; 339 if (!(m->status & MCI_STATUS_VAL)) 340 continue; 341 if (!(m->status & MCI_STATUS_UC)) 342 continue; 343 if (!final || memcmp(m, final, sizeof(struct mce))) { 344 print_mce(m); 345 if (!apei_err) 346 apei_err = apei_write_mce(m); 347 } 348 } 349 if (final) { 350 print_mce(final); 351 if (!apei_err) 352 apei_err = apei_write_mce(final); 353 } 354 if (cpu_missing) 355 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 356 if (exp) 357 pr_emerg(HW_ERR "Machine check: %s\n", exp); 358 if (!fake_panic) { 359 if (panic_timeout == 0) 360 panic_timeout = mce_panic_timeout; 361 panic(msg); 362 } else 363 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 364} 365 366/* Support code for software error injection */ 367 368static int msr_to_offset(u32 msr) 369{ 370 unsigned bank = __this_cpu_read(injectm.bank); 371 372 if (msr == rip_msr) 373 return offsetof(struct mce, ip); 374 if (msr == MSR_IA32_MCx_STATUS(bank)) 375 return offsetof(struct mce, status); 376 if (msr == MSR_IA32_MCx_ADDR(bank)) 377 return offsetof(struct mce, addr); 378 if (msr == MSR_IA32_MCx_MISC(bank)) 379 return offsetof(struct mce, misc); 380 if (msr == MSR_IA32_MCG_STATUS) 381 
return offsetof(struct mce, mcgstatus); 382 return -1; 383} 384 385/* MSR access wrappers used for error injection */ 386static u64 mce_rdmsrl(u32 msr) 387{ 388 u64 v; 389 390 if (__this_cpu_read(injectm.finished)) { 391 int offset = msr_to_offset(msr); 392 393 if (offset < 0) 394 return 0; 395 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 396 } 397 398 if (rdmsrl_safe(msr, &v)) { 399 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 400 /* 401 * Return zero in case the access faulted. This should 402 * not happen normally but can happen if the CPU does 403 * something weird, or if the code is buggy. 404 */ 405 v = 0; 406 } 407 408 return v; 409} 410 411static void mce_wrmsrl(u32 msr, u64 v) 412{ 413 if (__this_cpu_read(injectm.finished)) { 414 int offset = msr_to_offset(msr); 415 416 if (offset >= 0) 417 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 418 return; 419 } 420 wrmsrl(msr, v); 421} 422 423/* 424 * Collect all global (w.r.t. this processor) status about this machine 425 * check into our "mce" struct so that we can use it later to assess 426 * the severity of the problem as we read per-bank specific details. 427 */ 428static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 429{ 430 mce_setup(m); 431 432 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 433 if (regs) { 434 /* 435 * Get the address of the instruction at the time of 436 * the machine check error. 437 */ 438 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 439 m->ip = regs->ip; 440 m->cs = regs->cs; 441 442 /* 443 * When in VM86 mode make the cs look like ring 3 444 * always. This is a lie, but it's better than passing 445 * the additional vm86 bit around everywhere. 446 */ 447 if (v8086_mode(regs)) 448 m->cs |= 3; 449 } 450 /* Use accurate RIP reporting if available. */ 451 if (rip_msr) 452 m->ip = mce_rdmsrl(rip_msr); 453 } 454} 455 456/* 457 * Simple lockless ring to communicate PFNs from the exception handler with the 458 * process context work function. This is vastly simplified because there's 459 * only a single reader and a single writer. 
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
{
	int i, ret = 0;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL)
			__set_bit(i, validp);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			ret = 1;
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also we must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
836 */ 837 while (atomic_read(&mce_executing) < order) { 838 if (mce_timed_out(&timeout)) { 839 atomic_set(&global_nwo, 0); 840 return -1; 841 } 842 ndelay(SPINUNIT); 843 } 844 } 845 846 /* 847 * Cache the global no_way_out state. 848 */ 849 *no_way_out = atomic_read(&global_nwo); 850 851 return order; 852} 853 854/* 855 * Synchronize between CPUs after main scanning loop. 856 * This invokes the bulk of the Monarch processing. 857 */ 858static int mce_end(int order) 859{ 860 int ret = -1; 861 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 862 863 if (!timeout) 864 goto reset; 865 if (order < 0) 866 goto reset; 867 868 /* 869 * Allow others to run. 870 */ 871 atomic_inc(&mce_executing); 872 873 if (order == 1) { 874 /* CHECKME: Can this race with a parallel hotplug? */ 875 int cpus = num_online_cpus(); 876 877 /* 878 * Monarch: Wait for everyone to go through their scanning 879 * loops. 880 */ 881 while (atomic_read(&mce_executing) <= cpus) { 882 if (mce_timed_out(&timeout)) 883 goto reset; 884 ndelay(SPINUNIT); 885 } 886 887 mce_reign(); 888 barrier(); 889 ret = 0; 890 } else { 891 /* 892 * Subject: Wait for Monarch to finish. 893 */ 894 while (atomic_read(&mce_executing) != 0) { 895 if (mce_timed_out(&timeout)) 896 goto reset; 897 ndelay(SPINUNIT); 898 } 899 900 /* 901 * Don't reset anything. That's done by the Monarch. 902 */ 903 return 0; 904 } 905 906 /* 907 * Reset all global state. 908 */ 909reset: 910 atomic_set(&global_nwo, 0); 911 atomic_set(&mce_callin, 0); 912 barrier(); 913 914 /* 915 * Let others run again. 916 */ 917 atomic_set(&mce_executing, 0); 918 return ret; 919} 920 921/* 922 * Check if the address reported by the CPU is in a format we can parse. 923 * It would be possible to add code for most other cases, but all would 924 * be somewhat complicated (e.g. segment offset would require an instruction 925 * parser). So only support physical addresses up to page granuality for now. 926 */ 927static int mce_usable_address(struct mce *m) 928{ 929 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 930 return 0; 931 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) 932 return 0; 933 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) 934 return 0; 935 return 1; 936} 937 938static void mce_clear_state(unsigned long *toclear) 939{ 940 int i; 941 942 for (i = 0; i < banks; i++) { 943 if (test_bit(i, toclear)) 944 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 945 } 946} 947 948/* 949 * Need to save faulting physical address associated with a process 950 * in the machine check handler some place where we can grab it back 951 * later in mce_notify_process() 952 */ 953#define MCE_INFO_MAX 16 954 955struct mce_info { 956 atomic_t inuse; 957 struct task_struct *t; 958 __u64 paddr; 959 int restartable; 960} mce_info[MCE_INFO_MAX]; 961 962static void mce_save_info(__u64 addr, int c) 963{ 964 struct mce_info *mi; 965 966 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { 967 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { 968 mi->t = current; 969 mi->paddr = addr; 970 mi->restartable = c; 971 return; 972 } 973 } 974 975 mce_panic("Too many concurrent recoverable errors", NULL, NULL); 976} 977 978static struct mce_info *mce_find_info(void) 979{ 980 struct mce_info *mi; 981 982 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) 983 if (atomic_read(&mi->inuse) && mi->t == current) 984 return mi; 985 return NULL; 986} 987 988static void mce_clear_info(struct mce_info *mi) 989{ 990 atomic_set(&mi->inuse, 0); 991} 992 993/* 994 * The actual machine check handler. 
This only handles real 995 * exceptions when something got corrupted coming in through int 18. 996 * 997 * This is executed in NMI context not subject to normal locking rules. This 998 * implies that most kernel services cannot be safely used. Don't even 999 * think about putting a printk in there! 1000 * 1001 * On Intel systems this is entered on all CPUs in parallel through 1002 * MCE broadcast. However some CPUs might be broken beyond repair, 1003 * so be always careful when synchronizing with others. 1004 */ 1005void do_machine_check(struct pt_regs *regs, long error_code) 1006{ 1007 struct mce m, *final; 1008 int i; 1009 int worst = 0; 1010 int severity; 1011 /* 1012 * Establish sequential order between the CPUs entering the machine 1013 * check handler. 1014 */ 1015 int order; 1016 /* 1017 * If no_way_out gets set, there is no safe way to recover from this 1018 * MCE. If tolerant is cranked up, we'll try anyway. 1019 */ 1020 int no_way_out = 0; 1021 /* 1022 * If kill_it gets set, there might be a way to recover from this 1023 * error. 1024 */ 1025 int kill_it = 0; 1026 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1027 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); 1028 char *msg = "Unknown"; 1029 1030 atomic_inc(&mce_entry); 1031 1032 this_cpu_inc(mce_exception_count); 1033 1034 if (!banks) 1035 goto out; 1036 1037 mce_gather_info(&m, regs); 1038 1039 final = &__get_cpu_var(mces_seen); 1040 *final = m; 1041 1042 memset(valid_banks, 0, sizeof(valid_banks)); 1043 no_way_out = mce_no_way_out(&m, &msg, valid_banks); 1044 1045 barrier(); 1046 1047 /* 1048 * When no restart IP might need to kill or panic. 1049 * Assume the worst for now, but if we find the 1050 * severity is MCE_AR_SEVERITY we have other options. 1051 */ 1052 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 1053 kill_it = 1; 1054 1055 /* 1056 * Go through all the banks in exclusion of the other CPUs. 1057 * This way we don't report duplicated events on shared banks 1058 * because the first one to see it will clear it. 1059 */ 1060 order = mce_start(&no_way_out); 1061 for (i = 0; i < banks; i++) { 1062 __clear_bit(i, toclear); 1063 if (!test_bit(i, valid_banks)) 1064 continue; 1065 if (!mce_banks[i].ctl) 1066 continue; 1067 1068 m.misc = 0; 1069 m.addr = 0; 1070 m.bank = i; 1071 1072 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 1073 if ((m.status & MCI_STATUS_VAL) == 0) 1074 continue; 1075 1076 /* 1077 * Non uncorrected or non signaled errors are handled by 1078 * machine_check_poll. Leave them alone, unless this panics. 1079 */ 1080 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 1081 !no_way_out) 1082 continue; 1083 1084 /* 1085 * Set taint even when machine check was not enabled. 1086 */ 1087 add_taint(TAINT_MACHINE_CHECK); 1088 1089 severity = mce_severity(&m, tolerant, NULL); 1090 1091 /* 1092 * When machine check was for corrected handler don't touch, 1093 * unless we're panicing. 1094 */ 1095 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 1096 continue; 1097 __set_bit(i, toclear); 1098 if (severity == MCE_NO_SEVERITY) { 1099 /* 1100 * Machine check event was not enabled. Clear, but 1101 * ignore. 1102 */ 1103 continue; 1104 } 1105 1106 mce_read_aux(&m, i); 1107 1108 /* 1109 * Action optional error. Queue address for later processing. 1110 * When the ring overflows we just ignore the AO error. 1111 * RED-PEN add some logging mechanism when 1112 * usable_address or mce_add_ring fails. 
1113 * RED-PEN don't ignore overflow for tolerant == 0 1114 */ 1115 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1116 mce_ring_add(m.addr >> PAGE_SHIFT); 1117 1118 mce_log(&m); 1119 1120 if (severity > worst) { 1121 *final = m; 1122 worst = severity; 1123 } 1124 } 1125 1126 /* mce_clear_state will clear *final, save locally for use later */ 1127 m = *final; 1128 1129 if (!no_way_out) 1130 mce_clear_state(toclear); 1131 1132 /* 1133 * Do most of the synchronization with other CPUs. 1134 * When there's any problem use only local no_way_out state. 1135 */ 1136 if (mce_end(order) < 0) 1137 no_way_out = worst >= MCE_PANIC_SEVERITY; 1138 1139 /* 1140 * At insane "tolerant" levels we take no action. Otherwise 1141 * we only die if we have no other choice. For less serious 1142 * issues we try to recover, or limit damage to the current 1143 * process. 1144 */ 1145 if (tolerant < 3) { 1146 if (no_way_out) 1147 mce_panic("Fatal machine check on current CPU", &m, msg); 1148 if (worst == MCE_AR_SEVERITY) { 1149 /* schedule action before return to userland */ 1150 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); 1151 set_thread_flag(TIF_MCE_NOTIFY); 1152 } else if (kill_it) { 1153 force_sig(SIGBUS, current); 1154 } 1155 } 1156 1157 if (worst > 0) 1158 mce_report_event(regs); 1159 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1160out: 1161 atomic_dec(&mce_entry); 1162 sync_core(); 1163} 1164EXPORT_SYMBOL_GPL(do_machine_check); 1165 1166#ifndef CONFIG_MEMORY_FAILURE 1167int memory_failure(unsigned long pfn, int vector, int flags) 1168{ 1169 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1170 BUG_ON(flags & MF_ACTION_REQUIRED); 1171 pr_err("Uncorrected memory error in page 0x%lx ignored\n" 1172 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", 1173 pfn); 1174 1175 return 0; 1176} 1177#endif 1178 1179/* 1180 * Called in process context that interrupted by MCE and marked with 1181 * TIF_MCE_NOTIFY, just before returning to erroneous userland. 1182 * This code is allowed to sleep. 1183 * Attempt possible recovery such as calling the high level VM handler to 1184 * process any corrupted pages, and kill/signal current process if required. 1185 * Action required errors are handled here. 1186 */ 1187void mce_notify_process(void) 1188{ 1189 unsigned long pfn; 1190 struct mce_info *mi = mce_find_info(); 1191 int flags = MF_ACTION_REQUIRED; 1192 1193 if (!mi) 1194 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1195 pfn = mi->paddr >> PAGE_SHIFT; 1196 1197 clear_thread_flag(TIF_MCE_NOTIFY); 1198 1199 pr_err("Uncorrected hardware memory error in user-access at %llx", 1200 mi->paddr); 1201 /* 1202 * We must call memory_failure() here even if the current process is 1203 * doomed. We still need to mark the page as poisoned and alert any 1204 * other users of the page. 1205 */ 1206 if (!mi->restartable) 1207 flags |= MF_MUST_KILL; 1208 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { 1209 pr_err("Memory error not recovered"); 1210 force_sig(SIGBUS, current); 1211 } 1212 mce_clear_info(mi); 1213} 1214 1215/* 1216 * Action optional processing happens here (picking up 1217 * from the list of faulting pages that do_machine_check() 1218 * placed into the "ring"). 
1219 */ 1220static void mce_process_work(struct work_struct *dummy) 1221{ 1222 unsigned long pfn; 1223 1224 while (mce_ring_get(&pfn)) 1225 memory_failure(pfn, MCE_VECTOR, 0); 1226} 1227 1228#ifdef CONFIG_X86_MCE_INTEL 1229/*** 1230 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1231 * @cpu: The CPU on which the event occurred. 1232 * @status: Event status information 1233 * 1234 * This function should be called by the thermal interrupt after the 1235 * event has been processed and the decision was made to log the event 1236 * further. 1237 * 1238 * The status parameter will be saved to the 'status' field of 'struct mce' 1239 * and historically has been the register value of the 1240 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1241 */ 1242void mce_log_therm_throt_event(__u64 status) 1243{ 1244 struct mce m; 1245 1246 mce_setup(&m); 1247 m.bank = MCE_THERMAL_BANK; 1248 m.status = status; 1249 mce_log(&m); 1250} 1251#endif /* CONFIG_X86_MCE_INTEL */ 1252 1253/* 1254 * Periodic polling timer for "silent" machine check errors. If the 1255 * poller finds an MCE, poll 2x faster. When the poller finds no more 1256 * errors, poll 2x slower (up to check_interval seconds). 1257 */ 1258static unsigned long check_interval = 5 * 60; /* 5 minutes */ 1259 1260static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1261static DEFINE_PER_CPU(struct timer_list, mce_timer); 1262 1263static unsigned long mce_adjust_timer_default(unsigned long interval) 1264{ 1265 return interval; 1266} 1267 1268static unsigned long (*mce_adjust_timer)(unsigned long interval) = 1269 mce_adjust_timer_default; 1270 1271static void mce_timer_fn(unsigned long data) 1272{ 1273 struct timer_list *t = &__get_cpu_var(mce_timer); 1274 unsigned long iv; 1275 1276 WARN_ON(smp_processor_id() != data); 1277 1278 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1279 machine_check_poll(MCP_TIMESTAMP, 1280 &__get_cpu_var(mce_poll_banks)); 1281 mce_intel_cmci_poll(); 1282 } 1283 1284 /* 1285 * Alert userspace if needed. If we logged an MCE, reduce the 1286 * polling interval, otherwise increase the polling interval. 1287 */ 1288 iv = __this_cpu_read(mce_next_interval); 1289 if (mce_notify_irq()) { 1290 iv = max(iv / 2, (unsigned long) HZ/100); 1291 } else { 1292 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1293 iv = mce_adjust_timer(iv); 1294 } 1295 __this_cpu_write(mce_next_interval, iv); 1296 /* Might have become 0 after CMCI storm subsided */ 1297 if (iv) { 1298 t->expires = jiffies + iv; 1299 add_timer_on(t, smp_processor_id()); 1300 } 1301} 1302 1303/* 1304 * Ensure that the timer is firing in @interval from now. 
1305 */ 1306void mce_timer_kick(unsigned long interval) 1307{ 1308 struct timer_list *t = &__get_cpu_var(mce_timer); 1309 unsigned long when = jiffies + interval; 1310 unsigned long iv = __this_cpu_read(mce_next_interval); 1311 1312 if (timer_pending(t)) { 1313 if (time_before(when, t->expires)) 1314 mod_timer_pinned(t, when); 1315 } else { 1316 t->expires = round_jiffies(when); 1317 add_timer_on(t, smp_processor_id()); 1318 } 1319 if (interval < iv) 1320 __this_cpu_write(mce_next_interval, interval); 1321} 1322 1323/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1324static void mce_timer_delete_all(void) 1325{ 1326 int cpu; 1327 1328 for_each_online_cpu(cpu) 1329 del_timer_sync(&per_cpu(mce_timer, cpu)); 1330} 1331 1332static void mce_do_trigger(struct work_struct *work) 1333{ 1334 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1335} 1336 1337static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1338 1339/* 1340 * Notify the user(s) about new machine check events. 1341 * Can be called from interrupt context, but not from machine check/NMI 1342 * context. 1343 */ 1344int mce_notify_irq(void) 1345{ 1346 /* Not more than two messages every minute */ 1347 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1348 1349 if (test_and_clear_bit(0, &mce_need_notify)) { 1350 /* wake processes polling /dev/mcelog */ 1351 wake_up_interruptible(&mce_chrdev_wait); 1352 1353 /* 1354 * There is no risk of missing notifications because 1355 * work_pending is always cleared before the function is 1356 * executed. 1357 */ 1358 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1359 schedule_work(&mce_trigger_work); 1360 1361 if (__ratelimit(&ratelimit)) 1362 pr_info(HW_ERR "Machine check events logged\n"); 1363 1364 return 1; 1365 } 1366 return 0; 1367} 1368EXPORT_SYMBOL_GPL(mce_notify_irq); 1369 1370static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1371{ 1372 int i; 1373 1374 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1375 if (!mce_banks) 1376 return -ENOMEM; 1377 for (i = 0; i < banks; i++) { 1378 struct mce_bank *b = &mce_banks[i]; 1379 1380 b->ctl = -1ULL; 1381 b->init = 1; 1382 } 1383 return 0; 1384} 1385 1386/* 1387 * Initialize Machine Checks for a CPU. 1388 */ 1389static int __cpuinit __mcheck_cpu_cap_init(void) 1390{ 1391 unsigned b; 1392 u64 cap; 1393 1394 rdmsrl(MSR_IA32_MCG_CAP, cap); 1395 1396 b = cap & MCG_BANKCNT_MASK; 1397 if (!banks) 1398 pr_info("CPU supports %d MCE banks\n", b); 1399 1400 if (b > MAX_NR_BANKS) { 1401 pr_warn("Using only %u machine check banks out of %u\n", 1402 MAX_NR_BANKS, b); 1403 b = MAX_NR_BANKS; 1404 } 1405 1406 /* Don't support asymmetric configurations today */ 1407 WARN_ON(banks != 0 && b != banks); 1408 banks = b; 1409 if (!mce_banks) { 1410 int err = __mcheck_cpu_mce_banks_init(); 1411 1412 if (err) 1413 return err; 1414 } 1415 1416 /* Use accurate RIP reporting if available. */ 1417 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1418 rip_msr = MSR_IA32_MCG_EIP; 1419 1420 if (cap & MCG_SER_P) 1421 mce_ser = 1; 1422 1423 return 0; 1424} 1425 1426static void __mcheck_cpu_init_generic(void) 1427{ 1428 mce_banks_t all_banks; 1429 u64 cap; 1430 int i; 1431 1432 /* 1433 * Log the machine checks left over from the previous reset. 1434 */ 1435 bitmap_fill(all_banks, MAX_NR_BANKS); 1436 machine_check_poll(MCP_UC|(!mce_bootlog ? 
MCP_DONTLOG : 0), &all_banks); 1437 1438 set_in_cr4(X86_CR4_MCE); 1439 1440 rdmsrl(MSR_IA32_MCG_CAP, cap); 1441 if (cap & MCG_CTL_P) 1442 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1443 1444 for (i = 0; i < banks; i++) { 1445 struct mce_bank *b = &mce_banks[i]; 1446 1447 if (!b->init) 1448 continue; 1449 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1450 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1451 } 1452} 1453 1454/* Add per CPU specific workarounds here */ 1455static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1456{ 1457 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1458 pr_info("unknown CPU type - not enabling MCE support\n"); 1459 return -EOPNOTSUPP; 1460 } 1461 1462 /* This should be disabled by the BIOS, but isn't always */ 1463 if (c->x86_vendor == X86_VENDOR_AMD) { 1464 if (c->x86 == 15 && banks > 4) { 1465 /* 1466 * disable GART TBL walk error reporting, which 1467 * trips off incorrectly with the IOMMU & 3ware 1468 * & Cerberus: 1469 */ 1470 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1471 } 1472 if (c->x86 <= 17 && mce_bootlog < 0) { 1473 /* 1474 * Lots of broken BIOS around that don't clear them 1475 * by default and leave crap in there. Don't log: 1476 */ 1477 mce_bootlog = 0; 1478 } 1479 /* 1480 * Various K7s with broken bank 0 around. Always disable 1481 * by default. 1482 */ 1483 if (c->x86 == 6 && banks > 0) 1484 mce_banks[0].ctl = 0; 1485 1486 /* 1487 * Turn off MC4_MISC thresholding banks on those models since 1488 * they're not supported there. 1489 */ 1490 if (c->x86 == 0x15 && 1491 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { 1492 int i; 1493 u64 val, hwcr; 1494 bool need_toggle; 1495 u32 msrs[] = { 1496 0x00000413, /* MC4_MISC0 */ 1497 0xc0000408, /* MC4_MISC1 */ 1498 }; 1499 1500 rdmsrl(MSR_K7_HWCR, hwcr); 1501 1502 /* McStatusWrEn has to be set */ 1503 need_toggle = !(hwcr & BIT(18)); 1504 1505 if (need_toggle) 1506 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); 1507 1508 for (i = 0; i < ARRAY_SIZE(msrs); i++) { 1509 rdmsrl(msrs[i], val); 1510 1511 /* CntP bit set? */ 1512 if (val & BIT_64(62)) { 1513 val &= ~BIT_64(62); 1514 wrmsrl(msrs[i], val); 1515 } 1516 } 1517 1518 /* restore old settings */ 1519 if (need_toggle) 1520 wrmsrl(MSR_K7_HWCR, hwcr); 1521 } 1522 } 1523 1524 if (c->x86_vendor == X86_VENDOR_INTEL) { 1525 /* 1526 * SDM documents that on family 6 bank 0 should not be written 1527 * because it aliases to another special BIOS controlled 1528 * register. 1529 * But it's not aliased anymore on model 0x1a+ 1530 * Don't ignore bank 0 completely because there could be a 1531 * valid event later, merely don't write CTL0. 1532 */ 1533 1534 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1535 mce_banks[0].init = 0; 1536 1537 /* 1538 * All newer Intel systems support MCE broadcasting. Enable 1539 * synchronization with a one second timeout. 
1540 */ 1541 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1542 monarch_timeout < 0) 1543 monarch_timeout = USEC_PER_SEC; 1544 1545 /* 1546 * There are also broken BIOSes on some Pentium M and 1547 * earlier systems: 1548 */ 1549 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1550 mce_bootlog = 0; 1551 } 1552 if (monarch_timeout < 0) 1553 monarch_timeout = 0; 1554 if (mce_bootlog != 0) 1555 mce_panic_timeout = 30; 1556 1557 return 0; 1558} 1559 1560static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1561{ 1562 if (c->x86 != 5) 1563 return 0; 1564 1565 switch (c->x86_vendor) { 1566 case X86_VENDOR_INTEL: 1567 intel_p5_mcheck_init(c); 1568 return 1; 1569 break; 1570 case X86_VENDOR_CENTAUR: 1571 winchip_mcheck_init(c); 1572 return 1; 1573 break; 1574 } 1575 1576 return 0; 1577} 1578 1579static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1580{ 1581 switch (c->x86_vendor) { 1582 case X86_VENDOR_INTEL: 1583 mce_intel_feature_init(c); 1584 mce_adjust_timer = mce_intel_adjust_timer; 1585 break; 1586 case X86_VENDOR_AMD: 1587 mce_amd_feature_init(c); 1588 break; 1589 default: 1590 break; 1591 } 1592} 1593 1594static void mce_start_timer(unsigned int cpu, struct timer_list *t) 1595{ 1596 unsigned long iv = mce_adjust_timer(check_interval * HZ); 1597 1598 __this_cpu_write(mce_next_interval, iv); 1599 1600 if (mce_ignore_ce || !iv) 1601 return; 1602 1603 t->expires = round_jiffies(jiffies + iv); 1604 add_timer_on(t, smp_processor_id()); 1605} 1606 1607static void __mcheck_cpu_init_timer(void) 1608{ 1609 struct timer_list *t = &__get_cpu_var(mce_timer); 1610 unsigned int cpu = smp_processor_id(); 1611 1612 setup_timer(t, mce_timer_fn, cpu); 1613 mce_start_timer(cpu, t); 1614} 1615 1616/* Handle unconfigured int18 (should never happen) */ 1617static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1618{ 1619 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", 1620 smp_processor_id()); 1621} 1622 1623/* Call the installed machine check handler for this CPU setup. */ 1624void (*machine_check_vector)(struct pt_regs *, long error_code) = 1625 unexpected_machine_check; 1626 1627/* 1628 * Called for each booted CPU to set up machine checks. 1629 * Must be called with preempt off: 1630 */ 1631void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1632{ 1633 if (mce_disabled) 1634 return; 1635 1636 if (__mcheck_cpu_ancient_init(c)) 1637 return; 1638 1639 if (!mce_available(c)) 1640 return; 1641 1642 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1643 mce_disabled = 1; 1644 return; 1645 } 1646 1647 machine_check_vector = do_machine_check; 1648 1649 __mcheck_cpu_init_generic(); 1650 __mcheck_cpu_init_vendor(c); 1651 __mcheck_cpu_init_timer(); 1652 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1653 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1654} 1655 1656/* 1657 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. 1658 */ 1659 1660static DEFINE_SPINLOCK(mce_chrdev_state_lock); 1661static int mce_chrdev_open_count; /* #times opened */ 1662static int mce_chrdev_open_exclu; /* already open exclusive? 
*/ 1663 1664static int mce_chrdev_open(struct inode *inode, struct file *file) 1665{ 1666 spin_lock(&mce_chrdev_state_lock); 1667 1668 if (mce_chrdev_open_exclu || 1669 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { 1670 spin_unlock(&mce_chrdev_state_lock); 1671 1672 return -EBUSY; 1673 } 1674 1675 if (file->f_flags & O_EXCL) 1676 mce_chrdev_open_exclu = 1; 1677 mce_chrdev_open_count++; 1678 1679 spin_unlock(&mce_chrdev_state_lock); 1680 1681 return nonseekable_open(inode, file); 1682} 1683 1684static int mce_chrdev_release(struct inode *inode, struct file *file) 1685{ 1686 spin_lock(&mce_chrdev_state_lock); 1687 1688 mce_chrdev_open_count--; 1689 mce_chrdev_open_exclu = 0; 1690 1691 spin_unlock(&mce_chrdev_state_lock); 1692 1693 return 0; 1694} 1695 1696static void collect_tscs(void *data) 1697{ 1698 unsigned long *cpu_tsc = (unsigned long *)data; 1699 1700 rdtscll(cpu_tsc[smp_processor_id()]); 1701} 1702 1703static int mce_apei_read_done; 1704 1705/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ 1706static int __mce_read_apei(char __user **ubuf, size_t usize) 1707{ 1708 int rc; 1709 u64 record_id; 1710 struct mce m; 1711 1712 if (usize < sizeof(struct mce)) 1713 return -EINVAL; 1714 1715 rc = apei_read_mce(&m, &record_id); 1716 /* Error or no more MCE record */ 1717 if (rc <= 0) { 1718 mce_apei_read_done = 1; 1719 /* 1720 * When ERST is disabled, mce_chrdev_read() should return 1721 * "no record" instead of "no device." 1722 */ 1723 if (rc == -ENODEV) 1724 return 0; 1725 return rc; 1726 } 1727 rc = -EFAULT; 1728 if (copy_to_user(*ubuf, &m, sizeof(struct mce))) 1729 return rc; 1730 /* 1731 * In fact, we should have cleared the record after that has 1732 * been flushed to the disk or sent to network in 1733 * /sbin/mcelog, but we have no interface to support that now, 1734 * so just clear it to avoid duplication. 1735 */ 1736 rc = apei_clear_mce(record_id); 1737 if (rc) { 1738 mce_apei_read_done = 1; 1739 return rc; 1740 } 1741 *ubuf += sizeof(struct mce); 1742 1743 return 0; 1744} 1745 1746static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, 1747 size_t usize, loff_t *off) 1748{ 1749 char __user *buf = ubuf; 1750 unsigned long *cpu_tsc; 1751 unsigned prev, next; 1752 int i, err; 1753 1754 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1755 if (!cpu_tsc) 1756 return -ENOMEM; 1757 1758 mutex_lock(&mce_chrdev_read_mutex); 1759 1760 if (!mce_apei_read_done) { 1761 err = __mce_read_apei(&buf, usize); 1762 if (err || buf != ubuf) 1763 goto out; 1764 } 1765 1766 next = rcu_dereference_check_mce(mcelog.next); 1767 1768 /* Only supports full reads right now */ 1769 err = -EINVAL; 1770 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) 1771 goto out; 1772 1773 err = 0; 1774 prev = 0; 1775 do { 1776 for (i = prev; i < next; i++) { 1777 unsigned long start = jiffies; 1778 struct mce *m = &mcelog.entry[i]; 1779 1780 while (!m->finished) { 1781 if (time_after_eq(jiffies, start + 2)) { 1782 memset(m, 0, sizeof(*m)); 1783 goto timeout; 1784 } 1785 cpu_relax(); 1786 } 1787 smp_rmb(); 1788 err |= copy_to_user(buf, m, sizeof(*m)); 1789 buf += sizeof(*m); 1790timeout: 1791 ; 1792 } 1793 1794 memset(mcelog.entry + prev, 0, 1795 (next - prev) * sizeof(struct mce)); 1796 prev = next; 1797 next = cmpxchg(&mcelog.next, prev, 0); 1798 } while (next != prev); 1799 1800 synchronize_sched(); 1801 1802 /* 1803 * Collect entries that were still getting written before the 1804 * synchronize. 
1805 */ 1806 on_each_cpu(collect_tscs, cpu_tsc, 1); 1807 1808 for (i = next; i < MCE_LOG_LEN; i++) { 1809 struct mce *m = &mcelog.entry[i]; 1810 1811 if (m->finished && m->tsc < cpu_tsc[m->cpu]) { 1812 err |= copy_to_user(buf, m, sizeof(*m)); 1813 smp_rmb(); 1814 buf += sizeof(*m); 1815 memset(m, 0, sizeof(*m)); 1816 } 1817 } 1818 1819 if (err) 1820 err = -EFAULT; 1821 1822out: 1823 mutex_unlock(&mce_chrdev_read_mutex); 1824 kfree(cpu_tsc); 1825 1826 return err ? err : buf - ubuf; 1827} 1828 1829static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) 1830{ 1831 poll_wait(file, &mce_chrdev_wait, wait); 1832 if (rcu_access_index(mcelog.next)) 1833 return POLLIN | POLLRDNORM; 1834 if (!mce_apei_read_done && apei_check_mce()) 1835 return POLLIN | POLLRDNORM; 1836 return 0; 1837} 1838 1839static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, 1840 unsigned long arg) 1841{ 1842 int __user *p = (int __user *)arg; 1843 1844 if (!capable(CAP_SYS_ADMIN)) 1845 return -EPERM; 1846 1847 switch (cmd) { 1848 case MCE_GET_RECORD_LEN: 1849 return put_user(sizeof(struct mce), p); 1850 case MCE_GET_LOG_LEN: 1851 return put_user(MCE_LOG_LEN, p); 1852 case MCE_GETCLEAR_FLAGS: { 1853 unsigned flags; 1854 1855 do { 1856 flags = mcelog.flags; 1857 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1858 1859 return put_user(flags, p); 1860 } 1861 default: 1862 return -ENOTTY; 1863 } 1864} 1865 1866static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, 1867 size_t usize, loff_t *off); 1868 1869void register_mce_write_callback(ssize_t (*fn)(struct file *filp, 1870 const char __user *ubuf, 1871 size_t usize, loff_t *off)) 1872{ 1873 mce_write = fn; 1874} 1875EXPORT_SYMBOL_GPL(register_mce_write_callback); 1876 1877ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, 1878 size_t usize, loff_t *off) 1879{ 1880 if (mce_write) 1881 return mce_write(filp, ubuf, usize, off); 1882 else 1883 return -EINVAL; 1884} 1885 1886static const struct file_operations mce_chrdev_ops = { 1887 .open = mce_chrdev_open, 1888 .release = mce_chrdev_release, 1889 .read = mce_chrdev_read, 1890 .write = mce_chrdev_write, 1891 .poll = mce_chrdev_poll, 1892 .unlocked_ioctl = mce_chrdev_ioctl, 1893 .llseek = no_llseek, 1894}; 1895 1896static struct miscdevice mce_chrdev_device = { 1897 MISC_MCELOG_MINOR, 1898 "mcelog", 1899 &mce_chrdev_ops, 1900}; 1901 1902/* 1903 * mce=off Disables machine check 1904 * mce=no_cmci Disables CMCI 1905 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1906 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1907 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1908 * monarchtimeout is how long to wait for other CPUs on machine 1909 * check, or 0 to not wait 1910 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1911 * mce=nobootlog Don't log MCEs from before booting. 
1912 */ 1913static int __init mcheck_enable(char *str) 1914{ 1915 if (*str == 0) { 1916 enable_p5_mce(); 1917 return 1; 1918 } 1919 if (*str == '=') 1920 str++; 1921 if (!strcmp(str, "off")) 1922 mce_disabled = 1; 1923 else if (!strcmp(str, "no_cmci")) 1924 mce_cmci_disabled = 1; 1925 else if (!strcmp(str, "dont_log_ce")) 1926 mce_dont_log_ce = 1; 1927 else if (!strcmp(str, "ignore_ce")) 1928 mce_ignore_ce = 1; 1929 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1930 mce_bootlog = (str[0] == 'b'); 1931 else if (isdigit(str[0])) { 1932 get_option(&str, &tolerant); 1933 if (*str == ',') { 1934 ++str; 1935 get_option(&str, &monarch_timeout); 1936 } 1937 } else { 1938 pr_info("mce argument %s ignored. Please use /sys\n", str); 1939 return 0; 1940 } 1941 return 1; 1942} 1943__setup("mce", mcheck_enable); 1944 1945int __init mcheck_init(void) 1946{ 1947 mcheck_intel_therm_init(); 1948 1949 return 0; 1950} 1951 1952/* 1953 * mce_syscore: PM support 1954 */ 1955 1956/* 1957 * Disable machine checks on suspend and shutdown. We can't really handle 1958 * them later. 1959 */ 1960static int mce_disable_error_reporting(void) 1961{ 1962 int i; 1963 1964 for (i = 0; i < banks; i++) { 1965 struct mce_bank *b = &mce_banks[i]; 1966 1967 if (b->init) 1968 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 1969 } 1970 return 0; 1971} 1972 1973static int mce_syscore_suspend(void) 1974{ 1975 return mce_disable_error_reporting(); 1976} 1977 1978static void mce_syscore_shutdown(void) 1979{ 1980 mce_disable_error_reporting(); 1981} 1982 1983/* 1984 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1985 * Only one CPU is active at this time, the others get re-added later using 1986 * CPU hotplug: 1987 */ 1988static void mce_syscore_resume(void) 1989{ 1990 __mcheck_cpu_init_generic(); 1991 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1992} 1993 1994static struct syscore_ops mce_syscore_ops = { 1995 .suspend = mce_syscore_suspend, 1996 .shutdown = mce_syscore_shutdown, 1997 .resume = mce_syscore_resume, 1998}; 1999 2000/* 2001 * mce_device: Sysfs support 2002 */ 2003 2004static void mce_cpu_restart(void *data) 2005{ 2006 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2007 return; 2008 __mcheck_cpu_init_generic(); 2009 __mcheck_cpu_init_timer(); 2010} 2011 2012/* Reinit MCEs after user configuration changes */ 2013static void mce_restart(void) 2014{ 2015 mce_timer_delete_all(); 2016 on_each_cpu(mce_cpu_restart, NULL, 1); 2017} 2018 2019/* Toggle features for corrected errors */ 2020static void mce_disable_cmci(void *data) 2021{ 2022 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2023 return; 2024 cmci_clear(); 2025} 2026 2027static void mce_enable_ce(void *all) 2028{ 2029 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2030 return; 2031 cmci_reenable(); 2032 cmci_recheck(); 2033 if (all) 2034 __mcheck_cpu_init_timer(); 2035} 2036 2037static struct bus_type mce_subsys = { 2038 .name = "machinecheck", 2039 .dev_name = "machinecheck", 2040}; 2041 2042DEFINE_PER_CPU(struct device *, mce_device); 2043 2044__cpuinitdata 2045void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 2046 2047static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 2048{ 2049 return container_of(attr, struct mce_bank, attr); 2050} 2051 2052static ssize_t show_bank(struct device *s, struct device_attribute *attr, 2053 char *buf) 2054{ 2055 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 2056} 2057 2058static ssize_t set_bank(struct device *s, struct device_attribute *attr, 
2059 const char *buf, size_t size) 2060{ 2061 u64 new; 2062 2063 if (strict_strtoull(buf, 0, &new) < 0) 2064 return -EINVAL; 2065 2066 attr_to_bank(attr)->ctl = new; 2067 mce_restart(); 2068 2069 return size; 2070} 2071 2072static ssize_t 2073show_trigger(struct device *s, struct device_attribute *attr, char *buf) 2074{ 2075 strcpy(buf, mce_helper); 2076 strcat(buf, "\n"); 2077 return strlen(mce_helper) + 1; 2078} 2079 2080static ssize_t set_trigger(struct device *s, struct device_attribute *attr, 2081 const char *buf, size_t siz) 2082{ 2083 char *p; 2084 2085 strncpy(mce_helper, buf, sizeof(mce_helper)); 2086 mce_helper[sizeof(mce_helper)-1] = 0; 2087 p = strchr(mce_helper, '\n'); 2088 2089 if (p) 2090 *p = 0; 2091 2092 return strlen(mce_helper) + !!p; 2093} 2094 2095static ssize_t set_ignore_ce(struct device *s, 2096 struct device_attribute *attr, 2097 const char *buf, size_t size) 2098{ 2099 u64 new; 2100 2101 if (strict_strtoull(buf, 0, &new) < 0) 2102 return -EINVAL; 2103 2104 if (mce_ignore_ce ^ !!new) { 2105 if (new) { 2106 /* disable ce features */ 2107 mce_timer_delete_all(); 2108 on_each_cpu(mce_disable_cmci, NULL, 1); 2109 mce_ignore_ce = 1; 2110 } else { 2111 /* enable ce features */ 2112 mce_ignore_ce = 0; 2113 on_each_cpu(mce_enable_ce, (void *)1, 1); 2114 } 2115 } 2116 return size; 2117} 2118 2119static ssize_t set_cmci_disabled(struct device *s, 2120 struct device_attribute *attr, 2121 const char *buf, size_t size) 2122{ 2123 u64 new; 2124 2125 if (strict_strtoull(buf, 0, &new) < 0) 2126 return -EINVAL; 2127 2128 if (mce_cmci_disabled ^ !!new) { 2129 if (new) { 2130 /* disable cmci */ 2131 on_each_cpu(mce_disable_cmci, NULL, 1); 2132 mce_cmci_disabled = 1; 2133 } else { 2134 /* enable cmci */ 2135 mce_cmci_disabled = 0; 2136 on_each_cpu(mce_enable_ce, NULL, 1); 2137 } 2138 } 2139 return size; 2140} 2141 2142static ssize_t store_int_with_restart(struct device *s, 2143 struct device_attribute *attr, 2144 const char *buf, size_t size) 2145{ 2146 ssize_t ret = device_store_int(s, attr, buf, size); 2147 mce_restart(); 2148 return ret; 2149} 2150 2151static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); 2152static DEVICE_INT_ATTR(tolerant, 0644, tolerant); 2153static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 2154static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 2155 2156static struct dev_ext_attribute dev_attr_check_interval = { 2157 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), 2158 &check_interval 2159}; 2160 2161static struct dev_ext_attribute dev_attr_ignore_ce = { 2162 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), 2163 &mce_ignore_ce 2164}; 2165 2166static struct dev_ext_attribute dev_attr_cmci_disabled = { 2167 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), 2168 &mce_cmci_disabled 2169}; 2170 2171static struct device_attribute *mce_device_attrs[] = { 2172 &dev_attr_tolerant.attr, 2173 &dev_attr_check_interval.attr, 2174 &dev_attr_trigger, 2175 &dev_attr_monarch_timeout.attr, 2176 &dev_attr_dont_log_ce.attr, 2177 &dev_attr_ignore_ce.attr, 2178 &dev_attr_cmci_disabled.attr, 2179 NULL 2180}; 2181 2182static cpumask_var_t mce_device_initialized; 2183 2184static void mce_device_release(struct device *dev) 2185{ 2186 kfree(dev); 2187} 2188 2189/* Per cpu device init. 
static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof *dev, GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err)
		return err;

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static __cpuinit void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

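/*
 * A rough summary of the CPU_TASKS_FROZEN checks above (intent only, no
 * behaviour added here): when CPUs are taken down as part of suspend/resume
 * the whole machine is going away anyway, so only the per-bank MCi_CTL
 * registers are cleared and later restored. On a real hotplug event
 * cmci_clear()/cmci_reenable() additionally drop or re-establish this CPU's
 * ownership of CMCI banks, and cmci_rediscover() in the notifier below lets
 * the surviving CPUs pick up banks that belonged to a CPU that went fully
 * offline.
 */
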
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		mce_intel_hcpu_update(cpu);
		break;
	case CPU_DOWN_PREPARE:
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		del_timer_sync(t);
		break;
	case CPU_DOWN_FAILED:
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		mce_start_timer(cpu, t);
		break;
	}

	if (action == CPU_POST_DEAD) {
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);

	/* register character device /dev/mcelog */
	misc_register(&mce_chrdev_device);

	return err;
}
device_initcall_sync(mcheck_init_device);

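/*
 * For reference, the command line options accepted by mcheck_enable() above
 * and mcheck_disable() below look roughly like this (the numbers are only
 * example values):
 *
 *	mce=off			disable machine check handling completely
 *	mce=no_cmci		don't use CMCI, fall back to polling
 *	mce=dont_log_ce		don't log corrected errors
 *	mce=ignore_ce		disable all corrected-error features
 *	mce=bootlog/nobootlog	do (not) log MCEs left over from boot
 *	mce=2,100		tolerant level 2, monarch timeout 100 us
 *	nomce			same effect as mce=off
 */
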
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif
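
/*
 * Testing note (example path, assuming debugfs is mounted at
 * /sys/kernel/debug): writing a non-zero value to the fake_panic file makes
 * mce_panic() print the panic message and return instead of really
 * panicking, which helps when exercising the fatal-error paths:
 *
 *	echo 1 > /sys/kernel/debug/mce/fake_panic
 */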