/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

/* Serializes /dev/mcelog readers against each other */
static DEFINE_MUTEX(mce_chrdev_read_mutex);

/*
 * mcelog.next may be read either under rcu_read_lock_sched() or with
 * the chrdev read mutex held; assert that to lockdep.
 */
#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

/* Count of CPUs currently inside the machine check handler */
atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int			tolerant		__read_mostly = 1;
static int			banks			__read_mostly;
static int			rip_msr			__read_mostly;
static int			mce_bootlog		__read_mostly = -1;
static int			monarch_timeout		__read_mostly = -1;
static int			mce_panic_timeout	__read_mostly;
static int			mce_dont_log_ce		__read_mostly;
int				mce_cmci_disabled	__read_mostly;
int				mce_ignore_ce		__read_mostly;
int				mce_ser			__read_mostly;

struct mce_bank                *mce_banks		__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long		mce_need_notify;
static char			mce_helper[128];
static char			*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

/* Worst MCE seen by each CPU during the current broadcast exception */
static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;

/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

/* Fake MCE record used by the error injection framework */
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

/*
 * Append one MCE record to the global lockless log buffer and notify
 * the decoder chain. Safe to call from MCE/NMI context.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry from scratch if another CPU won */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

/*
 * Replay any records that were logged before a decoder registered
 * (e.g. during boot) to the decoder chain, then empty the buffer.
 */
static void drain_mcelog_buffer(void)
{
	unsigned int next, i, prev = 0;

	next = ACCESS_ONCE(mcelog.next);

	do {
		struct mce *m;

		/* drain what was logged during boot */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			unsigned retries = 1;

			m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2*retries))
					retries++;

				cpu_relax();

				if (!m->finished && retries >= 4) {
					pr_err("MCE: skipping error being logged currently!\n");
					break;
				}
			}
			smp_rmb();
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
		}

		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
}


void mce_register_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
	drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

/* Dump one MCE record to the console in the format mcelog(8) expects */
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress.
   Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

/*
 * Panic with a machine check: dump all unlogged MCE records (corrected
 * first, then uncorrected with @final last), record them via APEI, and
 * panic unless fake_panic testing mode is enabled.
 */
static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		/* Skip @final here; it is printed separately below */
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

/* Map an MCA MSR number to the corresponding field offset in struct mce */
static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	/* Injection active: read from the per-CPU injectm record instead */
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	/* Injection active: write to the per-CPU injectm record instead */
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (rip_msr)
			m->ip = mce_rdmsrl(rip_msr);
	}
}

/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

/* Pop one PFN off this CPU's ring; returns 1 on success, 0 if empty */
static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;	/* ring full: drop the PFN */
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/* Schedule the AO page-poisoning work if any PFNs are queued */
static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	/* Interrupts off: defer the notification via irq_work */
	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused.
 * In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	percpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in a unrecoverable case
 * and also makes sure always all CPU's errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs) In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in a unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

/* Global count of CPUs that saw a no-way-out condition on entry */
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;	/* rendezvous disabled */

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

/* Clear the STATUS MSR of every bank flagged in @toclear */
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process()
 */
#define	MCE_INFO_MAX	16

struct mce_info {
	atomic_t		inuse;
	struct task_struct	*t;
	__u64			paddr;
	int			restartable;
} mce_info[MCE_INFO_MAX];

/* Stash (current, addr, restartable) in a free slot; panic if all busy */
static void mce_save_info(__u64 addr, int c)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			mi->restartable = c;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

/* Find the slot saved for the current task, or NULL */
static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	percpu_inc(mce_exception_count);

	if (!banks)
		goto out;

	mce_gather_info(&m, regs);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When no restart IP might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicing.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		/* Track the worst-severity record seen on this CPU */
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			/* schedule action before return to userland */
			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
			set_thread_flag(TIF_MCE_NOTIFY);
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
/* Stub used when hwpoison support is not built in: log and ignore */
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);

	return 0;
}
#endif

/*
 * Called in process context that interrupted by MCE and marked with
 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
 * This code is allowed to sleep.
 * Attempt possible recovery such as calling the high level VM handler to
 * process any corrupted pages, and kill/signal current process if required.
 * Action required errors are handled here.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	struct mce_info *mi = mce_find_info();

	if (!mi)
		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
	pfn = mi->paddr >> PAGE_SHIFT;

	clear_thread_flag(TIF_MCE_NOTIFY);

	pr_err("Uncorrected hardware memory error in user-access at %llx",
		 mi->paddr);
	/*
	 * We must call memory_failure() here even if the current process is
	 * doomed. We still need to mark the page as poisoned and alert any
	 * other users of the page.
	 */
	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
			   mi->restartable == 0) {
		pr_err("Memory error not recovered");
		force_sig(SIGBUS, current);
	}
	mce_clear_info(mi);
}

/*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
 * placed into the "ring").
 */
static void mce_process_work(struct work_struct *dummy)
{
	unsigned long pfn;

	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR, 0);
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	/* Build a synthetic record on the pseudo thermal bank and log it. */
	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

/* Per-CPU poll timer callback; @data is the owning CPU number. */
static void mce_start_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	/* The timer is pinned with add_timer_on(); it must fire locally. */
	WARN_ON(smp_processor_id() != data);

	if (mce_available(__this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);	/* halve, floor at 10ms */
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	/* Re-arm on this CPU only. */
	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
}

/* Run the user mode helper (mce_helper) outside interrupt context. */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
1299 */ 1300int mce_notify_irq(void) 1301{ 1302 /* Not more than two messages every minute */ 1303 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1304 1305 if (test_and_clear_bit(0, &mce_need_notify)) { 1306 /* wake processes polling /dev/mcelog */ 1307 wake_up_interruptible(&mce_chrdev_wait); 1308 1309 /* 1310 * There is no risk of missing notifications because 1311 * work_pending is always cleared before the function is 1312 * executed. 1313 */ 1314 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1315 schedule_work(&mce_trigger_work); 1316 1317 if (__ratelimit(&ratelimit)) 1318 pr_info(HW_ERR "Machine check events logged\n"); 1319 1320 return 1; 1321 } 1322 return 0; 1323} 1324EXPORT_SYMBOL_GPL(mce_notify_irq); 1325 1326static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1327{ 1328 int i; 1329 1330 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1331 if (!mce_banks) 1332 return -ENOMEM; 1333 for (i = 0; i < banks; i++) { 1334 struct mce_bank *b = &mce_banks[i]; 1335 1336 b->ctl = -1ULL; 1337 b->init = 1; 1338 } 1339 return 0; 1340} 1341 1342/* 1343 * Initialize Machine Checks for a CPU. 1344 */ 1345static int __cpuinit __mcheck_cpu_cap_init(void) 1346{ 1347 unsigned b; 1348 u64 cap; 1349 1350 rdmsrl(MSR_IA32_MCG_CAP, cap); 1351 1352 b = cap & MCG_BANKCNT_MASK; 1353 if (!banks) 1354 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1355 1356 if (b > MAX_NR_BANKS) { 1357 printk(KERN_WARNING 1358 "MCE: Using only %u machine check banks out of %u\n", 1359 MAX_NR_BANKS, b); 1360 b = MAX_NR_BANKS; 1361 } 1362 1363 /* Don't support asymmetric configurations today */ 1364 WARN_ON(banks != 0 && b != banks); 1365 banks = b; 1366 if (!mce_banks) { 1367 int err = __mcheck_cpu_mce_banks_init(); 1368 1369 if (err) 1370 return err; 1371 } 1372 1373 /* Use accurate RIP reporting if available. 
 */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Software error recovery supported (MCG_SER_P)? */
	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

/* Enable machine checking and program the bank control MSRs for this CPU. */
static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		/* Skip banks quirked out (e.g. Intel family 6 bank 0). */
		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
1439 */ 1440 if (c->x86 == 6 && banks > 0) 1441 mce_banks[0].ctl = 0; 1442 } 1443 1444 if (c->x86_vendor == X86_VENDOR_INTEL) { 1445 /* 1446 * SDM documents that on family 6 bank 0 should not be written 1447 * because it aliases to another special BIOS controlled 1448 * register. 1449 * But it's not aliased anymore on model 0x1a+ 1450 * Don't ignore bank 0 completely because there could be a 1451 * valid event later, merely don't write CTL0. 1452 */ 1453 1454 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1455 mce_banks[0].init = 0; 1456 1457 /* 1458 * All newer Intel systems support MCE broadcasting. Enable 1459 * synchronization with a one second timeout. 1460 */ 1461 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1462 monarch_timeout < 0) 1463 monarch_timeout = USEC_PER_SEC; 1464 1465 /* 1466 * There are also broken BIOSes on some Pentium M and 1467 * earlier systems: 1468 */ 1469 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1470 mce_bootlog = 0; 1471 } 1472 if (monarch_timeout < 0) 1473 monarch_timeout = 0; 1474 if (mce_bootlog != 0) 1475 mce_panic_timeout = 30; 1476 1477 return 0; 1478} 1479 1480static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1481{ 1482 if (c->x86 != 5) 1483 return 0; 1484 1485 switch (c->x86_vendor) { 1486 case X86_VENDOR_INTEL: 1487 intel_p5_mcheck_init(c); 1488 return 1; 1489 break; 1490 case X86_VENDOR_CENTAUR: 1491 winchip_mcheck_init(c); 1492 return 1; 1493 break; 1494 } 1495 1496 return 0; 1497} 1498 1499static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1500{ 1501 switch (c->x86_vendor) { 1502 case X86_VENDOR_INTEL: 1503 mce_intel_feature_init(c); 1504 break; 1505 case X86_VENDOR_AMD: 1506 mce_amd_feature_init(c); 1507 break; 1508 default: 1509 break; 1510 } 1511} 1512 1513static void __mcheck_cpu_init_timer(void) 1514{ 1515 struct timer_list *t = &__get_cpu_var(mce_timer); 1516 int *n = &__get_cpu_var(mce_next_interval); 1517 1518 setup_timer(t, mce_start_timer, 
		    smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	/* check_interval == 0 disables polling entirely. */
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	/* Family 5 parts are handled entirely by vendor code. */
	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		/* Setup failed: keep MCE off for all subsequent CPUs too. */
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive?
 */

/* Open /dev/mcelog; O_EXCL grants exclusive access to the log. */
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	/* Refuse if someone holds it exclusively, or we want exclusivity
	 * while others already have it open. */
	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

/* IPI callback: record this CPU's TSC into the array passed as @data. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	/* Advance caller's user buffer past the record just copied. */
	*ubuf += sizeof(struct mce);

	return 0;
}

/*
 * Read and drain the MCE log. APEI-persisted records from the previous
 * boot are returned first; then the in-memory mcelog buffer is copied
 * out and cleared. Only full-buffer reads are supported.
 */
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		/* If APEI produced data (buf advanced), return just that. */
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			/* Writer sets ->finished last; wait briefly for
			 * in-flight entries, then give up and zap them. */
			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the index only if nobody logged meanwhile. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	/* Pick up stragglers: entries finished before the per-CPU TSC
	 * snapshot count as part of this read and are cleared. */
	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

/* Poll readable when the in-memory log or APEI storage has records. */
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* Atomically fetch-and-clear the flags word. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

/* Install the (single) write handler used for error injection. */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
			 size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct
file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	/* Bare "mce" (no argument): just enable P5 machine checks. */
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		/* Numeric form: tolerance level, optional monarch timeout. */
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown.
We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	/* Clear every initialized bank's control MSR on this CPU. */
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

/* Per-CPU callback: re-program banks and re-arm the poll timer. */
static void mce_cpu_restart(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

/* Re-enable CMCI; @all non-NULL also restarts the poll timer. */
static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

DEFINE_PER_CPU(struct device *,
mce_device);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Map a bank's sysfs attribute back to its containing struct mce_bank. */
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	/* Push the new bank mask to all CPUs. */
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

/* Set the user mode helper path; a trailing newline is stripped. */
static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	/* strncpy() does not guarantee termination; force it. */
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual state change. */
	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual state change. */
	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

/* Store an integer attribute, then re-initialize MCE on all CPUs. */
static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

/* Attributes created under every per-CPU machinecheck device. */
static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

/* Release callback for the dynamically allocated per-CPU device. */
static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init.
All of the cpus still share the same ctrl bank: */ 2105static __cpuinit int mce_device_create(unsigned int cpu) 2106{ 2107 struct device *dev; 2108 int err; 2109 int i, j; 2110 2111 if (!mce_available(&boot_cpu_data)) 2112 return -EIO; 2113 2114 dev = kzalloc(sizeof *dev, GFP_KERNEL); 2115 if (!dev) 2116 return -ENOMEM; 2117 dev->id = cpu; 2118 dev->bus = &mce_subsys; 2119 dev->release = &mce_device_release; 2120 2121 err = device_register(dev); 2122 if (err) 2123 return err; 2124 2125 for (i = 0; mce_device_attrs[i]; i++) { 2126 err = device_create_file(dev, mce_device_attrs[i]); 2127 if (err) 2128 goto error; 2129 } 2130 for (j = 0; j < banks; j++) { 2131 err = device_create_file(dev, &mce_banks[j].attr); 2132 if (err) 2133 goto error2; 2134 } 2135 cpumask_set_cpu(cpu, mce_device_initialized); 2136 per_cpu(mce_device, cpu) = dev; 2137 2138 return 0; 2139error2: 2140 while (--j >= 0) 2141 device_remove_file(dev, &mce_banks[j].attr); 2142error: 2143 while (--i >= 0) 2144 device_remove_file(dev, mce_device_attrs[i]); 2145 2146 device_unregister(dev); 2147 2148 return err; 2149} 2150 2151static __cpuinit void mce_device_remove(unsigned int cpu) 2152{ 2153 struct device *dev = per_cpu(mce_device, cpu); 2154 int i; 2155 2156 if (!cpumask_test_cpu(cpu, mce_device_initialized)) 2157 return; 2158 2159 for (i = 0; mce_device_attrs[i]; i++) 2160 device_remove_file(dev, mce_device_attrs[i]); 2161 2162 for (i = 0; i < banks; i++) 2163 device_remove_file(dev, &mce_banks[i].attr); 2164 2165 device_unregister(dev); 2166 cpumask_clear_cpu(cpu, mce_device_initialized); 2167 per_cpu(mce_device, cpu) = NULL; 2168} 2169 2170/* Make sure there are no machine checks on offlined CPUs. 
*/ 2171static void __cpuinit mce_disable_cpu(void *h) 2172{ 2173 unsigned long action = *(unsigned long *)h; 2174 int i; 2175 2176 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2177 return; 2178 2179 if (!(action & CPU_TASKS_FROZEN)) 2180 cmci_clear(); 2181 for (i = 0; i < banks; i++) { 2182 struct mce_bank *b = &mce_banks[i]; 2183 2184 if (b->init) 2185 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 2186 } 2187} 2188 2189static void __cpuinit mce_reenable_cpu(void *h) 2190{ 2191 unsigned long action = *(unsigned long *)h; 2192 int i; 2193 2194 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2195 return; 2196 2197 if (!(action & CPU_TASKS_FROZEN)) 2198 cmci_reenable(); 2199 for (i = 0; i < banks; i++) { 2200 struct mce_bank *b = &mce_banks[i]; 2201 2202 if (b->init) 2203 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 2204 } 2205} 2206 2207/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 2208static int __cpuinit 2209mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 2210{ 2211 unsigned int cpu = (unsigned long)hcpu; 2212 struct timer_list *t = &per_cpu(mce_timer, cpu); 2213 2214 switch (action) { 2215 case CPU_ONLINE: 2216 case CPU_ONLINE_FROZEN: 2217 mce_device_create(cpu); 2218 if (threshold_cpu_callback) 2219 threshold_cpu_callback(action, cpu); 2220 break; 2221 case CPU_DEAD: 2222 case CPU_DEAD_FROZEN: 2223 if (threshold_cpu_callback) 2224 threshold_cpu_callback(action, cpu); 2225 mce_device_remove(cpu); 2226 break; 2227 case CPU_DOWN_PREPARE: 2228 case CPU_DOWN_PREPARE_FROZEN: 2229 del_timer_sync(t); 2230 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2231 break; 2232 case CPU_DOWN_FAILED: 2233 case CPU_DOWN_FAILED_FROZEN: 2234 if (!mce_ignore_ce && check_interval) { 2235 t->expires = round_jiffies(jiffies + 2236 __get_cpu_var(mce_next_interval)); 2237 add_timer_on(t, cpu); 2238 } 2239 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2240 break; 2241 case CPU_POST_DEAD: 2242 /* intentionally ignoring frozen here */ 
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

/* Fill in the sysfs attribute for each MCA bank ("bank0", "bank1", ...). */
static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

/* Register the machinecheck subsystem, per-CPU devices and /dev/mcelog. */
static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);

	/* register character device /dev/mcelog */
	/* NOTE(review): misc_register() return value is ignored - a minor
	 * registration failure would go unnoticed; consider checking it. */
	misc_register(&mce_chrdev_device);

	return err;
}
device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
/* Return (creating on first use) the shared "mce" debugfs directory. */
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

/* Reset the monarch/rendezvous state so a fake panic can run again. */
static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

/* Expose "fake_panic" under debugfs for MCE testing. */
static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif