mce.c revision 0644414e62561f0ba1bea7c5ba6a94cc50dac3e3
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

/* User mode helper program triggered by machine check event */
static unsigned long		mce_need_notify;
static char			mce_helper[128];
static char			*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void drain_mcelog_buffer(void)
{
	unsigned int next, i, prev = 0;

	next = ACCESS_ONCE(mcelog.next);

	do {
		struct mce *m;

		/* drain what was logged during boot */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			unsigned retries = 1;

			m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2*retries))
					retries++;

				cpu_relax();

				if (!m->finished && retries >= 4) {
					pr_err("skipping error being logged currently!\n");
					break;
				}
			}
			smp_rmb();
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
		}

		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
}


void mce_register_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
	drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
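
/*
 * Illustrative aside (not part of the kernel build): the claim loop in
 * mce_log() above is a general lockless pattern -- writers race to reserve
 * a slot index with cmpxchg and only then fill it in, while a "finished"
 * flag tells readers when a record is complete.  A minimal user-space
 * sketch of that pattern, using GCC/Clang __sync builtins and invented
 * names (slot_claim, demo_log_msg):
 */
#if 0
#include <stdio.h>
#include <string.h>

#define DEMO_LOG_LEN 32

struct demo_rec {
	int finished;		/* set last, after the payload is complete */
	char payload[40];
};

static struct demo_rec demo_log[DEMO_LOG_LEN];
static unsigned int demo_next;

/* Race-free slot reservation; returns -1 when the buffer is full. */
static int slot_claim(void)
{
	unsigned int entry;

	for (;;) {
		entry = __sync_fetch_and_add(&demo_next, 0);	/* atomic read */
		if (entry >= DEMO_LOG_LEN)
			return -1;				/* overflow */
		if (__sync_bool_compare_and_swap(&demo_next, entry, entry + 1))
			return entry;	/* we own demo_log[entry] now */
	}
}

static void demo_log_msg(const char *msg)
{
	int e = slot_claim();

	if (e < 0)
		return;
	strncpy(demo_log[e].payload, msg, sizeof(demo_log[e].payload) - 1);
	__sync_synchronize();		/* order payload before "finished" */
	demo_log[e].finished = 1;
}

int main(void)
{
	demo_log_msg("hello");
	printf("%s\n", demo_log[0].finished ? demo_log[0].payload : "(empty)");
	return 0;
}
#endif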

static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error
	 * (if the CPU has an implementation for that).
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}
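
/*
 * Illustrative aside (not part of the kernel build): mce_rdmsrl() and
 * mce_wrmsrl() above redirect MSR accesses into a plain struct by turning
 * the register number into a byte offset.  A minimal user-space sketch of
 * the same offsetof()-table trick, with invented names (struct regfile,
 * reg_to_offset, reg_read):
 */
#if 0
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct regfile {
	uint64_t status;
	uint64_t addr;
	uint64_t misc;
};

enum { REG_STATUS = 0x401, REG_ADDR = 0x402, REG_MISC = 0x403 };

static int reg_to_offset(int reg)
{
	switch (reg) {
	case REG_STATUS:	return offsetof(struct regfile, status);
	case REG_ADDR:		return offsetof(struct regfile, addr);
	case REG_MISC:		return offsetof(struct regfile, misc);
	default:		return -1;	/* unknown register */
	}
}

static uint64_t reg_read(struct regfile *rf, int reg)
{
	int offset = reg_to_offset(reg);

	if (offset < 0)
		return 0;	/* mirror mce_rdmsrl(): fail soft */
	return *(uint64_t *)((char *)rf + offset);
}

int main(void)
{
	struct regfile rf = { .addr = 0xdeadbeef };

	printf("%#llx\n", (unsigned long long)reg_read(&rf, REG_ADDR));
	return 0;
}
#endif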

/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}
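
/*
 * Illustrative aside (not part of the kernel build): mce_ring above is a
 * classic single-producer/single-consumer ring that stays lockless because
 * only the producer writes "end" and only the consumer writes "start", and
 * one slot is sacrificed to tell "full" from "empty".  A user-space sketch
 * with invented names (spsc_push, spsc_pop):
 */
#if 0
#include <stdio.h>

#define RING_SIZE 16			/* one entry is left unused */

struct spsc_ring {
	unsigned short start, end;
	unsigned long slot[RING_SIZE];
};

static int spsc_push(struct spsc_ring *r, unsigned long v)
{
	unsigned next = (r->end + 1) % RING_SIZE;

	if (next == r->start)
		return -1;		/* full */
	r->slot[r->end] = v;
	__sync_synchronize();		/* publish data before moving "end" */
	r->end = next;
	return 0;
}

static int spsc_pop(struct spsc_ring *r, unsigned long *v)
{
	if (r->start == r->end)
		return 0;		/* empty */
	*v = r->slot[r->start];
	r->start = (r->start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	struct spsc_ring r = { 0, 0, { 0 } };
	unsigned long v;

	spsc_push(&r, 42);
	while (spsc_pop(&r, &v))
		printf("%lu\n", v);
	return 0;
}
#endif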

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty())
		schedule_work(&__get_cpu_var(mce_work));
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	int i, ret = 0;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL) {
			__set_bit(i, validp);
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
		}
		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
			ret = 1;
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (mca_cfg.tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
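
/*
 * Illustrative aside (not part of the kernel build): the MCi_MISC "LSB"
 * field consumed by mce_read_aux() above gives the granularity of the
 * reported address as a bit position; shifting the address down and back
 * up by that amount zeroes the bits that are not valid.  For example, an
 * LSB of 12 (4K pages) turns 0x12345fff into 0x12345000:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint64_t mask_addr_by_lsb(uint64_t addr, uint8_t lsb)
{
	addr >>= lsb;			/* drop the invalid low bits... */
	addr <<= lsb;			/* ...and restore the alignment */
	return addr;
}

int main(void)
{
	printf("%#llx\n",
	       (unsigned long long)mask_addr_by_lsb(0x12345fffULL, 12));
	return 0;
}
#endif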

/*
 * The Monarch's reign.  The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}
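
/*
 * Illustrative aside (not part of the kernel build): mce_start()/mce_end()
 * implement a barrier-like rendezvous -- every CPU bumps a call-in counter
 * and the first arrival becomes the Monarch, while the others wait for an
 * "executing" counter to reach their own call-in order.  A user-space
 * sketch with threads standing in for CPUs (invented names throughout;
 * the kernel version also adds timeouts, omitted here):
 */
#if 0
#include <stdio.h>
#include <pthread.h>

#define NCPUS 4

static int callin;	/* how many "CPUs" have arrived */
static int executing;	/* how many have been allowed to scan */

static void *cpu_thread(void *arg)
{
	int order = __sync_add_and_fetch(&callin, 1);

	/* Wait until everyone has arrived. */
	while (__sync_fetch_and_add(&callin, 0) != NCPUS)
		;

	if (order == 1)
		__sync_lock_test_and_set(&executing, 1);	/* Monarch goes first */
	else
		while (__sync_fetch_and_add(&executing, 0) < order)
			;					/* Subjects wait their turn */

	printf("thread %d scanning\n", order);
	__sync_add_and_fetch(&executing, 1);	/* let the next one run */
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, cpu_thread, NULL);
	for (i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
#endif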

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process()
 */
#define	MCE_INFO_MAX	16

struct mce_info {
	atomic_t		inuse;
	struct task_struct	*t;
	__u64			paddr;
	int			restartable;
} mce_info[MCE_INFO_MAX];

static void mce_save_info(__u64 addr, int c)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			mi->restartable = c;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}
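
/*
 * Illustrative aside (not part of the kernel build): mce_save_info() above
 * hands out entries from a fixed pool with atomic_cmpxchg() on an "inuse"
 * flag, which is safe even from contexts where taking a lock is not.  A
 * user-space sketch of that claim/release pattern (invented names:
 * slot_get, slot_put):
 */
#if 0
#include <stdio.h>

#define POOL_MAX 16

struct slot {
	int inuse;
	unsigned long payload;
} pool[POOL_MAX];

static struct slot *slot_get(void)
{
	struct slot *s;

	for (s = pool; s < &pool[POOL_MAX]; s++)
		if (__sync_bool_compare_and_swap(&s->inuse, 0, 1))
			return s;	/* we own this slot now */
	return NULL;			/* pool exhausted */
}

static void slot_put(struct slot *s)
{
	__sync_lock_release(&s->inuse);	/* inuse = 0, with release ordering */
}

int main(void)
{
	struct slot *s = slot_get();

	if (s) {
		s->payload = 0xdead;
		printf("claimed slot %td\n", s - pool);
		slot_put(s);
	}
	return 0;
}
#endif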

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mca_config *cfg = &mca_cfg;
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	this_cpu_inc(mce_exception_count);

	if (!cfg->banks)
		goto out;

	mce_gather_info(&m, regs);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);

	barrier();

	/*
	 * When there is no restart IP we may need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non-uncorrected or non-signaled errors are handled by
		 * machine_check_poll(). Leave them alone, unless this panics.
		 */
		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(&m, cfg->tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * mce_usable_address or mce_ring_add fails.
		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (cfg->tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			/* schedule action before return to userland */
			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
			set_thread_flag(TIF_MCE_NOTIFY);
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Called in the process context that was interrupted by the MCE and marked
 * with TIF_MCE_NOTIFY, just before returning to the erroneous userland.
 * This code is allowed to sleep.
 * Attempt possible recovery such as calling the high level VM handler to
 * process any corrupted pages, and kill/signal current process if required.
 * Action required errors are handled here.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	struct mce_info *mi = mce_find_info();
	int flags = MF_ACTION_REQUIRED;

	if (!mi)
		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
	pfn = mi->paddr >> PAGE_SHIFT;

	clear_thread_flag(TIF_MCE_NOTIFY);

	pr_err("Uncorrected hardware memory error in user-access at %llx",
		 mi->paddr);
	/*
	 * We must call memory_failure() here even if the current process is
	 * doomed. We still need to mark the page as poisoned and alert any
	 * other users of the page.
	 */
	if (!mi->restartable)
		flags |= MF_MUST_KILL;
	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
		pr_err("Memory error not recovered");
		force_sig(SIGBUS, current);
	}
	mce_clear_info(mi);
}

/*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
 * placed into the "ring").
 */
static void mce_process_work(struct work_struct *dummy)
{
	unsigned long pfn;

	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR, 0);
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) =
	mce_adjust_timer_default;

static void mce_timer_fn(unsigned long data)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	unsigned long iv;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(__this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
		mce_intel_cmci_poll();
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	iv = __this_cpu_read(mce_next_interval);
	if (mce_notify_irq()) {
		iv = max(iv / 2, (unsigned long) HZ/100);
	} else {
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
		iv = mce_adjust_timer(iv);
	}
	__this_cpu_write(mce_next_interval, iv);
	/* Might have become 0 after CMCI storm subsided */
	if (iv) {
		t->expires = jiffies + iv;
		add_timer_on(t, smp_processor_id());
	}
}

/*
 * Ensure that the timer is firing in @interval from now.
 */
void mce_timer_kick(unsigned long interval)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	unsigned long when = jiffies + interval;
	unsigned long iv = __this_cpu_read(mce_next_interval);

	if (timer_pending(t)) {
		if (time_before(when, t->expires))
			mod_timer_pinned(t, when);
	} else {
		t->expires = round_jiffies(when);
		add_timer_on(t, smp_processor_id());
	}
	if (interval < iv)
		__this_cpu_write(mce_next_interval, interval);
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
}
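
/*
 * Illustrative aside (not part of the kernel build): mce_timer_fn() above
 * implements a simple multiplicative backoff -- halve the poll interval
 * (down to a floor) whenever an event was found, double it (up to
 * check_interval) when the poll came up empty.  Stripped of the kernel
 * timer machinery, the adjustment is just:
 */
#if 0
#include <stdio.h>

#define HZ		1000		/* assumed tick rate */
#define CHECK_MAX	(5 * 60 * HZ)	/* check_interval in jiffies */

static unsigned long adjust_interval(unsigned long iv, int found_event)
{
	if (found_event)
		return iv / 2 > HZ / 100 ? iv / 2 : HZ / 100;	/* floor */
	return iv * 2 < CHECK_MAX ? iv * 2 : CHECK_MAX;		/* ceiling */
}

int main(void)
{
	unsigned long iv = CHECK_MAX;

	iv = adjust_interval(iv, 1);	/* an MCE was logged: poll faster */
	printf("%lu\n", iv);
	iv = adjust_interval(iv, 0);	/* quiet again: back off */
	printf("%lu\n", iv);
	return 0;
}
#endif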

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		/* wake processes polling /dev/mcelog */
		wake_up_interruptible(&mce_chrdev_wait);

		if (mce_helper[0])
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;
	u8 num_banks = mca_cfg.banks;

	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;

	for (i = 0; i < num_banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!mca_cfg.banks)
		pr_info("CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		pr_warn("Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
	mca_cfg.banks = b;

	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mca_cfg.ser = true;

	return 0;
}

static void __mcheck_cpu_init_generic(void)
{
	enum mcp_flags m_fl = 0;
	mce_banks_t all_banks;
	u64 cap;
	int i;

	if (!mca_cfg.bootlog)
		m_fl = MCP_DONTLOG;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC | m_fl, &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}
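
/*
 * Illustrative aside (not part of the kernel build): __mcheck_cpu_cap_init()
 * above pulls several fields out of one MSR_IA32_MCG_CAP value -- the bank
 * count in the low byte, plus feature bits such as MCG_CTL_P and MCG_SER_P.
 * A sketch of that decoding on a made-up capability value:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define MCG_BANKCNT_MASK	0xff
#define MCG_CTL_P		(1ULL << 8)	/* MCG_CTL MSR present */
#define MCG_SER_P		(1ULL << 24)	/* software error recovery */

int main(void)
{
	uint64_t cap = 0x1000116;	/* hypothetical MCG_CAP reading */

	printf("banks: %llu\n", (unsigned long long)(cap & MCG_BANKCNT_MASK));
	printf("MCG_CTL present: %s\n", (cap & MCG_CTL_P) ? "yes" : "no");
	printf("recovery (SER): %s\n", (cap & MCG_SER_P) ? "yes" : "no");
	return 0;
}
#endif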

/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 0)
		return;
	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
		return;
	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
			  MCACOD)) !=
			 (MCI_STATUS_UC|MCI_STATUS_EN|
			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
			  MCI_STATUS_AR|MCACOD_INSTR))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	struct mca_config *cfg = &mca_cfg;

	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("unknown CPU type - not enabling MCE support\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && cfg->banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && cfg->bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			cfg->bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && cfg->banks > 0)
			mce_banks[0].ctl = 0;

		/*
		 * Turn off MC4_MISC thresholding banks on those models since
		 * they're not supported there.
		 */
		if (c->x86 == 0x15 &&
		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
			int i;
			u64 val, hwcr;
			bool need_toggle;
			u32 msrs[] = {
				0x00000413, /* MC4_MISC0 */
				0xc0000408, /* MC4_MISC1 */
			};

			rdmsrl(MSR_K7_HWCR, hwcr);

			/* McStatusWrEn has to be set */
			need_toggle = !(hwcr & BIT(18));

			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

			for (i = 0; i < ARRAY_SIZE(msrs); i++) {
				rdmsrl(msrs[i], val);

				/* CntP bit set? */
				if (val & BIT_64(62)) {
					val &= ~BIT_64(62);
					wrmsrl(msrs[i], val);
				}
			}

			/* restore old settings */
			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr);
		}
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			cfg->monarch_timeout < 0)
			cfg->monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
			cfg->bootlog = 0;

		if (c->x86 == 6 && c->x86_model == 45)
			quirk_no_way_out = quirk_sandybridge_ifu;
	}
	if (cfg->monarch_timeout < 0)
		cfg->monarch_timeout = 0;
	if (cfg->bootlog != 0)
		cfg->panic_timeout = 30;

	return 0;
}
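
/*
 * Illustrative aside (not part of the kernel build): the MC4_MISC quirk
 * above uses a common save/toggle/restore pattern -- flip an enable bit
 * (McStatusWrEn in MSR_K7_HWCR) only if it was clear, do the writes that
 * need it, then put the register back exactly as found.  In plain C the
 * shape of it is:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define ENABLE_BIT	(1ULL << 18)	/* stands in for McStatusWrEn */

static uint64_t fake_msr;		/* stands in for rdmsrl/wrmsrl */

int main(void)
{
	uint64_t hwcr = fake_msr;
	int need_toggle = !(hwcr & ENABLE_BIT);

	if (need_toggle)
		fake_msr = hwcr | ENABLE_BIT;	/* temporarily enable */

	/* ... do the writes that require the bit here ... */

	if (need_toggle)
		fake_msr = hwcr;		/* restore old settings */

	printf("restored: %#llx\n", (unsigned long long)fake_msr);
	return 0;
}
#endif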

static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return 0;

	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		return 1;
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		return 1;
		break;
	}

	return 0;
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		mce_adjust_timer = mce_intel_adjust_timer;
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
	unsigned long iv = mce_adjust_timer(check_interval * HZ);

	__this_cpu_write(mce_next_interval, iv);

	if (mca_cfg.ignore_ce || !iv)
		return;

	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, smp_processor_id());
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	unsigned int cpu = smp_processor_id();

	setup_timer(t, mce_timer_fn, cpu);
	mce_start_timer(cpu, t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mca_cfg.disabled = true;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */

static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * Ideally we should have cleared the record only after it has been
	 * flushed to disk or sent over the network by /sbin/mcelog, but we
	 * have no interface to support that now, so just clear it to avoid
	 * duplicates.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
			 size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
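
/*
 * Illustrative aside (not part of the kernel build): since mce_chrdev_read()
 * above only supports full reads, a user-space consumer must offer a buffer
 * of at least MCE_LOG_LEN records, using the ioctls to size it.  A sketch
 * of such a reader (error handling trimmed; assumes the MCE ioctl numbers
 * are visible via the exported <asm/mce.h> UAPI header):
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>		/* MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN */

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int recordlen, loglen;
	char *buf;
	ssize_t len;

	if (fd < 0)
		return 1;
	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
	ioctl(fd, MCE_GET_LOG_LEN, &loglen);

	buf = malloc(recordlen * loglen);	/* must cover the full log */
	len = read(fd, buf, recordlen * loglen);
	printf("read %zd bytes (%zd records)\n", len,
	       len > 0 ? len / recordlen : 0);

	free(buf);
	close(fd);
	return 0;
}
#endif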

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 * mce=bios_cmci_threshold	Don't program the CMCI threshold
 */
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;

	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = true;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = true;
	else if (isdigit(str[0])) {
		get_option(&str, &(cfg->tolerant));
		if (*str == ',') {
			++str;
			get_option(&str, &(cfg->monarch_timeout));
		}
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}
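
/*
 * Illustrative aside (not part of the kernel build): the numeric branch of
 * mcheck_enable() above accepts "mce=<tolerant>[,<monarch_timeout>]".  The
 * same two-number parse in plain C, with strtol() standing in for the
 * kernel's get_option():
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *str = "2,100";	/* as in mce=2,100 */
	char *end;
	int tolerant, monarch_timeout = -1;

	tolerant = strtol(str, &end, 10);
	if (*end == ',')
		monarch_timeout = strtol(end + 1, &end, 10);

	printf("tolerant=%d monarch_timeout=%d\n", tolerant, monarch_timeout);
	return 0;
}
#endif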

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

DEFINE_PER_CPU(struct device *, mce_device);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.ignore_ce = true;
		} else {
			/* enable ce features */
			mca_cfg.ignore_ce = false;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}
static ssize_t set_cmci_disabled(struct device *s,
                                 struct device_attribute *attr,
                                 const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        if (mca_cfg.cmci_disabled ^ !!new) {
                if (new) {
                        /* disable cmci */
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.cmci_disabled = true;
                } else {
                        /* enable cmci */
                        mca_cfg.cmci_disabled = false;
                        on_each_cpu(mce_enable_ce, NULL, 1);
                }
        }
        return size;
}

static ssize_t store_int_with_restart(struct device *s,
                                      struct device_attribute *attr,
                                      const char *buf, size_t size)
{
        ssize_t ret = device_store_int(s, attr, buf, size);
        mce_restart();
        return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        &dev_attr_trigger,
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
        kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_device_create(unsigned int cpu)
{
        struct device *dev;
        int err;
        int i, j;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        dev = kzalloc(sizeof *dev, GFP_KERNEL);
        if (!dev)
                return -ENOMEM;
        dev->id = cpu;
        dev->bus = &mce_subsys;
        dev->release = &mce_device_release;

        err = device_register(dev);
        if (err)
                return err;

        for (i = 0; mce_device_attrs[i]; i++) {
                err = device_create_file(dev, mce_device_attrs[i]);
                if (err)
                        goto error;
        }
        for (j = 0; j < mca_cfg.banks; j++) {
                err = device_create_file(dev, &mce_banks[j].attr);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = dev;

        return 0;
error2:
        while (--j >= 0)
                device_remove_file(dev, &mce_banks[j].attr);
error:
        while (--i >= 0)
                device_remove_file(dev, mce_device_attrs[i]);

        device_unregister(dev);

        return err;
}

static __cpuinit void mce_device_remove(unsigned int cpu)
{
        struct device *dev = per_cpu(mce_device, cpu);
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_device_attrs[i]; i++)
                device_remove_file(dev, mce_device_attrs[i]);

        for (i = 0; i < mca_cfg.banks; i++)
                device_remove_file(dev, &mce_banks[i].attr);

        device_unregister(dev);
        cpumask_clear_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = NULL;
}
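#if 0
/*
 * Minimal sketch, not compiled here, of the goto-unwind idiom that
 * mce_device_create() uses: on failure, remove only the files created so
 * far, in reverse order, before unregistering. create_one()/remove_one()
 * are placeholders rather than kernel APIs; the real function unwinds two
 * loops through its error2/error labels, but the shape is the same.
 */
static int create_one(int i) { return 0; }
static void remove_one(int i) { }

static int create_all(int n)
{
        int i, err = 0;

        for (i = 0; i < n; i++) {
                err = create_one(i);
                if (err)
                        goto error;
        }
        return 0;
error:
        while (--i >= 0)        /* undo only what succeeded */
                remove_one(i);
        return err;
}
#endif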
/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
        }
}

static void __cpuinit mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
        }
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                mce_device_create(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_device_remove(cpu);
                mce_intel_hcpu_update(cpu);
                break;
        case CPU_DOWN_PREPARE:
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                del_timer_sync(t);
                break;
        case CPU_DOWN_FAILED:
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                mce_start_timer(cpu, t);
                break;
        }

        if (action == CPU_POST_DEAD) {
                /* intentionally ignoring frozen here */
                cmci_rediscover();
        }

        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
        int i;

        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];
                struct device_attribute *a = &b->attr;

                sysfs_attr_init(&a->attr);
                a->attr.name = b->attrname;
                snprintf(b->attrname, ATTR_LEN, "bank%d", i);

                a->attr.mode = 0644;
                a->show = show_bank;
                a->store = set_bank;
        }
}

static __init int mcheck_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

        mce_init_banks();

        err = subsys_system_register(&mce_subsys, NULL);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_device_create(i);
                if (err)
                        return err;
        }

        register_syscore_ops(&mce_syscore_ops);
        register_hotcpu_notifier(&mce_cpu_notifier);

        /* register character device /dev/mcelog */
        misc_register(&mce_chrdev_device);

        return err;
}
device_initcall_sync(mcheck_init_device);
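#if 0
/*
 * Minimal sketch, not compiled here, of the notifier shape that
 * mce_cpu_callback() and register_hotcpu_notifier() above rely on.
 * my_online()/my_offline() are placeholders, not kernel APIs, and the
 * __cpuinit annotations match this file's era rather than later kernels,
 * which replaced this interface with cpuhp_setup_state().
 */
#include <linux/cpu.h>
#include <linux/notifier.h>

static void my_online(unsigned int cpu) { }
static void my_offline(unsigned int cpu) { }

static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
                                     unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        /* Mask out _FROZEN so suspend/resume looks like plain hotplug. */
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                my_online(cpu);
                break;
        case CPU_DEAD:
                my_offline(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block my_cpu_notifier __cpuinitdata = {
        .notifier_call = my_cpu_callback,
};

/* Registered from an initcall: register_hotcpu_notifier(&my_cpu_notifier); */
#endif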
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (!dmce)
                dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}

static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_paniced, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}

static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
        struct dentry *dmce, *ffake_panic;

        dmce = mce_get_debugfs_dir();
        if (!dmce)
                return -ENOMEM;
        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
                                          &fake_panic_fops);
        if (!ffake_panic)
                return -ENOMEM;

        return 0;
}
late_initcall(mcheck_debugfs_init);
#endif
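/*
 * Illustrative usage of the fake_panic control above (assumptions: debugfs
 * is mounted at /sys/kernel/debug, and the writer holds CAP_DAC_OVERRIDE,
 * since the file is created 0444 even though fake_panic_set() is wired up):
 *
 *   cat /sys/kernel/debug/mce/fake_panic        # read the current value
 *   echo 1 > /sys/kernel/debug/mce/fake_panic   # turn MCE panics into
 *                                               # logged fake panics
 *
 * Writing also runs mce_reset(), rearming the rendezvous counters
 * (mce_executing, mce_callin, global_nwo) for the next injection test.
 */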