mce.c revision 89cbc76768c2fa4ed95545bf961f3a14ddfeed21
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

/* User mode helper program triggered by machine check event */
static unsigned long		mce_need_notify;
static char			mce_helper[128];
static char			*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;

/* CMCI storm detection filter */
static DEFINE_PER_CPU(unsigned long, mce_polled_error);

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static DEFINE_PER_CPU(struct work_struct, mce_work);

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up, discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void drain_mcelog_buffer(void)
{
	unsigned int next, i, prev = 0;

	next = ACCESS_ONCE(mcelog.next);

	do {
		struct mce *m;

		/* drain what was logged during boot */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			unsigned retries = 1;

			m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2*retries))
					retries++;

				cpu_relax();

				if (!m->finished && retries >= 4) {
					pr_err("skipping error being logged currently!\n");
					break;
				}
			}
			smp_rmb();
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
		}

		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
}


void mce_register_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
	drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
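/*
 * Example (illustrative sketch, not part of this file): a minimal decoder
 * in the style of the EDAC drivers that use this API. The notifier runs in
 * atomic context; returning NOTIFY_STOP would suppress the default
 * "run mcelog --ascii" hint in print_mce() below. All names here are
 * hypothetical.
 */
#if 0
static int example_mce_decode(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	pr_info("decoded MCE: CPU %d bank %d status 0x%llx\n",
		m->extcpu, m->bank, m->status);
	return NOTIFY_DONE;	/* let other decoders see the record too */
}

static struct notifier_block example_mce_dec = {
	.notifier_call	= example_mce_decode,
};

/* module init: mce_register_decode_chain(&example_mce_dec);   */
/* module exit: mce_unregister_decode_chain(&example_mce_dec); */
#endif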
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
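/*
 * Example (illustrative sketch, not part of this file): roughly how an
 * error-injection module such as mce-inject exercises these wrappers. With
 * injectm.finished set on a CPU, mce_rdmsrl()/mce_wrmsrl() are redirected
 * to the per-cpu injectm record instead of the real MSRs, so the normal
 * handlers "see" a fake error. Details here are simplified and
 * hypothetical; see arch/x86/kernel/cpu/mcheck/mce-inject.c for the real
 * thing.
 */
#if 0
static void example_inject_fake_status(u64 status, int bank)
{
	struct mce *i = this_cpu_ptr(&injectm);

	mce_setup(i);
	i->status = status;	/* e.g. MCI_STATUS_VAL | some error code */
	i->bank = bank;
	i->finished = 1;	/* arm the wrappers */

	/* mce_rdmsrl(MSR_IA32_MCx_STATUS(bank)) now returns 'status': */
	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));

	i->finished = 0;	/* back to real MSR accesses */
}
#endif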
/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}

/*
 * Simple lockless ring to communicate PFNs from the exception handler to the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = this_cpu_ptr(&mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = this_cpu_ptr(&mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = this_cpu_ptr(&mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty())
		schedule_work(this_cpu_ptr(&mce_work));
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(this_cpu_ptr(&mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}
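/*
 * Worked example (editorial): if MCI_MISC reports LSB = 12, the low 12
 * address bits are not valid, so a reported address of 0x12345fff is
 * masked down to the 4K page boundary by the shift pair above:
 *
 *	shift = 12;
 *	addr  = 0x12345fffULL;
 *	addr >>= shift;		// 0x12345
 *	addr <<= shift;		// 0x12345000
 *
 * which is exactly the page-granular address that mce_usable_address()
 * and the recovery code below expect.
 */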
DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		this_cpu_write(mce_polled_error, 1);
		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
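/*
 * Example (illustrative sketch, not part of this file): how in-kernel
 * callers typically use machine_check_poll(). The periodic timer below
 * polls the banks left in mce_poll_banks; the Intel CMCI code polls the
 * banks it owns from its interrupt handler in much the same way. The
 * all-banks mask here is hypothetical.
 */
#if 0
static void example_poll_all_banks(void)
{
	mce_banks_t banks;

	bitmap_fill(banks, MAX_NR_BANKS);
	/* Log any corrected events found, with timestamps: */
	machine_check_poll(MCP_TIMESTAMP, &banks);
}
#endif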
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	int i, ret = 0;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL) {
			__set_bit(i, validp);
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
		}
		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
			ret = 1;
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * seen by only one CPU before being cleared, avoiding
		 * duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save the faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process()
 */
#define	MCE_INFO_MAX	16

struct mce_info {
	atomic_t		inuse;
	struct task_struct	*t;
	__u64			paddr;
	int			restartable;
} mce_info[MCE_INFO_MAX];

static void mce_save_info(__u64 addr, int c)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			mi->restartable = c;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mca_config *cfg = &mca_cfg;
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	this_cpu_inc(mce_exception_count);

	if (!cfg->banks)
		goto out;

	mce_gather_info(&m, regs);

	final = this_cpu_ptr(&mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);

	barrier();

	/*
	 * If there is no restart IP we might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signaled are
		 * handled by machine_check_poll. Leave them alone, unless
		 * this panics.
		 */
		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(&m, cfg->tolerant, NULL);

		/*
		 * If this event is for a corrected error it belongs to the
		 * poll handler; don't touch it here, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (cfg->tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			/* schedule action before return to userland */
			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
			set_thread_flag(TIF_MCE_NOTIFY);
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Called in process context that was interrupted by an MCE and marked with
 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
 * This code is allowed to sleep.
 * Attempt possible recovery such as calling the high level VM handler to
 * process any corrupted pages, and kill/signal current process if required.
 * Action required errors are handled here.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	struct mce_info *mi = mce_find_info();
	int flags = MF_ACTION_REQUIRED;

	if (!mi)
		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
	pfn = mi->paddr >> PAGE_SHIFT;

	clear_thread_flag(TIF_MCE_NOTIFY);

	pr_err("Uncorrected hardware memory error in user-access at %llx",
		 mi->paddr);
	/*
	 * We must call memory_failure() here even if the current process is
	 * doomed. We still need to mark the page as poisoned and alert any
	 * other users of the page.
	 */
	if (!mi->restartable)
		flags |= MF_MUST_KILL;
	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
		pr_err("Memory error not recovered");
		force_sig(SIGBUS, current);
	}
	mce_clear_info(mi);
}

/*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
 * placed into the "ring").
 */
static void mce_process_work(struct work_struct *dummy)
{
	unsigned long pfn;

	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR, 0);
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
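/*
 * Example (illustrative sketch, not part of this file): the caller side of
 * mce_log_therm_throt_event() as the thermal interrupt might use it --
 * read the thermal status MSR, decide the event is worth logging, and hand
 * the raw register value in. See therm_throt.c for the real call site;
 * this condensed version is editorial.
 */
#if 0
static void example_log_thermal_event(void)
{
	u64 status;

	rdmsrl(MSR_IA32_THERM_STATUS, status);
	if (status & THERM_STATUS_PROCHOT)
		mce_log_therm_throt_event(status);
}
#endif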
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) =
	mce_adjust_timer_default;

static int cmc_error_seen(void)
{
	unsigned long *v = this_cpu_ptr(&mce_polled_error);

	return test_and_clear_bit(0, v);
}

static void mce_timer_fn(unsigned long data)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	unsigned long iv;
	int notify;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP,
				this_cpu_ptr(&mce_poll_banks));
		mce_intel_cmci_poll();
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	iv = __this_cpu_read(mce_next_interval);
	notify = mce_notify_irq();
	notify |= cmc_error_seen();
	if (notify) {
		iv = max(iv / 2, (unsigned long) HZ/100);
	} else {
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
		iv = mce_adjust_timer(iv);
	}
	__this_cpu_write(mce_next_interval, iv);
	/* Might have become 0 after CMCI storm subsided */
	if (iv) {
		t->expires = jiffies + iv;
		add_timer_on(t, smp_processor_id());
	}
}

/*
 * Ensure that the timer is firing in @interval from now.
 */
void mce_timer_kick(unsigned long interval)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	unsigned long when = jiffies + interval;
	unsigned long iv = __this_cpu_read(mce_next_interval);

	if (timer_pending(t)) {
		if (time_before(when, t->expires))
			mod_timer_pinned(t, when);
	} else {
		t->expires = round_jiffies(when);
		add_timer_on(t, smp_processor_id());
	}
	if (interval < iv)
		__this_cpu_write(mce_next_interval, interval);
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		/* wake processes polling /dev/mcelog */
		wake_up_interruptible(&mce_chrdev_wait);

		if (mce_helper[0])
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __mcheck_cpu_mce_banks_init(void)
{
	int i;
	u8 num_banks = mca_cfg.banks;

	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;

	for (i = 0; i < num_banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!mca_cfg.banks)
		pr_info("CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		pr_warn("Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
	mca_cfg.banks = b;

	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mca_cfg.ser = true;

	return 0;
}

static void __mcheck_cpu_init_generic(void)
{
	enum mcp_flags m_fl = 0;
	mce_banks_t all_banks;
	u64 cap;
	int i;

	if (!mca_cfg.bootlog)
		m_fl = MCP_DONTLOG;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC | m_fl, &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 0)
		return;
	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
		return;
	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
			  MCACOD)) !=
			 (MCI_STATUS_UC|MCI_STATUS_EN|
			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
			  MCI_STATUS_AR|MCACOD_INSTR))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}

/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	struct mca_config *cfg = &mca_cfg;

	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("unknown CPU type - not enabling MCE support\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && cfg->banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && cfg->bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			cfg->bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && cfg->banks > 0)
			mce_banks[0].ctl = 0;

		/*
		 * Turn off MC4_MISC thresholding banks on those models since
		 * they're not supported there.
		 */
		if (c->x86 == 0x15 &&
		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
			int i;
			u64 val, hwcr;
			bool need_toggle;
			u32 msrs[] = {
				0x00000413, /* MC4_MISC0 */
				0xc0000408, /* MC4_MISC1 */
			};

			rdmsrl(MSR_K7_HWCR, hwcr);

			/* McStatusWrEn has to be set */
			need_toggle = !(hwcr & BIT(18));

			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

			for (i = 0; i < ARRAY_SIZE(msrs); i++) {
				rdmsrl(msrs[i], val);

				/* CntP bit set? */
				if (val & BIT_64(62)) {
					val &= ~BIT_64(62);
					wrmsrl(msrs[i], val);
				}
			}

			/* restore old settings */
			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr);
		}
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			cfg->monarch_timeout < 0)
			cfg->monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
			cfg->bootlog = 0;

		if (c->x86 == 6 && c->x86_model == 45)
			quirk_no_way_out = quirk_sandybridge_ifu;
	}
	if (cfg->monarch_timeout < 0)
		cfg->monarch_timeout = 0;
	if (cfg->bootlog != 0)
		cfg->panic_timeout = 30;

	return 0;
}

static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return 0;

	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		return 1;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		return 1;
	}

	return 0;
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		mce_adjust_timer = mce_intel_adjust_timer;
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
	unsigned long iv = check_interval * HZ;

	if (mca_cfg.ignore_ce || !iv)
		return;

	per_cpu(mce_next_interval, cpu) = iv;

	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, cpu);
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	unsigned int cpu = smp_processor_id();

	setup_timer(t, mce_timer_fn, cpu);
	mce_start_timer(cpu, t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mca_cfg.disabled = true;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */
static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * Ideally we should clear the record once it has been flushed to
	 * disk or sent over the network by /sbin/mcelog, but we have no
	 * interface to support that now, so just clear it here to avoid
	 * duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
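/*
 * Example (illustrative sketch, not part of this file): how a userspace
 * consumer such as mcelog talks to this device. The record/log lengths
 * are queried first so that the read buffer covers a full MCE_LOG_LEN
 * read, which mce_chrdev_read() requires. Error handling is omitted and
 * the code is editorial, not the actual mcelog source.
 */
#if 0
	/* userspace: */
	int fd = open("/dev/mcelog", O_RDONLY);
	int recordlen, loglen;

	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
	ioctl(fd, MCE_GET_LOG_LEN, &loglen);

	char *buf = malloc(recordlen * loglen);
	ssize_t n = read(fd, buf, recordlen * loglen); /* full reads only */
#endif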
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
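/*
 * Example (illustrative sketch, not part of this file): the mce-inject
 * module uses this hook to accept injected records written to /dev/mcelog.
 * A minimal callback might look like the following; the name and the
 * handling are hypothetical, see mce-inject.c for the real implementation.
 */
#if 0
static ssize_t example_mce_write(struct file *filp, const char __user *ubuf,
				 size_t usize, loff_t *off)
{
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;
	if (copy_from_user(&m, ubuf, sizeof(struct mce)))
		return -EFAULT;

	/* ... inject 'm' on m.extcpu ... */

	return sizeof(struct mce);
}

/* module init: register_mce_write_callback(example_mce_write); */
#endif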
ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
			 size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

static void __mce_disable_bank(void *arg)
{
	int bank = *((int *)arg);
	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
	if (bank >= mca_cfg.banks) {
		pr_warn(FW_BUG
			"Ignoring request to disable invalid MCA bank %d.\n",
			bank);
		return;
	}
	set_bit(bank, mce_banks_ce_disabled);
	on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
 * mce=off			Disables machine check
 * mce=no_cmci			Disables CMCI
 * mce=dont_log_ce		Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce		Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog			Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog		Don't log MCEs from before booting.
 * mce=bios_cmci_threshold	Don't program the CMCI threshold
 */
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;

	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = true;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = true;
	else if (isdigit(str[0])) {
		get_option(&str, &(cfg->tolerant));
		if (*str == ',') {
			++str;
			get_option(&str, &(cfg->monarch_timeout));
		}
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
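/*
 * Example command lines (editorial, derived from the parser above):
 *
 *	mce=off			turn off machine checks entirely
 *	mce=2,500000		tolerant=2, monarch_timeout=500000 usec
 *	mce=nobootlog		don't log boot-time leftovers
 *
 * A bare "mce" with no '=' only calls enable_p5_mce().
 */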
int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

DEFINE_PER_CPU(struct device *, mce_device);

void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.ignore_ce = true;
		} else {
			/* enable ce features */
			mca_cfg.ignore_ce = false;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.cmci_disabled = true;
		} else {
			/* enable cmci */
			mca_cfg.cmci_disabled = false;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};
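/*
 * Example (editorial): with mce_subsys registered by mcheck_init_device()
 * below, these attributes appear per CPU under sysfs, e.g. on CPU 0:
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/bank0 ... bankN
 *
 * Writing check_interval goes through store_int_with_restart() and thus
 * reprograms the polling timers on all CPUs via mce_restart().
 */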

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.cmci_disabled = true;
		} else {
			/* enable cmci */
			mca_cfg.cmci_disabled = false;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof *dev, GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id  = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < mca_cfg.banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < mca_cfg.banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}
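
/*
 * Sketch (illustrative values) of the per-bank control files created
 * by mce_device_create() above:
 *
 *	# cat /sys/devices/system/machinecheck/machinecheck0/bank0
 *	ffffffffffffffff
 *	# echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank0
 *
 * The write lands in the shared mce_banks[] array and mce_restart()
 * reinitializes every CPU afterwards, so a write through any one
 * machinecheckN directory affects all CPUs.
 */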

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		mce_intel_hcpu_update(cpu);

		/* intentionally ignoring frozen here */
		if (!(action & CPU_TASKS_FROZEN))
			cmci_rediscover();
		break;
	case CPU_DOWN_PREPARE:
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		del_timer_sync(t);
		break;
	case CPU_DOWN_FAILED:
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		mce_start_timer(cpu, t);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}
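
/*
 * Rough lifecycle sketch for the hotplug notifier above:
 * CPU_DOWN_PREPARE quiesces the outgoing CPU (banks disabled, polling
 * timer deleted) and CPU_DOWN_FAILED undoes that; CPU_ONLINE/CPU_DEAD
 * only manage the sysfs device and CMCI rediscovery, since the
 * low-level MCE setup for an incoming CPU happens on the CPU bring-up
 * path itself, not in this notifier.
 */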

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data)) {
		err = -EIO;
		goto err_out;
	}

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_out;
	}

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		goto err_out_mem;

	cpu_notifier_register_begin();
	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err) {
			/*
			 * Register the notifier anyway (and do not
			 * unregister it) so that we don't leave undeleted
			 * timers; see the notifier callback above.
			 */
			__register_hotcpu_notifier(&mce_cpu_notifier);
			cpu_notifier_register_done();
			goto err_device_create;
		}
	}

	__register_hotcpu_notifier(&mce_cpu_notifier);
	cpu_notifier_register_done();

	register_syscore_ops(&mce_syscore_ops);

	/* register character device /dev/mcelog */
	err = misc_register(&mce_chrdev_device);
	if (err)
		goto err_register;

	return 0;

err_register:
	unregister_syscore_ops(&mce_syscore_ops);

err_device_create:
	/*
	 * We didn't keep track of which devices were created above, but
	 * even if we had, the set of online CPUs might have changed.
	 * Play it safe and remove for every possible CPU, since
	 * mce_device_remove() will do the right thing.
	 */
	for_each_possible_cpu(i)
		mce_device_remove(i);

err_out_mem:
	free_cpumask_var(mce_device_initialized);

err_out:
	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);

	return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = true;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif
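
/*
 * Illustrative use of the knob above (path assumes debugfs is mounted
 * at /sys/kernel/debug):
 *
 *	# echo 1 > /sys/kernel/debug/mce/fake_panic
 *	# cat /sys/kernel/debug/mce/fake_panic
 *	1
 *
 * With fake_panic set, mce_panic() elsewhere in this file prints the
 * would-be panic message and returns instead of bringing the machine
 * down; fake_panic_set() calls mce_reset() first so the global
 * rendezvous counters start out clean. Intended for MCE injection
 * testing. The file is created 0444, so writes depend on root's
 * ability to override file modes.
 */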