mce.c revision 82a8f131aadf55ac7fbc8c6f65f34d83101160de
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};
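/*
 * Illustrative note (editor's sketch): these defaults correspond to the
 * "mce=" boot parameter parsed by mcheck_enable() below, e.g.
 *
 *	mce=2,500000
 *
 * would set .tolerant = 2 and .monarch_timeout = 500000 usec (0.5s;
 * mce_start() converts the value with NSEC_PER_USEC).
 */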
/* User mode helper program triggered by machine check event */
static unsigned long		mce_need_notify;
static char			mce_helper[128];
static char			*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static DEFINE_PER_CPU(struct work_struct, mce_work);

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void drain_mcelog_buffer(void)
{
	unsigned int next, i, prev = 0;

	next = ACCESS_ONCE(mcelog.next);

	do {
		struct mce *m;

		/* drain what was logged during boot */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			unsigned retries = 1;

			m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2*retries))
					retries++;

				cpu_relax();

				if (!m->finished && retries >= 4) {
					pr_err("skipping error being logged currently!\n");
					break;
				}
			}
			smp_rmb();
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
		}

		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
}

void mce_register_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
	drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
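/*
 * Example (editor's illustrative sketch, not part of this file): an EDAC
 * driver consumes MCEs by hooking the decode chain. The callback name and
 * body below are hypothetical:
 *
 *	static int my_mce_decode(struct notifier_block *nb, unsigned long val,
 *				 void *data)
 *	{
 *		struct mce *m = data;
 *
 *		pr_info("decoded: bank %d status %llx\n", m->bank, m->status);
 *		return NOTIFY_STOP;
 *	}
 *
 *	static struct notifier_block my_mce_nb = {
 *		.notifier_call = my_mce_decode,
 *	};
 *	...
 *	mce_register_decode_chain(&my_mce_nb);
 *
 * Returning NOTIFY_STOP suppresses both the mcelog buffer entry in
 * mce_log() and the "mcelog --ascii" hint in print_mce().
 */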
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}
/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}
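/*
 * Example (editor's illustrative sketch): this is roughly how an injector
 * such as the mce-inject module uses the wrappers above. It fills the
 * per-CPU "injectm" record and sets ->finished, after which
 * mce_rdmsrl()/mce_wrmsrl() are redirected from the real MSRs to the
 * injected values:
 *
 *	struct mce m;
 *
 *	mce_setup(&m);
 *	m.bank = 1;
 *	m.status = MCI_STATUS_VAL | MCI_STATUS_UC;
 *	__get_cpu_var(injectm) = m;
 *	__get_cpu_var(injectm).finished = 1;
 *
 * A subsequent mce_rdmsrl(MSR_IA32_MCx_STATUS(1)) then returns m.status
 * instead of touching hardware.
 */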
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty())
		schedule_work(&__get_cpu_var(mce_work));
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);
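/*
 * Worked example (editor's note): if MCI_MISC reports LSB = 12, the error
 * granularity is one 4K page, so a reported address like 0x12345678 is
 * masked to 0x12345000 by the shift pair in mce_read_aux() above.
 */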
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	int i, ret = 0;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL) {
			__set_bit(i, validp);
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
		}
		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
			ret = 1;
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (mca_cfg.tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}
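/*
 * Editor's summary of the rendezvous above (illustrative, two CPUs, no
 * timeouts):
 *
 *	CPU A (callin order 1, Monarch)     CPU B (callin order 2, Subject)
 *	mce_start() -> order = 1            mce_start() spins until
 *	scans its banks                       mce_executing reaches 2
 *	mce_end(): increment releases B,    scans its banks, then mce_end()
 *	  waits for all Subjects              increments mce_executing
 *	mce_reign() grades mces_seen[]      spins until mce_executing == 0
 *	resets global state                 returns
 */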
/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save the faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process().
 */
#define	MCE_INFO_MAX	16

struct mce_info {
	atomic_t		inuse;
	struct task_struct	*t;
	__u64			paddr;
	int			restartable;
} mce_info[MCE_INFO_MAX];

static void mce_save_info(__u64 addr, int c)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			mi->restartable = c;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}
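/*
 * Editor's note on the lifecycle of an mce_info slot (illustrative):
 *
 *	do_machine_check()                  mce_notify_process()
 *	  mce_save_info(m.addr, ...)          mi = mce_find_info();
 *	  set_thread_flag(TIF_MCE_NOTIFY)     memory_failure(mi->paddr >> PAGE_SHIFT, ...);
 *	                                      mce_clear_info(mi);
 *
 * The slot is keyed on "current", so the task that hit the error picks it
 * back up on its way out to user space.
 */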
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mca_config *cfg = &mca_cfg;
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	this_cpu_inc(mce_exception_count);

	if (!cfg->banks)
		goto out;

	mce_gather_info(&m, regs);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);

	barrier();

	/*
	 * When there is no restart IP we might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signaled are
		 * handled by machine_check_poll(). Leave them alone,
		 * unless this panics.
		 */
		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(&m, cfg->tolerant, NULL);

		/*
		 * If this machine check is for an error that the corrected-
		 * error poller handles, don't touch it unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;
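	/*
	 * Editor's note, summarizing the recovery decision below
	 * (illustrative, not exhaustive):
	 *
	 *	no_way_out			-> mce_panic()
	 *	worst == MCE_AR_SEVERITY	-> recover via TIF_MCE_NOTIFY
	 *	kill_it (no restart IP)		-> SIGBUS to current
	 *	otherwise			-> log only
	 *
	 * tolerant >= 3 suppresses all of these actions.
	 */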
	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (cfg->tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			/* schedule action before return to userland */
			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
			set_thread_flag(TIF_MCE_NOTIFY);
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Called in the process context that was interrupted by the MCE and marked
 * with TIF_MCE_NOTIFY, just before returning to the erroneous userland.
 * This code is allowed to sleep.
 * Attempt possible recovery such as calling the high level VM handler to
 * process any corrupted pages, and kill/signal the current process if
 * required. Action required errors are handled here.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	struct mce_info *mi = mce_find_info();
	int flags = MF_ACTION_REQUIRED;

	if (!mi)
		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
	pfn = mi->paddr >> PAGE_SHIFT;

	clear_thread_flag(TIF_MCE_NOTIFY);

	pr_err("Uncorrected hardware memory error in user-access at %llx",
		 mi->paddr);
	/*
	 * We must call memory_failure() here even if the current process is
	 * doomed. We still need to mark the page as poisoned and alert any
	 * other users of the page.
	 */
	if (!mi->restartable)
		flags |= MF_MUST_KILL;
	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
		pr_err("Memory error not recovered");
		force_sig(SIGBUS, current);
	}
	mce_clear_info(mi);
}

/*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
 * placed into the "ring").
 */
static void mce_process_work(struct work_struct *dummy)
{
	unsigned long pfn;

	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR, 0);
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
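/*
 * Example (editor's hedged sketch): the Intel thermal interrupt in
 * therm_throt.c is the expected caller, roughly:
 *
 *	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 *	if (<throttling event worth logging>)
 *		mce_log_therm_throt_event(msr_val);
 */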
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) =
	mce_adjust_timer_default;

static void mce_timer_fn(unsigned long data)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	unsigned long iv;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(__this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
		mce_intel_cmci_poll();
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	iv = __this_cpu_read(mce_next_interval);
	if (mce_notify_irq()) {
		iv = max(iv / 2, (unsigned long) HZ/100);
	} else {
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
		iv = mce_adjust_timer(iv);
	}
	__this_cpu_write(mce_next_interval, iv);
	/* Might have become 0 after CMCI storm subsided */
	if (iv) {
		t->expires = jiffies + iv;
		add_timer_on(t, smp_processor_id());
	}
}

/*
 * Ensure that the timer is firing in @interval from now.
 */
void mce_timer_kick(unsigned long interval)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	unsigned long when = jiffies + interval;
	unsigned long iv = __this_cpu_read(mce_next_interval);

	if (timer_pending(t)) {
		if (time_before(when, t->expires))
			mod_timer_pinned(t, when);
	} else {
		t->expires = round_jiffies(when);
		add_timer_on(t, smp_processor_id());
	}
	if (interval < iv)
		__this_cpu_write(mce_next_interval, interval);
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
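/*
 * Usage example (editor's sketch): mce_helper is set through the "trigger"
 * sysfs attribute defined further down, e.g. from a shell:
 *
 *	echo /usr/local/bin/my-mce-hook > \
 *		/sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * The hook path and name are hypothetical; the program is then run via
 * call_usermodehelper() whenever mce_notify_irq() sees new events.
 */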
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		/* wake processes polling /dev/mcelog */
		wake_up_interruptible(&mce_chrdev_wait);

		if (mce_helper[0])
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __mcheck_cpu_mce_banks_init(void)
{
	int i;
	u8 num_banks = mca_cfg.banks;

	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;

	for (i = 0; i < num_banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!mca_cfg.banks)
		pr_info("CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		pr_warn("Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
	mca_cfg.banks = b;

	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mca_cfg.ser = true;

	return 0;
}

static void __mcheck_cpu_init_generic(void)
{
	enum mcp_flags m_fl = 0;
	mce_banks_t all_banks;
	u64 cap;
	int i;

	if (!mca_cfg.bootlog)
		m_fl = MCP_DONTLOG;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC | m_fl, &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}
/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 0)
		return;
	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
		return;
	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
			  MCACOD)) !=
			 (MCI_STATUS_UC|MCI_STATUS_EN|
			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
			  MCI_STATUS_AR|MCACOD_INSTR))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}

/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	struct mca_config *cfg = &mca_cfg;

	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("unknown CPU type - not enabling MCE support\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && cfg->banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && cfg->bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			cfg->bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && cfg->banks > 0)
			mce_banks[0].ctl = 0;

		/*
		 * Turn off MC4_MISC thresholding banks on those models since
		 * they're not supported there.
		 */
		if (c->x86 == 0x15 &&
		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
			int i;
			u64 val, hwcr;
			bool need_toggle;
			u32 msrs[] = {
				0x00000413, /* MC4_MISC0 */
				0xc0000408, /* MC4_MISC1 */
			};

			rdmsrl(MSR_K7_HWCR, hwcr);

			/* McStatusWrEn has to be set */
			need_toggle = !(hwcr & BIT(18));

			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

			for (i = 0; i < ARRAY_SIZE(msrs); i++) {
				rdmsrl(msrs[i], val);

				/* CntP bit set? */
				if (val & BIT_64(62)) {
					val &= ~BIT_64(62);
					wrmsrl(msrs[i], val);
				}
			}

			/* restore old settings */
			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr);
		}
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			cfg->monarch_timeout < 0)
			cfg->monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
			cfg->bootlog = 0;

		if (c->x86 == 6 && c->x86_model == 45)
			quirk_no_way_out = quirk_sandybridge_ifu;
	}
	if (cfg->monarch_timeout < 0)
		cfg->monarch_timeout = 0;
	if (cfg->bootlog != 0)
		cfg->panic_timeout = 30;

	return 0;
}

static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return 0;

	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		return 1;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		return 1;
	}

	return 0;
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		mce_adjust_timer = mce_intel_adjust_timer;
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
	unsigned long iv = check_interval * HZ;

	if (mca_cfg.ignore_ce || !iv)
		return;

	per_cpu(mce_next_interval, cpu) = iv;

	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, cpu);
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	unsigned int cpu = smp_processor_id();

	setup_timer(t, mce_timer_fn, cpu);
	mce_start_timer(cpu, t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mca_cfg.disabled = true;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}
/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */

static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}
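/*
 * Userspace usage sketch (editor's illustration of the read contract
 * enforced in mce_chrdev_read() below): only full reads from offset 0 are
 * supported, so a minimal consumer looks roughly like:
 *
 *	struct mce buf[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf));
 *
 * n is a multiple of sizeof(struct mce); records are cleared as they are
 * read out.
 */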
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
			 size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
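/*
 * ioctl usage sketch (editor's illustration): a reader can size its buffer
 * at run time instead of hard-coding struct mce, e.g.
 *
 *	int recordlen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *
 * and then read loglen * recordlen bytes; mce_chrdev_ioctl() requires
 * CAP_SYS_ADMIN.
 */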
static void __mce_disable_bank(void *arg)
{
	int bank = *((int *)arg);
	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
	cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
	if (bank >= mca_cfg.banks) {
		pr_warn(FW_BUG
			"Ignoring request to disable invalid MCA bank %d.\n",
			bank);
		return;
	}
	set_bit(bank, mce_banks_ce_disabled);
	on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 */
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;

	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = true;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = true;
	else if (isdigit(str[0])) {
		get_option(&str, &(cfg->tolerant));
		if (*str == ',') {
			++str;
			get_option(&str, &(cfg->monarch_timeout));
		}
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}
/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

DEFINE_PER_CPU(struct device *, mce_device);

void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.ignore_ce = true;
		} else {
			/* enable ce features */
			mca_cfg.ignore_ce = false;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}
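/*
 * Sysfs usage sketch (editor's illustration): ignore_ce and cmci_disabled
 * are exposed per CPU under the "machinecheck" subsystem, e.g.
 *
 *	echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *
 * stops both the polling timer and CMCI for corrected errors.
 */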

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.cmci_disabled = true;
		} else {
			/* enable cmci */
			mca_cfg.cmci_disabled = false;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per-CPU device init. All of the CPUs still share the same bank control values: */
static int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < mca_cfg.banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < mca_cfg.banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}
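
/*
 * For reference (assuming the standard sysfs mount): after
 * mce_device_create(cpu) the attributes above appear as
 *
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/tolerant
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/check_interval
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/trigger
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/monarch_timeout
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/dont_log_ce
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/ignore_ce
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/cmci_disabled
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/bank0 ... bank<N-1>
 *
 * The bank files are set up by mce_init_banks() below. Note that although
 * the files exist per CPU, they all store through global state (mca_cfg,
 * check_interval, mce_helper, mce_banks[].ctl), so a write on any CPU's
 * directory affects every CPU.
 */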

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		mce_intel_hcpu_update(cpu);
		break;
	case CPU_DOWN_PREPARE:
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		del_timer_sync(t);
		break;
	case CPU_DOWN_FAILED:
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		mce_start_timer(cpu, t);
		break;
	}

	if (action == CPU_POST_DEAD) {
		/* intentionally ignoring frozen here */
		cmci_rediscover();
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name = b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL))
		return -ENOMEM;

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		return err;

	cpu_notifier_register_begin();
	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err) {
			cpu_notifier_register_done();
			return err;
		}
	}

	register_syscore_ops(&mce_syscore_ops);
	__register_hotcpu_notifier(&mce_cpu_notifier);
	cpu_notifier_register_done();

	/* register character device /dev/mcelog */
	err = misc_register(&mce_chrdev_device);

	return err;
}
device_initcall_sync(mcheck_init_device);
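
/*
 * For illustration only (hence #if 0), not part of the original file:
 * mce_cpu_callback() above can be exercised through the generic CPU
 * hotplug interface. Offlining a CPU walks CPU_DOWN_PREPARE (banks
 * cleared, timer stopped), CPU_DEAD (sysfs device removed) and
 * CPU_POST_DEAD (CMCI banks rediscovered); onlining runs CPU_ONLINE
 * and mce_device_create(). The CPU number used here is an assumption.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int cpu_set_online(int cpu, const char *val)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/online", cpu);
	fd = open(path, O_WRONLY);	/* needs root; cpu0 may lack this file */
	if (fd < 0)
		return -1;
	if (write(fd, val, 1) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	if (cpu_set_online(1, "0"))		/* DOWN_PREPARE, DEAD, POST_DEAD */
		perror("offline cpu1");
	else if (cpu_set_online(1, "1"))	/* CPU_ONLINE */
		perror("online cpu1");
	return 0;
}
#endif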

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = true;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif
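
/*
 * Usage note (an assumption that debugfs is mounted at /sys/kernel/debug):
 * reading /sys/kernel/debug/mce/fake_panic reports the current setting;
 * writing a non-zero value makes a subsequent machine-check panic print a
 * "Fake kernel panic" message instead of really panicking, and each write
 * also goes through mce_reset() so the monarch/rendezvous counters start
 * clean for the next injected error. Despite the 0444 mode, root can
 * typically still open the file for writing via CAP_DAC_OVERRIDE.
 */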