mce.c revision f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
               smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;

int mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR       227

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int              tolerant = 1;
static int              banks;
static u64              *bank;
static unsigned long    notify_user;
static int              rip_msr;
static int              mce_bootlog = -1;

static char             trigger[128];
static char             *trigger_argv[2] = { trigger, NULL };

static unsigned long    dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
        return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
        memset(m, 0, sizeof(struct mce));
        m->cpu = m->extcpu = smp_processor_id();
        rdtscll(m->tsc);
        /* We hope get_seconds stays lockless */
        m->time = get_seconds();
        m->cpuvendor = boot_cpu_data.x86_vendor;
        m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
        m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
        m->apicid = cpu_data(m->extcpu).initial_apicid;
        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
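
/*
 * Editor's note: the per-CPU injectm record above is the hook for
 * software error injection.  When injectm.finished is set, the
 * mce_rdmsrl()/mce_wrmsrl() wrappers below redirect MSR accesses into
 * this struct instead of touching real hardware registers, so fake
 * records can be fed through the normal handler paths.
 */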

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
        .signature      = MCE_LOG_SIGNATURE,
        .len            = MCE_LOG_LEN,
        .recordlen      = sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
        unsigned next, entry;

        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /*
                         * When the buffer fills up discard new entries.
                         * Assume that the earlier errors are the more
                         * interesting ones:
                         */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip: */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        mce->finished = 1;
        set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG "HARDWARE ERROR\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->extcpu, m->mcgstatus, m->bank, m->status);
        if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
                printk("ADDR %llx ", m->addr);
        if (m->misc)
                printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
               m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
               "and contact your hardware vendor\n");
}
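
/*
 * Editor's note: illustrative console output from print_mce(); the
 * values and spacing are made up, only the shape follows the format
 * strings above:
 *
 *      HARDWARE ERROR
 *      CPU 1: Machine Check Exception: 4 Bank 4: b200000000070f0f
 *      TSC 4eef53cf4b83a
 *      PROCESSOR 0:10676 TIME 1243891438 SOCKET 0 APIC 1
 *      This is not a software problem!
 *      Run through mcelog --ascii to decode and contact your hardware vendor
 */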

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
        long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

        preempt_disable();
        local_irq_enable();
        while (timeout-- > 0)
                udelay(1);
        panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
        int i;

        /*
         * Make sure only one CPU runs in machine check panic
         */
        if (atomic_add_return(1, &mce_paniced) > 1)
                wait_for_panic();
        barrier();

        bust_spinlocks(1);
        console_verbose();
        /* First print corrected ones that are still unlogged */
        for (i = 0; i < MCE_LOG_LEN; i++) {
                struct mce *m = &mcelog.entry[i];
                if ((m->status & MCI_STATUS_VAL) &&
                    !(m->status & MCI_STATUS_UC))
                        print_mce(m);
        }
        /* Now print uncorrected but with the final one last */
        for (i = 0; i < MCE_LOG_LEN; i++) {
                struct mce *m = &mcelog.entry[i];
                if (!(m->status & MCI_STATUS_VAL))
                        continue;
                if (!final || memcmp(m, final, sizeof(struct mce)))
                        print_mce(m);
        }
        if (final)
                print_mce(final);
        if (exp)
                printk(KERN_EMERG "Machine check: %s\n", exp);
        panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
        unsigned bank = __get_cpu_var(injectm.bank);

        if (msr == rip_msr)
                return offsetof(struct mce, ip);
        if (msr == MSR_IA32_MC0_STATUS + bank*4)
                return offsetof(struct mce, status);
        if (msr == MSR_IA32_MC0_ADDR + bank*4)
                return offsetof(struct mce, addr);
        if (msr == MSR_IA32_MC0_MISC + bank*4)
                return offsetof(struct mce, misc);
        if (msr == MSR_IA32_MCG_STATUS)
                return offsetof(struct mce, mcgstatus);
        return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
        u64 v;

        if (__get_cpu_var(injectm).finished) {
                int offset = msr_to_offset(msr);

                if (offset < 0)
                        return 0;
                return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
        }
        rdmsrl(msr, v);
        return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
        if (__get_cpu_var(injectm).finished) {
                int offset = msr_to_offset(msr);

                if (offset >= 0)
                        *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
                return;
        }
        wrmsrl(msr, v);
}
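
/*
 * Editor's note: a minimal sketch of how an injector could use these
 * hooks, given a pt_regs pointer.  The exact sequence is an assumption;
 * the real mce-inject module may differ:
 *
 *      struct mce *i = &__get_cpu_var(injectm);
 *
 *      mce_setup(i);
 *      i->status = MCI_STATUS_VAL | MCI_STATUS_UC;  // fake record
 *      i->bank = 1;
 *      i->finished = 1;       // redirect mce_rdmsrl()/mce_wrmsrl()
 *      do_machine_check(regs, 0);  // run the handler against injectm
 */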

int mce_available(struct cpuinfo_x86 *c)
{
        if (mce_disabled)
                return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->ip = regs->ip;
                m->cs = regs->cs;
        } else {
                m->ip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                m->ip = mce_rdmsrl(rip_msr);
                m->cs = 0;
        }
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
        ack_APIC_irq();
        exit_idle();
        irq_enter();
        mce_notify_user();
        irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
        if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
                mce_notify_user();
                return;
        }

#ifdef CONFIG_X86_LOCAL_APIC
        /*
         * Without an APIC do not notify. The event will be picked
         * up eventually.
         */
        if (!cpu_has_apic)
                return;

        /*
         * When interrupts are disabled we cannot use
         * kernel services safely. Trigger a self interrupt
         * through the APIC so the notification is done
         * after interrupts are reenabled again.
         */
        apic->send_IPI_self(MCE_SELF_VECTOR);

        /*
         * Wait for idle afterwards again so that we don't leave the
         * APIC in a non idle state because the normal APIC writes
         * cannot exclude us.
         */
        apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        struct mce m;
        int i;

        __get_cpu_var(mce_poll_count)++;

        mce_setup(&m);

        m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
        for (i = 0; i < banks; i++) {
                if (!bank[i] || !test_bit(i, *b))
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                barrier();
                m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
                if (!(m.status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Uncorrected events are handled by the exception handler
                 * when it is enabled. But when the exception is disabled log
                 * everything.
                 *
                 * TBD do the same check for MCI_STATUS_EN here?
                 */
                if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
                        continue;

                if (m.status & MCI_STATUS_MISCV)
                        m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
                if (m.status & MCI_STATUS_ADDRV)
                        m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

                if (!(flags & MCP_TIMESTAMP))
                        m.tsc = 0;
                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
                if (!(flags & MCP_DONTLOG)) {
                        mce_log(&m);
                        add_taint(TAINT_MACHINE_CHECK);
                }

                /*
                 * Clear state for this bank.
                 */
                mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */

        sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
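
/*
 * Editor's note: machine_check_poll() is driven from two places in this
 * file: the periodic mcheck_timer() below (with MCP_TIMESTAMP) and the
 * boot-time scan in mce_init() (with MCP_UC, so errors left over from
 * before the reset are logged too).  On Intel CPUs the CMCI handler in
 * the vendor-specific code is expected to call it as well.
 */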

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
        int i;

        for (i = 0; i < banks; i++) {
                m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
                if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
                        return 1;
        }
        return 0;
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
        struct mce m, panicm;
        int panicm_found = 0;
        int i;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
        char *msg = "Unknown";

        atomic_inc(&mce_entry);

        __get_cpu_var(mce_exception_count)++;

        if (notify_die(DIE_NMI, "machine check", regs, error_code,
                       18, SIGKILL) == NOTIFY_STOP)
                goto out;
        if (!banks)
                goto out;

        mce_setup(&m);

        m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
        no_way_out = mce_no_way_out(&m, &msg);

        barrier();

        for (i = 0; i < banks; i++) {
                __clear_bit(i, toclear);
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;

                m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                /*
                 * Corrected errors are handled by machine_check_poll().
                 * Leave them alone, unless this panics.
                 */
                if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
                        continue;

                /*
                 * Set taint even when machine check was not enabled.
                 */
                add_taint(TAINT_MACHINE_CHECK);

                __set_bit(i, toclear);

                if (m.status & MCI_STATUS_EN) {
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble.  If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC)
                                kill_it = 1;
                } else {
                        /*
                         * Machine check event was not enabled. Clear, but
                         * ignore.
                         */
                        continue;
                }

                if (m.status & MCI_STATUS_MISCV)
                        m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
                if (m.status & MCI_STATUS_ADDRV)
                        m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

                mce_get_rip(&m, regs);
                mce_log(&m);

                /*
                 * Did this bank cause the exception?
                 *
                 * Assume that the bank with uncorrectable errors did it,
                 * and that there is only a single one:
                 */
                if ((m.status & MCI_STATUS_UC) &&
                    (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }
        }

        /*
         * If we didn't find an uncorrectable error, pick
         * the last one (shouldn't happen, just being safe).
         */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         * has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, msg);

        /*
         * If the error seems to be unrecoverable, something should be
         * done.  Try to kill as little as possible.  If we can kill just
         * one task, do that.  If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.ip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS.  Otherwise, panic if tolerance is low.
                 *
                 * force_sig() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        force_sig(SIGBUS, current);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check", &panicm, msg);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

        mce_report_event(regs);

        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++) {
                if (test_bit(i, toclear))
                        mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
        atomic_dec(&mce_entry);
        sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
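
/*
 * Editor's note: summary of the tolerant policy as implemented above:
 *
 *      no_way_out && tolerant < 3            -> panic
 *      kill_it && user_space && tolerant < 3 -> SIGBUS to current
 *      kill_it && !user_space &&
 *          (panic_on_oops || tolerant < 2)   -> panic
 *      tolerant == 3                         -> log only, keep running
 */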

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
        struct timer_list *t = &per_cpu(mce_timer, data);
        int *n;

        WARN_ON(smp_processor_id() != data);

        if (mce_available(&current_cpu_data)) {
                machine_check_poll(MCP_TIMESTAMP,
                                   &__get_cpu_var(mce_poll_banks));
        }

        /*
         * Alert userspace if needed.  If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
        n = &__get_cpu_var(next_interval);
        if (mce_notify_user())
                *n = max(*n/2, HZ/100);
        else
                *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

        t->expires = jiffies + *n;
        add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

        clear_thread_flag(TIF_MCE_NOTIFY);

        if (test_and_clear_bit(0, &notify_user)) {
                wake_up_interruptible(&mce_wait);

                /*
                 * There is no risk of missing notifications because
                 * work_pending is always cleared before the function is
                 * executed.
                 */
                if (trigger[0] && !work_pending(&mce_trigger_work))
                        schedule_work(&mce_trigger_work);

                if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");

                return 1;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);
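
/*
 * Editor's note: worked example of the adaptive interval above,
 * assuming HZ=1000 and the default check_interval of 300 seconds:
 * each event-free pass doubles the interval up to ~300s, and each pass
 * that logged an event halves it, down to the HZ/100 floor of 10ms.
 */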

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
        unsigned b;
        u64 cap;

        rdmsrl(MSR_IA32_MCG_CAP, cap);

        b = cap & MCG_BANKCNT_MASK;
        printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

        if (b > MAX_NR_BANKS) {
                printk(KERN_WARNING
                       "MCE: Using only %u machine check banks out of %u\n",
                       MAX_NR_BANKS, b);
                b = MAX_NR_BANKS;
        }

        /* Don't support asymmetric configurations today */
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!bank) {
                bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
                if (!bank)
                        return -ENOMEM;
                memset(bank, 0xff, banks * sizeof(u64));
        }

        /* Use accurate RIP reporting if available. */
        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        return 0;
}

static void mce_init(void)
{
        mce_banks_t all_banks;
        u64 cap;
        int i;

        /*
         * Log the machine checks left over from the previous reset.
         */
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

        set_in_cr4(X86_CR4_MCE);

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                if (skip_bank_init(i))
                        continue;
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4) {
                        /*
                         * disable GART TBL walk error reporting, which
                         * trips off incorrectly with the IOMMU & 3ware
                         * & Cerberus:
                         */
                        clear_bit(10, (unsigned long *)&bank[4]);
                }
                if (c->x86 <= 17 && mce_bootlog < 0) {
                        /*
                         * Lots of broken BIOSes around that don't clear them
                         * by default and leave crap in there.  Don't log:
                         */
                        mce_bootlog = 0;
                }
                /*
                 * Various K7s with broken bank 0 around.  Always disable
                 * by default.
                 */
                if (c->x86 == 6)
                        bank[0] = 0;
        }

        if (c->x86_vendor == X86_VENDOR_INTEL) {
                /*
                 * SDM documents that on family 6 bank 0 should not be written
                 * because it aliases to another special BIOS controlled
                 * register.
                 * But it's not aliased anymore on model 0x1a+.
                 * Don't ignore bank 0 completely because there could be a
                 * valid event later, merely don't write CTL0.
                 */
                if (c->x86 == 6 && c->x86_model < 0x1A)
                        __set_bit(0, &dont_init_banks);
        }
}
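
/*
 * Editor's note: bank[i] holds the MCi_CTL enable mask that mce_init()
 * writes for bank i (all ones by default; the quirks above clear bits).
 * Each bank owns four consecutive MSRs, which is why the code indexes
 * everything from the bank 0 registers:
 *
 *      CTL:    MSR_IA32_MC0_CTL    + 4*i
 *      STATUS: MSR_IA32_MC0_STATUS + 4*i
 *      ADDR:   MSR_IA32_MC0_ADDR   + 4*i
 *      MISC:   MSR_IA32_MC0_MISC   + 4*i
 */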

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
        if (c->x86 != 5)
                return;
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                if (mce_p5_enabled())
                        intel_p5_mcheck_init(c);
                break;
        case X86_VENDOR_CENTAUR:
                winchip_mcheck_init(c);
                break;
        }
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}

static void mce_init_timer(void)
{
        struct timer_list *t = &__get_cpu_var(mce_timer);
        int *n = &__get_cpu_var(next_interval);

        *n = check_interval * HZ;
        if (!*n)
                return;
        setup_timer(t, mcheck_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + *n);
        add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        if (mce_disabled)
                return;

        mce_ancient_init(c);

        if (!mce_available(c))
                return;

        if (mce_cap_init() < 0) {
                mce_disabled = 1;
                return;
        }
        mce_cpu_quirks(c);

        machine_check_vector = do_machine_check;

        mce_init();
        mce_cpu_features(c);
        mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int              open_count;     /* #times opened */
static int              open_exclu;     /* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);

                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);

        return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}

static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;

        rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
{
        char __user *buf = ubuf;
        unsigned long *cpu_tsc;
        unsigned prev, next;
        int i, err;

        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_read_mutex);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                mutex_unlock(&mce_read_mutex);
                kfree(cpu_tsc);

                return -EINVAL;
        }

        err = 0;
        prev = 0;
        do {
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;

                        while (!mcelog.entry[i].finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        memset(mcelog.entry + i, 0,
                                               sizeof(struct mce));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        smp_rmb();
                        err |= copy_to_user(buf, mcelog.entry + i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        synchronize_sched();

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);

        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);

        return err ? -EFAULT : buf - ubuf;
}
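
/*
 * Editor's note: illustrative userspace consumer (a sketch, not kernel
 * code; assumes the usual /dev/mcelog node for MISC_MCELOG_MINOR, and
 * uses the kernel's MCE_LOG_LEN for brevity where a real consumer would
 * query MCE_GET_LOG_LEN):
 *
 *      struct mce records[MCE_LOG_LEN];
 *      int fd = open("/dev/mcelog", O_RDONLY);
 *      ssize_t n = read(fd, records, sizeof(records));
 *      // n/sizeof(struct mce) records were returned and cleared;
 *      // reads shorter than the full buffer fail with EINVAL.
 */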

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_dereference(mcelog.next))
                return POLLIN | POLLRDNORM;
        return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;

                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);

                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
        .open                   = mce_open,
        .release                = mce_release,
        .read                   = mce_read,
        .poll                   = mce_poll,
        .unlocked_ioctl         = mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
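
/*
 * Editor's note: the intended consumer loop (illustrative only) is to
 * poll() the device for POLLIN, issue MCE_GET_RECORD_LEN once to learn
 * the record size, then read the full buffer as sketched earlier.
 * MCE_GETCLEAR_FLAGS returns and atomically clears mcelog.flags, so an
 * overflow (MCE_OVERFLOW) is reported at most once per occurrence.
 */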

/*
 * mce=off            Disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog        Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog      Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
        if (*str == 0)
                enable_p5_mce();
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                mce_disabled = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = (str[0] == 'b');
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else {
                printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
                       str);
                return 0;
        }
        return 1;
}
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
        int i;

        for (i = 0; i < banks; i++) {
                if (!skip_bank_init(i))
                        wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        }
        return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
        return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
        return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
        mce_init();
        mce_cpu_features(&current_cpu_data);

        return 0;
}

static void mce_cpu_restart(void *data)
{
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (mce_available(&current_cpu_data))
                mce_init();
        mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
        .suspend        = mce_suspend,
        .shutdown       = mce_shutdown,
        .resume         = mce_resume,
        .name           = "machinecheck",
};
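
/*
 * Editor's note: this class typically surfaces as
 * /sys/devices/system/machinecheck/machinecheckN, one node per CPU,
 * carrying the attribute files registered below.  Illustrative usage
 * (paths assumed, not verified here):
 *
 *      echo 2   > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *      echo 120 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *      cat        /sys/devices/system/machinecheck/machinecheck0/bank4
 */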

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
                         char *buf)
{
        u64 b = bank[attr - bank_attrs];

        return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
                        const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        bank[attr - bank_attrs] = new;
        mce_restart();

        return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
        strcpy(buf, trigger);
        strcat(buf, "\n");
        return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;
        int len;

        strncpy(trigger, buf, sizeof(trigger));
        trigger[sizeof(trigger)-1] = 0;
        len = strlen(trigger);
        p = strchr(trigger, '\n');

        /* strchr() returns NULL when there is no newline; check the
           pointer itself, not what it points to. */
        if (p)
                *p = 0;

        return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
                                      struct sysdev_attribute *attr,
                                      const char *buf, size_t size)
{
        ssize_t ret = sysdev_store_int(s, attr, buf, size);

        mce_restart();
        return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

static struct sysdev_ext_attribute attr_check_interval = {
        _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
                     store_int_with_restart),
        &check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
        &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
        NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per CPU sysdev init.  All of the CPUs still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
        per_cpu(mce_dev, cpu).id        = cpu;
        per_cpu(mce_dev, cpu).cls       = &mce_sysclass;

        err = sysdev_register(&per_cpu(mce_dev, cpu));
        if (err)
                return err;

        for (i = 0; mce_attrs[i]; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
                if (err)
                        goto error;
        }
        for (i = 0; i < banks; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu),
                                         &bank_attrs[i]);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_dev_initialized);

        return 0;
error2:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));

        return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
        int i;

        if (!cpumask_test_cpu(cpu, mce_dev_initialized))
                return;

        for (i = 0; mce_attrs[i]; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        for (i = 0; i < banks; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));
        cpumask_clear_cpu(cpu, mce_dev_initialized);
}
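
/*
 * Editor's note: hotplug lifecycle handled by the notifier below.
 * Sysfs nodes are created on CPU_ONLINE and torn down on CPU_DEAD;
 * CPU_DOWN_PREPARE stops the polling timer and zeroes the bank controls
 * on the dying CPU, and CPU_DOWN_FAILED undoes both.  CPU_POST_DEAD
 * lets the Intel code rediscover CMCI bank ownership.
 */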

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++) {
                if (!skip_bank_init(i))
                        wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        }
}

static void mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++) {
                if (!skip_bank_init(i))
                        wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
        }
}

/* Get notified when a CPU comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies +
                                           __get_cpu_var(next_interval));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
        case CPU_POST_DEAD:
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
        int i;

        bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
                             GFP_KERNEL);
        if (!bank_attrs)
                return -ENOMEM;

        for (i = 0; i < banks; i++) {
                struct sysdev_attribute *a = &bank_attrs[i];

                a->attr.name    = kasprintf(GFP_KERNEL, "bank%d", i);
                if (!a->attr.name)
                        goto nomem;

                a->attr.mode    = 0644;
                a->show         = show_bank;
                a->store        = set_bank;
        }
        return 0;

nomem:
        while (--i >= 0)
                kfree(bank_attrs[i].attr.name);
        kfree(bank_attrs);
        bank_attrs = NULL;

        return -ENOMEM;
}

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

        err = mce_init_banks();
        if (err)
                return err;

        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_create_device(i);
                if (err)
                        return err;
        }

        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);

        return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);        /* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
        if (mce_disabled == 1)
                return;

        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                amd_mcheck_init(c);
                break;

        case X86_VENDOR_INTEL:
                if (c->x86 == 5)
                        intel_p5_mcheck_init(c);
                if (c->x86 == 6)
                        intel_p6_mcheck_init(c);
                if (c->x86 == 15)
                        intel_p4_mcheck_init(c);
                break;

        case X86_VENDOR_CENTAUR:
                if (c->x86 == 5)
                        winchip_mcheck_init(c);
                break;

        default:
                break;
        }
        printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
        mce_disabled = -1;
        return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing.  Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mce_disabled = 1;
        return 1;
}
__setup("nomce", mcheck_disable);