mce.c revision 32561696c23028596f24b353d98f2e23b58f91f7
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/smp_lock.h>
#include <linux/kobject.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/smp.h>

#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

atomic_t mce_entry;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int			tolerant = 1;
static int			banks;
static u64			*bank;
static unsigned long		notify_user;
static int			rip_msr;
static int			mce_bootlog = -1;

static char			trigger[128];
static char			*trigger_argv[2] = { trigger, NULL };

static unsigned long		dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
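
/*
 * Illustrative flow of a producer through mce_log() above (a reading aid,
 * not additional code): a writer claims a free slot by advancing mcelog.next
 * with cmpxchg(), copies its record into that slot, and only then sets
 * ->finished. Readers (mce_read() below) spin briefly on ->finished, so a
 * half-written entry is never handed to user space.
 */
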
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, u64 start)
{
	int i;

	bust_spinlocks(1);
	console_verbose();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		u64 tsc = mcelog.entry[i].tsc;

		if ((s64)(tsc - start) < 0)
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		m->ip = mce_rdmsrl(rip_msr);
		m->cs = 0;
	}
}
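
/*
 * For reference (architectural MCA layout, as assumed by the i*4 address
 * arithmetic used throughout this file): each bank i owns four consecutive
 * MSRs starting at MSR_IA32_MC0_CTL:
 *
 *	MSR_IA32_MC0_CTL    + 4*i	- enable bits for error types
 *	MSR_IA32_MC0_STATUS + 4*i	- logged error, VAL/UC/EN/OVER flags
 *	MSR_IA32_MC0_ADDR   + 4*i	- address, valid if MCI_STATUS_ADDRV
 *	MSR_IA32_MC0_MISC   + 4*i	- extra info, valid if MCI_STATUS_MISCV
 */
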
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
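
/*
 * The call sites of machine_check_poll() in this file give a feel for the
 * flag combinations: the periodic timer below passes MCP_TIMESTAMP to get
 * real TSC values, while mce_init() passes MCP_UC (and possibly MCP_DONTLOG)
 * to sweep up events left over from before the last reset.
 */
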
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int panicm_found = 0;
	u64 mcestart = 0;
	int i;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll.
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/*
		 * Did this bank cause the exception?
		 *
		 * Assume that the bank with uncorrectable errors did it,
		 * and that there is only a single one:
		 */
		if ((m.status & MCI_STATUS_UC) &&
					(m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/*
	 * If we didn't find an uncorrectable error, pick
	 * the last one (shouldn't happen, just being safe).
	 */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
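
/*
 * Summary of the outcome matrix implemented above (a reading aid only):
 *
 *	no_way_out && tolerant < 3		-> panic
 *	kill_it, error in user space		-> SIGBUS to current
 *	kill_it, kernel context, tolerant < 2	-> panic (also if panic_on_oops)
 *	tolerant == 3				-> log only, never die
 */
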
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);
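
/*
 * The trigger run above via call_usermodehelper() is a user supplied
 * program, configured through the sysfs "trigger" attribute defined later
 * in this file. Illustrative setup (the program path is just an example):
 *
 *	echo /usr/local/sbin/mce-handler > \
 *		/sys/devices/system/machinecheck/machinecheck0/trigger
 */
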
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
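
/*
 * MCG_CAP fields consumed above, for reference (architectural MCA, Intel
 * SDM vol. 3): bits 7:0 hold the bank count (MCG_BANKCNT_MASK), bit 9
 * (MCG_EXT_P) advertises the extended state MSRs, and bits 23:16
 * (MCG_EXT_CNT) say how many of them exist. MCG_EIP is only present when
 * at least 9 extended registers are reported, hence the >= 9 check.
 */
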
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);
	}
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}
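
/*
 * Rough boot flow per CPU, as wired up above: mcheck_init() first handles
 * pre-MCA family 5 parts via mce_ancient_init(), then sizes the bank array
 * (mce_cap_init), applies vendor quirks, installs do_machine_check as the
 * int 18 vector, enables the banks and CR4.MCE (mce_init), and finally
 * starts vendor features and the polling timer.
 */
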
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int		open_count;		/* #times opened */
static int		open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}
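
/*
 * Minimal user space consumer of the device above (an illustrative sketch;
 * mcelog(8) is the real client). mce_read() only accepts buffers large
 * enough for the whole log, and clears the records it returns:
 *
 *	struct mce records[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, records, sizeof(records));
 *	// n / sizeof(struct mce) records were returned and consumed
 */
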
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
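
/*
 * Example command lines accepted by the parser above:
 *
 *	mce=off		- disable machine check handling entirely
 *	mce=2		- tolerant level 2 (see the table near the top)
 *	mce=bootlog	- log MCEs left over from before boot
 *	mce=nobootlog	- suppress those boot-time leftovers
 */
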
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}
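
/*
 * The attributes registered below appear per CPU under
 * /sys/devices/system/machinecheck/machinecheckN/. Illustrative shell use:
 *
 *	echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * Writes to check_interval and bankN take effect on all CPUs via
 * mce_restart() above, since the bank array is global.
 */
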
static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);

	if (end == buf)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return end-buf;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	/* Strip a trailing newline, if any (strchr may return NULL): */
	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
						__get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};
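
/*
 * Offline sequence driven by the notifier above, for orientation:
 * CPU_DOWN_PREPARE stops the polling timer and clears the bank CTL MSRs on
 * the victim CPU (mce_disable_cpu); CPU_DOWN_FAILED undoes both. The sysfs
 * device itself is only torn down once the CPU is fully dead.
 */
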
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);