mce.c revision 01ca79f1411eae2a45352709c838b946b1af9fbd

/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int		tolerant = 1;
static int		banks;
static u64		*bank;
static unsigned long	notify_user;
static int		rip_msr;
static int		mce_bootlog = -1;

static char		trigger[128];
static char		*trigger_argv[2] = { trigger, NULL };

static unsigned long	dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
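
/*
 * The protocol above in a nutshell: a writer clears ->finished on its
 * record, scans forward from mcelog.next for a free slot, claims the
 * slot with a cmpxchg on mcelog.next, copies the record in, and only
 * then sets the entry's ->finished, so readers never observe a
 * half-written entry.
 */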

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, u64 start)
{
	int i;

	bust_spinlocks(1);
	console_verbose();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		u64 tsc = mcelog.entry[i].tsc;

		if ((s64)(tsc - start) < 0)
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		m->ip = mce_rdmsrl(rip_msr);
		m->cs = 0;
	}
}
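
/*
 * Note on the bank*4 arithmetic used above and throughout this file:
 * each MCA bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC)
 * starting at MSR_IA32_MC0_CTL, so bank i's STATUS register is
 * MSR_IA32_MC0_STATUS + 4*i, and likewise for ADDR and MISC.
 */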

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
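
/*
 * Within this file machine_check_poll() has two callers: the per-CPU
 * timer below (with MCP_TIMESTAMP) and mce_init(), which passes MCP_UC
 * on all banks to pick up errors left over from before the last reset.
 */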

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int panicm_found = 0;
	u64 mcestart = 0;
	int i;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll.
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/*
		 * Did this bank cause the exception?
		 *
		 * Assume that the bank with uncorrectable errors did it,
		 * and that there is only a single one:
		 */
		if ((m.status & MCI_STATUS_UC) &&
		    (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/*
	 * If we didn't find an uncorrectable error, pick
	 * the last one (shouldn't happen, just being safe).
	 */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
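
/*
 * Severity handling above, summarized: a missing restart IP, a PCC
 * error, or an overflowed (or tolerant == 0) uncorrected error forces
 * no_way_out; any enabled uncorrected error at least sets kill_it.
 * tolerant then picks the outcome: below 3 we panic on no_way_out,
 * and for kill_it a user space fault gets SIGBUS while a kernel fault
 * panics unless tolerant >= 2 and panic_on_oops is off.
 */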

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}
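
/*
 * Example of the adaptive interval with HZ=1000 and the default
 * check_interval of 5 minutes: polling starts at 300000 jiffies,
 * halves toward the HZ/100 floor (10ms) while events keep being
 * logged, and doubles back up to the 5 minute ceiling once the
 * machine is quiet again.
 */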

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
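
/*
 * Note the ordering in mce_init(): left-over errors are polled and
 * logged first (unless mce_bootlog is 0, which turns the poll into
 * MCP_DONTLOG), and only then is CR4.MCE set and the per-bank CTL
 * registers armed, so pre-reset state is captured rather than
 * silently cleared.
 */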

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);
	}
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}
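
/*
 * The character device below is what the user space mcelog(8) utility
 * reads. A minimal consumer is sketched here (user space pseudo-code,
 * not part of this file); note that mce_read() rejects anything
 * smaller than a full buffer with -EINVAL:
 *
 *	struct mce rec[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, rec, sizeof(rec));
 *	// n is a multiple of sizeof(struct mce)
 */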

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int	open_count;	/* #times opened */
static int	open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}
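
/*
 * The ioctls below let a reader size its buffers at run time instead
 * of hard-coding constants: MCE_GET_RECORD_LEN returns
 * sizeof(struct mce), MCE_GET_LOG_LEN the number of log entries, and
 * MCE_GETCLEAR_FLAGS atomically fetches and clears mcelog.flags
 * (e.g. the MCE_OVERFLOW bit set in mce_log()).
 */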

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open		= mce_open,
	.release	= mce_release,
	.read		= mce_read,
	.poll		= mce_poll,
	.unlocked_ioctl	= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	if (p)
		*p = 0;

	return len;
}
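
/*
 * These attributes appear per CPU in sysfs. A plausible shell session
 * (paths assumed from the "machinecheck" sysdev class name):
 *
 *	echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *	echo /usr/local/bin/mce-trigger \
 *		> /sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * Writes that affect polling or bank state go through mce_restart()
 * to reinitialize every CPU.
 */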

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}
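
/*
 * The CPU_TASKS_FROZEN tests above distinguish real hotplug from
 * suspend/resume: cmci_clear()/cmci_reenable() run only when a CPU
 * genuinely goes away or comes back, not across a freeze/thaw cycle.
 */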

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);