mce.c revision a1ff41bfc1bb7a6d19cf958f89a9b539678781e5
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/smp_lock.h> 17#include <linux/kobject.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/ctype.h> 24#include <linux/sched.h> 25#include <linux/sysfs.h> 26#include <linux/types.h> 27#include <linux/init.h> 28#include <linux/kmod.h> 29#include <linux/poll.h> 30#include <linux/cpu.h> 31#include <linux/fs.h> 32 33#include <asm/processor.h> 34#include <asm/uaccess.h> 35#include <asm/idle.h> 36#include <asm/mce.h> 37#include <asm/msr.h> 38#include <asm/smp.h> 39 40#include "mce.h" 41 42/* Handle unconfigured int18 (should never happen) */ 43static void unexpected_machine_check(struct pt_regs *regs, long error_code) 44{ 45 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 46 smp_processor_id()); 47} 48 49/* Call the installed machine check handler for this CPU setup. */ 50void (*machine_check_vector)(struct pt_regs *, long error_code) = 51 unexpected_machine_check; 52 53int mce_disabled; 54 55#ifdef CONFIG_X86_NEW_MCE 56 57#define MISC_MCELOG_MINOR 227 58 59atomic_t mce_entry; 60 61/* 62 * Tolerant levels: 63 * 0: always panic on uncorrected errors, log corrected errors 64 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 65 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 66 * 3: never panic or SIGBUS, log all errors (for testing only) 67 */ 68static int tolerant = 1; 69static int banks; 70static u64 *bank; 71static unsigned long notify_user; 72static int rip_msr; 73static int mce_bootlog = -1; 74static atomic_t mce_events; 75 76static char trigger[128]; 77static char *trigger_argv[2] = { trigger, NULL }; 78 79static unsigned long dont_init_banks; 80 81static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 82 83/* MCA banks polled by the period polling timer for corrected events */ 84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 85 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 86}; 87 88static inline int skip_bank_init(int i) 89{ 90 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 91} 92 93/* Do initial initialization of a struct mce */ 94void mce_setup(struct mce *m) 95{ 96 memset(m, 0, sizeof(struct mce)); 97 m->cpu = smp_processor_id(); 98 rdtscll(m->tsc); 99} 100 101DEFINE_PER_CPU(struct mce, injectm); 102EXPORT_PER_CPU_SYMBOL_GPL(injectm); 103 104/* 105 * Lockless MCE logging infrastructure. 106 * This avoids deadlocks on printk locks without having to break locks. Also 107 * separate MCEs from kernel messages to avoid bogus bug reports. 108 */ 109 110static struct mce_log mcelog = { 111 MCE_LOG_SIGNATURE, 112 MCE_LOG_LEN, 113}; 114 115void mce_log(struct mce *mce) 116{ 117 unsigned next, entry; 118 119 atomic_inc(&mce_events); 120 mce->finished = 0; 121 wmb(); 122 for (;;) { 123 entry = rcu_dereference(mcelog.next); 124 for (;;) { 125 /* 126 * When the buffer fills up discard new entries. 127 * Assume that the earlier errors are the more 128 * interesting ones: 129 */ 130 if (entry >= MCE_LOG_LEN) { 131 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); 132 return; 133 } 134 /* Old left over entry. Skip: */ 135 if (mcelog.entry[entry].finished) { 136 entry++; 137 continue; 138 } 139 break; 140 } 141 smp_rmb(); 142 next = entry + 1; 143 if (cmpxchg(&mcelog.next, entry, next) == entry) 144 break; 145 } 146 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 147 wmb(); 148 mcelog.entry[entry].finished = 1; 149 wmb(); 150 151 set_bit(0, ¬ify_user); 152} 153 154static void print_mce(struct mce *m) 155{ 156 printk(KERN_EMERG "\n" 157 KERN_EMERG "HARDWARE ERROR\n" 158 KERN_EMERG 159 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 160 m->cpu, m->mcgstatus, m->bank, m->status); 161 if (m->ip) { 162 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 163 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 164 m->cs, m->ip); 165 if (m->cs == __KERNEL_CS) 166 print_symbol("{%s}", m->ip); 167 printk("\n"); 168 } 169 printk(KERN_EMERG "TSC %llx ", m->tsc); 170 if (m->addr) 171 printk("ADDR %llx ", m->addr); 172 if (m->misc) 173 printk("MISC %llx ", m->misc); 174 printk("\n"); 175 printk(KERN_EMERG "This is not a software problem!\n"); 176 printk(KERN_EMERG "Run through mcelog --ascii to decode " 177 "and contact your hardware vendor\n"); 178} 179 180static void mce_panic(char *msg, struct mce *backup, u64 start) 181{ 182 int i; 183 184 bust_spinlocks(1); 185 console_verbose(); 186 for (i = 0; i < MCE_LOG_LEN; i++) { 187 u64 tsc = mcelog.entry[i].tsc; 188 189 if ((s64)(tsc - start) < 0) 190 continue; 191 print_mce(&mcelog.entry[i]); 192 if (backup && mcelog.entry[i].tsc == backup->tsc) 193 backup = NULL; 194 } 195 if (backup) 196 print_mce(backup); 197 panic(msg); 198} 199 200/* Support code for software error injection */ 201 202static int msr_to_offset(u32 msr) 203{ 204 unsigned bank = __get_cpu_var(injectm.bank); 205 if (msr == rip_msr) 206 return offsetof(struct mce, ip); 207 if (msr == MSR_IA32_MC0_STATUS + bank*4) 208 return offsetof(struct mce, status); 209 if (msr == MSR_IA32_MC0_ADDR + bank*4) 210 return offsetof(struct mce, addr); 211 if (msr == MSR_IA32_MC0_MISC + bank*4) 212 return offsetof(struct mce, misc); 213 if (msr == MSR_IA32_MCG_STATUS) 214 return offsetof(struct mce, mcgstatus); 215 return -1; 216} 217 218/* MSR access wrappers used for error injection */ 219static u64 mce_rdmsrl(u32 msr) 220{ 221 u64 v; 222 if (__get_cpu_var(injectm).finished) { 223 int offset = msr_to_offset(msr); 224 if (offset < 0) 225 return 0; 226 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 227 } 228 rdmsrl(msr, v); 229 return v; 230} 231 232static void mce_wrmsrl(u32 msr, u64 v) 233{ 234 if (__get_cpu_var(injectm).finished) { 235 int offset = msr_to_offset(msr); 236 if (offset >= 0) 237 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 238 return; 239 } 240 wrmsrl(msr, v); 241} 242 243int mce_available(struct cpuinfo_x86 *c) 244{ 245 if (mce_disabled) 246 return 0; 247 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 248} 249 250static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 251{ 252 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 253 m->ip = regs->ip; 254 m->cs = regs->cs; 255 } else { 256 m->ip = 0; 257 m->cs = 0; 258 } 259 if (rip_msr) { 260 /* Assume the RIP in the MSR is exact. Is this true? */ 261 m->mcgstatus |= MCG_STATUS_EIPV; 262 m->ip = mce_rdmsrl(rip_msr); 263 m->cs = 0; 264 } 265} 266 267/* 268 * Poll for corrected events or events that happened before reset. 269 * Those are just logged through /dev/mcelog. 270 * 271 * This is executed in standard interrupt context. 272 */ 273void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 274{ 275 struct mce m; 276 int i; 277 278 mce_setup(&m); 279 280 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 281 for (i = 0; i < banks; i++) { 282 if (!bank[i] || !test_bit(i, *b)) 283 continue; 284 285 m.misc = 0; 286 m.addr = 0; 287 m.bank = i; 288 m.tsc = 0; 289 290 barrier(); 291 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 292 if (!(m.status & MCI_STATUS_VAL)) 293 continue; 294 295 /* 296 * Uncorrected events are handled by the exception handler 297 * when it is enabled. But when the exception is disabled log 298 * everything. 299 * 300 * TBD do the same check for MCI_STATUS_EN here? 301 */ 302 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) 303 continue; 304 305 if (m.status & MCI_STATUS_MISCV) 306 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 307 if (m.status & MCI_STATUS_ADDRV) 308 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 309 310 if (!(flags & MCP_TIMESTAMP)) 311 m.tsc = 0; 312 /* 313 * Don't get the IP here because it's unlikely to 314 * have anything to do with the actual error location. 315 */ 316 if (!(flags & MCP_DONTLOG)) { 317 mce_log(&m); 318 add_taint(TAINT_MACHINE_CHECK); 319 } 320 321 /* 322 * Clear state for this bank. 323 */ 324 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 325 } 326 327 /* 328 * Don't clear MCG_STATUS here because it's only defined for 329 * exceptions. 330 */ 331} 332EXPORT_SYMBOL_GPL(machine_check_poll); 333 334/* 335 * The actual machine check handler. This only handles real 336 * exceptions when something got corrupted coming in through int 18. 337 * 338 * This is executed in NMI context not subject to normal locking rules. This 339 * implies that most kernel services cannot be safely used. Don't even 340 * think about putting a printk in there! 341 */ 342void do_machine_check(struct pt_regs *regs, long error_code) 343{ 344 struct mce m, panicm; 345 int panicm_found = 0; 346 u64 mcestart = 0; 347 int i; 348 /* 349 * If no_way_out gets set, there is no safe way to recover from this 350 * MCE. If tolerant is cranked up, we'll try anyway. 351 */ 352 int no_way_out = 0; 353 /* 354 * If kill_it gets set, there might be a way to recover from this 355 * error. 356 */ 357 int kill_it = 0; 358 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 359 360 atomic_inc(&mce_entry); 361 362 if (notify_die(DIE_NMI, "machine check", regs, error_code, 363 18, SIGKILL) == NOTIFY_STOP) 364 goto out2; 365 if (!banks) 366 goto out2; 367 368 mce_setup(&m); 369 370 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 371 372 /* if the restart IP is not valid, we're done for */ 373 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 374 no_way_out = 1; 375 376 rdtscll(mcestart); 377 barrier(); 378 379 for (i = 0; i < banks; i++) { 380 __clear_bit(i, toclear); 381 if (!bank[i]) 382 continue; 383 384 m.misc = 0; 385 m.addr = 0; 386 m.bank = i; 387 388 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 389 if ((m.status & MCI_STATUS_VAL) == 0) 390 continue; 391 392 /* 393 * Non uncorrected errors are handled by machine_check_poll 394 * Leave them alone. 395 */ 396 if ((m.status & MCI_STATUS_UC) == 0) 397 continue; 398 399 /* 400 * Set taint even when machine check was not enabled. 401 */ 402 add_taint(TAINT_MACHINE_CHECK); 403 404 __set_bit(i, toclear); 405 406 if (m.status & MCI_STATUS_EN) { 407 /* if PCC was set, there's no way out */ 408 no_way_out |= !!(m.status & MCI_STATUS_PCC); 409 /* 410 * If this error was uncorrectable and there was 411 * an overflow, we're in trouble. If no overflow, 412 * we might get away with just killing a task. 413 */ 414 if (m.status & MCI_STATUS_UC) { 415 if (tolerant < 1 || m.status & MCI_STATUS_OVER) 416 no_way_out = 1; 417 kill_it = 1; 418 } 419 } else { 420 /* 421 * Machine check event was not enabled. Clear, but 422 * ignore. 423 */ 424 continue; 425 } 426 427 if (m.status & MCI_STATUS_MISCV) 428 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 429 if (m.status & MCI_STATUS_ADDRV) 430 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 431 432 mce_get_rip(&m, regs); 433 mce_log(&m); 434 435 /* 436 * Did this bank cause the exception? 437 * 438 * Assume that the bank with uncorrectable errors did it, 439 * and that there is only a single one: 440 */ 441 if ((m.status & MCI_STATUS_UC) && 442 (m.status & MCI_STATUS_EN)) { 443 panicm = m; 444 panicm_found = 1; 445 } 446 } 447 448 /* 449 * If we didn't find an uncorrectable error, pick 450 * the last one (shouldn't happen, just being safe). 451 */ 452 if (!panicm_found) 453 panicm = m; 454 455 /* 456 * If we have decided that we just CAN'T continue, and the user 457 * has not set tolerant to an insane level, give up and die. 458 */ 459 if (no_way_out && tolerant < 3) 460 mce_panic("Machine check", &panicm, mcestart); 461 462 /* 463 * If the error seems to be unrecoverable, something should be 464 * done. Try to kill as little as possible. If we can kill just 465 * one task, do that. If the user has set the tolerance very 466 * high, don't try to do anything at all. 467 */ 468 if (kill_it && tolerant < 3) { 469 int user_space = 0; 470 471 /* 472 * If the EIPV bit is set, it means the saved IP is the 473 * instruction which caused the MCE. 474 */ 475 if (m.mcgstatus & MCG_STATUS_EIPV) 476 user_space = panicm.ip && (panicm.cs & 3); 477 478 /* 479 * If we know that the error was in user space, send a 480 * SIGBUS. Otherwise, panic if tolerance is low. 481 * 482 * force_sig() takes an awful lot of locks and has a slight 483 * risk of deadlocking. 484 */ 485 if (user_space) { 486 force_sig(SIGBUS, current); 487 } else if (panic_on_oops || tolerant < 2) { 488 mce_panic("Uncorrected machine check", 489 &panicm, mcestart); 490 } 491 } 492 493 /* notify userspace ASAP */ 494 set_thread_flag(TIF_MCE_NOTIFY); 495 496 /* the last thing we do is clear state */ 497 for (i = 0; i < banks; i++) { 498 if (test_bit(i, toclear)) 499 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 500 } 501 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 502 out2: 503 atomic_dec(&mce_entry); 504} 505EXPORT_SYMBOL_GPL(do_machine_check); 506 507#ifdef CONFIG_X86_MCE_INTEL 508/*** 509 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 510 * @cpu: The CPU on which the event occurred. 511 * @status: Event status information 512 * 513 * This function should be called by the thermal interrupt after the 514 * event has been processed and the decision was made to log the event 515 * further. 516 * 517 * The status parameter will be saved to the 'status' field of 'struct mce' 518 * and historically has been the register value of the 519 * MSR_IA32_THERMAL_STATUS (Intel) msr. 520 */ 521void mce_log_therm_throt_event(__u64 status) 522{ 523 struct mce m; 524 525 mce_setup(&m); 526 m.bank = MCE_THERMAL_BANK; 527 m.status = status; 528 mce_log(&m); 529} 530#endif /* CONFIG_X86_MCE_INTEL */ 531 532/* 533 * Periodic polling timer for "silent" machine check errors. If the 534 * poller finds an MCE, poll 2x faster. When the poller finds no more 535 * errors, poll 2x slower (up to check_interval seconds). 536 */ 537static int check_interval = 5 * 60; /* 5 minutes */ 538 539static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 540static DEFINE_PER_CPU(struct timer_list, mce_timer); 541 542static void mcheck_timer(unsigned long data) 543{ 544 struct timer_list *t = &per_cpu(mce_timer, data); 545 int *n; 546 547 WARN_ON(smp_processor_id() != data); 548 549 if (mce_available(¤t_cpu_data)) { 550 machine_check_poll(MCP_TIMESTAMP, 551 &__get_cpu_var(mce_poll_banks)); 552 } 553 554 /* 555 * Alert userspace if needed. If we logged an MCE, reduce the 556 * polling interval, otherwise increase the polling interval. 557 */ 558 n = &__get_cpu_var(next_interval); 559 if (mce_notify_user()) { 560 *n = max(*n/2, HZ/100); 561 } else { 562 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 563 } 564 565 t->expires = jiffies + *n; 566 add_timer(t); 567} 568 569static void mce_do_trigger(struct work_struct *work) 570{ 571 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 572} 573 574static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 575 576/* 577 * Notify the user(s) about new machine check events. 578 * Can be called from interrupt context, but not from machine check/NMI 579 * context. 580 */ 581int mce_notify_user(void) 582{ 583 /* Not more than two messages every minute */ 584 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 585 586 clear_thread_flag(TIF_MCE_NOTIFY); 587 588 if (test_and_clear_bit(0, ¬ify_user)) { 589 wake_up_interruptible(&mce_wait); 590 591 /* 592 * There is no risk of missing notifications because 593 * work_pending is always cleared before the function is 594 * executed. 595 */ 596 if (trigger[0] && !work_pending(&mce_trigger_work)) 597 schedule_work(&mce_trigger_work); 598 599 if (__ratelimit(&ratelimit)) 600 printk(KERN_INFO "Machine check events logged\n"); 601 602 return 1; 603 } 604 return 0; 605} 606EXPORT_SYMBOL_GPL(mce_notify_user); 607 608/* 609 * Initialize Machine Checks for a CPU. 610 */ 611static int mce_cap_init(void) 612{ 613 unsigned b; 614 u64 cap; 615 616 rdmsrl(MSR_IA32_MCG_CAP, cap); 617 618 b = cap & MCG_BANKCNT_MASK; 619 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 620 621 if (b > MAX_NR_BANKS) { 622 printk(KERN_WARNING 623 "MCE: Using only %u machine check banks out of %u\n", 624 MAX_NR_BANKS, b); 625 b = MAX_NR_BANKS; 626 } 627 628 /* Don't support asymmetric configurations today */ 629 WARN_ON(banks != 0 && b != banks); 630 banks = b; 631 if (!bank) { 632 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 633 if (!bank) 634 return -ENOMEM; 635 memset(bank, 0xff, banks * sizeof(u64)); 636 } 637 638 /* Use accurate RIP reporting if available. */ 639 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 640 rip_msr = MSR_IA32_MCG_EIP; 641 642 return 0; 643} 644 645static void mce_init(void *dummy) 646{ 647 mce_banks_t all_banks; 648 u64 cap; 649 int i; 650 651 /* 652 * Log the machine checks left over from the previous reset. 653 */ 654 bitmap_fill(all_banks, MAX_NR_BANKS); 655 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 656 657 set_in_cr4(X86_CR4_MCE); 658 659 rdmsrl(MSR_IA32_MCG_CAP, cap); 660 if (cap & MCG_CTL_P) 661 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 662 663 for (i = 0; i < banks; i++) { 664 if (skip_bank_init(i)) 665 continue; 666 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 667 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 668 } 669} 670 671/* Add per CPU specific workarounds here */ 672static void mce_cpu_quirks(struct cpuinfo_x86 *c) 673{ 674 /* This should be disabled by the BIOS, but isn't always */ 675 if (c->x86_vendor == X86_VENDOR_AMD) { 676 if (c->x86 == 15 && banks > 4) { 677 /* 678 * disable GART TBL walk error reporting, which 679 * trips off incorrectly with the IOMMU & 3ware 680 * & Cerberus: 681 */ 682 clear_bit(10, (unsigned long *)&bank[4]); 683 } 684 if (c->x86 <= 17 && mce_bootlog < 0) { 685 /* 686 * Lots of broken BIOS around that don't clear them 687 * by default and leave crap in there. Don't log: 688 */ 689 mce_bootlog = 0; 690 } 691 /* 692 * Various K7s with broken bank 0 around. Always disable 693 * by default. 694 */ 695 if (c->x86 == 6) 696 bank[0] = 0; 697 } 698 699 if (c->x86_vendor == X86_VENDOR_INTEL) { 700 /* 701 * SDM documents that on family 6 bank 0 should not be written 702 * because it aliases to another special BIOS controlled 703 * register. 704 * But it's not aliased anymore on model 0x1a+ 705 * Don't ignore bank 0 completely because there could be a 706 * valid event later, merely don't write CTL0. 707 */ 708 709 if (c->x86 == 6 && c->x86_model < 0x1A) 710 __set_bit(0, &dont_init_banks); 711 } 712} 713 714static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 715{ 716 if (c->x86 != 5) 717 return; 718 switch (c->x86_vendor) { 719 case X86_VENDOR_INTEL: 720 if (mce_p5_enabled()) 721 intel_p5_mcheck_init(c); 722 break; 723 case X86_VENDOR_CENTAUR: 724 winchip_mcheck_init(c); 725 break; 726 } 727} 728 729static void mce_cpu_features(struct cpuinfo_x86 *c) 730{ 731 switch (c->x86_vendor) { 732 case X86_VENDOR_INTEL: 733 mce_intel_feature_init(c); 734 break; 735 case X86_VENDOR_AMD: 736 mce_amd_feature_init(c); 737 break; 738 default: 739 break; 740 } 741} 742 743static void mce_init_timer(void) 744{ 745 struct timer_list *t = &__get_cpu_var(mce_timer); 746 int *n = &__get_cpu_var(next_interval); 747 748 *n = check_interval * HZ; 749 if (!*n) 750 return; 751 setup_timer(t, mcheck_timer, smp_processor_id()); 752 t->expires = round_jiffies(jiffies + *n); 753 add_timer(t); 754} 755 756/* 757 * Called for each booted CPU to set up machine checks. 758 * Must be called with preempt off: 759 */ 760void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 761{ 762 if (mce_disabled) 763 return; 764 765 mce_ancient_init(c); 766 767 if (!mce_available(c)) 768 return; 769 770 if (mce_cap_init() < 0) { 771 mce_disabled = 1; 772 return; 773 } 774 mce_cpu_quirks(c); 775 776 machine_check_vector = do_machine_check; 777 778 mce_init(NULL); 779 mce_cpu_features(c); 780 mce_init_timer(); 781} 782 783/* 784 * Character device to read and clear the MCE log. 785 */ 786 787static DEFINE_SPINLOCK(mce_state_lock); 788static int open_count; /* #times opened */ 789static int open_exclu; /* already open exclusive? */ 790 791static int mce_open(struct inode *inode, struct file *file) 792{ 793 lock_kernel(); 794 spin_lock(&mce_state_lock); 795 796 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 797 spin_unlock(&mce_state_lock); 798 unlock_kernel(); 799 800 return -EBUSY; 801 } 802 803 if (file->f_flags & O_EXCL) 804 open_exclu = 1; 805 open_count++; 806 807 spin_unlock(&mce_state_lock); 808 unlock_kernel(); 809 810 return nonseekable_open(inode, file); 811} 812 813static int mce_release(struct inode *inode, struct file *file) 814{ 815 spin_lock(&mce_state_lock); 816 817 open_count--; 818 open_exclu = 0; 819 820 spin_unlock(&mce_state_lock); 821 822 return 0; 823} 824 825static void collect_tscs(void *data) 826{ 827 unsigned long *cpu_tsc = (unsigned long *)data; 828 829 rdtscll(cpu_tsc[smp_processor_id()]); 830} 831 832static DEFINE_MUTEX(mce_read_mutex); 833 834static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 835 loff_t *off) 836{ 837 char __user *buf = ubuf; 838 unsigned long *cpu_tsc; 839 unsigned prev, next; 840 int i, err; 841 842 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 843 if (!cpu_tsc) 844 return -ENOMEM; 845 846 mutex_lock(&mce_read_mutex); 847 next = rcu_dereference(mcelog.next); 848 849 /* Only supports full reads right now */ 850 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 851 mutex_unlock(&mce_read_mutex); 852 kfree(cpu_tsc); 853 854 return -EINVAL; 855 } 856 857 err = 0; 858 prev = 0; 859 do { 860 for (i = prev; i < next; i++) { 861 unsigned long start = jiffies; 862 863 while (!mcelog.entry[i].finished) { 864 if (time_after_eq(jiffies, start + 2)) { 865 memset(mcelog.entry + i, 0, 866 sizeof(struct mce)); 867 goto timeout; 868 } 869 cpu_relax(); 870 } 871 smp_rmb(); 872 err |= copy_to_user(buf, mcelog.entry + i, 873 sizeof(struct mce)); 874 buf += sizeof(struct mce); 875timeout: 876 ; 877 } 878 879 memset(mcelog.entry + prev, 0, 880 (next - prev) * sizeof(struct mce)); 881 prev = next; 882 next = cmpxchg(&mcelog.next, prev, 0); 883 } while (next != prev); 884 885 synchronize_sched(); 886 887 /* 888 * Collect entries that were still getting written before the 889 * synchronize. 890 */ 891 on_each_cpu(collect_tscs, cpu_tsc, 1); 892 893 for (i = next; i < MCE_LOG_LEN; i++) { 894 if (mcelog.entry[i].finished && 895 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 896 err |= copy_to_user(buf, mcelog.entry+i, 897 sizeof(struct mce)); 898 smp_rmb(); 899 buf += sizeof(struct mce); 900 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 901 } 902 } 903 mutex_unlock(&mce_read_mutex); 904 kfree(cpu_tsc); 905 906 return err ? -EFAULT : buf - ubuf; 907} 908 909static unsigned int mce_poll(struct file *file, poll_table *wait) 910{ 911 poll_wait(file, &mce_wait, wait); 912 if (rcu_dereference(mcelog.next)) 913 return POLLIN | POLLRDNORM; 914 return 0; 915} 916 917static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 918{ 919 int __user *p = (int __user *)arg; 920 921 if (!capable(CAP_SYS_ADMIN)) 922 return -EPERM; 923 924 switch (cmd) { 925 case MCE_GET_RECORD_LEN: 926 return put_user(sizeof(struct mce), p); 927 case MCE_GET_LOG_LEN: 928 return put_user(MCE_LOG_LEN, p); 929 case MCE_GETCLEAR_FLAGS: { 930 unsigned flags; 931 932 do { 933 flags = mcelog.flags; 934 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 935 936 return put_user(flags, p); 937 } 938 default: 939 return -ENOTTY; 940 } 941} 942 943/* Modified in mce-inject.c, so not static or const */ 944struct file_operations mce_chrdev_ops = { 945 .open = mce_open, 946 .release = mce_release, 947 .read = mce_read, 948 .poll = mce_poll, 949 .unlocked_ioctl = mce_ioctl, 950}; 951EXPORT_SYMBOL_GPL(mce_chrdev_ops); 952 953static struct miscdevice mce_log_device = { 954 MISC_MCELOG_MINOR, 955 "mcelog", 956 &mce_chrdev_ops, 957}; 958 959/* 960 * mce=off disables machine check 961 * mce=TOLERANCELEVEL (number, see above) 962 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 963 * mce=nobootlog Don't log MCEs from before booting. 964 */ 965static int __init mcheck_enable(char *str) 966{ 967 if (*str == 0) 968 enable_p5_mce(); 969 if (*str == '=') 970 str++; 971 if (!strcmp(str, "off")) 972 mce_disabled = 1; 973 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 974 mce_bootlog = (str[0] == 'b'); 975 else if (isdigit(str[0])) 976 get_option(&str, &tolerant); 977 else { 978 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 979 str); 980 return 0; 981 } 982 return 1; 983} 984__setup("mce", mcheck_enable); 985 986/* 987 * Sysfs support 988 */ 989 990/* 991 * Disable machine checks on suspend and shutdown. We can't really handle 992 * them later. 993 */ 994static int mce_disable(void) 995{ 996 int i; 997 998 for (i = 0; i < banks; i++) { 999 if (!skip_bank_init(i)) 1000 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1001 } 1002 return 0; 1003} 1004 1005static int mce_suspend(struct sys_device *dev, pm_message_t state) 1006{ 1007 return mce_disable(); 1008} 1009 1010static int mce_shutdown(struct sys_device *dev) 1011{ 1012 return mce_disable(); 1013} 1014 1015/* 1016 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1017 * Only one CPU is active at this time, the others get re-added later using 1018 * CPU hotplug: 1019 */ 1020static int mce_resume(struct sys_device *dev) 1021{ 1022 mce_init(NULL); 1023 mce_cpu_features(¤t_cpu_data); 1024 1025 return 0; 1026} 1027 1028static void mce_cpu_restart(void *data) 1029{ 1030 del_timer_sync(&__get_cpu_var(mce_timer)); 1031 if (mce_available(¤t_cpu_data)) 1032 mce_init(NULL); 1033 mce_init_timer(); 1034} 1035 1036/* Reinit MCEs after user configuration changes */ 1037static void mce_restart(void) 1038{ 1039 on_each_cpu(mce_cpu_restart, NULL, 1); 1040} 1041 1042static struct sysdev_class mce_sysclass = { 1043 .suspend = mce_suspend, 1044 .shutdown = mce_shutdown, 1045 .resume = mce_resume, 1046 .name = "machinecheck", 1047}; 1048 1049DEFINE_PER_CPU(struct sys_device, mce_dev); 1050 1051__cpuinitdata 1052void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1053 1054/* Why are there no generic functions for this? */ 1055#define ACCESSOR(name, var, start) \ 1056 static ssize_t show_ ## name(struct sys_device *s, \ 1057 struct sysdev_attribute *attr, \ 1058 char *buf) { \ 1059 return sprintf(buf, "%Lx\n", (u64)var); \ 1060 } \ 1061 static ssize_t set_ ## name(struct sys_device *s, \ 1062 struct sysdev_attribute *attr, \ 1063 const char *buf, size_t siz) { \ 1064 char *end; \ 1065 u64 new = simple_strtoull(buf, &end, 0); \ 1066 \ 1067 if (end == buf) \ 1068 return -EINVAL; \ 1069 var = new; \ 1070 start; \ 1071 \ 1072 return end-buf; \ 1073 } \ 1074 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 1075 1076static struct sysdev_attribute *bank_attrs; 1077 1078static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1079 char *buf) 1080{ 1081 u64 b = bank[attr - bank_attrs]; 1082 1083 return sprintf(buf, "%llx\n", b); 1084} 1085 1086static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1087 const char *buf, size_t siz) 1088{ 1089 char *end; 1090 u64 new = simple_strtoull(buf, &end, 0); 1091 1092 if (end == buf) 1093 return -EINVAL; 1094 1095 bank[attr - bank_attrs] = new; 1096 mce_restart(); 1097 1098 return end-buf; 1099} 1100 1101static ssize_t 1102show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1103{ 1104 strcpy(buf, trigger); 1105 strcat(buf, "\n"); 1106 return strlen(trigger) + 1; 1107} 1108 1109static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1110 const char *buf, size_t siz) 1111{ 1112 char *p; 1113 int len; 1114 1115 strncpy(trigger, buf, sizeof(trigger)); 1116 trigger[sizeof(trigger)-1] = 0; 1117 len = strlen(trigger); 1118 p = strchr(trigger, '\n'); 1119 1120 if (*p) 1121 *p = 0; 1122 1123 return len; 1124} 1125 1126static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1127static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1128 1129ACCESSOR(check_interval, check_interval, mce_restart()) 1130 1131static struct sysdev_attribute *mce_attrs[] = { 1132 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 1133 NULL 1134}; 1135 1136static cpumask_var_t mce_dev_initialized; 1137 1138/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1139static __cpuinit int mce_create_device(unsigned int cpu) 1140{ 1141 int err; 1142 int i; 1143 1144 if (!mce_available(&boot_cpu_data)) 1145 return -EIO; 1146 1147 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1148 per_cpu(mce_dev, cpu).id = cpu; 1149 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1150 1151 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1152 if (err) 1153 return err; 1154 1155 for (i = 0; mce_attrs[i]; i++) { 1156 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1157 if (err) 1158 goto error; 1159 } 1160 for (i = 0; i < banks; i++) { 1161 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1162 &bank_attrs[i]); 1163 if (err) 1164 goto error2; 1165 } 1166 cpumask_set_cpu(cpu, mce_dev_initialized); 1167 1168 return 0; 1169error2: 1170 while (--i >= 0) 1171 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1172error: 1173 while (--i >= 0) 1174 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1175 1176 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1177 1178 return err; 1179} 1180 1181static __cpuinit void mce_remove_device(unsigned int cpu) 1182{ 1183 int i; 1184 1185 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1186 return; 1187 1188 for (i = 0; mce_attrs[i]; i++) 1189 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1190 1191 for (i = 0; i < banks; i++) 1192 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1193 1194 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1195 cpumask_clear_cpu(cpu, mce_dev_initialized); 1196} 1197 1198/* Make sure there are no machine checks on offlined CPUs. */ 1199static void mce_disable_cpu(void *h) 1200{ 1201 unsigned long action = *(unsigned long *)h; 1202 int i; 1203 1204 if (!mce_available(¤t_cpu_data)) 1205 return; 1206 if (!(action & CPU_TASKS_FROZEN)) 1207 cmci_clear(); 1208 for (i = 0; i < banks; i++) { 1209 if (!skip_bank_init(i)) 1210 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1211 } 1212} 1213 1214static void mce_reenable_cpu(void *h) 1215{ 1216 unsigned long action = *(unsigned long *)h; 1217 int i; 1218 1219 if (!mce_available(¤t_cpu_data)) 1220 return; 1221 1222 if (!(action & CPU_TASKS_FROZEN)) 1223 cmci_reenable(); 1224 for (i = 0; i < banks; i++) { 1225 if (!skip_bank_init(i)) 1226 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1227 } 1228} 1229 1230/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1231static int __cpuinit 1232mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1233{ 1234 unsigned int cpu = (unsigned long)hcpu; 1235 struct timer_list *t = &per_cpu(mce_timer, cpu); 1236 1237 switch (action) { 1238 case CPU_ONLINE: 1239 case CPU_ONLINE_FROZEN: 1240 mce_create_device(cpu); 1241 if (threshold_cpu_callback) 1242 threshold_cpu_callback(action, cpu); 1243 break; 1244 case CPU_DEAD: 1245 case CPU_DEAD_FROZEN: 1246 if (threshold_cpu_callback) 1247 threshold_cpu_callback(action, cpu); 1248 mce_remove_device(cpu); 1249 break; 1250 case CPU_DOWN_PREPARE: 1251 case CPU_DOWN_PREPARE_FROZEN: 1252 del_timer_sync(t); 1253 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1254 break; 1255 case CPU_DOWN_FAILED: 1256 case CPU_DOWN_FAILED_FROZEN: 1257 t->expires = round_jiffies(jiffies + 1258 __get_cpu_var(next_interval)); 1259 add_timer_on(t, cpu); 1260 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1261 break; 1262 case CPU_POST_DEAD: 1263 /* intentionally ignoring frozen here */ 1264 cmci_rediscover(cpu); 1265 break; 1266 } 1267 return NOTIFY_OK; 1268} 1269 1270static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1271 .notifier_call = mce_cpu_callback, 1272}; 1273 1274static __init int mce_init_banks(void) 1275{ 1276 int i; 1277 1278 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1279 GFP_KERNEL); 1280 if (!bank_attrs) 1281 return -ENOMEM; 1282 1283 for (i = 0; i < banks; i++) { 1284 struct sysdev_attribute *a = &bank_attrs[i]; 1285 1286 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1287 if (!a->attr.name) 1288 goto nomem; 1289 1290 a->attr.mode = 0644; 1291 a->show = show_bank; 1292 a->store = set_bank; 1293 } 1294 return 0; 1295 1296nomem: 1297 while (--i >= 0) 1298 kfree(bank_attrs[i].attr.name); 1299 kfree(bank_attrs); 1300 bank_attrs = NULL; 1301 1302 return -ENOMEM; 1303} 1304 1305static __init int mce_init_device(void) 1306{ 1307 int err; 1308 int i = 0; 1309 1310 if (!mce_available(&boot_cpu_data)) 1311 return -EIO; 1312 1313 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1314 1315 err = mce_init_banks(); 1316 if (err) 1317 return err; 1318 1319 err = sysdev_class_register(&mce_sysclass); 1320 if (err) 1321 return err; 1322 1323 for_each_online_cpu(i) { 1324 err = mce_create_device(i); 1325 if (err) 1326 return err; 1327 } 1328 1329 register_hotcpu_notifier(&mce_cpu_notifier); 1330 misc_register(&mce_log_device); 1331 1332 return err; 1333} 1334 1335device_initcall(mce_init_device); 1336 1337#else /* CONFIG_X86_OLD_MCE: */ 1338 1339int nr_mce_banks; 1340EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1341 1342/* This has to be run for each processor */ 1343void mcheck_init(struct cpuinfo_x86 *c) 1344{ 1345 if (mce_disabled == 1) 1346 return; 1347 1348 switch (c->x86_vendor) { 1349 case X86_VENDOR_AMD: 1350 amd_mcheck_init(c); 1351 break; 1352 1353 case X86_VENDOR_INTEL: 1354 if (c->x86 == 5) 1355 intel_p5_mcheck_init(c); 1356 if (c->x86 == 6) 1357 intel_p6_mcheck_init(c); 1358 if (c->x86 == 15) 1359 intel_p4_mcheck_init(c); 1360 break; 1361 1362 case X86_VENDOR_CENTAUR: 1363 if (c->x86 == 5) 1364 winchip_mcheck_init(c); 1365 break; 1366 1367 default: 1368 break; 1369 } 1370 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1371} 1372 1373static int __init mcheck_enable(char *str) 1374{ 1375 mce_disabled = -1; 1376 return 1; 1377} 1378 1379__setup("mce", mcheck_enable); 1380 1381#endif /* CONFIG_X86_OLD_MCE */ 1382 1383/* 1384 * Old style boot options parsing. Only for compatibility. 1385 */ 1386static int __init mcheck_disable(char *str) 1387{ 1388 mce_disabled = 1; 1389 return 1; 1390} 1391__setup("nomce", mcheck_disable); 1392