mce.c revision de8a84d85ad8bb46d01d72ebc57030b95075603c
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/kobject.h> 17#include <linux/uaccess.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/ctype.h> 24#include <linux/sched.h> 25#include <linux/sysfs.h> 26#include <linux/types.h> 27#include <linux/init.h> 28#include <linux/kmod.h> 29#include <linux/poll.h> 30#include <linux/cpu.h> 31#include <linux/smp.h> 32#include <linux/fs.h> 33 34#include <asm/processor.h> 35#include <asm/idle.h> 36#include <asm/mce.h> 37#include <asm/msr.h> 38 39#include "mce.h" 40 41/* Handle unconfigured int18 (should never happen) */ 42static void unexpected_machine_check(struct pt_regs *regs, long error_code) 43{ 44 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 45 smp_processor_id()); 46} 47 48/* Call the installed machine check handler for this CPU setup. */ 49void (*machine_check_vector)(struct pt_regs *, long error_code) = 50 unexpected_machine_check; 51 52int mce_disabled; 53 54#ifdef CONFIG_X86_NEW_MCE 55 56#define MISC_MCELOG_MINOR 227 57 58atomic_t mce_entry; 59 60DEFINE_PER_CPU(unsigned, mce_exception_count); 61 62/* 63 * Tolerant levels: 64 * 0: always panic on uncorrected errors, log corrected errors 65 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 66 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 67 * 3: never panic or SIGBUS, log all errors (for testing only) 68 */ 69static int tolerant = 1; 70static int banks; 71static u64 *bank; 72static unsigned long notify_user; 73static int rip_msr; 74static int mce_bootlog = -1; 75 76static char trigger[128]; 77static char *trigger_argv[2] = { trigger, NULL }; 78 79static unsigned long dont_init_banks; 80 81static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 82 83/* MCA banks polled by the period polling timer for corrected events */ 84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 85 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 86}; 87 88static inline int skip_bank_init(int i) 89{ 90 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 91} 92 93/* Do initial initialization of a struct mce */ 94void mce_setup(struct mce *m) 95{ 96 memset(m, 0, sizeof(struct mce)); 97 m->cpu = m->extcpu = smp_processor_id(); 98 rdtscll(m->tsc); 99 /* We hope get_seconds stays lockless */ 100 m->time = get_seconds(); 101 m->cpuvendor = boot_cpu_data.x86_vendor; 102 m->cpuid = cpuid_eax(1); 103#ifdef CONFIG_SMP 104 m->socketid = cpu_data(m->extcpu).phys_proc_id; 105#endif 106 m->apicid = cpu_data(m->extcpu).initial_apicid; 107 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 108} 109 110DEFINE_PER_CPU(struct mce, injectm); 111EXPORT_PER_CPU_SYMBOL_GPL(injectm); 112 113/* 114 * Lockless MCE logging infrastructure. 115 * This avoids deadlocks on printk locks without having to break locks. Also 116 * separate MCEs from kernel messages to avoid bogus bug reports. 117 */ 118 119static struct mce_log mcelog = { 120 .signature = MCE_LOG_SIGNATURE, 121 .len = MCE_LOG_LEN, 122 .recordlen = sizeof(struct mce), 123}; 124 125void mce_log(struct mce *mce) 126{ 127 unsigned next, entry; 128 129 mce->finished = 0; 130 wmb(); 131 for (;;) { 132 entry = rcu_dereference(mcelog.next); 133 for (;;) { 134 /* 135 * When the buffer fills up discard new entries. 136 * Assume that the earlier errors are the more 137 * interesting ones: 138 */ 139 if (entry >= MCE_LOG_LEN) { 140 set_bit(MCE_OVERFLOW, 141 (unsigned long *)&mcelog.flags); 142 return; 143 } 144 /* Old left over entry. Skip: */ 145 if (mcelog.entry[entry].finished) { 146 entry++; 147 continue; 148 } 149 break; 150 } 151 smp_rmb(); 152 next = entry + 1; 153 if (cmpxchg(&mcelog.next, entry, next) == entry) 154 break; 155 } 156 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 157 wmb(); 158 mcelog.entry[entry].finished = 1; 159 wmb(); 160 161 set_bit(0, ¬ify_user); 162} 163 164static void print_mce(struct mce *m) 165{ 166 printk(KERN_EMERG "\n" 167 KERN_EMERG "HARDWARE ERROR\n" 168 KERN_EMERG 169 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 170 m->extcpu, m->mcgstatus, m->bank, m->status); 171 if (m->ip) { 172 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 173 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 174 m->cs, m->ip); 175 if (m->cs == __KERNEL_CS) 176 print_symbol("{%s}", m->ip); 177 printk("\n"); 178 } 179 printk(KERN_EMERG "TSC %llx ", m->tsc); 180 if (m->addr) 181 printk("ADDR %llx ", m->addr); 182 if (m->misc) 183 printk("MISC %llx ", m->misc); 184 printk("\n"); 185 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 186 m->cpuvendor, m->cpuid, m->time, m->socketid, 187 m->apicid); 188 printk(KERN_EMERG "This is not a software problem!\n"); 189 printk(KERN_EMERG "Run through mcelog --ascii to decode " 190 "and contact your hardware vendor\n"); 191} 192 193static void mce_panic(char *msg, struct mce *backup, u64 start) 194{ 195 int i; 196 197 bust_spinlocks(1); 198 console_verbose(); 199 for (i = 0; i < MCE_LOG_LEN; i++) { 200 u64 tsc = mcelog.entry[i].tsc; 201 202 if ((s64)(tsc - start) < 0) 203 continue; 204 print_mce(&mcelog.entry[i]); 205 if (backup && mcelog.entry[i].tsc == backup->tsc) 206 backup = NULL; 207 } 208 if (backup) 209 print_mce(backup); 210 panic(msg); 211} 212 213/* Support code for software error injection */ 214 215static int msr_to_offset(u32 msr) 216{ 217 unsigned bank = __get_cpu_var(injectm.bank); 218 if (msr == rip_msr) 219 return offsetof(struct mce, ip); 220 if (msr == MSR_IA32_MC0_STATUS + bank*4) 221 return offsetof(struct mce, status); 222 if (msr == MSR_IA32_MC0_ADDR + bank*4) 223 return offsetof(struct mce, addr); 224 if (msr == MSR_IA32_MC0_MISC + bank*4) 225 return offsetof(struct mce, misc); 226 if (msr == MSR_IA32_MCG_STATUS) 227 return offsetof(struct mce, mcgstatus); 228 return -1; 229} 230 231/* MSR access wrappers used for error injection */ 232static u64 mce_rdmsrl(u32 msr) 233{ 234 u64 v; 235 if (__get_cpu_var(injectm).finished) { 236 int offset = msr_to_offset(msr); 237 if (offset < 0) 238 return 0; 239 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 240 } 241 rdmsrl(msr, v); 242 return v; 243} 244 245static void mce_wrmsrl(u32 msr, u64 v) 246{ 247 if (__get_cpu_var(injectm).finished) { 248 int offset = msr_to_offset(msr); 249 if (offset >= 0) 250 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 251 return; 252 } 253 wrmsrl(msr, v); 254} 255 256int mce_available(struct cpuinfo_x86 *c) 257{ 258 if (mce_disabled) 259 return 0; 260 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 261} 262 263static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 264{ 265 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 266 m->ip = regs->ip; 267 m->cs = regs->cs; 268 } else { 269 m->ip = 0; 270 m->cs = 0; 271 } 272 if (rip_msr) { 273 /* Assume the RIP in the MSR is exact. Is this true? */ 274 m->mcgstatus |= MCG_STATUS_EIPV; 275 m->ip = mce_rdmsrl(rip_msr); 276 m->cs = 0; 277 } 278} 279 280DEFINE_PER_CPU(unsigned, mce_poll_count); 281 282/* 283 * Poll for corrected events or events that happened before reset. 284 * Those are just logged through /dev/mcelog. 285 * 286 * This is executed in standard interrupt context. 287 */ 288void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 289{ 290 struct mce m; 291 int i; 292 293 __get_cpu_var(mce_poll_count)++; 294 295 mce_setup(&m); 296 297 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 298 for (i = 0; i < banks; i++) { 299 if (!bank[i] || !test_bit(i, *b)) 300 continue; 301 302 m.misc = 0; 303 m.addr = 0; 304 m.bank = i; 305 m.tsc = 0; 306 307 barrier(); 308 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 309 if (!(m.status & MCI_STATUS_VAL)) 310 continue; 311 312 /* 313 * Uncorrected events are handled by the exception handler 314 * when it is enabled. But when the exception is disabled log 315 * everything. 316 * 317 * TBD do the same check for MCI_STATUS_EN here? 318 */ 319 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) 320 continue; 321 322 if (m.status & MCI_STATUS_MISCV) 323 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 324 if (m.status & MCI_STATUS_ADDRV) 325 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 326 327 if (!(flags & MCP_TIMESTAMP)) 328 m.tsc = 0; 329 /* 330 * Don't get the IP here because it's unlikely to 331 * have anything to do with the actual error location. 332 */ 333 if (!(flags & MCP_DONTLOG)) { 334 mce_log(&m); 335 add_taint(TAINT_MACHINE_CHECK); 336 } 337 338 /* 339 * Clear state for this bank. 340 */ 341 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 342 } 343 344 /* 345 * Don't clear MCG_STATUS here because it's only defined for 346 * exceptions. 347 */ 348 349 sync_core(); 350} 351EXPORT_SYMBOL_GPL(machine_check_poll); 352 353/* 354 * The actual machine check handler. This only handles real 355 * exceptions when something got corrupted coming in through int 18. 356 * 357 * This is executed in NMI context not subject to normal locking rules. This 358 * implies that most kernel services cannot be safely used. Don't even 359 * think about putting a printk in there! 360 */ 361void do_machine_check(struct pt_regs *regs, long error_code) 362{ 363 struct mce m, panicm; 364 int panicm_found = 0; 365 u64 mcestart = 0; 366 int i; 367 /* 368 * If no_way_out gets set, there is no safe way to recover from this 369 * MCE. If tolerant is cranked up, we'll try anyway. 370 */ 371 int no_way_out = 0; 372 /* 373 * If kill_it gets set, there might be a way to recover from this 374 * error. 375 */ 376 int kill_it = 0; 377 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 378 379 atomic_inc(&mce_entry); 380 381 __get_cpu_var(mce_exception_count)++; 382 383 if (notify_die(DIE_NMI, "machine check", regs, error_code, 384 18, SIGKILL) == NOTIFY_STOP) 385 goto out; 386 if (!banks) 387 goto out; 388 389 mce_setup(&m); 390 391 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 392 393 /* if the restart IP is not valid, we're done for */ 394 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 395 no_way_out = 1; 396 397 rdtscll(mcestart); 398 barrier(); 399 400 for (i = 0; i < banks; i++) { 401 __clear_bit(i, toclear); 402 if (!bank[i]) 403 continue; 404 405 m.misc = 0; 406 m.addr = 0; 407 m.bank = i; 408 409 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 410 if ((m.status & MCI_STATUS_VAL) == 0) 411 continue; 412 413 /* 414 * Non uncorrected errors are handled by machine_check_poll 415 * Leave them alone, unless this panics. 416 */ 417 if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) 418 continue; 419 420 /* 421 * Set taint even when machine check was not enabled. 422 */ 423 add_taint(TAINT_MACHINE_CHECK); 424 425 __set_bit(i, toclear); 426 427 if (m.status & MCI_STATUS_EN) { 428 /* if PCC was set, there's no way out */ 429 no_way_out |= !!(m.status & MCI_STATUS_PCC); 430 /* 431 * If this error was uncorrectable and there was 432 * an overflow, we're in trouble. If no overflow, 433 * we might get away with just killing a task. 434 */ 435 if (m.status & MCI_STATUS_UC) { 436 if (tolerant < 1 || m.status & MCI_STATUS_OVER) 437 no_way_out = 1; 438 kill_it = 1; 439 } 440 } else { 441 /* 442 * Machine check event was not enabled. Clear, but 443 * ignore. 444 */ 445 continue; 446 } 447 448 if (m.status & MCI_STATUS_MISCV) 449 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 450 if (m.status & MCI_STATUS_ADDRV) 451 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 452 453 mce_get_rip(&m, regs); 454 mce_log(&m); 455 456 /* 457 * Did this bank cause the exception? 458 * 459 * Assume that the bank with uncorrectable errors did it, 460 * and that there is only a single one: 461 */ 462 if ((m.status & MCI_STATUS_UC) && 463 (m.status & MCI_STATUS_EN)) { 464 panicm = m; 465 panicm_found = 1; 466 } 467 } 468 469 /* 470 * If we didn't find an uncorrectable error, pick 471 * the last one (shouldn't happen, just being safe). 472 */ 473 if (!panicm_found) 474 panicm = m; 475 476 /* 477 * If we have decided that we just CAN'T continue, and the user 478 * has not set tolerant to an insane level, give up and die. 479 */ 480 if (no_way_out && tolerant < 3) 481 mce_panic("Machine check", &panicm, mcestart); 482 483 /* 484 * If the error seems to be unrecoverable, something should be 485 * done. Try to kill as little as possible. If we can kill just 486 * one task, do that. If the user has set the tolerance very 487 * high, don't try to do anything at all. 488 */ 489 if (kill_it && tolerant < 3) { 490 int user_space = 0; 491 492 /* 493 * If the EIPV bit is set, it means the saved IP is the 494 * instruction which caused the MCE. 495 */ 496 if (m.mcgstatus & MCG_STATUS_EIPV) 497 user_space = panicm.ip && (panicm.cs & 3); 498 499 /* 500 * If we know that the error was in user space, send a 501 * SIGBUS. Otherwise, panic if tolerance is low. 502 * 503 * force_sig() takes an awful lot of locks and has a slight 504 * risk of deadlocking. 505 */ 506 if (user_space) { 507 force_sig(SIGBUS, current); 508 } else if (panic_on_oops || tolerant < 2) { 509 mce_panic("Uncorrected machine check", 510 &panicm, mcestart); 511 } 512 } 513 514 /* notify userspace ASAP */ 515 set_thread_flag(TIF_MCE_NOTIFY); 516 517 /* the last thing we do is clear state */ 518 for (i = 0; i < banks; i++) { 519 if (test_bit(i, toclear)) 520 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 521 } 522 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 523out: 524 atomic_dec(&mce_entry); 525 sync_core(); 526} 527EXPORT_SYMBOL_GPL(do_machine_check); 528 529#ifdef CONFIG_X86_MCE_INTEL 530/*** 531 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 532 * @cpu: The CPU on which the event occurred. 533 * @status: Event status information 534 * 535 * This function should be called by the thermal interrupt after the 536 * event has been processed and the decision was made to log the event 537 * further. 538 * 539 * The status parameter will be saved to the 'status' field of 'struct mce' 540 * and historically has been the register value of the 541 * MSR_IA32_THERMAL_STATUS (Intel) msr. 542 */ 543void mce_log_therm_throt_event(__u64 status) 544{ 545 struct mce m; 546 547 mce_setup(&m); 548 m.bank = MCE_THERMAL_BANK; 549 m.status = status; 550 mce_log(&m); 551} 552#endif /* CONFIG_X86_MCE_INTEL */ 553 554/* 555 * Periodic polling timer for "silent" machine check errors. If the 556 * poller finds an MCE, poll 2x faster. When the poller finds no more 557 * errors, poll 2x slower (up to check_interval seconds). 558 */ 559static int check_interval = 5 * 60; /* 5 minutes */ 560 561static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 562static DEFINE_PER_CPU(struct timer_list, mce_timer); 563 564static void mcheck_timer(unsigned long data) 565{ 566 struct timer_list *t = &per_cpu(mce_timer, data); 567 int *n; 568 569 WARN_ON(smp_processor_id() != data); 570 571 if (mce_available(¤t_cpu_data)) { 572 machine_check_poll(MCP_TIMESTAMP, 573 &__get_cpu_var(mce_poll_banks)); 574 } 575 576 /* 577 * Alert userspace if needed. If we logged an MCE, reduce the 578 * polling interval, otherwise increase the polling interval. 579 */ 580 n = &__get_cpu_var(next_interval); 581 if (mce_notify_user()) 582 *n = max(*n/2, HZ/100); 583 else 584 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 585 586 t->expires = jiffies + *n; 587 add_timer(t); 588} 589 590static void mce_do_trigger(struct work_struct *work) 591{ 592 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 593} 594 595static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 596 597/* 598 * Notify the user(s) about new machine check events. 599 * Can be called from interrupt context, but not from machine check/NMI 600 * context. 601 */ 602int mce_notify_user(void) 603{ 604 /* Not more than two messages every minute */ 605 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 606 607 clear_thread_flag(TIF_MCE_NOTIFY); 608 609 if (test_and_clear_bit(0, ¬ify_user)) { 610 wake_up_interruptible(&mce_wait); 611 612 /* 613 * There is no risk of missing notifications because 614 * work_pending is always cleared before the function is 615 * executed. 616 */ 617 if (trigger[0] && !work_pending(&mce_trigger_work)) 618 schedule_work(&mce_trigger_work); 619 620 if (__ratelimit(&ratelimit)) 621 printk(KERN_INFO "Machine check events logged\n"); 622 623 return 1; 624 } 625 return 0; 626} 627EXPORT_SYMBOL_GPL(mce_notify_user); 628 629/* 630 * Initialize Machine Checks for a CPU. 631 */ 632static int mce_cap_init(void) 633{ 634 unsigned b; 635 u64 cap; 636 637 rdmsrl(MSR_IA32_MCG_CAP, cap); 638 639 b = cap & MCG_BANKCNT_MASK; 640 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 641 642 if (b > MAX_NR_BANKS) { 643 printk(KERN_WARNING 644 "MCE: Using only %u machine check banks out of %u\n", 645 MAX_NR_BANKS, b); 646 b = MAX_NR_BANKS; 647 } 648 649 /* Don't support asymmetric configurations today */ 650 WARN_ON(banks != 0 && b != banks); 651 banks = b; 652 if (!bank) { 653 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 654 if (!bank) 655 return -ENOMEM; 656 memset(bank, 0xff, banks * sizeof(u64)); 657 } 658 659 /* Use accurate RIP reporting if available. */ 660 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 661 rip_msr = MSR_IA32_MCG_EIP; 662 663 return 0; 664} 665 666static void mce_init(void) 667{ 668 mce_banks_t all_banks; 669 u64 cap; 670 int i; 671 672 /* 673 * Log the machine checks left over from the previous reset. 674 */ 675 bitmap_fill(all_banks, MAX_NR_BANKS); 676 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 677 678 set_in_cr4(X86_CR4_MCE); 679 680 rdmsrl(MSR_IA32_MCG_CAP, cap); 681 if (cap & MCG_CTL_P) 682 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 683 684 for (i = 0; i < banks; i++) { 685 if (skip_bank_init(i)) 686 continue; 687 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 688 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 689 } 690} 691 692/* Add per CPU specific workarounds here */ 693static void mce_cpu_quirks(struct cpuinfo_x86 *c) 694{ 695 /* This should be disabled by the BIOS, but isn't always */ 696 if (c->x86_vendor == X86_VENDOR_AMD) { 697 if (c->x86 == 15 && banks > 4) { 698 /* 699 * disable GART TBL walk error reporting, which 700 * trips off incorrectly with the IOMMU & 3ware 701 * & Cerberus: 702 */ 703 clear_bit(10, (unsigned long *)&bank[4]); 704 } 705 if (c->x86 <= 17 && mce_bootlog < 0) { 706 /* 707 * Lots of broken BIOS around that don't clear them 708 * by default and leave crap in there. Don't log: 709 */ 710 mce_bootlog = 0; 711 } 712 /* 713 * Various K7s with broken bank 0 around. Always disable 714 * by default. 715 */ 716 if (c->x86 == 6) 717 bank[0] = 0; 718 } 719 720 if (c->x86_vendor == X86_VENDOR_INTEL) { 721 /* 722 * SDM documents that on family 6 bank 0 should not be written 723 * because it aliases to another special BIOS controlled 724 * register. 725 * But it's not aliased anymore on model 0x1a+ 726 * Don't ignore bank 0 completely because there could be a 727 * valid event later, merely don't write CTL0. 728 */ 729 730 if (c->x86 == 6 && c->x86_model < 0x1A) 731 __set_bit(0, &dont_init_banks); 732 } 733} 734 735static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 736{ 737 if (c->x86 != 5) 738 return; 739 switch (c->x86_vendor) { 740 case X86_VENDOR_INTEL: 741 if (mce_p5_enabled()) 742 intel_p5_mcheck_init(c); 743 break; 744 case X86_VENDOR_CENTAUR: 745 winchip_mcheck_init(c); 746 break; 747 } 748} 749 750static void mce_cpu_features(struct cpuinfo_x86 *c) 751{ 752 switch (c->x86_vendor) { 753 case X86_VENDOR_INTEL: 754 mce_intel_feature_init(c); 755 break; 756 case X86_VENDOR_AMD: 757 mce_amd_feature_init(c); 758 break; 759 default: 760 break; 761 } 762} 763 764static void mce_init_timer(void) 765{ 766 struct timer_list *t = &__get_cpu_var(mce_timer); 767 int *n = &__get_cpu_var(next_interval); 768 769 *n = check_interval * HZ; 770 if (!*n) 771 return; 772 setup_timer(t, mcheck_timer, smp_processor_id()); 773 t->expires = round_jiffies(jiffies + *n); 774 add_timer(t); 775} 776 777/* 778 * Called for each booted CPU to set up machine checks. 779 * Must be called with preempt off: 780 */ 781void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 782{ 783 if (mce_disabled) 784 return; 785 786 mce_ancient_init(c); 787 788 if (!mce_available(c)) 789 return; 790 791 if (mce_cap_init() < 0) { 792 mce_disabled = 1; 793 return; 794 } 795 mce_cpu_quirks(c); 796 797 machine_check_vector = do_machine_check; 798 799 mce_init(); 800 mce_cpu_features(c); 801 mce_init_timer(); 802} 803 804/* 805 * Character device to read and clear the MCE log. 806 */ 807 808static DEFINE_SPINLOCK(mce_state_lock); 809static int open_count; /* #times opened */ 810static int open_exclu; /* already open exclusive? */ 811 812static int mce_open(struct inode *inode, struct file *file) 813{ 814 spin_lock(&mce_state_lock); 815 816 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 817 spin_unlock(&mce_state_lock); 818 819 return -EBUSY; 820 } 821 822 if (file->f_flags & O_EXCL) 823 open_exclu = 1; 824 open_count++; 825 826 spin_unlock(&mce_state_lock); 827 828 return nonseekable_open(inode, file); 829} 830 831static int mce_release(struct inode *inode, struct file *file) 832{ 833 spin_lock(&mce_state_lock); 834 835 open_count--; 836 open_exclu = 0; 837 838 spin_unlock(&mce_state_lock); 839 840 return 0; 841} 842 843static void collect_tscs(void *data) 844{ 845 unsigned long *cpu_tsc = (unsigned long *)data; 846 847 rdtscll(cpu_tsc[smp_processor_id()]); 848} 849 850static DEFINE_MUTEX(mce_read_mutex); 851 852static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 853 loff_t *off) 854{ 855 char __user *buf = ubuf; 856 unsigned long *cpu_tsc; 857 unsigned prev, next; 858 int i, err; 859 860 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 861 if (!cpu_tsc) 862 return -ENOMEM; 863 864 mutex_lock(&mce_read_mutex); 865 next = rcu_dereference(mcelog.next); 866 867 /* Only supports full reads right now */ 868 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 869 mutex_unlock(&mce_read_mutex); 870 kfree(cpu_tsc); 871 872 return -EINVAL; 873 } 874 875 err = 0; 876 prev = 0; 877 do { 878 for (i = prev; i < next; i++) { 879 unsigned long start = jiffies; 880 881 while (!mcelog.entry[i].finished) { 882 if (time_after_eq(jiffies, start + 2)) { 883 memset(mcelog.entry + i, 0, 884 sizeof(struct mce)); 885 goto timeout; 886 } 887 cpu_relax(); 888 } 889 smp_rmb(); 890 err |= copy_to_user(buf, mcelog.entry + i, 891 sizeof(struct mce)); 892 buf += sizeof(struct mce); 893timeout: 894 ; 895 } 896 897 memset(mcelog.entry + prev, 0, 898 (next - prev) * sizeof(struct mce)); 899 prev = next; 900 next = cmpxchg(&mcelog.next, prev, 0); 901 } while (next != prev); 902 903 synchronize_sched(); 904 905 /* 906 * Collect entries that were still getting written before the 907 * synchronize. 908 */ 909 on_each_cpu(collect_tscs, cpu_tsc, 1); 910 911 for (i = next; i < MCE_LOG_LEN; i++) { 912 if (mcelog.entry[i].finished && 913 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 914 err |= copy_to_user(buf, mcelog.entry+i, 915 sizeof(struct mce)); 916 smp_rmb(); 917 buf += sizeof(struct mce); 918 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 919 } 920 } 921 mutex_unlock(&mce_read_mutex); 922 kfree(cpu_tsc); 923 924 return err ? -EFAULT : buf - ubuf; 925} 926 927static unsigned int mce_poll(struct file *file, poll_table *wait) 928{ 929 poll_wait(file, &mce_wait, wait); 930 if (rcu_dereference(mcelog.next)) 931 return POLLIN | POLLRDNORM; 932 return 0; 933} 934 935static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 936{ 937 int __user *p = (int __user *)arg; 938 939 if (!capable(CAP_SYS_ADMIN)) 940 return -EPERM; 941 942 switch (cmd) { 943 case MCE_GET_RECORD_LEN: 944 return put_user(sizeof(struct mce), p); 945 case MCE_GET_LOG_LEN: 946 return put_user(MCE_LOG_LEN, p); 947 case MCE_GETCLEAR_FLAGS: { 948 unsigned flags; 949 950 do { 951 flags = mcelog.flags; 952 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 953 954 return put_user(flags, p); 955 } 956 default: 957 return -ENOTTY; 958 } 959} 960 961/* Modified in mce-inject.c, so not static or const */ 962struct file_operations mce_chrdev_ops = { 963 .open = mce_open, 964 .release = mce_release, 965 .read = mce_read, 966 .poll = mce_poll, 967 .unlocked_ioctl = mce_ioctl, 968}; 969EXPORT_SYMBOL_GPL(mce_chrdev_ops); 970 971static struct miscdevice mce_log_device = { 972 MISC_MCELOG_MINOR, 973 "mcelog", 974 &mce_chrdev_ops, 975}; 976 977/* 978 * mce=off disables machine check 979 * mce=TOLERANCELEVEL (number, see above) 980 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 981 * mce=nobootlog Don't log MCEs from before booting. 982 */ 983static int __init mcheck_enable(char *str) 984{ 985 if (*str == 0) 986 enable_p5_mce(); 987 if (*str == '=') 988 str++; 989 if (!strcmp(str, "off")) 990 mce_disabled = 1; 991 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 992 mce_bootlog = (str[0] == 'b'); 993 else if (isdigit(str[0])) 994 get_option(&str, &tolerant); 995 else { 996 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 997 str); 998 return 0; 999 } 1000 return 1; 1001} 1002__setup("mce", mcheck_enable); 1003 1004/* 1005 * Sysfs support 1006 */ 1007 1008/* 1009 * Disable machine checks on suspend and shutdown. We can't really handle 1010 * them later. 1011 */ 1012static int mce_disable(void) 1013{ 1014 int i; 1015 1016 for (i = 0; i < banks; i++) { 1017 if (!skip_bank_init(i)) 1018 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1019 } 1020 return 0; 1021} 1022 1023static int mce_suspend(struct sys_device *dev, pm_message_t state) 1024{ 1025 return mce_disable(); 1026} 1027 1028static int mce_shutdown(struct sys_device *dev) 1029{ 1030 return mce_disable(); 1031} 1032 1033/* 1034 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1035 * Only one CPU is active at this time, the others get re-added later using 1036 * CPU hotplug: 1037 */ 1038static int mce_resume(struct sys_device *dev) 1039{ 1040 mce_init(); 1041 mce_cpu_features(¤t_cpu_data); 1042 1043 return 0; 1044} 1045 1046static void mce_cpu_restart(void *data) 1047{ 1048 del_timer_sync(&__get_cpu_var(mce_timer)); 1049 if (mce_available(¤t_cpu_data)) 1050 mce_init(); 1051 mce_init_timer(); 1052} 1053 1054/* Reinit MCEs after user configuration changes */ 1055static void mce_restart(void) 1056{ 1057 on_each_cpu(mce_cpu_restart, NULL, 1); 1058} 1059 1060static struct sysdev_class mce_sysclass = { 1061 .suspend = mce_suspend, 1062 .shutdown = mce_shutdown, 1063 .resume = mce_resume, 1064 .name = "machinecheck", 1065}; 1066 1067DEFINE_PER_CPU(struct sys_device, mce_dev); 1068 1069__cpuinitdata 1070void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1071 1072static struct sysdev_attribute *bank_attrs; 1073 1074static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1075 char *buf) 1076{ 1077 u64 b = bank[attr - bank_attrs]; 1078 1079 return sprintf(buf, "%llx\n", b); 1080} 1081 1082static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1083 const char *buf, size_t size) 1084{ 1085 u64 new; 1086 1087 if (strict_strtoull(buf, 0, &new) < 0) 1088 return -EINVAL; 1089 1090 bank[attr - bank_attrs] = new; 1091 mce_restart(); 1092 1093 return size; 1094} 1095 1096static ssize_t 1097show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1098{ 1099 strcpy(buf, trigger); 1100 strcat(buf, "\n"); 1101 return strlen(trigger) + 1; 1102} 1103 1104static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1105 const char *buf, size_t siz) 1106{ 1107 char *p; 1108 int len; 1109 1110 strncpy(trigger, buf, sizeof(trigger)); 1111 trigger[sizeof(trigger)-1] = 0; 1112 len = strlen(trigger); 1113 p = strchr(trigger, '\n'); 1114 1115 if (*p) 1116 *p = 0; 1117 1118 return len; 1119} 1120 1121static ssize_t store_int_with_restart(struct sys_device *s, 1122 struct sysdev_attribute *attr, 1123 const char *buf, size_t size) 1124{ 1125 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1126 mce_restart(); 1127 return ret; 1128} 1129 1130static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1131static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1132 1133static struct sysdev_ext_attribute attr_check_interval = { 1134 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1135 store_int_with_restart), 1136 &check_interval 1137}; 1138 1139static struct sysdev_attribute *mce_attrs[] = { 1140 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1141 NULL 1142}; 1143 1144static cpumask_var_t mce_dev_initialized; 1145 1146/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1147static __cpuinit int mce_create_device(unsigned int cpu) 1148{ 1149 int err; 1150 int i; 1151 1152 if (!mce_available(&boot_cpu_data)) 1153 return -EIO; 1154 1155 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1156 per_cpu(mce_dev, cpu).id = cpu; 1157 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1158 1159 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1160 if (err) 1161 return err; 1162 1163 for (i = 0; mce_attrs[i]; i++) { 1164 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1165 if (err) 1166 goto error; 1167 } 1168 for (i = 0; i < banks; i++) { 1169 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1170 &bank_attrs[i]); 1171 if (err) 1172 goto error2; 1173 } 1174 cpumask_set_cpu(cpu, mce_dev_initialized); 1175 1176 return 0; 1177error2: 1178 while (--i >= 0) 1179 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1180error: 1181 while (--i >= 0) 1182 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1183 1184 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1185 1186 return err; 1187} 1188 1189static __cpuinit void mce_remove_device(unsigned int cpu) 1190{ 1191 int i; 1192 1193 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1194 return; 1195 1196 for (i = 0; mce_attrs[i]; i++) 1197 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1198 1199 for (i = 0; i < banks; i++) 1200 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1201 1202 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1203 cpumask_clear_cpu(cpu, mce_dev_initialized); 1204} 1205 1206/* Make sure there are no machine checks on offlined CPUs. */ 1207static void mce_disable_cpu(void *h) 1208{ 1209 unsigned long action = *(unsigned long *)h; 1210 int i; 1211 1212 if (!mce_available(¤t_cpu_data)) 1213 return; 1214 if (!(action & CPU_TASKS_FROZEN)) 1215 cmci_clear(); 1216 for (i = 0; i < banks; i++) { 1217 if (!skip_bank_init(i)) 1218 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1219 } 1220} 1221 1222static void mce_reenable_cpu(void *h) 1223{ 1224 unsigned long action = *(unsigned long *)h; 1225 int i; 1226 1227 if (!mce_available(¤t_cpu_data)) 1228 return; 1229 1230 if (!(action & CPU_TASKS_FROZEN)) 1231 cmci_reenable(); 1232 for (i = 0; i < banks; i++) { 1233 if (!skip_bank_init(i)) 1234 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1235 } 1236} 1237 1238/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1239static int __cpuinit 1240mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1241{ 1242 unsigned int cpu = (unsigned long)hcpu; 1243 struct timer_list *t = &per_cpu(mce_timer, cpu); 1244 1245 switch (action) { 1246 case CPU_ONLINE: 1247 case CPU_ONLINE_FROZEN: 1248 mce_create_device(cpu); 1249 if (threshold_cpu_callback) 1250 threshold_cpu_callback(action, cpu); 1251 break; 1252 case CPU_DEAD: 1253 case CPU_DEAD_FROZEN: 1254 if (threshold_cpu_callback) 1255 threshold_cpu_callback(action, cpu); 1256 mce_remove_device(cpu); 1257 break; 1258 case CPU_DOWN_PREPARE: 1259 case CPU_DOWN_PREPARE_FROZEN: 1260 del_timer_sync(t); 1261 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1262 break; 1263 case CPU_DOWN_FAILED: 1264 case CPU_DOWN_FAILED_FROZEN: 1265 t->expires = round_jiffies(jiffies + 1266 __get_cpu_var(next_interval)); 1267 add_timer_on(t, cpu); 1268 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1269 break; 1270 case CPU_POST_DEAD: 1271 /* intentionally ignoring frozen here */ 1272 cmci_rediscover(cpu); 1273 break; 1274 } 1275 return NOTIFY_OK; 1276} 1277 1278static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1279 .notifier_call = mce_cpu_callback, 1280}; 1281 1282static __init int mce_init_banks(void) 1283{ 1284 int i; 1285 1286 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1287 GFP_KERNEL); 1288 if (!bank_attrs) 1289 return -ENOMEM; 1290 1291 for (i = 0; i < banks; i++) { 1292 struct sysdev_attribute *a = &bank_attrs[i]; 1293 1294 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1295 if (!a->attr.name) 1296 goto nomem; 1297 1298 a->attr.mode = 0644; 1299 a->show = show_bank; 1300 a->store = set_bank; 1301 } 1302 return 0; 1303 1304nomem: 1305 while (--i >= 0) 1306 kfree(bank_attrs[i].attr.name); 1307 kfree(bank_attrs); 1308 bank_attrs = NULL; 1309 1310 return -ENOMEM; 1311} 1312 1313static __init int mce_init_device(void) 1314{ 1315 int err; 1316 int i = 0; 1317 1318 if (!mce_available(&boot_cpu_data)) 1319 return -EIO; 1320 1321 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1322 1323 err = mce_init_banks(); 1324 if (err) 1325 return err; 1326 1327 err = sysdev_class_register(&mce_sysclass); 1328 if (err) 1329 return err; 1330 1331 for_each_online_cpu(i) { 1332 err = mce_create_device(i); 1333 if (err) 1334 return err; 1335 } 1336 1337 register_hotcpu_notifier(&mce_cpu_notifier); 1338 misc_register(&mce_log_device); 1339 1340 return err; 1341} 1342 1343device_initcall(mce_init_device); 1344 1345#else /* CONFIG_X86_OLD_MCE: */ 1346 1347int nr_mce_banks; 1348EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1349 1350/* This has to be run for each processor */ 1351void mcheck_init(struct cpuinfo_x86 *c) 1352{ 1353 if (mce_disabled == 1) 1354 return; 1355 1356 switch (c->x86_vendor) { 1357 case X86_VENDOR_AMD: 1358 amd_mcheck_init(c); 1359 break; 1360 1361 case X86_VENDOR_INTEL: 1362 if (c->x86 == 5) 1363 intel_p5_mcheck_init(c); 1364 if (c->x86 == 6) 1365 intel_p6_mcheck_init(c); 1366 if (c->x86 == 15) 1367 intel_p4_mcheck_init(c); 1368 break; 1369 1370 case X86_VENDOR_CENTAUR: 1371 if (c->x86 == 5) 1372 winchip_mcheck_init(c); 1373 break; 1374 1375 default: 1376 break; 1377 } 1378 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1379} 1380 1381static int __init mcheck_enable(char *str) 1382{ 1383 mce_disabled = -1; 1384 return 1; 1385} 1386 1387__setup("mce", mcheck_enable); 1388 1389#endif /* CONFIG_X86_OLD_MCE */ 1390 1391/* 1392 * Old style boot options parsing. Only for compatibility. 1393 */ 1394static int __init mcheck_disable(char *str) 1395{ 1396 mce_disabled = 1; 1397 return 1; 1398} 1399__setup("nomce", mcheck_disable); 1400