mce.c revision b659294b779565c60f5e12ef505328e2b974eb62
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/smp_lock.h>
#include <linux/kobject.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/smp.h>

#include "mce.h"

#ifdef CONFIG_X86_64

#define MISC_MCELOG_MINOR	227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
        memset(m, 0, sizeof(struct mce));
        m->cpu = smp_processor_id();
        rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
        unsigned next, entry;

        atomic_inc(&mce_events);
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /*
                         * When the buffer fills up discard new entries.
                         * Assume that the earlier errors are the more
                         * interesting ones:
                         */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip: */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}
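/*
 * Reader-side contract for the buffer above: a slot is reserved by
 * advancing mcelog.next with cmpxchg() before its payload is copied in,
 * and ->finished is set only after the copy completes.  A reader
 * scanning [0, mcelog.next) can therefore observe a reserved but not
 * yet finished slot and must wait on (or skip) ->finished, which is
 * exactly what mce_read() further down does.
 */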
static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG "HARDWARE ERROR\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
                printk("ADDR %llx ", m->addr);
        if (m->misc)
                printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
               "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;

        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;

                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        panic(msg);
}

int mce_available(struct cpuinfo_x86 *c)
{
        if (mce_dont_init)
                return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->ip = regs->ip;
                m->cs = regs->cs;
        } else {
                m->ip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->ip);
                m->cs = 0;
        }
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        struct mce m;
        int i;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        for (i = 0; i < banks; i++) {
                if (!bank[i] || !test_bit(i, *b))
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                barrier();
                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if (!(m.status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Uncorrected events are handled by the exception handler
                 * when it is enabled. But when the exception is disabled log
                 * everything.
                 *
                 * TBD do the same check for MCI_STATUS_EN here?
                 */
                if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
                        continue;

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                if (!(flags & MCP_TIMESTAMP))
                        m.tsc = 0;
                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
                if (!(flags & MCP_DONTLOG)) {
                        mce_log(&m);
                        add_taint(TAINT_MACHINE_CHECK);
                }

                /*
                 * Clear state for this bank.
                 */
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */
}
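/*
 * Each MCA bank occupies four consecutive architectural MSRs starting
 * at MSR_IA32_MC0_CTL, which is why bank i is addressed with a *4
 * stride above and below:
 *
 *      MSR_IA32_MC0_CTL    + i*4       bank enable mask
 *      MSR_IA32_MC0_STATUS + i*4       error status
 *      MSR_IA32_MC0_ADDR   + i*4       error address (if MCI_STATUS_ADDRV)
 *      MSR_IA32_MC0_MISC   + i*4       auxiliary info (if MCI_STATUS_MISCV)
 */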
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't
 * even think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
        struct mce m, panicm;
        int panicm_found = 0;
        u64 mcestart = 0;
        int i;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);

        atomic_inc(&mce_entry);

        if (notify_die(DIE_NMI, "machine check", regs, error_code,
                       18, SIGKILL) == NOTIFY_STOP)
                goto out2;
        if (!banks)
                goto out2;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);

        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                no_way_out = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                __clear_bit(i, toclear);
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                /*
                 * Non-uncorrected errors are handled by
                 * machine_check_poll(). Leave them alone.
                 */
                if ((m.status & MCI_STATUS_UC) == 0)
                        continue;

                /*
                 * Set taint even when machine check was not enabled.
                 */
                add_taint(TAINT_MACHINE_CHECK);

                __set_bit(i, toclear);

                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble.  If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC) {
                                if (tolerant < 1 ||
                                    m.status & MCI_STATUS_OVER)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
                } else {
                        /*
                         * Machine check event was not enabled. Clear, but
                         * ignore.
                         */
                        continue;
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                mce_log(&m);

                /*
                 * Did this bank cause the exception?
                 *
                 * Assume that the bank with uncorrectable errors did it,
                 * and that there is only a single one:
                 */
                if ((m.status & MCI_STATUS_UC) &&
                    (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }
        }

        /*
         * If we didn't find an uncorrectable error, pick
         * the last one (shouldn't happen, just being safe).
         */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         * has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, mcestart);

        /*
         * If the error seems to be unrecoverable, something should be
         * done.  Try to kill as little as possible.  If we can kill just
         * one task, do that.  If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.ip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS.  Otherwise, panic if tolerance is low.
                 *
                 * force_sig() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        force_sig(SIGBUS, current);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check",
                                  &panicm, mcestart);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++) {
                if (test_bit(i, toclear))
                        wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
}
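/*
 * Disposition summary for do_machine_check() above, as read from the
 * code (tolerant levels are documented at the top of this file):
 *
 *      no_way_out && tolerant < 3                      panic
 *      kill_it && tolerant < 3, user space fault       SIGBUS to current
 *      kill_it && tolerant < 3, kernel fault,
 *              panic_on_oops || tolerant < 2           panic
 *      anything else                                   log only; userspace
 *                                                      notified via
 *                                                      TIF_MCE_NOTIFY
 */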
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
        struct timer_list *t = &per_cpu(mce_timer, data);
        int *n;

        WARN_ON(smp_processor_id() != data);

        if (mce_available(&current_cpu_data)) {
                machine_check_poll(MCP_TIMESTAMP,
                                   &__get_cpu_var(mce_poll_banks));
        }

        /*
         * Alert userspace if needed.  If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
        n = &__get_cpu_var(next_interval);
        if (mce_notify_user()) {
                *n = max(*n/2, HZ/100);
        } else {
                *n = min(*n*2,
                         (int)round_jiffies_relative(check_interval*HZ));
        }

        t->expires = jiffies + *n;
        add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
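/*
 * The trigger program cannot be exec'd directly from the notification
 * path: mce_notify_user() may run in interrupt context, while
 * call_usermodehelper() requires process context.  Hence the
 * mce_trigger_work indirection above, scheduled from mce_notify_user()
 * below.
 */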
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

        clear_thread_flag(TIF_MCE_NOTIFY);

        if (test_and_clear_bit(0, &notify_user)) {
                wake_up_interruptible(&mce_wait);

                /*
                 * There is no risk of missing notifications because
                 * work_pending is always cleared before the function is
                 * executed.
                 */
                if (trigger[0] && !work_pending(&mce_trigger_work))
                        schedule_work(&mce_trigger_work);

                if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");

                return 1;
        }
        return 0;
}

/* see if the idle task needs to notify userspace: */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action,
                  void *unused)
{
        /* IDLE_END should be safe - interrupts are back on */
        if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
                mce_notify_user();

        return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
        idle_notifier_register(&mce_idle_notifier);
        return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
        unsigned b;
        u64 cap;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        b = cap & 0xff;
        printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

        if (b > MAX_NR_BANKS) {
                printk(KERN_WARNING
                       "MCE: Using only %u machine check banks out of %u\n",
                       MAX_NR_BANKS, b);
                b = MAX_NR_BANKS;
        }

        /* Don't support asymmetric configurations today */
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!bank) {
                bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
                if (!bank)
                        return -ENOMEM;
                memset(bank, 0xff, banks * sizeof(u64));
        }

        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        return 0;
}

static void mce_init(void *dummy)
{
        mce_banks_t all_banks;
        u64 cap;
        int i;

        /*
         * Log the machine checks left over from the previous reset.
         */
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0),
                           &all_banks);

        set_in_cr4(X86_CR4_MCE);

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4) {
                        /*
                         * disable GART TBL walk error reporting, which
                         * trips off incorrectly with the IOMMU & 3ware
                         * & Cerberus:
                         */
                        clear_bit(10, (unsigned long *)&bank[4]);
                }
                if (c->x86 <= 17 && mce_bootlog < 0) {
                        /*
                         * Lots of broken BIOSes around that don't clear
                         * them by default and leave crap in there.
                         * Don't log:
                         */
                        mce_bootlog = 0;
                }
        }
}
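/*
 * Note on the bank[] array used throughout: each entry is the 64-bit
 * enable mask written to that bank's MCi_CTL MSR by mce_init(), with
 * all bits set by default (see the memset in mce_cap_init()).  Clearing
 * a bit, as the quirk above does or as a write to the sysfs bank%d
 * files below does, disables reporting of the corresponding error type
 * in that bank.
 */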
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}

static void mce_init_timer(void)
{
        struct timer_list *t = &__get_cpu_var(mce_timer);
        int *n = &__get_cpu_var(next_interval);

        *n = check_interval * HZ;
        if (!*n)
                return;
        setup_timer(t, mcheck_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + *n);
        add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        if (!mce_available(c))
                return;

        if (mce_cap_init() < 0) {
                mce_dont_init = 1;
                return;
        }
        mce_cpu_quirks(c);

        mce_init(NULL);
        mce_cpu_features(c);
        mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;  /* #times opened */
static int open_exclu;  /* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
        lock_kernel();
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);
                unlock_kernel();

                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);
        unlock_kernel();

        return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}

static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;

        rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);
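/*
 * mce_read() below drains the log in two phases: first it copies out
 * and clears all finished entries up to mcelog.next, resetting next to
 * 0 with cmpxchg() (looping if new entries arrived meanwhile); then,
 * after synchronize_sched() and a TSC snapshot from every CPU, it
 * sweeps up entries that were still being written when next was reset,
 * i.e. finished entries whose TSC predates the snapshot.
 */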
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
{
        char __user *buf = ubuf;
        unsigned long *cpu_tsc;
        unsigned prev, next;
        int i, err;

        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_read_mutex);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                mutex_unlock(&mce_read_mutex);
                kfree(cpu_tsc);

                return -EINVAL;
        }

        err = 0;
        prev = 0;
        do {
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;

                        while (!mcelog.entry[i].finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        memset(mcelog.entry + i, 0,
                                               sizeof(struct mce));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        smp_rmb();
                        err |= copy_to_user(buf, mcelog.entry + i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        synchronize_sched();

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);

        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);

        return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_dereference(mcelog.next))
                return POLLIN | POLLRDNORM;
        return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;

                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);

                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

static const struct file_operations mce_chrdev_ops = {
        .open           = mce_open,
        .release        = mce_release,
        .read           = mce_read,
        .poll           = mce_poll,
        .unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 1;
}
__setup("nomce", mcheck_disable);

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = (str[0] == 'b');
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else {
                printk(KERN_INFO
                       "mce= argument %s ignored. Please use /sys\n", str);
                return 0;
        }
        return 1;
}
__setup("mce=", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
        int i;

        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
        return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
        return mce_disable();
}
/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        mce_cpu_features(&current_cpu_data);

        return 0;
}

static void mce_cpu_restart(void *data)
{
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (mce_available(&current_cpu_data))
                mce_init(NULL);
        mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
        .suspend        = mce_suspend,
        .shutdown       = mce_shutdown,
        .resume         = mce_resume,
        .name           = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s,              \
                                     struct sysdev_attribute *attr,     \
                                     char *buf) {                       \
                return sprintf(buf, "%lx\n", (unsigned long)var);       \
        }                                                               \
        static ssize_t set_ ## name(struct sys_device *s,               \
                                    struct sysdev_attribute *attr,      \
                                    const char *buf, size_t siz) {      \
                char *end;                                              \
                unsigned long new = simple_strtoul(buf, &end, 0);       \
                                                                        \
                if (end == buf)                                         \
                        return -EINVAL;                                 \
                var = new;                                              \
                start;                                                  \
                                                                        \
                return end-buf;                                         \
        }                                                               \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
                         char *buf)
{
        u64 b = bank[attr - bank_attrs];

        return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
                        const char *buf, size_t siz)
{
        char *end;
        u64 new = simple_strtoull(buf, &end, 0);

        if (end == buf)
                return -EINVAL;

        bank[attr - bank_attrs] = new;
        mce_restart();

        return end-buf;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
        strcpy(buf, trigger);
        strcat(buf, "\n");
        return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;
        int len;

        strncpy(trigger, buf, sizeof(trigger));
        trigger[sizeof(trigger)-1] = 0;
        len = strlen(trigger);
        p = strchr(trigger, '\n');

        if (p)
                *p = 0;

        return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

ACCESSOR(check_interval, check_interval, mce_restart())

static struct sysdev_attribute *mce_attrs[] = {
        &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
        NULL
};

static cpumask_var_t mce_dev_initialized;
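/*
 * The resulting per-CPU sysfs layout, with paths as they appear under
 * the standard sysdev hierarchy:
 *
 *      /sys/devices/system/machinecheck/machinecheckN/tolerant
 *      /sys/devices/system/machinecheck/machinecheckN/check_interval
 *      /sys/devices/system/machinecheck/machinecheckN/trigger
 *      /sys/devices/system/machinecheck/machinecheckN/bank0 ... bank<banks-1>
 */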
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
        per_cpu(mce_dev, cpu).id  = cpu;
        per_cpu(mce_dev, cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(mce_dev, cpu));
        if (err)
                return err;

        for (i = 0; mce_attrs[i]; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu),
                                         mce_attrs[i]);
                if (err)
                        goto error;
        }
        for (i = 0; i < banks; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu),
                                         &bank_attrs[i]);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_dev_initialized);

        return 0;
error2:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));

        return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
        int i;

        if (!cpumask_test_cpu(cpu, mce_dev_initialized))
                return;

        for (i = 0; mce_attrs[i]; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        for (i = 0; i < banks; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));
        cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
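/*
 * The CPU_TASKS_FROZEN checks above mean cmci_clear()/cmci_reenable()
 * are skipped when a CPU goes down or comes back as part of
 * suspend/resume rather than a real hotplug event; the intent appears
 * to be to avoid reshuffling CMCI bank ownership across a suspend
 * cycle.
 */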
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies +
                                           __get_cpu_var(next_interval));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
        case CPU_POST_DEAD:
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
        int i;

        bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
                             GFP_KERNEL);
        if (!bank_attrs)
                return -ENOMEM;

        for (i = 0; i < banks; i++) {
                struct sysdev_attribute *a = &bank_attrs[i];

                a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
                if (!a->attr.name)
                        goto nomem;

                a->attr.mode = 0644;
                a->show      = show_bank;
                a->store     = set_bank;
        }
        return 0;

nomem:
        while (--i >= 0)
                kfree(bank_attrs[i].attr.name);
        kfree(bank_attrs);
        bank_attrs = NULL;

        return -ENOMEM;
}

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

        err = mce_init_banks();
        if (err)
                return err;

        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_create_device(i);
                if (err)
                        return err;
        }

        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);

        return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_32: */

int mce_disabled;

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);        /* non-fatal.o */

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
               smp_processor_id());
}
/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
        if (mce_disabled == 1)
                return;

        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                amd_mcheck_init(c);
                break;

        case X86_VENDOR_INTEL:
                if (c->x86 == 5)
                        intel_p5_mcheck_init(c);
                if (c->x86 == 6)
                        intel_p6_mcheck_init(c);
                if (c->x86 == 15)
                        intel_p4_mcheck_init(c);
                break;

        case X86_VENDOR_CENTAUR:
                if (c->x86 == 5)
                        winchip_mcheck_init(c);
                break;

        default:
                break;
        }
        printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_disable(char *str)
{
        mce_disabled = 1;
        return 1;
}

static int __init mcheck_enable(char *str)
{
        mce_disabled = -1;
        return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_32 */