mce.c revision 06b7a7a5ec917761969444fee967c43868a76468

/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/smp_lock.h>
#include <linux/kobject.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/smp.h>

#include "mce.h"

#ifdef CONFIG_X86_64

#define MISC_MCELOG_MINOR 227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static unsigned long dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
        return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
        memset(m, 0, sizeof(struct mce));
        m->cpu = smp_processor_id();
        rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
        unsigned next, entry;

        atomic_inc(&mce_events);
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /*
                         * When the buffer fills up discard new entries.
                         * Assume that the earlier errors are the more
                         * interesting ones:
                         */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip: */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}
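
/*
 * The cmpxchg() above implements a simple reservation protocol: a writer
 * claims slot 'entry' by advancing mcelog.next, fills the entry, and only
 * then sets ->finished. A reader therefore must not trust an entry until
 * ->finished is set, and needs a read barrier before copying the payload;
 * mce_read() below follows exactly that rule.
 */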

static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG "HARDWARE ERROR\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
                printk("ADDR %llx ", m->addr);
        if (m->misc)
                printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
                          "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, u64 start)
{
        int i;

        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                u64 tsc = mcelog.entry[i].tsc;

                if ((s64)(tsc - start) < 0)
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        panic(msg);
}

int mce_available(struct cpuinfo_x86 *c)
{
        if (mce_dont_init)
                return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->ip = regs->ip;
                m->cs = regs->cs;
        } else {
                m->ip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->ip);
                m->cs = 0;
        }
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        struct mce m;
        int i;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        for (i = 0; i < banks; i++) {
                if (!bank[i] || !test_bit(i, *b))
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                barrier();
                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if (!(m.status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Uncorrected events are handled by the exception handler
                 * when it is enabled. But when the exception is disabled log
                 * everything.
                 *
                 * TBD do the same check for MCI_STATUS_EN here?
                 */
                if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
                        continue;

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                if (!(flags & MCP_TIMESTAMP))
                        m.tsc = 0;
                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
                if (!(flags & MCP_DONTLOG)) {
                        mce_log(&m);
                        add_taint(TAINT_MACHINE_CHECK);
                }

                /*
                 * Clear state for this bank.
                 */
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */
}
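
/*
 * A note on the MSR arithmetic used throughout this file: the architectural
 * machine check bank registers are laid out four MSRs per bank, starting at
 * MSR_IA32_MC0_CTL (0x400). Bank i therefore has CTL at MC0_CTL + 4*i,
 * STATUS at MC0_STATUS + 4*i, ADDR at MC0_ADDR + 4*i and MISC at
 * MC0_MISC + 4*i, which is where the "+ i*4" stride comes from.
 */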

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
        struct mce m, panicm;
        int panicm_found = 0;
        u64 mcestart = 0;
        int i;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE. If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);

        atomic_inc(&mce_entry);

        if (notify_die(DIE_NMI, "machine check", regs, error_code,
                       18, SIGKILL) == NOTIFY_STOP)
                goto out2;
        if (!banks)
                goto out2;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);

        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                no_way_out = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                __clear_bit(i, toclear);
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                /*
                 * Corrected errors are handled by machine_check_poll().
                 * Leave them alone.
                 */
                if ((m.status & MCI_STATUS_UC) == 0)
                        continue;

                /*
                 * Set taint even when machine check was not enabled.
                 */
                add_taint(TAINT_MACHINE_CHECK);

                __set_bit(i, toclear);

                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble. If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC) {
                                if (tolerant < 1 || m.status & MCI_STATUS_OVER)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
                } else {
                        /*
                         * Machine check event was not enabled. Clear, but
                         * ignore.
                         */
                        continue;
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                mce_log(&m);

                /*
                 * Did this bank cause the exception?
                 *
                 * Assume that the bank with uncorrectable errors did it,
                 * and that there is only a single one:
                 */
                if ((m.status & MCI_STATUS_UC) &&
                    (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }
        }

        /*
         * If we didn't find an uncorrectable error, pick
         * the last one (shouldn't happen, just being safe).
         */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         * has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, mcestart);

        /*
         * If the error seems to be unrecoverable, something should be
         * done. Try to kill as little as possible. If we can kill just
         * one task, do that. If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.ip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS. Otherwise, panic if tolerance is low.
                 *
                 * force_sig() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        force_sig(SIGBUS, current);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check",
                                  &panicm, mcestart);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++) {
                if (test_bit(i, toclear))
                        wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
        struct timer_list *t = &per_cpu(mce_timer, data);
        int *n;

        WARN_ON(smp_processor_id() != data);

        if (mce_available(&current_cpu_data)) {
                machine_check_poll(MCP_TIMESTAMP,
                                   &__get_cpu_var(mce_poll_banks));
        }

        /*
         * Alert userspace if needed. If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
        n = &__get_cpu_var(next_interval);
        if (mce_notify_user())
                *n = max(*n/2, HZ/100);
        else
                *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

        t->expires = jiffies + *n;
        add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
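
/*
 * The trigger program (configured through the sysfs "trigger" file below)
 * is launched from a workqueue rather than directly, so the usermode helper
 * always starts from process context even though mce_notify_user() may run
 * from interrupt context.
 */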

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

        clear_thread_flag(TIF_MCE_NOTIFY);

        if (test_and_clear_bit(0, &notify_user)) {
                wake_up_interruptible(&mce_wait);

                /*
                 * There is no risk of missing notifications because
                 * work_pending is always cleared before the function is
                 * executed.
                 */
                if (trigger[0] && !work_pending(&mce_trigger_work))
                        schedule_work(&mce_trigger_work);

                if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");

                return 1;
        }
        return 0;
}

/* see if the idle task needs to notify userspace: */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action,
                  void *unused)
{
        /* IDLE_END should be safe - interrupts are back on */
        if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
                mce_notify_user();

        return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
        idle_notifier_register(&mce_idle_notifier);
        return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
        unsigned b;
        u64 cap;

        rdmsrl(MSR_IA32_MCG_CAP, cap);

        b = cap & MCG_BANKCNT_MASK;
        printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

        if (b > MAX_NR_BANKS) {
                printk(KERN_WARNING
                       "MCE: Using only %u machine check banks out of %u\n",
                       MAX_NR_BANKS, b);
                b = MAX_NR_BANKS;
        }

        /* Don't support asymmetric configurations today */
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!bank) {
                bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
                if (!bank)
                        return -ENOMEM;
                memset(bank, 0xff, banks * sizeof(u64));
        }

        /* Use accurate RIP reporting if available. */
        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        return 0;
}

static void mce_init(void *dummy)
{
        mce_banks_t all_banks;
        u64 cap;
        int i;

        /*
         * Log the machine checks left over from the previous reset.
         */
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

        set_in_cr4(X86_CR4_MCE);

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                if (skip_bank_init(i))
                        continue;
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}
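
/*
 * bank[i] is the MCi_CTL enable mask written by mce_init() above: each set
 * bit enables reporting of one error type in that bank. The default of all
 * ones enables everything; the quirks below and the sysfs bankN files can
 * clear bits to mask out known-bogus error sources.
 */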

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4) {
                        /*
                         * disable GART TBL walk error reporting, which
                         * trips off incorrectly with the IOMMU & 3ware
                         * & Cerberus:
                         */
                        clear_bit(10, (unsigned long *)&bank[4]);
                }
                if (c->x86 <= 17 && mce_bootlog < 0) {
                        /*
                         * Lots of broken BIOSes around that don't clear them
                         * by default and leave crap in there. Don't log:
                         */
                        mce_bootlog = 0;
                }
        }

        if (c->x86_vendor == X86_VENDOR_INTEL) {
                /*
                 * SDM documents that on family 6 bank 0 should not be written
                 * because it aliases to another special BIOS controlled
                 * register.
                 * But it's not aliased anymore on model 0x1a+
                 * Don't ignore bank 0 completely because there could be a
                 * valid event later, merely don't write CTL0.
                 */
                if (c->x86 == 6 && c->x86_model < 0x1A)
                        __set_bit(0, &dont_init_banks);
        }
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}

static void mce_init_timer(void)
{
        struct timer_list *t = &__get_cpu_var(mce_timer);
        int *n = &__get_cpu_var(next_interval);

        *n = check_interval * HZ;
        if (!*n)
                return;
        setup_timer(t, mcheck_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + *n);
        add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        if (!mce_available(c))
                return;

        if (mce_cap_init() < 0) {
                mce_dont_init = 1;
                return;
        }
        mce_cpu_quirks(c);

        mce_init(NULL);
        mce_cpu_features(c);
        mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;  /* #times opened */
static int open_exclu;  /* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
        lock_kernel();
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);
                unlock_kernel();

                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);
        unlock_kernel();

        return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}

static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;

        rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);
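
/*
 * Reading the log is a two-pass operation: first copy out all entries that
 * were complete when mcelog.next was sampled, then, after a
 * synchronize_sched() and a TSC snapshot on every CPU, collect stragglers
 * that a racing mce_log() was still in the middle of writing. Entries are
 * cleared as they are consumed.
 */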

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
{
        char __user *buf = ubuf;
        unsigned long *cpu_tsc;
        unsigned prev, next;
        int i, err;

        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_read_mutex);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                mutex_unlock(&mce_read_mutex);
                kfree(cpu_tsc);

                return -EINVAL;
        }

        err = 0;
        prev = 0;
        do {
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;

                        while (!mcelog.entry[i].finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        memset(mcelog.entry + i, 0,
                                               sizeof(struct mce));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        smp_rmb();
                        err |= copy_to_user(buf, mcelog.entry + i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        synchronize_sched();

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);

        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);

        return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_dereference(mcelog.next))
                return POLLIN | POLLRDNORM;
        return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;

                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);

                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

static const struct file_operations mce_chrdev_ops = {
        .open           = mce_open,
        .release        = mce_release,
        .read           = mce_read,
        .poll           = mce_poll,
        .unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
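
/*
 * A minimal sketch of a user-space consumer of this device (roughly the
 * protocol mcelog(8) speaks, not a verbatim excerpt from it):
 *
 *      int fd = open("/dev/mcelog", O_RDONLY);
 *      int recordlen, loglen;
 *      ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *      ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *      buf = malloc((size_t)recordlen * loglen);
 *      n = read(fd, buf, (size_t)recordlen * loglen);
 *
 * mce_read() only supports full-sized reads and drains the kernel buffer as
 * a side effect, so n/recordlen gives the number of records returned.
 */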

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 1;
}
__setup("nomce", mcheck_disable);

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = (str[0] == 'b');
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else {
                printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
                       str);
                return 0;
        }
        return 1;
}
__setup("mce=", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
        int i;

        for (i = 0; i < banks; i++) {
                if (!skip_bank_init(i))
                        wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        }
        return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
        return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
        return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        mce_cpu_features(&current_cpu_data);

        return 0;
}

static void mce_cpu_restart(void *data)
{
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (mce_available(&current_cpu_data))
                mce_init(NULL);
        mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
        .suspend        = mce_suspend,
        .shutdown       = mce_shutdown,
        .resume         = mce_resume,
        .name           = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s,              \
                                     struct sysdev_attribute *attr,     \
                                     char *buf) {                       \
                return sprintf(buf, "%Lx\n", (u64)var);                 \
        }                                                               \
        static ssize_t set_ ## name(struct sys_device *s,               \
                                    struct sysdev_attribute *attr,      \
                                    const char *buf, size_t siz) {      \
                char *end;                                              \
                u64 new = simple_strtoull(buf, &end, 0);                \
                                                                        \
                if (end == buf)                                         \
                        return -EINVAL;                                 \
                var = new;                                              \
                start;                                                  \
                                                                        \
                return end-buf;                                         \
        }                                                               \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
                         char *buf)
{
        u64 b = bank[attr - bank_attrs];

        return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
                        const char *buf, size_t siz)
{
        char *end;
        u64 new = simple_strtoull(buf, &end, 0);

        if (end == buf)
                return -EINVAL;

        bank[attr - bank_attrs] = new;
        mce_restart();

        return end-buf;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
        strcpy(buf, trigger);
        strcat(buf, "\n");
        return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;
        int len;

        strncpy(trigger, buf, sizeof(trigger));
        trigger[sizeof(trigger)-1] = 0;
        len = strlen(trigger);
        p = strchr(trigger, '\n');

        if (p)
                *p = 0;

        return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

ACCESSOR(check_interval, check_interval, mce_restart())

static struct sysdev_attribute *mce_attrs[] = {
        &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
        NULL
};

static cpumask_var_t mce_dev_initialized;
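
/*
 * The mce_sysclass class and the per-CPU devices registered below appear in
 * sysfs as /sys/devices/system/machinecheck/machinecheck<cpu>/, holding the
 * tolerant, check_interval and trigger attributes plus one bank<N> control
 * file per MCA bank.
 */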

/* Per CPU sysdev init. All of the CPUs still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
        per_cpu(mce_dev, cpu).id = cpu;
        per_cpu(mce_dev, cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(mce_dev, cpu));
        if (err)
                return err;

        for (i = 0; mce_attrs[i]; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
                if (err)
                        goto error;
        }
        for (i = 0; i < banks; i++) {
                err = sysdev_create_file(&per_cpu(mce_dev, cpu),
                                         &bank_attrs[i]);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_dev_initialized);

        return 0;
error2:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));

        return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
        int i;

        if (!cpumask_test_cpu(cpu, mce_dev_initialized))
                return;

        for (i = 0; mce_attrs[i]; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

        for (i = 0; i < banks; i++)
                sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

        sysdev_unregister(&per_cpu(mce_dev, cpu));
        cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++) {
                if (!skip_bank_init(i))
                        wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        }
}

static void mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++) {
                if (!skip_bank_init(i))
                        wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
        }
}

/* Get notified when a CPU comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies +
                                           __get_cpu_var(next_interval));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
        case CPU_POST_DEAD:
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
        int i;

        bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
                             GFP_KERNEL);
        if (!bank_attrs)
                return -ENOMEM;

        for (i = 0; i < banks; i++) {
                struct sysdev_attribute *a = &bank_attrs[i];

                a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
                if (!a->attr.name)
                        goto nomem;

                a->attr.mode = 0644;
                a->show      = show_bank;
                a->store     = set_bank;
        }
        return 0;

nomem:
        while (--i >= 0)
                kfree(bank_attrs[i].attr.name);
        kfree(bank_attrs);
        bank_attrs = NULL;

        return -ENOMEM;
}

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

        err = mce_init_banks();
        if (err)
                return err;

        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_create_device(i);
                if (err)
                        return err;
        }

        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);

        return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_32: */

int mce_disabled;

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);        /* non-fatal.o */

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
               smp_processor_id());
}
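
/*
 * On 32-bit kernels the int 18 entry point dispatches through the
 * machine_check_vector pointer below, so the vendor-specific init code run
 * from mcheck_init() can install a handler matching the CPU (P5, P6, P4 or
 * WinChip). Until then, any machine check lands in
 * unexpected_machine_check() above.
 */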

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
        if (mce_disabled == 1)
                return;

        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                amd_mcheck_init(c);
                break;

        case X86_VENDOR_INTEL:
                if (c->x86 == 5)
                        intel_p5_mcheck_init(c);
                if (c->x86 == 6)
                        intel_p6_mcheck_init(c);
                if (c->x86 == 15)
                        intel_p4_mcheck_init(c);
                break;

        case X86_VENDOR_CENTAUR:
                if (c->x86 == 5)
                        winchip_mcheck_init(c);
                break;

        default:
                break;
        }
        printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_disable(char *str)
{
        mce_disabled = 1;
        return 1;
}

static int __init mcheck_enable(char *str)
{
        mce_disabled = -1;
        return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_32 */