/* mce.c — revision 2e6f694fde0a7158590e121962ca2e3c06633528 */
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/smp_lock.h> 17#include <linux/kobject.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/ctype.h> 24#include <linux/sched.h> 25#include <linux/sysfs.h> 26#include <linux/types.h> 27#include <linux/init.h> 28#include <linux/kmod.h> 29#include <linux/poll.h> 30#include <linux/cpu.h> 31#include <linux/fs.h> 32 33#include <asm/processor.h> 34#include <asm/uaccess.h> 35#include <asm/idle.h> 36#include <asm/mce.h> 37#include <asm/msr.h> 38#include <asm/smp.h> 39 40#include "mce.h" 41 42#ifdef CONFIG_X86_64 43 44#define MISC_MCELOG_MINOR 227 45 46atomic_t mce_entry; 47 48static int mce_dont_init; 49 50/* 51 * Tolerant levels: 52 * 0: always panic on uncorrected errors, log corrected errors 53 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 54 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 55 * 3: never panic or SIGBUS, log all errors (for testing only) 56 */ 57static int tolerant = 1; 58static int banks; 59static u64 *bank; 60static unsigned long notify_user; 61static int rip_msr; 62static int mce_bootlog = -1; 63static atomic_t mce_events; 64 65static char trigger[128]; 66static char *trigger_argv[2] = { trigger, NULL }; 67 68static unsigned long dont_init_banks; 69 70static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 71 72/* MCA banks polled by the period polling timer for corrected events */ 73DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 74 [0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 75}; 76 77static inline int skip_bank_init(int i) 78{ 79 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 80} 81 82/* Do initial initialization of a struct mce */ 83void mce_setup(struct mce *m) 84{ 85 memset(m, 0, sizeof(struct mce)); 86 m->cpu = smp_processor_id(); 87 rdtscll(m->tsc); 88} 89 90/* 91 * Lockless MCE logging infrastructure. 92 * This avoids deadlocks on printk locks without having to break locks. Also 93 * separate MCEs from kernel messages to avoid bogus bug reports. 94 */ 95 96static struct mce_log mcelog = { 97 MCE_LOG_SIGNATURE, 98 MCE_LOG_LEN, 99}; 100 101void mce_log(struct mce *mce) 102{ 103 unsigned next, entry; 104 105 atomic_inc(&mce_events); 106 mce->finished = 0; 107 wmb(); 108 for (;;) { 109 entry = rcu_dereference(mcelog.next); 110 for (;;) { 111 /* 112 * When the buffer fills up discard new entries. 113 * Assume that the earlier errors are the more 114 * interesting ones: 115 */ 116 if (entry >= MCE_LOG_LEN) { 117 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); 118 return; 119 } 120 /* Old left over entry. Skip: */ 121 if (mcelog.entry[entry].finished) { 122 entry++; 123 continue; 124 } 125 break; 126 } 127 smp_rmb(); 128 next = entry + 1; 129 if (cmpxchg(&mcelog.next, entry, next) == entry) 130 break; 131 } 132 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 133 wmb(); 134 mcelog.entry[entry].finished = 1; 135 wmb(); 136 137 set_bit(0, ¬ify_user); 138} 139 140static void print_mce(struct mce *m) 141{ 142 printk(KERN_EMERG "\n" 143 KERN_EMERG "HARDWARE ERROR\n" 144 KERN_EMERG 145 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 146 m->cpu, m->mcgstatus, m->bank, m->status); 147 if (m->ip) { 148 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 149 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", 150 m->cs, m->ip); 151 if (m->cs == __KERNEL_CS) 152 print_symbol("{%s}", m->ip); 153 printk("\n"); 154 } 155 printk(KERN_EMERG "TSC %llx ", m->tsc); 156 if (m->addr) 157 printk("ADDR %llx ", m->addr); 158 if (m->misc) 159 printk("MISC %llx ", m->misc); 160 printk("\n"); 161 printk(KERN_EMERG "This is not a software problem!\n"); 162 printk(KERN_EMERG "Run through mcelog --ascii to decode " 163 "and contact your hardware vendor\n"); 164} 165 166static void mce_panic(char *msg, struct mce *backup, u64 start) 167{ 168 int i; 169 170 oops_begin(); 171 for (i = 0; i < MCE_LOG_LEN; i++) { 172 u64 tsc = mcelog.entry[i].tsc; 173 174 if ((s64)(tsc - start) < 0) 175 continue; 176 print_mce(&mcelog.entry[i]); 177 if (backup && mcelog.entry[i].tsc == backup->tsc) 178 backup = NULL; 179 } 180 if (backup) 181 print_mce(backup); 182 panic(msg); 183} 184 185int mce_available(struct cpuinfo_x86 *c) 186{ 187 if (mce_dont_init) 188 return 0; 189 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 190} 191 192static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 193{ 194 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 195 m->ip = regs->ip; 196 m->cs = regs->cs; 197 } else { 198 m->ip = 0; 199 m->cs = 0; 200 } 201 if (rip_msr) { 202 /* Assume the RIP in the MSR is exact. Is this true? */ 203 m->mcgstatus |= MCG_STATUS_EIPV; 204 rdmsrl(rip_msr, m->ip); 205 m->cs = 0; 206 } 207} 208 209/* 210 * Poll for corrected events or events that happened before reset. 211 * Those are just logged through /dev/mcelog. 212 * 213 * This is executed in standard interrupt context. 
214 */ 215void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 216{ 217 struct mce m; 218 int i; 219 220 mce_setup(&m); 221 222 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 223 for (i = 0; i < banks; i++) { 224 if (!bank[i] || !test_bit(i, *b)) 225 continue; 226 227 m.misc = 0; 228 m.addr = 0; 229 m.bank = i; 230 m.tsc = 0; 231 232 barrier(); 233 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 234 if (!(m.status & MCI_STATUS_VAL)) 235 continue; 236 237 /* 238 * Uncorrected events are handled by the exception handler 239 * when it is enabled. But when the exception is disabled log 240 * everything. 241 * 242 * TBD do the same check for MCI_STATUS_EN here? 243 */ 244 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) 245 continue; 246 247 if (m.status & MCI_STATUS_MISCV) 248 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); 249 if (m.status & MCI_STATUS_ADDRV) 250 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 251 252 if (!(flags & MCP_TIMESTAMP)) 253 m.tsc = 0; 254 /* 255 * Don't get the IP here because it's unlikely to 256 * have anything to do with the actual error location. 257 */ 258 if (!(flags & MCP_DONTLOG)) { 259 mce_log(&m); 260 add_taint(TAINT_MACHINE_CHECK); 261 } 262 263 /* 264 * Clear state for this bank. 265 */ 266 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 267 } 268 269 /* 270 * Don't clear MCG_STATUS here because it's only defined for 271 * exceptions. 272 */ 273} 274 275/* 276 * The actual machine check handler. This only handles real 277 * exceptions when something got corrupted coming in through int 18. 278 * 279 * This is executed in NMI context not subject to normal locking rules. This 280 * implies that most kernel services cannot be safely used. Don't even 281 * think about putting a printk in there! 282 */ 283void do_machine_check(struct pt_regs *regs, long error_code) 284{ 285 struct mce m, panicm; 286 int panicm_found = 0; 287 u64 mcestart = 0; 288 int i; 289 /* 290 * If no_way_out gets set, there is no safe way to recover from this 291 * MCE. 
If tolerant is cranked up, we'll try anyway. 292 */ 293 int no_way_out = 0; 294 /* 295 * If kill_it gets set, there might be a way to recover from this 296 * error. 297 */ 298 int kill_it = 0; 299 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 300 301 atomic_inc(&mce_entry); 302 303 if (notify_die(DIE_NMI, "machine check", regs, error_code, 304 18, SIGKILL) == NOTIFY_STOP) 305 goto out2; 306 if (!banks) 307 goto out2; 308 309 mce_setup(&m); 310 311 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 312 313 /* if the restart IP is not valid, we're done for */ 314 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 315 no_way_out = 1; 316 317 rdtscll(mcestart); 318 barrier(); 319 320 for (i = 0; i < banks; i++) { 321 __clear_bit(i, toclear); 322 if (!bank[i]) 323 continue; 324 325 m.misc = 0; 326 m.addr = 0; 327 m.bank = i; 328 329 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 330 if ((m.status & MCI_STATUS_VAL) == 0) 331 continue; 332 333 /* 334 * Non uncorrected errors are handled by machine_check_poll 335 * Leave them alone. 336 */ 337 if ((m.status & MCI_STATUS_UC) == 0) 338 continue; 339 340 /* 341 * Set taint even when machine check was not enabled. 342 */ 343 add_taint(TAINT_MACHINE_CHECK); 344 345 __set_bit(i, toclear); 346 347 if (m.status & MCI_STATUS_EN) { 348 /* if PCC was set, there's no way out */ 349 no_way_out |= !!(m.status & MCI_STATUS_PCC); 350 /* 351 * If this error was uncorrectable and there was 352 * an overflow, we're in trouble. If no overflow, 353 * we might get away with just killing a task. 354 */ 355 if (m.status & MCI_STATUS_UC) { 356 if (tolerant < 1 || m.status & MCI_STATUS_OVER) 357 no_way_out = 1; 358 kill_it = 1; 359 } 360 } else { 361 /* 362 * Machine check event was not enabled. Clear, but 363 * ignore. 
364 */ 365 continue; 366 } 367 368 if (m.status & MCI_STATUS_MISCV) 369 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); 370 if (m.status & MCI_STATUS_ADDRV) 371 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 372 373 mce_get_rip(&m, regs); 374 mce_log(&m); 375 376 /* 377 * Did this bank cause the exception? 378 * 379 * Assume that the bank with uncorrectable errors did it, 380 * and that there is only a single one: 381 */ 382 if ((m.status & MCI_STATUS_UC) && 383 (m.status & MCI_STATUS_EN)) { 384 panicm = m; 385 panicm_found = 1; 386 } 387 } 388 389 /* 390 * If we didn't find an uncorrectable error, pick 391 * the last one (shouldn't happen, just being safe). 392 */ 393 if (!panicm_found) 394 panicm = m; 395 396 /* 397 * If we have decided that we just CAN'T continue, and the user 398 * has not set tolerant to an insane level, give up and die. 399 */ 400 if (no_way_out && tolerant < 3) 401 mce_panic("Machine check", &panicm, mcestart); 402 403 /* 404 * If the error seems to be unrecoverable, something should be 405 * done. Try to kill as little as possible. If we can kill just 406 * one task, do that. If the user has set the tolerance very 407 * high, don't try to do anything at all. 408 */ 409 if (kill_it && tolerant < 3) { 410 int user_space = 0; 411 412 /* 413 * If the EIPV bit is set, it means the saved IP is the 414 * instruction which caused the MCE. 415 */ 416 if (m.mcgstatus & MCG_STATUS_EIPV) 417 user_space = panicm.ip && (panicm.cs & 3); 418 419 /* 420 * If we know that the error was in user space, send a 421 * SIGBUS. Otherwise, panic if tolerance is low. 422 * 423 * force_sig() takes an awful lot of locks and has a slight 424 * risk of deadlocking. 
425 */ 426 if (user_space) { 427 force_sig(SIGBUS, current); 428 } else if (panic_on_oops || tolerant < 2) { 429 mce_panic("Uncorrected machine check", 430 &panicm, mcestart); 431 } 432 } 433 434 /* notify userspace ASAP */ 435 set_thread_flag(TIF_MCE_NOTIFY); 436 437 /* the last thing we do is clear state */ 438 for (i = 0; i < banks; i++) { 439 if (test_bit(i, toclear)) 440 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 441 } 442 wrmsrl(MSR_IA32_MCG_STATUS, 0); 443 out2: 444 atomic_dec(&mce_entry); 445} 446 447#ifdef CONFIG_X86_MCE_INTEL 448/*** 449 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 450 * @cpu: The CPU on which the event occurred. 451 * @status: Event status information 452 * 453 * This function should be called by the thermal interrupt after the 454 * event has been processed and the decision was made to log the event 455 * further. 456 * 457 * The status parameter will be saved to the 'status' field of 'struct mce' 458 * and historically has been the register value of the 459 * MSR_IA32_THERMAL_STATUS (Intel) msr. 460 */ 461void mce_log_therm_throt_event(__u64 status) 462{ 463 struct mce m; 464 465 mce_setup(&m); 466 m.bank = MCE_THERMAL_BANK; 467 m.status = status; 468 mce_log(&m); 469} 470#endif /* CONFIG_X86_MCE_INTEL */ 471 472/* 473 * Periodic polling timer for "silent" machine check errors. If the 474 * poller finds an MCE, poll 2x faster. When the poller finds no more 475 * errors, poll 2x slower (up to check_interval seconds). 
476 */ 477static int check_interval = 5 * 60; /* 5 minutes */ 478 479static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 480static DEFINE_PER_CPU(struct timer_list, mce_timer); 481 482static void mcheck_timer(unsigned long data) 483{ 484 struct timer_list *t = &per_cpu(mce_timer, data); 485 int *n; 486 487 WARN_ON(smp_processor_id() != data); 488 489 if (mce_available(¤t_cpu_data)) { 490 machine_check_poll(MCP_TIMESTAMP, 491 &__get_cpu_var(mce_poll_banks)); 492 } 493 494 /* 495 * Alert userspace if needed. If we logged an MCE, reduce the 496 * polling interval, otherwise increase the polling interval. 497 */ 498 n = &__get_cpu_var(next_interval); 499 if (mce_notify_user()) { 500 *n = max(*n/2, HZ/100); 501 } else { 502 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 503 } 504 505 t->expires = jiffies + *n; 506 add_timer(t); 507} 508 509static void mce_do_trigger(struct work_struct *work) 510{ 511 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 512} 513 514static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 515 516/* 517 * Notify the user(s) about new machine check events. 518 * Can be called from interrupt context, but not from machine check/NMI 519 * context. 520 */ 521int mce_notify_user(void) 522{ 523 /* Not more than two messages every minute */ 524 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 525 526 clear_thread_flag(TIF_MCE_NOTIFY); 527 528 if (test_and_clear_bit(0, ¬ify_user)) { 529 wake_up_interruptible(&mce_wait); 530 531 /* 532 * There is no risk of missing notifications because 533 * work_pending is always cleared before the function is 534 * executed. 
535 */ 536 if (trigger[0] && !work_pending(&mce_trigger_work)) 537 schedule_work(&mce_trigger_work); 538 539 if (__ratelimit(&ratelimit)) 540 printk(KERN_INFO "Machine check events logged\n"); 541 542 return 1; 543 } 544 return 0; 545} 546 547/* see if the idle task needs to notify userspace: */ 548static int 549mce_idle_callback(struct notifier_block *nfb, unsigned long action, 550 void *unused) 551{ 552 /* IDLE_END should be safe - interrupts are back on */ 553 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) 554 mce_notify_user(); 555 556 return NOTIFY_OK; 557} 558 559static struct notifier_block mce_idle_notifier = { 560 .notifier_call = mce_idle_callback, 561}; 562 563static __init int periodic_mcheck_init(void) 564{ 565 idle_notifier_register(&mce_idle_notifier); 566 return 0; 567} 568__initcall(periodic_mcheck_init); 569 570/* 571 * Initialize Machine Checks for a CPU. 572 */ 573static int mce_cap_init(void) 574{ 575 unsigned b; 576 u64 cap; 577 578 rdmsrl(MSR_IA32_MCG_CAP, cap); 579 580 b = cap & MCG_BANKCNT_MASK; 581 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 582 583 if (b > MAX_NR_BANKS) { 584 printk(KERN_WARNING 585 "MCE: Using only %u machine check banks out of %u\n", 586 MAX_NR_BANKS, b); 587 b = MAX_NR_BANKS; 588 } 589 590 /* Don't support asymmetric configurations today */ 591 WARN_ON(banks != 0 && b != banks); 592 banks = b; 593 if (!bank) { 594 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 595 if (!bank) 596 return -ENOMEM; 597 memset(bank, 0xff, banks * sizeof(u64)); 598 } 599 600 /* Use accurate RIP reporting if available. */ 601 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 602 rip_msr = MSR_IA32_MCG_EIP; 603 604 return 0; 605} 606 607static void mce_init(void *dummy) 608{ 609 mce_banks_t all_banks; 610 u64 cap; 611 int i; 612 613 /* 614 * Log the machine checks left over from the previous reset. 615 */ 616 bitmap_fill(all_banks, MAX_NR_BANKS); 617 machine_check_poll(MCP_UC|(!mce_bootlog ? 
MCP_DONTLOG : 0), &all_banks); 618 619 set_in_cr4(X86_CR4_MCE); 620 621 rdmsrl(MSR_IA32_MCG_CAP, cap); 622 if (cap & MCG_CTL_P) 623 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 624 625 for (i = 0; i < banks; i++) { 626 if (skip_bank_init(i)) 627 continue; 628 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 629 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 630 } 631} 632 633/* Add per CPU specific workarounds here */ 634static void mce_cpu_quirks(struct cpuinfo_x86 *c) 635{ 636 /* This should be disabled by the BIOS, but isn't always */ 637 if (c->x86_vendor == X86_VENDOR_AMD) { 638 if (c->x86 == 15 && banks > 4) { 639 /* 640 * disable GART TBL walk error reporting, which 641 * trips off incorrectly with the IOMMU & 3ware 642 * & Cerberus: 643 */ 644 clear_bit(10, (unsigned long *)&bank[4]); 645 } 646 if (c->x86 <= 17 && mce_bootlog < 0) { 647 /* 648 * Lots of broken BIOS around that don't clear them 649 * by default and leave crap in there. Don't log: 650 */ 651 mce_bootlog = 0; 652 } 653 /* 654 * Various K7s with broken bank 0 around. Always disable 655 * by default. 656 */ 657 if (c->x86 == 6) 658 bank[0] = 0; 659 } 660 661 if (c->x86_vendor == X86_VENDOR_INTEL) { 662 /* 663 * SDM documents that on family 6 bank 0 should not be written 664 * because it aliases to another special BIOS controlled 665 * register. 666 * But it's not aliased anymore on model 0x1a+ 667 * Don't ignore bank 0 completely because there could be a 668 * valid event later, merely don't write CTL0. 
669 */ 670 671 if (c->x86 == 6 && c->x86_model < 0x1A) 672 __set_bit(0, &dont_init_banks); 673 } 674} 675 676static void mce_cpu_features(struct cpuinfo_x86 *c) 677{ 678 switch (c->x86_vendor) { 679 case X86_VENDOR_INTEL: 680 mce_intel_feature_init(c); 681 break; 682 case X86_VENDOR_AMD: 683 mce_amd_feature_init(c); 684 break; 685 default: 686 break; 687 } 688} 689 690static void mce_init_timer(void) 691{ 692 struct timer_list *t = &__get_cpu_var(mce_timer); 693 int *n = &__get_cpu_var(next_interval); 694 695 *n = check_interval * HZ; 696 if (!*n) 697 return; 698 setup_timer(t, mcheck_timer, smp_processor_id()); 699 t->expires = round_jiffies(jiffies + *n); 700 add_timer(t); 701} 702 703/* 704 * Called for each booted CPU to set up machine checks. 705 * Must be called with preempt off: 706 */ 707void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 708{ 709 if (!mce_available(c)) 710 return; 711 712 if (mce_cap_init() < 0) { 713 mce_dont_init = 1; 714 return; 715 } 716 mce_cpu_quirks(c); 717 718 mce_init(NULL); 719 mce_cpu_features(c); 720 mce_init_timer(); 721} 722 723/* 724 * Character device to read and clear the MCE log. 725 */ 726 727static DEFINE_SPINLOCK(mce_state_lock); 728static int open_count; /* #times opened */ 729static int open_exclu; /* already open exclusive? 
*/ 730 731static int mce_open(struct inode *inode, struct file *file) 732{ 733 lock_kernel(); 734 spin_lock(&mce_state_lock); 735 736 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 737 spin_unlock(&mce_state_lock); 738 unlock_kernel(); 739 740 return -EBUSY; 741 } 742 743 if (file->f_flags & O_EXCL) 744 open_exclu = 1; 745 open_count++; 746 747 spin_unlock(&mce_state_lock); 748 unlock_kernel(); 749 750 return nonseekable_open(inode, file); 751} 752 753static int mce_release(struct inode *inode, struct file *file) 754{ 755 spin_lock(&mce_state_lock); 756 757 open_count--; 758 open_exclu = 0; 759 760 spin_unlock(&mce_state_lock); 761 762 return 0; 763} 764 765static void collect_tscs(void *data) 766{ 767 unsigned long *cpu_tsc = (unsigned long *)data; 768 769 rdtscll(cpu_tsc[smp_processor_id()]); 770} 771 772static DEFINE_MUTEX(mce_read_mutex); 773 774static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 775 loff_t *off) 776{ 777 char __user *buf = ubuf; 778 unsigned long *cpu_tsc; 779 unsigned prev, next; 780 int i, err; 781 782 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 783 if (!cpu_tsc) 784 return -ENOMEM; 785 786 mutex_lock(&mce_read_mutex); 787 next = rcu_dereference(mcelog.next); 788 789 /* Only supports full reads right now */ 790 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 791 mutex_unlock(&mce_read_mutex); 792 kfree(cpu_tsc); 793 794 return -EINVAL; 795 } 796 797 err = 0; 798 prev = 0; 799 do { 800 for (i = prev; i < next; i++) { 801 unsigned long start = jiffies; 802 803 while (!mcelog.entry[i].finished) { 804 if (time_after_eq(jiffies, start + 2)) { 805 memset(mcelog.entry + i, 0, 806 sizeof(struct mce)); 807 goto timeout; 808 } 809 cpu_relax(); 810 } 811 smp_rmb(); 812 err |= copy_to_user(buf, mcelog.entry + i, 813 sizeof(struct mce)); 814 buf += sizeof(struct mce); 815timeout: 816 ; 817 } 818 819 memset(mcelog.entry + prev, 0, 820 (next - prev) * sizeof(struct mce)); 821 prev = next; 822 
next = cmpxchg(&mcelog.next, prev, 0); 823 } while (next != prev); 824 825 synchronize_sched(); 826 827 /* 828 * Collect entries that were still getting written before the 829 * synchronize. 830 */ 831 on_each_cpu(collect_tscs, cpu_tsc, 1); 832 833 for (i = next; i < MCE_LOG_LEN; i++) { 834 if (mcelog.entry[i].finished && 835 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 836 err |= copy_to_user(buf, mcelog.entry+i, 837 sizeof(struct mce)); 838 smp_rmb(); 839 buf += sizeof(struct mce); 840 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 841 } 842 } 843 mutex_unlock(&mce_read_mutex); 844 kfree(cpu_tsc); 845 846 return err ? -EFAULT : buf - ubuf; 847} 848 849static unsigned int mce_poll(struct file *file, poll_table *wait) 850{ 851 poll_wait(file, &mce_wait, wait); 852 if (rcu_dereference(mcelog.next)) 853 return POLLIN | POLLRDNORM; 854 return 0; 855} 856 857static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 858{ 859 int __user *p = (int __user *)arg; 860 861 if (!capable(CAP_SYS_ADMIN)) 862 return -EPERM; 863 864 switch (cmd) { 865 case MCE_GET_RECORD_LEN: 866 return put_user(sizeof(struct mce), p); 867 case MCE_GET_LOG_LEN: 868 return put_user(MCE_LOG_LEN, p); 869 case MCE_GETCLEAR_FLAGS: { 870 unsigned flags; 871 872 do { 873 flags = mcelog.flags; 874 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 875 876 return put_user(flags, p); 877 } 878 default: 879 return -ENOTTY; 880 } 881} 882 883static const struct file_operations mce_chrdev_ops = { 884 .open = mce_open, 885 .release = mce_release, 886 .read = mce_read, 887 .poll = mce_poll, 888 .unlocked_ioctl = mce_ioctl, 889}; 890 891static struct miscdevice mce_log_device = { 892 MISC_MCELOG_MINOR, 893 "mcelog", 894 &mce_chrdev_ops, 895}; 896 897/* 898 * Old style boot options parsing. Only for compatibility. 
899 */ 900static int __init mcheck_disable(char *str) 901{ 902 mce_dont_init = 1; 903 return 1; 904} 905__setup("nomce", mcheck_disable); 906 907/* 908 * mce=off disables machine check 909 * mce=TOLERANCELEVEL (number, see above) 910 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 911 * mce=nobootlog Don't log MCEs from before booting. 912 */ 913static int __init mcheck_enable(char *str) 914{ 915 if (!strcmp(str, "off")) 916 mce_dont_init = 1; 917 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 918 mce_bootlog = (str[0] == 'b'); 919 else if (isdigit(str[0])) 920 get_option(&str, &tolerant); 921 else { 922 printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", 923 str); 924 return 0; 925 } 926 return 1; 927} 928__setup("mce=", mcheck_enable); 929 930/* 931 * Sysfs support 932 */ 933 934/* 935 * Disable machine checks on suspend and shutdown. We can't really handle 936 * them later. 937 */ 938static int mce_disable(void) 939{ 940 int i; 941 942 for (i = 0; i < banks; i++) { 943 if (!skip_bank_init(i)) 944 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 945 } 946 return 0; 947} 948 949static int mce_suspend(struct sys_device *dev, pm_message_t state) 950{ 951 return mce_disable(); 952} 953 954static int mce_shutdown(struct sys_device *dev) 955{ 956 return mce_disable(); 957} 958 959/* 960 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
961 * Only one CPU is active at this time, the others get re-added later using 962 * CPU hotplug: 963 */ 964static int mce_resume(struct sys_device *dev) 965{ 966 mce_init(NULL); 967 mce_cpu_features(¤t_cpu_data); 968 969 return 0; 970} 971 972static void mce_cpu_restart(void *data) 973{ 974 del_timer_sync(&__get_cpu_var(mce_timer)); 975 if (mce_available(¤t_cpu_data)) 976 mce_init(NULL); 977 mce_init_timer(); 978} 979 980/* Reinit MCEs after user configuration changes */ 981static void mce_restart(void) 982{ 983 on_each_cpu(mce_cpu_restart, NULL, 1); 984} 985 986static struct sysdev_class mce_sysclass = { 987 .suspend = mce_suspend, 988 .shutdown = mce_shutdown, 989 .resume = mce_resume, 990 .name = "machinecheck", 991}; 992 993DEFINE_PER_CPU(struct sys_device, mce_dev); 994 995__cpuinitdata 996void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 997 998/* Why are there no generic functions for this? */ 999#define ACCESSOR(name, var, start) \ 1000 static ssize_t show_ ## name(struct sys_device *s, \ 1001 struct sysdev_attribute *attr, \ 1002 char *buf) { \ 1003 return sprintf(buf, "%Lx\n", (u64)var); \ 1004 } \ 1005 static ssize_t set_ ## name(struct sys_device *s, \ 1006 struct sysdev_attribute *attr, \ 1007 const char *buf, size_t siz) { \ 1008 char *end; \ 1009 u64 new = simple_strtoull(buf, &end, 0); \ 1010 \ 1011 if (end == buf) \ 1012 return -EINVAL; \ 1013 var = new; \ 1014 start; \ 1015 \ 1016 return end-buf; \ 1017 } \ 1018 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 1019 1020static struct sysdev_attribute *bank_attrs; 1021 1022static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1023 char *buf) 1024{ 1025 u64 b = bank[attr - bank_attrs]; 1026 1027 return sprintf(buf, "%llx\n", b); 1028} 1029 1030static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1031 const char *buf, size_t siz) 1032{ 1033 char *end; 1034 u64 new = simple_strtoull(buf, &end, 0); 1035 1036 if (end == 
buf) 1037 return -EINVAL; 1038 1039 bank[attr - bank_attrs] = new; 1040 mce_restart(); 1041 1042 return end-buf; 1043} 1044 1045static ssize_t 1046show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1047{ 1048 strcpy(buf, trigger); 1049 strcat(buf, "\n"); 1050 return strlen(trigger) + 1; 1051} 1052 1053static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1054 const char *buf, size_t siz) 1055{ 1056 char *p; 1057 int len; 1058 1059 strncpy(trigger, buf, sizeof(trigger)); 1060 trigger[sizeof(trigger)-1] = 0; 1061 len = strlen(trigger); 1062 p = strchr(trigger, '\n'); 1063 1064 if (*p) 1065 *p = 0; 1066 1067 return len; 1068} 1069 1070static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1071static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1072 1073ACCESSOR(check_interval, check_interval, mce_restart()) 1074 1075static struct sysdev_attribute *mce_attrs[] = { 1076 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 1077 NULL 1078}; 1079 1080static cpumask_var_t mce_dev_initialized; 1081 1082/* Per cpu sysdev init. 
All of the cpus still share the same ctrl bank: */ 1083static __cpuinit int mce_create_device(unsigned int cpu) 1084{ 1085 int err; 1086 int i; 1087 1088 if (!mce_available(&boot_cpu_data)) 1089 return -EIO; 1090 1091 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1092 per_cpu(mce_dev, cpu).id = cpu; 1093 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1094 1095 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1096 if (err) 1097 return err; 1098 1099 for (i = 0; mce_attrs[i]; i++) { 1100 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1101 if (err) 1102 goto error; 1103 } 1104 for (i = 0; i < banks; i++) { 1105 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1106 &bank_attrs[i]); 1107 if (err) 1108 goto error2; 1109 } 1110 cpumask_set_cpu(cpu, mce_dev_initialized); 1111 1112 return 0; 1113error2: 1114 while (--i >= 0) 1115 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1116error: 1117 while (--i >= 0) 1118 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1119 1120 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1121 1122 return err; 1123} 1124 1125static __cpuinit void mce_remove_device(unsigned int cpu) 1126{ 1127 int i; 1128 1129 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1130 return; 1131 1132 for (i = 0; mce_attrs[i]; i++) 1133 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1134 1135 for (i = 0; i < banks; i++) 1136 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1137 1138 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1139 cpumask_clear_cpu(cpu, mce_dev_initialized); 1140} 1141 1142/* Make sure there are no machine checks on offlined CPUs. 
*/ 1143static void mce_disable_cpu(void *h) 1144{ 1145 unsigned long action = *(unsigned long *)h; 1146 int i; 1147 1148 if (!mce_available(¤t_cpu_data)) 1149 return; 1150 if (!(action & CPU_TASKS_FROZEN)) 1151 cmci_clear(); 1152 for (i = 0; i < banks; i++) { 1153 if (!skip_bank_init(i)) 1154 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1155 } 1156} 1157 1158static void mce_reenable_cpu(void *h) 1159{ 1160 unsigned long action = *(unsigned long *)h; 1161 int i; 1162 1163 if (!mce_available(¤t_cpu_data)) 1164 return; 1165 1166 if (!(action & CPU_TASKS_FROZEN)) 1167 cmci_reenable(); 1168 for (i = 0; i < banks; i++) { 1169 if (!skip_bank_init(i)) 1170 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1171 } 1172} 1173 1174/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1175static int __cpuinit 1176mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1177{ 1178 unsigned int cpu = (unsigned long)hcpu; 1179 struct timer_list *t = &per_cpu(mce_timer, cpu); 1180 1181 switch (action) { 1182 case CPU_ONLINE: 1183 case CPU_ONLINE_FROZEN: 1184 mce_create_device(cpu); 1185 if (threshold_cpu_callback) 1186 threshold_cpu_callback(action, cpu); 1187 break; 1188 case CPU_DEAD: 1189 case CPU_DEAD_FROZEN: 1190 if (threshold_cpu_callback) 1191 threshold_cpu_callback(action, cpu); 1192 mce_remove_device(cpu); 1193 break; 1194 case CPU_DOWN_PREPARE: 1195 case CPU_DOWN_PREPARE_FROZEN: 1196 del_timer_sync(t); 1197 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1198 break; 1199 case CPU_DOWN_FAILED: 1200 case CPU_DOWN_FAILED_FROZEN: 1201 t->expires = round_jiffies(jiffies + 1202 __get_cpu_var(next_interval)); 1203 add_timer_on(t, cpu); 1204 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1205 break; 1206 case CPU_POST_DEAD: 1207 /* intentionally ignoring frozen here */ 1208 cmci_rediscover(cpu); 1209 break; 1210 } 1211 return NOTIFY_OK; 1212} 1213 1214static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1215 .notifier_call = 
mce_cpu_callback, 1216}; 1217 1218static __init int mce_init_banks(void) 1219{ 1220 int i; 1221 1222 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1223 GFP_KERNEL); 1224 if (!bank_attrs) 1225 return -ENOMEM; 1226 1227 for (i = 0; i < banks; i++) { 1228 struct sysdev_attribute *a = &bank_attrs[i]; 1229 1230 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1231 if (!a->attr.name) 1232 goto nomem; 1233 1234 a->attr.mode = 0644; 1235 a->show = show_bank; 1236 a->store = set_bank; 1237 } 1238 return 0; 1239 1240nomem: 1241 while (--i >= 0) 1242 kfree(bank_attrs[i].attr.name); 1243 kfree(bank_attrs); 1244 bank_attrs = NULL; 1245 1246 return -ENOMEM; 1247} 1248 1249static __init int mce_init_device(void) 1250{ 1251 int err; 1252 int i = 0; 1253 1254 if (!mce_available(&boot_cpu_data)) 1255 return -EIO; 1256 1257 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1258 1259 err = mce_init_banks(); 1260 if (err) 1261 return err; 1262 1263 err = sysdev_class_register(&mce_sysclass); 1264 if (err) 1265 return err; 1266 1267 for_each_online_cpu(i) { 1268 err = mce_create_device(i); 1269 if (err) 1270 return err; 1271 } 1272 1273 register_hotcpu_notifier(&mce_cpu_notifier); 1274 misc_register(&mce_log_device); 1275 1276 return err; 1277} 1278 1279device_initcall(mce_init_device); 1280 1281#else /* CONFIG_X86_32: */ 1282 1283int mce_disabled; 1284 1285int nr_mce_banks; 1286EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1287 1288/* Handle unconfigured int18 (should never happen) */ 1289static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1290{ 1291 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1292 smp_processor_id()); 1293} 1294 1295/* Call the installed machine check handler for this CPU setup. 
*/ 1296void (*machine_check_vector)(struct pt_regs *, long error_code) = 1297 unexpected_machine_check; 1298 1299/* This has to be run for each processor */ 1300void mcheck_init(struct cpuinfo_x86 *c) 1301{ 1302 if (mce_disabled == 1) 1303 return; 1304 1305 switch (c->x86_vendor) { 1306 case X86_VENDOR_AMD: 1307 amd_mcheck_init(c); 1308 break; 1309 1310 case X86_VENDOR_INTEL: 1311 if (c->x86 == 5) 1312 intel_p5_mcheck_init(c); 1313 if (c->x86 == 6) 1314 intel_p6_mcheck_init(c); 1315 if (c->x86 == 15) 1316 intel_p4_mcheck_init(c); 1317 break; 1318 1319 case X86_VENDOR_CENTAUR: 1320 if (c->x86 == 5) 1321 winchip_mcheck_init(c); 1322 break; 1323 1324 default: 1325 break; 1326 } 1327 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1328} 1329 1330static int __init mcheck_disable(char *str) 1331{ 1332 mce_disabled = 1; 1333 return 1; 1334} 1335 1336static int __init mcheck_enable(char *str) 1337{ 1338 mce_disabled = -1; 1339 return 1; 1340} 1341 1342__setup("nomce", mcheck_disable); 1343__setup("mce", mcheck_enable); 1344 1345#endif /* CONFIG_X86_32 */ 1346