mce.c revision 8e97aef5f43ec715f394bc15015ff263b80c3ad6
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/smp_lock.h> 17#include <linux/kobject.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/sysdev.h> 23#include <linux/ctype.h> 24#include <linux/sched.h> 25#include <linux/sysfs.h> 26#include <linux/types.h> 27#include <linux/init.h> 28#include <linux/kmod.h> 29#include <linux/poll.h> 30#include <linux/cpu.h> 31#include <linux/fs.h> 32 33#include <asm/processor.h> 34#include <asm/uaccess.h> 35#include <asm/idle.h> 36#include <asm/mce.h> 37#include <asm/msr.h> 38#include <asm/smp.h> 39 40#include "mce.h" 41 42/* Handle unconfigured int18 (should never happen) */ 43static void unexpected_machine_check(struct pt_regs *regs, long error_code) 44{ 45 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 46 smp_processor_id()); 47} 48 49/* Call the installed machine check handler for this CPU setup. */ 50void (*machine_check_vector)(struct pt_regs *, long error_code) = 51 unexpected_machine_check; 52 53int mce_disabled; 54 55#ifdef CONFIG_X86_64 56 57#define MISC_MCELOG_MINOR 227 58 59atomic_t mce_entry; 60 61/* 62 * Tolerant levels: 63 * 0: always panic on uncorrected errors, log corrected errors 64 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 65 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 66 * 3: never panic or SIGBUS, log all errors (for testing only) 67 */ 68static int tolerant = 1; 69static int banks; 70static u64 *bank; 71static unsigned long notify_user; 72static int rip_msr; 73static int mce_bootlog = -1; 74static atomic_t mce_events; 75 76static char trigger[128]; 77static char *trigger_argv[2] = { trigger, NULL }; 78 79static unsigned long dont_init_banks; 80 81static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 82 83/* MCA banks polled by the period polling timer for corrected events */ 84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 85 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 86}; 87 88static inline int skip_bank_init(int i) 89{ 90 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 91} 92 93/* Do initial initialization of a struct mce */ 94void mce_setup(struct mce *m) 95{ 96 memset(m, 0, sizeof(struct mce)); 97 m->cpu = smp_processor_id(); 98 rdtscll(m->tsc); 99} 100 101/* 102 * Lockless MCE logging infrastructure. 103 * This avoids deadlocks on printk locks without having to break locks. Also 104 * separate MCEs from kernel messages to avoid bogus bug reports. 105 */ 106 107static struct mce_log mcelog = { 108 MCE_LOG_SIGNATURE, 109 MCE_LOG_LEN, 110}; 111 112void mce_log(struct mce *mce) 113{ 114 unsigned next, entry; 115 116 atomic_inc(&mce_events); 117 mce->finished = 0; 118 wmb(); 119 for (;;) { 120 entry = rcu_dereference(mcelog.next); 121 for (;;) { 122 /* 123 * When the buffer fills up discard new entries. 124 * Assume that the earlier errors are the more 125 * interesting ones: 126 */ 127 if (entry >= MCE_LOG_LEN) { 128 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); 129 return; 130 } 131 /* Old left over entry. Skip: */ 132 if (mcelog.entry[entry].finished) { 133 entry++; 134 continue; 135 } 136 break; 137 } 138 smp_rmb(); 139 next = entry + 1; 140 if (cmpxchg(&mcelog.next, entry, next) == entry) 141 break; 142 } 143 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 144 wmb(); 145 mcelog.entry[entry].finished = 1; 146 wmb(); 147 148 set_bit(0, ¬ify_user); 149} 150 151static void print_mce(struct mce *m) 152{ 153 printk(KERN_EMERG "\n" 154 KERN_EMERG "HARDWARE ERROR\n" 155 KERN_EMERG 156 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 157 m->cpu, m->mcgstatus, m->bank, m->status); 158 if (m->ip) { 159 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 160 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 161 m->cs, m->ip); 162 if (m->cs == __KERNEL_CS) 163 print_symbol("{%s}", m->ip); 164 printk("\n"); 165 } 166 printk(KERN_EMERG "TSC %llx ", m->tsc); 167 if (m->addr) 168 printk("ADDR %llx ", m->addr); 169 if (m->misc) 170 printk("MISC %llx ", m->misc); 171 printk("\n"); 172 printk(KERN_EMERG "This is not a software problem!\n"); 173 printk(KERN_EMERG "Run through mcelog --ascii to decode " 174 "and contact your hardware vendor\n"); 175} 176 177static void mce_panic(char *msg, struct mce *backup, u64 start) 178{ 179 int i; 180 181 oops_begin(); 182 for (i = 0; i < MCE_LOG_LEN; i++) { 183 u64 tsc = mcelog.entry[i].tsc; 184 185 if ((s64)(tsc - start) < 0) 186 continue; 187 print_mce(&mcelog.entry[i]); 188 if (backup && mcelog.entry[i].tsc == backup->tsc) 189 backup = NULL; 190 } 191 if (backup) 192 print_mce(backup); 193 panic(msg); 194} 195 196int mce_available(struct cpuinfo_x86 *c) 197{ 198 if (mce_disabled) 199 return 0; 200 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 201} 202 203static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 204{ 205 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 206 m->ip = regs->ip; 207 m->cs = regs->cs; 208 } else { 209 m->ip = 0; 210 m->cs = 0; 211 } 212 if (rip_msr) { 213 /* Assume the RIP in the MSR is exact. Is this true? */ 214 m->mcgstatus |= MCG_STATUS_EIPV; 215 rdmsrl(rip_msr, m->ip); 216 m->cs = 0; 217 } 218} 219 220/* 221 * Poll for corrected events or events that happened before reset. 222 * Those are just logged through /dev/mcelog. 223 * 224 * This is executed in standard interrupt context. 225 */ 226void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 227{ 228 struct mce m; 229 int i; 230 231 mce_setup(&m); 232 233 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 234 for (i = 0; i < banks; i++) { 235 if (!bank[i] || !test_bit(i, *b)) 236 continue; 237 238 m.misc = 0; 239 m.addr = 0; 240 m.bank = i; 241 m.tsc = 0; 242 243 barrier(); 244 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 245 if (!(m.status & MCI_STATUS_VAL)) 246 continue; 247 248 /* 249 * Uncorrected events are handled by the exception handler 250 * when it is enabled. But when the exception is disabled log 251 * everything. 252 * 253 * TBD do the same check for MCI_STATUS_EN here? 254 */ 255 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) 256 continue; 257 258 if (m.status & MCI_STATUS_MISCV) 259 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); 260 if (m.status & MCI_STATUS_ADDRV) 261 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 262 263 if (!(flags & MCP_TIMESTAMP)) 264 m.tsc = 0; 265 /* 266 * Don't get the IP here because it's unlikely to 267 * have anything to do with the actual error location. 268 */ 269 if (!(flags & MCP_DONTLOG)) { 270 mce_log(&m); 271 add_taint(TAINT_MACHINE_CHECK); 272 } 273 274 /* 275 * Clear state for this bank. 276 */ 277 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 278 } 279 280 /* 281 * Don't clear MCG_STATUS here because it's only defined for 282 * exceptions. 283 */ 284} 285 286/* 287 * The actual machine check handler. This only handles real 288 * exceptions when something got corrupted coming in through int 18. 289 * 290 * This is executed in NMI context not subject to normal locking rules. This 291 * implies that most kernel services cannot be safely used. Don't even 292 * think about putting a printk in there! 293 */ 294void do_machine_check(struct pt_regs *regs, long error_code) 295{ 296 struct mce m, panicm; 297 int panicm_found = 0; 298 u64 mcestart = 0; 299 int i; 300 /* 301 * If no_way_out gets set, there is no safe way to recover from this 302 * MCE. If tolerant is cranked up, we'll try anyway. 303 */ 304 int no_way_out = 0; 305 /* 306 * If kill_it gets set, there might be a way to recover from this 307 * error. 308 */ 309 int kill_it = 0; 310 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 311 312 atomic_inc(&mce_entry); 313 314 if (notify_die(DIE_NMI, "machine check", regs, error_code, 315 18, SIGKILL) == NOTIFY_STOP) 316 goto out2; 317 if (!banks) 318 goto out2; 319 320 mce_setup(&m); 321 322 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 323 324 /* if the restart IP is not valid, we're done for */ 325 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 326 no_way_out = 1; 327 328 rdtscll(mcestart); 329 barrier(); 330 331 for (i = 0; i < banks; i++) { 332 __clear_bit(i, toclear); 333 if (!bank[i]) 334 continue; 335 336 m.misc = 0; 337 m.addr = 0; 338 m.bank = i; 339 340 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 341 if ((m.status & MCI_STATUS_VAL) == 0) 342 continue; 343 344 /* 345 * Non uncorrected errors are handled by machine_check_poll 346 * Leave them alone. 347 */ 348 if ((m.status & MCI_STATUS_UC) == 0) 349 continue; 350 351 /* 352 * Set taint even when machine check was not enabled. 353 */ 354 add_taint(TAINT_MACHINE_CHECK); 355 356 __set_bit(i, toclear); 357 358 if (m.status & MCI_STATUS_EN) { 359 /* if PCC was set, there's no way out */ 360 no_way_out |= !!(m.status & MCI_STATUS_PCC); 361 /* 362 * If this error was uncorrectable and there was 363 * an overflow, we're in trouble. If no overflow, 364 * we might get away with just killing a task. 365 */ 366 if (m.status & MCI_STATUS_UC) { 367 if (tolerant < 1 || m.status & MCI_STATUS_OVER) 368 no_way_out = 1; 369 kill_it = 1; 370 } 371 } else { 372 /* 373 * Machine check event was not enabled. Clear, but 374 * ignore. 375 */ 376 continue; 377 } 378 379 if (m.status & MCI_STATUS_MISCV) 380 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); 381 if (m.status & MCI_STATUS_ADDRV) 382 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 383 384 mce_get_rip(&m, regs); 385 mce_log(&m); 386 387 /* 388 * Did this bank cause the exception? 389 * 390 * Assume that the bank with uncorrectable errors did it, 391 * and that there is only a single one: 392 */ 393 if ((m.status & MCI_STATUS_UC) && 394 (m.status & MCI_STATUS_EN)) { 395 panicm = m; 396 panicm_found = 1; 397 } 398 } 399 400 /* 401 * If we didn't find an uncorrectable error, pick 402 * the last one (shouldn't happen, just being safe). 403 */ 404 if (!panicm_found) 405 panicm = m; 406 407 /* 408 * If we have decided that we just CAN'T continue, and the user 409 * has not set tolerant to an insane level, give up and die. 410 */ 411 if (no_way_out && tolerant < 3) 412 mce_panic("Machine check", &panicm, mcestart); 413 414 /* 415 * If the error seems to be unrecoverable, something should be 416 * done. Try to kill as little as possible. If we can kill just 417 * one task, do that. If the user has set the tolerance very 418 * high, don't try to do anything at all. 419 */ 420 if (kill_it && tolerant < 3) { 421 int user_space = 0; 422 423 /* 424 * If the EIPV bit is set, it means the saved IP is the 425 * instruction which caused the MCE. 426 */ 427 if (m.mcgstatus & MCG_STATUS_EIPV) 428 user_space = panicm.ip && (panicm.cs & 3); 429 430 /* 431 * If we know that the error was in user space, send a 432 * SIGBUS. Otherwise, panic if tolerance is low. 433 * 434 * force_sig() takes an awful lot of locks and has a slight 435 * risk of deadlocking. 436 */ 437 if (user_space) { 438 force_sig(SIGBUS, current); 439 } else if (panic_on_oops || tolerant < 2) { 440 mce_panic("Uncorrected machine check", 441 &panicm, mcestart); 442 } 443 } 444 445 /* notify userspace ASAP */ 446 set_thread_flag(TIF_MCE_NOTIFY); 447 448 /* the last thing we do is clear state */ 449 for (i = 0; i < banks; i++) { 450 if (test_bit(i, toclear)) 451 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 452 } 453 wrmsrl(MSR_IA32_MCG_STATUS, 0); 454 out2: 455 atomic_dec(&mce_entry); 456} 457 458#ifdef CONFIG_X86_MCE_INTEL 459/*** 460 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 461 * @cpu: The CPU on which the event occurred. 462 * @status: Event status information 463 * 464 * This function should be called by the thermal interrupt after the 465 * event has been processed and the decision was made to log the event 466 * further. 467 * 468 * The status parameter will be saved to the 'status' field of 'struct mce' 469 * and historically has been the register value of the 470 * MSR_IA32_THERMAL_STATUS (Intel) msr. 471 */ 472void mce_log_therm_throt_event(__u64 status) 473{ 474 struct mce m; 475 476 mce_setup(&m); 477 m.bank = MCE_THERMAL_BANK; 478 m.status = status; 479 mce_log(&m); 480} 481#endif /* CONFIG_X86_MCE_INTEL */ 482 483/* 484 * Periodic polling timer for "silent" machine check errors. If the 485 * poller finds an MCE, poll 2x faster. When the poller finds no more 486 * errors, poll 2x slower (up to check_interval seconds). 487 */ 488static int check_interval = 5 * 60; /* 5 minutes */ 489 490static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 491static DEFINE_PER_CPU(struct timer_list, mce_timer); 492 493static void mcheck_timer(unsigned long data) 494{ 495 struct timer_list *t = &per_cpu(mce_timer, data); 496 int *n; 497 498 WARN_ON(smp_processor_id() != data); 499 500 if (mce_available(¤t_cpu_data)) { 501 machine_check_poll(MCP_TIMESTAMP, 502 &__get_cpu_var(mce_poll_banks)); 503 } 504 505 /* 506 * Alert userspace if needed. If we logged an MCE, reduce the 507 * polling interval, otherwise increase the polling interval. 508 */ 509 n = &__get_cpu_var(next_interval); 510 if (mce_notify_user()) { 511 *n = max(*n/2, HZ/100); 512 } else { 513 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 514 } 515 516 t->expires = jiffies + *n; 517 add_timer(t); 518} 519 520static void mce_do_trigger(struct work_struct *work) 521{ 522 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); 523} 524 525static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 526 527/* 528 * Notify the user(s) about new machine check events. 529 * Can be called from interrupt context, but not from machine check/NMI 530 * context. 531 */ 532int mce_notify_user(void) 533{ 534 /* Not more than two messages every minute */ 535 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 536 537 clear_thread_flag(TIF_MCE_NOTIFY); 538 539 if (test_and_clear_bit(0, ¬ify_user)) { 540 wake_up_interruptible(&mce_wait); 541 542 /* 543 * There is no risk of missing notifications because 544 * work_pending is always cleared before the function is 545 * executed. 546 */ 547 if (trigger[0] && !work_pending(&mce_trigger_work)) 548 schedule_work(&mce_trigger_work); 549 550 if (__ratelimit(&ratelimit)) 551 printk(KERN_INFO "Machine check events logged\n"); 552 553 return 1; 554 } 555 return 0; 556} 557 558/* 559 * Initialize Machine Checks for a CPU. 560 */ 561static int mce_cap_init(void) 562{ 563 unsigned b; 564 u64 cap; 565 566 rdmsrl(MSR_IA32_MCG_CAP, cap); 567 568 b = cap & MCG_BANKCNT_MASK; 569 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 570 571 if (b > MAX_NR_BANKS) { 572 printk(KERN_WARNING 573 "MCE: Using only %u machine check banks out of %u\n", 574 MAX_NR_BANKS, b); 575 b = MAX_NR_BANKS; 576 } 577 578 /* Don't support asymmetric configurations today */ 579 WARN_ON(banks != 0 && b != banks); 580 banks = b; 581 if (!bank) { 582 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 583 if (!bank) 584 return -ENOMEM; 585 memset(bank, 0xff, banks * sizeof(u64)); 586 } 587 588 /* Use accurate RIP reporting if available. */ 589 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 590 rip_msr = MSR_IA32_MCG_EIP; 591 592 return 0; 593} 594 595static void mce_init(void *dummy) 596{ 597 mce_banks_t all_banks; 598 u64 cap; 599 int i; 600 601 /* 602 * Log the machine checks left over from the previous reset. 603 */ 604 bitmap_fill(all_banks, MAX_NR_BANKS); 605 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 606 607 set_in_cr4(X86_CR4_MCE); 608 609 rdmsrl(MSR_IA32_MCG_CAP, cap); 610 if (cap & MCG_CTL_P) 611 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 612 613 for (i = 0; i < banks; i++) { 614 if (skip_bank_init(i)) 615 continue; 616 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 617 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 618 } 619} 620 621/* Add per CPU specific workarounds here */ 622static void mce_cpu_quirks(struct cpuinfo_x86 *c) 623{ 624 /* This should be disabled by the BIOS, but isn't always */ 625 if (c->x86_vendor == X86_VENDOR_AMD) { 626 if (c->x86 == 15 && banks > 4) { 627 /* 628 * disable GART TBL walk error reporting, which 629 * trips off incorrectly with the IOMMU & 3ware 630 * & Cerberus: 631 */ 632 clear_bit(10, (unsigned long *)&bank[4]); 633 } 634 if (c->x86 <= 17 && mce_bootlog < 0) { 635 /* 636 * Lots of broken BIOS around that don't clear them 637 * by default and leave crap in there. Don't log: 638 */ 639 mce_bootlog = 0; 640 } 641 /* 642 * Various K7s with broken bank 0 around. Always disable 643 * by default. 644 */ 645 if (c->x86 == 6) 646 bank[0] = 0; 647 } 648 649 if (c->x86_vendor == X86_VENDOR_INTEL) { 650 /* 651 * SDM documents that on family 6 bank 0 should not be written 652 * because it aliases to another special BIOS controlled 653 * register. 654 * But it's not aliased anymore on model 0x1a+ 655 * Don't ignore bank 0 completely because there could be a 656 * valid event later, merely don't write CTL0. 657 */ 658 659 if (c->x86 == 6 && c->x86_model < 0x1A) 660 __set_bit(0, &dont_init_banks); 661 } 662} 663 664static void mce_cpu_features(struct cpuinfo_x86 *c) 665{ 666 switch (c->x86_vendor) { 667 case X86_VENDOR_INTEL: 668 mce_intel_feature_init(c); 669 break; 670 case X86_VENDOR_AMD: 671 mce_amd_feature_init(c); 672 break; 673 default: 674 break; 675 } 676} 677 678static void mce_init_timer(void) 679{ 680 struct timer_list *t = &__get_cpu_var(mce_timer); 681 int *n = &__get_cpu_var(next_interval); 682 683 *n = check_interval * HZ; 684 if (!*n) 685 return; 686 setup_timer(t, mcheck_timer, smp_processor_id()); 687 t->expires = round_jiffies(jiffies + *n); 688 add_timer(t); 689} 690 691/* 692 * Called for each booted CPU to set up machine checks. 693 * Must be called with preempt off: 694 */ 695void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 696{ 697 if (!mce_available(c)) 698 return; 699 700 if (mce_cap_init() < 0) { 701 mce_disabled = 1; 702 return; 703 } 704 mce_cpu_quirks(c); 705 706 machine_check_vector = do_machine_check; 707 708 mce_init(NULL); 709 mce_cpu_features(c); 710 mce_init_timer(); 711} 712 713/* 714 * Character device to read and clear the MCE log. 715 */ 716 717static DEFINE_SPINLOCK(mce_state_lock); 718static int open_count; /* #times opened */ 719static int open_exclu; /* already open exclusive? */ 720 721static int mce_open(struct inode *inode, struct file *file) 722{ 723 lock_kernel(); 724 spin_lock(&mce_state_lock); 725 726 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 727 spin_unlock(&mce_state_lock); 728 unlock_kernel(); 729 730 return -EBUSY; 731 } 732 733 if (file->f_flags & O_EXCL) 734 open_exclu = 1; 735 open_count++; 736 737 spin_unlock(&mce_state_lock); 738 unlock_kernel(); 739 740 return nonseekable_open(inode, file); 741} 742 743static int mce_release(struct inode *inode, struct file *file) 744{ 745 spin_lock(&mce_state_lock); 746 747 open_count--; 748 open_exclu = 0; 749 750 spin_unlock(&mce_state_lock); 751 752 return 0; 753} 754 755static void collect_tscs(void *data) 756{ 757 unsigned long *cpu_tsc = (unsigned long *)data; 758 759 rdtscll(cpu_tsc[smp_processor_id()]); 760} 761 762static DEFINE_MUTEX(mce_read_mutex); 763 764static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 765 loff_t *off) 766{ 767 char __user *buf = ubuf; 768 unsigned long *cpu_tsc; 769 unsigned prev, next; 770 int i, err; 771 772 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 773 if (!cpu_tsc) 774 return -ENOMEM; 775 776 mutex_lock(&mce_read_mutex); 777 next = rcu_dereference(mcelog.next); 778 779 /* Only supports full reads right now */ 780 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 781 mutex_unlock(&mce_read_mutex); 782 kfree(cpu_tsc); 783 784 return -EINVAL; 785 } 786 787 err = 0; 788 prev = 0; 789 do { 790 for (i = prev; i < next; i++) { 791 unsigned long start = jiffies; 792 793 while (!mcelog.entry[i].finished) { 794 if (time_after_eq(jiffies, start + 2)) { 795 memset(mcelog.entry + i, 0, 796 sizeof(struct mce)); 797 goto timeout; 798 } 799 cpu_relax(); 800 } 801 smp_rmb(); 802 err |= copy_to_user(buf, mcelog.entry + i, 803 sizeof(struct mce)); 804 buf += sizeof(struct mce); 805timeout: 806 ; 807 } 808 809 memset(mcelog.entry + prev, 0, 810 (next - prev) * sizeof(struct mce)); 811 prev = next; 812 next = cmpxchg(&mcelog.next, prev, 0); 813 } while (next != prev); 814 815 synchronize_sched(); 816 817 /* 818 * Collect entries that were still getting written before the 819 * synchronize. 820 */ 821 on_each_cpu(collect_tscs, cpu_tsc, 1); 822 823 for (i = next; i < MCE_LOG_LEN; i++) { 824 if (mcelog.entry[i].finished && 825 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 826 err |= copy_to_user(buf, mcelog.entry+i, 827 sizeof(struct mce)); 828 smp_rmb(); 829 buf += sizeof(struct mce); 830 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 831 } 832 } 833 mutex_unlock(&mce_read_mutex); 834 kfree(cpu_tsc); 835 836 return err ? -EFAULT : buf - ubuf; 837} 838 839static unsigned int mce_poll(struct file *file, poll_table *wait) 840{ 841 poll_wait(file, &mce_wait, wait); 842 if (rcu_dereference(mcelog.next)) 843 return POLLIN | POLLRDNORM; 844 return 0; 845} 846 847static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 848{ 849 int __user *p = (int __user *)arg; 850 851 if (!capable(CAP_SYS_ADMIN)) 852 return -EPERM; 853 854 switch (cmd) { 855 case MCE_GET_RECORD_LEN: 856 return put_user(sizeof(struct mce), p); 857 case MCE_GET_LOG_LEN: 858 return put_user(MCE_LOG_LEN, p); 859 case MCE_GETCLEAR_FLAGS: { 860 unsigned flags; 861 862 do { 863 flags = mcelog.flags; 864 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 865 866 return put_user(flags, p); 867 } 868 default: 869 return -ENOTTY; 870 } 871} 872 873static const struct file_operations mce_chrdev_ops = { 874 .open = mce_open, 875 .release = mce_release, 876 .read = mce_read, 877 .poll = mce_poll, 878 .unlocked_ioctl = mce_ioctl, 879}; 880 881static struct miscdevice mce_log_device = { 882 MISC_MCELOG_MINOR, 883 "mcelog", 884 &mce_chrdev_ops, 885}; 886 887/* 888 * mce=off disables machine check 889 * mce=TOLERANCELEVEL (number, see above) 890 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 891 * mce=nobootlog Don't log MCEs from before booting. 892 */ 893static int __init mcheck_enable(char *str) 894{ 895 if (!strcmp(str, "off")) 896 mce_disabled = 1; 897 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 898 mce_bootlog = (str[0] == 'b'); 899 else if (isdigit(str[0])) 900 get_option(&str, &tolerant); 901 else { 902 printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", 903 str); 904 return 0; 905 } 906 return 1; 907} 908__setup("mce=", mcheck_enable); 909 910/* 911 * Sysfs support 912 */ 913 914/* 915 * Disable machine checks on suspend and shutdown. We can't really handle 916 * them later. 917 */ 918static int mce_disable(void) 919{ 920 int i; 921 922 for (i = 0; i < banks; i++) { 923 if (!skip_bank_init(i)) 924 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 925 } 926 return 0; 927} 928 929static int mce_suspend(struct sys_device *dev, pm_message_t state) 930{ 931 return mce_disable(); 932} 933 934static int mce_shutdown(struct sys_device *dev) 935{ 936 return mce_disable(); 937} 938 939/* 940 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 941 * Only one CPU is active at this time, the others get re-added later using 942 * CPU hotplug: 943 */ 944static int mce_resume(struct sys_device *dev) 945{ 946 mce_init(NULL); 947 mce_cpu_features(¤t_cpu_data); 948 949 return 0; 950} 951 952static void mce_cpu_restart(void *data) 953{ 954 del_timer_sync(&__get_cpu_var(mce_timer)); 955 if (mce_available(¤t_cpu_data)) 956 mce_init(NULL); 957 mce_init_timer(); 958} 959 960/* Reinit MCEs after user configuration changes */ 961static void mce_restart(void) 962{ 963 on_each_cpu(mce_cpu_restart, NULL, 1); 964} 965 966static struct sysdev_class mce_sysclass = { 967 .suspend = mce_suspend, 968 .shutdown = mce_shutdown, 969 .resume = mce_resume, 970 .name = "machinecheck", 971}; 972 973DEFINE_PER_CPU(struct sys_device, mce_dev); 974 975__cpuinitdata 976void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 977 978/* Why are there no generic functions for this? */ 979#define ACCESSOR(name, var, start) \ 980 static ssize_t show_ ## name(struct sys_device *s, \ 981 struct sysdev_attribute *attr, \ 982 char *buf) { \ 983 return sprintf(buf, "%Lx\n", (u64)var); \ 984 } \ 985 static ssize_t set_ ## name(struct sys_device *s, \ 986 struct sysdev_attribute *attr, \ 987 const char *buf, size_t siz) { \ 988 char *end; \ 989 u64 new = simple_strtoull(buf, &end, 0); \ 990 \ 991 if (end == buf) \ 992 return -EINVAL; \ 993 var = new; \ 994 start; \ 995 \ 996 return end-buf; \ 997 } \ 998 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 999 1000static struct sysdev_attribute *bank_attrs; 1001 1002static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1003 char *buf) 1004{ 1005 u64 b = bank[attr - bank_attrs]; 1006 1007 return sprintf(buf, "%llx\n", b); 1008} 1009 1010static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1011 const char *buf, size_t siz) 1012{ 1013 char *end; 1014 u64 new = simple_strtoull(buf, &end, 0); 1015 1016 if (end == buf) 1017 return -EINVAL; 1018 1019 bank[attr - bank_attrs] = new; 1020 mce_restart(); 1021 1022 return end-buf; 1023} 1024 1025static ssize_t 1026show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1027{ 1028 strcpy(buf, trigger); 1029 strcat(buf, "\n"); 1030 return strlen(trigger) + 1; 1031} 1032 1033static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1034 const char *buf, size_t siz) 1035{ 1036 char *p; 1037 int len; 1038 1039 strncpy(trigger, buf, sizeof(trigger)); 1040 trigger[sizeof(trigger)-1] = 0; 1041 len = strlen(trigger); 1042 p = strchr(trigger, '\n'); 1043 1044 if (*p) 1045 *p = 0; 1046 1047 return len; 1048} 1049 1050static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1051static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1052 1053ACCESSOR(check_interval, check_interval, mce_restart()) 1054 1055static struct sysdev_attribute *mce_attrs[] = { 1056 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 1057 NULL 1058}; 1059 1060static cpumask_var_t mce_dev_initialized; 1061 1062/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1063static __cpuinit int mce_create_device(unsigned int cpu) 1064{ 1065 int err; 1066 int i; 1067 1068 if (!mce_available(&boot_cpu_data)) 1069 return -EIO; 1070 1071 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1072 per_cpu(mce_dev, cpu).id = cpu; 1073 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1074 1075 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1076 if (err) 1077 return err; 1078 1079 for (i = 0; mce_attrs[i]; i++) { 1080 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1081 if (err) 1082 goto error; 1083 } 1084 for (i = 0; i < banks; i++) { 1085 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1086 &bank_attrs[i]); 1087 if (err) 1088 goto error2; 1089 } 1090 cpumask_set_cpu(cpu, mce_dev_initialized); 1091 1092 return 0; 1093error2: 1094 while (--i >= 0) 1095 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1096error: 1097 while (--i >= 0) 1098 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1099 1100 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1101 1102 return err; 1103} 1104 1105static __cpuinit void mce_remove_device(unsigned int cpu) 1106{ 1107 int i; 1108 1109 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1110 return; 1111 1112 for (i = 0; mce_attrs[i]; i++) 1113 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1114 1115 for (i = 0; i < banks; i++) 1116 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1117 1118 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1119 cpumask_clear_cpu(cpu, mce_dev_initialized); 1120} 1121 1122/* Make sure there are no machine checks on offlined CPUs. */ 1123static void mce_disable_cpu(void *h) 1124{ 1125 unsigned long action = *(unsigned long *)h; 1126 int i; 1127 1128 if (!mce_available(¤t_cpu_data)) 1129 return; 1130 if (!(action & CPU_TASKS_FROZEN)) 1131 cmci_clear(); 1132 for (i = 0; i < banks; i++) { 1133 if (!skip_bank_init(i)) 1134 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1135 } 1136} 1137 1138static void mce_reenable_cpu(void *h) 1139{ 1140 unsigned long action = *(unsigned long *)h; 1141 int i; 1142 1143 if (!mce_available(¤t_cpu_data)) 1144 return; 1145 1146 if (!(action & CPU_TASKS_FROZEN)) 1147 cmci_reenable(); 1148 for (i = 0; i < banks; i++) { 1149 if (!skip_bank_init(i)) 1150 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1151 } 1152} 1153 1154/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1155static int __cpuinit 1156mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 1157{ 1158 unsigned int cpu = (unsigned long)hcpu; 1159 struct timer_list *t = &per_cpu(mce_timer, cpu); 1160 1161 switch (action) { 1162 case CPU_ONLINE: 1163 case CPU_ONLINE_FROZEN: 1164 mce_create_device(cpu); 1165 if (threshold_cpu_callback) 1166 threshold_cpu_callback(action, cpu); 1167 break; 1168 case CPU_DEAD: 1169 case CPU_DEAD_FROZEN: 1170 if (threshold_cpu_callback) 1171 threshold_cpu_callback(action, cpu); 1172 mce_remove_device(cpu); 1173 break; 1174 case CPU_DOWN_PREPARE: 1175 case CPU_DOWN_PREPARE_FROZEN: 1176 del_timer_sync(t); 1177 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 1178 break; 1179 case CPU_DOWN_FAILED: 1180 case CPU_DOWN_FAILED_FROZEN: 1181 t->expires = round_jiffies(jiffies + 1182 __get_cpu_var(next_interval)); 1183 add_timer_on(t, cpu); 1184 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1185 break; 1186 case CPU_POST_DEAD: 1187 /* intentionally ignoring frozen here */ 1188 cmci_rediscover(cpu); 1189 break; 1190 } 1191 return NOTIFY_OK; 1192} 1193 1194static struct notifier_block mce_cpu_notifier __cpuinitdata = { 1195 .notifier_call = mce_cpu_callback, 1196}; 1197 1198static __init int mce_init_banks(void) 1199{ 1200 int i; 1201 1202 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, 1203 GFP_KERNEL); 1204 if (!bank_attrs) 1205 return -ENOMEM; 1206 1207 for (i = 0; i < banks; i++) { 1208 struct sysdev_attribute *a = &bank_attrs[i]; 1209 1210 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 1211 if (!a->attr.name) 1212 goto nomem; 1213 1214 a->attr.mode = 0644; 1215 a->show = show_bank; 1216 a->store = set_bank; 1217 } 1218 return 0; 1219 1220nomem: 1221 while (--i >= 0) 1222 kfree(bank_attrs[i].attr.name); 1223 kfree(bank_attrs); 1224 bank_attrs = NULL; 1225 1226 return -ENOMEM; 1227} 1228 1229static __init int mce_init_device(void) 1230{ 1231 int err; 1232 int i = 0; 1233 1234 if (!mce_available(&boot_cpu_data)) 1235 return -EIO; 1236 1237 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 1238 1239 err = mce_init_banks(); 1240 if (err) 1241 return err; 1242 1243 err = sysdev_class_register(&mce_sysclass); 1244 if (err) 1245 return err; 1246 1247 for_each_online_cpu(i) { 1248 err = mce_create_device(i); 1249 if (err) 1250 return err; 1251 } 1252 1253 register_hotcpu_notifier(&mce_cpu_notifier); 1254 misc_register(&mce_log_device); 1255 1256 return err; 1257} 1258 1259device_initcall(mce_init_device); 1260 1261#else /* CONFIG_X86_32: */ 1262 1263int nr_mce_banks; 1264EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 1265 1266/* This has to be run for each processor */ 1267void mcheck_init(struct cpuinfo_x86 *c) 1268{ 1269 if (mce_disabled == 1) 1270 return; 1271 1272 switch (c->x86_vendor) { 1273 case X86_VENDOR_AMD: 1274 amd_mcheck_init(c); 1275 break; 1276 1277 case X86_VENDOR_INTEL: 1278 if (c->x86 == 5) 1279 intel_p5_mcheck_init(c); 1280 if (c->x86 == 6) 1281 intel_p6_mcheck_init(c); 1282 if (c->x86 == 15) 1283 intel_p4_mcheck_init(c); 1284 break; 1285 1286 case X86_VENDOR_CENTAUR: 1287 if (c->x86 == 5) 1288 winchip_mcheck_init(c); 1289 break; 1290 1291 default: 1292 break; 1293 } 1294 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 1295} 1296 1297static int __init mcheck_enable(char *str) 1298{ 1299 mce_disabled = -1; 1300 return 1; 1301} 1302 1303__setup("mce", mcheck_enable); 1304 1305#endif /* CONFIG_X86_OLD_MCE */ 1306 1307/* 1308 * Old style boot options parsing. Only for compatibility. 1309 */ 1310static int __init mcheck_disable(char *str) 1311{ 1312 mce_disabled = 1; 1313 return 1; 1314} 1315__setup("nomce", mcheck_disable); 1316