mce.c revision 1b2797dcc9f0ad89bc382ace26c6baafbc7e33c2
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR 227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static int monarch_timeout = -1;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static unsigned long dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;


/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial setup of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &notify_user);
}
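/*
 * Illustrative sketch (not part of mce.c): the same lockless slot
 * reservation protocol that mce_log() above uses -- reserve a slot by
 * cmpxchg on the 'next' index, fill it, then publish it with a
 * 'finished' flag behind a write barrier -- reduced to a standalone
 * userspace program so the handshake is easier to follow.  All names
 * and the use of GCC __sync builtins are inventions of this sketch.
 *
 *	#include <stdio.h>
 *
 *	#define LOG_LEN 32
 *
 *	struct rec { int finished; int payload; };
 *	static struct rec log_buf[LOG_LEN];
 *	static unsigned log_next;
 *
 *	static int sketch_log(int payload)
 *	{
 *		unsigned entry, next;
 *
 *		for (;;) {
 *			entry = log_next;
 *			// Skip old, already-finished slots; drop on overflow.
 *			while (entry < LOG_LEN && log_buf[entry].finished)
 *				entry++;
 *			if (entry >= LOG_LEN)
 *				return -1;
 *			next = entry + 1;
 *			// Reserve the slot; retry if another writer won.
 *			if (__sync_val_compare_and_swap(&log_next, entry,
 *							next) == entry)
 *				break;
 *		}
 *		log_buf[entry].payload = payload;
 *		__sync_synchronize();	// order payload before the flag
 *		log_buf[entry].finished = 1;
 *		return entry;
 *	}
 *
 *	int main(void)
 *	{
 *		printf("logged at slot %d\n", sketch_log(42));
 *		return 0;
 *	}
 *
 * Readers only trust slots whose 'finished' flag is set, which is why
 * mce_read() below spins briefly on entries that are still in flight.
 */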
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_add_return(1, &mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if ((m->status & MCI_STATUS_VAL) &&
		    !(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts-off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_user();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_user();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non-idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}
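/*
 * Illustrative sketch (not part of mce.c): the MCi_STATUS bits that
 * machine_check_poll() and mce_no_way_out() test above, decoded from a
 * raw register value.  Bit positions follow the architectural x86 MCA
 * layout (VAL bit 63, OVER 62, UC 61, EN 60, MISCV 59, ADDRV 58,
 * PCC 57); the helper name is an invention of this sketch.
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	static void decode_mci_status(uint64_t s)
 *	{
 *		printf("VAL=%d OVER=%d UC=%d EN=%d MISCV=%d ADDRV=%d "
 *		       "PCC=%d MCA-code=%#x\n",
 *		       (int)(s >> 63 & 1),	// valid
 *		       (int)(s >> 62 & 1),	// overflow
 *		       (int)(s >> 61 & 1),	// uncorrected
 *		       (int)(s >> 60 & 1),	// error reporting enabled
 *		       (int)(s >> 59 & 1),	// MISC register valid
 *		       (int)(s >> 58 & 1),	// ADDR register valid
 *		       (int)(s >> 57 & 1),	// processor context corrupt
 *		       (unsigned)(s & 0xffff));	// MCA error code
 *	}
 *
 *	int main(void)
 *	{
 *		decode_mci_status(0xb200000000000000ULL); // VAL|UC|EN|PCC
 *		return 0;
 *	}
 */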
/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;
/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * one at a time in the entry order.
 *
 * TBD double check parallel CPU hot-unplug
 */
static int mce_start(int no_way_out, int *order)
{
	int nwo;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout) {
		*order = -1;
		return no_way_out;
	}

	atomic_add(no_way_out, &global_nwo);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * Cache the global no_way_out state.
	 */
	nwo = atomic_read(&global_nwo);

	/*
	 * Monarch starts executing now, the others wait.
	 */
	if (*order == 1) {
		atomic_set(&mce_executing, 1);
		return nwo;
	}

	/*
	 * Now start the scanning loop one by one
	 * in the original callin order.
	 * This way any shared banks are only seen by one CPU
	 * before being cleared, avoiding duplicates.
	 */
	while (atomic_read(&mce_executing) < *order) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}
	return nwo;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
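/*
 * Illustrative sketch (not part of mce.c): the Monarch/Subject
 * rendezvous of mce_start()/mce_end() above, reduced to a userspace
 * pthread demo (build with: cc -pthread demo.c).  Each thread takes a
 * callin number; number 1 plays the Monarch, the rest run one at a
 * time in callin order.  All names and the GCC __sync builtins are
 * inventions of this sketch; the real code adds the timeouts and
 * no_way_out accounting shown above.
 *
 *	#include <pthread.h>
 *	#include <stdio.h>
 *
 *	#define NCPUS 4
 *
 *	static int callin, executing;	// accessed with GCC atomics
 *
 *	static void *fake_handler(void *arg)
 *	{
 *		int order = __sync_add_and_fetch(&callin, 1);
 *
 *		// Wait until everyone has checked in.
 *		while (__sync_fetch_and_add(&callin, 0) != NCPUS)
 *			;
 *		if (order == 1)		// Monarch scans first
 *			__sync_lock_test_and_set(&executing, 1);
 *		else			// Subjects wait their turn
 *			while (__sync_fetch_and_add(&executing, 0) < order)
 *				;
 *		printf("cpu order %d scanning banks\n", order);
 *		__sync_add_and_fetch(&executing, 1); // let the next one run
 *		return NULL;
 *	}
 *
 *	int main(void)
 *	{
 *		pthread_t t[NCPUS];
 *		int i;
 *
 *		for (i = 0; i < NCPUS; i++)
 *			pthread_create(&t[i], NULL, fake_handler, NULL);
 *		for (i = 0; i < NCPUS; i++)
 *			pthread_join(t[i], NULL);
 *		return 0;
 *	}
 *
 * Serializing the bank scan this way is what lets shared banks be
 * cleared exactly once, as the comment in mce_start() explains.
 */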
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	order = atomic_add_return(1, &mce_callin);
	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	no_way_out = mce_start(no_way_out, &order);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Corrected errors are handled by machine_check_poll().
		 * Leave them alone, unless this panics.
		 */
		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC)
				kill_it = 1;
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		severity = mce_severity(&m, tolerant, NULL);
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = final->ip && (final->cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check", final, msg);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
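/*
 * Illustrative sketch (not part of mce.c): pulling apart an
 * MSR_IA32_MCG_CAP value the way mce_cap_init() above does.  The field
 * layout (bank count in bits 7:0, MCG_CTL_P bit 8, MCG_EXT_P bit 9,
 * extended-register count in bits 23:16) follows the architectural x86
 * MCA definition; the helper name is an invention of this sketch.
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	static void decode_mcg_cap(uint64_t cap)
 *	{
 *		unsigned banks   = cap & 0xff;	    // MCG_BANKCNT_MASK
 *		int	 ctl_p   = cap >> 8 & 1;    // MCG_CTL present
 *		int	 ext_p   = cap >> 9 & 1;    // extended regs present
 *		unsigned ext_cnt = cap >> 16 & 0xff; // MCG_EXT_CNT(cap)
 *
 *		printf("%u banks, MCG_CTL %spresent, %u extended registers\n",
 *		       banks, ctl_p ? "" : "not ", ext_p ? ext_cnt : 0);
 *		if (ext_p && ext_cnt >= 9)
 *			printf("accurate RIP reporting (MCG_EIP) available\n");
 *	}
 *
 *	int main(void)
 *	{
 *		decode_mcg_cap(0x90306ULL); // 6 banks, CTL_P|EXT_P, 9 ext regs
 *		return 0;
 *	}
 */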
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */
static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
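/*
 * Illustrative sketch (not part of mce.c): a minimal userspace reader
 * for the /dev/mcelog device implemented above, along the lines of
 * what mcelog(8) does.  It assumes the MCE_GET_RECORD_LEN and
 * MCE_GET_LOG_LEN ioctl definitions are visible through <asm/mce.h>;
 * error handling is kept to a bare minimum.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <asm/mce.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/dev/mcelog", O_RDONLY);
 *		int reclen, loglen;
 *		char *buf;
 *		ssize_t n;
 *
 *		if (fd < 0 || ioctl(fd, MCE_GET_RECORD_LEN, &reclen) < 0 ||
 *		    ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0)
 *			return 1;
 *		buf = malloc((size_t)reclen * loglen);
 *		if (!buf)
 *			return 1;
 *		// mce_read() only supports full-buffer reads; see above.
 *		n = read(fd, buf, (size_t)reclen * loglen);
 *		if (n > 0)
 *			printf("read %zd bytes (%zd records)\n",
 *			       n, n / reclen);
 *		free(buf);
 *		close(fd);
 *		return 0;
 *	}
 */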
/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	&attr_monarch_timeout.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;
/* Per CPU sysdev init. All of the CPUs still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}

/* Get notified when a CPU comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);