mce.c revision ac9603754dc7e286e62ae4f1067958d5b0075f99
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static int monarch_timeout = -1;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static unsigned long dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;


/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
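
/*
 * Note on injectm (a sketch of the mechanism, not new behavior): this
 * per-CPU slot is the hook used by the software error injector
 * (mce-inject.c, which also reuses mce_chrdev_ops below). Once an
 * injected record's ->finished flag is set, the mce_rdmsrl()/mce_wrmsrl()
 * wrappers further down redirect MSR accesses for the injected bank to
 * the corresponding struct mce fields via msr_to_offset(), so the normal
 * handler paths can be exercised without a real hardware error.
 */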

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &notify_user);
}
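
/*
 * Sketch of the mce_log() protocol above, for reference:
 *
 *  writer (any context, including machine check/NMI):
 *    1. read mcelog.next and scan forward over stale finished entries
 *    2. cmpxchg mcelog.next from entry to entry + 1; on success the slot
 *       is owned by this writer, on failure retry from step 1
 *    3. copy the record, then set ->finished = 1, with wmb() in between
 *       so a reader never observes finished before the data
 *
 *  readers (mce_read() below) only trust entries with ->finished set,
 *  and MCE_OVERFLOW in mcelog.flags records that writers ran out of slots.
 */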

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_add_return(1, &mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if ((m->status & MCI_STATUS_VAL) &&
		    !(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		m->ip = mce_rdmsrl(rip_msr);
		m->cs = 0;
	}
}
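
/*
 * Background on the two MCG_STATUS IP-validity bits used above: RIPV
 * means the RIP pushed on the exception stack is valid for restarting
 * the interrupted program, while EIPV means that RIP actually points
 * at the instruction that caused the machine check. When EIPV is clear,
 * print_mce() tags the RIP with !INEXACT! because it may be unrelated
 * to the error location.
 */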

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_user();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_user();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
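
/*
 * Callers of machine_check_poll() include the periodic timer
 * (mcheck_timer below, with MCP_TIMESTAMP), boot time logging of
 * leftover errors (mce_init, with MCP_UC and optionally MCP_DONTLOG)
 * and, on Intel, the CMCI interrupt handler (see mce_intel_64.c).
 */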

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;
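
/*
 * Rough timeline of the rendezvous implemented by mce_start()/mce_end()
 * below, assuming broadcast worked and monarch_timeout > 0:
 *
 *   all CPUs:  order = atomic_add_return(1, &mce_callin)
 *   all CPUs:  spin in mce_start() until mce_callin == num_online_cpus()
 *   Monarch:   (order == 1) sets mce_executing = 1 and scans its banks
 *   Subjects:  each spins until mce_executing reaches its order, scans,
 *              then lets the next CPU run via atomic_inc() in mce_end()
 *   Monarch:   waits in mce_end() until everyone has scanned, runs
 *              mce_reign() to grade all mces_seen, then resets the global
 *              state and releases the Subjects by setting mce_executing
 *              back to 0
 */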

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int no_way_out, int *order)
{
	int nwo;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout) {
		*order = -1;
		return no_way_out;
	}

	atomic_add(no_way_out, &global_nwo);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * Cache the global no_way_out state.
	 */
	nwo = atomic_read(&global_nwo);

	/*
	 * Monarch starts executing now, the others wait.
	 */
	if (*order == 1) {
		atomic_set(&mce_executing, 1);
		return nwo;
	}

	/*
	 * Now start the scanning loop one by one
	 * in the original callin order.
	 * This way when there are any shared banks it will
	 * be only seen by one CPU before cleared, avoiding duplicates.
	 */
	while (atomic_read(&mce_executing) < *order) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}
	return nwo;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
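
/*
 * Illustration of how tolerant steers the exception handler below,
 * using the default tolerant == 1: corrected errors are left to the
 * poller and only logged; an enabled uncorrected error that hit user
 * space with a precise IP (EIPV set and CPL > 0) gets a SIGBUS; an
 * uncorrected error in the kernel, or one where no_way_out is set,
 * panics. Cranking tolerant up to 3 suppresses both the panic and the
 * SIGBUS (testing only).
 */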

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	order = atomic_add_return(1, &mce_callin);
	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	no_way_out = mce_start(no_way_out, &order);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll.
		 * Leave them alone, unless this panics.
		 */
		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC)
				kill_it = 1;
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		severity = mce_severity(&m, tolerant, NULL);
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = final->ip && (final->cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check", final, msg);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}
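
/*
 * Concrete numbers for the adaptive interval above: with the default
 * check_interval of 300 seconds, each round that logged an event halves
 * the per-CPU interval down to a floor of HZ/100 jiffies (10 ms), and
 * each quiet round doubles it back up to the check_interval ceiling.
 */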

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
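
/*
 * For reference, the MCG_CAP fields consumed above: bits 7:0
 * (MCG_BANKCNT_MASK) give the bank count, bit 9 (MCG_EXT_P) signals
 * extended state registers, and when at least 9 of them are reported
 * in bits 23:16 (MCG_EXT_CNT) the MCG_EIP register exists and is used
 * for accurate RIP reporting through rip_msr.
 */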

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
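
/*
 * The read side of the lockless log, implemented by mce_read() below,
 * works in two passes: first copy out everything up to the snapshot of
 * mcelog.next, waiting briefly (up to 2 jiffies) for any entry whose
 * writer has reserved a slot but not yet set ->finished, and atomically
 * reset mcelog.next to 0. Then, after synchronize_sched() and a TSC
 * snapshot on all CPUs, collect stragglers that were still being
 * written during the first pass.
 */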

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
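
/*
 * Examples for the options above: "mce=off" disables machine checks
 * entirely, "mce=nobootlog" suppresses logging of pre-boot leftovers,
 * and "mce=2,1000000" sets tolerant to 2 with a one second Monarch
 * synchronization timeout (the timeout is given in microseconds, as
 * the USEC_PER_SEC default set in mce_cpu_quirks suggests).
 */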

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	/* strchr() may return NULL; only strip a newline when one exists */
	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	&attr_monarch_timeout.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;
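
/*
 * The attributes declared above appear per CPU under the sysdev class
 * registered below, i.e. /sys/devices/system/machinecheck/machinecheckN/:
 * tolerant, check_interval, monarch_timeout, trigger and one bankX
 * control file per bank. Writes to check_interval and the bank files
 * take effect on all CPUs through mce_restart().
 */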

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);