mce.c revision 80f033610fb968e75f5d470233d8d0260d7a72ed
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10#include <linux/thread_info.h> 11#include <linux/capability.h> 12#include <linux/miscdevice.h> 13#include <linux/ratelimit.h> 14#include <linux/kallsyms.h> 15#include <linux/rcupdate.h> 16#include <linux/kobject.h> 17#include <linux/uaccess.h> 18#include <linux/kdebug.h> 19#include <linux/kernel.h> 20#include <linux/percpu.h> 21#include <linux/string.h> 22#include <linux/device.h> 23#include <linux/syscore_ops.h> 24#include <linux/delay.h> 25#include <linux/ctype.h> 26#include <linux/sched.h> 27#include <linux/sysfs.h> 28#include <linux/types.h> 29#include <linux/slab.h> 30#include <linux/init.h> 31#include <linux/kmod.h> 32#include <linux/poll.h> 33#include <linux/nmi.h> 34#include <linux/cpu.h> 35#include <linux/smp.h> 36#include <linux/fs.h> 37#include <linux/mm.h> 38#include <linux/debugfs.h> 39#include <linux/irq_work.h> 40#include <linux/export.h> 41 42#include <asm/processor.h> 43#include <asm/mce.h> 44#include <asm/msr.h> 45 46#include "mce-internal.h" 47 48static DEFINE_MUTEX(mce_chrdev_read_mutex); 49 50#define rcu_dereference_check_mce(p) \ 51 rcu_dereference_index_check((p), \ 52 rcu_read_lock_sched_held() || \ 53 lockdep_is_held(&mce_chrdev_read_mutex)) 54 55#define CREATE_TRACE_POINTS 56#include <trace/events/mce.h> 57 58int mce_disabled __read_mostly; 59 60#define MISC_MCELOG_MINOR 227 61 62#define SPINUNIT 100 /* 100ns */ 63 64atomic_t mce_entry; 65 66DEFINE_PER_CPU(unsigned, mce_exception_count); 67 68/* 69 * Tolerant levels: 70 * 0: always panic on uncorrected errors, log corrected errors 71 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 72 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 73 * 3: never panic or SIGBUS, log all errors (for testing only) 74 */ 75static int tolerant __read_mostly = 1; 76static int banks __read_mostly; 77static int rip_msr __read_mostly; 78static int mce_bootlog __read_mostly = -1; 79static int monarch_timeout __read_mostly = -1; 80static int mce_panic_timeout __read_mostly; 81static int mce_dont_log_ce __read_mostly; 82int mce_cmci_disabled __read_mostly; 83int mce_ignore_ce __read_mostly; 84int mce_ser __read_mostly; 85 86struct mce_bank *mce_banks __read_mostly; 87 88/* User mode helper program triggered by machine check event */ 89static unsigned long mce_need_notify; 90static char mce_helper[128]; 91static char *mce_helper_argv[2] = { mce_helper, NULL }; 92 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); 94 95static DEFINE_PER_CPU(struct mce, mces_seen); 96static int cpu_missing; 97 98/* MCA banks polled by the period polling timer for corrected events */ 99DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 100 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 101}; 102 103static DEFINE_PER_CPU(struct work_struct, mce_work); 104 105/* 106 * CPU/chipset specific EDAC code can register a notifier call here to print 107 * MCE errors in a human-readable form. 
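 *
 * Illustrative sketch (not part of the original source): a decoder would
 * hook in through mce_register_decode_chain()/mce_unregister_decode_chain()
 * defined further down, and receives the struct mce as the notifier data
 * argument:
 *
 *	static int my_decoder(struct notifier_block *nb, unsigned long val,
 *			      void *data)
 *	{
 *		struct mce *m = data;
 *
 *		pr_info("bank %d status 0x%llx\n", m->bank, m->status);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_decoder_nb = {
 *		.notifier_call = my_decoder,
 *	};
 *
 *	mce_register_decode_chain(&my_decoder_nb);	(on module init)
 *	mce_unregister_decode_chain(&my_decoder_nb);	(on module exit)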
108 */ 109ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 110 111/* Do initial initialization of a struct mce */ 112void mce_setup(struct mce *m) 113{ 114 memset(m, 0, sizeof(struct mce)); 115 m->cpu = m->extcpu = smp_processor_id(); 116 rdtscll(m->tsc); 117 /* We hope get_seconds stays lockless */ 118 m->time = get_seconds(); 119 m->cpuvendor = boot_cpu_data.x86_vendor; 120 m->cpuid = cpuid_eax(1); 121 m->socketid = cpu_data(m->extcpu).phys_proc_id; 122 m->apicid = cpu_data(m->extcpu).initial_apicid; 123 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 124} 125 126DEFINE_PER_CPU(struct mce, injectm); 127EXPORT_PER_CPU_SYMBOL_GPL(injectm); 128 129/* 130 * Lockless MCE logging infrastructure. 131 * This avoids deadlocks on printk locks without having to break locks. Also 132 * separate MCEs from kernel messages to avoid bogus bug reports. 133 */ 134 135static struct mce_log mcelog = { 136 .signature = MCE_LOG_SIGNATURE, 137 .len = MCE_LOG_LEN, 138 .recordlen = sizeof(struct mce), 139}; 140 141void mce_log(struct mce *mce) 142{ 143 unsigned next, entry; 144 int ret = 0; 145 146 /* Emit the trace record: */ 147 trace_mce_record(mce); 148 149 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); 150 if (ret == NOTIFY_STOP) 151 return; 152 153 mce->finished = 0; 154 wmb(); 155 for (;;) { 156 entry = rcu_dereference_check_mce(mcelog.next); 157 for (;;) { 158 159 /* 160 * When the buffer fills up discard new entries. 161 * Assume that the earlier errors are the more 162 * interesting ones: 163 */ 164 if (entry >= MCE_LOG_LEN) { 165 set_bit(MCE_OVERFLOW, 166 (unsigned long *)&mcelog.flags); 167 return; 168 } 169 /* Old left over entry. Skip: */ 170 if (mcelog.entry[entry].finished) { 171 entry++; 172 continue; 173 } 174 break; 175 } 176 smp_rmb(); 177 next = entry + 1; 178 if (cmpxchg(&mcelog.next, entry, next) == entry) 179 break; 180 } 181 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 182 wmb(); 183 mcelog.entry[entry].finished = 1; 184 wmb(); 185 186 mce->finished = 1; 187 set_bit(0, &mce_need_notify); 188} 189 190static void drain_mcelog_buffer(void) 191{ 192 unsigned int next, i, prev = 0; 193 194 next = ACCESS_ONCE(mcelog.next); 195 196 do { 197 struct mce *m; 198 199 /* drain what was logged during boot */ 200 for (i = prev; i < next; i++) { 201 unsigned long start = jiffies; 202 unsigned retries = 1; 203 204 m = &mcelog.entry[i]; 205 206 while (!m->finished) { 207 if (time_after_eq(jiffies, start + 2*retries)) 208 retries++; 209 210 cpu_relax(); 211 212 if (!m->finished && retries >= 4) { 213 pr_err("MCE: skipping error being logged currently!\n"); 214 break; 215 } 216 } 217 smp_rmb(); 218 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 219 } 220 221 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); 222 prev = next; 223 next = cmpxchg(&mcelog.next, prev, 0); 224 } while (next != prev); 225} 226 227 228void mce_register_decode_chain(struct notifier_block *nb) 229{ 230 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); 231 drain_mcelog_buffer(); 232} 233EXPORT_SYMBOL_GPL(mce_register_decode_chain); 234 235void mce_unregister_decode_chain(struct notifier_block *nb) 236{ 237 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); 238} 239EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); 240 241static void print_mce(struct mce *m) 242{ 243 int ret = 0; 244 245 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 246 m->extcpu, m->mcgstatus, m->bank, m->status); 247 248 if (m->ip) { 249 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 250 
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 251 m->cs, m->ip); 252 253 if (m->cs == __KERNEL_CS) 254 print_symbol("{%s}", m->ip); 255 pr_cont("\n"); 256 } 257 258 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 259 if (m->addr) 260 pr_cont("ADDR %llx ", m->addr); 261 if (m->misc) 262 pr_cont("MISC %llx ", m->misc); 263 264 pr_cont("\n"); 265 /* 266 * Note this output is parsed by external tools and old fields 267 * should not be changed. 268 */ 269 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 270 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 271 cpu_data(m->extcpu).microcode); 272 273 /* 274 * Print out human-readable details about the MCE error, 275 * (if the CPU has an implementation for that) 276 */ 277 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 278 if (ret == NOTIFY_STOP) 279 return; 280 281 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 282} 283 284#define PANIC_TIMEOUT 5 /* 5 seconds */ 285 286static atomic_t mce_paniced; 287 288static int fake_panic; 289static atomic_t mce_fake_paniced; 290 291/* Panic in progress. Enable interrupts and wait for final IPI */ 292static void wait_for_panic(void) 293{ 294 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 295 296 preempt_disable(); 297 local_irq_enable(); 298 while (timeout-- > 0) 299 udelay(1); 300 if (panic_timeout == 0) 301 panic_timeout = mce_panic_timeout; 302 panic("Panicing machine check CPU died"); 303} 304 305static void mce_panic(char *msg, struct mce *final, char *exp) 306{ 307 int i, apei_err = 0; 308 309 if (!fake_panic) { 310 /* 311 * Make sure only one CPU runs in machine check panic 312 */ 313 if (atomic_inc_return(&mce_paniced) > 1) 314 wait_for_panic(); 315 barrier(); 316 317 bust_spinlocks(1); 318 console_verbose(); 319 } else { 320 /* Don't log too much for fake panic */ 321 if (atomic_inc_return(&mce_fake_paniced) > 1) 322 return; 323 } 324 /* First print corrected ones that are still unlogged */ 325 for (i = 0; i < MCE_LOG_LEN; i++) { 326 struct mce *m = &mcelog.entry[i]; 327 if (!(m->status & MCI_STATUS_VAL)) 328 continue; 329 if (!(m->status & MCI_STATUS_UC)) { 330 print_mce(m); 331 if (!apei_err) 332 apei_err = apei_write_mce(m); 333 } 334 } 335 /* Now print uncorrected but with the final one last */ 336 for (i = 0; i < MCE_LOG_LEN; i++) { 337 struct mce *m = &mcelog.entry[i]; 338 if (!(m->status & MCI_STATUS_VAL)) 339 continue; 340 if (!(m->status & MCI_STATUS_UC)) 341 continue; 342 if (!final || memcmp(m, final, sizeof(struct mce))) { 343 print_mce(m); 344 if (!apei_err) 345 apei_err = apei_write_mce(m); 346 } 347 } 348 if (final) { 349 print_mce(final); 350 if (!apei_err) 351 apei_err = apei_write_mce(final); 352 } 353 if (cpu_missing) 354 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 355 if (exp) 356 pr_emerg(HW_ERR "Machine check: %s\n", exp); 357 if (!fake_panic) { 358 if (panic_timeout == 0) 359 panic_timeout = mce_panic_timeout; 360 panic(msg); 361 } else 362 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 363} 364 365/* Support code for software error injection */ 366 367static int msr_to_offset(u32 msr) 368{ 369 unsigned bank = __this_cpu_read(injectm.bank); 370 371 if (msr == rip_msr) 372 return offsetof(struct mce, ip); 373 if (msr == MSR_IA32_MCx_STATUS(bank)) 374 return offsetof(struct mce, status); 375 if (msr == MSR_IA32_MCx_ADDR(bank)) 376 return offsetof(struct mce, addr); 377 if (msr == MSR_IA32_MCx_MISC(bank)) 378 return offsetof(struct mce, misc); 379 if (msr == MSR_IA32_MCG_STATUS) 380 
return offsetof(struct mce, mcgstatus); 381 return -1; 382} 383 384/* MSR access wrappers used for error injection */ 385static u64 mce_rdmsrl(u32 msr) 386{ 387 u64 v; 388 389 if (__this_cpu_read(injectm.finished)) { 390 int offset = msr_to_offset(msr); 391 392 if (offset < 0) 393 return 0; 394 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 395 } 396 397 if (rdmsrl_safe(msr, &v)) { 398 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 399 /* 400 * Return zero in case the access faulted. This should 401 * not happen normally but can happen if the CPU does 402 * something weird, or if the code is buggy. 403 */ 404 v = 0; 405 } 406 407 return v; 408} 409 410static void mce_wrmsrl(u32 msr, u64 v) 411{ 412 if (__this_cpu_read(injectm.finished)) { 413 int offset = msr_to_offset(msr); 414 415 if (offset >= 0) 416 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 417 return; 418 } 419 wrmsrl(msr, v); 420} 421 422/* 423 * Collect all global (w.r.t. this processor) status about this machine 424 * check into our "mce" struct so that we can use it later to assess 425 * the severity of the problem as we read per-bank specific details. 426 */ 427static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 428{ 429 mce_setup(m); 430 431 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 432 if (regs) { 433 /* 434 * Get the address of the instruction at the time of 435 * the machine check error. 436 */ 437 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 438 m->ip = regs->ip; 439 m->cs = regs->cs; 440 } 441 /* Use accurate RIP reporting if available. */ 442 if (rip_msr) 443 m->ip = mce_rdmsrl(rip_msr); 444 } 445} 446 447/* 448 * Simple lockless ring to communicate PFNs from the exception handler with the 449 * process context work function. This is vastly simplified because there's 450 * only a single reader and a single writer. 
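 *
 * r->start == r->end means empty, and the slot just before r->start is never
 * written, so at most MCE_RING_SIZE - 1 PFNs can be queued at once (the
 * "one entry less" noted below). mce_ring_add() runs in machine check
 * context and mce_ring_get() in the workqueue, which is why no locking is
 * needed.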
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
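 *
 * The mcp_flags argument controls what is processed: without MCP_UC,
 * uncorrected/signalled events are left to the exception handler, and
 * MCP_DONTLOG suppresses mce_log() (used for the boot-time scan of leftover
 * errors in __mcheck_cpu_init_generic()).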
580 */ 581void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 582{ 583 struct mce m; 584 int i; 585 586 percpu_inc(mce_poll_count); 587 588 mce_gather_info(&m, NULL); 589 590 for (i = 0; i < banks; i++) { 591 if (!mce_banks[i].ctl || !test_bit(i, *b)) 592 continue; 593 594 m.misc = 0; 595 m.addr = 0; 596 m.bank = i; 597 m.tsc = 0; 598 599 barrier(); 600 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 601 if (!(m.status & MCI_STATUS_VAL)) 602 continue; 603 604 /* 605 * Uncorrected or signalled events are handled by the exception 606 * handler when it is enabled, so don't process those here. 607 * 608 * TBD do the same check for MCI_STATUS_EN here? 609 */ 610 if (!(flags & MCP_UC) && 611 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 612 continue; 613 614 mce_read_aux(&m, i); 615 616 if (!(flags & MCP_TIMESTAMP)) 617 m.tsc = 0; 618 /* 619 * Don't get the IP here because it's unlikely to 620 * have anything to do with the actual error location. 621 */ 622 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) 623 mce_log(&m); 624 625 /* 626 * Clear state for this bank. 627 */ 628 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 629 } 630 631 /* 632 * Don't clear MCG_STATUS here because it's only defined for 633 * exceptions. 634 */ 635 636 sync_core(); 637} 638EXPORT_SYMBOL_GPL(machine_check_poll); 639 640/* 641 * Do a quick check if any of the events requires a panic. 642 * This decides if we keep the events around or clear them. 643 */ 644static int mce_no_way_out(struct mce *m, char **msg) 645{ 646 int i; 647 648 for (i = 0; i < banks; i++) { 649 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 650 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 651 return 1; 652 } 653 return 0; 654} 655 656/* 657 * Variable to establish order between CPUs while scanning. 658 * Each CPU spins initially until executing is equal its number. 659 */ 660static atomic_t mce_executing; 661 662/* 663 * Defines order of CPUs on entry. First CPU becomes Monarch. 664 */ 665static atomic_t mce_callin; 666 667/* 668 * Check if a timeout waiting for other CPUs happened. 669 */ 670static int mce_timed_out(u64 *t) 671{ 672 /* 673 * The others already did panic for some reason. 674 * Bail out like in a timeout. 675 * rmb() to tell the compiler that system_state 676 * might have been modified by someone else. 677 */ 678 rmb(); 679 if (atomic_read(&mce_paniced)) 680 wait_for_panic(); 681 if (!monarch_timeout) 682 goto out; 683 if ((s64)*t < SPINUNIT) { 684 /* CHECKME: Make panic default for 1 too? */ 685 if (tolerant < 1) 686 mce_panic("Timeout synchronizing machine check over CPUs", 687 NULL, NULL); 688 cpu_missing = 1; 689 return 1; 690 } 691 *t -= SPINUNIT; 692out: 693 touch_nmi_watchdog(); 694 return 0; 695} 696 697/* 698 * The Monarch's reign. The Monarch is the CPU who entered 699 * the machine check handler first. It waits for the others to 700 * raise the exception too and then grades them. When any 701 * error is fatal panic. Only then let the others continue. 702 * 703 * The other CPUs entering the MCE handler will be controlled by the 704 * Monarch. They are called Subjects. 705 * 706 * This way we prevent any potential data corruption in a unrecoverable case 707 * and also makes sure always all CPU's errors are examined. 708 * 709 * Also this detects the case of a machine check event coming from outer 710 * space (not detected by any CPUs) In this case some external agent wants 711 * us to shut down, so panic too. 
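 *
 * Concretely (see mce_start()/mce_end() below): every CPU atomically bumps
 * mce_callin on entry and the CPU that draws number 1 becomes the Monarch.
 * Subjects spin until mce_executing reaches their callin order, scan their
 * banks, and then wait for the Monarch to run mce_reign() and reset the
 * counters before they return.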
712 * 713 * The other CPUs might still decide to panic if the handler happens 714 * in a unrecoverable place, but in this case the system is in a semi-stable 715 * state and won't corrupt anything by itself. It's ok to let the others 716 * continue for a bit first. 717 * 718 * All the spin loops have timeouts; when a timeout happens a CPU 719 * typically elects itself to be Monarch. 720 */ 721static void mce_reign(void) 722{ 723 int cpu; 724 struct mce *m = NULL; 725 int global_worst = 0; 726 char *msg = NULL; 727 char *nmsg = NULL; 728 729 /* 730 * This CPU is the Monarch and the other CPUs have run 731 * through their handlers. 732 * Grade the severity of the errors of all the CPUs. 733 */ 734 for_each_possible_cpu(cpu) { 735 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 736 &nmsg); 737 if (severity > global_worst) { 738 msg = nmsg; 739 global_worst = severity; 740 m = &per_cpu(mces_seen, cpu); 741 } 742 } 743 744 /* 745 * Cannot recover? Panic here then. 746 * This dumps all the mces in the log buffer and stops the 747 * other CPUs. 748 */ 749 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 750 mce_panic("Fatal Machine check", m, msg); 751 752 /* 753 * For UC somewhere we let the CPU who detects it handle it. 754 * Also must let continue the others, otherwise the handling 755 * CPU could deadlock on a lock. 756 */ 757 758 /* 759 * No machine check event found. Must be some external 760 * source or one CPU is hung. Panic. 761 */ 762 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) 763 mce_panic("Machine check from unknown source", NULL, NULL); 764 765 /* 766 * Now clear all the mces_seen so that they don't reappear on 767 * the next mce. 768 */ 769 for_each_possible_cpu(cpu) 770 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); 771} 772 773static atomic_t global_nwo; 774 775/* 776 * Start of Monarch synchronization. This waits until all CPUs have 777 * entered the exception handler and then determines if any of them 778 * saw a fatal event that requires panic. Then it executes them 779 * in the entry order. 780 * TBD double check parallel CPU hotunplug 781 */ 782static int mce_start(int *no_way_out) 783{ 784 int order; 785 int cpus = num_online_cpus(); 786 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 787 788 if (!timeout) 789 return -1; 790 791 atomic_add(*no_way_out, &global_nwo); 792 /* 793 * global_nwo should be updated before mce_callin 794 */ 795 smp_wmb(); 796 order = atomic_inc_return(&mce_callin); 797 798 /* 799 * Wait for everyone. 800 */ 801 while (atomic_read(&mce_callin) != cpus) { 802 if (mce_timed_out(&timeout)) { 803 atomic_set(&global_nwo, 0); 804 return -1; 805 } 806 ndelay(SPINUNIT); 807 } 808 809 /* 810 * mce_callin should be read before global_nwo 811 */ 812 smp_rmb(); 813 814 if (order == 1) { 815 /* 816 * Monarch: Starts executing now, the others wait. 817 */ 818 atomic_set(&mce_executing, 1); 819 } else { 820 /* 821 * Subject: Now start the scanning loop one by one in 822 * the original callin order. 823 * This way when there are any shared banks it will be 824 * only seen by one CPU before cleared, avoiding duplicates. 825 */ 826 while (atomic_read(&mce_executing) < order) { 827 if (mce_timed_out(&timeout)) { 828 atomic_set(&global_nwo, 0); 829 return -1; 830 } 831 ndelay(SPINUNIT); 832 } 833 } 834 835 /* 836 * Cache the global no_way_out state. 837 */ 838 *no_way_out = atomic_read(&global_nwo); 839 840 return order; 841} 842 843/* 844 * Synchronize between CPUs after main scanning loop. 
845 * This invokes the bulk of the Monarch processing. 846 */ 847static int mce_end(int order) 848{ 849 int ret = -1; 850 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 851 852 if (!timeout) 853 goto reset; 854 if (order < 0) 855 goto reset; 856 857 /* 858 * Allow others to run. 859 */ 860 atomic_inc(&mce_executing); 861 862 if (order == 1) { 863 /* CHECKME: Can this race with a parallel hotplug? */ 864 int cpus = num_online_cpus(); 865 866 /* 867 * Monarch: Wait for everyone to go through their scanning 868 * loops. 869 */ 870 while (atomic_read(&mce_executing) <= cpus) { 871 if (mce_timed_out(&timeout)) 872 goto reset; 873 ndelay(SPINUNIT); 874 } 875 876 mce_reign(); 877 barrier(); 878 ret = 0; 879 } else { 880 /* 881 * Subject: Wait for Monarch to finish. 882 */ 883 while (atomic_read(&mce_executing) != 0) { 884 if (mce_timed_out(&timeout)) 885 goto reset; 886 ndelay(SPINUNIT); 887 } 888 889 /* 890 * Don't reset anything. That's done by the Monarch. 891 */ 892 return 0; 893 } 894 895 /* 896 * Reset all global state. 897 */ 898reset: 899 atomic_set(&global_nwo, 0); 900 atomic_set(&mce_callin, 0); 901 barrier(); 902 903 /* 904 * Let others run again. 905 */ 906 atomic_set(&mce_executing, 0); 907 return ret; 908} 909 910/* 911 * Check if the address reported by the CPU is in a format we can parse. 912 * It would be possible to add code for most other cases, but all would 913 * be somewhat complicated (e.g. segment offset would require an instruction 914 * parser). So only support physical addresses up to page granuality for now. 915 */ 916static int mce_usable_address(struct mce *m) 917{ 918 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 919 return 0; 920 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) 921 return 0; 922 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) 923 return 0; 924 return 1; 925} 926 927static void mce_clear_state(unsigned long *toclear) 928{ 929 int i; 930 931 for (i = 0; i < banks; i++) { 932 if (test_bit(i, toclear)) 933 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 934 } 935} 936 937/* 938 * Need to save faulting physical address associated with a process 939 * in the machine check handler some place where we can grab it back 940 * later in mce_notify_process() 941 */ 942#define MCE_INFO_MAX 16 943 944struct mce_info { 945 atomic_t inuse; 946 struct task_struct *t; 947 __u64 paddr; 948} mce_info[MCE_INFO_MAX]; 949 950static void mce_save_info(__u64 addr) 951{ 952 struct mce_info *mi; 953 954 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { 955 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { 956 mi->t = current; 957 mi->paddr = addr; 958 return; 959 } 960 } 961 962 mce_panic("Too many concurrent recoverable errors", NULL, NULL); 963} 964 965static struct mce_info *mce_find_info(void) 966{ 967 struct mce_info *mi; 968 969 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) 970 if (atomic_read(&mi->inuse) && mi->t == current) 971 return mi; 972 return NULL; 973} 974 975static void mce_clear_info(struct mce_info *mi) 976{ 977 atomic_set(&mi->inuse, 0); 978} 979 980/* 981 * The actual machine check handler. This only handles real 982 * exceptions when something got corrupted coming in through int 18. 983 * 984 * This is executed in NMI context not subject to normal locking rules. This 985 * implies that most kernel services cannot be safely used. Don't even 986 * think about putting a printk in there! 987 * 988 * On Intel systems this is entered on all CPUs in parallel through 989 * MCE broadcast. 
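 * (Broadcast means CPUs that saw no error of their own still rendezvous
 * here, which is what the Monarch/Subject synchronization above relies on.)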
However some CPUs might be broken beyond repair, 990 * so be always careful when synchronizing with others. 991 */ 992void do_machine_check(struct pt_regs *regs, long error_code) 993{ 994 struct mce m, *final; 995 int i; 996 int worst = 0; 997 int severity; 998 /* 999 * Establish sequential order between the CPUs entering the machine 1000 * check handler. 1001 */ 1002 int order; 1003 /* 1004 * If no_way_out gets set, there is no safe way to recover from this 1005 * MCE. If tolerant is cranked up, we'll try anyway. 1006 */ 1007 int no_way_out = 0; 1008 /* 1009 * If kill_it gets set, there might be a way to recover from this 1010 * error. 1011 */ 1012 int kill_it = 0; 1013 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1014 char *msg = "Unknown"; 1015 1016 atomic_inc(&mce_entry); 1017 1018 percpu_inc(mce_exception_count); 1019 1020 if (!banks) 1021 goto out; 1022 1023 mce_gather_info(&m, regs); 1024 1025 final = &__get_cpu_var(mces_seen); 1026 *final = m; 1027 1028 no_way_out = mce_no_way_out(&m, &msg); 1029 1030 barrier(); 1031 1032 /* 1033 * When no restart IP might need to kill or panic. 1034 * Assume the worst for now, but if we find the 1035 * severity is MCE_AR_SEVERITY we have other options. 1036 */ 1037 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 1038 kill_it = 1; 1039 1040 /* 1041 * Go through all the banks in exclusion of the other CPUs. 1042 * This way we don't report duplicated events on shared banks 1043 * because the first one to see it will clear it. 1044 */ 1045 order = mce_start(&no_way_out); 1046 for (i = 0; i < banks; i++) { 1047 __clear_bit(i, toclear); 1048 if (!mce_banks[i].ctl) 1049 continue; 1050 1051 m.misc = 0; 1052 m.addr = 0; 1053 m.bank = i; 1054 1055 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 1056 if ((m.status & MCI_STATUS_VAL) == 0) 1057 continue; 1058 1059 /* 1060 * Non uncorrected or non signaled errors are handled by 1061 * machine_check_poll. Leave them alone, unless this panics. 1062 */ 1063 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 1064 !no_way_out) 1065 continue; 1066 1067 /* 1068 * Set taint even when machine check was not enabled. 1069 */ 1070 add_taint(TAINT_MACHINE_CHECK); 1071 1072 severity = mce_severity(&m, tolerant, NULL); 1073 1074 /* 1075 * When machine check was for corrected handler don't touch, 1076 * unless we're panicing. 1077 */ 1078 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 1079 continue; 1080 __set_bit(i, toclear); 1081 if (severity == MCE_NO_SEVERITY) { 1082 /* 1083 * Machine check event was not enabled. Clear, but 1084 * ignore. 1085 */ 1086 continue; 1087 } 1088 1089 mce_read_aux(&m, i); 1090 1091 /* 1092 * Action optional error. Queue address for later processing. 1093 * When the ring overflows we just ignore the AO error. 1094 * RED-PEN add some logging mechanism when 1095 * usable_address or mce_add_ring fails. 1096 * RED-PEN don't ignore overflow for tolerant == 0 1097 */ 1098 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1099 mce_ring_add(m.addr >> PAGE_SHIFT); 1100 1101 mce_log(&m); 1102 1103 if (severity > worst) { 1104 *final = m; 1105 worst = severity; 1106 } 1107 } 1108 1109 /* mce_clear_state will clear *final, save locally for use later */ 1110 m = *final; 1111 1112 if (!no_way_out) 1113 mce_clear_state(toclear); 1114 1115 /* 1116 * Do most of the synchronization with other CPUs. 1117 * When there's any problem use only local no_way_out state. 
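	 *
	 * If mce_end() reports a problem (returns < 0), fall back to this
	 * CPU's own worst severity to decide whether we must panic.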
1118 */ 1119 if (mce_end(order) < 0) 1120 no_way_out = worst >= MCE_PANIC_SEVERITY; 1121 1122 /* 1123 * At insane "tolerant" levels we take no action. Otherwise 1124 * we only die if we have no other choice. For less serious 1125 * issues we try to recover, or limit damage to the current 1126 * process. 1127 */ 1128 if (tolerant < 3) { 1129 if (no_way_out) 1130 mce_panic("Fatal machine check on current CPU", &m, msg); 1131 if (worst == MCE_AR_SEVERITY) { 1132 /* schedule action before return to userland */ 1133 mce_save_info(m.addr); 1134 set_thread_flag(TIF_MCE_NOTIFY); 1135 } else if (kill_it) { 1136 force_sig(SIGBUS, current); 1137 } 1138 } 1139 1140 if (worst > 0) 1141 mce_report_event(regs); 1142 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1143out: 1144 atomic_dec(&mce_entry); 1145 sync_core(); 1146} 1147EXPORT_SYMBOL_GPL(do_machine_check); 1148 1149#ifndef CONFIG_MEMORY_FAILURE 1150int memory_failure(unsigned long pfn, int vector, int flags) 1151{ 1152 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1153 BUG_ON(flags & MF_ACTION_REQUIRED); 1154 printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" 1155 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); 1156 1157 return 0; 1158} 1159#endif 1160 1161/* 1162 * Called in process context that interrupted by MCE and marked with 1163 * TIF_MCE_NOTIFY, just before returning to erroneous userland. 1164 * This code is allowed to sleep. 1165 * Attempt possible recovery such as calling the high level VM handler to 1166 * process any corrupted pages, and kill/signal current process if required. 1167 * Action required errors are handled here. 1168 */ 1169void mce_notify_process(void) 1170{ 1171 unsigned long pfn; 1172 struct mce_info *mi = mce_find_info(); 1173 1174 if (!mi) 1175 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1176 pfn = mi->paddr >> PAGE_SHIFT; 1177 1178 clear_thread_flag(TIF_MCE_NOTIFY); 1179 1180 pr_err("Uncorrected hardware memory error in user-access at %llx", 1181 mi->paddr); 1182 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { 1183 pr_err("Memory error not recovered"); 1184 force_sig(SIGBUS, current); 1185 } 1186 mce_clear_info(mi); 1187} 1188 1189/* 1190 * Action optional processing happens here (picking up 1191 * from the list of faulting pages that do_machine_check() 1192 * placed into the "ring"). 1193 */ 1194static void mce_process_work(struct work_struct *dummy) 1195{ 1196 unsigned long pfn; 1197 1198 while (mce_ring_get(&pfn)) 1199 memory_failure(pfn, MCE_VECTOR, 0); 1200} 1201 1202#ifdef CONFIG_X86_MCE_INTEL 1203/*** 1204 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1205 * @cpu: The CPU on which the event occurred. 1206 * @status: Event status information 1207 * 1208 * This function should be called by the thermal interrupt after the 1209 * event has been processed and the decision was made to log the event 1210 * further. 1211 * 1212 * The status parameter will be saved to the 'status' field of 'struct mce' 1213 * and historically has been the register value of the 1214 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1215 */ 1216void mce_log_therm_throt_event(__u64 status) 1217{ 1218 struct mce m; 1219 1220 mce_setup(&m); 1221 m.bank = MCE_THERMAL_BANK; 1222 m.status = status; 1223 mce_log(&m); 1224} 1225#endif /* CONFIG_X86_MCE_INTEL */ 1226 1227/* 1228 * Periodic polling timer for "silent" machine check errors. If the 1229 * poller finds an MCE, poll 2x faster. 
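 * (It is clamped to no more often than every HZ/100 jiffies; see
 * mce_start_timer() below.)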
When the poller finds no more 1230 * errors, poll 2x slower (up to check_interval seconds). 1231 */ 1232static int check_interval = 5 * 60; /* 5 minutes */ 1233 1234static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1235static DEFINE_PER_CPU(struct timer_list, mce_timer); 1236 1237static void mce_start_timer(unsigned long data) 1238{ 1239 struct timer_list *t = &per_cpu(mce_timer, data); 1240 int *n; 1241 1242 WARN_ON(smp_processor_id() != data); 1243 1244 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1245 machine_check_poll(MCP_TIMESTAMP, 1246 &__get_cpu_var(mce_poll_banks)); 1247 } 1248 1249 /* 1250 * Alert userspace if needed. If we logged an MCE, reduce the 1251 * polling interval, otherwise increase the polling interval. 1252 */ 1253 n = &__get_cpu_var(mce_next_interval); 1254 if (mce_notify_irq()) 1255 *n = max(*n/2, HZ/100); 1256 else 1257 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1258 1259 t->expires = jiffies + *n; 1260 add_timer_on(t, smp_processor_id()); 1261} 1262 1263/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1264static void mce_timer_delete_all(void) 1265{ 1266 int cpu; 1267 1268 for_each_online_cpu(cpu) 1269 del_timer_sync(&per_cpu(mce_timer, cpu)); 1270} 1271 1272static void mce_do_trigger(struct work_struct *work) 1273{ 1274 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1275} 1276 1277static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1278 1279/* 1280 * Notify the user(s) about new machine check events. 1281 * Can be called from interrupt context, but not from machine check/NMI 1282 * context. 1283 */ 1284int mce_notify_irq(void) 1285{ 1286 /* Not more than two messages every minute */ 1287 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1288 1289 if (test_and_clear_bit(0, &mce_need_notify)) { 1290 /* wake processes polling /dev/mcelog */ 1291 wake_up_interruptible(&mce_chrdev_wait); 1292 1293 /* 1294 * There is no risk of missing notifications because 1295 * work_pending is always cleared before the function is 1296 * executed. 1297 */ 1298 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1299 schedule_work(&mce_trigger_work); 1300 1301 if (__ratelimit(&ratelimit)) 1302 pr_info(HW_ERR "Machine check events logged\n"); 1303 1304 return 1; 1305 } 1306 return 0; 1307} 1308EXPORT_SYMBOL_GPL(mce_notify_irq); 1309 1310static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1311{ 1312 int i; 1313 1314 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1315 if (!mce_banks) 1316 return -ENOMEM; 1317 for (i = 0; i < banks; i++) { 1318 struct mce_bank *b = &mce_banks[i]; 1319 1320 b->ctl = -1ULL; 1321 b->init = 1; 1322 } 1323 return 0; 1324} 1325 1326/* 1327 * Initialize Machine Checks for a CPU. 1328 */ 1329static int __cpuinit __mcheck_cpu_cap_init(void) 1330{ 1331 unsigned b; 1332 u64 cap; 1333 1334 rdmsrl(MSR_IA32_MCG_CAP, cap); 1335 1336 b = cap & MCG_BANKCNT_MASK; 1337 if (!banks) 1338 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1339 1340 if (b > MAX_NR_BANKS) { 1341 printk(KERN_WARNING 1342 "MCE: Using only %u machine check banks out of %u\n", 1343 MAX_NR_BANKS, b); 1344 b = MAX_NR_BANKS; 1345 } 1346 1347 /* Don't support asymmetric configurations today */ 1348 WARN_ON(banks != 0 && b != banks); 1349 banks = b; 1350 if (!mce_banks) { 1351 int err = __mcheck_cpu_mce_banks_init(); 1352 1353 if (err) 1354 return err; 1355 } 1356 1357 /* Use accurate RIP reporting if available. 
*/ 1358 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1359 rip_msr = MSR_IA32_MCG_EIP; 1360 1361 if (cap & MCG_SER_P) 1362 mce_ser = 1; 1363 1364 return 0; 1365} 1366 1367static void __mcheck_cpu_init_generic(void) 1368{ 1369 mce_banks_t all_banks; 1370 u64 cap; 1371 int i; 1372 1373 /* 1374 * Log the machine checks left over from the previous reset. 1375 */ 1376 bitmap_fill(all_banks, MAX_NR_BANKS); 1377 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1378 1379 set_in_cr4(X86_CR4_MCE); 1380 1381 rdmsrl(MSR_IA32_MCG_CAP, cap); 1382 if (cap & MCG_CTL_P) 1383 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1384 1385 for (i = 0; i < banks; i++) { 1386 struct mce_bank *b = &mce_banks[i]; 1387 1388 if (!b->init) 1389 continue; 1390 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1391 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1392 } 1393} 1394 1395/* Add per CPU specific workarounds here */ 1396static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1397{ 1398 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1399 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1400 return -EOPNOTSUPP; 1401 } 1402 1403 /* This should be disabled by the BIOS, but isn't always */ 1404 if (c->x86_vendor == X86_VENDOR_AMD) { 1405 if (c->x86 == 15 && banks > 4) { 1406 /* 1407 * disable GART TBL walk error reporting, which 1408 * trips off incorrectly with the IOMMU & 3ware 1409 * & Cerberus: 1410 */ 1411 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1412 } 1413 if (c->x86 <= 17 && mce_bootlog < 0) { 1414 /* 1415 * Lots of broken BIOS around that don't clear them 1416 * by default and leave crap in there. Don't log: 1417 */ 1418 mce_bootlog = 0; 1419 } 1420 /* 1421 * Various K7s with broken bank 0 around. Always disable 1422 * by default. 1423 */ 1424 if (c->x86 == 6 && banks > 0) 1425 mce_banks[0].ctl = 0; 1426 1427 /* 1428 * Turn off MC4_MISC thresholding banks on those models since 1429 * they're not supported there. 1430 */ 1431 if (c->x86 == 0x15 && 1432 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { 1433 int i; 1434 u64 val, hwcr; 1435 bool need_toggle; 1436 u32 msrs[] = { 1437 0x00000413, /* MC4_MISC0 */ 1438 0xc0000408, /* MC4_MISC1 */ 1439 }; 1440 1441 rdmsrl(MSR_K7_HWCR, hwcr); 1442 1443 /* McStatusWrEn has to be set */ 1444 need_toggle = !(hwcr & BIT(18)); 1445 1446 if (need_toggle) 1447 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); 1448 1449 for (i = 0; i < ARRAY_SIZE(msrs); i++) { 1450 rdmsrl(msrs[i], val); 1451 1452 /* CntP bit set? */ 1453 if (val & BIT_64(62)) { 1454 val &= ~BIT_64(62); 1455 wrmsrl(msrs[i], val); 1456 } 1457 } 1458 1459 /* restore old settings */ 1460 if (need_toggle) 1461 wrmsrl(MSR_K7_HWCR, hwcr); 1462 } 1463 } 1464 1465 if (c->x86_vendor == X86_VENDOR_INTEL) { 1466 /* 1467 * SDM documents that on family 6 bank 0 should not be written 1468 * because it aliases to another special BIOS controlled 1469 * register. 1470 * But it's not aliased anymore on model 0x1a+ 1471 * Don't ignore bank 0 completely because there could be a 1472 * valid event later, merely don't write CTL0. 1473 */ 1474 1475 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1476 mce_banks[0].init = 0; 1477 1478 /* 1479 * All newer Intel systems support MCE broadcasting. Enable 1480 * synchronization with a one second timeout. 
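		 * ("Newer" here means family > 6, or family 6 model 0xe and
		 * later; monarch_timeout is kept in microseconds, so
		 * USEC_PER_SEC is one second.)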
1481 */ 1482 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1483 monarch_timeout < 0) 1484 monarch_timeout = USEC_PER_SEC; 1485 1486 /* 1487 * There are also broken BIOSes on some Pentium M and 1488 * earlier systems: 1489 */ 1490 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1491 mce_bootlog = 0; 1492 } 1493 if (monarch_timeout < 0) 1494 monarch_timeout = 0; 1495 if (mce_bootlog != 0) 1496 mce_panic_timeout = 30; 1497 1498 return 0; 1499} 1500 1501static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1502{ 1503 if (c->x86 != 5) 1504 return 0; 1505 1506 switch (c->x86_vendor) { 1507 case X86_VENDOR_INTEL: 1508 intel_p5_mcheck_init(c); 1509 return 1; 1510 break; 1511 case X86_VENDOR_CENTAUR: 1512 winchip_mcheck_init(c); 1513 return 1; 1514 break; 1515 } 1516 1517 return 0; 1518} 1519 1520static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1521{ 1522 switch (c->x86_vendor) { 1523 case X86_VENDOR_INTEL: 1524 mce_intel_feature_init(c); 1525 break; 1526 case X86_VENDOR_AMD: 1527 mce_amd_feature_init(c); 1528 break; 1529 default: 1530 break; 1531 } 1532} 1533 1534static void __mcheck_cpu_init_timer(void) 1535{ 1536 struct timer_list *t = &__get_cpu_var(mce_timer); 1537 int *n = &__get_cpu_var(mce_next_interval); 1538 1539 setup_timer(t, mce_start_timer, smp_processor_id()); 1540 1541 if (mce_ignore_ce) 1542 return; 1543 1544 *n = check_interval * HZ; 1545 if (!*n) 1546 return; 1547 t->expires = round_jiffies(jiffies + *n); 1548 add_timer_on(t, smp_processor_id()); 1549} 1550 1551/* Handle unconfigured int18 (should never happen) */ 1552static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1553{ 1554 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1555 smp_processor_id()); 1556} 1557 1558/* Call the installed machine check handler for this CPU setup. */ 1559void (*machine_check_vector)(struct pt_regs *, long error_code) = 1560 unexpected_machine_check; 1561 1562/* 1563 * Called for each booted CPU to set up machine checks. 1564 * Must be called with preempt off: 1565 */ 1566void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1567{ 1568 if (mce_disabled) 1569 return; 1570 1571 if (__mcheck_cpu_ancient_init(c)) 1572 return; 1573 1574 if (!mce_available(c)) 1575 return; 1576 1577 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1578 mce_disabled = 1; 1579 return; 1580 } 1581 1582 machine_check_vector = do_machine_check; 1583 1584 __mcheck_cpu_init_generic(); 1585 __mcheck_cpu_init_vendor(c); 1586 __mcheck_cpu_init_timer(); 1587 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1588 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1589} 1590 1591/* 1592 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. 1593 */ 1594 1595static DEFINE_SPINLOCK(mce_chrdev_state_lock); 1596static int mce_chrdev_open_count; /* #times opened */ 1597static int mce_chrdev_open_exclu; /* already open exclusive? 
*/ 1598 1599static int mce_chrdev_open(struct inode *inode, struct file *file) 1600{ 1601 spin_lock(&mce_chrdev_state_lock); 1602 1603 if (mce_chrdev_open_exclu || 1604 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { 1605 spin_unlock(&mce_chrdev_state_lock); 1606 1607 return -EBUSY; 1608 } 1609 1610 if (file->f_flags & O_EXCL) 1611 mce_chrdev_open_exclu = 1; 1612 mce_chrdev_open_count++; 1613 1614 spin_unlock(&mce_chrdev_state_lock); 1615 1616 return nonseekable_open(inode, file); 1617} 1618 1619static int mce_chrdev_release(struct inode *inode, struct file *file) 1620{ 1621 spin_lock(&mce_chrdev_state_lock); 1622 1623 mce_chrdev_open_count--; 1624 mce_chrdev_open_exclu = 0; 1625 1626 spin_unlock(&mce_chrdev_state_lock); 1627 1628 return 0; 1629} 1630 1631static void collect_tscs(void *data) 1632{ 1633 unsigned long *cpu_tsc = (unsigned long *)data; 1634 1635 rdtscll(cpu_tsc[smp_processor_id()]); 1636} 1637 1638static int mce_apei_read_done; 1639 1640/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ 1641static int __mce_read_apei(char __user **ubuf, size_t usize) 1642{ 1643 int rc; 1644 u64 record_id; 1645 struct mce m; 1646 1647 if (usize < sizeof(struct mce)) 1648 return -EINVAL; 1649 1650 rc = apei_read_mce(&m, &record_id); 1651 /* Error or no more MCE record */ 1652 if (rc <= 0) { 1653 mce_apei_read_done = 1; 1654 /* 1655 * When ERST is disabled, mce_chrdev_read() should return 1656 * "no record" instead of "no device." 1657 */ 1658 if (rc == -ENODEV) 1659 return 0; 1660 return rc; 1661 } 1662 rc = -EFAULT; 1663 if (copy_to_user(*ubuf, &m, sizeof(struct mce))) 1664 return rc; 1665 /* 1666 * In fact, we should have cleared the record after that has 1667 * been flushed to the disk or sent to network in 1668 * /sbin/mcelog, but we have no interface to support that now, 1669 * so just clear it to avoid duplication. 1670 */ 1671 rc = apei_clear_mce(record_id); 1672 if (rc) { 1673 mce_apei_read_done = 1; 1674 return rc; 1675 } 1676 *ubuf += sizeof(struct mce); 1677 1678 return 0; 1679} 1680 1681static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, 1682 size_t usize, loff_t *off) 1683{ 1684 char __user *buf = ubuf; 1685 unsigned long *cpu_tsc; 1686 unsigned prev, next; 1687 int i, err; 1688 1689 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1690 if (!cpu_tsc) 1691 return -ENOMEM; 1692 1693 mutex_lock(&mce_chrdev_read_mutex); 1694 1695 if (!mce_apei_read_done) { 1696 err = __mce_read_apei(&buf, usize); 1697 if (err || buf != ubuf) 1698 goto out; 1699 } 1700 1701 next = rcu_dereference_check_mce(mcelog.next); 1702 1703 /* Only supports full reads right now */ 1704 err = -EINVAL; 1705 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) 1706 goto out; 1707 1708 err = 0; 1709 prev = 0; 1710 do { 1711 for (i = prev; i < next; i++) { 1712 unsigned long start = jiffies; 1713 struct mce *m = &mcelog.entry[i]; 1714 1715 while (!m->finished) { 1716 if (time_after_eq(jiffies, start + 2)) { 1717 memset(m, 0, sizeof(*m)); 1718 goto timeout; 1719 } 1720 cpu_relax(); 1721 } 1722 smp_rmb(); 1723 err |= copy_to_user(buf, m, sizeof(*m)); 1724 buf += sizeof(*m); 1725timeout: 1726 ; 1727 } 1728 1729 memset(mcelog.entry + prev, 0, 1730 (next - prev) * sizeof(struct mce)); 1731 prev = next; 1732 next = cmpxchg(&mcelog.next, prev, 0); 1733 } while (next != prev); 1734 1735 synchronize_sched(); 1736 1737 /* 1738 * Collect entries that were still getting written before the 1739 * synchronize. 
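	 * An entry is copied only if it is finished and its TSC predates the
	 * timestamp collect_tscs() takes on the CPU that logged it, i.e. it
	 * cannot still be in the middle of being written.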
1740 */ 1741 on_each_cpu(collect_tscs, cpu_tsc, 1); 1742 1743 for (i = next; i < MCE_LOG_LEN; i++) { 1744 struct mce *m = &mcelog.entry[i]; 1745 1746 if (m->finished && m->tsc < cpu_tsc[m->cpu]) { 1747 err |= copy_to_user(buf, m, sizeof(*m)); 1748 smp_rmb(); 1749 buf += sizeof(*m); 1750 memset(m, 0, sizeof(*m)); 1751 } 1752 } 1753 1754 if (err) 1755 err = -EFAULT; 1756 1757out: 1758 mutex_unlock(&mce_chrdev_read_mutex); 1759 kfree(cpu_tsc); 1760 1761 return err ? err : buf - ubuf; 1762} 1763 1764static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) 1765{ 1766 poll_wait(file, &mce_chrdev_wait, wait); 1767 if (rcu_access_index(mcelog.next)) 1768 return POLLIN | POLLRDNORM; 1769 if (!mce_apei_read_done && apei_check_mce()) 1770 return POLLIN | POLLRDNORM; 1771 return 0; 1772} 1773 1774static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, 1775 unsigned long arg) 1776{ 1777 int __user *p = (int __user *)arg; 1778 1779 if (!capable(CAP_SYS_ADMIN)) 1780 return -EPERM; 1781 1782 switch (cmd) { 1783 case MCE_GET_RECORD_LEN: 1784 return put_user(sizeof(struct mce), p); 1785 case MCE_GET_LOG_LEN: 1786 return put_user(MCE_LOG_LEN, p); 1787 case MCE_GETCLEAR_FLAGS: { 1788 unsigned flags; 1789 1790 do { 1791 flags = mcelog.flags; 1792 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1793 1794 return put_user(flags, p); 1795 } 1796 default: 1797 return -ENOTTY; 1798 } 1799} 1800 1801static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, 1802 size_t usize, loff_t *off); 1803 1804void register_mce_write_callback(ssize_t (*fn)(struct file *filp, 1805 const char __user *ubuf, 1806 size_t usize, loff_t *off)) 1807{ 1808 mce_write = fn; 1809} 1810EXPORT_SYMBOL_GPL(register_mce_write_callback); 1811 1812ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, 1813 size_t usize, loff_t *off) 1814{ 1815 if (mce_write) 1816 return mce_write(filp, ubuf, usize, off); 1817 else 1818 return -EINVAL; 1819} 1820 1821static const struct file_operations mce_chrdev_ops = { 1822 .open = mce_chrdev_open, 1823 .release = mce_chrdev_release, 1824 .read = mce_chrdev_read, 1825 .write = mce_chrdev_write, 1826 .poll = mce_chrdev_poll, 1827 .unlocked_ioctl = mce_chrdev_ioctl, 1828 .llseek = no_llseek, 1829}; 1830 1831static struct miscdevice mce_chrdev_device = { 1832 MISC_MCELOG_MINOR, 1833 "mcelog", 1834 &mce_chrdev_ops, 1835}; 1836 1837/* 1838 * mce=off Disables machine check 1839 * mce=no_cmci Disables CMCI 1840 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1841 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1842 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1843 * monarchtimeout is how long to wait for other CPUs on machine 1844 * check, or 0 to not wait 1845 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1846 * mce=nobootlog Don't log MCEs from before booting. 
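 *
 * Example (illustrative): "mce=2,500000" sets tolerant to 2 and waits up to
 * 500000 us (0.5 s) for other CPUs on a machine check; the timeout is parsed
 * in microseconds.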
1847 */ 1848static int __init mcheck_enable(char *str) 1849{ 1850 if (*str == 0) { 1851 enable_p5_mce(); 1852 return 1; 1853 } 1854 if (*str == '=') 1855 str++; 1856 if (!strcmp(str, "off")) 1857 mce_disabled = 1; 1858 else if (!strcmp(str, "no_cmci")) 1859 mce_cmci_disabled = 1; 1860 else if (!strcmp(str, "dont_log_ce")) 1861 mce_dont_log_ce = 1; 1862 else if (!strcmp(str, "ignore_ce")) 1863 mce_ignore_ce = 1; 1864 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1865 mce_bootlog = (str[0] == 'b'); 1866 else if (isdigit(str[0])) { 1867 get_option(&str, &tolerant); 1868 if (*str == ',') { 1869 ++str; 1870 get_option(&str, &monarch_timeout); 1871 } 1872 } else { 1873 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1874 str); 1875 return 0; 1876 } 1877 return 1; 1878} 1879__setup("mce", mcheck_enable); 1880 1881int __init mcheck_init(void) 1882{ 1883 mcheck_intel_therm_init(); 1884 1885 return 0; 1886} 1887 1888/* 1889 * mce_syscore: PM support 1890 */ 1891 1892/* 1893 * Disable machine checks on suspend and shutdown. We can't really handle 1894 * them later. 1895 */ 1896static int mce_disable_error_reporting(void) 1897{ 1898 int i; 1899 1900 for (i = 0; i < banks; i++) { 1901 struct mce_bank *b = &mce_banks[i]; 1902 1903 if (b->init) 1904 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 1905 } 1906 return 0; 1907} 1908 1909static int mce_syscore_suspend(void) 1910{ 1911 return mce_disable_error_reporting(); 1912} 1913 1914static void mce_syscore_shutdown(void) 1915{ 1916 mce_disable_error_reporting(); 1917} 1918 1919/* 1920 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1921 * Only one CPU is active at this time, the others get re-added later using 1922 * CPU hotplug: 1923 */ 1924static void mce_syscore_resume(void) 1925{ 1926 __mcheck_cpu_init_generic(); 1927 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1928} 1929 1930static struct syscore_ops mce_syscore_ops = { 1931 .suspend = mce_syscore_suspend, 1932 .shutdown = mce_syscore_shutdown, 1933 .resume = mce_syscore_resume, 1934}; 1935 1936/* 1937 * mce_device: Sysfs support 1938 */ 1939 1940static void mce_cpu_restart(void *data) 1941{ 1942 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1943 return; 1944 __mcheck_cpu_init_generic(); 1945 __mcheck_cpu_init_timer(); 1946} 1947 1948/* Reinit MCEs after user configuration changes */ 1949static void mce_restart(void) 1950{ 1951 mce_timer_delete_all(); 1952 on_each_cpu(mce_cpu_restart, NULL, 1); 1953} 1954 1955/* Toggle features for corrected errors */ 1956static void mce_disable_cmci(void *data) 1957{ 1958 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1959 return; 1960 cmci_clear(); 1961} 1962 1963static void mce_enable_ce(void *all) 1964{ 1965 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1966 return; 1967 cmci_reenable(); 1968 cmci_recheck(); 1969 if (all) 1970 __mcheck_cpu_init_timer(); 1971} 1972 1973static struct bus_type mce_subsys = { 1974 .name = "machinecheck", 1975 .dev_name = "machinecheck", 1976}; 1977 1978DEFINE_PER_CPU(struct device *, mce_device); 1979 1980__cpuinitdata 1981void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1982 1983static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 1984{ 1985 return container_of(attr, struct mce_bank, attr); 1986} 1987 1988static ssize_t show_bank(struct device *s, struct device_attribute *attr, 1989 char *buf) 1990{ 1991 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1992} 1993 1994static ssize_t set_bank(struct device *s, struct 
device_attribute *attr, 1995 const char *buf, size_t size) 1996{ 1997 u64 new; 1998 1999 if (strict_strtoull(buf, 0, &new) < 0) 2000 return -EINVAL; 2001 2002 attr_to_bank(attr)->ctl = new; 2003 mce_restart(); 2004 2005 return size; 2006} 2007 2008static ssize_t 2009show_trigger(struct device *s, struct device_attribute *attr, char *buf) 2010{ 2011 strcpy(buf, mce_helper); 2012 strcat(buf, "\n"); 2013 return strlen(mce_helper) + 1; 2014} 2015 2016static ssize_t set_trigger(struct device *s, struct device_attribute *attr, 2017 const char *buf, size_t siz) 2018{ 2019 char *p; 2020 2021 strncpy(mce_helper, buf, sizeof(mce_helper)); 2022 mce_helper[sizeof(mce_helper)-1] = 0; 2023 p = strchr(mce_helper, '\n'); 2024 2025 if (p) 2026 *p = 0; 2027 2028 return strlen(mce_helper) + !!p; 2029} 2030 2031static ssize_t set_ignore_ce(struct device *s, 2032 struct device_attribute *attr, 2033 const char *buf, size_t size) 2034{ 2035 u64 new; 2036 2037 if (strict_strtoull(buf, 0, &new) < 0) 2038 return -EINVAL; 2039 2040 if (mce_ignore_ce ^ !!new) { 2041 if (new) { 2042 /* disable ce features */ 2043 mce_timer_delete_all(); 2044 on_each_cpu(mce_disable_cmci, NULL, 1); 2045 mce_ignore_ce = 1; 2046 } else { 2047 /* enable ce features */ 2048 mce_ignore_ce = 0; 2049 on_each_cpu(mce_enable_ce, (void *)1, 1); 2050 } 2051 } 2052 return size; 2053} 2054 2055static ssize_t set_cmci_disabled(struct device *s, 2056 struct device_attribute *attr, 2057 const char *buf, size_t size) 2058{ 2059 u64 new; 2060 2061 if (strict_strtoull(buf, 0, &new) < 0) 2062 return -EINVAL; 2063 2064 if (mce_cmci_disabled ^ !!new) { 2065 if (new) { 2066 /* disable cmci */ 2067 on_each_cpu(mce_disable_cmci, NULL, 1); 2068 mce_cmci_disabled = 1; 2069 } else { 2070 /* enable cmci */ 2071 mce_cmci_disabled = 0; 2072 on_each_cpu(mce_enable_ce, NULL, 1); 2073 } 2074 } 2075 return size; 2076} 2077 2078static ssize_t store_int_with_restart(struct device *s, 2079 struct device_attribute *attr, 2080 const char *buf, size_t size) 2081{ 2082 ssize_t ret = device_store_int(s, attr, buf, size); 2083 mce_restart(); 2084 return ret; 2085} 2086 2087static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); 2088static DEVICE_INT_ATTR(tolerant, 0644, tolerant); 2089static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 2090static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 2091 2092static struct dev_ext_attribute dev_attr_check_interval = { 2093 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), 2094 &check_interval 2095}; 2096 2097static struct dev_ext_attribute dev_attr_ignore_ce = { 2098 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), 2099 &mce_ignore_ce 2100}; 2101 2102static struct dev_ext_attribute dev_attr_cmci_disabled = { 2103 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), 2104 &mce_cmci_disabled 2105}; 2106 2107static struct device_attribute *mce_device_attrs[] = { 2108 &dev_attr_tolerant.attr, 2109 &dev_attr_check_interval.attr, 2110 &dev_attr_trigger, 2111 &dev_attr_monarch_timeout.attr, 2112 &dev_attr_dont_log_ce.attr, 2113 &dev_attr_ignore_ce.attr, 2114 &dev_attr_cmci_disabled.attr, 2115 NULL 2116}; 2117 2118static cpumask_var_t mce_device_initialized; 2119 2120static void mce_device_release(struct device *dev) 2121{ 2122 kfree(dev); 2123} 2124 2125/* Per cpu device init. 
All of the cpus still share the same ctrl bank: */ 2126static __cpuinit int mce_device_create(unsigned int cpu) 2127{ 2128 struct device *dev; 2129 int err; 2130 int i, j; 2131 2132 if (!mce_available(&boot_cpu_data)) 2133 return -EIO; 2134 2135 dev = kzalloc(sizeof *dev, GFP_KERNEL); 2136 if (!dev) 2137 return -ENOMEM; 2138 dev->id = cpu; 2139 dev->bus = &mce_subsys; 2140 dev->release = &mce_device_release; 2141 2142 err = device_register(dev); 2143 if (err) 2144 return err; 2145 2146 for (i = 0; mce_device_attrs[i]; i++) { 2147 err = device_create_file(dev, mce_device_attrs[i]); 2148 if (err) 2149 goto error; 2150 } 2151 for (j = 0; j < banks; j++) { 2152 err = device_create_file(dev, &mce_banks[j].attr); 2153 if (err) 2154 goto error2; 2155 } 2156 cpumask_set_cpu(cpu, mce_device_initialized); 2157 per_cpu(mce_device, cpu) = dev; 2158 2159 return 0; 2160error2: 2161 while (--j >= 0) 2162 device_remove_file(dev, &mce_banks[j].attr); 2163error: 2164 while (--i >= 0) 2165 device_remove_file(dev, mce_device_attrs[i]); 2166 2167 device_unregister(dev); 2168 2169 return err; 2170} 2171 2172static __cpuinit void mce_device_remove(unsigned int cpu) 2173{ 2174 struct device *dev = per_cpu(mce_device, cpu); 2175 int i; 2176 2177 if (!cpumask_test_cpu(cpu, mce_device_initialized)) 2178 return; 2179 2180 for (i = 0; mce_device_attrs[i]; i++) 2181 device_remove_file(dev, mce_device_attrs[i]); 2182 2183 for (i = 0; i < banks; i++) 2184 device_remove_file(dev, &mce_banks[i].attr); 2185 2186 device_unregister(dev); 2187 cpumask_clear_cpu(cpu, mce_device_initialized); 2188 per_cpu(mce_device, cpu) = NULL; 2189} 2190 2191/* Make sure there are no machine checks on offlined CPUs. */ 2192static void __cpuinit mce_disable_cpu(void *h) 2193{ 2194 unsigned long action = *(unsigned long *)h; 2195 int i; 2196 2197 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2198 return; 2199 2200 if (!(action & CPU_TASKS_FROZEN)) 2201 cmci_clear(); 2202 for (i = 0; i < banks; i++) { 2203 struct mce_bank *b = &mce_banks[i]; 2204 2205 if (b->init) 2206 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 2207 } 2208} 2209 2210static void __cpuinit mce_reenable_cpu(void *h) 2211{ 2212 unsigned long action = *(unsigned long *)h; 2213 int i; 2214 2215 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2216 return; 2217 2218 if (!(action & CPU_TASKS_FROZEN)) 2219 cmci_reenable(); 2220 for (i = 0; i < banks; i++) { 2221 struct mce_bank *b = &mce_banks[i]; 2222 2223 if (b->init) 2224 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 2225 } 2226} 2227 2228/* Get notified when a cpu comes on/off. Be hotplug friendly. 
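 * CPU_ONLINE creates the per-CPU sysfs device, CPU_DEAD removes it,
 * CPU_DOWN_PREPARE stops the poll timer and clears the banks via
 * mce_disable_cpu(), CPU_DOWN_FAILED undoes that, and CPU_POST_DEAD lets
 * the dead CPU's CMCI banks be rediscovered.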
*/ 2229static int __cpuinit 2230mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 2231{ 2232 unsigned int cpu = (unsigned long)hcpu; 2233 struct timer_list *t = &per_cpu(mce_timer, cpu); 2234 2235 switch (action) { 2236 case CPU_ONLINE: 2237 case CPU_ONLINE_FROZEN: 2238 mce_device_create(cpu); 2239 if (threshold_cpu_callback) 2240 threshold_cpu_callback(action, cpu); 2241 break; 2242 case CPU_DEAD: 2243 case CPU_DEAD_FROZEN: 2244 if (threshold_cpu_callback) 2245 threshold_cpu_callback(action, cpu); 2246 mce_device_remove(cpu); 2247 break; 2248 case CPU_DOWN_PREPARE: 2249 case CPU_DOWN_PREPARE_FROZEN: 2250 del_timer_sync(t); 2251 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2252 break; 2253 case CPU_DOWN_FAILED: 2254 case CPU_DOWN_FAILED_FROZEN: 2255 if (!mce_ignore_ce && check_interval) { 2256 t->expires = round_jiffies(jiffies + 2257 __get_cpu_var(mce_next_interval)); 2258 add_timer_on(t, cpu); 2259 } 2260 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2261 break; 2262 case CPU_POST_DEAD: 2263 /* intentionally ignoring frozen here */ 2264 cmci_rediscover(cpu); 2265 break; 2266 } 2267 return NOTIFY_OK; 2268} 2269 2270static struct notifier_block mce_cpu_notifier __cpuinitdata = { 2271 .notifier_call = mce_cpu_callback, 2272}; 2273 2274static __init void mce_init_banks(void) 2275{ 2276 int i; 2277 2278 for (i = 0; i < banks; i++) { 2279 struct mce_bank *b = &mce_banks[i]; 2280 struct device_attribute *a = &b->attr; 2281 2282 sysfs_attr_init(&a->attr); 2283 a->attr.name = b->attrname; 2284 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2285 2286 a->attr.mode = 0644; 2287 a->show = show_bank; 2288 a->store = set_bank; 2289 } 2290} 2291 2292static __init int mcheck_init_device(void) 2293{ 2294 int err; 2295 int i = 0; 2296 2297 if (!mce_available(&boot_cpu_data)) 2298 return -EIO; 2299 2300 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); 2301 2302 mce_init_banks(); 2303 2304 err = subsys_system_register(&mce_subsys, NULL); 2305 if (err) 2306 return err; 2307 2308 for_each_online_cpu(i) { 2309 err = mce_device_create(i); 2310 if (err) 2311 return err; 2312 } 2313 2314 register_syscore_ops(&mce_syscore_ops); 2315 register_hotcpu_notifier(&mce_cpu_notifier); 2316 2317 /* register character device /dev/mcelog */ 2318 misc_register(&mce_chrdev_device); 2319 2320 return err; 2321} 2322device_initcall(mcheck_init_device); 2323 2324/* 2325 * Old style boot options parsing. Only for compatibility. 
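 * "nomce" on the kernel command line is equivalent to "mce=off".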
2326 */ 2327static int __init mcheck_disable(char *str) 2328{ 2329 mce_disabled = 1; 2330 return 1; 2331} 2332__setup("nomce", mcheck_disable); 2333 2334#ifdef CONFIG_DEBUG_FS 2335struct dentry *mce_get_debugfs_dir(void) 2336{ 2337 static struct dentry *dmce; 2338 2339 if (!dmce) 2340 dmce = debugfs_create_dir("mce", NULL); 2341 2342 return dmce; 2343} 2344 2345static void mce_reset(void) 2346{ 2347 cpu_missing = 0; 2348 atomic_set(&mce_fake_paniced, 0); 2349 atomic_set(&mce_executing, 0); 2350 atomic_set(&mce_callin, 0); 2351 atomic_set(&global_nwo, 0); 2352} 2353 2354static int fake_panic_get(void *data, u64 *val) 2355{ 2356 *val = fake_panic; 2357 return 0; 2358} 2359 2360static int fake_panic_set(void *data, u64 val) 2361{ 2362 mce_reset(); 2363 fake_panic = val; 2364 return 0; 2365} 2366 2367DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2368 fake_panic_set, "%llu\n"); 2369 2370static int __init mcheck_debugfs_init(void) 2371{ 2372 struct dentry *dmce, *ffake_panic; 2373 2374 dmce = mce_get_debugfs_dir(); 2375 if (!dmce) 2376 return -ENOMEM; 2377 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, 2378 &fake_panic_fops); 2379 if (!ffake_panic) 2380 return -ENOMEM; 2381 2382 return 0; 2383} 2384late_initcall(mcheck_debugfs_init); 2385#endif 2386
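
/*
 * Runtime tuning sketch (illustrative, not part of the original file): the
 * per-CPU attributes registered above are typically exposed under
 * /sys/devices/system/machinecheck/machinecheck<N>/, e.g.:
 *
 *	echo 2   > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *	echo 300 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *	echo 1   > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 */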