mce.c revision 450cc201038f31bd496e1b3a44a49790b8827a06
1/* 2 * Machine check handler. 3 * 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 5 * Rest from unknown author(s). 6 * 2004 Andi Kleen. Rewrote most of it. 7 * Copyright 2008 Intel Corporation 8 * Author: Andi Kleen 9 */ 10 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 13#include <linux/thread_info.h> 14#include <linux/capability.h> 15#include <linux/miscdevice.h> 16#include <linux/ratelimit.h> 17#include <linux/kallsyms.h> 18#include <linux/rcupdate.h> 19#include <linux/kobject.h> 20#include <linux/uaccess.h> 21#include <linux/kdebug.h> 22#include <linux/kernel.h> 23#include <linux/percpu.h> 24#include <linux/string.h> 25#include <linux/device.h> 26#include <linux/syscore_ops.h> 27#include <linux/delay.h> 28#include <linux/ctype.h> 29#include <linux/sched.h> 30#include <linux/sysfs.h> 31#include <linux/types.h> 32#include <linux/slab.h> 33#include <linux/init.h> 34#include <linux/kmod.h> 35#include <linux/poll.h> 36#include <linux/nmi.h> 37#include <linux/cpu.h> 38#include <linux/smp.h> 39#include <linux/fs.h> 40#include <linux/mm.h> 41#include <linux/debugfs.h> 42#include <linux/irq_work.h> 43#include <linux/export.h> 44 45#include <asm/processor.h> 46#include <asm/mce.h> 47#include <asm/msr.h> 48 49#include "mce-internal.h" 50 51static DEFINE_MUTEX(mce_chrdev_read_mutex); 52 53#define rcu_dereference_check_mce(p) \ 54 rcu_dereference_index_check((p), \ 55 rcu_read_lock_sched_held() || \ 56 lockdep_is_held(&mce_chrdev_read_mutex)) 57 58#define CREATE_TRACE_POINTS 59#include <trace/events/mce.h> 60 61int mce_disabled __read_mostly; 62 63#define SPINUNIT 100 /* 100ns */ 64 65atomic_t mce_entry; 66 67DEFINE_PER_CPU(unsigned, mce_exception_count); 68 69/* 70 * Tolerant levels: 71 * 0: always panic on uncorrected errors, log corrected errors 72 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 73 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 74 * 3: never panic or SIGBUS, log all errors (for testing only) 75 */ 76static int tolerant __read_mostly = 1; 77static int banks __read_mostly; 78static int rip_msr __read_mostly; 79static int mce_bootlog __read_mostly = -1; 80static int monarch_timeout __read_mostly = -1; 81static int mce_panic_timeout __read_mostly; 82static int mce_dont_log_ce __read_mostly; 83int mce_cmci_disabled __read_mostly; 84int mce_ignore_ce __read_mostly; 85int mce_ser __read_mostly; 86int mce_bios_cmci_threshold __read_mostly; 87 88struct mce_bank *mce_banks __read_mostly; 89 90/* User mode helper program triggered by machine check event */ 91static unsigned long mce_need_notify; 92static char mce_helper[128]; 93static char *mce_helper_argv[2] = { mce_helper, NULL }; 94 95static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); 96 97static DEFINE_PER_CPU(struct mce, mces_seen); 98static int cpu_missing; 99 100/* MCA banks polled by the period polling timer for corrected events */ 101DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 102 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 103}; 104 105static DEFINE_PER_CPU(struct work_struct, mce_work); 106 107static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); 108 109/* 110 * CPU/chipset specific EDAC code can register a notifier call here to print 111 * MCE errors in a human-readable form. 
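 *
 * Illustrative sketch only (a hypothetical out-of-tree decoder, not part of
 * this file), registering through mce_register_decode_chain() defined below:
 *
 *	static int my_mce_decode(struct notifier_block *nb, unsigned long val,
 *				 void *data)
 *	{
 *		struct mce *m = data;
 *
 *		pr_info("bank %d status 0x%llx addr 0x%llx\n",
 *			m->bank, m->status, m->addr);
 *		return NOTIFY_STOP;
 *	}
 *
 *	static struct notifier_block my_mce_nb = {
 *		.notifier_call	= my_mce_decode,
 *	};
 *	...
 *	mce_register_decode_chain(&my_mce_nb);
 *
 * Returning NOTIFY_STOP claims the event, so mce_log() and print_mce() skip
 * the default mcelog handling; returning NOTIFY_OK leaves the record to the
 * normal /dev/mcelog path.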
112 */ 113ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 114 115/* Do initial initialization of a struct mce */ 116void mce_setup(struct mce *m) 117{ 118 memset(m, 0, sizeof(struct mce)); 119 m->cpu = m->extcpu = smp_processor_id(); 120 rdtscll(m->tsc); 121 /* We hope get_seconds stays lockless */ 122 m->time = get_seconds(); 123 m->cpuvendor = boot_cpu_data.x86_vendor; 124 m->cpuid = cpuid_eax(1); 125 m->socketid = cpu_data(m->extcpu).phys_proc_id; 126 m->apicid = cpu_data(m->extcpu).initial_apicid; 127 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 128} 129 130DEFINE_PER_CPU(struct mce, injectm); 131EXPORT_PER_CPU_SYMBOL_GPL(injectm); 132 133/* 134 * Lockless MCE logging infrastructure. 135 * This avoids deadlocks on printk locks without having to break locks. Also 136 * separate MCEs from kernel messages to avoid bogus bug reports. 137 */ 138 139static struct mce_log mcelog = { 140 .signature = MCE_LOG_SIGNATURE, 141 .len = MCE_LOG_LEN, 142 .recordlen = sizeof(struct mce), 143}; 144 145void mce_log(struct mce *mce) 146{ 147 unsigned next, entry; 148 int ret = 0; 149 150 /* Emit the trace record: */ 151 trace_mce_record(mce); 152 153 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); 154 if (ret == NOTIFY_STOP) 155 return; 156 157 mce->finished = 0; 158 wmb(); 159 for (;;) { 160 entry = rcu_dereference_check_mce(mcelog.next); 161 for (;;) { 162 163 /* 164 * When the buffer fills up discard new entries. 165 * Assume that the earlier errors are the more 166 * interesting ones: 167 */ 168 if (entry >= MCE_LOG_LEN) { 169 set_bit(MCE_OVERFLOW, 170 (unsigned long *)&mcelog.flags); 171 return; 172 } 173 /* Old left over entry. Skip: */ 174 if (mcelog.entry[entry].finished) { 175 entry++; 176 continue; 177 } 178 break; 179 } 180 smp_rmb(); 181 next = entry + 1; 182 if (cmpxchg(&mcelog.next, entry, next) == entry) 183 break; 184 } 185 memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); 186 wmb(); 187 mcelog.entry[entry].finished = 1; 188 wmb(); 189 190 mce->finished = 1; 191 set_bit(0, &mce_need_notify); 192} 193 194static void drain_mcelog_buffer(void) 195{ 196 unsigned int next, i, prev = 0; 197 198 next = ACCESS_ONCE(mcelog.next); 199 200 do { 201 struct mce *m; 202 203 /* drain what was logged during boot */ 204 for (i = prev; i < next; i++) { 205 unsigned long start = jiffies; 206 unsigned retries = 1; 207 208 m = &mcelog.entry[i]; 209 210 while (!m->finished) { 211 if (time_after_eq(jiffies, start + 2*retries)) 212 retries++; 213 214 cpu_relax(); 215 216 if (!m->finished && retries >= 4) { 217 pr_err("skipping error being logged currently!\n"); 218 break; 219 } 220 } 221 smp_rmb(); 222 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 223 } 224 225 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); 226 prev = next; 227 next = cmpxchg(&mcelog.next, prev, 0); 228 } while (next != prev); 229} 230 231 232void mce_register_decode_chain(struct notifier_block *nb) 233{ 234 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); 235 drain_mcelog_buffer(); 236} 237EXPORT_SYMBOL_GPL(mce_register_decode_chain); 238 239void mce_unregister_decode_chain(struct notifier_block *nb) 240{ 241 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); 242} 243EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); 244 245static void print_mce(struct mce *m) 246{ 247 int ret = 0; 248 249 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 250 m->extcpu, m->mcgstatus, m->bank, m->status); 251 252 if (m->ip) { 253 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", 254 
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 255 m->cs, m->ip); 256 257 if (m->cs == __KERNEL_CS) 258 print_symbol("{%s}", m->ip); 259 pr_cont("\n"); 260 } 261 262 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 263 if (m->addr) 264 pr_cont("ADDR %llx ", m->addr); 265 if (m->misc) 266 pr_cont("MISC %llx ", m->misc); 267 268 pr_cont("\n"); 269 /* 270 * Note this output is parsed by external tools and old fields 271 * should not be changed. 272 */ 273 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", 274 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 275 cpu_data(m->extcpu).microcode); 276 277 /* 278 * Print out human-readable details about the MCE error, 279 * (if the CPU has an implementation for that) 280 */ 281 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 282 if (ret == NOTIFY_STOP) 283 return; 284 285 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 286} 287 288#define PANIC_TIMEOUT 5 /* 5 seconds */ 289 290static atomic_t mce_paniced; 291 292static int fake_panic; 293static atomic_t mce_fake_paniced; 294 295/* Panic in progress. Enable interrupts and wait for final IPI */ 296static void wait_for_panic(void) 297{ 298 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 299 300 preempt_disable(); 301 local_irq_enable(); 302 while (timeout-- > 0) 303 udelay(1); 304 if (panic_timeout == 0) 305 panic_timeout = mce_panic_timeout; 306 panic("Panicing machine check CPU died"); 307} 308 309static void mce_panic(char *msg, struct mce *final, char *exp) 310{ 311 int i, apei_err = 0; 312 313 if (!fake_panic) { 314 /* 315 * Make sure only one CPU runs in machine check panic 316 */ 317 if (atomic_inc_return(&mce_paniced) > 1) 318 wait_for_panic(); 319 barrier(); 320 321 bust_spinlocks(1); 322 console_verbose(); 323 } else { 324 /* Don't log too much for fake panic */ 325 if (atomic_inc_return(&mce_fake_paniced) > 1) 326 return; 327 } 328 /* First print corrected ones that are still unlogged */ 329 for (i = 0; i < MCE_LOG_LEN; i++) { 330 struct mce *m = &mcelog.entry[i]; 331 if (!(m->status & MCI_STATUS_VAL)) 332 continue; 333 if (!(m->status & MCI_STATUS_UC)) { 334 print_mce(m); 335 if (!apei_err) 336 apei_err = apei_write_mce(m); 337 } 338 } 339 /* Now print uncorrected but with the final one last */ 340 for (i = 0; i < MCE_LOG_LEN; i++) { 341 struct mce *m = &mcelog.entry[i]; 342 if (!(m->status & MCI_STATUS_VAL)) 343 continue; 344 if (!(m->status & MCI_STATUS_UC)) 345 continue; 346 if (!final || memcmp(m, final, sizeof(struct mce))) { 347 print_mce(m); 348 if (!apei_err) 349 apei_err = apei_write_mce(m); 350 } 351 } 352 if (final) { 353 print_mce(final); 354 if (!apei_err) 355 apei_err = apei_write_mce(final); 356 } 357 if (cpu_missing) 358 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 359 if (exp) 360 pr_emerg(HW_ERR "Machine check: %s\n", exp); 361 if (!fake_panic) { 362 if (panic_timeout == 0) 363 panic_timeout = mce_panic_timeout; 364 panic(msg); 365 } else 366 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 367} 368 369/* Support code for software error injection */ 370 371static int msr_to_offset(u32 msr) 372{ 373 unsigned bank = __this_cpu_read(injectm.bank); 374 375 if (msr == rip_msr) 376 return offsetof(struct mce, ip); 377 if (msr == MSR_IA32_MCx_STATUS(bank)) 378 return offsetof(struct mce, status); 379 if (msr == MSR_IA32_MCx_ADDR(bank)) 380 return offsetof(struct mce, addr); 381 if (msr == MSR_IA32_MCx_MISC(bank)) 382 return offsetof(struct mce, misc); 383 if (msr == MSR_IA32_MCG_STATUS) 384 
return offsetof(struct mce, mcgstatus); 385 return -1; 386} 387 388/* MSR access wrappers used for error injection */ 389static u64 mce_rdmsrl(u32 msr) 390{ 391 u64 v; 392 393 if (__this_cpu_read(injectm.finished)) { 394 int offset = msr_to_offset(msr); 395 396 if (offset < 0) 397 return 0; 398 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 399 } 400 401 if (rdmsrl_safe(msr, &v)) { 402 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 403 /* 404 * Return zero in case the access faulted. This should 405 * not happen normally but can happen if the CPU does 406 * something weird, or if the code is buggy. 407 */ 408 v = 0; 409 } 410 411 return v; 412} 413 414static void mce_wrmsrl(u32 msr, u64 v) 415{ 416 if (__this_cpu_read(injectm.finished)) { 417 int offset = msr_to_offset(msr); 418 419 if (offset >= 0) 420 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 421 return; 422 } 423 wrmsrl(msr, v); 424} 425 426/* 427 * Collect all global (w.r.t. this processor) status about this machine 428 * check into our "mce" struct so that we can use it later to assess 429 * the severity of the problem as we read per-bank specific details. 430 */ 431static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) 432{ 433 mce_setup(m); 434 435 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 436 if (regs) { 437 /* 438 * Get the address of the instruction at the time of 439 * the machine check error. 440 */ 441 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 442 m->ip = regs->ip; 443 m->cs = regs->cs; 444 445 /* 446 * When in VM86 mode make the cs look like ring 3 447 * always. This is a lie, but it's better than passing 448 * the additional vm86 bit around everywhere. 449 */ 450 if (v8086_mode(regs)) 451 m->cs |= 3; 452 } 453 /* Use accurate RIP reporting if available. */ 454 if (rip_msr) 455 m->ip = mce_rdmsrl(rip_msr); 456 } 457} 458 459/* 460 * Simple lockless ring to communicate PFNs from the exception handler with the 461 * process context work function. This is vastly simplified because there's 462 * only a single reader and a single writer. 
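 *
 * The ring intentionally keeps one slot unused: it is treated as full when
 * advancing 'end' would make it equal to 'start' (see mce_ring_add() below),
 * so an empty ring (start == end) can be distinguished from a full one
 * without a separate element count.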
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
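/*
 * Usage sketch (hypothetical caller, not part of this file): poll a subset
 * of banks from regular interrupt context. The mce_banks_t bitmap selects
 * which banks are read and cleared:
 *
 *	mce_banks_t only_bank0;
 *
 *	bitmap_zero(only_bank0, MAX_NR_BANKS);
 *	set_bit(0, only_bank0);
 *	machine_check_poll(MCP_TIMESTAMP, &only_bank0);
 *
 * In this file the periodic timer (mce_timer_fn, using mce_poll_banks) and
 * the boot-time log drain in __mcheck_cpu_init_generic (using all banks)
 * call it the same way.
 */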
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	int i, ret = 0;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL) {
			__set_bit(i, validp);
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
		}
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			ret = 1;
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
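/*
 * Overview of the rendezvous implemented by mce_start()/mce_end() below
 * (a descriptive summary of the existing code, not new behaviour):
 *
 *	order = mce_start(&no_way_out)	- bumps mce_callin, waits until all
 *					  online CPUs have shown up; the CPU
 *					  with order == 1 becomes the Monarch
 *	scan own banks			- serialized via mce_executing so a
 *					  shared bank is seen by only one CPU
 *	mce_end(order)			- the Monarch waits for everyone,
 *					  runs mce_reign() to grade all the
 *					  errors, then resets the global state
 *
 * global_nwo accumulates the per-CPU no_way_out votes so that every CPU
 * learns whether any of them hit an unrecoverable error.
 */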
/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save the faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process().
 */
#define MCE_INFO_MAX 16

struct mce_info {
	atomic_t inuse;
	struct task_struct *t;
	__u64 paddr;
	int restartable;
} mce_info[MCE_INFO_MAX];

static void mce_save_info(__u64 addr, int c)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			mi->restartable = c;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}
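/*
 * How the mce_info slots above are used for action required (AR) recovery;
 * this is a summary of the existing do_machine_check()/mce_notify_process()
 * code, not new behaviour:
 *
 *	#MC handler:		mce_save_info(m.addr, RIPV set?);
 *				set_thread_flag(TIF_MCE_NOTIFY);
 *	return to userspace:	mce_notify_process() -> mce_find_info();
 *				memory_failure(paddr >> PAGE_SHIFT, MCE_VECTOR,
 *					MF_ACTION_REQUIRED [| MF_MUST_KILL]);
 *				mce_clear_info();
 */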
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	this_cpu_inc(mce_exception_count);

	if (!banks)
		goto out;

	mce_gather_info(&m, regs);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);

	barrier();

	/*
	 * When there is no restart IP we might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check was meant for the corrected-error
		 * handler, don't touch it, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
1120 * RED-PEN don't ignore overflow for tolerant == 0 1121 */ 1122 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1123 mce_ring_add(m.addr >> PAGE_SHIFT); 1124 1125 mce_log(&m); 1126 1127 if (severity > worst) { 1128 *final = m; 1129 worst = severity; 1130 } 1131 } 1132 1133 /* mce_clear_state will clear *final, save locally for use later */ 1134 m = *final; 1135 1136 if (!no_way_out) 1137 mce_clear_state(toclear); 1138 1139 /* 1140 * Do most of the synchronization with other CPUs. 1141 * When there's any problem use only local no_way_out state. 1142 */ 1143 if (mce_end(order) < 0) 1144 no_way_out = worst >= MCE_PANIC_SEVERITY; 1145 1146 /* 1147 * At insane "tolerant" levels we take no action. Otherwise 1148 * we only die if we have no other choice. For less serious 1149 * issues we try to recover, or limit damage to the current 1150 * process. 1151 */ 1152 if (tolerant < 3) { 1153 if (no_way_out) 1154 mce_panic("Fatal machine check on current CPU", &m, msg); 1155 if (worst == MCE_AR_SEVERITY) { 1156 /* schedule action before return to userland */ 1157 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); 1158 set_thread_flag(TIF_MCE_NOTIFY); 1159 } else if (kill_it) { 1160 force_sig(SIGBUS, current); 1161 } 1162 } 1163 1164 if (worst > 0) 1165 mce_report_event(regs); 1166 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1167out: 1168 atomic_dec(&mce_entry); 1169 sync_core(); 1170} 1171EXPORT_SYMBOL_GPL(do_machine_check); 1172 1173#ifndef CONFIG_MEMORY_FAILURE 1174int memory_failure(unsigned long pfn, int vector, int flags) 1175{ 1176 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1177 BUG_ON(flags & MF_ACTION_REQUIRED); 1178 pr_err("Uncorrected memory error in page 0x%lx ignored\n" 1179 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", 1180 pfn); 1181 1182 return 0; 1183} 1184#endif 1185 1186/* 1187 * Called in process context that interrupted by MCE and marked with 1188 * TIF_MCE_NOTIFY, just before returning to erroneous userland. 1189 * This code is allowed to sleep. 1190 * Attempt possible recovery such as calling the high level VM handler to 1191 * process any corrupted pages, and kill/signal current process if required. 1192 * Action required errors are handled here. 1193 */ 1194void mce_notify_process(void) 1195{ 1196 unsigned long pfn; 1197 struct mce_info *mi = mce_find_info(); 1198 int flags = MF_ACTION_REQUIRED; 1199 1200 if (!mi) 1201 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1202 pfn = mi->paddr >> PAGE_SHIFT; 1203 1204 clear_thread_flag(TIF_MCE_NOTIFY); 1205 1206 pr_err("Uncorrected hardware memory error in user-access at %llx", 1207 mi->paddr); 1208 /* 1209 * We must call memory_failure() here even if the current process is 1210 * doomed. We still need to mark the page as poisoned and alert any 1211 * other users of the page. 1212 */ 1213 if (!mi->restartable) 1214 flags |= MF_MUST_KILL; 1215 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { 1216 pr_err("Memory error not recovered"); 1217 force_sig(SIGBUS, current); 1218 } 1219 mce_clear_info(mi); 1220} 1221 1222/* 1223 * Action optional processing happens here (picking up 1224 * from the list of faulting pages that do_machine_check() 1225 * placed into the "ring"). 
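 *
 * Unlike the action required path handled in mce_notify_process() above,
 * these pages go to memory_failure() with flags == 0 (no MF_ACTION_REQUIRED),
 * because the error was only signalled as action optional and no task is
 * known to have consumed the poisoned data yet.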
1226 */ 1227static void mce_process_work(struct work_struct *dummy) 1228{ 1229 unsigned long pfn; 1230 1231 while (mce_ring_get(&pfn)) 1232 memory_failure(pfn, MCE_VECTOR, 0); 1233} 1234 1235#ifdef CONFIG_X86_MCE_INTEL 1236/*** 1237 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1238 * @cpu: The CPU on which the event occurred. 1239 * @status: Event status information 1240 * 1241 * This function should be called by the thermal interrupt after the 1242 * event has been processed and the decision was made to log the event 1243 * further. 1244 * 1245 * The status parameter will be saved to the 'status' field of 'struct mce' 1246 * and historically has been the register value of the 1247 * MSR_IA32_THERMAL_STATUS (Intel) msr. 1248 */ 1249void mce_log_therm_throt_event(__u64 status) 1250{ 1251 struct mce m; 1252 1253 mce_setup(&m); 1254 m.bank = MCE_THERMAL_BANK; 1255 m.status = status; 1256 mce_log(&m); 1257} 1258#endif /* CONFIG_X86_MCE_INTEL */ 1259 1260/* 1261 * Periodic polling timer for "silent" machine check errors. If the 1262 * poller finds an MCE, poll 2x faster. When the poller finds no more 1263 * errors, poll 2x slower (up to check_interval seconds). 1264 */ 1265static unsigned long check_interval = 5 * 60; /* 5 minutes */ 1266 1267static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1268static DEFINE_PER_CPU(struct timer_list, mce_timer); 1269 1270static unsigned long mce_adjust_timer_default(unsigned long interval) 1271{ 1272 return interval; 1273} 1274 1275static unsigned long (*mce_adjust_timer)(unsigned long interval) = 1276 mce_adjust_timer_default; 1277 1278static void mce_timer_fn(unsigned long data) 1279{ 1280 struct timer_list *t = &__get_cpu_var(mce_timer); 1281 unsigned long iv; 1282 1283 WARN_ON(smp_processor_id() != data); 1284 1285 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1286 machine_check_poll(MCP_TIMESTAMP, 1287 &__get_cpu_var(mce_poll_banks)); 1288 mce_intel_cmci_poll(); 1289 } 1290 1291 /* 1292 * Alert userspace if needed. If we logged an MCE, reduce the 1293 * polling interval, otherwise increase the polling interval. 1294 */ 1295 iv = __this_cpu_read(mce_next_interval); 1296 if (mce_notify_irq()) { 1297 iv = max(iv / 2, (unsigned long) HZ/100); 1298 } else { 1299 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1300 iv = mce_adjust_timer(iv); 1301 } 1302 __this_cpu_write(mce_next_interval, iv); 1303 /* Might have become 0 after CMCI storm subsided */ 1304 if (iv) { 1305 t->expires = jiffies + iv; 1306 add_timer_on(t, smp_processor_id()); 1307 } 1308} 1309 1310/* 1311 * Ensure that the timer is firing in @interval from now. 
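 *
 * If the timer is already pending it is only ever moved earlier, never
 * pushed out; otherwise it is armed @interval from now. mce_next_interval
 * is lowered as well so the adaptive logic in mce_timer_fn() does not
 * immediately stretch the period back out.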
1312 */ 1313void mce_timer_kick(unsigned long interval) 1314{ 1315 struct timer_list *t = &__get_cpu_var(mce_timer); 1316 unsigned long when = jiffies + interval; 1317 unsigned long iv = __this_cpu_read(mce_next_interval); 1318 1319 if (timer_pending(t)) { 1320 if (time_before(when, t->expires)) 1321 mod_timer_pinned(t, when); 1322 } else { 1323 t->expires = round_jiffies(when); 1324 add_timer_on(t, smp_processor_id()); 1325 } 1326 if (interval < iv) 1327 __this_cpu_write(mce_next_interval, interval); 1328} 1329 1330/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1331static void mce_timer_delete_all(void) 1332{ 1333 int cpu; 1334 1335 for_each_online_cpu(cpu) 1336 del_timer_sync(&per_cpu(mce_timer, cpu)); 1337} 1338 1339static void mce_do_trigger(struct work_struct *work) 1340{ 1341 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1342} 1343 1344static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1345 1346/* 1347 * Notify the user(s) about new machine check events. 1348 * Can be called from interrupt context, but not from machine check/NMI 1349 * context. 1350 */ 1351int mce_notify_irq(void) 1352{ 1353 /* Not more than two messages every minute */ 1354 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1355 1356 if (test_and_clear_bit(0, &mce_need_notify)) { 1357 /* wake processes polling /dev/mcelog */ 1358 wake_up_interruptible(&mce_chrdev_wait); 1359 1360 /* 1361 * There is no risk of missing notifications because 1362 * work_pending is always cleared before the function is 1363 * executed. 1364 */ 1365 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1366 schedule_work(&mce_trigger_work); 1367 1368 if (__ratelimit(&ratelimit)) 1369 pr_info(HW_ERR "Machine check events logged\n"); 1370 1371 return 1; 1372 } 1373 return 0; 1374} 1375EXPORT_SYMBOL_GPL(mce_notify_irq); 1376 1377static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1378{ 1379 int i; 1380 1381 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1382 if (!mce_banks) 1383 return -ENOMEM; 1384 for (i = 0; i < banks; i++) { 1385 struct mce_bank *b = &mce_banks[i]; 1386 1387 b->ctl = -1ULL; 1388 b->init = 1; 1389 } 1390 return 0; 1391} 1392 1393/* 1394 * Initialize Machine Checks for a CPU. 1395 */ 1396static int __cpuinit __mcheck_cpu_cap_init(void) 1397{ 1398 unsigned b; 1399 u64 cap; 1400 1401 rdmsrl(MSR_IA32_MCG_CAP, cap); 1402 1403 b = cap & MCG_BANKCNT_MASK; 1404 if (!banks) 1405 pr_info("CPU supports %d MCE banks\n", b); 1406 1407 if (b > MAX_NR_BANKS) { 1408 pr_warn("Using only %u machine check banks out of %u\n", 1409 MAX_NR_BANKS, b); 1410 b = MAX_NR_BANKS; 1411 } 1412 1413 /* Don't support asymmetric configurations today */ 1414 WARN_ON(banks != 0 && b != banks); 1415 banks = b; 1416 if (!mce_banks) { 1417 int err = __mcheck_cpu_mce_banks_init(); 1418 1419 if (err) 1420 return err; 1421 } 1422 1423 /* Use accurate RIP reporting if available. */ 1424 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1425 rip_msr = MSR_IA32_MCG_EIP; 1426 1427 if (cap & MCG_SER_P) 1428 mce_ser = 1; 1429 1430 return 0; 1431} 1432 1433static void __mcheck_cpu_init_generic(void) 1434{ 1435 mce_banks_t all_banks; 1436 u64 cap; 1437 int i; 1438 1439 /* 1440 * Log the machine checks left over from the previous reset. 1441 */ 1442 bitmap_fill(all_banks, MAX_NR_BANKS); 1443 machine_check_poll(MCP_UC|(!mce_bootlog ? 
MCP_DONTLOG : 0), &all_banks); 1444 1445 set_in_cr4(X86_CR4_MCE); 1446 1447 rdmsrl(MSR_IA32_MCG_CAP, cap); 1448 if (cap & MCG_CTL_P) 1449 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1450 1451 for (i = 0; i < banks; i++) { 1452 struct mce_bank *b = &mce_banks[i]; 1453 1454 if (!b->init) 1455 continue; 1456 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1457 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1458 } 1459} 1460 1461/* 1462 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and 1463 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM 1464 * Vol 3B Table 15-20). But this confuses both the code that determines 1465 * whether the machine check occurred in kernel or user mode, and also 1466 * the severity assessment code. Pretend that EIPV was set, and take the 1467 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. 1468 */ 1469static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) 1470{ 1471 if (bank != 0) 1472 return; 1473 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) 1474 return; 1475 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| 1476 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| 1477 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| 1478 MCACOD)) != 1479 (MCI_STATUS_UC|MCI_STATUS_EN| 1480 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| 1481 MCI_STATUS_AR|MCACOD_INSTR)) 1482 return; 1483 1484 m->mcgstatus |= MCG_STATUS_EIPV; 1485 m->ip = regs->ip; 1486 m->cs = regs->cs; 1487} 1488 1489/* Add per CPU specific workarounds here */ 1490static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1491{ 1492 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1493 pr_info("unknown CPU type - not enabling MCE support\n"); 1494 return -EOPNOTSUPP; 1495 } 1496 1497 /* This should be disabled by the BIOS, but isn't always */ 1498 if (c->x86_vendor == X86_VENDOR_AMD) { 1499 if (c->x86 == 15 && banks > 4) { 1500 /* 1501 * disable GART TBL walk error reporting, which 1502 * trips off incorrectly with the IOMMU & 3ware 1503 * & Cerberus: 1504 */ 1505 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1506 } 1507 if (c->x86 <= 17 && mce_bootlog < 0) { 1508 /* 1509 * Lots of broken BIOS around that don't clear them 1510 * by default and leave crap in there. Don't log: 1511 */ 1512 mce_bootlog = 0; 1513 } 1514 /* 1515 * Various K7s with broken bank 0 around. Always disable 1516 * by default. 1517 */ 1518 if (c->x86 == 6 && banks > 0) 1519 mce_banks[0].ctl = 0; 1520 1521 /* 1522 * Turn off MC4_MISC thresholding banks on those models since 1523 * they're not supported there. 1524 */ 1525 if (c->x86 == 0x15 && 1526 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { 1527 int i; 1528 u64 val, hwcr; 1529 bool need_toggle; 1530 u32 msrs[] = { 1531 0x00000413, /* MC4_MISC0 */ 1532 0xc0000408, /* MC4_MISC1 */ 1533 }; 1534 1535 rdmsrl(MSR_K7_HWCR, hwcr); 1536 1537 /* McStatusWrEn has to be set */ 1538 need_toggle = !(hwcr & BIT(18)); 1539 1540 if (need_toggle) 1541 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); 1542 1543 for (i = 0; i < ARRAY_SIZE(msrs); i++) { 1544 rdmsrl(msrs[i], val); 1545 1546 /* CntP bit set? */ 1547 if (val & BIT_64(62)) { 1548 val &= ~BIT_64(62); 1549 wrmsrl(msrs[i], val); 1550 } 1551 } 1552 1553 /* restore old settings */ 1554 if (need_toggle) 1555 wrmsrl(MSR_K7_HWCR, hwcr); 1556 } 1557 } 1558 1559 if (c->x86_vendor == X86_VENDOR_INTEL) { 1560 /* 1561 * SDM documents that on family 6 bank 0 should not be written 1562 * because it aliases to another special BIOS controlled 1563 * register. 
1564 * But it's not aliased anymore on model 0x1a+ 1565 * Don't ignore bank 0 completely because there could be a 1566 * valid event later, merely don't write CTL0. 1567 */ 1568 1569 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1570 mce_banks[0].init = 0; 1571 1572 /* 1573 * All newer Intel systems support MCE broadcasting. Enable 1574 * synchronization with a one second timeout. 1575 */ 1576 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1577 monarch_timeout < 0) 1578 monarch_timeout = USEC_PER_SEC; 1579 1580 /* 1581 * There are also broken BIOSes on some Pentium M and 1582 * earlier systems: 1583 */ 1584 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1585 mce_bootlog = 0; 1586 1587 if (c->x86 == 6 && c->x86_model == 45) 1588 quirk_no_way_out = quirk_sandybridge_ifu; 1589 } 1590 if (monarch_timeout < 0) 1591 monarch_timeout = 0; 1592 if (mce_bootlog != 0) 1593 mce_panic_timeout = 30; 1594 1595 return 0; 1596} 1597 1598static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1599{ 1600 if (c->x86 != 5) 1601 return 0; 1602 1603 switch (c->x86_vendor) { 1604 case X86_VENDOR_INTEL: 1605 intel_p5_mcheck_init(c); 1606 return 1; 1607 break; 1608 case X86_VENDOR_CENTAUR: 1609 winchip_mcheck_init(c); 1610 return 1; 1611 break; 1612 } 1613 1614 return 0; 1615} 1616 1617static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1618{ 1619 switch (c->x86_vendor) { 1620 case X86_VENDOR_INTEL: 1621 mce_intel_feature_init(c); 1622 mce_adjust_timer = mce_intel_adjust_timer; 1623 break; 1624 case X86_VENDOR_AMD: 1625 mce_amd_feature_init(c); 1626 break; 1627 default: 1628 break; 1629 } 1630} 1631 1632static void mce_start_timer(unsigned int cpu, struct timer_list *t) 1633{ 1634 unsigned long iv = mce_adjust_timer(check_interval * HZ); 1635 1636 __this_cpu_write(mce_next_interval, iv); 1637 1638 if (mce_ignore_ce || !iv) 1639 return; 1640 1641 t->expires = round_jiffies(jiffies + iv); 1642 add_timer_on(t, smp_processor_id()); 1643} 1644 1645static void __mcheck_cpu_init_timer(void) 1646{ 1647 struct timer_list *t = &__get_cpu_var(mce_timer); 1648 unsigned int cpu = smp_processor_id(); 1649 1650 setup_timer(t, mce_timer_fn, cpu); 1651 mce_start_timer(cpu, t); 1652} 1653 1654/* Handle unconfigured int18 (should never happen) */ 1655static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1656{ 1657 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", 1658 smp_processor_id()); 1659} 1660 1661/* Call the installed machine check handler for this CPU setup. */ 1662void (*machine_check_vector)(struct pt_regs *, long error_code) = 1663 unexpected_machine_check; 1664 1665/* 1666 * Called for each booted CPU to set up machine checks. 1667 * Must be called with preempt off: 1668 */ 1669void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1670{ 1671 if (mce_disabled) 1672 return; 1673 1674 if (__mcheck_cpu_ancient_init(c)) 1675 return; 1676 1677 if (!mce_available(c)) 1678 return; 1679 1680 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1681 mce_disabled = 1; 1682 return; 1683 } 1684 1685 machine_check_vector = do_machine_check; 1686 1687 __mcheck_cpu_init_generic(); 1688 __mcheck_cpu_init_vendor(c); 1689 __mcheck_cpu_init_timer(); 1690 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1691 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1692} 1693 1694/* 1695 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. 
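 *
 * Userspace sketch (a hypothetical reader, not part of this file). The read
 * side only accepts reads of the full log, so a consumer such as the mcelog
 * daemon does roughly:
 *
 *	struct mce buf[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf));
 *
 * n comes back as a multiple of sizeof(struct mce) and the returned records
 * are cleared from the kernel buffer; the MCE_GET_RECORD_LEN and
 * MCE_GET_LOG_LEN ioctls report the record and buffer sizes.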
1696 */ 1697 1698static DEFINE_SPINLOCK(mce_chrdev_state_lock); 1699static int mce_chrdev_open_count; /* #times opened */ 1700static int mce_chrdev_open_exclu; /* already open exclusive? */ 1701 1702static int mce_chrdev_open(struct inode *inode, struct file *file) 1703{ 1704 spin_lock(&mce_chrdev_state_lock); 1705 1706 if (mce_chrdev_open_exclu || 1707 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { 1708 spin_unlock(&mce_chrdev_state_lock); 1709 1710 return -EBUSY; 1711 } 1712 1713 if (file->f_flags & O_EXCL) 1714 mce_chrdev_open_exclu = 1; 1715 mce_chrdev_open_count++; 1716 1717 spin_unlock(&mce_chrdev_state_lock); 1718 1719 return nonseekable_open(inode, file); 1720} 1721 1722static int mce_chrdev_release(struct inode *inode, struct file *file) 1723{ 1724 spin_lock(&mce_chrdev_state_lock); 1725 1726 mce_chrdev_open_count--; 1727 mce_chrdev_open_exclu = 0; 1728 1729 spin_unlock(&mce_chrdev_state_lock); 1730 1731 return 0; 1732} 1733 1734static void collect_tscs(void *data) 1735{ 1736 unsigned long *cpu_tsc = (unsigned long *)data; 1737 1738 rdtscll(cpu_tsc[smp_processor_id()]); 1739} 1740 1741static int mce_apei_read_done; 1742 1743/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ 1744static int __mce_read_apei(char __user **ubuf, size_t usize) 1745{ 1746 int rc; 1747 u64 record_id; 1748 struct mce m; 1749 1750 if (usize < sizeof(struct mce)) 1751 return -EINVAL; 1752 1753 rc = apei_read_mce(&m, &record_id); 1754 /* Error or no more MCE record */ 1755 if (rc <= 0) { 1756 mce_apei_read_done = 1; 1757 /* 1758 * When ERST is disabled, mce_chrdev_read() should return 1759 * "no record" instead of "no device." 1760 */ 1761 if (rc == -ENODEV) 1762 return 0; 1763 return rc; 1764 } 1765 rc = -EFAULT; 1766 if (copy_to_user(*ubuf, &m, sizeof(struct mce))) 1767 return rc; 1768 /* 1769 * In fact, we should have cleared the record after that has 1770 * been flushed to the disk or sent to network in 1771 * /sbin/mcelog, but we have no interface to support that now, 1772 * so just clear it to avoid duplication. 
1773 */ 1774 rc = apei_clear_mce(record_id); 1775 if (rc) { 1776 mce_apei_read_done = 1; 1777 return rc; 1778 } 1779 *ubuf += sizeof(struct mce); 1780 1781 return 0; 1782} 1783 1784static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, 1785 size_t usize, loff_t *off) 1786{ 1787 char __user *buf = ubuf; 1788 unsigned long *cpu_tsc; 1789 unsigned prev, next; 1790 int i, err; 1791 1792 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1793 if (!cpu_tsc) 1794 return -ENOMEM; 1795 1796 mutex_lock(&mce_chrdev_read_mutex); 1797 1798 if (!mce_apei_read_done) { 1799 err = __mce_read_apei(&buf, usize); 1800 if (err || buf != ubuf) 1801 goto out; 1802 } 1803 1804 next = rcu_dereference_check_mce(mcelog.next); 1805 1806 /* Only supports full reads right now */ 1807 err = -EINVAL; 1808 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) 1809 goto out; 1810 1811 err = 0; 1812 prev = 0; 1813 do { 1814 for (i = prev; i < next; i++) { 1815 unsigned long start = jiffies; 1816 struct mce *m = &mcelog.entry[i]; 1817 1818 while (!m->finished) { 1819 if (time_after_eq(jiffies, start + 2)) { 1820 memset(m, 0, sizeof(*m)); 1821 goto timeout; 1822 } 1823 cpu_relax(); 1824 } 1825 smp_rmb(); 1826 err |= copy_to_user(buf, m, sizeof(*m)); 1827 buf += sizeof(*m); 1828timeout: 1829 ; 1830 } 1831 1832 memset(mcelog.entry + prev, 0, 1833 (next - prev) * sizeof(struct mce)); 1834 prev = next; 1835 next = cmpxchg(&mcelog.next, prev, 0); 1836 } while (next != prev); 1837 1838 synchronize_sched(); 1839 1840 /* 1841 * Collect entries that were still getting written before the 1842 * synchronize. 1843 */ 1844 on_each_cpu(collect_tscs, cpu_tsc, 1); 1845 1846 for (i = next; i < MCE_LOG_LEN; i++) { 1847 struct mce *m = &mcelog.entry[i]; 1848 1849 if (m->finished && m->tsc < cpu_tsc[m->cpu]) { 1850 err |= copy_to_user(buf, m, sizeof(*m)); 1851 smp_rmb(); 1852 buf += sizeof(*m); 1853 memset(m, 0, sizeof(*m)); 1854 } 1855 } 1856 1857 if (err) 1858 err = -EFAULT; 1859 1860out: 1861 mutex_unlock(&mce_chrdev_read_mutex); 1862 kfree(cpu_tsc); 1863 1864 return err ? 
err : buf - ubuf; 1865} 1866 1867static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) 1868{ 1869 poll_wait(file, &mce_chrdev_wait, wait); 1870 if (rcu_access_index(mcelog.next)) 1871 return POLLIN | POLLRDNORM; 1872 if (!mce_apei_read_done && apei_check_mce()) 1873 return POLLIN | POLLRDNORM; 1874 return 0; 1875} 1876 1877static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, 1878 unsigned long arg) 1879{ 1880 int __user *p = (int __user *)arg; 1881 1882 if (!capable(CAP_SYS_ADMIN)) 1883 return -EPERM; 1884 1885 switch (cmd) { 1886 case MCE_GET_RECORD_LEN: 1887 return put_user(sizeof(struct mce), p); 1888 case MCE_GET_LOG_LEN: 1889 return put_user(MCE_LOG_LEN, p); 1890 case MCE_GETCLEAR_FLAGS: { 1891 unsigned flags; 1892 1893 do { 1894 flags = mcelog.flags; 1895 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1896 1897 return put_user(flags, p); 1898 } 1899 default: 1900 return -ENOTTY; 1901 } 1902} 1903 1904static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, 1905 size_t usize, loff_t *off); 1906 1907void register_mce_write_callback(ssize_t (*fn)(struct file *filp, 1908 const char __user *ubuf, 1909 size_t usize, loff_t *off)) 1910{ 1911 mce_write = fn; 1912} 1913EXPORT_SYMBOL_GPL(register_mce_write_callback); 1914 1915ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, 1916 size_t usize, loff_t *off) 1917{ 1918 if (mce_write) 1919 return mce_write(filp, ubuf, usize, off); 1920 else 1921 return -EINVAL; 1922} 1923 1924static const struct file_operations mce_chrdev_ops = { 1925 .open = mce_chrdev_open, 1926 .release = mce_chrdev_release, 1927 .read = mce_chrdev_read, 1928 .write = mce_chrdev_write, 1929 .poll = mce_chrdev_poll, 1930 .unlocked_ioctl = mce_chrdev_ioctl, 1931 .llseek = no_llseek, 1932}; 1933 1934static struct miscdevice mce_chrdev_device = { 1935 MISC_MCELOG_MINOR, 1936 "mcelog", 1937 &mce_chrdev_ops, 1938}; 1939 1940/* 1941 * mce=off Disables machine check 1942 * mce=no_cmci Disables CMCI 1943 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1944 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1945 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1946 * monarchtimeout is how long to wait for other CPUs on machine 1947 * check, or 0 to not wait 1948 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1949 * mce=nobootlog Don't log MCEs from before booting. 1950 * mce=bios_cmci_threshold Don't program the CMCI threshold 1951 */ 1952static int __init mcheck_enable(char *str) 1953{ 1954 if (*str == 0) { 1955 enable_p5_mce(); 1956 return 1; 1957 } 1958 if (*str == '=') 1959 str++; 1960 if (!strcmp(str, "off")) 1961 mce_disabled = 1; 1962 else if (!strcmp(str, "no_cmci")) 1963 mce_cmci_disabled = 1; 1964 else if (!strcmp(str, "dont_log_ce")) 1965 mce_dont_log_ce = 1; 1966 else if (!strcmp(str, "ignore_ce")) 1967 mce_ignore_ce = 1; 1968 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1969 mce_bootlog = (str[0] == 'b'); 1970 else if (!strcmp(str, "bios_cmci_threshold")) 1971 mce_bios_cmci_threshold = 1; 1972 else if (isdigit(str[0])) { 1973 get_option(&str, &tolerant); 1974 if (*str == ',') { 1975 ++str; 1976 get_option(&str, &monarch_timeout); 1977 } 1978 } else { 1979 pr_info("mce argument %s ignored. 
Please use /sys\n", str); 1980 return 0; 1981 } 1982 return 1; 1983} 1984__setup("mce", mcheck_enable); 1985 1986int __init mcheck_init(void) 1987{ 1988 mcheck_intel_therm_init(); 1989 1990 return 0; 1991} 1992 1993/* 1994 * mce_syscore: PM support 1995 */ 1996 1997/* 1998 * Disable machine checks on suspend and shutdown. We can't really handle 1999 * them later. 2000 */ 2001static int mce_disable_error_reporting(void) 2002{ 2003 int i; 2004 2005 for (i = 0; i < banks; i++) { 2006 struct mce_bank *b = &mce_banks[i]; 2007 2008 if (b->init) 2009 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 2010 } 2011 return 0; 2012} 2013 2014static int mce_syscore_suspend(void) 2015{ 2016 return mce_disable_error_reporting(); 2017} 2018 2019static void mce_syscore_shutdown(void) 2020{ 2021 mce_disable_error_reporting(); 2022} 2023 2024/* 2025 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 2026 * Only one CPU is active at this time, the others get re-added later using 2027 * CPU hotplug: 2028 */ 2029static void mce_syscore_resume(void) 2030{ 2031 __mcheck_cpu_init_generic(); 2032 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 2033} 2034 2035static struct syscore_ops mce_syscore_ops = { 2036 .suspend = mce_syscore_suspend, 2037 .shutdown = mce_syscore_shutdown, 2038 .resume = mce_syscore_resume, 2039}; 2040 2041/* 2042 * mce_device: Sysfs support 2043 */ 2044 2045static void mce_cpu_restart(void *data) 2046{ 2047 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2048 return; 2049 __mcheck_cpu_init_generic(); 2050 __mcheck_cpu_init_timer(); 2051} 2052 2053/* Reinit MCEs after user configuration changes */ 2054static void mce_restart(void) 2055{ 2056 mce_timer_delete_all(); 2057 on_each_cpu(mce_cpu_restart, NULL, 1); 2058} 2059 2060/* Toggle features for corrected errors */ 2061static void mce_disable_cmci(void *data) 2062{ 2063 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2064 return; 2065 cmci_clear(); 2066} 2067 2068static void mce_enable_ce(void *all) 2069{ 2070 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2071 return; 2072 cmci_reenable(); 2073 cmci_recheck(); 2074 if (all) 2075 __mcheck_cpu_init_timer(); 2076} 2077 2078static struct bus_type mce_subsys = { 2079 .name = "machinecheck", 2080 .dev_name = "machinecheck", 2081}; 2082 2083DEFINE_PER_CPU(struct device *, mce_device); 2084 2085__cpuinitdata 2086void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 2087 2088static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 2089{ 2090 return container_of(attr, struct mce_bank, attr); 2091} 2092 2093static ssize_t show_bank(struct device *s, struct device_attribute *attr, 2094 char *buf) 2095{ 2096 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 2097} 2098 2099static ssize_t set_bank(struct device *s, struct device_attribute *attr, 2100 const char *buf, size_t size) 2101{ 2102 u64 new; 2103 2104 if (strict_strtoull(buf, 0, &new) < 0) 2105 return -EINVAL; 2106 2107 attr_to_bank(attr)->ctl = new; 2108 mce_restart(); 2109 2110 return size; 2111} 2112 2113static ssize_t 2114show_trigger(struct device *s, struct device_attribute *attr, char *buf) 2115{ 2116 strcpy(buf, mce_helper); 2117 strcat(buf, "\n"); 2118 return strlen(mce_helper) + 1; 2119} 2120 2121static ssize_t set_trigger(struct device *s, struct device_attribute *attr, 2122 const char *buf, size_t siz) 2123{ 2124 char *p; 2125 2126 strncpy(mce_helper, buf, sizeof(mce_helper)); 2127 mce_helper[sizeof(mce_helper)-1] = 0; 2128 p = strchr(mce_helper, '\n'); 2129 2130 if (p) 2131 *p = 0; 2132 
static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
        strcpy(buf, mce_helper);
        strcat(buf, "\n");
        return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;

        strncpy(mce_helper, buf, sizeof(mce_helper));
        mce_helper[sizeof(mce_helper)-1] = 0;
        p = strchr(mce_helper, '\n');

        if (p)
                *p = 0;

        return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
                             struct device_attribute *attr,
                             const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        if (mce_ignore_ce ^ !!new) {
                if (new) {
                        /* disable ce features */
                        mce_timer_delete_all();
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mce_ignore_ce = 1;
                } else {
                        /* enable ce features */
                        mce_ignore_ce = 0;
                        on_each_cpu(mce_enable_ce, (void *)1, 1);
                }
        }
        return size;
}

static ssize_t set_cmci_disabled(struct device *s,
                                 struct device_attribute *attr,
                                 const char *buf, size_t size)
{
        u64 new;

        if (strict_strtoull(buf, 0, &new) < 0)
                return -EINVAL;

        if (mce_cmci_disabled ^ !!new) {
                if (new) {
                        /* disable cmci */
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mce_cmci_disabled = 1;
                } else {
                        /* enable cmci */
                        mce_cmci_disabled = 0;
                        on_each_cpu(mce_enable_ce, NULL, 1);
                }
        }
        return size;
}

static ssize_t store_int_with_restart(struct device *s,
                                      struct device_attribute *attr,
                                      const char *buf, size_t size)
{
        ssize_t ret = device_store_int(s, attr, buf, size);
        mce_restart();
        return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
        &mce_ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
        &mce_cmci_disabled
};

static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
        __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
        &mce_bios_cmci_threshold
};

static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        &dev_attr_trigger,
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        &dev_attr_bios_cmci_threshold.attr,
        NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
        kfree(dev);
}

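/*
 * Sketch of the resulting sysfs layout: every online CPU gets a
 * /sys/devices/system/machinecheck/machinecheckN directory holding the
 * attributes from mce_device_attrs[] plus one bank<i> control file per MCA
 * bank. For example, "echo 0 > .../machinecheck0/bank3" clears the control
 * mask for bank 3 and re-initializes machine check reporting on all CPUs.
 */
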
/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_device_create(unsigned int cpu)
{
        struct device *dev;
        int err;
        int i, j;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        dev = kzalloc(sizeof *dev, GFP_KERNEL);
        if (!dev)
                return -ENOMEM;
        dev->id = cpu;
        dev->bus = &mce_subsys;
        dev->release = &mce_device_release;

        err = device_register(dev);
        if (err)
                return err;

        for (i = 0; mce_device_attrs[i]; i++) {
                err = device_create_file(dev, mce_device_attrs[i]);
                if (err)
                        goto error;
        }
        for (j = 0; j < banks; j++) {
                err = device_create_file(dev, &mce_banks[j].attr);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = dev;

        return 0;
error2:
        while (--j >= 0)
                device_remove_file(dev, &mce_banks[j].attr);
error:
        while (--i >= 0)
                device_remove_file(dev, mce_device_attrs[i]);

        device_unregister(dev);

        return err;
}

static __cpuinit void mce_device_remove(unsigned int cpu)
{
        struct device *dev = per_cpu(mce_device, cpu);
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_device_attrs[i]; i++)
                device_remove_file(dev, mce_device_attrs[i]);

        for (i = 0; i < banks; i++)
                device_remove_file(dev, &mce_banks[i].attr);

        device_unregister(dev);
        cpumask_clear_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
        }
}

static void __cpuinit mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(__this_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
        }
}

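/*
 * Summary of the hotplug handling below: CPU_ONLINE creates the per-CPU
 * sysfs device, CPU_DOWN_PREPARE quiesces the outgoing CPU (banks cleared,
 * CMCI released, poll timer stopped), CPU_DOWN_FAILED reverts that,
 * CPU_DEAD removes the device and CPU_POST_DEAD lets the surviving CPUs
 * take over any shared CMCI banks via cmci_rediscover().
 */
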
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                mce_device_create(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_device_remove(cpu);
                mce_intel_hcpu_update(cpu);
                break;
        case CPU_DOWN_PREPARE:
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                del_timer_sync(t);
                break;
        case CPU_DOWN_FAILED:
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                mce_start_timer(cpu, t);
                break;
        }

        if (action == CPU_POST_DEAD) {
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
        }

        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
        int i;

        for (i = 0; i < banks; i++) {
                struct mce_bank *b = &mce_banks[i];
                struct device_attribute *a = &b->attr;

                sysfs_attr_init(&a->attr);
                a->attr.name = b->attrname;
                snprintf(b->attrname, ATTR_LEN, "bank%d", i);

                a->attr.mode = 0644;
                a->show = show_bank;
                a->store = set_bank;
        }
}

static __init int mcheck_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

        mce_init_banks();

        err = subsys_system_register(&mce_subsys, NULL);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_device_create(i);
                if (err)
                        return err;
        }

        register_syscore_ops(&mce_syscore_ops);
        register_hotcpu_notifier(&mce_cpu_notifier);

        /* register character device /dev/mcelog */
        misc_register(&mce_chrdev_device);

        return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
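/*
 * "nomce" on the command line is the legacy way to disable machine check
 * handling completely; it has the same effect as "mce=off" handled by
 * mcheck_enable() above.
 */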
static int __init mcheck_disable(char *str)
{
        mce_disabled = 1;
        return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (!dmce)
                dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}

static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_paniced, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}

static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
        struct dentry *dmce, *ffake_panic;

        dmce = mce_get_debugfs_dir();
        if (!dmce)
                return -ENOMEM;
        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
                                          &fake_panic_fops);
        if (!ffake_panic)
                return -ENOMEM;

        return 0;
}
late_initcall(mcheck_debugfs_init);
#endif

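/*
 * Note on the debugfs knob above (testing only): writing a non-zero value to
 * /sys/kernel/debug/mce/fake_panic makes mce_panic() print a "Fake kernel
 * panic" message and return instead of panicking, so error injection tests
 * can exercise the panic paths without bringing the machine down.
 */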