mce.c revision c6ae41e7d469f00d9c92a2b2887c7235d121c009
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

/* Serializes readers of the /dev/mcelog character device */
static DEFINE_MUTEX(mce_chrdev_read_mutex);

/*
 * mcelog.next may be dereferenced either under sched-RCU or with the
 * chrdev read mutex held; teach lockdep about both legal contexts.
 */
#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_chrdev_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

/* Non-zero disables the whole subsystem (checked in mce_available()) */
int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

/* Number of CPUs currently executing do_machine_check() */
atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int
tolerant __read_mostly = 1;
static int			banks			__read_mostly;
static int			rip_msr			__read_mostly;
static int			mce_bootlog		__read_mostly = -1;
static int			monarch_timeout		__read_mostly = -1;
static int			mce_panic_timeout	__read_mostly;
static int			mce_dont_log_ce		__read_mostly;
int				mce_cmci_disabled	__read_mostly;
int				mce_ignore_ce		__read_mostly;
int				mce_ser			__read_mostly;

struct mce_bank                *mce_banks		__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long		mce_need_notify;
static char			mce_helper[128];
static char			*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

/* Worst event seen per CPU in the last exception; graded by the Monarch */
static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;

/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

/* Fake MCE record consulted by the MSR wrappers for software injection */
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

/*
 * Append one record to the global mcelog ring. Safe from MCE/NMI
 * context: a slot is reserved with cmpxchg on mcelog.next and only
 * marked "finished" after the payload has been copied in.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	int ret = 0;

	/* Emit the trace record: */
	trace_mce_record(mce);

	/* Give registered decoders first shot; they may consume the event */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
	if (ret == NOTIFY_STOP)
		return;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry from scratch if another CPU raced us */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish only after the record body is globally visible */
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

/*
 * Replay events already sitting in mcelog (e.g. logged during boot)
 * to a freshly registered decoder, then empty the buffer.
 */
static void drain_mcelog_buffer(void)
{
	unsigned int next, i, prev = 0;

	next = ACCESS_ONCE(mcelog.next);

	do {
		struct mce *m;

		/* drain what was logged during boot */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			unsigned retries = 1;

			m = &mcelog.entry[i];

			/* Give an in-flight writer a bounded grace period */
			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2*retries))
					retries++;

				cpu_relax();

				if (!m->finished && retries >= 4) {
					pr_err("MCE: skipping error being logged currently!\n");
					break;
				}
			}
			smp_rmb();
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
		}

		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
		prev = next;
		/* New events may have been appended meanwhile; loop until stable */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);
}


/* Register a decoder and immediately feed it everything logged so far */
void mce_register_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
	drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

/* Dump one MCE record to the console in the classic /dev/mcelog format */
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

/* Non-zero once any CPU has started an MCE panic */
static atomic_t mce_paniced;

/* "Fake panic" mode: log everything a panic would, but keep running */
static int fake_panic;
static atomic_t mce_fake_paniced;

/* Panic in progress.
 Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	/* The panicking CPU never delivered the stopping IPI; panic ourselves */
	panic("Panicing machine check CPU died");
}

/*
 * Panic (or fake-panic) after dumping every event still sitting in
 * mcelog: corrected ones first, then uncorrected, with @final last.
 * @exp is an optional human-readable explanation string.
 */
static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		/* Skip the entry equal to @final here; it is printed below */
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int
msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	/* Map an MCA MSR number onto the matching struct mce field offset,
	 * or -1 if the MSR is not one we emulate for injection. */
	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	/* Injection active on this CPU: read from the fake record instead */
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

/* Write an MCA MSR, redirecting to the injection record when active */
static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		/* Only trust regs->ip/cs when the CPU says they are valid */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;
		}
		/* Use accurate RIP reporting if available. */
		if (rip_msr)
			m->ip = mce_rdmsrl(rip_msr);
	}
}

/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

/*
 * Pop one PFN off this CPU's ring into *pfn.
 * Returns 1 on success, 0 if the ring is empty.
 */
static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	/* Full (one slot kept free to distinguish full from empty) */
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	/* Make the payload visible before advancing the producer index */
	wmb();
	r->end = next;
	return 0;
}

/* True when this CPU supports MCA/MCE and the subsystem is not disabled */
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/* Kick the per-CPU work item if there are queued PFNs to process */
static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

DEFINE_PER_CPU(struct irq_work, mce_irq_work);

static void
mce_irq_work_cb(struct irq_work *entry)
{
	/* Runs in IRQ context after the MCE handler queued us */
	mce_notify_irq();
	mce_schedule_work();
}

/*
 * Tell userspace / workqueues about a new event. If interrupts were
 * enabled at the time of the exception we may notify directly;
 * otherwise defer through irq_work.
 */
static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&__get_cpu_var(mce_irq_work));
}

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll hander -- * so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally * confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	/* No regs available in polling context */
	mce_gather_info(&m, NULL);

	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
			mce_log(&m);

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 * *t is the remaining budget in ns; decremented by SPINUNIT per call.
 * Returns 1 on timeout (and marks cpu_missing), 0 otherwise.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in a unrecoverable case
 * and also makes sure always all CPU's errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs) In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in a unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

/* Sum of the per-CPU no_way_out votes for the current event */
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 *
 * Returns this CPU's callin order (1 == Monarch), or -1 on timeout
 * or when the rendezvous is disabled (monarch_timeout == 0).
 * *no_way_out is input (this CPU's vote) and output (global state).
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granuality for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

/* Zero the STATUS MSR of every bank flagged in @toclear */
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * Need to save faulting physical address associated with a process
 * in the machine check handler some place where we can grab it back
 * later in mce_notify_process()
 */
#define	MCE_INFO_MAX	16

struct mce_info {
	atomic_t		inuse;
	struct task_struct	*t;
	__u64			paddr;
} mce_info[MCE_INFO_MAX];

/* Claim a free slot for (current, addr); panics if all slots are busy */
static void mce_save_info(__u64 addr)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		/* atomic 0 -> 1 transition claims the slot */
		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
			mi->t = current;
			mi->paddr = addr;
			return;
		}
	}

	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}

/* Find the saved slot for the current task, or NULL */
static struct mce_info *mce_find_info(void)
{
	struct mce_info *mi;

	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		if (atomic_read(&mi->inuse) && mi->t == current)
			return mi;
	return NULL;
}

static void mce_clear_info(struct mce_info *mi)
{
	atomic_set(&mi->inuse, 0);
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast.
 However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	this_cpu_inc(mce_exception_count);

	if (!banks)
		goto out;

	mce_gather_info(&m, regs);

	/* Record our worst event for the Monarch to grade in mce_reign() */
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When no restart IP might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ?
 MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicing.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_log(&m);

		/* Track the worst event of this CPU for the Monarch */
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			/* schedule action before return to userland */
			mce_save_info(m.addr);
			set_thread_flag(TIF_MCE_NOTIFY);
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	/* Acknowledge the exception so further MCEs can be delivered */
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
/* Minimal stub used when hwpoison support is not configured in */
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);

	return 0;
}
#endif

/*
 * Called in process context that interrupted by MCE and marked with
 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
 * This code is allowed to sleep.
 * Attempt possible recovery such as calling the high level VM handler to
 * process any corrupted pages, and kill/signal current process if required.
 * Action required errors are handled here.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	/* Slot saved for this task by mce_save_info() in do_machine_check() */
	struct mce_info *mi = mce_find_info();

	if (!mi)
		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
	pfn = mi->paddr >> PAGE_SHIFT;

	clear_thread_flag(TIF_MCE_NOTIFY);

	pr_err("Uncorrected hardware memory error in user-access at %llx",
		 mi->paddr);
	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
		pr_err("Memory error not recovered");
		force_sig(SIGBUS, current);
	}
	mce_clear_info(mi);
}

/*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
 * placed into the "ring").
 */
static void mce_process_work(struct work_struct *dummy)
{
	unsigned long pfn;

	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR, 0);
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
1231 */ 1232static int check_interval = 5 * 60; /* 5 minutes */ 1233 1234static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1235static DEFINE_PER_CPU(struct timer_list, mce_timer); 1236 1237static void mce_start_timer(unsigned long data) 1238{ 1239 struct timer_list *t = &per_cpu(mce_timer, data); 1240 int *n; 1241 1242 WARN_ON(smp_processor_id() != data); 1243 1244 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1245 machine_check_poll(MCP_TIMESTAMP, 1246 &__get_cpu_var(mce_poll_banks)); 1247 } 1248 1249 /* 1250 * Alert userspace if needed. If we logged an MCE, reduce the 1251 * polling interval, otherwise increase the polling interval. 1252 */ 1253 n = &__get_cpu_var(mce_next_interval); 1254 if (mce_notify_irq()) 1255 *n = max(*n/2, HZ/100); 1256 else 1257 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1258 1259 t->expires = jiffies + *n; 1260 add_timer_on(t, smp_processor_id()); 1261} 1262 1263/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1264static void mce_timer_delete_all(void) 1265{ 1266 int cpu; 1267 1268 for_each_online_cpu(cpu) 1269 del_timer_sync(&per_cpu(mce_timer, cpu)); 1270} 1271 1272static void mce_do_trigger(struct work_struct *work) 1273{ 1274 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); 1275} 1276 1277static DECLARE_WORK(mce_trigger_work, mce_do_trigger); 1278 1279/* 1280 * Notify the user(s) about new machine check events. 1281 * Can be called from interrupt context, but not from machine check/NMI 1282 * context. 1283 */ 1284int mce_notify_irq(void) 1285{ 1286 /* Not more than two messages every minute */ 1287 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1288 1289 if (test_and_clear_bit(0, &mce_need_notify)) { 1290 /* wake processes polling /dev/mcelog */ 1291 wake_up_interruptible(&mce_chrdev_wait); 1292 1293 /* 1294 * There is no risk of missing notifications because 1295 * work_pending is always cleared before the function is 1296 * executed. 
1297 */ 1298 if (mce_helper[0] && !work_pending(&mce_trigger_work)) 1299 schedule_work(&mce_trigger_work); 1300 1301 if (__ratelimit(&ratelimit)) 1302 pr_info(HW_ERR "Machine check events logged\n"); 1303 1304 return 1; 1305 } 1306 return 0; 1307} 1308EXPORT_SYMBOL_GPL(mce_notify_irq); 1309 1310static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1311{ 1312 int i; 1313 1314 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1315 if (!mce_banks) 1316 return -ENOMEM; 1317 for (i = 0; i < banks; i++) { 1318 struct mce_bank *b = &mce_banks[i]; 1319 1320 b->ctl = -1ULL; 1321 b->init = 1; 1322 } 1323 return 0; 1324} 1325 1326/* 1327 * Initialize Machine Checks for a CPU. 1328 */ 1329static int __cpuinit __mcheck_cpu_cap_init(void) 1330{ 1331 unsigned b; 1332 u64 cap; 1333 1334 rdmsrl(MSR_IA32_MCG_CAP, cap); 1335 1336 b = cap & MCG_BANKCNT_MASK; 1337 if (!banks) 1338 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1339 1340 if (b > MAX_NR_BANKS) { 1341 printk(KERN_WARNING 1342 "MCE: Using only %u machine check banks out of %u\n", 1343 MAX_NR_BANKS, b); 1344 b = MAX_NR_BANKS; 1345 } 1346 1347 /* Don't support asymmetric configurations today */ 1348 WARN_ON(banks != 0 && b != banks); 1349 banks = b; 1350 if (!mce_banks) { 1351 int err = __mcheck_cpu_mce_banks_init(); 1352 1353 if (err) 1354 return err; 1355 } 1356 1357 /* Use accurate RIP reporting if available. */ 1358 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1359 rip_msr = MSR_IA32_MCG_EIP; 1360 1361 if (cap & MCG_SER_P) 1362 mce_ser = 1; 1363 1364 return 0; 1365} 1366 1367static void __mcheck_cpu_init_generic(void) 1368{ 1369 mce_banks_t all_banks; 1370 u64 cap; 1371 int i; 1372 1373 /* 1374 * Log the machine checks left over from the previous reset. 1375 */ 1376 bitmap_fill(all_banks, MAX_NR_BANKS); 1377 machine_check_poll(MCP_UC|(!mce_bootlog ? 
MCP_DONTLOG : 0), &all_banks); 1378 1379 set_in_cr4(X86_CR4_MCE); 1380 1381 rdmsrl(MSR_IA32_MCG_CAP, cap); 1382 if (cap & MCG_CTL_P) 1383 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1384 1385 for (i = 0; i < banks; i++) { 1386 struct mce_bank *b = &mce_banks[i]; 1387 1388 if (!b->init) 1389 continue; 1390 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 1391 wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 1392 } 1393} 1394 1395/* Add per CPU specific workarounds here */ 1396static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1397{ 1398 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1399 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1400 return -EOPNOTSUPP; 1401 } 1402 1403 /* This should be disabled by the BIOS, but isn't always */ 1404 if (c->x86_vendor == X86_VENDOR_AMD) { 1405 if (c->x86 == 15 && banks > 4) { 1406 /* 1407 * disable GART TBL walk error reporting, which 1408 * trips off incorrectly with the IOMMU & 3ware 1409 * & Cerberus: 1410 */ 1411 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1412 } 1413 if (c->x86 <= 17 && mce_bootlog < 0) { 1414 /* 1415 * Lots of broken BIOS around that don't clear them 1416 * by default and leave crap in there. Don't log: 1417 */ 1418 mce_bootlog = 0; 1419 } 1420 /* 1421 * Various K7s with broken bank 0 around. Always disable 1422 * by default. 1423 */ 1424 if (c->x86 == 6 && banks > 0) 1425 mce_banks[0].ctl = 0; 1426 } 1427 1428 if (c->x86_vendor == X86_VENDOR_INTEL) { 1429 /* 1430 * SDM documents that on family 6 bank 0 should not be written 1431 * because it aliases to another special BIOS controlled 1432 * register. 1433 * But it's not aliased anymore on model 0x1a+ 1434 * Don't ignore bank 0 completely because there could be a 1435 * valid event later, merely don't write CTL0. 1436 */ 1437 1438 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1439 mce_banks[0].init = 0; 1440 1441 /* 1442 * All newer Intel systems support MCE broadcasting. Enable 1443 * synchronization with a one second timeout. 
1444 */ 1445 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1446 monarch_timeout < 0) 1447 monarch_timeout = USEC_PER_SEC; 1448 1449 /* 1450 * There are also broken BIOSes on some Pentium M and 1451 * earlier systems: 1452 */ 1453 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1454 mce_bootlog = 0; 1455 } 1456 if (monarch_timeout < 0) 1457 monarch_timeout = 0; 1458 if (mce_bootlog != 0) 1459 mce_panic_timeout = 30; 1460 1461 return 0; 1462} 1463 1464static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1465{ 1466 if (c->x86 != 5) 1467 return 0; 1468 1469 switch (c->x86_vendor) { 1470 case X86_VENDOR_INTEL: 1471 intel_p5_mcheck_init(c); 1472 return 1; 1473 break; 1474 case X86_VENDOR_CENTAUR: 1475 winchip_mcheck_init(c); 1476 return 1; 1477 break; 1478 } 1479 1480 return 0; 1481} 1482 1483static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1484{ 1485 switch (c->x86_vendor) { 1486 case X86_VENDOR_INTEL: 1487 mce_intel_feature_init(c); 1488 break; 1489 case X86_VENDOR_AMD: 1490 mce_amd_feature_init(c); 1491 break; 1492 default: 1493 break; 1494 } 1495} 1496 1497static void __mcheck_cpu_init_timer(void) 1498{ 1499 struct timer_list *t = &__get_cpu_var(mce_timer); 1500 int *n = &__get_cpu_var(mce_next_interval); 1501 1502 setup_timer(t, mce_start_timer, smp_processor_id()); 1503 1504 if (mce_ignore_ce) 1505 return; 1506 1507 *n = check_interval * HZ; 1508 if (!*n) 1509 return; 1510 t->expires = round_jiffies(jiffies + *n); 1511 add_timer_on(t, smp_processor_id()); 1512} 1513 1514/* Handle unconfigured int18 (should never happen) */ 1515static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1516{ 1517 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1518 smp_processor_id()); 1519} 1520 1521/* Call the installed machine check handler for this CPU setup. 
*/ 1522void (*machine_check_vector)(struct pt_regs *, long error_code) = 1523 unexpected_machine_check; 1524 1525/* 1526 * Called for each booted CPU to set up machine checks. 1527 * Must be called with preempt off: 1528 */ 1529void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1530{ 1531 if (mce_disabled) 1532 return; 1533 1534 if (__mcheck_cpu_ancient_init(c)) 1535 return; 1536 1537 if (!mce_available(c)) 1538 return; 1539 1540 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1541 mce_disabled = 1; 1542 return; 1543 } 1544 1545 machine_check_vector = do_machine_check; 1546 1547 __mcheck_cpu_init_generic(); 1548 __mcheck_cpu_init_vendor(c); 1549 __mcheck_cpu_init_timer(); 1550 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1551 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); 1552} 1553 1554/* 1555 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. 1556 */ 1557 1558static DEFINE_SPINLOCK(mce_chrdev_state_lock); 1559static int mce_chrdev_open_count; /* #times opened */ 1560static int mce_chrdev_open_exclu; /* already open exclusive? 
*/ 1561 1562static int mce_chrdev_open(struct inode *inode, struct file *file) 1563{ 1564 spin_lock(&mce_chrdev_state_lock); 1565 1566 if (mce_chrdev_open_exclu || 1567 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { 1568 spin_unlock(&mce_chrdev_state_lock); 1569 1570 return -EBUSY; 1571 } 1572 1573 if (file->f_flags & O_EXCL) 1574 mce_chrdev_open_exclu = 1; 1575 mce_chrdev_open_count++; 1576 1577 spin_unlock(&mce_chrdev_state_lock); 1578 1579 return nonseekable_open(inode, file); 1580} 1581 1582static int mce_chrdev_release(struct inode *inode, struct file *file) 1583{ 1584 spin_lock(&mce_chrdev_state_lock); 1585 1586 mce_chrdev_open_count--; 1587 mce_chrdev_open_exclu = 0; 1588 1589 spin_unlock(&mce_chrdev_state_lock); 1590 1591 return 0; 1592} 1593 1594static void collect_tscs(void *data) 1595{ 1596 unsigned long *cpu_tsc = (unsigned long *)data; 1597 1598 rdtscll(cpu_tsc[smp_processor_id()]); 1599} 1600 1601static int mce_apei_read_done; 1602 1603/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ 1604static int __mce_read_apei(char __user **ubuf, size_t usize) 1605{ 1606 int rc; 1607 u64 record_id; 1608 struct mce m; 1609 1610 if (usize < sizeof(struct mce)) 1611 return -EINVAL; 1612 1613 rc = apei_read_mce(&m, &record_id); 1614 /* Error or no more MCE record */ 1615 if (rc <= 0) { 1616 mce_apei_read_done = 1; 1617 /* 1618 * When ERST is disabled, mce_chrdev_read() should return 1619 * "no record" instead of "no device." 1620 */ 1621 if (rc == -ENODEV) 1622 return 0; 1623 return rc; 1624 } 1625 rc = -EFAULT; 1626 if (copy_to_user(*ubuf, &m, sizeof(struct mce))) 1627 return rc; 1628 /* 1629 * In fact, we should have cleared the record after that has 1630 * been flushed to the disk or sent to network in 1631 * /sbin/mcelog, but we have no interface to support that now, 1632 * so just clear it to avoid duplication. 
1633 */ 1634 rc = apei_clear_mce(record_id); 1635 if (rc) { 1636 mce_apei_read_done = 1; 1637 return rc; 1638 } 1639 *ubuf += sizeof(struct mce); 1640 1641 return 0; 1642} 1643 1644static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, 1645 size_t usize, loff_t *off) 1646{ 1647 char __user *buf = ubuf; 1648 unsigned long *cpu_tsc; 1649 unsigned prev, next; 1650 int i, err; 1651 1652 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); 1653 if (!cpu_tsc) 1654 return -ENOMEM; 1655 1656 mutex_lock(&mce_chrdev_read_mutex); 1657 1658 if (!mce_apei_read_done) { 1659 err = __mce_read_apei(&buf, usize); 1660 if (err || buf != ubuf) 1661 goto out; 1662 } 1663 1664 next = rcu_dereference_check_mce(mcelog.next); 1665 1666 /* Only supports full reads right now */ 1667 err = -EINVAL; 1668 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) 1669 goto out; 1670 1671 err = 0; 1672 prev = 0; 1673 do { 1674 for (i = prev; i < next; i++) { 1675 unsigned long start = jiffies; 1676 struct mce *m = &mcelog.entry[i]; 1677 1678 while (!m->finished) { 1679 if (time_after_eq(jiffies, start + 2)) { 1680 memset(m, 0, sizeof(*m)); 1681 goto timeout; 1682 } 1683 cpu_relax(); 1684 } 1685 smp_rmb(); 1686 err |= copy_to_user(buf, m, sizeof(*m)); 1687 buf += sizeof(*m); 1688timeout: 1689 ; 1690 } 1691 1692 memset(mcelog.entry + prev, 0, 1693 (next - prev) * sizeof(struct mce)); 1694 prev = next; 1695 next = cmpxchg(&mcelog.next, prev, 0); 1696 } while (next != prev); 1697 1698 synchronize_sched(); 1699 1700 /* 1701 * Collect entries that were still getting written before the 1702 * synchronize. 
1703 */ 1704 on_each_cpu(collect_tscs, cpu_tsc, 1); 1705 1706 for (i = next; i < MCE_LOG_LEN; i++) { 1707 struct mce *m = &mcelog.entry[i]; 1708 1709 if (m->finished && m->tsc < cpu_tsc[m->cpu]) { 1710 err |= copy_to_user(buf, m, sizeof(*m)); 1711 smp_rmb(); 1712 buf += sizeof(*m); 1713 memset(m, 0, sizeof(*m)); 1714 } 1715 } 1716 1717 if (err) 1718 err = -EFAULT; 1719 1720out: 1721 mutex_unlock(&mce_chrdev_read_mutex); 1722 kfree(cpu_tsc); 1723 1724 return err ? err : buf - ubuf; 1725} 1726 1727static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) 1728{ 1729 poll_wait(file, &mce_chrdev_wait, wait); 1730 if (rcu_access_index(mcelog.next)) 1731 return POLLIN | POLLRDNORM; 1732 if (!mce_apei_read_done && apei_check_mce()) 1733 return POLLIN | POLLRDNORM; 1734 return 0; 1735} 1736 1737static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, 1738 unsigned long arg) 1739{ 1740 int __user *p = (int __user *)arg; 1741 1742 if (!capable(CAP_SYS_ADMIN)) 1743 return -EPERM; 1744 1745 switch (cmd) { 1746 case MCE_GET_RECORD_LEN: 1747 return put_user(sizeof(struct mce), p); 1748 case MCE_GET_LOG_LEN: 1749 return put_user(MCE_LOG_LEN, p); 1750 case MCE_GETCLEAR_FLAGS: { 1751 unsigned flags; 1752 1753 do { 1754 flags = mcelog.flags; 1755 } while (cmpxchg(&mcelog.flags, flags, 0) != flags); 1756 1757 return put_user(flags, p); 1758 } 1759 default: 1760 return -ENOTTY; 1761 } 1762} 1763 1764static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, 1765 size_t usize, loff_t *off); 1766 1767void register_mce_write_callback(ssize_t (*fn)(struct file *filp, 1768 const char __user *ubuf, 1769 size_t usize, loff_t *off)) 1770{ 1771 mce_write = fn; 1772} 1773EXPORT_SYMBOL_GPL(register_mce_write_callback); 1774 1775ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, 1776 size_t usize, loff_t *off) 1777{ 1778 if (mce_write) 1779 return mce_write(filp, ubuf, usize, off); 1780 else 1781 return -EINVAL; 1782} 1783 1784static const struct 
file_operations mce_chrdev_ops = { 1785 .open = mce_chrdev_open, 1786 .release = mce_chrdev_release, 1787 .read = mce_chrdev_read, 1788 .write = mce_chrdev_write, 1789 .poll = mce_chrdev_poll, 1790 .unlocked_ioctl = mce_chrdev_ioctl, 1791 .llseek = no_llseek, 1792}; 1793 1794static struct miscdevice mce_chrdev_device = { 1795 MISC_MCELOG_MINOR, 1796 "mcelog", 1797 &mce_chrdev_ops, 1798}; 1799 1800/* 1801 * mce=off Disables machine check 1802 * mce=no_cmci Disables CMCI 1803 * mce=dont_log_ce Clears corrected events silently, no log created for CEs. 1804 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 1805 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) 1806 * monarchtimeout is how long to wait for other CPUs on machine 1807 * check, or 0 to not wait 1808 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1809 * mce=nobootlog Don't log MCEs from before booting. 1810 */ 1811static int __init mcheck_enable(char *str) 1812{ 1813 if (*str == 0) { 1814 enable_p5_mce(); 1815 return 1; 1816 } 1817 if (*str == '=') 1818 str++; 1819 if (!strcmp(str, "off")) 1820 mce_disabled = 1; 1821 else if (!strcmp(str, "no_cmci")) 1822 mce_cmci_disabled = 1; 1823 else if (!strcmp(str, "dont_log_ce")) 1824 mce_dont_log_ce = 1; 1825 else if (!strcmp(str, "ignore_ce")) 1826 mce_ignore_ce = 1; 1827 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1828 mce_bootlog = (str[0] == 'b'); 1829 else if (isdigit(str[0])) { 1830 get_option(&str, &tolerant); 1831 if (*str == ',') { 1832 ++str; 1833 get_option(&str, &monarch_timeout); 1834 } 1835 } else { 1836 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1837 str); 1838 return 0; 1839 } 1840 return 1; 1841} 1842__setup("mce", mcheck_enable); 1843 1844int __init mcheck_init(void) 1845{ 1846 mcheck_intel_therm_init(); 1847 1848 return 0; 1849} 1850 1851/* 1852 * mce_syscore: PM support 1853 */ 1854 1855/* 1856 * Disable machine checks on suspend and shutdown. 
We can't really handle 1857 * them later. 1858 */ 1859static int mce_disable_error_reporting(void) 1860{ 1861 int i; 1862 1863 for (i = 0; i < banks; i++) { 1864 struct mce_bank *b = &mce_banks[i]; 1865 1866 if (b->init) 1867 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 1868 } 1869 return 0; 1870} 1871 1872static int mce_syscore_suspend(void) 1873{ 1874 return mce_disable_error_reporting(); 1875} 1876 1877static void mce_syscore_shutdown(void) 1878{ 1879 mce_disable_error_reporting(); 1880} 1881 1882/* 1883 * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 1884 * Only one CPU is active at this time, the others get re-added later using 1885 * CPU hotplug: 1886 */ 1887static void mce_syscore_resume(void) 1888{ 1889 __mcheck_cpu_init_generic(); 1890 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1891} 1892 1893static struct syscore_ops mce_syscore_ops = { 1894 .suspend = mce_syscore_suspend, 1895 .shutdown = mce_syscore_shutdown, 1896 .resume = mce_syscore_resume, 1897}; 1898 1899/* 1900 * mce_device: Sysfs support 1901 */ 1902 1903static void mce_cpu_restart(void *data) 1904{ 1905 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1906 return; 1907 __mcheck_cpu_init_generic(); 1908 __mcheck_cpu_init_timer(); 1909} 1910 1911/* Reinit MCEs after user configuration changes */ 1912static void mce_restart(void) 1913{ 1914 mce_timer_delete_all(); 1915 on_each_cpu(mce_cpu_restart, NULL, 1); 1916} 1917 1918/* Toggle features for corrected errors */ 1919static void mce_disable_cmci(void *data) 1920{ 1921 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1922 return; 1923 cmci_clear(); 1924} 1925 1926static void mce_enable_ce(void *all) 1927{ 1928 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1929 return; 1930 cmci_reenable(); 1931 cmci_recheck(); 1932 if (all) 1933 __mcheck_cpu_init_timer(); 1934} 1935 1936static struct bus_type mce_subsys = { 1937 .name = "machinecheck", 1938 .dev_name = "machinecheck", 1939}; 1940 1941DEFINE_PER_CPU(struct device *, 
mce_device); 1942 1943__cpuinitdata 1944void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1945 1946static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 1947{ 1948 return container_of(attr, struct mce_bank, attr); 1949} 1950 1951static ssize_t show_bank(struct device *s, struct device_attribute *attr, 1952 char *buf) 1953{ 1954 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1955} 1956 1957static ssize_t set_bank(struct device *s, struct device_attribute *attr, 1958 const char *buf, size_t size) 1959{ 1960 u64 new; 1961 1962 if (strict_strtoull(buf, 0, &new) < 0) 1963 return -EINVAL; 1964 1965 attr_to_bank(attr)->ctl = new; 1966 mce_restart(); 1967 1968 return size; 1969} 1970 1971static ssize_t 1972show_trigger(struct device *s, struct device_attribute *attr, char *buf) 1973{ 1974 strcpy(buf, mce_helper); 1975 strcat(buf, "\n"); 1976 return strlen(mce_helper) + 1; 1977} 1978 1979static ssize_t set_trigger(struct device *s, struct device_attribute *attr, 1980 const char *buf, size_t siz) 1981{ 1982 char *p; 1983 1984 strncpy(mce_helper, buf, sizeof(mce_helper)); 1985 mce_helper[sizeof(mce_helper)-1] = 0; 1986 p = strchr(mce_helper, '\n'); 1987 1988 if (p) 1989 *p = 0; 1990 1991 return strlen(mce_helper) + !!p; 1992} 1993 1994static ssize_t set_ignore_ce(struct device *s, 1995 struct device_attribute *attr, 1996 const char *buf, size_t size) 1997{ 1998 u64 new; 1999 2000 if (strict_strtoull(buf, 0, &new) < 0) 2001 return -EINVAL; 2002 2003 if (mce_ignore_ce ^ !!new) { 2004 if (new) { 2005 /* disable ce features */ 2006 mce_timer_delete_all(); 2007 on_each_cpu(mce_disable_cmci, NULL, 1); 2008 mce_ignore_ce = 1; 2009 } else { 2010 /* enable ce features */ 2011 mce_ignore_ce = 0; 2012 on_each_cpu(mce_enable_ce, (void *)1, 1); 2013 } 2014 } 2015 return size; 2016} 2017 2018static ssize_t set_cmci_disabled(struct device *s, 2019 struct device_attribute *attr, 2020 const char *buf, size_t size) 2021{ 2022 u64 new; 2023 
2024 if (strict_strtoull(buf, 0, &new) < 0) 2025 return -EINVAL; 2026 2027 if (mce_cmci_disabled ^ !!new) { 2028 if (new) { 2029 /* disable cmci */ 2030 on_each_cpu(mce_disable_cmci, NULL, 1); 2031 mce_cmci_disabled = 1; 2032 } else { 2033 /* enable cmci */ 2034 mce_cmci_disabled = 0; 2035 on_each_cpu(mce_enable_ce, NULL, 1); 2036 } 2037 } 2038 return size; 2039} 2040 2041static ssize_t store_int_with_restart(struct device *s, 2042 struct device_attribute *attr, 2043 const char *buf, size_t size) 2044{ 2045 ssize_t ret = device_store_int(s, attr, buf, size); 2046 mce_restart(); 2047 return ret; 2048} 2049 2050static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); 2051static DEVICE_INT_ATTR(tolerant, 0644, tolerant); 2052static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 2053static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 2054 2055static struct dev_ext_attribute dev_attr_check_interval = { 2056 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), 2057 &check_interval 2058}; 2059 2060static struct dev_ext_attribute dev_attr_ignore_ce = { 2061 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), 2062 &mce_ignore_ce 2063}; 2064 2065static struct dev_ext_attribute dev_attr_cmci_disabled = { 2066 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), 2067 &mce_cmci_disabled 2068}; 2069 2070static struct device_attribute *mce_device_attrs[] = { 2071 &dev_attr_tolerant.attr, 2072 &dev_attr_check_interval.attr, 2073 &dev_attr_trigger, 2074 &dev_attr_monarch_timeout.attr, 2075 &dev_attr_dont_log_ce.attr, 2076 &dev_attr_ignore_ce.attr, 2077 &dev_attr_cmci_disabled.attr, 2078 NULL 2079}; 2080 2081static cpumask_var_t mce_device_initialized; 2082 2083static void mce_device_release(struct device *dev) 2084{ 2085 kfree(dev); 2086} 2087 2088/* Per cpu device init. 
All of the cpus still share the same ctrl bank: */ 2089static __cpuinit int mce_device_create(unsigned int cpu) 2090{ 2091 struct device *dev; 2092 int err; 2093 int i, j; 2094 2095 if (!mce_available(&boot_cpu_data)) 2096 return -EIO; 2097 2098 dev = kzalloc(sizeof *dev, GFP_KERNEL); 2099 if (!dev) 2100 return -ENOMEM; 2101 dev->id = cpu; 2102 dev->bus = &mce_subsys; 2103 dev->release = &mce_device_release; 2104 2105 err = device_register(dev); 2106 if (err) 2107 return err; 2108 2109 for (i = 0; mce_device_attrs[i]; i++) { 2110 err = device_create_file(dev, mce_device_attrs[i]); 2111 if (err) 2112 goto error; 2113 } 2114 for (j = 0; j < banks; j++) { 2115 err = device_create_file(dev, &mce_banks[j].attr); 2116 if (err) 2117 goto error2; 2118 } 2119 cpumask_set_cpu(cpu, mce_device_initialized); 2120 per_cpu(mce_device, cpu) = dev; 2121 2122 return 0; 2123error2: 2124 while (--j >= 0) 2125 device_remove_file(dev, &mce_banks[j].attr); 2126error: 2127 while (--i >= 0) 2128 device_remove_file(dev, mce_device_attrs[i]); 2129 2130 device_unregister(dev); 2131 2132 return err; 2133} 2134 2135static __cpuinit void mce_device_remove(unsigned int cpu) 2136{ 2137 struct device *dev = per_cpu(mce_device, cpu); 2138 int i; 2139 2140 if (!cpumask_test_cpu(cpu, mce_device_initialized)) 2141 return; 2142 2143 for (i = 0; mce_device_attrs[i]; i++) 2144 device_remove_file(dev, mce_device_attrs[i]); 2145 2146 for (i = 0; i < banks; i++) 2147 device_remove_file(dev, &mce_banks[i].attr); 2148 2149 device_unregister(dev); 2150 cpumask_clear_cpu(cpu, mce_device_initialized); 2151 per_cpu(mce_device, cpu) = NULL; 2152} 2153 2154/* Make sure there are no machine checks on offlined CPUs. 
*/ 2155static void __cpuinit mce_disable_cpu(void *h) 2156{ 2157 unsigned long action = *(unsigned long *)h; 2158 int i; 2159 2160 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2161 return; 2162 2163 if (!(action & CPU_TASKS_FROZEN)) 2164 cmci_clear(); 2165 for (i = 0; i < banks; i++) { 2166 struct mce_bank *b = &mce_banks[i]; 2167 2168 if (b->init) 2169 wrmsrl(MSR_IA32_MCx_CTL(i), 0); 2170 } 2171} 2172 2173static void __cpuinit mce_reenable_cpu(void *h) 2174{ 2175 unsigned long action = *(unsigned long *)h; 2176 int i; 2177 2178 if (!mce_available(__this_cpu_ptr(&cpu_info))) 2179 return; 2180 2181 if (!(action & CPU_TASKS_FROZEN)) 2182 cmci_reenable(); 2183 for (i = 0; i < banks; i++) { 2184 struct mce_bank *b = &mce_banks[i]; 2185 2186 if (b->init) 2187 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); 2188 } 2189} 2190 2191/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 2192static int __cpuinit 2193mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 2194{ 2195 unsigned int cpu = (unsigned long)hcpu; 2196 struct timer_list *t = &per_cpu(mce_timer, cpu); 2197 2198 switch (action) { 2199 case CPU_ONLINE: 2200 case CPU_ONLINE_FROZEN: 2201 mce_device_create(cpu); 2202 if (threshold_cpu_callback) 2203 threshold_cpu_callback(action, cpu); 2204 break; 2205 case CPU_DEAD: 2206 case CPU_DEAD_FROZEN: 2207 if (threshold_cpu_callback) 2208 threshold_cpu_callback(action, cpu); 2209 mce_device_remove(cpu); 2210 break; 2211 case CPU_DOWN_PREPARE: 2212 case CPU_DOWN_PREPARE_FROZEN: 2213 del_timer_sync(t); 2214 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2215 break; 2216 case CPU_DOWN_FAILED: 2217 case CPU_DOWN_FAILED_FROZEN: 2218 if (!mce_ignore_ce && check_interval) { 2219 t->expires = round_jiffies(jiffies + 2220 __get_cpu_var(mce_next_interval)); 2221 add_timer_on(t, cpu); 2222 } 2223 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2224 break; 2225 case CPU_POST_DEAD: 2226 /* intentionally ignoring frozen here */ 
2227 cmci_rediscover(cpu); 2228 break; 2229 } 2230 return NOTIFY_OK; 2231} 2232 2233static struct notifier_block mce_cpu_notifier __cpuinitdata = { 2234 .notifier_call = mce_cpu_callback, 2235}; 2236 2237static __init void mce_init_banks(void) 2238{ 2239 int i; 2240 2241 for (i = 0; i < banks; i++) { 2242 struct mce_bank *b = &mce_banks[i]; 2243 struct device_attribute *a = &b->attr; 2244 2245 sysfs_attr_init(&a->attr); 2246 a->attr.name = b->attrname; 2247 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2248 2249 a->attr.mode = 0644; 2250 a->show = show_bank; 2251 a->store = set_bank; 2252 } 2253} 2254 2255static __init int mcheck_init_device(void) 2256{ 2257 int err; 2258 int i = 0; 2259 2260 if (!mce_available(&boot_cpu_data)) 2261 return -EIO; 2262 2263 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); 2264 2265 mce_init_banks(); 2266 2267 err = subsys_system_register(&mce_subsys, NULL); 2268 if (err) 2269 return err; 2270 2271 for_each_online_cpu(i) { 2272 err = mce_device_create(i); 2273 if (err) 2274 return err; 2275 } 2276 2277 register_syscore_ops(&mce_syscore_ops); 2278 register_hotcpu_notifier(&mce_cpu_notifier); 2279 2280 /* register character device /dev/mcelog */ 2281 misc_register(&mce_chrdev_device); 2282 2283 return err; 2284} 2285device_initcall(mcheck_init_device); 2286 2287/* 2288 * Old style boot options parsing. Only for compatibility. 
2289 */ 2290static int __init mcheck_disable(char *str) 2291{ 2292 mce_disabled = 1; 2293 return 1; 2294} 2295__setup("nomce", mcheck_disable); 2296 2297#ifdef CONFIG_DEBUG_FS 2298struct dentry *mce_get_debugfs_dir(void) 2299{ 2300 static struct dentry *dmce; 2301 2302 if (!dmce) 2303 dmce = debugfs_create_dir("mce", NULL); 2304 2305 return dmce; 2306} 2307 2308static void mce_reset(void) 2309{ 2310 cpu_missing = 0; 2311 atomic_set(&mce_fake_paniced, 0); 2312 atomic_set(&mce_executing, 0); 2313 atomic_set(&mce_callin, 0); 2314 atomic_set(&global_nwo, 0); 2315} 2316 2317static int fake_panic_get(void *data, u64 *val) 2318{ 2319 *val = fake_panic; 2320 return 0; 2321} 2322 2323static int fake_panic_set(void *data, u64 val) 2324{ 2325 mce_reset(); 2326 fake_panic = val; 2327 return 0; 2328} 2329 2330DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2331 fake_panic_set, "%llu\n"); 2332 2333static int __init mcheck_debugfs_init(void) 2334{ 2335 struct dentry *dmce, *ffake_panic; 2336 2337 dmce = mce_get_debugfs_dir(); 2338 if (!dmce) 2339 return -ENOMEM; 2340 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, 2341 &fake_panic_fops); 2342 if (!ffake_panic) 2343 return -ENOMEM; 2344 2345 return 0; 2346} 2347late_initcall(mcheck_debugfs_init); 2348#endif 2349