mce.c revision f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/ctype.h>
25#include <linux/sched.h>
26#include <linux/sysfs.h>
27#include <linux/types.h>
28#include <linux/init.h>
29#include <linux/kmod.h>
30#include <linux/poll.h>
31#include <linux/cpu.h>
32#include <linux/smp.h>
33#include <linux/fs.h>
34
35#include <asm/processor.h>
36#include <asm/hw_irq.h>
37#include <asm/apic.h>
38#include <asm/idle.h>
39#include <asm/ipi.h>
40#include <asm/mce.h>
41#include <asm/msr.h>
42
43#include "mce-internal.h"
44#include "mce.h"
45
46/* Handle unconfigured int18 (should never happen) */
47static void unexpected_machine_check(struct pt_regs *regs, long error_code)
48{
49	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
50	       smp_processor_id());
51}
52
53/* Call the installed machine check handler for this CPU setup. */
54void (*machine_check_vector)(struct pt_regs *, long error_code) =
55						unexpected_machine_check;
56
57int				mce_disabled;
58
59#ifdef CONFIG_X86_NEW_MCE
60
61#define MISC_MCELOG_MINOR	227
62
63atomic_t mce_entry;
64
65DEFINE_PER_CPU(unsigned, mce_exception_count);
66
67/*
68 * Tolerant levels:
69 *   0: always panic on uncorrected errors, log corrected errors
70 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
71 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
72 *   3: never panic or SIGBUS, log all errors (for testing only)
73 */
74static int			tolerant = 1;
75static int			banks;
76static u64			*bank;
77static unsigned long		notify_user;
78static int			rip_msr;
79static int			mce_bootlog = -1;
80
81static char			trigger[128];
82static char			*trigger_argv[2] = { trigger, NULL };
83
84static unsigned long		dont_init_banks;
85
86static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
87
88/* MCA banks polled by the periodic polling timer for corrected events */
89DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
90	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
91};
92
93static inline int skip_bank_init(int i)
94{
95	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
96}
97
98/* Do basic initialization of a struct mce */
99void mce_setup(struct mce *m)
100{
101	memset(m, 0, sizeof(struct mce));
102	m->cpu = m->extcpu = smp_processor_id();
103	rdtscll(m->tsc);
104	/* We hope get_seconds stays lockless */
105	m->time = get_seconds();
106	m->cpuvendor = boot_cpu_data.x86_vendor;
107	m->cpuid = cpuid_eax(1);
108#ifdef CONFIG_SMP
109	m->socketid = cpu_data(m->extcpu).phys_proc_id;
110#endif
111	m->apicid = cpu_data(m->extcpu).initial_apicid;
112	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
113}
114
115DEFINE_PER_CPU(struct mce, injectm);
116EXPORT_PER_CPU_SYMBOL_GPL(injectm);
117
118/*
119 * Lockless MCE logging infrastructure.
120 * This avoids deadlocks on printk locks without having to break locks. It
121 * also keeps MCEs separate from normal kernel messages to avoid bogus bug reports.
122 */
123
124static struct mce_log mcelog = {
125	.signature	= MCE_LOG_SIGNATURE,
126	.len		= MCE_LOG_LEN,
127	.recordlen	= sizeof(struct mce),
128};
129
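/*
 * mce_log() reserves a slot in mcelog.entry[] by advancing mcelog.next
 * with cmpxchg, copies the record in, and only then sets the per-entry
 * "finished" flag; readers such as mce_read() treat a slot as valid only
 * once "finished" is set.  No locks are taken, so this is safe to call
 * from the machine check exception itself.  When the buffer is full, new
 * records are dropped and MCE_OVERFLOW is set in mcelog.flags instead.
 */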
130void mce_log(struct mce *mce)
131{
132	unsigned next, entry;
133
134	mce->finished = 0;
135	wmb();
136	for (;;) {
137		entry = rcu_dereference(mcelog.next);
138		for (;;) {
139			/*
140			 * When the buffer fills up, discard new entries.
141			 * Assume that the earlier errors are the more
142			 * interesting ones:
143			 */
144			if (entry >= MCE_LOG_LEN) {
145				set_bit(MCE_OVERFLOW,
146					(unsigned long *)&mcelog.flags);
147				return;
148			}
149			/* Old left over entry. Skip: */
150			if (mcelog.entry[entry].finished) {
151				entry++;
152				continue;
153			}
154			break;
155		}
156		smp_rmb();
157		next = entry + 1;
158		if (cmpxchg(&mcelog.next, entry, next) == entry)
159			break;
160	}
161	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
162	wmb();
163	mcelog.entry[entry].finished = 1;
164	wmb();
165
166	mce->finished = 1;
167	set_bit(0, &notify_user);
168}
169
170static void print_mce(struct mce *m)
171{
172	printk(KERN_EMERG "\n"
173	       KERN_EMERG "HARDWARE ERROR\n"
174	       KERN_EMERG
175	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
176	       m->extcpu, m->mcgstatus, m->bank, m->status);
177	if (m->ip) {
178		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
179		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
180		       m->cs, m->ip);
181		if (m->cs == __KERNEL_CS)
182			print_symbol("{%s}", m->ip);
183		printk("\n");
184	}
185	printk(KERN_EMERG "TSC %llx ", m->tsc);
186	if (m->addr)
187		printk("ADDR %llx ", m->addr);
188	if (m->misc)
189		printk("MISC %llx ", m->misc);
190	printk("\n");
191	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
192			m->cpuvendor, m->cpuid, m->time, m->socketid,
193			m->apicid);
194	printk(KERN_EMERG "This is not a software problem!\n");
195	printk(KERN_EMERG "Run through mcelog --ascii to decode "
196	       "and contact your hardware vendor\n");
197}
198
199#define PANIC_TIMEOUT 5 /* 5 seconds */
200
201static atomic_t mce_paniced;
202
203/* Panic in progress. Enable interrupts and wait for final IPI */
204static void wait_for_panic(void)
205{
206	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
207	preempt_disable();
208	local_irq_enable();
209	while (timeout-- > 0)
210		udelay(1);
211	panic("Panicking machine check CPU died");
212}
213
214static void mce_panic(char *msg, struct mce *final, char *exp)
215{
216	int i;
217
218	/*
219	 * Make sure only one CPU runs in machine check panic
220	 */
221	if (atomic_add_return(1, &mce_paniced) > 1)
222		wait_for_panic();
223	barrier();
224
225	bust_spinlocks(1);
226	console_verbose();
227	/* First print corrected ones that are still unlogged */
228	for (i = 0; i < MCE_LOG_LEN; i++) {
229		struct mce *m = &mcelog.entry[i];
230		if ((m->status & MCI_STATUS_VAL) &&
231			!(m->status & MCI_STATUS_UC))
232			print_mce(m);
233	}
234	/* Now print uncorrected but with the final one last */
235	for (i = 0; i < MCE_LOG_LEN; i++) {
236		struct mce *m = &mcelog.entry[i];
237		if (!(m->status & MCI_STATUS_VAL))
238			continue;
239		if (!final || memcmp(m, final, sizeof(struct mce)))
240			print_mce(m);
241	}
242	if (final)
243		print_mce(final);
244	if (exp)
245		printk(KERN_EMERG "Machine check: %s\n", exp);
246	panic(msg);
247}
248
249/* Support code for software error injection */
250
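/*
 * Each MCA bank has four architectural MSRs (CTL, STATUS, ADDR, MISC) at
 * MSR_IA32_MC0_CTL + 4*bank upwards, hence the stride of 4 in the bank
 * MSR arithmetic below.
 *
 * For software error injection the injector fills the per-CPU "injectm"
 * record (exported above) and sets injectm.finished.  While that flag is
 * set, mce_rdmsrl()/mce_wrmsrl() redirect accesses to the bank STATUS/
 * ADDR/MISC MSRs and MCG_STATUS into the matching fields of injectm, so
 * the normal handler paths can be exercised without real hardware errors.
 * msr_to_offset() provides the MSR number to field offset translation;
 * unknown MSRs read back as 0.
 */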
251static int msr_to_offset(u32 msr)
252{
253	unsigned bank = __get_cpu_var(injectm.bank);
254	if (msr == rip_msr)
255		return offsetof(struct mce, ip);
256	if (msr == MSR_IA32_MC0_STATUS + bank*4)
257		return offsetof(struct mce, status);
258	if (msr == MSR_IA32_MC0_ADDR + bank*4)
259		return offsetof(struct mce, addr);
260	if (msr == MSR_IA32_MC0_MISC + bank*4)
261		return offsetof(struct mce, misc);
262	if (msr == MSR_IA32_MCG_STATUS)
263		return offsetof(struct mce, mcgstatus);
264	return -1;
265}
266
267/* MSR access wrappers used for error injection */
268static u64 mce_rdmsrl(u32 msr)
269{
270	u64 v;
271	if (__get_cpu_var(injectm).finished) {
272		int offset = msr_to_offset(msr);
273		if (offset < 0)
274			return 0;
275		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
276	}
277	rdmsrl(msr, v);
278	return v;
279}
280
281static void mce_wrmsrl(u32 msr, u64 v)
282{
283	if (__get_cpu_var(injectm).finished) {
284		int offset = msr_to_offset(msr);
285		if (offset >= 0)
286			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
287		return;
288	}
289	wrmsrl(msr, v);
290}
291
292int mce_available(struct cpuinfo_x86 *c)
293{
294	if (mce_disabled)
295		return 0;
296	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
297}
298
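/*
 * MCG_STATUS.RIPV means the RIP pushed by the exception frame is valid
 * for restarting execution; MCG_STATUS.EIPV means that RIP is directly
 * associated with the error.  mce_get_rip() therefore records the stack
 * RIP only when RIPV is set, and prefers the dedicated RIP MSR (rip_msr)
 * when the CPU provides one.
 */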
299static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
300{
301	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
302		m->ip = regs->ip;
303		m->cs = regs->cs;
304	} else {
305		m->ip = 0;
306		m->cs = 0;
307	}
308	if (rip_msr) {
309		/* Assume the RIP in the MSR is exact. Is this true? */
310		m->mcgstatus |= MCG_STATUS_EIPV;
311		m->ip = mce_rdmsrl(rip_msr);
312		m->cs = 0;
313	}
314}
315
316#ifdef CONFIG_X86_LOCAL_APIC
317/*
318 * Called after interrupts have been reenabled again
319 * when an MCE happened during an interrupts-off region
320 * in the kernel.
321 */
322asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
323{
324	ack_APIC_irq();
325	exit_idle();
326	irq_enter();
327	mce_notify_user();
328	irq_exit();
329}
330#endif
331
332static void mce_report_event(struct pt_regs *regs)
333{
334	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
335		mce_notify_user();
336		return;
337	}
338
339#ifdef CONFIG_X86_LOCAL_APIC
340	/*
341	 * Without APIC do not notify. The event will be picked
342	 * up eventually.
343	 */
344	if (!cpu_has_apic)
345		return;
346
347	/*
348	 * When interrupts are disabled, we cannot use kernel
349	 * services safely. Trigger a self-interrupt through the
350	 * APIC instead, so the notification happens after
351	 * interrupts have been reenabled again.
352	 */
353	apic->send_IPI_self(MCE_SELF_VECTOR);
354
355	/*
356	 * Wait for the ICR to become idle again afterwards, so that we
357	 * don't leave the APIC in a non-idle state: the normal APIC
358	 * writes cannot exclude us.
359	 */
360	apic_wait_icr_idle();
361#endif
362}
363
364DEFINE_PER_CPU(unsigned, mce_poll_count);
365
366/*
367 * Poll for corrected events or events that happened before reset.
368 * Those are just logged through /dev/mcelog.
369 *
370 * This is executed in standard interrupt context.
371 */
372void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
373{
374	struct mce m;
375	int i;
376
377	__get_cpu_var(mce_poll_count)++;
378
379	mce_setup(&m);
380
381	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
382	for (i = 0; i < banks; i++) {
383		if (!bank[i] || !test_bit(i, *b))
384			continue;
385
386		m.misc = 0;
387		m.addr = 0;
388		m.bank = i;
389		m.tsc = 0;
390
391		barrier();
392		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
393		if (!(m.status & MCI_STATUS_VAL))
394			continue;
395
396		/*
397		 * Uncorrected events are handled by the exception handler
398		 * when it is enabled. But when the exception is disabled log
399		 * everything.
400		 *
401		 * TBD do the same check for MCI_STATUS_EN here?
402		 */
403		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
404			continue;
405
406		if (m.status & MCI_STATUS_MISCV)
407			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
408		if (m.status & MCI_STATUS_ADDRV)
409			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
410
411		if (!(flags & MCP_TIMESTAMP))
412			m.tsc = 0;
413		/*
414		 * Don't get the IP here because it's unlikely to
415		 * have anything to do with the actual error location.
416		 */
417		if (!(flags & MCP_DONTLOG)) {
418			mce_log(&m);
419			add_taint(TAINT_MACHINE_CHECK);
420		}
421
422		/*
423		 * Clear state for this bank.
424		 */
425		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
426	}
427
428	/*
429	 * Don't clear MCG_STATUS here because it's only defined for
430	 * exceptions.
431	 */
432
433	sync_core();
434}
435EXPORT_SYMBOL_GPL(machine_check_poll);
436
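/*
 * Each bank status read here is graded by mce_severity() (from
 * mce-internal.h) against the current tolerant setting; if any bank is
 * rated MCE_PANIC_SEVERITY or worse, do_machine_check() panics before
 * clearing any bank state, so the evidence stays in the registers.
 */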
437/*
438 * Do a quick check if any of the events requires a panic.
439 * This decides if we keep the events around or clear them.
440 */
441static int mce_no_way_out(struct mce *m, char **msg)
442{
443	int i;
444
445	for (i = 0; i < banks; i++) {
446		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
447		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
448			return 1;
449	}
450	return 0;
451}
452
453/*
454 * The actual machine check handler. This only handles real
455 * exceptions when something got corrupted coming in through int 18.
456 *
457 * This is executed in NMI context not subject to normal locking rules. This
458 * implies that most kernel services cannot be safely used. Don't even
459 * think about putting a printk in there!
460 */
461void do_machine_check(struct pt_regs *regs, long error_code)
462{
463	struct mce m, panicm;
464	int panicm_found = 0;
465	int i;
466	/*
467	 * If no_way_out gets set, there is no safe way to recover from this
468	 * MCE.  If tolerant is cranked up, we'll try anyway.
469	 */
470	int no_way_out = 0;
471	/*
472	 * If kill_it gets set, there might be a way to recover from this
473	 * error.
474	 */
475	int kill_it = 0;
476	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
477	char *msg = "Unknown";
478
479	atomic_inc(&mce_entry);
480
481	__get_cpu_var(mce_exception_count)++;
482
483	if (notify_die(DIE_NMI, "machine check", regs, error_code,
484			   18, SIGKILL) == NOTIFY_STOP)
485		goto out;
486	if (!banks)
487		goto out;
488
489	mce_setup(&m);
490
491	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
492	no_way_out = mce_no_way_out(&m, &msg);
493
494	barrier();
495
496	for (i = 0; i < banks; i++) {
497		__clear_bit(i, toclear);
498		if (!bank[i])
499			continue;
500
501		m.misc = 0;
502		m.addr = 0;
503		m.bank = i;
504
505		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
506		if ((m.status & MCI_STATUS_VAL) == 0)
507			continue;
508
509		/*
510		 * Non-uncorrected (i.e. corrected) errors are handled by
511		 * machine_check_poll(). Leave them alone, unless this panics.
512		 */
513		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
514			continue;
515
516		/*
517		 * Set taint even when machine check was not enabled.
518		 */
519		add_taint(TAINT_MACHINE_CHECK);
520
521		__set_bit(i, toclear);
522
523		if (m.status & MCI_STATUS_EN) {
524			/*
525			 * If this error was uncorrectable and there was
526			 * an overflow, we're in trouble.  If no overflow,
527			 * we might get away with just killing a task.
528			 */
529			if (m.status & MCI_STATUS_UC)
530				kill_it = 1;
531		} else {
532			/*
533			 * Machine check event was not enabled. Clear, but
534			 * ignore.
535			 */
536			continue;
537		}
538
539		if (m.status & MCI_STATUS_MISCV)
540			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
541		if (m.status & MCI_STATUS_ADDRV)
542			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
543
544		mce_get_rip(&m, regs);
545		mce_log(&m);
546
547		/*
548		 * Did this bank cause the exception?
549		 *
550		 * Assume that the bank with uncorrectable errors did it,
551		 * and that there is only a single one:
552		 */
553		if ((m.status & MCI_STATUS_UC) &&
554					(m.status & MCI_STATUS_EN)) {
555			panicm = m;
556			panicm_found = 1;
557		}
558	}
559
560	/*
561	 * If we didn't find an uncorrectable error, pick
562	 * the last one (shouldn't happen, just being safe).
563	 */
564	if (!panicm_found)
565		panicm = m;
566
567	/*
568	 * If we have decided that we just CAN'T continue, and the user
569	 * has not set tolerant to an insane level, give up and die.
570	 */
571	if (no_way_out && tolerant < 3)
572		mce_panic("Machine check", &panicm, msg);
573
574	/*
575	 * If the error seems to be unrecoverable, something should be
576	 * done.  Try to kill as little as possible.  If we can kill just
577	 * one task, do that.  If the user has set the tolerance very
578	 * high, don't try to do anything at all.
579	 */
580	if (kill_it && tolerant < 3) {
581		int user_space = 0;
582
583		/*
584		 * If the EIPV bit is set, it means the saved IP is the
585		 * instruction which caused the MCE.
586		 */
587		if (m.mcgstatus & MCG_STATUS_EIPV)
588			user_space = panicm.ip && (panicm.cs & 3);
589
590		/*
591		 * If we know that the error was in user space, send a
592		 * SIGBUS.  Otherwise, panic if tolerance is low.
593		 *
594		 * force_sig() takes an awful lot of locks and has a slight
595		 * risk of deadlocking.
596		 */
597		if (user_space) {
598			force_sig(SIGBUS, current);
599		} else if (panic_on_oops || tolerant < 2) {
600			mce_panic("Uncorrected machine check", &panicm, msg);
601		}
602	}
603
604	/* notify userspace ASAP */
605	set_thread_flag(TIF_MCE_NOTIFY);
606
607	mce_report_event(regs);
608
609	/* the last thing we do is clear state */
610	for (i = 0; i < banks; i++) {
611		if (test_bit(i, toclear))
612			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
613	}
614	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
615out:
616	atomic_dec(&mce_entry);
617	sync_core();
618}
619EXPORT_SYMBOL_GPL(do_machine_check);
620
621#ifdef CONFIG_X86_MCE_INTEL
622/**
623 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
624 * @cpu: The CPU on which the event occurred.
625 * @status: Event status information
626 *
627 * This function should be called by the thermal interrupt after the
628 * event has been processed and the decision was made to log the event
629 * further.
630 *
631 * The status parameter will be saved to the 'status' field of 'struct mce'
632 * and historically has been the register value of the
633 * MSR_IA32_THERMAL_STATUS (Intel) msr.
634 */
635void mce_log_therm_throt_event(__u64 status)
636{
637	struct mce m;
638
639	mce_setup(&m);
640	m.bank = MCE_THERMAL_BANK;
641	m.status = status;
642	mce_log(&m);
643}
644#endif /* CONFIG_X86_MCE_INTEL */
645
646/*
647 * Periodic polling timer for "silent" machine check errors.  If the
648 * poller finds an MCE, poll 2x faster.  When the poller finds no more
649 * errors, poll 2x slower (up to check_interval seconds).
650 */
651static int check_interval = 5 * 60; /* 5 minutes */
652
653static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
654static DEFINE_PER_CPU(struct timer_list, mce_timer);
655
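/*
 * The poll interval adapts to the error rate: it starts at check_interval
 * (5 minutes by default), is halved whenever mce_notify_user() reports
 * freshly logged events, bounded below by HZ/100 (10ms assuming HZ=1000),
 * and doubles again on quiet polls until it is back at check_interval.
 */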
656static void mcheck_timer(unsigned long data)
657{
658	struct timer_list *t = &per_cpu(mce_timer, data);
659	int *n;
660
661	WARN_ON(smp_processor_id() != data);
662
663	if (mce_available(&current_cpu_data)) {
664		machine_check_poll(MCP_TIMESTAMP,
665				&__get_cpu_var(mce_poll_banks));
666	}
667
668	/*
669	 * Alert userspace if needed.  If we logged an MCE, reduce the
670	 * polling interval, otherwise increase the polling interval.
671	 */
672	n = &__get_cpu_var(next_interval);
673	if (mce_notify_user())
674		*n = max(*n/2, HZ/100);
675	else
676		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
677
678	t->expires = jiffies + *n;
679	add_timer(t);
680}
681
682static void mce_do_trigger(struct work_struct *work)
683{
684	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
685}
686
687static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
688
689/*
690 * Notify the user(s) about new machine check events.
691 * Can be called from interrupt context, but not from machine check/NMI
692 * context.
693 */
694int mce_notify_user(void)
695{
696	/* Not more than two messages every minute */
697	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
698
699	clear_thread_flag(TIF_MCE_NOTIFY);
700
701	if (test_and_clear_bit(0, &notify_user)) {
702		wake_up_interruptible(&mce_wait);
703
704		/*
705		 * There is no risk of missing notifications because
706		 * work_pending is always cleared before the function is
707		 * executed.
708		 */
709		if (trigger[0] && !work_pending(&mce_trigger_work))
710			schedule_work(&mce_trigger_work);
711
712		if (__ratelimit(&ratelimit))
713			printk(KERN_INFO "Machine check events logged\n");
714
715		return 1;
716	}
717	return 0;
718}
719EXPORT_SYMBOL_GPL(mce_notify_user);
720
721/*
722 * Initialize Machine Checks for a CPU.
723 */
724static int mce_cap_init(void)
725{
726	unsigned b;
727	u64 cap;
728
729	rdmsrl(MSR_IA32_MCG_CAP, cap);
730
731	b = cap & MCG_BANKCNT_MASK;
732	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
733
734	if (b > MAX_NR_BANKS) {
735		printk(KERN_WARNING
736		       "MCE: Using only %u machine check banks out of %u\n",
737			MAX_NR_BANKS, b);
738		b = MAX_NR_BANKS;
739	}
740
741	/* Don't support asymmetric configurations today */
742	WARN_ON(banks != 0 && b != banks);
743	banks = b;
744	if (!bank) {
745		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
746		if (!bank)
747			return -ENOMEM;
748		memset(bank, 0xff, banks * sizeof(u64));
749	}
750
751	/* Use accurate RIP reporting if available. */
752	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
753		rip_msr = MSR_IA32_MCG_EIP;
754
755	return 0;
756}
757
758static void mce_init(void)
759{
760	mce_banks_t all_banks;
761	u64 cap;
762	int i;
763
764	/*
765	 * Log the machine checks left over from the previous reset.
766	 */
767	bitmap_fill(all_banks, MAX_NR_BANKS);
768	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
769
770	set_in_cr4(X86_CR4_MCE);
771
772	rdmsrl(MSR_IA32_MCG_CAP, cap);
773	if (cap & MCG_CTL_P)
774		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
775
776	for (i = 0; i < banks; i++) {
777		if (skip_bank_init(i))
778			continue;
779		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
780		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
781	}
782}
783
784/* Add per CPU specific workarounds here */
785static void mce_cpu_quirks(struct cpuinfo_x86 *c)
786{
787	/* This should be disabled by the BIOS, but isn't always */
788	if (c->x86_vendor == X86_VENDOR_AMD) {
789		if (c->x86 == 15 && banks > 4) {
790			/*
791			 * disable GART TBL walk error reporting, which
792			 * trips off incorrectly with the IOMMU & 3ware
793			 * & Cerberus:
794			 */
795			clear_bit(10, (unsigned long *)&bank[4]);
796		}
797		if (c->x86 <= 17 && mce_bootlog < 0) {
798			/*
799			 * Lots of broken BIOSes around that don't clear them
800			 * by default and leave crap in there. Don't log:
801			 */
802			mce_bootlog = 0;
803		}
804		/*
805		 * There are various K7s around with a broken bank 0. Always
806		 * disable it by default.
807		 */
808		if (c->x86 == 6)
809			bank[0] = 0;
810	}
811
812	if (c->x86_vendor == X86_VENDOR_INTEL) {
813		/*
814		 * The SDM documents that on family 6, bank 0 should not be
815		 * written because it aliases to another special BIOS-controlled
816		 * register.
817		 * It is not aliased anymore on model 0x1a+, though.
818		 * Don't ignore bank 0 completely because there could be a
819		 * valid event later; merely don't write CTL0.
820		 */
821
822		if (c->x86 == 6 && c->x86_model < 0x1A)
823			__set_bit(0, &dont_init_banks);
824	}
825}
826
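/*
 * Family 5 parts (original Pentium, Centaur WinChip) predate the MCA
 * bank architecture; they only provide the simple P5-style machine
 * check, so they get their own vendor init instead of the generic
 * bank setup done by mce_init().
 */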
827static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
828{
829	if (c->x86 != 5)
830		return;
831	switch (c->x86_vendor) {
832	case X86_VENDOR_INTEL:
833		if (mce_p5_enabled())
834			intel_p5_mcheck_init(c);
835		break;
836	case X86_VENDOR_CENTAUR:
837		winchip_mcheck_init(c);
838		break;
839	}
840}
841
842static void mce_cpu_features(struct cpuinfo_x86 *c)
843{
844	switch (c->x86_vendor) {
845	case X86_VENDOR_INTEL:
846		mce_intel_feature_init(c);
847		break;
848	case X86_VENDOR_AMD:
849		mce_amd_feature_init(c);
850		break;
851	default:
852		break;
853	}
854}
855
856static void mce_init_timer(void)
857{
858	struct timer_list *t = &__get_cpu_var(mce_timer);
859	int *n = &__get_cpu_var(next_interval);
860
861	*n = check_interval * HZ;
862	if (!*n)
863		return;
864	setup_timer(t, mcheck_timer, smp_processor_id());
865	t->expires = round_jiffies(jiffies + *n);
866	add_timer(t);
867}
868
869/*
870 * Called for each booted CPU to set up machine checks.
871 * Must be called with preempt off:
872 */
873void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
874{
875	if (mce_disabled)
876		return;
877
878	mce_ancient_init(c);
879
880	if (!mce_available(c))
881		return;
882
883	if (mce_cap_init() < 0) {
884		mce_disabled = 1;
885		return;
886	}
887	mce_cpu_quirks(c);
888
889	machine_check_vector = do_machine_check;
890
891	mce_init();
892	mce_cpu_features(c);
893	mce_init_timer();
894}
895
896/*
897 * Character device to read and clear the MCE log.
898 */
899
900static DEFINE_SPINLOCK(mce_state_lock);
901static int		open_count;		/* #times opened */
902static int		open_exclu;		/* already open exclusive? */
903
904static int mce_open(struct inode *inode, struct file *file)
905{
906	spin_lock(&mce_state_lock);
907
908	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
909		spin_unlock(&mce_state_lock);
910
911		return -EBUSY;
912	}
913
914	if (file->f_flags & O_EXCL)
915		open_exclu = 1;
916	open_count++;
917
918	spin_unlock(&mce_state_lock);
919
920	return nonseekable_open(inode, file);
921}
922
923static int mce_release(struct inode *inode, struct file *file)
924{
925	spin_lock(&mce_state_lock);
926
927	open_count--;
928	open_exclu = 0;
929
930	spin_unlock(&mce_state_lock);
931
932	return 0;
933}
934
935static void collect_tscs(void *data)
936{
937	unsigned long *cpu_tsc = (unsigned long *)data;
938
939	rdtscll(cpu_tsc[smp_processor_id()]);
940}
941
942static DEFINE_MUTEX(mce_read_mutex);
943
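/*
 * mce_read() drains the log in two passes.  Pass one copies out and
 * clears every entry below mcelog.next, then resets "next" to 0 with
 * cmpxchg, retrying if new records raced in meanwhile.  After a
 * synchronize_sched() the current TSC of every CPU is sampled via
 * collect_tscs(); pass two then copies out any finished entries whose
 * timestamp predates that sample, i.e. records that were still being
 * written while "next" was reset.
 */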
944static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
945			loff_t *off)
946{
947	char __user *buf = ubuf;
948	unsigned long *cpu_tsc;
949	unsigned prev, next;
950	int i, err;
951
952	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
953	if (!cpu_tsc)
954		return -ENOMEM;
955
956	mutex_lock(&mce_read_mutex);
957	next = rcu_dereference(mcelog.next);
958
959	/* Only supports full reads right now */
960	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
961		mutex_unlock(&mce_read_mutex);
962		kfree(cpu_tsc);
963
964		return -EINVAL;
965	}
966
967	err = 0;
968	prev = 0;
969	do {
970		for (i = prev; i < next; i++) {
971			unsigned long start = jiffies;
972
973			while (!mcelog.entry[i].finished) {
974				if (time_after_eq(jiffies, start + 2)) {
975					memset(mcelog.entry + i, 0,
976					       sizeof(struct mce));
977					goto timeout;
978				}
979				cpu_relax();
980			}
981			smp_rmb();
982			err |= copy_to_user(buf, mcelog.entry + i,
983					    sizeof(struct mce));
984			buf += sizeof(struct mce);
985timeout:
986			;
987		}
988
989		memset(mcelog.entry + prev, 0,
990		       (next - prev) * sizeof(struct mce));
991		prev = next;
992		next = cmpxchg(&mcelog.next, prev, 0);
993	} while (next != prev);
994
995	synchronize_sched();
996
997	/*
998	 * Collect entries that were still getting written before the
999	 * synchronize.
1000	 */
1001	on_each_cpu(collect_tscs, cpu_tsc, 1);
1002
1003	for (i = next; i < MCE_LOG_LEN; i++) {
1004		if (mcelog.entry[i].finished &&
1005		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1006			err |= copy_to_user(buf, mcelog.entry+i,
1007					    sizeof(struct mce));
1008			smp_rmb();
1009			buf += sizeof(struct mce);
1010			memset(&mcelog.entry[i], 0, sizeof(struct mce));
1011		}
1012	}
1013	mutex_unlock(&mce_read_mutex);
1014	kfree(cpu_tsc);
1015
1016	return err ? -EFAULT : buf - ubuf;
1017}
1018
1019static unsigned int mce_poll(struct file *file, poll_table *wait)
1020{
1021	poll_wait(file, &mce_wait, wait);
1022	if (rcu_dereference(mcelog.next))
1023		return POLLIN | POLLRDNORM;
1024	return 0;
1025}
1026
1027static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1028{
1029	int __user *p = (int __user *)arg;
1030
1031	if (!capable(CAP_SYS_ADMIN))
1032		return -EPERM;
1033
1034	switch (cmd) {
1035	case MCE_GET_RECORD_LEN:
1036		return put_user(sizeof(struct mce), p);
1037	case MCE_GET_LOG_LEN:
1038		return put_user(MCE_LOG_LEN, p);
1039	case MCE_GETCLEAR_FLAGS: {
1040		unsigned flags;
1041
1042		do {
1043			flags = mcelog.flags;
1044		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1045
1046		return put_user(flags, p);
1047	}
1048	default:
1049		return -ENOTTY;
1050	}
1051}
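
/*
 * Illustrative userspace consumer of this device (roughly what mcelog(8)
 * does): query the record and log sizes with the ioctls above, then read
 * whole log-sized chunks from /dev/mcelog:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recordlen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc(recordlen * loglen);
 *	int n = read(fd, buf, recordlen * loglen);
 *
 * mce_read() only supports full-sized reads, and n / recordlen complete
 * struct mce records end up in buf.
 */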
1052
1053/* Modified in mce-inject.c, so not static or const */
1054struct file_operations mce_chrdev_ops = {
1055	.open			= mce_open,
1056	.release		= mce_release,
1057	.read			= mce_read,
1058	.poll			= mce_poll,
1059	.unlocked_ioctl		= mce_ioctl,
1060};
1061EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1062
1063static struct miscdevice mce_log_device = {
1064	MISC_MCELOG_MINOR,
1065	"mcelog",
1066	&mce_chrdev_ops,
1067};
1068
1069/*
1070 * mce=off disables machine check
1071 * mce=TOLERANCELEVEL (number, see above)
1072 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1073 * mce=nobootlog Don't log MCEs from before booting.
1074 */
1075static int __init mcheck_enable(char *str)
1076{
1077	if (*str == 0)
1078		enable_p5_mce();
1079	if (*str == '=')
1080		str++;
1081	if (!strcmp(str, "off"))
1082		mce_disabled = 1;
1083	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1084		mce_bootlog = (str[0] == 'b');
1085	else if (isdigit(str[0]))
1086		get_option(&str, &tolerant);
1087	else {
1088		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1089		       str);
1090		return 0;
1091	}
1092	return 1;
1093}
1094__setup("mce", mcheck_enable);
1095
1096/*
1097 * Sysfs support
1098 */
1099
1100/*
1101 * Disable machine checks on suspend and shutdown. We can't really handle
1102 * them later.
1103 */
1104static int mce_disable(void)
1105{
1106	int i;
1107
1108	for (i = 0; i < banks; i++) {
1109		if (!skip_bank_init(i))
1110			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1111	}
1112	return 0;
1113}
1114
1115static int mce_suspend(struct sys_device *dev, pm_message_t state)
1116{
1117	return mce_disable();
1118}
1119
1120static int mce_shutdown(struct sys_device *dev)
1121{
1122	return mce_disable();
1123}
1124
1125/*
1126 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1127 * Only one CPU is active at this time, the others get re-added later using
1128 * CPU hotplug:
1129 */
1130static int mce_resume(struct sys_device *dev)
1131{
1132	mce_init();
1133	mce_cpu_features(&current_cpu_data);
1134
1135	return 0;
1136}
1137
1138static void mce_cpu_restart(void *data)
1139{
1140	del_timer_sync(&__get_cpu_var(mce_timer));
1141	if (mce_available(&current_cpu_data))
1142		mce_init();
1143	mce_init_timer();
1144}
1145
1146/* Reinit MCEs after user configuration changes */
1147static void mce_restart(void)
1148{
1149	on_each_cpu(mce_cpu_restart, NULL, 1);
1150}
1151
1152static struct sysdev_class mce_sysclass = {
1153	.suspend	= mce_suspend,
1154	.shutdown	= mce_shutdown,
1155	.resume		= mce_resume,
1156	.name		= "machinecheck",
1157};
1158
1159DEFINE_PER_CPU(struct sys_device, mce_dev);
1160
1161__cpuinitdata
1162void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1163
1164static struct sysdev_attribute *bank_attrs;
1165
1166static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1167			 char *buf)
1168{
1169	u64 b = bank[attr - bank_attrs];
1170
1171	return sprintf(buf, "%llx\n", b);
1172}
1173
1174static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1175			const char *buf, size_t size)
1176{
1177	u64 new;
1178
1179	if (strict_strtoull(buf, 0, &new) < 0)
1180		return -EINVAL;
1181
1182	bank[attr - bank_attrs] = new;
1183	mce_restart();
1184
1185	return size;
1186}
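
/*
 * With the sysdev class registered below these attributes show up as e.g.
 * /sys/devices/system/machinecheck/machinecheck0/bank4 (path assuming the
 * default sysdev layout).  Writing a new 64-bit mask there updates bank[4]
 * and triggers mce_restart(), which reprograms MC4_CTL on every CPU, so
 * individual error types can be masked off at runtime.
 */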
1187
1188static ssize_t
1189show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1190{
1191	strcpy(buf, trigger);
1192	strcat(buf, "\n");
1193	return strlen(trigger) + 1;
1194}
1195
1196static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1197				const char *buf, size_t siz)
1198{
1199	char *p;
1200	int len;
1201
1202	strncpy(trigger, buf, sizeof(trigger));
1203	trigger[sizeof(trigger)-1] = 0;
1204	len = strlen(trigger);
1205	p = strchr(trigger, '\n');
1206
1207	if (p)
1208		*p = 0;
1209
1210	return len;
1211}
1212
1213static ssize_t store_int_with_restart(struct sys_device *s,
1214				      struct sysdev_attribute *attr,
1215				      const char *buf, size_t size)
1216{
1217	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1218	mce_restart();
1219	return ret;
1220}
1221
1222static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1223static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1224
1225static struct sysdev_ext_attribute attr_check_interval = {
1226	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1227		     store_int_with_restart),
1228	&check_interval
1229};
1230
1231static struct sysdev_attribute *mce_attrs[] = {
1232	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1233	NULL
1234};
1235
1236static cpumask_var_t mce_dev_initialized;
1237
1238/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1239static __cpuinit int mce_create_device(unsigned int cpu)
1240{
1241	int err;
1242	int i;
1243
1244	if (!mce_available(&boot_cpu_data))
1245		return -EIO;
1246
1247	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1248	per_cpu(mce_dev, cpu).id	= cpu;
1249	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1250
1251	err = sysdev_register(&per_cpu(mce_dev, cpu));
1252	if (err)
1253		return err;
1254
1255	for (i = 0; mce_attrs[i]; i++) {
1256		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1257		if (err)
1258			goto error;
1259	}
1260	for (i = 0; i < banks; i++) {
1261		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1262					&bank_attrs[i]);
1263		if (err)
1264			goto error2;
1265	}
1266	cpumask_set_cpu(cpu, mce_dev_initialized);
1267
1268	return 0;
1269error2:
1270	while (--i >= 0)
1271		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1272error:
1273	while (--i >= 0)
1274		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1275
1276	sysdev_unregister(&per_cpu(mce_dev, cpu));
1277
1278	return err;
1279}
1280
1281static __cpuinit void mce_remove_device(unsigned int cpu)
1282{
1283	int i;
1284
1285	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1286		return;
1287
1288	for (i = 0; mce_attrs[i]; i++)
1289		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1290
1291	for (i = 0; i < banks; i++)
1292		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1293
1294	sysdev_unregister(&per_cpu(mce_dev, cpu));
1295	cpumask_clear_cpu(cpu, mce_dev_initialized);
1296}
1297
1298/* Make sure there are no machine checks on offlined CPUs. */
1299static void mce_disable_cpu(void *h)
1300{
1301	unsigned long action = *(unsigned long *)h;
1302	int i;
1303
1304	if (!mce_available(&current_cpu_data))
1305		return;
1306	if (!(action & CPU_TASKS_FROZEN))
1307		cmci_clear();
1308	for (i = 0; i < banks; i++) {
1309		if (!skip_bank_init(i))
1310			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1311	}
1312}
1313
1314static void mce_reenable_cpu(void *h)
1315{
1316	unsigned long action = *(unsigned long *)h;
1317	int i;
1318
1319	if (!mce_available(&current_cpu_data))
1320		return;
1321
1322	if (!(action & CPU_TASKS_FROZEN))
1323		cmci_reenable();
1324	for (i = 0; i < banks; i++) {
1325		if (!skip_bank_init(i))
1326			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1327	}
1328}
1329
1330/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1331static int __cpuinit
1332mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1333{
1334	unsigned int cpu = (unsigned long)hcpu;
1335	struct timer_list *t = &per_cpu(mce_timer, cpu);
1336
1337	switch (action) {
1338	case CPU_ONLINE:
1339	case CPU_ONLINE_FROZEN:
1340		mce_create_device(cpu);
1341		if (threshold_cpu_callback)
1342			threshold_cpu_callback(action, cpu);
1343		break;
1344	case CPU_DEAD:
1345	case CPU_DEAD_FROZEN:
1346		if (threshold_cpu_callback)
1347			threshold_cpu_callback(action, cpu);
1348		mce_remove_device(cpu);
1349		break;
1350	case CPU_DOWN_PREPARE:
1351	case CPU_DOWN_PREPARE_FROZEN:
1352		del_timer_sync(t);
1353		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1354		break;
1355	case CPU_DOWN_FAILED:
1356	case CPU_DOWN_FAILED_FROZEN:
1357		t->expires = round_jiffies(jiffies +
1358						__get_cpu_var(next_interval));
1359		add_timer_on(t, cpu);
1360		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1361		break;
1362	case CPU_POST_DEAD:
1363		/* intentionally ignoring frozen here */
1364		cmci_rediscover(cpu);
1365		break;
1366	}
1367	return NOTIFY_OK;
1368}
1369
1370static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1371	.notifier_call = mce_cpu_callback,
1372};
1373
1374static __init int mce_init_banks(void)
1375{
1376	int i;
1377
1378	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1379				GFP_KERNEL);
1380	if (!bank_attrs)
1381		return -ENOMEM;
1382
1383	for (i = 0; i < banks; i++) {
1384		struct sysdev_attribute *a = &bank_attrs[i];
1385
1386		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1387		if (!a->attr.name)
1388			goto nomem;
1389
1390		a->attr.mode	= 0644;
1391		a->show		= show_bank;
1392		a->store	= set_bank;
1393	}
1394	return 0;
1395
1396nomem:
1397	while (--i >= 0)
1398		kfree(bank_attrs[i].attr.name);
1399	kfree(bank_attrs);
1400	bank_attrs = NULL;
1401
1402	return -ENOMEM;
1403}
1404
1405static __init int mce_init_device(void)
1406{
1407	int err;
1408	int i = 0;
1409
1410	if (!mce_available(&boot_cpu_data))
1411		return -EIO;
1412
1413	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1414
1415	err = mce_init_banks();
1416	if (err)
1417		return err;
1418
1419	err = sysdev_class_register(&mce_sysclass);
1420	if (err)
1421		return err;
1422
1423	for_each_online_cpu(i) {
1424		err = mce_create_device(i);
1425		if (err)
1426			return err;
1427	}
1428
1429	register_hotcpu_notifier(&mce_cpu_notifier);
1430	misc_register(&mce_log_device);
1431
1432	return err;
1433}
1434
1435device_initcall(mce_init_device);
1436
1437#else /* CONFIG_X86_OLD_MCE: */
1438
1439int nr_mce_banks;
1440EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1441
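/*
 * With CONFIG_X86_NEW_MCE disabled (the 32-bit "old" machine check code),
 * this file only does the per-vendor dispatch below; the actual handlers
 * and nr_mce_banks are set up by the vendor-specific code in this
 * directory (see also the non-fatal.o user above).
 */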
1442/* This has to be run for each processor */
1443void mcheck_init(struct cpuinfo_x86 *c)
1444{
1445	if (mce_disabled == 1)
1446		return;
1447
1448	switch (c->x86_vendor) {
1449	case X86_VENDOR_AMD:
1450		amd_mcheck_init(c);
1451		break;
1452
1453	case X86_VENDOR_INTEL:
1454		if (c->x86 == 5)
1455			intel_p5_mcheck_init(c);
1456		if (c->x86 == 6)
1457			intel_p6_mcheck_init(c);
1458		if (c->x86 == 15)
1459			intel_p4_mcheck_init(c);
1460		break;
1461
1462	case X86_VENDOR_CENTAUR:
1463		if (c->x86 == 5)
1464			winchip_mcheck_init(c);
1465		break;
1466
1467	default:
1468		break;
1469	}
1470	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1471}
1472
1473static int __init mcheck_enable(char *str)
1474{
1475	mce_disabled = -1;
1476	return 1;
1477}
1478
1479__setup("mce", mcheck_enable);
1480
1481#endif /* CONFIG_X86_NEW_MCE */
1482
1483/*
1484 * Old style boot options parsing. Only for compatibility.
1485 */
1486static int __init mcheck_disable(char *str)
1487{
1488	mce_disabled = 1;
1489	return 1;
1490}
1491__setup("nomce", mcheck_disable);
1492