mce.c revision 5bb38adcb54cf7192b154368ad62982caa11ca0b
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52	       smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57						unexpected_machine_check;
58
59int mce_disabled __read_mostly;
60
61#define MISC_MCELOG_MINOR	227
62
63#define SPINUNIT 100	/* 100ns */
64
65atomic_t mce_entry;
66
67DEFINE_PER_CPU(unsigned, mce_exception_count);
68
69/*
70 * Tolerant levels:
71 *   0: always panic on uncorrected errors, log corrected errors
72 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
73 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
74 *   3: never panic or SIGBUS, log all errors (for testing only)
75 */
76static int			tolerant		__read_mostly = 1;
77static int			banks			__read_mostly;
78static u64			*bank			__read_mostly;
79static int			rip_msr			__read_mostly;
80static int			mce_bootlog		__read_mostly = -1;
81static int			monarch_timeout		__read_mostly = -1;
82static int			mce_panic_timeout	__read_mostly;
83static int			mce_dont_log_ce		__read_mostly;
84int				mce_cmci_disabled	__read_mostly;
85int				mce_ignore_ce		__read_mostly;
86int				mce_ser			__read_mostly;
87
88/* User mode helper program triggered by machine check event */
89static unsigned long		mce_need_notify;
90static char			mce_helper[128];
91static char			*mce_helper_argv[2] = { mce_helper, NULL };
92
93static unsigned long		dont_init_banks;
94
95static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
96static DEFINE_PER_CPU(struct mce, mces_seen);
97static int			cpu_missing;
98
99
100/* MCA banks polled by the periodic polling timer for corrected events */
101DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
102	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
103};
104
105static inline int skip_bank_init(int i)
106{
107	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
108}
109
110static DEFINE_PER_CPU(struct work_struct, mce_work);
111
112/* Do the initial setup of a struct mce */
113void mce_setup(struct mce *m)
114{
115	memset(m, 0, sizeof(struct mce));
116	m->cpu = m->extcpu = smp_processor_id();
117	rdtscll(m->tsc);
118	/* We hope get_seconds stays lockless */
119	m->time = get_seconds();
120	m->cpuvendor = boot_cpu_data.x86_vendor;
121	m->cpuid = cpuid_eax(1);
122#ifdef CONFIG_SMP
123	m->socketid = cpu_data(m->extcpu).phys_proc_id;
124#endif
125	m->apicid = cpu_data(m->extcpu).initial_apicid;
126	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
127}
128
129DEFINE_PER_CPU(struct mce, injectm);
130EXPORT_PER_CPU_SYMBOL_GPL(injectm);
131
132/*
133 * Lockless MCE logging infrastructure.
134 * This avoids deadlocks on printk locks without having to break locks. Also
135 * separate MCEs from kernel messages to avoid bogus bug reports.
136 */
137
138static struct mce_log mcelog = {
139	.signature	= MCE_LOG_SIGNATURE,
140	.len		= MCE_LOG_LEN,
141	.recordlen	= sizeof(struct mce),
142};
143
144void mce_log(struct mce *mce)
145{
146	unsigned next, entry;
147
148	mce->finished = 0;
149	wmb();
150	for (;;) {
151		entry = rcu_dereference(mcelog.next);
152		for (;;) {
153			/*
154			 * When the buffer fills up discard new entries.
155			 * Assume that the earlier errors are the more
156			 * interesting ones:
157			 */
158			if (entry >= MCE_LOG_LEN) {
159				set_bit(MCE_OVERFLOW,
160					(unsigned long *)&mcelog.flags);
161				return;
162			}
163			/* Old left over entry. Skip: */
164			if (mcelog.entry[entry].finished) {
165				entry++;
166				continue;
167			}
168			break;
169		}
170		smp_rmb();
171		next = entry + 1;
172		if (cmpxchg(&mcelog.next, entry, next) == entry)
173			break;
174	}
175	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
176	wmb();
177	mcelog.entry[entry].finished = 1;
178	wmb();
179
180	mce->finished = 1;
181	set_bit(0, &mce_need_notify);
182}
183
184static void print_mce(struct mce *m)
185{
186	printk(KERN_EMERG
187	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
188	       m->extcpu, m->mcgstatus, m->bank, m->status);
189	if (m->ip) {
190		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
191		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
192		       m->cs, m->ip);
193		if (m->cs == __KERNEL_CS)
194			print_symbol("{%s}", m->ip);
195		printk("\n");
196	}
197	printk(KERN_EMERG "TSC %llx ", m->tsc);
198	if (m->addr)
199		printk("ADDR %llx ", m->addr);
200	if (m->misc)
201		printk("MISC %llx ", m->misc);
202	printk("\n");
203	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
204			m->cpuvendor, m->cpuid, m->time, m->socketid,
205			m->apicid);
206}
207
208static void print_mce_head(void)
209{
210	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
211}
212
213static void print_mce_tail(void)
214{
215	printk(KERN_EMERG "This is not a software problem!\n"
216	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
217}
218
219#define PANIC_TIMEOUT 5 /* 5 seconds */
220
221static atomic_t mce_paniced;
222
223/* Panic in progress. Enable interrupts and wait for final IPI */
224static void wait_for_panic(void)
225{
226	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
227	preempt_disable();
228	local_irq_enable();
229	while (timeout-- > 0)
230		udelay(1);
231	if (panic_timeout == 0)
232		panic_timeout = mce_panic_timeout;
233	panic("Panicking machine check CPU died");
234}
235
236static void mce_panic(char *msg, struct mce *final, char *exp)
237{
238	int i;
239
240	/*
241	 * Make sure only one CPU runs in machine check panic
242	 */
243	if (atomic_inc_return(&mce_paniced) > 1)
244		wait_for_panic();
245	barrier();
246
247	bust_spinlocks(1);
248	console_verbose();
249	print_mce_head();
250	/* First print corrected ones that are still unlogged */
251	for (i = 0; i < MCE_LOG_LEN; i++) {
252		struct mce *m = &mcelog.entry[i];
253		if (!(m->status & MCI_STATUS_VAL))
254			continue;
255		if (!(m->status & MCI_STATUS_UC))
256			print_mce(m);
257	}
258	/* Now print uncorrected but with the final one last */
259	for (i = 0; i < MCE_LOG_LEN; i++) {
260		struct mce *m = &mcelog.entry[i];
261		if (!(m->status & MCI_STATUS_VAL))
262			continue;
263		if (!(m->status & MCI_STATUS_UC))
264			continue;
265		if (!final || memcmp(m, final, sizeof(struct mce)))
266			print_mce(m);
267	}
268	if (final)
269		print_mce(final);
270	if (cpu_missing)
271		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
272	print_mce_tail();
273	if (exp)
274		printk(KERN_EMERG "Machine check: %s\n", exp);
275	if (panic_timeout == 0)
276		panic_timeout = mce_panic_timeout;
277	panic(msg);
278}
279
280/* Support code for software error injection */
281
282static int msr_to_offset(u32 msr)
283{
284	unsigned bank = __get_cpu_var(injectm.bank);
285	if (msr == rip_msr)
286		return offsetof(struct mce, ip);
287	if (msr == MSR_IA32_MC0_STATUS + bank*4)
288		return offsetof(struct mce, status);
289	if (msr == MSR_IA32_MC0_ADDR + bank*4)
290		return offsetof(struct mce, addr);
291	if (msr == MSR_IA32_MC0_MISC + bank*4)
292		return offsetof(struct mce, misc);
293	if (msr == MSR_IA32_MCG_STATUS)
294		return offsetof(struct mce, mcgstatus);
295	return -1;
296}
297
298/* MSR access wrappers used for error injection */
299static u64 mce_rdmsrl(u32 msr)
300{
301	u64 v;
302	if (__get_cpu_var(injectm).finished) {
303		int offset = msr_to_offset(msr);
304		if (offset < 0)
305			return 0;
306		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
307	}
308	rdmsrl(msr, v);
309	return v;
310}
311
312static void mce_wrmsrl(u32 msr, u64 v)
313{
314	if (__get_cpu_var(injectm).finished) {
315		int offset = msr_to_offset(msr);
316		if (offset >= 0)
317			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
318		return;
319	}
320	wrmsrl(msr, v);
321}
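/*
 * Illustrative sketch of how an injector could use injectm so that the
 * wrappers above return faked values instead of touching real MSRs. This
 * is not the exact mce-inject.c code and the helper name is made up:
 *
 *	static void inject_mce_sketch(struct mce *m)
 *	{
 *		struct mce *i = &per_cpu(injectm, m->extcpu);
 *
 *		i->finished = 0;	// hide the record while updating it
 *		mb();
 *		memcpy(i, m, sizeof(struct mce));
 *		mb();
 *		i->finished = 1;	// arm mce_rdmsrl()/mce_wrmsrl()
 *	}
 *
 * With injectm.finished set, accesses to the status/addr/misc MSRs of
 * injectm.bank (and MCG_STATUS) are redirected through msr_to_offset().
 */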
322
323/*
324 * Simple lockless ring to communicate PFNs from the exception handler to the
325 * process context work function. This is vastly simplified because there's
326 * only a single reader and a single writer.
327 */
328#define MCE_RING_SIZE 16	/* we use one entry less */
329
330struct mce_ring {
331	unsigned short start;
332	unsigned short end;
333	unsigned long ring[MCE_RING_SIZE];
334};
335static DEFINE_PER_CPU(struct mce_ring, mce_ring);
336
337/* Runs with CPU affinity in workqueue */
338static int mce_ring_empty(void)
339{
340	struct mce_ring *r = &__get_cpu_var(mce_ring);
341
342	return r->start == r->end;
343}
344
345static int mce_ring_get(unsigned long *pfn)
346{
347	struct mce_ring *r;
348	int ret = 0;
349
350	*pfn = 0;
351	get_cpu();
352	r = &__get_cpu_var(mce_ring);
353	if (r->start == r->end)
354		goto out;
355	*pfn = r->ring[r->start];
356	r->start = (r->start + 1) % MCE_RING_SIZE;
357	ret = 1;
358out:
359	put_cpu();
360	return ret;
361}
362
363/* Always runs in MCE context with preempt off */
364static int mce_ring_add(unsigned long pfn)
365{
366	struct mce_ring *r = &__get_cpu_var(mce_ring);
367	unsigned next;
368
369	next = (r->end + 1) % MCE_RING_SIZE;
370	if (next == r->start)
371		return -1;
372	r->ring[r->end] = pfn;
373	wmb();
374	r->end = next;
375	return 0;
376}
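/*
 * Usage pattern of the ring as wired up later in this file: the #MC
 * handler is the single producer and the per-CPU work item is the single
 * consumer, which is what makes the lockless scheme above sufficient.
 *
 *	// MCE context (do_machine_check), for action-optional errors:
 *	mce_ring_add(m.addr >> PAGE_SHIFT);
 *
 *	// process context (mce_notify_process via mce_work):
 *	while (mce_ring_get(&pfn))
 *		memory_failure(pfn, MCE_VECTOR);
 */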
377
378int mce_available(struct cpuinfo_x86 *c)
379{
380	if (mce_disabled)
381		return 0;
382	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
383}
384
385static void mce_schedule_work(void)
386{
387	if (!mce_ring_empty()) {
388		struct work_struct *work = &__get_cpu_var(mce_work);
389		if (!work_pending(work))
390			schedule_work(work);
391	}
392}
393
394/*
395 * Get the address of the instruction at the time of the machine check
396 * error.
397 */
398static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
399{
400
401	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
402		m->ip = regs->ip;
403		m->cs = regs->cs;
404	} else {
405		m->ip = 0;
406		m->cs = 0;
407	}
408	if (rip_msr)
409		m->ip = mce_rdmsrl(rip_msr);
410}
411
412#ifdef CONFIG_X86_LOCAL_APIC
413/*
414 * Called after interrupts have been reenabled again
415 * when an MCE happened during an interrupts-off region
416 * in the kernel.
417 */
418asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
419{
420	ack_APIC_irq();
421	exit_idle();
422	irq_enter();
423	mce_notify_irq();
424	mce_schedule_work();
425	irq_exit();
426}
427#endif
428
429static void mce_report_event(struct pt_regs *regs)
430{
431	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
432		mce_notify_irq();
433		/*
434		 * Triggering the work queue here is just an insurance
435		 * policy in case the syscall exit notify handler
436		 * doesn't run soon enough or ends up running on the
437		 * wrong CPU (can happen when audit sleeps)
438		 */
439		mce_schedule_work();
440		return;
441	}
442
443#ifdef CONFIG_X86_LOCAL_APIC
444	/*
445	 * Without APIC do not notify. The event will be picked
446	 * up eventually.
447	 */
448	if (!cpu_has_apic)
449		return;
450
451	/*
452	 * When interrupts are disabled we cannot use
453	 * kernel services safely. Trigger a self interrupt
454	 * through the APIC so that the notification is done
455	 * after interrupts are reenabled again.
456	 */
457	apic->send_IPI_self(MCE_SELF_VECTOR);
458
459	/*
460	 * Wait for idle afterwards again so that we don't leave the
461	 * APIC in a non-idle state because the normal APIC writes
462	 * cannot exclude us.
463	 */
464	apic_wait_icr_idle();
465#endif
466}
467
468DEFINE_PER_CPU(unsigned, mce_poll_count);
469
470/*
471 * Poll for corrected events or events that happened before reset.
472 * Those are just logged through /dev/mcelog.
473 *
474 * This is executed in standard interrupt context.
475 *
476 * Note: the spec recommends panicking for fatal unsignalled
477 * errors here. However this would be quite problematic --
478 * we would need to reimplement the Monarch handling and
479 * it would mess up the exclusion between the exception handler
480 * and the poll handler -- so we skip this for now.
481 * These cases should not happen anyway, or only when the CPU
482 * is already totally confused. In this case it's likely it will
483 * not fully execute the machine check handler either.
484 */
485void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
486{
487	struct mce m;
488	int i;
489
490	__get_cpu_var(mce_poll_count)++;
491
492	mce_setup(&m);
493
494	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
495	for (i = 0; i < banks; i++) {
496		if (!bank[i] || !test_bit(i, *b))
497			continue;
498
499		m.misc = 0;
500		m.addr = 0;
501		m.bank = i;
502		m.tsc = 0;
503
504		barrier();
505		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
506		if (!(m.status & MCI_STATUS_VAL))
507			continue;
508
509		/*
510		 * Uncorrected or signalled events are handled by the exception
511		 * handler when it is enabled, so don't process those here.
512		 *
513		 * TBD do the same check for MCI_STATUS_EN here?
514		 */
515		if (!(flags & MCP_UC) &&
516		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
517			continue;
518
519		if (m.status & MCI_STATUS_MISCV)
520			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
521		if (m.status & MCI_STATUS_ADDRV)
522			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
523
524		if (!(flags & MCP_TIMESTAMP))
525			m.tsc = 0;
526		/*
527		 * Don't get the IP here because it's unlikely to
528		 * have anything to do with the actual error location.
529		 */
530		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
531			mce_log(&m);
532			add_taint(TAINT_MACHINE_CHECK);
533		}
534
535		/*
536		 * Clear state for this bank.
537		 */
538		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
539	}
540
541	/*
542	 * Don't clear MCG_STATUS here because it's only defined for
543	 * exceptions.
544	 */
545
546	sync_core();
547}
548EXPORT_SYMBOL_GPL(machine_check_poll);
549
550/*
551 * Do a quick check if any of the events requires a panic.
552 * This decides if we keep the events around or clear them.
553 */
554static int mce_no_way_out(struct mce *m, char **msg)
555{
556	int i;
557
558	for (i = 0; i < banks; i++) {
559		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
560		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
561			return 1;
562	}
563	return 0;
564}
565
566/*
567 * Variable to establish order between CPUs while scanning.
568 * Each CPU spins initially until mce_executing equals its number.
569 */
570static atomic_t mce_executing;
571
572/*
573 * Defines order of CPUs on entry. First CPU becomes Monarch.
574 */
575static atomic_t mce_callin;
576
577/*
578 * Check if a timeout waiting for other CPUs happened.
579 */
580static int mce_timed_out(u64 *t)
581{
582	/*
583	 * The others already did panic for some reason.
584	 * Bail out like in a timeout.
585	 * rmb() to tell the compiler that mce_paniced
586	 * might have been modified by another CPU.
587	 */
588	rmb();
589	if (atomic_read(&mce_paniced))
590		wait_for_panic();
591	if (!monarch_timeout)
592		goto out;
593	if ((s64)*t < SPINUNIT) {
594		/* CHECKME: Make panic default for 1 too? */
595		if (tolerant < 1)
596			mce_panic("Timeout synchronizing machine check over CPUs",
597				  NULL, NULL);
598		cpu_missing = 1;
599		return 1;
600	}
601	*t -= SPINUNIT;
602out:
603	touch_nmi_watchdog();
604	return 0;
605}
606
607/*
608 * The Monarch's reign.  The Monarch is the CPU who entered
609 * the machine check handler first. It waits for the others to
610 * raise the exception too and then grades them. If any
611 * error is fatal it panics. Only then does it let the others continue.
612 *
613 * The other CPUs entering the MCE handler will be controlled by the
614 * Monarch. They are called Subjects.
615 *
616 * This way we prevent any potential data corruption in an unrecoverable case
617 * and also make sure that all CPUs' errors are always examined.
618 *
619 * Also this detects the case of a machine check event coming from outer
620 * space (not detected by any CPU). In this case some external agent wants
621 * us to shut down, so panic too.
622 *
623 * The other CPUs might still decide to panic if the handler happens
624 * in an unrecoverable place, but in this case the system is in a semi-stable
625 * state and won't corrupt anything by itself. It's ok to let the others
626 * continue for a bit first.
627 *
628 * All the spin loops have timeouts; when a timeout happens a CPU
629 * typically elects itself to be Monarch.
630 */
631static void mce_reign(void)
632{
633	int cpu;
634	struct mce *m = NULL;
635	int global_worst = 0;
636	char *msg = NULL;
637	char *nmsg = NULL;
638
639	/*
640	 * This CPU is the Monarch and the other CPUs have run
641	 * through their handlers.
642	 * Grade the severity of the errors of all the CPUs.
643	 */
644	for_each_possible_cpu(cpu) {
645		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
646					    &nmsg);
647		if (severity > global_worst) {
648			msg = nmsg;
649			global_worst = severity;
650			m = &per_cpu(mces_seen, cpu);
651		}
652	}
653
654	/*
655	 * Cannot recover? Panic here then.
656	 * This dumps all the mces in the log buffer and stops the
657	 * other CPUs.
658	 */
659	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
660		mce_panic("Fatal Machine check", m, msg);
661
662	/*
663	 * For UC somewhere we let the CPU who detects it handle it.
664	 * Also must let continue the others, otherwise the handling
665	 * CPU could deadlock on a lock.
666	 */
667
668	/*
669	 * No machine check event found. Must be some external
670	 * source or one CPU is hung. Panic.
671	 */
672	if (!m && tolerant < 3)
673		mce_panic("Machine check from unknown source", NULL, NULL);
674
675	/*
676	 * Now clear all the mces_seen so that they don't reappear on
677	 * the next mce.
678	 */
679	for_each_possible_cpu(cpu)
680		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
681}
682
683static atomic_t global_nwo;
684
685/*
686 * Start of Monarch synchronization. This waits until all CPUs have
687 * entered the exception handler and then determines if any of them
688 * saw a fatal event that requires panic. Then it executes them
689 * in the entry order.
690 * TBD double check parallel CPU hotunplug
691 */
692static int mce_start(int *no_way_out)
693{
694	int order;
695	int cpus = num_online_cpus();
696	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
697
698	if (!timeout)
699		return -1;
700
701	atomic_add(*no_way_out, &global_nwo);
702	/*
703	 * global_nwo should be updated before mce_callin
704	 */
705	smp_wmb();
706	order = atomic_inc_return(&mce_callin);
707
708	/*
709	 * Wait for everyone.
710	 */
711	while (atomic_read(&mce_callin) != cpus) {
712		if (mce_timed_out(&timeout)) {
713			atomic_set(&global_nwo, 0);
714			return -1;
715		}
716		ndelay(SPINUNIT);
717	}
718
719	/*
720	 * mce_callin should be read before global_nwo
721	 */
722	smp_rmb();
723
724	if (order == 1) {
725		/*
726		 * Monarch: Starts executing now, the others wait.
727		 */
728		atomic_set(&mce_executing, 1);
729	} else {
730		/*
731		 * Subject: Now start the scanning loop one by one in
732		 * the original callin order.
733		 * This way, when there are any shared banks, they will
734		 * only be seen by one CPU before being cleared, avoiding duplicates.
735		 */
736		while (atomic_read(&mce_executing) < order) {
737			if (mce_timed_out(&timeout)) {
738				atomic_set(&global_nwo, 0);
739				return -1;
740			}
741			ndelay(SPINUNIT);
742		}
743	}
744
745	/*
746	 * Cache the global no_way_out state.
747	 */
748	*no_way_out = atomic_read(&global_nwo);
749
750	return order;
751}
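/*
 * Illustrative walk-through of the rendezvous above, assuming a broadcast
 * #MC on a 4-CPU system:
 *
 *	- each CPU gets order 1..4 from atomic_inc_return(&mce_callin)
 *	- order 1 (the Monarch) sets mce_executing to 1 and scans first
 *	- every CPU's mce_end() increments mce_executing, which releases
 *	  the Subject with the next order to run its scanning loop
 *	- once mce_executing exceeds num_online_cpus() the Monarch knows
 *	  all Subjects are done and runs mce_reign()
 */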
752
753/*
754 * Synchronize between CPUs after main scanning loop.
755 * This invokes the bulk of the Monarch processing.
756 */
757static int mce_end(int order)
758{
759	int ret = -1;
760	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
761
762	if (!timeout)
763		goto reset;
764	if (order < 0)
765		goto reset;
766
767	/*
768	 * Allow others to run.
769	 */
770	atomic_inc(&mce_executing);
771
772	if (order == 1) {
773		/* CHECKME: Can this race with a parallel hotplug? */
774		int cpus = num_online_cpus();
775
776		/*
777		 * Monarch: Wait for everyone to go through their scanning
778		 * loops.
779		 */
780		while (atomic_read(&mce_executing) <= cpus) {
781			if (mce_timed_out(&timeout))
782				goto reset;
783			ndelay(SPINUNIT);
784		}
785
786		mce_reign();
787		barrier();
788		ret = 0;
789	} else {
790		/*
791		 * Subject: Wait for Monarch to finish.
792		 */
793		while (atomic_read(&mce_executing) != 0) {
794			if (mce_timed_out(&timeout))
795				goto reset;
796			ndelay(SPINUNIT);
797		}
798
799		/*
800		 * Don't reset anything. That's done by the Monarch.
801		 */
802		return 0;
803	}
804
805	/*
806	 * Reset all global state.
807	 */
808reset:
809	atomic_set(&global_nwo, 0);
810	atomic_set(&mce_callin, 0);
811	barrier();
812
813	/*
814	 * Let others run again.
815	 */
816	atomic_set(&mce_executing, 0);
817	return ret;
818}
819
820/*
821 * Check if the address reported by the CPU is in a format we can parse.
822 * It would be possible to add code for most other cases, but all would
823 * be somewhat complicated (e.g. segment offset would require an instruction
824 * parser). So only support physical addresses up to page granularity for now.
825 */
826static int mce_usable_address(struct mce *m)
827{
828	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
829		return 0;
830	if ((m->misc & 0x3f) > PAGE_SHIFT)
831		return 0;
832	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
833		return 0;
834	return 1;
835}
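/*
 * Worked example for the checks above, assuming the usual PAGE_SHIFT of
 * 12: an MCi_MISC value of 0x8c encodes a recoverable-address LSB of 12
 * in bits 5:0 and address mode MCM_ADDR_PHYS (2) in bits 8:6, so the
 * reported MCi_ADDR is usable as a physical address at page granularity
 * and m->addr >> PAGE_SHIFT can be handed to the poison handling code.
 */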
836
837static void mce_clear_state(unsigned long *toclear)
838{
839	int i;
840
841	for (i = 0; i < banks; i++) {
842		if (test_bit(i, toclear))
843			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
844	}
845}
846
847/*
848 * The actual machine check handler. This only handles real
849 * exceptions when something got corrupted coming in through int 18.
850 *
851 * This is executed in NMI context not subject to normal locking rules. This
852 * implies that most kernel services cannot be safely used. Don't even
853 * think about putting a printk in there!
854 *
855 * On Intel systems this is entered on all CPUs in parallel through
856 * MCE broadcast. However some CPUs might be broken beyond repair,
857 * so be always careful when synchronizing with others.
858 */
859void do_machine_check(struct pt_regs *regs, long error_code)
860{
861	struct mce m, *final;
862	int i;
863	int worst = 0;
864	int severity;
865	/*
866	 * Establish sequential order between the CPUs entering the machine
867	 * check handler.
868	 */
869	int order;
870	/*
871	 * If no_way_out gets set, there is no safe way to recover from this
872	 * MCE.  If tolerant is cranked up, we'll try anyway.
873	 */
874	int no_way_out = 0;
875	/*
876	 * If kill_it gets set, there might be a way to recover from this
877	 * error.
878	 */
879	int kill_it = 0;
880	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
881	char *msg = "Unknown";
882
883	atomic_inc(&mce_entry);
884
885	__get_cpu_var(mce_exception_count)++;
886
887	if (notify_die(DIE_NMI, "machine check", regs, error_code,
888			   18, SIGKILL) == NOTIFY_STOP)
889		goto out;
890	if (!banks)
891		goto out;
892
893	mce_setup(&m);
894
895	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
896	no_way_out = mce_no_way_out(&m, &msg);
897
898	final = &__get_cpu_var(mces_seen);
899	*final = m;
900
901	barrier();
902
903	/*
904	 * If there is no restart IP we must always kill or panic.
905	 */
906	if (!(m.mcgstatus & MCG_STATUS_RIPV))
907		kill_it = 1;
908
909	/*
910	 * Go through all the banks in exclusion of the other CPUs.
911	 * This way we don't report duplicated events on shared banks
912	 * because the first one to see it will clear it.
913	 */
914	order = mce_start(&no_way_out);
915	for (i = 0; i < banks; i++) {
916		__clear_bit(i, toclear);
917		if (!bank[i])
918			continue;
919
920		m.misc = 0;
921		m.addr = 0;
922		m.bank = i;
923
924		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
925		if ((m.status & MCI_STATUS_VAL) == 0)
926			continue;
927
928		/*
929		 * Corrected or non-signaled errors are handled by
930		 * machine_check_poll(). Leave them alone, unless this panics.
931		 */
932		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
933			!no_way_out)
934			continue;
935
936		/*
937		 * Set taint even when machine check was not enabled.
938		 */
939		add_taint(TAINT_MACHINE_CHECK);
940
941		severity = mce_severity(&m, tolerant, NULL);
942
943		/*
944		 * When the machine check was for the corrected-error handler,
945		 * don't touch it unless we're panicking.
946		 */
947		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
948			continue;
949		__set_bit(i, toclear);
950		if (severity == MCE_NO_SEVERITY) {
951			/*
952			 * Machine check event was not enabled. Clear, but
953			 * ignore.
954			 */
955			continue;
956		}
957
958		/*
959		 * Kill on action required.
960		 */
961		if (severity == MCE_AR_SEVERITY)
962			kill_it = 1;
963
964		if (m.status & MCI_STATUS_MISCV)
965			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
966		if (m.status & MCI_STATUS_ADDRV)
967			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
968
969		/*
970		 * Action optional error. Queue address for later processing.
971		 * When the ring overflows we just ignore the AO error.
972		 * RED-PEN add some logging mechanism when
973		 * mce_usable_address or mce_ring_add fails.
974		 * RED-PEN don't ignore overflow for tolerant == 0
975		 */
976		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
977			mce_ring_add(m.addr >> PAGE_SHIFT);
978
979		mce_get_rip(&m, regs);
980		mce_log(&m);
981
982		if (severity > worst) {
983			*final = m;
984			worst = severity;
985		}
986	}
987
988	if (!no_way_out)
989		mce_clear_state(toclear);
990
991	/*
992	 * Do most of the synchronization with other CPUs.
993	 * When there's any problem use only local no_way_out state.
994	 */
995	if (mce_end(order) < 0)
996		no_way_out = worst >= MCE_PANIC_SEVERITY;
997
998	/*
999	 * If we have decided that we just CAN'T continue, and the user
1000	 * has not set tolerant to an insane level, give up and die.
1001	 *
1002	 * This is mainly used in the case when the system doesn't
1003	 * support MCE broadcasting or it has been disabled.
1004	 */
1005	if (no_way_out && tolerant < 3)
1006		mce_panic("Fatal machine check on current CPU", final, msg);
1007
1008	/*
1009	 * If the error seems to be unrecoverable, something should be
1010	 * done.  Try to kill as little as possible.  If we can kill just
1011	 * one task, do that.  If the user has set the tolerance very
1012	 * high, don't try to do anything at all.
1013	 */
1014
1015	if (kill_it && tolerant < 3)
1016		force_sig(SIGBUS, current);
1017
1018	/* notify userspace ASAP */
1019	set_thread_flag(TIF_MCE_NOTIFY);
1020
1021	if (worst > 0)
1022		mce_report_event(regs);
1023	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1024out:
1025	atomic_dec(&mce_entry);
1026	sync_core();
1027}
1028EXPORT_SYMBOL_GPL(do_machine_check);
1029
1030/* dummy to break dependency. actual code is in mm/memory-failure.c */
1031void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1032{
1033	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1034}
1035
1036/*
1037 * Called after mce notification in process context. This code
1038 * is allowed to sleep. Call the high level VM handler to process
1039 * any corrupted pages.
1040 * Assume that the work queue code only calls this one at a time
1041 * per CPU.
1042 * Note we don't disable preemption, so this code might run on the wrong
1043 * CPU. In this case the event is picked up by the scheduled work queue.
1044 * This is merely a fast path to expedite processing in some common
1045 * cases.
1046 */
1047void mce_notify_process(void)
1048{
1049	unsigned long pfn;
1050	mce_notify_irq();
1051	while (mce_ring_get(&pfn))
1052		memory_failure(pfn, MCE_VECTOR);
1053}
1054
1055static void mce_process_work(struct work_struct *dummy)
1056{
1057	mce_notify_process();
1058}
1059
1060#ifdef CONFIG_X86_MCE_INTEL
1061/**
1062 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1064 * @status: Event status information
1065 *
1066 * This function should be called by the thermal interrupt after the
1067 * event has been processed and the decision was made to log the event
1068 * further.
1069 *
1070 * The status parameter will be saved to the 'status' field of 'struct mce'
1071 * and historically has been the register value of the
1072 * MSR_IA32_THERM_STATUS (Intel) MSR.
1073 */
1074void mce_log_therm_throt_event(__u64 status)
1075{
1076	struct mce m;
1077
1078	mce_setup(&m);
1079	m.bank = MCE_THERMAL_BANK;
1080	m.status = status;
1081	mce_log(&m);
1082}
1083#endif /* CONFIG_X86_MCE_INTEL */
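/*
 * Minimal sketch of the intended caller of mce_log_therm_throt_event();
 * the real call site lives in the thermal throttling code (therm_throt.c)
 * and the policy check below is only a placeholder:
 *
 *	u64 therm_status;
 *
 *	rdmsrl(MSR_IA32_THERM_STATUS, therm_status);
 *	if (decided_to_log_this_event)
 *		mce_log_therm_throt_event(therm_status);
 */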
1084
1085/*
1086 * Periodic polling timer for "silent" machine check errors.  If the
1087 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1088 * errors, poll 2x slower (up to check_interval seconds).
1089 */
1090static int check_interval = 5 * 60; /* 5 minutes */
1091
1092static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1093static DEFINE_PER_CPU(struct timer_list, mce_timer);
1094
1095static void mcheck_timer(unsigned long data)
1096{
1097	struct timer_list *t = &per_cpu(mce_timer, data);
1098	int *n;
1099
1100	WARN_ON(smp_processor_id() != data);
1101
1102	if (mce_available(&current_cpu_data)) {
1103		machine_check_poll(MCP_TIMESTAMP,
1104				&__get_cpu_var(mce_poll_banks));
1105	}
1106
1107	/*
1108	 * Alert userspace if needed.  If we logged an MCE, reduce the
1109	 * polling interval, otherwise increase the polling interval.
1110	 */
1111	n = &__get_cpu_var(next_interval);
1112	if (mce_notify_irq())
1113		*n = max(*n/2, HZ/100);
1114	else
1115		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1116
1117	t->expires = jiffies + *n;
1118	add_timer(t);
1119}
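/*
 * Worked example of the adaptive interval above, assuming HZ == 1000 and
 * the default check_interval of 300 seconds: polling starts at 300*HZ
 * jiffies, halves on every run that logged an event (down to the HZ/100
 * floor, i.e. 10ms), and doubles again on quiet runs until it is capped
 * at roughly check_interval again.
 */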
1120
1121static void mce_do_trigger(struct work_struct *work)
1122{
1123	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1124}
1125
1126static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1127
1128/*
1129 * Notify the user(s) about new machine check events.
1130 * Can be called from interrupt context, but not from machine check/NMI
1131 * context.
1132 */
1133int mce_notify_irq(void)
1134{
1135	/* Not more than two messages every minute */
1136	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1137
1138	clear_thread_flag(TIF_MCE_NOTIFY);
1139
1140	if (test_and_clear_bit(0, &mce_need_notify)) {
1141		wake_up_interruptible(&mce_wait);
1142
1143		/*
1144		 * There is no risk of missing notifications because
1145		 * work_pending is always cleared before the function is
1146		 * executed.
1147		 */
1148		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1149			schedule_work(&mce_trigger_work);
1150
1151		if (__ratelimit(&ratelimit))
1152			printk(KERN_INFO "Machine check events logged\n");
1153
1154		return 1;
1155	}
1156	return 0;
1157}
1158EXPORT_SYMBOL_GPL(mce_notify_irq);
1159
1160/*
1161 * Initialize Machine Checks for a CPU.
1162 */
1163static int mce_cap_init(void)
1164{
1165	unsigned b;
1166	u64 cap;
1167
1168	rdmsrl(MSR_IA32_MCG_CAP, cap);
1169
1170	b = cap & MCG_BANKCNT_MASK;
1171	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1172
1173	if (b > MAX_NR_BANKS) {
1174		printk(KERN_WARNING
1175		       "MCE: Using only %u machine check banks out of %u\n",
1176			MAX_NR_BANKS, b);
1177		b = MAX_NR_BANKS;
1178	}
1179
1180	/* Don't support asymmetric configurations today */
1181	WARN_ON(banks != 0 && b != banks);
1182	banks = b;
1183	if (!bank) {
1184		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1185		if (!bank)
1186			return -ENOMEM;
1187		memset(bank, 0xff, banks * sizeof(u64));
1188	}
1189
1190	/* Use accurate RIP reporting if available. */
1191	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1192		rip_msr = MSR_IA32_MCG_EIP;
1193
1194	if (cap & MCG_SER_P)
1195		mce_ser = 1;
1196
1197	return 0;
1198}
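/*
 * Example of what mce_cap_init() extracts from a hypothetical MCG_CAP
 * value of 0x106 (purely illustrative): bits 7:0 give 6 banks, MCG_CTL_P
 * (bit 8) is set so mce_init() will program MCG_CTL, and neither
 * MCG_EXT_P (bit 9) nor MCG_SER_P (bit 24) is set, so rip_msr stays 0
 * and software error recovery (mce_ser) stays disabled.
 */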
1199
1200static void mce_init(void)
1201{
1202	mce_banks_t all_banks;
1203	u64 cap;
1204	int i;
1205
1206	/*
1207	 * Log the machine checks left over from the previous reset.
1208	 */
1209	bitmap_fill(all_banks, MAX_NR_BANKS);
1210	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1211
1212	set_in_cr4(X86_CR4_MCE);
1213
1214	rdmsrl(MSR_IA32_MCG_CAP, cap);
1215	if (cap & MCG_CTL_P)
1216		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1217
1218	for (i = 0; i < banks; i++) {
1219		if (skip_bank_init(i))
1220			continue;
1221		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1222		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1223	}
1224}
1225
1226/* Add per CPU specific workarounds here */
1227static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1228{
1229	/* This should be disabled by the BIOS, but isn't always */
1230	if (c->x86_vendor == X86_VENDOR_AMD) {
1231		if (c->x86 == 15 && banks > 4) {
1232			/*
1233			 * disable GART TBL walk error reporting, which
1234			 * trips off incorrectly with the IOMMU & 3ware
1235			 * & Cerberus:
1236			 */
1237			clear_bit(10, (unsigned long *)&bank[4]);
1238		}
1239		if (c->x86 <= 17 && mce_bootlog < 0) {
1240			/*
1241			 * Lots of broken BIOSes around that don't clear them
1242			 * by default and leave crap in there. Don't log:
1243			 */
1244			mce_bootlog = 0;
1245		}
1246		/*
1247		 * Various K7s with broken bank 0 around. Always disable
1248		 * by default.
1249		 */
1250		if (c->x86 == 6 && banks > 0)
1251			bank[0] = 0;
1252	}
1253
1254	if (c->x86_vendor == X86_VENDOR_INTEL) {
1255		/*
1256		 * The SDM documents that on family 6 bank 0 should not be written
1257		 * because it aliases to another special BIOS-controlled
1258		 * register.
1259		 * But it's not aliased anymore on model 0x1a+.
1260		 * Don't ignore bank 0 completely because there could be a
1261		 * valid event later, merely don't write CTL0.
1262		 */
1263
1264		if (c->x86 == 6 && c->x86_model < 0x1A)
1265			__set_bit(0, &dont_init_banks);
1266
1267		/*
1268		 * All newer Intel systems support MCE broadcasting. Enable
1269		 * synchronization with a one second timeout.
1270		 */
1271		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1272			monarch_timeout < 0)
1273			monarch_timeout = USEC_PER_SEC;
1274	}
1275	if (monarch_timeout < 0)
1276		monarch_timeout = 0;
1277	if (mce_bootlog != 0)
1278		mce_panic_timeout = 30;
1279}
1280
1281static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1282{
1283	if (c->x86 != 5)
1284		return;
1285	switch (c->x86_vendor) {
1286	case X86_VENDOR_INTEL:
1287		intel_p5_mcheck_init(c);
1288		break;
1289	case X86_VENDOR_CENTAUR:
1290		winchip_mcheck_init(c);
1291		break;
1292	}
1293}
1294
1295static void mce_cpu_features(struct cpuinfo_x86 *c)
1296{
1297	switch (c->x86_vendor) {
1298	case X86_VENDOR_INTEL:
1299		mce_intel_feature_init(c);
1300		break;
1301	case X86_VENDOR_AMD:
1302		mce_amd_feature_init(c);
1303		break;
1304	default:
1305		break;
1306	}
1307}
1308
1309static void mce_init_timer(void)
1310{
1311	struct timer_list *t = &__get_cpu_var(mce_timer);
1312	int *n = &__get_cpu_var(next_interval);
1313
1314	if (mce_ignore_ce)
1315		return;
1316
1317	*n = check_interval * HZ;
1318	if (!*n)
1319		return;
1320	setup_timer(t, mcheck_timer, smp_processor_id());
1321	t->expires = round_jiffies(jiffies + *n);
1322	add_timer(t);
1323}
1324
1325/*
1326 * Called for each booted CPU to set up machine checks.
1327 * Must be called with preempt off:
1328 */
1329void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1330{
1331	if (mce_disabled)
1332		return;
1333
1334	mce_ancient_init(c);
1335
1336	if (!mce_available(c))
1337		return;
1338
1339	if (mce_cap_init() < 0) {
1340		mce_disabled = 1;
1341		return;
1342	}
1343	mce_cpu_quirks(c);
1344
1345	machine_check_vector = do_machine_check;
1346
1347	mce_init();
1348	mce_cpu_features(c);
1349	mce_init_timer();
1350	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1351}
1352
1353/*
1354 * Character device to read and clear the MCE log.
1355 */
1356
1357static DEFINE_SPINLOCK(mce_state_lock);
1358static int		open_count;		/* #times opened */
1359static int		open_exclu;		/* already open exclusive? */
1360
1361static int mce_open(struct inode *inode, struct file *file)
1362{
1363	spin_lock(&mce_state_lock);
1364
1365	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1366		spin_unlock(&mce_state_lock);
1367
1368		return -EBUSY;
1369	}
1370
1371	if (file->f_flags & O_EXCL)
1372		open_exclu = 1;
1373	open_count++;
1374
1375	spin_unlock(&mce_state_lock);
1376
1377	return nonseekable_open(inode, file);
1378}
1379
1380static int mce_release(struct inode *inode, struct file *file)
1381{
1382	spin_lock(&mce_state_lock);
1383
1384	open_count--;
1385	open_exclu = 0;
1386
1387	spin_unlock(&mce_state_lock);
1388
1389	return 0;
1390}
1391
1392static void collect_tscs(void *data)
1393{
1394	unsigned long *cpu_tsc = (unsigned long *)data;
1395
1396	rdtscll(cpu_tsc[smp_processor_id()]);
1397}
1398
1399static DEFINE_MUTEX(mce_read_mutex);
1400
1401static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1402			loff_t *off)
1403{
1404	char __user *buf = ubuf;
1405	unsigned long *cpu_tsc;
1406	unsigned prev, next;
1407	int i, err;
1408
1409	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1410	if (!cpu_tsc)
1411		return -ENOMEM;
1412
1413	mutex_lock(&mce_read_mutex);
1414	next = rcu_dereference(mcelog.next);
1415
1416	/* Only supports full reads right now */
1417	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1418		mutex_unlock(&mce_read_mutex);
1419		kfree(cpu_tsc);
1420
1421		return -EINVAL;
1422	}
1423
1424	err = 0;
1425	prev = 0;
1426	do {
1427		for (i = prev; i < next; i++) {
1428			unsigned long start = jiffies;
1429
1430			while (!mcelog.entry[i].finished) {
1431				if (time_after_eq(jiffies, start + 2)) {
1432					memset(mcelog.entry + i, 0,
1433					       sizeof(struct mce));
1434					goto timeout;
1435				}
1436				cpu_relax();
1437			}
1438			smp_rmb();
1439			err |= copy_to_user(buf, mcelog.entry + i,
1440					    sizeof(struct mce));
1441			buf += sizeof(struct mce);
1442timeout:
1443			;
1444		}
1445
1446		memset(mcelog.entry + prev, 0,
1447		       (next - prev) * sizeof(struct mce));
1448		prev = next;
1449		next = cmpxchg(&mcelog.next, prev, 0);
1450	} while (next != prev);
1451
1452	synchronize_sched();
1453
1454	/*
1455	 * Collect entries that were still getting written before the
1456	 * synchronize.
1457	 */
1458	on_each_cpu(collect_tscs, cpu_tsc, 1);
1459
1460	for (i = next; i < MCE_LOG_LEN; i++) {
1461		if (mcelog.entry[i].finished &&
1462		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1463			err |= copy_to_user(buf, mcelog.entry+i,
1464					    sizeof(struct mce));
1465			smp_rmb();
1466			buf += sizeof(struct mce);
1467			memset(&mcelog.entry[i], 0, sizeof(struct mce));
1468		}
1469	}
1470	mutex_unlock(&mce_read_mutex);
1471	kfree(cpu_tsc);
1472
1473	return err ? -EFAULT : buf - ubuf;
1474}
1475
1476static unsigned int mce_poll(struct file *file, poll_table *wait)
1477{
1478	poll_wait(file, &mce_wait, wait);
1479	if (rcu_dereference(mcelog.next))
1480		return POLLIN | POLLRDNORM;
1481	return 0;
1482}
1483
1484static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1485{
1486	int __user *p = (int __user *)arg;
1487
1488	if (!capable(CAP_SYS_ADMIN))
1489		return -EPERM;
1490
1491	switch (cmd) {
1492	case MCE_GET_RECORD_LEN:
1493		return put_user(sizeof(struct mce), p);
1494	case MCE_GET_LOG_LEN:
1495		return put_user(MCE_LOG_LEN, p);
1496	case MCE_GETCLEAR_FLAGS: {
1497		unsigned flags;
1498
1499		do {
1500			flags = mcelog.flags;
1501		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1502
1503		return put_user(flags, p);
1504	}
1505	default:
1506		return -ENOTTY;
1507	}
1508}
1509
1510/* Modified in mce-inject.c, so not static or const */
1511struct file_operations mce_chrdev_ops = {
1512	.open			= mce_open,
1513	.release		= mce_release,
1514	.read			= mce_read,
1515	.poll			= mce_poll,
1516	.unlocked_ioctl		= mce_ioctl,
1517};
1518EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1519
1520static struct miscdevice mce_log_device = {
1521	MISC_MCELOG_MINOR,
1522	"mcelog",
1523	&mce_chrdev_ops,
1524};
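/*
 * Minimal userspace consumer sketch for /dev/mcelog (illustrative only;
 * the real consumer is the mcelog(8) daemon). It assumes the usual
 * open/read/ioctl headers plus the struct mce, MCE_LOG_LEN and ioctl
 * definitions exported from asm/mce.h:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recordlen, loglen;
 *
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc(recordlen * loglen);
 *	// mce_read() only accepts buffers covering the full log
 *	int n = read(fd, buf, recordlen * loglen);
 *	// n / recordlen finished records are now in buf
 */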
1525
1526/*
1527 * mce=off Disables machine check
1528 * mce=no_cmci Disables CMCI
1529 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1530 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1531 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1532 *	monarchtimeout is how long to wait for other CPUs on machine
1533 *	check, or 0 to not wait
1534 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1535 * mce=nobootlog Don't log MCEs from before booting.
1536 */
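/*
 * Example (illustrative): booting with "mce=2,500000" sets tolerant to 2
 * and monarch_timeout to 500000. mce_start() scales monarch_timeout by
 * NSEC_PER_USEC, i.e. it is in microseconds, so the CPUs here would wait
 * up to half a second for each other during a machine check.
 */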
1537static int __init mcheck_enable(char *str)
1538{
1539	if (*str == 0)
1540		enable_p5_mce();
1541	if (*str == '=')
1542		str++;
1543	if (!strcmp(str, "off"))
1544		mce_disabled = 1;
1545	else if (!strcmp(str, "no_cmci"))
1546		mce_cmci_disabled = 1;
1547	else if (!strcmp(str, "dont_log_ce"))
1548		mce_dont_log_ce = 1;
1549	else if (!strcmp(str, "ignore_ce"))
1550		mce_ignore_ce = 1;
1551	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1552		mce_bootlog = (str[0] == 'b');
1553	else if (isdigit(str[0])) {
1554		get_option(&str, &tolerant);
1555		if (*str == ',') {
1556			++str;
1557			get_option(&str, &monarch_timeout);
1558		}
1559	} else {
1560		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1561		       str);
1562		return 0;
1563	}
1564	return 1;
1565}
1566__setup("mce", mcheck_enable);
1567
1568/*
1569 * Sysfs support
1570 */
1571
1572/*
1573 * Disable machine checks on suspend and shutdown. We can't really handle
1574 * them later.
1575 */
1576static int mce_disable(void)
1577{
1578	int i;
1579
1580	for (i = 0; i < banks; i++) {
1581		if (!skip_bank_init(i))
1582			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1583	}
1584	return 0;
1585}
1586
1587static int mce_suspend(struct sys_device *dev, pm_message_t state)
1588{
1589	return mce_disable();
1590}
1591
1592static int mce_shutdown(struct sys_device *dev)
1593{
1594	return mce_disable();
1595}
1596
1597/*
1598 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1599 * Only one CPU is active at this time, the others get re-added later using
1600 * CPU hotplug:
1601 */
1602static int mce_resume(struct sys_device *dev)
1603{
1604	mce_init();
1605	mce_cpu_features(&current_cpu_data);
1606
1607	return 0;
1608}
1609
1610static void mce_cpu_restart(void *data)
1611{
1612	del_timer_sync(&__get_cpu_var(mce_timer));
1613	if (!mce_available(&current_cpu_data))
1614		return;
1615	mce_init();
1616	mce_init_timer();
1617}
1618
1619/* Reinit MCEs after user configuration changes */
1620static void mce_restart(void)
1621{
1622	on_each_cpu(mce_cpu_restart, NULL, 1);
1623}
1624
1625/* Toggle features for corrected errors */
1626static void mce_disable_ce(void *all)
1627{
1628	if (!mce_available(&current_cpu_data))
1629		return;
1630	if (all)
1631		del_timer_sync(&__get_cpu_var(mce_timer));
1632	cmci_clear();
1633}
1634
1635static void mce_enable_ce(void *all)
1636{
1637	if (!mce_available(&current_cpu_data))
1638		return;
1639	cmci_reenable();
1640	cmci_recheck();
1641	if (all)
1642		mce_init_timer();
1643}
1644
1645static struct sysdev_class mce_sysclass = {
1646	.suspend	= mce_suspend,
1647	.shutdown	= mce_shutdown,
1648	.resume		= mce_resume,
1649	.name		= "machinecheck",
1650};
1651
1652DEFINE_PER_CPU(struct sys_device, mce_dev);
1653
1654__cpuinitdata
1655void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1656
1657static struct sysdev_attribute *bank_attrs;
1658
1659static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1660			 char *buf)
1661{
1662	u64 b = bank[attr - bank_attrs];
1663
1664	return sprintf(buf, "%llx\n", b);
1665}
1666
1667static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1668			const char *buf, size_t size)
1669{
1670	u64 new;
1671
1672	if (strict_strtoull(buf, 0, &new) < 0)
1673		return -EINVAL;
1674
1675	bank[attr - bank_attrs] = new;
1676	mce_restart();
1677
1678	return size;
1679}
1680
1681static ssize_t
1682show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1683{
1684	strcpy(buf, mce_helper);
1685	strcat(buf, "\n");
1686	return strlen(mce_helper) + 1;
1687}
1688
1689static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1690				const char *buf, size_t siz)
1691{
1692	char *p;
1693	int len;
1694
1695	strncpy(mce_helper, buf, sizeof(mce_helper));
1696	mce_helper[sizeof(mce_helper)-1] = 0;
1697	len = strlen(mce_helper);
1698	p = strchr(mce_helper, '\n');
1699
1700	if (p)
1701		*p = 0;
1702
1703	return len;
1704}
1705
1706static ssize_t set_ignore_ce(struct sys_device *s,
1707			     struct sysdev_attribute *attr,
1708			     const char *buf, size_t size)
1709{
1710	u64 new;
1711
1712	if (strict_strtoull(buf, 0, &new) < 0)
1713		return -EINVAL;
1714
1715	if (mce_ignore_ce ^ !!new) {
1716		if (new) {
1717			/* disable ce features */
1718			on_each_cpu(mce_disable_ce, (void *)1, 1);
1719			mce_ignore_ce = 1;
1720		} else {
1721			/* enable ce features */
1722			mce_ignore_ce = 0;
1723			on_each_cpu(mce_enable_ce, (void *)1, 1);
1724		}
1725	}
1726	return size;
1727}
1728
1729static ssize_t set_cmci_disabled(struct sys_device *s,
1730				 struct sysdev_attribute *attr,
1731				 const char *buf, size_t size)
1732{
1733	u64 new;
1734
1735	if (strict_strtoull(buf, 0, &new) < 0)
1736		return -EINVAL;
1737
1738	if (mce_cmci_disabled ^ !!new) {
1739		if (new) {
1740			/* disable cmci */
1741			on_each_cpu(mce_disable_ce, NULL, 1);
1742			mce_cmci_disabled = 1;
1743		} else {
1744			/* enable cmci */
1745			mce_cmci_disabled = 0;
1746			on_each_cpu(mce_enable_ce, NULL, 1);
1747		}
1748	}
1749	return size;
1750}
1751
1752static ssize_t store_int_with_restart(struct sys_device *s,
1753				      struct sysdev_attribute *attr,
1754				      const char *buf, size_t size)
1755{
1756	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1757	mce_restart();
1758	return ret;
1759}
1760
1761static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1762static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1763static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1764static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1765
1766static struct sysdev_ext_attribute attr_check_interval = {
1767	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1768		     store_int_with_restart),
1769	&check_interval
1770};
1771
1772static struct sysdev_ext_attribute attr_ignore_ce = {
1773	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1774	&mce_ignore_ce
1775};
1776
1777static struct sysdev_ext_attribute attr_cmci_disabled = {
1778	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1779	&mce_cmci_disabled
1780};
1781
1782static struct sysdev_attribute *mce_attrs[] = {
1783	&attr_tolerant.attr,
1784	&attr_check_interval.attr,
1785	&attr_trigger,
1786	&attr_monarch_timeout.attr,
1787	&attr_dont_log_ce.attr,
1788	&attr_ignore_ce.attr,
1789	&attr_cmci_disabled.attr,
1790	NULL
1791};
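/*
 * These attributes show up per CPU under the "machinecheck" sysdev class
 * registered below; with the usual sysdev layout that is, for example:
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/trigger
 *	/sys/devices/system/machinecheck/machinecheck0/bank<N>
 *
 * Writes to check_interval and the bank controls go through mce_restart()
 * so the new values take effect on all CPUs.
 */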
1792
1793static cpumask_var_t mce_dev_initialized;
1794
1795/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1796static __cpuinit int mce_create_device(unsigned int cpu)
1797{
1798	int err;
1799	int i, j;
1800
1801	if (!mce_available(&boot_cpu_data))
1802		return -EIO;
1803
1804	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1805	per_cpu(mce_dev, cpu).id	= cpu;
1806	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1807
1808	err = sysdev_register(&per_cpu(mce_dev, cpu));
1809	if (err)
1810		return err;
1811
1812	for (i = 0; mce_attrs[i]; i++) {
1813		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1814		if (err)
1815			goto error;
1816	}
1817	for (j = 0; j < banks; j++) {
1818		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1819					&bank_attrs[j]);
1820		if (err)
1821			goto error2;
1822	}
1823	cpumask_set_cpu(cpu, mce_dev_initialized);
1824
1825	return 0;
1826error2:
1827	while (--j >= 0)
1828		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
1829error:
1830	while (--i >= 0)
1831		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1832
1833	sysdev_unregister(&per_cpu(mce_dev, cpu));
1834
1835	return err;
1836}
1837
1838static __cpuinit void mce_remove_device(unsigned int cpu)
1839{
1840	int i;
1841
1842	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1843		return;
1844
1845	for (i = 0; mce_attrs[i]; i++)
1846		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1847
1848	for (i = 0; i < banks; i++)
1849		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1850
1851	sysdev_unregister(&per_cpu(mce_dev, cpu));
1852	cpumask_clear_cpu(cpu, mce_dev_initialized);
1853}
1854
1855/* Make sure there are no machine checks on offlined CPUs. */
1856static void mce_disable_cpu(void *h)
1857{
1858	unsigned long action = *(unsigned long *)h;
1859	int i;
1860
1861	if (!mce_available(&current_cpu_data))
1862		return;
1863	if (!(action & CPU_TASKS_FROZEN))
1864		cmci_clear();
1865	for (i = 0; i < banks; i++) {
1866		if (!skip_bank_init(i))
1867			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1868	}
1869}
1870
1871static void mce_reenable_cpu(void *h)
1872{
1873	unsigned long action = *(unsigned long *)h;
1874	int i;
1875
1876	if (!mce_available(&current_cpu_data))
1877		return;
1878
1879	if (!(action & CPU_TASKS_FROZEN))
1880		cmci_reenable();
1881	for (i = 0; i < banks; i++) {
1882		if (!skip_bank_init(i))
1883			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1884	}
1885}
1886
1887/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1888static int __cpuinit
1889mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1890{
1891	unsigned int cpu = (unsigned long)hcpu;
1892	struct timer_list *t = &per_cpu(mce_timer, cpu);
1893
1894	switch (action) {
1895	case CPU_ONLINE:
1896	case CPU_ONLINE_FROZEN:
1897		mce_create_device(cpu);
1898		if (threshold_cpu_callback)
1899			threshold_cpu_callback(action, cpu);
1900		break;
1901	case CPU_DEAD:
1902	case CPU_DEAD_FROZEN:
1903		if (threshold_cpu_callback)
1904			threshold_cpu_callback(action, cpu);
1905		mce_remove_device(cpu);
1906		break;
1907	case CPU_DOWN_PREPARE:
1908	case CPU_DOWN_PREPARE_FROZEN:
1909		del_timer_sync(t);
1910		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1911		break;
1912	case CPU_DOWN_FAILED:
1913	case CPU_DOWN_FAILED_FROZEN:
1914		t->expires = round_jiffies(jiffies +
1915						__get_cpu_var(next_interval));
1916		add_timer_on(t, cpu);
1917		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1918		break;
1919	case CPU_POST_DEAD:
1920		/* intentionally ignoring frozen here */
1921		cmci_rediscover(cpu);
1922		break;
1923	}
1924	return NOTIFY_OK;
1925}
1926
1927static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1928	.notifier_call = mce_cpu_callback,
1929};
1930
1931static __init int mce_init_banks(void)
1932{
1933	int i;
1934
1935	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1936				GFP_KERNEL);
1937	if (!bank_attrs)
1938		return -ENOMEM;
1939
1940	for (i = 0; i < banks; i++) {
1941		struct sysdev_attribute *a = &bank_attrs[i];
1942
1943		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1944		if (!a->attr.name)
1945			goto nomem;
1946
1947		a->attr.mode	= 0644;
1948		a->show		= show_bank;
1949		a->store	= set_bank;
1950	}
1951	return 0;
1952
1953nomem:
1954	while (--i >= 0)
1955		kfree(bank_attrs[i].attr.name);
1956	kfree(bank_attrs);
1957	bank_attrs = NULL;
1958
1959	return -ENOMEM;
1960}
1961
1962static __init int mce_init_device(void)
1963{
1964	int err;
1965	int i = 0;
1966
1967	if (!mce_available(&boot_cpu_data))
1968		return -EIO;
1969
1970	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1971
1972	err = mce_init_banks();
1973	if (err)
1974		return err;
1975
1976	err = sysdev_class_register(&mce_sysclass);
1977	if (err)
1978		return err;
1979
1980	for_each_online_cpu(i) {
1981		err = mce_create_device(i);
1982		if (err)
1983			return err;
1984	}
1985
1986	register_hotcpu_notifier(&mce_cpu_notifier);
1987	misc_register(&mce_log_device);
1988
1989	return err;
1990}
1991
1992device_initcall(mce_init_device);
1993
1994/*
1995 * Old style boot options parsing. Only for compatibility.
1996 */
1997static int __init mcheck_disable(char *str)
1998{
1999	mce_disabled = 1;
2000	return 1;
2001}
2002__setup("nomce", mcheck_disable);
2003