mce.c revision ac9603754dc7e286e62ae4f1067958d5b0075f99
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36
37#include <asm/processor.h>
38#include <asm/hw_irq.h>
39#include <asm/apic.h>
40#include <asm/idle.h>
41#include <asm/ipi.h>
42#include <asm/mce.h>
43#include <asm/msr.h>
44
45#include "mce-internal.h"
46#include "mce.h"
47
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52	       smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57						unexpected_machine_check;
58
59int				mce_disabled;
60
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR	227
64
65#define SPINUNIT 100	/* 100ns */
66
67atomic_t mce_entry;
68
69DEFINE_PER_CPU(unsigned, mce_exception_count);
70
71/*
72 * Tolerant levels:
73 *   0: always panic on uncorrected errors, log corrected errors
74 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
75 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
76 *   3: never panic or SIGBUS, log all errors (for testing only)
77 */
78static int			tolerant = 1;
79static int			banks;
80static u64			*bank;
81static unsigned long		notify_user;
82static int			rip_msr;
83static int			mce_bootlog = -1;
84static int			monarch_timeout = -1;
85
86static char			trigger[128];
87static char			*trigger_argv[2] = { trigger, NULL };
88
89static unsigned long		dont_init_banks;
90
91static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
92static DEFINE_PER_CPU(struct mce, mces_seen);
93static int			cpu_missing;
94
95
96/* MCA banks polled by the period polling timer for corrected events */
97DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
98	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
99};
100
101static inline int skip_bank_init(int i)
102{
103	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
104}
105
106/* Do initial initialization of a struct mce */
107void mce_setup(struct mce *m)
108{
109	memset(m, 0, sizeof(struct mce));
110	m->cpu = m->extcpu = smp_processor_id();
111	rdtscll(m->tsc);
112	/* We hope get_seconds stays lockless */
113	m->time = get_seconds();
114	m->cpuvendor = boot_cpu_data.x86_vendor;
115	m->cpuid = cpuid_eax(1);
116#ifdef CONFIG_SMP
117	m->socketid = cpu_data(m->extcpu).phys_proc_id;
118#endif
119	m->apicid = cpu_data(m->extcpu).initial_apicid;
120	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
121}
122
123DEFINE_PER_CPU(struct mce, injectm);
124EXPORT_PER_CPU_SYMBOL_GPL(injectm);
125
126/*
127 * Lockless MCE logging infrastructure.
128 * This avoids deadlocks on printk locks without having to break locks. It
129 * also separates MCEs from kernel messages to avoid bogus bug reports.
130 */
131
132static struct mce_log mcelog = {
133	.signature	= MCE_LOG_SIGNATURE,
134	.len		= MCE_LOG_LEN,
135	.recordlen	= sizeof(struct mce),
136};
137
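/*
 * Append one record to the lockless log: reserve a slot by advancing
 * mcelog.next with cmpxchg, copy the record in, and only then set its
 * finished flag so that readers see a complete entry. When the buffer is
 * full the record is dropped and the MCE_OVERFLOW flag is set instead.
 */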
138void mce_log(struct mce *mce)
139{
140	unsigned next, entry;
141
142	mce->finished = 0;
143	wmb();
144	for (;;) {
145		entry = rcu_dereference(mcelog.next);
146		for (;;) {
147			/*
148			 * When the buffer fills up discard new entries.
149			 * Assume that the earlier errors are the more
150			 * interesting ones:
151			 */
152			if (entry >= MCE_LOG_LEN) {
153				set_bit(MCE_OVERFLOW,
154					(unsigned long *)&mcelog.flags);
155				return;
156			}
157			/* Old left over entry. Skip: */
158			if (mcelog.entry[entry].finished) {
159				entry++;
160				continue;
161			}
162			break;
163		}
164		smp_rmb();
165		next = entry + 1;
166		if (cmpxchg(&mcelog.next, entry, next) == entry)
167			break;
168	}
169	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
170	wmb();
171	mcelog.entry[entry].finished = 1;
172	wmb();
173
174	mce->finished = 1;
175	set_bit(0, &notify_user);
176}
177
178static void print_mce(struct mce *m)
179{
180	printk(KERN_EMERG "\n"
181	       KERN_EMERG "HARDWARE ERROR\n"
182	       KERN_EMERG
183	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
184	       m->extcpu, m->mcgstatus, m->bank, m->status);
185	if (m->ip) {
186		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
187		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
188		       m->cs, m->ip);
189		if (m->cs == __KERNEL_CS)
190			print_symbol("{%s}", m->ip);
191		printk("\n");
192	}
193	printk(KERN_EMERG "TSC %llx ", m->tsc);
194	if (m->addr)
195		printk("ADDR %llx ", m->addr);
196	if (m->misc)
197		printk("MISC %llx ", m->misc);
198	printk("\n");
199	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
200			m->cpuvendor, m->cpuid, m->time, m->socketid,
201			m->apicid);
202	printk(KERN_EMERG "This is not a software problem!\n");
203	printk(KERN_EMERG "Run through mcelog --ascii to decode "
204	       "and contact your hardware vendor\n");
205}
206
207#define PANIC_TIMEOUT 5 /* 5 seconds */
208
209static atomic_t mce_paniced;
210
211/* Panic in progress. Enable interrupts and wait for final IPI */
212static void wait_for_panic(void)
213{
214	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
215	preempt_disable();
216	local_irq_enable();
217	while (timeout-- > 0)
218		udelay(1);
219	panic("Panicking machine check CPU died");
220}
221
222static void mce_panic(char *msg, struct mce *final, char *exp)
223{
224	int i;
225
226	/*
227	 * Make sure only one CPU runs in machine check panic
228	 */
229	if (atomic_add_return(1, &mce_paniced) > 1)
230		wait_for_panic();
231	barrier();
232
233	bust_spinlocks(1);
234	console_verbose();
235	/* First print corrected ones that are still unlogged */
236	for (i = 0; i < MCE_LOG_LEN; i++) {
237		struct mce *m = &mcelog.entry[i];
238		if ((m->status & MCI_STATUS_VAL) &&
239			!(m->status & MCI_STATUS_UC))
240			print_mce(m);
241	}
242	/* Now print uncorrected ones, with the final one last */
243	for (i = 0; i < MCE_LOG_LEN; i++) {
244		struct mce *m = &mcelog.entry[i];
245		if (!(m->status & MCI_STATUS_VAL))
246			continue;
247		if (!final || memcmp(m, final, sizeof(struct mce)))
248			print_mce(m);
249	}
250	if (final)
251		print_mce(final);
252	if (cpu_missing)
253		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
254	if (exp)
255		printk(KERN_EMERG "Machine check: %s\n", exp);
256	panic(msg);
257}
258
259/* Support code for software error injection */
260
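/*
 * Map an MCE-related MSR to the offset of the matching field in struct mce,
 * so that reads and writes can be redirected into the per-CPU injectm
 * record while an injection is in progress. Returns -1 for MSRs that are
 * not emulated.
 */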
261static int msr_to_offset(u32 msr)
262{
263	unsigned bank = __get_cpu_var(injectm.bank);
264	if (msr == rip_msr)
265		return offsetof(struct mce, ip);
266	if (msr == MSR_IA32_MC0_STATUS + bank*4)
267		return offsetof(struct mce, status);
268	if (msr == MSR_IA32_MC0_ADDR + bank*4)
269		return offsetof(struct mce, addr);
270	if (msr == MSR_IA32_MC0_MISC + bank*4)
271		return offsetof(struct mce, misc);
272	if (msr == MSR_IA32_MCG_STATUS)
273		return offsetof(struct mce, mcgstatus);
274	return -1;
275}
276
277/* MSR access wrappers used for error injection */
278static u64 mce_rdmsrl(u32 msr)
279{
280	u64 v;
281	if (__get_cpu_var(injectm).finished) {
282		int offset = msr_to_offset(msr);
283		if (offset < 0)
284			return 0;
285		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
286	}
287	rdmsrl(msr, v);
288	return v;
289}
290
291static void mce_wrmsrl(u32 msr, u64 v)
292{
293	if (__get_cpu_var(injectm).finished) {
294		int offset = msr_to_offset(msr);
295		if (offset >= 0)
296			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
297		return;
298	}
299	wrmsrl(msr, v);
300}
301
302int mce_available(struct cpuinfo_x86 *c)
303{
304	if (mce_disabled)
305		return 0;
306	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
307}
308
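/*
 * Record the IP/CS of the interrupted context when MCG_STATUS.RIPV says
 * the saved RIP is valid; prefer the architectural rIP MSR when the CPU
 * provides one, in which case EIPV is forced on.
 */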
309static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
310{
311	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
312		m->ip = regs->ip;
313		m->cs = regs->cs;
314	} else {
315		m->ip = 0;
316		m->cs = 0;
317	}
318	if (rip_msr) {
319		/* Assume the RIP in the MSR is exact. Is this true? */
320		m->mcgstatus |= MCG_STATUS_EIPV;
321		m->ip = mce_rdmsrl(rip_msr);
322		m->cs = 0;
323	}
324}
325
326#ifdef CONFIG_X86_LOCAL_APIC
327/*
328 * Called after interrupts have been reenabled again
329 * when an MCE happened during an interrupts-off region
330 * in the kernel.
331 */
332asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
333{
334	ack_APIC_irq();
335	exit_idle();
336	irq_enter();
337	mce_notify_user();
338	irq_exit();
339}
340#endif
341
342static void mce_report_event(struct pt_regs *regs)
343{
344	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
345		mce_notify_user();
346		return;
347	}
348
349#ifdef CONFIG_X86_LOCAL_APIC
350	/*
351	 * Without an APIC do not notify. The event will be picked
352	 * up eventually.
353	 */
354	if (!cpu_has_apic)
355		return;
356
357	/*
358	 * When interrupts are disabled we cannot use
359	 * kernel services safely. Trigger a self interrupt
360	 * through the APIC so that the notification is done
361	 * after interrupts are reenabled again.
362	 */
363	apic->send_IPI_self(MCE_SELF_VECTOR);
364
365	/*
366	 * Wait for idle afterwards again so that we don't leave the
367	 * APIC in a non-idle state because the normal APIC writes
368	 * cannot exclude us.
369	 */
370	apic_wait_icr_idle();
371#endif
372}
373
374DEFINE_PER_CPU(unsigned, mce_poll_count);
375
376/*
377 * Poll for corrected events or events that happened before reset.
378 * Those are just logged through /dev/mcelog.
379 *
380 * This is executed in standard interrupt context.
381 */
382void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
383{
384	struct mce m;
385	int i;
386
387	__get_cpu_var(mce_poll_count)++;
388
389	mce_setup(&m);
390
391	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
392	for (i = 0; i < banks; i++) {
393		if (!bank[i] || !test_bit(i, *b))
394			continue;
395
396		m.misc = 0;
397		m.addr = 0;
398		m.bank = i;
399		m.tsc = 0;
400
401		barrier();
402		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
403		if (!(m.status & MCI_STATUS_VAL))
404			continue;
405
406		/*
407		 * Uncorrected events are handled by the exception handler
408		 * when it is enabled. But when the exception handler is
409		 * disabled, log everything.
410		 *
411		 * TBD do the same check for MCI_STATUS_EN here?
412		 */
413		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
414			continue;
415
416		if (m.status & MCI_STATUS_MISCV)
417			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
418		if (m.status & MCI_STATUS_ADDRV)
419			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
420
421		if (!(flags & MCP_TIMESTAMP))
422			m.tsc = 0;
423		/*
424		 * Don't get the IP here because it's unlikely to
425		 * have anything to do with the actual error location.
426		 */
427		if (!(flags & MCP_DONTLOG)) {
428			mce_log(&m);
429			add_taint(TAINT_MACHINE_CHECK);
430		}
431
432		/*
433		 * Clear state for this bank.
434		 */
435		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
436	}
437
438	/*
439	 * Don't clear MCG_STATUS here because it's only defined for
440	 * exceptions.
441	 */
442
443	sync_core();
444}
445EXPORT_SYMBOL_GPL(machine_check_poll);
446
447/*
448 * Do a quick check if any of the events requires a panic.
449 * This decides if we keep the events around or clear them.
450 */
451static int mce_no_way_out(struct mce *m, char **msg)
452{
453	int i;
454
455	for (i = 0; i < banks; i++) {
456		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
457		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
458			return 1;
459	}
460	return 0;
461}
462
463/*
464 * Variable to establish order between CPUs while scanning.
465 * Each CPU spins initially until mce_executing equals its number.
466 */
467static atomic_t mce_executing;
468
469/*
470 * Defines order of CPUs on entry. First CPU becomes Monarch.
471 */
472static atomic_t mce_callin;
473
474/*
475 * Check if a timeout waiting for other CPUs happened.
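 * Returns 1 when the caller should give up waiting, 0 to keep spinning.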
476 */
477static int mce_timed_out(u64 *t)
478{
479	/*
480	 * The others already did panic for some reason.
481	 * Bail out like in a timeout.
482	 * rmb() to tell the compiler that mce_paniced
483	 * might have been modified by someone else.
484	 */
485	rmb();
486	if (atomic_read(&mce_paniced))
487		wait_for_panic();
488	if (!monarch_timeout)
489		goto out;
490	if ((s64)*t < SPINUNIT) {
491		/* CHECKME: Make panic default for 1 too? */
492		if (tolerant < 1)
493			mce_panic("Timeout synchronizing machine check over CPUs",
494				  NULL, NULL);
495		cpu_missing = 1;
496		return 1;
497	}
498	*t -= SPINUNIT;
499out:
500	touch_nmi_watchdog();
501	return 0;
502}
503
504/*
505 * The Monarch's reign.  The Monarch is the CPU who entered
506 * the machine check handler first. It waits for the others to
507 * raise the exception too and then grades them. If any error
508 * is fatal, panic. Only then let the others continue.
509 *
510 * The other CPUs entering the MCE handler will be controlled by the
511 * Monarch. They are called Subjects.
512 *
513 * This way we prevent any potential data corruption in an unrecoverable case
514 * and also make sure that all CPUs' errors are always examined.
515 *
516 * This also detects the case of a machine check event coming from outer
517 * space (not detected by any CPU). In this case some external agent wants
518 * us to shut down, so panic too.
519 *
520 * The other CPUs might still decide to panic if the handler happens
521 * in an unrecoverable place, but in this case the system is in a semi-stable
522 * state and won't corrupt anything by itself. It's ok to let the others
523 * continue for a bit first.
524 *
525 * All the spin loops have timeouts; when a timeout happens a CPU
526 * typically elects itself to be Monarch.
527 */
528static void mce_reign(void)
529{
530	int cpu;
531	struct mce *m = NULL;
532	int global_worst = 0;
533	char *msg = NULL;
534	char *nmsg = NULL;
535
536	/*
537	 * This CPU is the Monarch and the other CPUs have run
538	 * through their handlers.
539	 * Grade the severity of the errors of all the CPUs.
540	 */
541	for_each_possible_cpu(cpu) {
542		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
543					    &nmsg);
544		if (severity > global_worst) {
545			msg = nmsg;
546			global_worst = severity;
547			m = &per_cpu(mces_seen, cpu);
548		}
549	}
550
551	/*
552	 * Cannot recover? Panic here then.
553	 * This dumps all the mces in the log buffer and stops the
554	 * other CPUs.
555	 */
556	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
557		mce_panic("Fatal Machine check", m, msg);
558
559	/*
560 * For a UC error somewhere we let the CPU that detected it handle it.
561 * The others must also be allowed to continue, otherwise the handling
562 * CPU could deadlock on a lock.
563	 */
564
565	/*
566	 * No machine check event found. Must be some external
567	 * source or one CPU is hung. Panic.
568	 */
569	if (!m && tolerant < 3)
570		mce_panic("Machine check from unknown source", NULL, NULL);
571
572	/*
573	 * Now clear all the mces_seen so that they don't reappear on
574	 * the next mce.
575	 */
576	for_each_possible_cpu(cpu)
577		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
578}
579
580static atomic_t global_nwo;
581
582/*
583 * Start of Monarch synchronization. This waits until all CPUs have
584 * entered the exception handler and then determines if any of them
585 * saw a fatal event that requires a panic. Then it lets them execute
586 * in their entry order.
587 * TBD double check parallel CPU hotunplug
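 *
 * Returns the global no_way_out state, or the local one when
 * synchronization is skipped or abandoned; in that case *order is set
 * to -1. Otherwise *order is this CPU's callin position (1 == Monarch).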
588 */
589static int mce_start(int no_way_out, int *order)
590{
591	int nwo;
592	int cpus = num_online_cpus();
593	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
594
595	if (!timeout) {
596		*order = -1;
597		return no_way_out;
598	}
599
600	atomic_add(no_way_out, &global_nwo);
601
602	/*
603	 * Wait for everyone.
604	 */
605	while (atomic_read(&mce_callin) != cpus) {
606		if (mce_timed_out(&timeout)) {
607			atomic_set(&global_nwo, 0);
608			*order = -1;
609			return no_way_out;
610		}
611		ndelay(SPINUNIT);
612	}
613
614	/*
615	 * Cache the global no_way_out state.
616	 */
617	nwo = atomic_read(&global_nwo);
618
619	/*
620	 * Monarch starts executing now, the others wait.
621	 */
622	if (*order == 1) {
623		atomic_set(&mce_executing, 1);
624		return nwo;
625	}
626
627	/*
628	 * Now start the scanning loop one by one
629	 * in the original callin order.
630 * This way, an event in a shared bank is seen by only
631 * one CPU before it is cleared, avoiding duplicates.
632	 */
633	while (atomic_read(&mce_executing) < *order) {
634		if (mce_timed_out(&timeout)) {
635			atomic_set(&global_nwo, 0);
636			*order = -1;
637			return no_way_out;
638		}
639		ndelay(SPINUNIT);
640	}
641	return nwo;
642}
643
644/*
645 * Synchronize between CPUs after main scanning loop.
646 * This invokes the bulk of the Monarch processing.
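 * Returns 0 on success, -1 when synchronization timed out or was never
 * entered; the caller then falls back to its own local state.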
647 */
648static int mce_end(int order)
649{
650	int ret = -1;
651	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
652
653	if (!timeout)
654		goto reset;
655	if (order < 0)
656		goto reset;
657
658	/*
659	 * Allow others to run.
660	 */
661	atomic_inc(&mce_executing);
662
663	if (order == 1) {
664		/* CHECKME: Can this race with a parallel hotplug? */
665		int cpus = num_online_cpus();
666
667		/*
668		 * Monarch: Wait for everyone to go through their scanning
669		 * loops.
670		 */
671		while (atomic_read(&mce_executing) <= cpus) {
672			if (mce_timed_out(&timeout))
673				goto reset;
674			ndelay(SPINUNIT);
675		}
676
677		mce_reign();
678		barrier();
679		ret = 0;
680	} else {
681		/*
682		 * Subject: Wait for Monarch to finish.
683		 */
684		while (atomic_read(&mce_executing) != 0) {
685			if (mce_timed_out(&timeout))
686				goto reset;
687			ndelay(SPINUNIT);
688		}
689
690		/*
691		 * Don't reset anything. That's done by the Monarch.
692		 */
693		return 0;
694	}
695
696	/*
697	 * Reset all global state.
698	 */
699reset:
700	atomic_set(&global_nwo, 0);
701	atomic_set(&mce_callin, 0);
702	barrier();
703
704	/*
705	 * Let others run again.
706	 */
707	atomic_set(&mce_executing, 0);
708	return ret;
709}
710
711static void mce_clear_state(unsigned long *toclear)
712{
713	int i;
714
715	for (i = 0; i < banks; i++) {
716		if (test_bit(i, toclear))
717			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
718	}
719}
720
721/*
722 * The actual machine check handler. This only handles real
723 * exceptions when something got corrupted coming in through int 18.
724 *
725 * This is executed in NMI context, not subject to normal locking rules. This
726 * implies that most kernel services cannot be safely used. Don't even
727 * think about putting a printk in there!
728 *
729 * On Intel systems this is entered on all CPUs in parallel through
730 * MCE broadcast. However, some CPUs might be broken beyond repair,
731 * so always be careful when synchronizing with others.
732 */
733void do_machine_check(struct pt_regs *regs, long error_code)
734{
735	struct mce m, *final;
736	int i;
737	int worst = 0;
738	int severity;
739	/*
740	 * Establish sequential order between the CPUs entering the machine
741	 * check handler.
742	 */
743	int order;
744
745	/*
746	 * If no_way_out gets set, there is no safe way to recover from this
747	 * MCE.  If tolerant is cranked up, we'll try anyway.
748	 */
749	int no_way_out = 0;
750	/*
751	 * If kill_it gets set, there might be a way to recover from this
752	 * error.
753	 */
754	int kill_it = 0;
755	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
756	char *msg = "Unknown";
757
758	atomic_inc(&mce_entry);
759
760	__get_cpu_var(mce_exception_count)++;
761
762	if (notify_die(DIE_NMI, "machine check", regs, error_code,
763			   18, SIGKILL) == NOTIFY_STOP)
764		goto out;
765	if (!banks)
766		goto out;
767
768	order = atomic_add_return(1, &mce_callin);
769	mce_setup(&m);
770
771	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
772	no_way_out = mce_no_way_out(&m, &msg);
773
774	final = &__get_cpu_var(mces_seen);
775	*final = m;
776
777	barrier();
778
779	/*
780	 * Go through all the banks in exclusion of the other CPUs.
781	 * This way we don't report duplicated events on shared banks
782	 * because the first one to see it will clear it.
783	 */
784	no_way_out = mce_start(no_way_out, &order);
785	for (i = 0; i < banks; i++) {
786		__clear_bit(i, toclear);
787		if (!bank[i])
788			continue;
789
790		m.misc = 0;
791		m.addr = 0;
792		m.bank = i;
793
794		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
795		if ((m.status & MCI_STATUS_VAL) == 0)
796			continue;
797
798		/*
799		 * Non-uncorrected errors are handled by machine_check_poll.
800		 * Leave them alone, unless this panics.
801		 */
802		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
803			continue;
804
805		/*
806		 * Set taint even when machine check was not enabled.
807		 */
808		add_taint(TAINT_MACHINE_CHECK);
809
810		__set_bit(i, toclear);
811
812		if (m.status & MCI_STATUS_EN) {
813			/*
814			 * If this error was uncorrectable and there was
815			 * an overflow, we're in trouble.  If no overflow,
816			 * we might get away with just killing a task.
817			 */
818			if (m.status & MCI_STATUS_UC)
819				kill_it = 1;
820		} else {
821			/*
822			 * Machine check event was not enabled. Clear, but
823			 * ignore.
824			 */
825			continue;
826		}
827
828		if (m.status & MCI_STATUS_MISCV)
829			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
830		if (m.status & MCI_STATUS_ADDRV)
831			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
832
833		mce_get_rip(&m, regs);
834		mce_log(&m);
835
836		severity = mce_severity(&m, tolerant, NULL);
837		if (severity > worst) {
838			*final = m;
839			worst = severity;
840		}
841	}
842
843	if (!no_way_out)
844		mce_clear_state(toclear);
845
846	/*
847	 * Do most of the synchronization with other CPUs.
848	 * When there's any problem use only local no_way_out state.
849	 */
850	if (mce_end(order) < 0)
851		no_way_out = worst >= MCE_PANIC_SEVERITY;
852
853	/*
854	 * If we have decided that we just CAN'T continue, and the user
855	 * has not set tolerant to an insane level, give up and die.
856	 *
857	 * This is mainly used in the case when the system doesn't
858	 * support MCE broadcasting or it has been disabled.
859	 */
860	if (no_way_out && tolerant < 3)
861		mce_panic("Fatal machine check on current CPU", final, msg);
862
863	/*
864	 * If the error seems to be unrecoverable, something should be
865	 * done.  Try to kill as little as possible.  If we can kill just
866	 * one task, do that.  If the user has set the tolerance very
867	 * high, don't try to do anything at all.
868	 */
869	if (kill_it && tolerant < 3) {
870		int user_space = 0;
871
872		/*
873		 * If the EIPV bit is set, it means the saved IP is the
874		 * instruction which caused the MCE.
875		 */
876		if (m.mcgstatus & MCG_STATUS_EIPV)
877			user_space = final->ip && (final->cs & 3);
878
879		/*
880		 * If we know that the error was in user space, send a
881		 * SIGBUS.  Otherwise, panic if tolerance is low.
882		 *
883		 * force_sig() takes an awful lot of locks and has a slight
884		 * risk of deadlocking.
885		 */
886		if (user_space) {
887			force_sig(SIGBUS, current);
888		} else if (panic_on_oops || tolerant < 2) {
889			mce_panic("Uncorrected machine check", final, msg);
890		}
891	}
892
893	/* notify userspace ASAP */
894	set_thread_flag(TIF_MCE_NOTIFY);
895
896	if (worst > 0)
897		mce_report_event(regs);
898	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
899out:
900	atomic_dec(&mce_entry);
901	sync_core();
902}
903EXPORT_SYMBOL_GPL(do_machine_check);
904
905#ifdef CONFIG_X86_MCE_INTEL
906/**
907 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
908 * @cpu: The CPU on which the event occurred.
909 * @status: Event status information
910 *
911 * This function should be called by the thermal interrupt after the
912 * event has been processed and the decision was made to log the event
913 * further.
914 *
915 * The status parameter will be saved to the 'status' field of 'struct mce'
916 * and historically has been the register value of the
917 * MSR_IA32_THERMAL_STATUS (Intel) msr.
918 */
919void mce_log_therm_throt_event(__u64 status)
920{
921	struct mce m;
922
923	mce_setup(&m);
924	m.bank = MCE_THERMAL_BANK;
925	m.status = status;
926	mce_log(&m);
927}
928#endif /* CONFIG_X86_MCE_INTEL */
929
930/*
931 * Periodic polling timer for "silent" machine check errors.  If the
932 * poller finds an MCE, poll 2x faster.  When the poller finds no more
933 * errors, poll 2x slower (up to check_interval seconds).
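 * The interval never drops below HZ/100 jiffies (10 ms at HZ=1000).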
934 */
935static int check_interval = 5 * 60; /* 5 minutes */
936
937static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
938static DEFINE_PER_CPU(struct timer_list, mce_timer);
939
940static void mcheck_timer(unsigned long data)
941{
942	struct timer_list *t = &per_cpu(mce_timer, data);
943	int *n;
944
945	WARN_ON(smp_processor_id() != data);
946
947	if (mce_available(&current_cpu_data)) {
948		machine_check_poll(MCP_TIMESTAMP,
949				&__get_cpu_var(mce_poll_banks));
950	}
951
952	/*
953	 * Alert userspace if needed.  If we logged an MCE, reduce the
954	 * polling interval, otherwise increase the polling interval.
955	 */
956	n = &__get_cpu_var(next_interval);
957	if (mce_notify_user())
958		*n = max(*n/2, HZ/100);
959	else
960		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
961
962	t->expires = jiffies + *n;
963	add_timer(t);
964}
965
966static void mce_do_trigger(struct work_struct *work)
967{
968	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
969}
970
971static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
972
973/*
974 * Notify the user(s) about new machine check events.
975 * Can be called from interrupt context, but not from machine check/NMI
976 * context.
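 *
 * Wakes up anyone waiting on /dev/mcelog and, if a trigger program was
 * configured through sysfs, schedules it via the usermode helper.
 * Returns 1 if there were new events, 0 otherwise.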
977 */
978int mce_notify_user(void)
979{
980	/* Not more than two messages every minute */
981	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
982
983	clear_thread_flag(TIF_MCE_NOTIFY);
984
985	if (test_and_clear_bit(0, &notify_user)) {
986		wake_up_interruptible(&mce_wait);
987
988		/*
989		 * There is no risk of missing notifications because
990		 * work_pending is always cleared before the function is
991		 * executed.
992		 */
993		if (trigger[0] && !work_pending(&mce_trigger_work))
994			schedule_work(&mce_trigger_work);
995
996		if (__ratelimit(&ratelimit))
997			printk(KERN_INFO "Machine check events logged\n");
998
999		return 1;
1000	}
1001	return 0;
1002}
1003EXPORT_SYMBOL_GPL(mce_notify_user);
1004
1005/*
1006 * Initialize Machine Checks for a CPU.
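 * mce_cap_init() reads MCG_CAP to size the per-bank control array; each
 * bank[i] holds the MCi_CTL value programmed for bank i (all bits set by
 * default, i.e. every error type enabled). It also picks up the extended
 * RIP MSR when the extended register set provides one.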
1007 */
1008static int mce_cap_init(void)
1009{
1010	unsigned b;
1011	u64 cap;
1012
1013	rdmsrl(MSR_IA32_MCG_CAP, cap);
1014
1015	b = cap & MCG_BANKCNT_MASK;
1016	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1017
1018	if (b > MAX_NR_BANKS) {
1019		printk(KERN_WARNING
1020		       "MCE: Using only %u machine check banks out of %u\n",
1021			MAX_NR_BANKS, b);
1022		b = MAX_NR_BANKS;
1023	}
1024
1025	/* Don't support asymmetric configurations today */
1026	WARN_ON(banks != 0 && b != banks);
1027	banks = b;
1028	if (!bank) {
1029		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1030		if (!bank)
1031			return -ENOMEM;
1032		memset(bank, 0xff, banks * sizeof(u64));
1033	}
1034
1035	/* Use accurate RIP reporting if available. */
1036	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1037		rip_msr = MSR_IA32_MCG_EIP;
1038
1039	return 0;
1040}
1041
1042static void mce_init(void)
1043{
1044	mce_banks_t all_banks;
1045	u64 cap;
1046	int i;
1047
1048	/*
1049	 * Log the machine checks left over from the previous reset.
1050	 */
1051	bitmap_fill(all_banks, MAX_NR_BANKS);
1052	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1053
1054	set_in_cr4(X86_CR4_MCE);
1055
1056	rdmsrl(MSR_IA32_MCG_CAP, cap);
1057	if (cap & MCG_CTL_P)
1058		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1059
1060	for (i = 0; i < banks; i++) {
1061		if (skip_bank_init(i))
1062			continue;
1063		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1064		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1065	}
1066}
1067
1068/* Add per CPU specific workarounds here */
1069static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1070{
1071	/* This should be disabled by the BIOS, but isn't always */
1072	if (c->x86_vendor == X86_VENDOR_AMD) {
1073		if (c->x86 == 15 && banks > 4) {
1074			/*
1075			 * disable GART TBL walk error reporting, which
1076			 * trips off incorrectly with the IOMMU & 3ware
1077			 * & Cerberus:
1078			 */
1079			clear_bit(10, (unsigned long *)&bank[4]);
1080		}
1081		if (c->x86 <= 17 && mce_bootlog < 0) {
1082			/*
1083			 * Lots of broken BIOSes around that don't clear them
1084			 * by default and leave crap in there. Don't log:
1085			 */
1086			mce_bootlog = 0;
1087		}
1088		/*
1089		 * Various K7s with broken bank 0 around. Always disable
1090		 * by default.
1091		 */
1092		if (c->x86 == 6)
1093			bank[0] = 0;
1094	}
1095
1096	if (c->x86_vendor == X86_VENDOR_INTEL) {
1097		/*
1098		 * SDM documents that on family 6 bank 0 should not be written
1099		 * because it aliases to another special BIOS-controlled
1100		 * register.
1101		 * But it's not aliased anymore on model 0x1a+.
1102		 * Don't ignore bank 0 completely because there could be a
1103		 * valid event later, merely don't write CTL0.
1104		 */
1105
1106		if (c->x86 == 6 && c->x86_model < 0x1A)
1107			__set_bit(0, &dont_init_banks);
1108
1109		/*
1110		 * All newer Intel systems support MCE broadcasting. Enable
1111		 * synchronization with a one second timeout.
1112		 */
1113		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1114			monarch_timeout < 0)
1115			monarch_timeout = USEC_PER_SEC;
1116	}
1117	if (monarch_timeout < 0)
1118		monarch_timeout = 0;
1119}
1120
1121static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1122{
1123	if (c->x86 != 5)
1124		return;
1125	switch (c->x86_vendor) {
1126	case X86_VENDOR_INTEL:
1127		if (mce_p5_enabled())
1128			intel_p5_mcheck_init(c);
1129		break;
1130	case X86_VENDOR_CENTAUR:
1131		winchip_mcheck_init(c);
1132		break;
1133	}
1134}
1135
1136static void mce_cpu_features(struct cpuinfo_x86 *c)
1137{
1138	switch (c->x86_vendor) {
1139	case X86_VENDOR_INTEL:
1140		mce_intel_feature_init(c);
1141		break;
1142	case X86_VENDOR_AMD:
1143		mce_amd_feature_init(c);
1144		break;
1145	default:
1146		break;
1147	}
1148}
1149
1150static void mce_init_timer(void)
1151{
1152	struct timer_list *t = &__get_cpu_var(mce_timer);
1153	int *n = &__get_cpu_var(next_interval);
1154
1155	*n = check_interval * HZ;
1156	if (!*n)
1157		return;
1158	setup_timer(t, mcheck_timer, smp_processor_id());
1159	t->expires = round_jiffies(jiffies + *n);
1160	add_timer(t);
1161}
1162
1163/*
1164 * Called for each booted CPU to set up machine checks.
1165 * Must be called with preempt off:
1166 */
1167void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1168{
1169	if (mce_disabled)
1170		return;
1171
1172	mce_ancient_init(c);
1173
1174	if (!mce_available(c))
1175		return;
1176
1177	if (mce_cap_init() < 0) {
1178		mce_disabled = 1;
1179		return;
1180	}
1181	mce_cpu_quirks(c);
1182
1183	machine_check_vector = do_machine_check;
1184
1185	mce_init();
1186	mce_cpu_features(c);
1187	mce_init_timer();
1188}
1189
1190/*
1191 * Character device to read and clear the MCE log.
1192 */
1193
1194static DEFINE_SPINLOCK(mce_state_lock);
1195static int		open_count;		/* #times opened */
1196static int		open_exclu;		/* already open exclusive? */
1197
1198static int mce_open(struct inode *inode, struct file *file)
1199{
1200	spin_lock(&mce_state_lock);
1201
1202	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1203		spin_unlock(&mce_state_lock);
1204
1205		return -EBUSY;
1206	}
1207
1208	if (file->f_flags & O_EXCL)
1209		open_exclu = 1;
1210	open_count++;
1211
1212	spin_unlock(&mce_state_lock);
1213
1214	return nonseekable_open(inode, file);
1215}
1216
1217static int mce_release(struct inode *inode, struct file *file)
1218{
1219	spin_lock(&mce_state_lock);
1220
1221	open_count--;
1222	open_exclu = 0;
1223
1224	spin_unlock(&mce_state_lock);
1225
1226	return 0;
1227}
1228
1229static void collect_tscs(void *data)
1230{
1231	unsigned long *cpu_tsc = (unsigned long *)data;
1232
1233	rdtscll(cpu_tsc[smp_processor_id()]);
1234}
1235
1236static DEFINE_MUTEX(mce_read_mutex);
1237
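/*
 * Read side of /dev/mcelog: copy out all finished records up to
 * mcelog.next, zero them and reset the index with cmpxchg. After
 * synchronize_sched(), records that were still being written concurrently
 * (their TSC predates the per-CPU TSC snapshot) are collected as well.
 */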
1238static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1239			loff_t *off)
1240{
1241	char __user *buf = ubuf;
1242	unsigned long *cpu_tsc;
1243	unsigned prev, next;
1244	int i, err;
1245
1246	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1247	if (!cpu_tsc)
1248		return -ENOMEM;
1249
1250	mutex_lock(&mce_read_mutex);
1251	next = rcu_dereference(mcelog.next);
1252
1253	/* Only supports full reads right now */
1254	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1255		mutex_unlock(&mce_read_mutex);
1256		kfree(cpu_tsc);
1257
1258		return -EINVAL;
1259	}
1260
1261	err = 0;
1262	prev = 0;
1263	do {
1264		for (i = prev; i < next; i++) {
1265			unsigned long start = jiffies;
1266
1267			while (!mcelog.entry[i].finished) {
1268				if (time_after_eq(jiffies, start + 2)) {
1269					memset(mcelog.entry + i, 0,
1270					       sizeof(struct mce));
1271					goto timeout;
1272				}
1273				cpu_relax();
1274			}
1275			smp_rmb();
1276			err |= copy_to_user(buf, mcelog.entry + i,
1277					    sizeof(struct mce));
1278			buf += sizeof(struct mce);
1279timeout:
1280			;
1281		}
1282
1283		memset(mcelog.entry + prev, 0,
1284		       (next - prev) * sizeof(struct mce));
1285		prev = next;
1286		next = cmpxchg(&mcelog.next, prev, 0);
1287	} while (next != prev);
1288
1289	synchronize_sched();
1290
1291	/*
1292	 * Collect entries that were still getting written before the
1293	 * synchronize.
1294	 */
1295	on_each_cpu(collect_tscs, cpu_tsc, 1);
1296
1297	for (i = next; i < MCE_LOG_LEN; i++) {
1298		if (mcelog.entry[i].finished &&
1299		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1300			err |= copy_to_user(buf, mcelog.entry+i,
1301					    sizeof(struct mce));
1302			smp_rmb();
1303			buf += sizeof(struct mce);
1304			memset(&mcelog.entry[i], 0, sizeof(struct mce));
1305		}
1306	}
1307	mutex_unlock(&mce_read_mutex);
1308	kfree(cpu_tsc);
1309
1310	return err ? -EFAULT : buf - ubuf;
1311}
1312
1313static unsigned int mce_poll(struct file *file, poll_table *wait)
1314{
1315	poll_wait(file, &mce_wait, wait);
1316	if (rcu_dereference(mcelog.next))
1317		return POLLIN | POLLRDNORM;
1318	return 0;
1319}
1320
1321static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1322{
1323	int __user *p = (int __user *)arg;
1324
1325	if (!capable(CAP_SYS_ADMIN))
1326		return -EPERM;
1327
1328	switch (cmd) {
1329	case MCE_GET_RECORD_LEN:
1330		return put_user(sizeof(struct mce), p);
1331	case MCE_GET_LOG_LEN:
1332		return put_user(MCE_LOG_LEN, p);
1333	case MCE_GETCLEAR_FLAGS: {
1334		unsigned flags;
1335
1336		do {
1337			flags = mcelog.flags;
1338		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1339
1340		return put_user(flags, p);
1341	}
1342	default:
1343		return -ENOTTY;
1344	}
1345}
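/*
 * Illustrative userspace usage (a sketch, not part of this file): a logging
 * daemon would typically do something like
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	struct mce records[MCE_LOG_LEN];
 *	ssize_t n = read(fd, records, sizeof(records));
 *
 * mce_read() only accepts reads of at least MCE_LOG_LEN records and returns
 * a multiple of sizeof(struct mce); MCE_GETCLEAR_FLAGS reports (and clears)
 * the overflow flag.
 */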
1346
1347/* Modified in mce-inject.c, so not static or const */
1348struct file_operations mce_chrdev_ops = {
1349	.open			= mce_open,
1350	.release		= mce_release,
1351	.read			= mce_read,
1352	.poll			= mce_poll,
1353	.unlocked_ioctl		= mce_ioctl,
1354};
1355EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1356
1357static struct miscdevice mce_log_device = {
1358	MISC_MCELOG_MINOR,
1359	"mcelog",
1360	&mce_chrdev_ops,
1361};
1362
1363/*
1364 * mce=off disables machine check
1365 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1366 *	monarchtimeout is how long to wait for other CPUs on machine
1367 *	check, or 0 to not wait
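 *	(monarchtimeout is in microseconds; e.g. mce=2,100000 would set
 *	tolerant=2 and a 100 ms monarch timeout)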
1368 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1369 * mce=nobootlog Don't log MCEs from before booting.
1370 */
1371static int __init mcheck_enable(char *str)
1372{
1373	if (*str == 0)
1374		enable_p5_mce();
1375	if (*str == '=')
1376		str++;
1377	if (!strcmp(str, "off"))
1378		mce_disabled = 1;
1379	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1380		mce_bootlog = (str[0] == 'b');
1381	else if (isdigit(str[0])) {
1382		get_option(&str, &tolerant);
1383		if (*str == ',') {
1384			++str;
1385			get_option(&str, &monarch_timeout);
1386		}
1387	} else {
1388		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1389		       str);
1390		return 0;
1391	}
1392	return 1;
1393}
1394__setup("mce", mcheck_enable);
1395
1396/*
1397 * Sysfs support
1398 */
1399
1400/*
1401 * Disable machine checks on suspend and shutdown. We can't really handle
1402 * them later.
1403 */
1404static int mce_disable(void)
1405{
1406	int i;
1407
1408	for (i = 0; i < banks; i++) {
1409		if (!skip_bank_init(i))
1410			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1411	}
1412	return 0;
1413}
1414
1415static int mce_suspend(struct sys_device *dev, pm_message_t state)
1416{
1417	return mce_disable();
1418}
1419
1420static int mce_shutdown(struct sys_device *dev)
1421{
1422	return mce_disable();
1423}
1424
1425/*
1426 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1427 * Only one CPU is active at this time, the others get re-added later using
1428 * CPU hotplug:
1429 */
1430static int mce_resume(struct sys_device *dev)
1431{
1432	mce_init();
1433	mce_cpu_features(&current_cpu_data);
1434
1435	return 0;
1436}
1437
1438static void mce_cpu_restart(void *data)
1439{
1440	del_timer_sync(&__get_cpu_var(mce_timer));
1441	if (mce_available(&current_cpu_data))
1442		mce_init();
1443	mce_init_timer();
1444}
1445
1446/* Reinit MCEs after user configuration changes */
1447static void mce_restart(void)
1448{
1449	on_each_cpu(mce_cpu_restart, NULL, 1);
1450}
1451
1452static struct sysdev_class mce_sysclass = {
1453	.suspend	= mce_suspend,
1454	.shutdown	= mce_shutdown,
1455	.resume		= mce_resume,
1456	.name		= "machinecheck",
1457};
1458
1459DEFINE_PER_CPU(struct sys_device, mce_dev);
1460
1461__cpuinitdata
1462void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1463
1464static struct sysdev_attribute *bank_attrs;
1465
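/*
 * Per-bank sysfs control files ("bank0" .. "bank<N>"): reading shows the
 * MCi_CTL mask for that bank, writing stores a new mask and reinitializes
 * machine checks on all CPUs via mce_restart().
 */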
1466static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1467			 char *buf)
1468{
1469	u64 b = bank[attr - bank_attrs];
1470
1471	return sprintf(buf, "%llx\n", b);
1472}
1473
1474static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1475			const char *buf, size_t size)
1476{
1477	u64 new;
1478
1479	if (strict_strtoull(buf, 0, &new) < 0)
1480		return -EINVAL;
1481
1482	bank[attr - bank_attrs] = new;
1483	mce_restart();
1484
1485	return size;
1486}
1487
1488static ssize_t
1489show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1490{
1491	strcpy(buf, trigger);
1492	strcat(buf, "\n");
1493	return strlen(trigger) + 1;
1494}
1495
1496static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1497				const char *buf, size_t siz)
1498{
1499	char *p;
1500	int len;
1501
1502	strncpy(trigger, buf, sizeof(trigger));
1503	trigger[sizeof(trigger)-1] = 0;
1504	len = strlen(trigger);
1505	p = strchr(trigger, '\n');
1506
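	/* strchr() returns NULL when the input contains no newline to strip */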
1507	if (p)
1508		*p = 0;
1509
1510	return len;
1511}
1512
1513static ssize_t store_int_with_restart(struct sys_device *s,
1514				      struct sysdev_attribute *attr,
1515				      const char *buf, size_t size)
1516{
1517	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1518	mce_restart();
1519	return ret;
1520}
1521
1522static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1523static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1524static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1525
1526static struct sysdev_ext_attribute attr_check_interval = {
1527	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1528		     store_int_with_restart),
1529	&check_interval
1530};
1531
1532static struct sysdev_attribute *mce_attrs[] = {
1533	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1534	&attr_monarch_timeout.attr,
1535	NULL
1536};
1537
1538static cpumask_var_t mce_dev_initialized;
1539
1540/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1541static __cpuinit int mce_create_device(unsigned int cpu)
1542{
1543	int err;
1544	int i;
1545
1546	if (!mce_available(&boot_cpu_data))
1547		return -EIO;
1548
1549	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1550	per_cpu(mce_dev, cpu).id	= cpu;
1551	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1552
1553	err = sysdev_register(&per_cpu(mce_dev, cpu));
1554	if (err)
1555		return err;
1556
1557	for (i = 0; mce_attrs[i]; i++) {
1558		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1559		if (err)
1560			goto error;
1561	}
1562	for (i = 0; i < banks; i++) {
1563		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1564					&bank_attrs[i]);
1565		if (err)
1566			goto error2;
1567	}
1568	cpumask_set_cpu(cpu, mce_dev_initialized);
1569
1570	return 0;
1571error2:
1572	while (--i >= 0)
1573		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1574error:
1575	while (--i >= 0)
1576		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1577
1578	sysdev_unregister(&per_cpu(mce_dev, cpu));
1579
1580	return err;
1581}
1582
1583static __cpuinit void mce_remove_device(unsigned int cpu)
1584{
1585	int i;
1586
1587	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1588		return;
1589
1590	for (i = 0; mce_attrs[i]; i++)
1591		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1592
1593	for (i = 0; i < banks; i++)
1594		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1595
1596	sysdev_unregister(&per_cpu(mce_dev, cpu));
1597	cpumask_clear_cpu(cpu, mce_dev_initialized);
1598}
1599
1600/* Make sure there are no machine checks on offlined CPUs. */
1601static void mce_disable_cpu(void *h)
1602{
1603	unsigned long action = *(unsigned long *)h;
1604	int i;
1605
1606	if (!mce_available(&current_cpu_data))
1607		return;
1608	if (!(action & CPU_TASKS_FROZEN))
1609		cmci_clear();
1610	for (i = 0; i < banks; i++) {
1611		if (!skip_bank_init(i))
1612			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1613	}
1614}
1615
1616static void mce_reenable_cpu(void *h)
1617{
1618	unsigned long action = *(unsigned long *)h;
1619	int i;
1620
1621	if (!mce_available(&current_cpu_data))
1622		return;
1623
1624	if (!(action & CPU_TASKS_FROZEN))
1625		cmci_reenable();
1626	for (i = 0; i < banks; i++) {
1627		if (!skip_bank_init(i))
1628			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1629	}
1630}
1631
1632/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1633static int __cpuinit
1634mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1635{
1636	unsigned int cpu = (unsigned long)hcpu;
1637	struct timer_list *t = &per_cpu(mce_timer, cpu);
1638
1639	switch (action) {
1640	case CPU_ONLINE:
1641	case CPU_ONLINE_FROZEN:
1642		mce_create_device(cpu);
1643		if (threshold_cpu_callback)
1644			threshold_cpu_callback(action, cpu);
1645		break;
1646	case CPU_DEAD:
1647	case CPU_DEAD_FROZEN:
1648		if (threshold_cpu_callback)
1649			threshold_cpu_callback(action, cpu);
1650		mce_remove_device(cpu);
1651		break;
1652	case CPU_DOWN_PREPARE:
1653	case CPU_DOWN_PREPARE_FROZEN:
1654		del_timer_sync(t);
1655		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1656		break;
1657	case CPU_DOWN_FAILED:
1658	case CPU_DOWN_FAILED_FROZEN:
1659		t->expires = round_jiffies(jiffies +
1660						__get_cpu_var(next_interval));
1661		add_timer_on(t, cpu);
1662		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1663		break;
1664	case CPU_POST_DEAD:
1665		/* intentionally ignoring frozen here */
1666		cmci_rediscover(cpu);
1667		break;
1668	}
1669	return NOTIFY_OK;
1670}
1671
1672static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1673	.notifier_call = mce_cpu_callback,
1674};
1675
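/* Allocate one sysdev attribute ("bank%d") per MCA bank for the sysfs interface */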
1676static __init int mce_init_banks(void)
1677{
1678	int i;
1679
1680	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1681				GFP_KERNEL);
1682	if (!bank_attrs)
1683		return -ENOMEM;
1684
1685	for (i = 0; i < banks; i++) {
1686		struct sysdev_attribute *a = &bank_attrs[i];
1687
1688		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1689		if (!a->attr.name)
1690			goto nomem;
1691
1692		a->attr.mode	= 0644;
1693		a->show		= show_bank;
1694		a->store	= set_bank;
1695	}
1696	return 0;
1697
1698nomem:
1699	while (--i >= 0)
1700		kfree(bank_attrs[i].attr.name);
1701	kfree(bank_attrs);
1702	bank_attrs = NULL;
1703
1704	return -ENOMEM;
1705}
1706
1707static __init int mce_init_device(void)
1708{
1709	int err;
1710	int i = 0;
1711
1712	if (!mce_available(&boot_cpu_data))
1713		return -EIO;
1714
1715	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1716
1717	err = mce_init_banks();
1718	if (err)
1719		return err;
1720
1721	err = sysdev_class_register(&mce_sysclass);
1722	if (err)
1723		return err;
1724
1725	for_each_online_cpu(i) {
1726		err = mce_create_device(i);
1727		if (err)
1728			return err;
1729	}
1730
1731	register_hotcpu_notifier(&mce_cpu_notifier);
1732	misc_register(&mce_log_device);
1733
1734	return err;
1735}
1736
1737device_initcall(mce_init_device);
1738
1739#else /* CONFIG_X86_OLD_MCE: */
1740
1741int nr_mce_banks;
1742EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1743
1744/* This has to be run for each processor */
1745void mcheck_init(struct cpuinfo_x86 *c)
1746{
1747	if (mce_disabled == 1)
1748		return;
1749
1750	switch (c->x86_vendor) {
1751	case X86_VENDOR_AMD:
1752		amd_mcheck_init(c);
1753		break;
1754
1755	case X86_VENDOR_INTEL:
1756		if (c->x86 == 5)
1757			intel_p5_mcheck_init(c);
1758		if (c->x86 == 6)
1759			intel_p6_mcheck_init(c);
1760		if (c->x86 == 15)
1761			intel_p4_mcheck_init(c);
1762		break;
1763
1764	case X86_VENDOR_CENTAUR:
1765		if (c->x86 == 5)
1766			winchip_mcheck_init(c);
1767		break;
1768
1769	default:
1770		break;
1771	}
1772	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1773}
1774
1775static int __init mcheck_enable(char *str)
1776{
1777	mce_disabled = -1;
1778	return 1;
1779}
1780
1781__setup("mce", mcheck_enable);
1782
1783#endif /* CONFIG_X86_OLD_MCE */
1784
1785/*
1786 * Old style boot options parsing. Only for compatibility.
1787 */
1788static int __init mcheck_disable(char *str)
1789{
1790	mce_disabled = 1;
1791	return 1;
1792}
1793__setup("nomce", mcheck_disable);
1794