mce.c revision 1b2797dcc9f0ad89bc382ace26c6baafbc7e33c2
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int				mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int			tolerant = 1;
static int			banks;
static u64			*bank;
static unsigned long		notify_user;
static int			rip_msr;
static int			mce_bootlog = -1;
static int			monarch_timeout = -1;

static char			trigger[128];
static char			*trigger_argv[2] = { trigger, NULL };

static unsigned long		dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;


/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

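/*
 * Banks flagged in dont_init_banks keep whatever MCi_CTL value the BIOS
 * programmed; mce_init() and the suspend/offline paths skip writing their
 * control registers (see the Intel family 6 quirk in mce_cpu_quirks()).
 */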
static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

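/*
 * Fake MCE record used by the mce-inject module. When injectm.finished is
 * set on a CPU, the mce_rdmsrl()/mce_wrmsrl() wrappers below access the
 * fields of this record instead of the real machine check MSRs.
 */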
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

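/*
 * Reserve a slot in the log by advancing mcelog.next with cmpxchg, then copy
 * the record and set its "finished" flag last, so that readers only ever see
 * completely written entries.
 */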
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
			m->cpuvendor, m->cpuid, m->time, m->socketid,
			m->apicid);
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	panic("Panicking machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_add_return(1, &mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if ((m->status & MCI_STATUS_VAL) &&
			!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	panic(msg);
}

/* Support code for software error injection */

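/*
 * Map a machine check MSR number to the offset of the corresponding field in
 * struct mce, so that injected values can be read from / written to the
 * per-CPU injectm record instead of real hardware registers.
 * Returns -1 for MSRs that are not emulated.
 */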
static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);
	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{

	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_user();
	irq_exit();
}
#endif

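/*
 * Tell userspace that new events are available. With interrupts enabled this
 * is done directly; otherwise a self IPI is used so that the notification
 * runs after interrupts have been re-enabled.
 */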
static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_user();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign.  The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU that detects it handle it.
	 * We must also let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int no_way_out, int *order)
{
	int nwo;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout) {
		*order = -1;
		return no_way_out;
	}

	atomic_add(no_way_out, &global_nwo);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * Cache the global no_way_out state.
	 */
	nwo = atomic_read(&global_nwo);

	/*
	 * Monarch starts executing now, the others wait.
	 */
	if (*order == 1) {
		atomic_set(&mce_executing, 1);
		return nwo;
	}

	/*
	 * Now start the scanning loop one by one
	 * in the original callin order.
	 * This way, when there are any shared banks, an event is
	 * only seen by one CPU before being cleared, avoiding duplicates.
	 */
	while (atomic_read(&mce_executing) < *order) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			*order = -1;
			return no_way_out;
		}
		ndelay(SPINUNIT);
	}
	return nwo;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

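/*
 * Clear the status registers of all banks that this CPU handled during the
 * scanning loop.
 */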
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;

	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	order = atomic_add_return(1, &mce_callin);
	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	no_way_out = mce_start(no_way_out, &order);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non-uncorrected errors are handled by machine_check_poll().
		 * Leave them alone, unless this panics.
		 */
		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC)
				kill_it = 1;
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		severity = mce_severity(&m, tolerant, NULL);
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = final->ip && (final->cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check", final, msg);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

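/*
 * Arm the per-CPU polling timer. Setting check_interval to 0 disables
 * periodic polling entirely.
 */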
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int		open_count;		/* #times opened */
static int		open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

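/* Snapshot each CPU's TSC; used by mce_read() to spot in-flight records. */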
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

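/*
 * Read out all finished records in order, clear the consumed entries and
 * reset mcelog.next. After a synchronize_sched() pick up any stragglers
 * whose TSC shows they were logged before the per-CPU TSC snapshot.
 */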
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

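/*
 * The bank0..bankN sysfs files expose the MCi_CTL enable mask for each bank;
 * writing a new mask triggers mce_restart() to reprogram all CPUs.
 */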
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	&attr_monarch_timeout.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					&bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
						__get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

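/*
 * Allocate one sysdev attribute per bank (named bank0..bankN-1); the
 * attributes are shared by all CPU devices created later.
 */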
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
				GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);
