mce.c revision de8a84d85ad8bb46d01d72ebc57030b95075603c
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/ratelimit.h>
14#include <linux/kallsyms.h>
15#include <linux/rcupdate.h>
16#include <linux/kobject.h>
17#include <linux/uaccess.h>
18#include <linux/kdebug.h>
19#include <linux/kernel.h>
20#include <linux/percpu.h>
21#include <linux/string.h>
22#include <linux/sysdev.h>
23#include <linux/ctype.h>
24#include <linux/sched.h>
25#include <linux/sysfs.h>
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/poll.h>
30#include <linux/cpu.h>
31#include <linux/smp.h>
32#include <linux/fs.h>
33
34#include <asm/processor.h>
35#include <asm/idle.h>
36#include <asm/mce.h>
37#include <asm/msr.h>
38
39#include "mce.h"
40
41/* Handle unconfigured int18 (should never happen) */
42static void unexpected_machine_check(struct pt_regs *regs, long error_code)
43{
44	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
45	       smp_processor_id());
46}
47
48/* Call the installed machine check handler for this CPU setup. */
49void (*machine_check_vector)(struct pt_regs *, long error_code) =
50						unexpected_machine_check;
51
52int				mce_disabled;
53
54#ifdef CONFIG_X86_NEW_MCE
55
56#define MISC_MCELOG_MINOR	227
57
58atomic_t mce_entry;
59
60DEFINE_PER_CPU(unsigned, mce_exception_count);
61
62/*
63 * Tolerant levels:
64 *   0: always panic on uncorrected errors, log corrected errors
65 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
66 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
67 *   3: never panic or SIGBUS, log all errors (for testing only)
68 */
69static int			tolerant = 1;
70static int			banks;
71static u64			*bank;
72static unsigned long		notify_user;
73static int			rip_msr;
74static int			mce_bootlog = -1;
75
76static char			trigger[128];
77static char			*trigger_argv[2] = { trigger, NULL };
78
79static unsigned long		dont_init_banks;
80
81static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
82
83/* MCA banks polled by the periodic polling timer for corrected events */
84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
85	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
86};
87
88static inline int skip_bank_init(int i)
89{
90	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
91}
92
93/* Do the initial setup of a struct mce */
94void mce_setup(struct mce *m)
95{
96	memset(m, 0, sizeof(struct mce));
97	m->cpu = m->extcpu = smp_processor_id();
98	rdtscll(m->tsc);
99	/* We hope get_seconds stays lockless */
100	m->time = get_seconds();
101	m->cpuvendor = boot_cpu_data.x86_vendor;
102	m->cpuid = cpuid_eax(1);
103#ifdef CONFIG_SMP
104	m->socketid = cpu_data(m->extcpu).phys_proc_id;
105#endif
106	m->apicid = cpu_data(m->extcpu).initial_apicid;
107	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
108}
109
110DEFINE_PER_CPU(struct mce, injectm);
111EXPORT_PER_CPU_SYMBOL_GPL(injectm);
112
113/*
114 * Lockless MCE logging infrastructure.
115 * This avoids deadlocks on printk locks without having to break locks. Also
116 * separate MCEs from kernel messages to avoid bogus bug reports.
117 */
118
119static struct mce_log mcelog = {
120	.signature	= MCE_LOG_SIGNATURE,
121	.len		= MCE_LOG_LEN,
122	.recordlen	= sizeof(struct mce),
123};
124
125void mce_log(struct mce *mce)
126{
127	unsigned next, entry;
128
129	mce->finished = 0;
130	wmb();
131	for (;;) {
132		entry = rcu_dereference(mcelog.next);
133		for (;;) {
134			/*
135			 * When the buffer fills up, discard new entries.
136			 * Assume that the earlier errors are the more
137			 * interesting ones:
138			 */
139			if (entry >= MCE_LOG_LEN) {
140				set_bit(MCE_OVERFLOW,
141					(unsigned long *)&mcelog.flags);
142				return;
143			}
144			/* Old left over entry. Skip: */
145			if (mcelog.entry[entry].finished) {
146				entry++;
147				continue;
148			}
149			break;
150		}
151		smp_rmb();
152		next = entry + 1;
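		/* Try to claim slot 'entry' by advancing mcelog.next; retry if another CPU beat us to it. */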
153		if (cmpxchg(&mcelog.next, entry, next) == entry)
154			break;
155	}
156	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
157	wmb();
158	mcelog.entry[entry].finished = 1;
159	wmb();
160
161	set_bit(0, &notify_user);
162}
163
164static void print_mce(struct mce *m)
165{
166	printk(KERN_EMERG "\n"
167	       KERN_EMERG "HARDWARE ERROR\n"
168	       KERN_EMERG
169	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
170	       m->extcpu, m->mcgstatus, m->bank, m->status);
171	if (m->ip) {
172		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
173		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
174		       m->cs, m->ip);
175		if (m->cs == __KERNEL_CS)
176			print_symbol("{%s}", m->ip);
177		printk("\n");
178	}
179	printk(KERN_EMERG "TSC %llx ", m->tsc);
180	if (m->addr)
181		printk("ADDR %llx ", m->addr);
182	if (m->misc)
183		printk("MISC %llx ", m->misc);
184	printk("\n");
185	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
186			m->cpuvendor, m->cpuid, m->time, m->socketid,
187			m->apicid);
188	printk(KERN_EMERG "This is not a software problem!\n");
189	printk(KERN_EMERG "Run through mcelog --ascii to decode "
190	       "and contact your hardware vendor\n");
191}
192
193static void mce_panic(char *msg, struct mce *backup, u64 start)
194{
195	int i;
196
197	bust_spinlocks(1);
198	console_verbose();
199	for (i = 0; i < MCE_LOG_LEN; i++) {
200		u64 tsc = mcelog.entry[i].tsc;
201
202		if ((s64)(tsc - start) < 0)
203			continue;
204		print_mce(&mcelog.entry[i]);
205		if (backup && mcelog.entry[i].tsc == backup->tsc)
206			backup = NULL;
207	}
208	if (backup)
209		print_mce(backup);
210	panic(msg);
211}
212
213/* Support code for software error injection */
214
215static int msr_to_offset(u32 msr)
216{
217	unsigned bank = __get_cpu_var(injectm.bank);
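	/* Each bank's CTL/STATUS/ADDR/MISC registers are laid out 4 MSRs apart, hence the bank*4 stride. */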
218	if (msr == rip_msr)
219		return offsetof(struct mce, ip);
220	if (msr == MSR_IA32_MC0_STATUS + bank*4)
221		return offsetof(struct mce, status);
222	if (msr == MSR_IA32_MC0_ADDR + bank*4)
223		return offsetof(struct mce, addr);
224	if (msr == MSR_IA32_MC0_MISC + bank*4)
225		return offsetof(struct mce, misc);
226	if (msr == MSR_IA32_MCG_STATUS)
227		return offsetof(struct mce, mcgstatus);
228	return -1;
229}
230
231/* MSR access wrappers used for error injection */
232static u64 mce_rdmsrl(u32 msr)
233{
234	u64 v;
235	if (__get_cpu_var(injectm).finished) {
236		int offset = msr_to_offset(msr);
237		if (offset < 0)
238			return 0;
239		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
240	}
241	rdmsrl(msr, v);
242	return v;
243}
244
245static void mce_wrmsrl(u32 msr, u64 v)
246{
247	if (__get_cpu_var(injectm).finished) {
248		int offset = msr_to_offset(msr);
249		if (offset >= 0)
250			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
251		return;
252	}
253	wrmsrl(msr, v);
254}
255
256int mce_available(struct cpuinfo_x86 *c)
257{
258	if (mce_disabled)
259		return 0;
260	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
261}
262
263static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
264{
265	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
266		m->ip = regs->ip;
267		m->cs = regs->cs;
268	} else {
269		m->ip = 0;
270		m->cs = 0;
271	}
272	if (rip_msr) {
273		/* Assume the RIP in the MSR is exact. Is this true? */
274		m->mcgstatus |= MCG_STATUS_EIPV;
275		m->ip = mce_rdmsrl(rip_msr);
276		m->cs = 0;
277	}
278}
279
280DEFINE_PER_CPU(unsigned, mce_poll_count);
281
282/*
283 * Poll for corrected events or events that happened before reset.
284 * Those are just logged through /dev/mcelog.
285 *
286 * This is executed in standard interrupt context.
287 */
288void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
289{
290	struct mce m;
291	int i;
292
293	__get_cpu_var(mce_poll_count)++;
294
295	mce_setup(&m);
296
297	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
298	for (i = 0; i < banks; i++) {
299		if (!bank[i] || !test_bit(i, *b))
300			continue;
301
302		m.misc = 0;
303		m.addr = 0;
304		m.bank = i;
305		m.tsc = 0;
306
307		barrier();
308		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
309		if (!(m.status & MCI_STATUS_VAL))
310			continue;
311
312		/*
313		 * Uncorrected events are handled by the exception handler
314		 * when it is enabled; when the exception handler is disabled,
315		 * log everything here.
316		 *
317		 * TBD do the same check for MCI_STATUS_EN here?
318		 */
319		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
320			continue;
321
322		if (m.status & MCI_STATUS_MISCV)
323			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
324		if (m.status & MCI_STATUS_ADDRV)
325			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
326
327		if (!(flags & MCP_TIMESTAMP))
328			m.tsc = 0;
329		/*
330		 * Don't get the IP here because it's unlikely to
331		 * have anything to do with the actual error location.
332		 */
333		if (!(flags & MCP_DONTLOG)) {
334			mce_log(&m);
335			add_taint(TAINT_MACHINE_CHECK);
336		}
337
338		/*
339		 * Clear state for this bank.
340		 */
341		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
342	}
343
344	/*
345	 * Don't clear MCG_STATUS here because it's only defined for
346	 * exceptions.
347	 */
348
349	sync_core();
350}
351EXPORT_SYMBOL_GPL(machine_check_poll);
352
353/*
354 * The actual machine check handler. This only handles real
355 * exceptions when something got corrupted coming in through int 18.
356 *
357 * This is executed in NMI context, not subject to normal locking rules. This
358 * implies that most kernel services cannot be safely used. Don't even
359 * think about putting a printk in there!
360 */
361void do_machine_check(struct pt_regs *regs, long error_code)
362{
363	struct mce m, panicm;
364	int panicm_found = 0;
365	u64 mcestart = 0;
366	int i;
367	/*
368	 * If no_way_out gets set, there is no safe way to recover from this
369	 * MCE.  If tolerant is cranked up, we'll try anyway.
370	 */
371	int no_way_out = 0;
372	/*
373	 * If kill_it gets set, there might be a way to recover from this
374	 * error.
375	 */
376	int kill_it = 0;
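	/* Banks whose status registers need to be cleared on the way out of the handler. */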
377	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
378
379	atomic_inc(&mce_entry);
380
381	__get_cpu_var(mce_exception_count)++;
382
383	if (notify_die(DIE_NMI, "machine check", regs, error_code,
384			   18, SIGKILL) == NOTIFY_STOP)
385		goto out;
386	if (!banks)
387		goto out;
388
389	mce_setup(&m);
390
391	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
392
393	/* if the restart IP is not valid, we're done for */
394	if (!(m.mcgstatus & MCG_STATUS_RIPV))
395		no_way_out = 1;
396
397	rdtscll(mcestart);
398	barrier();
399
400	for (i = 0; i < banks; i++) {
401		__clear_bit(i, toclear);
402		if (!bank[i])
403			continue;
404
405		m.misc = 0;
406		m.addr = 0;
407		m.bank = i;
408
409		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
410		if ((m.status & MCI_STATUS_VAL) == 0)
411			continue;
412
413		/*
414		 * Non-uncorrected (i.e. corrected) errors are handled by
415		 * machine_check_poll(). Leave them alone, unless this panics.
416		 */
417		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
418			continue;
419
420		/*
421		 * Set taint even when machine check was not enabled.
422		 */
423		add_taint(TAINT_MACHINE_CHECK);
424
425		__set_bit(i, toclear);
426
427		if (m.status & MCI_STATUS_EN) {
428			/* if PCC was set, there's no way out */
429			no_way_out |= !!(m.status & MCI_STATUS_PCC);
430			/*
431			 * If this error was uncorrectable and there was
432			 * an overflow, we're in trouble.  If no overflow,
433			 * we might get away with just killing a task.
434			 */
435			if (m.status & MCI_STATUS_UC) {
436				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
437					no_way_out = 1;
438				kill_it = 1;
439			}
440		} else {
441			/*
442			 * Machine check event was not enabled. Clear, but
443			 * ignore.
444			 */
445			continue;
446		}
447
448		if (m.status & MCI_STATUS_MISCV)
449			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
450		if (m.status & MCI_STATUS_ADDRV)
451			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
452
453		mce_get_rip(&m, regs);
454		mce_log(&m);
455
456		/*
457		 * Did this bank cause the exception?
458		 *
459		 * Assume that the bank with uncorrectable errors did it,
460		 * and that there is only a single one:
461		 */
462		if ((m.status & MCI_STATUS_UC) &&
463					(m.status & MCI_STATUS_EN)) {
464			panicm = m;
465			panicm_found = 1;
466		}
467	}
468
469	/*
470	 * If we didn't find an uncorrectable error, pick
471	 * the last one (shouldn't happen, just being safe).
472	 */
473	if (!panicm_found)
474		panicm = m;
475
476	/*
477	 * If we have decided that we just CAN'T continue, and the user
478	 * has not set tolerant to an insane level, give up and die.
479	 */
480	if (no_way_out && tolerant < 3)
481		mce_panic("Machine check", &panicm, mcestart);
482
483	/*
484	 * If the error seems to be unrecoverable, something should be
485	 * done.  Try to kill as little as possible.  If we can kill just
486	 * one task, do that.  If the user has set the tolerance very
487	 * high, don't try to do anything at all.
488	 */
489	if (kill_it && tolerant < 3) {
490		int user_space = 0;
491
492		/*
493		 * If the EIPV bit is set, it means the saved IP is the
494		 * instruction which caused the MCE.
495		 */
496		if (m.mcgstatus & MCG_STATUS_EIPV)
497			user_space = panicm.ip && (panicm.cs & 3);
498
499		/*
500		 * If we know that the error was in user space, send a
501		 * SIGBUS.  Otherwise, panic if tolerance is low.
502		 *
503		 * force_sig() takes an awful lot of locks and has a slight
504		 * risk of deadlocking.
505		 */
506		if (user_space) {
507			force_sig(SIGBUS, current);
508		} else if (panic_on_oops || tolerant < 2) {
509			mce_panic("Uncorrected machine check",
510				&panicm, mcestart);
511		}
512	}
513
514	/* notify userspace ASAP */
515	set_thread_flag(TIF_MCE_NOTIFY);
516
517	/* the last thing we do is clear state */
518	for (i = 0; i < banks; i++) {
519		if (test_bit(i, toclear))
520			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
521	}
522	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
523out:
524	atomic_dec(&mce_entry);
525	sync_core();
526}
527EXPORT_SYMBOL_GPL(do_machine_check);
528
529#ifdef CONFIG_X86_MCE_INTEL
530/**
531 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
533 * @status: Event status information
534 *
535 * This function should be called by the thermal interrupt after the
536 * event has been processed and the decision was made to log the event
537 * further.
538 *
539 * The status parameter will be saved to the 'status' field of 'struct mce'
540 * and historically has been the register value of the
541 * MSR_IA32_THERMAL_STATUS (Intel) msr.
542 */
543void mce_log_therm_throt_event(__u64 status)
544{
545	struct mce m;
546
547	mce_setup(&m);
548	m.bank = MCE_THERMAL_BANK;
549	m.status = status;
550	mce_log(&m);
551}
552#endif /* CONFIG_X86_MCE_INTEL */
553
554/*
555 * Periodic polling timer for "silent" machine check errors.  If the
556 * poller finds an MCE, poll 2x faster.  When the poller finds no more
557 * errors, poll 2x slower (up to check_interval seconds).
558 */
559static int check_interval = 5 * 60; /* 5 minutes */
560
561static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
562static DEFINE_PER_CPU(struct timer_list, mce_timer);
563
564static void mcheck_timer(unsigned long data)
565{
566	struct timer_list *t = &per_cpu(mce_timer, data);
567	int *n;
568
569	WARN_ON(smp_processor_id() != data);
570
571	if (mce_available(&current_cpu_data)) {
572		machine_check_poll(MCP_TIMESTAMP,
573				&__get_cpu_var(mce_poll_banks));
574	}
575
576	/*
577	 * Alert userspace if needed.  If we logged an MCE, reduce the
578	 * polling interval, otherwise increase the polling interval.
579	 */
580	n = &__get_cpu_var(next_interval);
581	if (mce_notify_user())
582		*n = max(*n/2, HZ/100);
583	else
584		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
585
586	t->expires = jiffies + *n;
587	add_timer(t);
588}
589
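/* Run the user-configured trigger program. Executed from a workqueue, never from MCE/NMI context. */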
590static void mce_do_trigger(struct work_struct *work)
591{
592	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
593}
594
595static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
596
597/*
598 * Notify the user(s) about new machine check events.
599 * Can be called from interrupt context, but not from machine check/NMI
600 * context.
601 */
602int mce_notify_user(void)
603{
604	/* Not more than two messages every minute */
605	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
606
607	clear_thread_flag(TIF_MCE_NOTIFY);
608
609	if (test_and_clear_bit(0, &notify_user)) {
610		wake_up_interruptible(&mce_wait);
611
612		/*
613		 * There is no risk of missing notifications because
614		 * work_pending is always cleared before the function is
615		 * executed.
616		 */
617		if (trigger[0] && !work_pending(&mce_trigger_work))
618			schedule_work(&mce_trigger_work);
619
620		if (__ratelimit(&ratelimit))
621			printk(KERN_INFO "Machine check events logged\n");
622
623		return 1;
624	}
625	return 0;
626}
627EXPORT_SYMBOL_GPL(mce_notify_user);
628
629/*
630 * Initialize Machine Checks for a CPU.
631 */
632static int mce_cap_init(void)
633{
634	unsigned b;
635	u64 cap;
636
637	rdmsrl(MSR_IA32_MCG_CAP, cap);
638
639	b = cap & MCG_BANKCNT_MASK;
640	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
641
642	if (b > MAX_NR_BANKS) {
643		printk(KERN_WARNING
644		       "MCE: Using only %u machine check banks out of %u\n",
645			MAX_NR_BANKS, b);
646		b = MAX_NR_BANKS;
647	}
648
649	/* Don't support asymmetric configurations today */
650	WARN_ON(banks != 0 && b != banks);
651	banks = b;
652	if (!bank) {
653		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
654		if (!bank)
655			return -ENOMEM;
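		/* Default every bank's CTL value to all 1s, i.e. enable reporting of every error type. */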
656		memset(bank, 0xff, banks * sizeof(u64));
657	}
658
659	/* Use accurate RIP reporting if available. */
660	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
661		rip_msr = MSR_IA32_MCG_EIP;
662
663	return 0;
664}
665
666static void mce_init(void)
667{
668	mce_banks_t all_banks;
669	u64 cap;
670	int i;
671
672	/*
673	 * Log the machine checks left over from the previous reset.
674	 */
675	bitmap_fill(all_banks, MAX_NR_BANKS);
676	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
677
678	set_in_cr4(X86_CR4_MCE);
679
680	rdmsrl(MSR_IA32_MCG_CAP, cap);
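	/* If the global MCG_CTL register is present, enable all of its features by writing all 1s. */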
681	if (cap & MCG_CTL_P)
682		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
683
684	for (i = 0; i < banks; i++) {
685		if (skip_bank_init(i))
686			continue;
687		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
688		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
689	}
690}
691
692/* Add per CPU specific workarounds here */
693static void mce_cpu_quirks(struct cpuinfo_x86 *c)
694{
695	/* This should be disabled by the BIOS, but isn't always */
696	if (c->x86_vendor == X86_VENDOR_AMD) {
697		if (c->x86 == 15 && banks > 4) {
698			/*
699			 * disable GART TBL walk error reporting, which
700			 * trips off incorrectly with the IOMMU & 3ware
701			 * & Cerberus:
702			 */
703			clear_bit(10, (unsigned long *)&bank[4]);
704		}
705		if (c->x86 <= 17 && mce_bootlog < 0) {
706			/*
707			 * Lots of broken BIOSes around that don't clear them
708			 * by default and leave crap in there. Don't log:
709			 */
710			mce_bootlog = 0;
711		}
712		/*
713		 * Various K7s with broken bank 0 around. Always disable
714		 * by default.
715		 */
716		if (c->x86 == 6)
717			bank[0] = 0;
718	}
719
720	if (c->x86_vendor == X86_VENDOR_INTEL) {
721		/*
722		 * The SDM documents that on family 6, bank 0 should not be
723		 * written because it aliases to another special BIOS-controlled
724		 * register.
725		 * It is no longer aliased on model 0x1a and later.
726		 * Don't ignore bank 0 completely because there could be a
727		 * valid event later; merely don't write CTL0.
728		 */
729
730		if (c->x86 == 6 && c->x86_model < 0x1A)
731			__set_bit(0, &dont_init_banks);
732	}
733}
734
735static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
736{
737	if (c->x86 != 5)
738		return;
739	switch (c->x86_vendor) {
740	case X86_VENDOR_INTEL:
741		if (mce_p5_enabled())
742			intel_p5_mcheck_init(c);
743		break;
744	case X86_VENDOR_CENTAUR:
745		winchip_mcheck_init(c);
746		break;
747	}
748}
749
750static void mce_cpu_features(struct cpuinfo_x86 *c)
751{
752	switch (c->x86_vendor) {
753	case X86_VENDOR_INTEL:
754		mce_intel_feature_init(c);
755		break;
756	case X86_VENDOR_AMD:
757		mce_amd_feature_init(c);
758		break;
759	default:
760		break;
761	}
762}
763
764static void mce_init_timer(void)
765{
766	struct timer_list *t = &__get_cpu_var(mce_timer);
767	int *n = &__get_cpu_var(next_interval);
768
769	*n = check_interval * HZ;
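	/* A check_interval of 0 disables the periodic polling timer altogether. */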
770	if (!*n)
771		return;
772	setup_timer(t, mcheck_timer, smp_processor_id());
773	t->expires = round_jiffies(jiffies + *n);
774	add_timer(t);
775}
776
777/*
778 * Called for each booted CPU to set up machine checks.
779 * Must be called with preempt off:
780 */
781void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
782{
783	if (mce_disabled)
784		return;
785
786	mce_ancient_init(c);
787
788	if (!mce_available(c))
789		return;
790
791	if (mce_cap_init() < 0) {
792		mce_disabled = 1;
793		return;
794	}
795	mce_cpu_quirks(c);
796
797	machine_check_vector = do_machine_check;
798
799	mce_init();
800	mce_cpu_features(c);
801	mce_init_timer();
802}
803
804/*
805 * Character device to read and clear the MCE log.
806 */
807
808static DEFINE_SPINLOCK(mce_state_lock);
809static int		open_count;		/* #times opened */
810static int		open_exclu;		/* already open exclusive? */
811
812static int mce_open(struct inode *inode, struct file *file)
813{
814	spin_lock(&mce_state_lock);
815
816	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
817		spin_unlock(&mce_state_lock);
818
819		return -EBUSY;
820	}
821
822	if (file->f_flags & O_EXCL)
823		open_exclu = 1;
824	open_count++;
825
826	spin_unlock(&mce_state_lock);
827
828	return nonseekable_open(inode, file);
829}
830
831static int mce_release(struct inode *inode, struct file *file)
832{
833	spin_lock(&mce_state_lock);
834
835	open_count--;
836	open_exclu = 0;
837
838	spin_unlock(&mce_state_lock);
839
840	return 0;
841}
842
843static void collect_tscs(void *data)
844{
845	unsigned long *cpu_tsc = (unsigned long *)data;
846
847	rdtscll(cpu_tsc[smp_processor_id()]);
848}
849
850static DEFINE_MUTEX(mce_read_mutex);
851
852static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
853			loff_t *off)
854{
855	char __user *buf = ubuf;
856	unsigned long *cpu_tsc;
857	unsigned prev, next;
858	int i, err;
859
860	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
861	if (!cpu_tsc)
862		return -ENOMEM;
863
864	mutex_lock(&mce_read_mutex);
865	next = rcu_dereference(mcelog.next);
866
867	/* Only supports full reads right now */
868	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
869		mutex_unlock(&mce_read_mutex);
870		kfree(cpu_tsc);
871
872		return -EINVAL;
873	}
874
875	err = 0;
876	prev = 0;
877	do {
878		for (i = prev; i < next; i++) {
879			unsigned long start = jiffies;
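			/* If an entry is still being written, give the writer up to 2 jiffies to finish before discarding it. */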
880
881			while (!mcelog.entry[i].finished) {
882				if (time_after_eq(jiffies, start + 2)) {
883					memset(mcelog.entry + i, 0,
884					       sizeof(struct mce));
885					goto timeout;
886				}
887				cpu_relax();
888			}
889			smp_rmb();
890			err |= copy_to_user(buf, mcelog.entry + i,
891					    sizeof(struct mce));
892			buf += sizeof(struct mce);
893timeout:
894			;
895		}
896
897		memset(mcelog.entry + prev, 0,
898		       (next - prev) * sizeof(struct mce));
899		prev = next;
900		next = cmpxchg(&mcelog.next, prev, 0);
901	} while (next != prev);
902
903	synchronize_sched();
904
905	/*
906	 * Collect entries that were still getting written before the
907	 * synchronize.
908	 */
909	on_each_cpu(collect_tscs, cpu_tsc, 1);
910
911	for (i = next; i < MCE_LOG_LEN; i++) {
912		if (mcelog.entry[i].finished &&
913		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
914			err |= copy_to_user(buf, mcelog.entry+i,
915					    sizeof(struct mce));
916			smp_rmb();
917			buf += sizeof(struct mce);
918			memset(&mcelog.entry[i], 0, sizeof(struct mce));
919		}
920	}
921	mutex_unlock(&mce_read_mutex);
922	kfree(cpu_tsc);
923
924	return err ? -EFAULT : buf - ubuf;
925}
926
927static unsigned int mce_poll(struct file *file, poll_table *wait)
928{
929	poll_wait(file, &mce_wait, wait);
930	if (rcu_dereference(mcelog.next))
931		return POLLIN | POLLRDNORM;
932	return 0;
933}
934
935static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
936{
937	int __user *p = (int __user *)arg;
938
939	if (!capable(CAP_SYS_ADMIN))
940		return -EPERM;
941
942	switch (cmd) {
943	case MCE_GET_RECORD_LEN:
944		return put_user(sizeof(struct mce), p);
945	case MCE_GET_LOG_LEN:
946		return put_user(MCE_LOG_LEN, p);
947	case MCE_GETCLEAR_FLAGS: {
948		unsigned flags;
949
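		/* Atomically read and clear mcelog.flags so that bits set concurrently are not lost. */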
950		do {
951			flags = mcelog.flags;
952		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
953
954		return put_user(flags, p);
955	}
956	default:
957		return -ENOTTY;
958	}
959}
960
961/* Modified in mce-inject.c, so not static or const */
962struct file_operations mce_chrdev_ops = {
963	.open			= mce_open,
964	.release		= mce_release,
965	.read			= mce_read,
966	.poll			= mce_poll,
967	.unlocked_ioctl		= mce_ioctl,
968};
969EXPORT_SYMBOL_GPL(mce_chrdev_ops);
970
971static struct miscdevice mce_log_device = {
972	MISC_MCELOG_MINOR,
973	"mcelog",
974	&mce_chrdev_ops,
975};
976
977/*
978 * mce=off disables machine check
979 * mce=TOLERANCELEVEL (number, see above)
980 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
981 * mce=nobootlog Don't log MCEs from before booting.
982 */
983static int __init mcheck_enable(char *str)
984{
985	if (*str == 0)
986		enable_p5_mce();
987	if (*str == '=')
988		str++;
989	if (!strcmp(str, "off"))
990		mce_disabled = 1;
991	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
992		mce_bootlog = (str[0] == 'b');
993	else if (isdigit(str[0]))
994		get_option(&str, &tolerant);
995	else {
996		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
997		       str);
998		return 0;
999	}
1000	return 1;
1001}
1002__setup("mce", mcheck_enable);
1003
1004/*
1005 * Sysfs support
1006 */
1007
1008/*
1009 * Disable machine checks on suspend and shutdown. We can't really handle
1010 * them later.
1011 */
1012static int mce_disable(void)
1013{
1014	int i;
1015
1016	for (i = 0; i < banks; i++) {
1017		if (!skip_bank_init(i))
1018			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1019	}
1020	return 0;
1021}
1022
1023static int mce_suspend(struct sys_device *dev, pm_message_t state)
1024{
1025	return mce_disable();
1026}
1027
1028static int mce_shutdown(struct sys_device *dev)
1029{
1030	return mce_disable();
1031}
1032
1033/*
1034 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1035 * Only one CPU is active at this time, the others get re-added later using
1036 * CPU hotplug:
1037 */
1038static int mce_resume(struct sys_device *dev)
1039{
1040	mce_init();
1041	mce_cpu_features(&current_cpu_data);
1042
1043	return 0;
1044}
1045
1046static void mce_cpu_restart(void *data)
1047{
1048	del_timer_sync(&__get_cpu_var(mce_timer));
1049	if (mce_available(&current_cpu_data))
1050		mce_init();
1051	mce_init_timer();
1052}
1053
1054/* Reinit MCEs after user configuration changes */
1055static void mce_restart(void)
1056{
1057	on_each_cpu(mce_cpu_restart, NULL, 1);
1058}
1059
1060static struct sysdev_class mce_sysclass = {
1061	.suspend	= mce_suspend,
1062	.shutdown	= mce_shutdown,
1063	.resume		= mce_resume,
1064	.name		= "machinecheck",
1065};
1066
1067DEFINE_PER_CPU(struct sys_device, mce_dev);
1068
1069__cpuinitdata
1070void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1071
1072static struct sysdev_attribute *bank_attrs;
1073
1074static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1075			 char *buf)
1076{
1077	u64 b = bank[attr - bank_attrs];
1078
1079	return sprintf(buf, "%llx\n", b);
1080}
1081
1082static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1083			const char *buf, size_t size)
1084{
1085	u64 new;
1086
1087	if (strict_strtoull(buf, 0, &new) < 0)
1088		return -EINVAL;
1089
1090	bank[attr - bank_attrs] = new;
1091	mce_restart();
1092
1093	return size;
1094}
1095
1096static ssize_t
1097show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1098{
1099	strcpy(buf, trigger);
1100	strcat(buf, "\n");
1101	return strlen(trigger) + 1;
1102}
1103
1104static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1105				const char *buf, size_t siz)
1106{
1107	char *p;
1108	int len;
1109
1110	strncpy(trigger, buf, sizeof(trigger));
1111	trigger[sizeof(trigger)-1] = 0;
1112	len = strlen(trigger);
1113	p = strchr(trigger, '\n');
1114
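	/* strchr() returns NULL when there is no newline in the string. */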
1115	if (p)
1116		*p = 0;
1117
1118	return len;
1119}
1120
1121static ssize_t store_int_with_restart(struct sys_device *s,
1122				      struct sysdev_attribute *attr,
1123				      const char *buf, size_t size)
1124{
1125	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1126	mce_restart();
1127	return ret;
1128}
1129
1130static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1131static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1132
1133static struct sysdev_ext_attribute attr_check_interval = {
1134	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1135		     store_int_with_restart),
1136	&check_interval
1137};
1138
1139static struct sysdev_attribute *mce_attrs[] = {
1140	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1141	NULL
1142};
1143
1144static cpumask_var_t mce_dev_initialized;
1145
1146/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1147static __cpuinit int mce_create_device(unsigned int cpu)
1148{
1149	int err;
1150	int i;
1151
1152	if (!mce_available(&boot_cpu_data))
1153		return -EIO;
1154
1155	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1156	per_cpu(mce_dev, cpu).id	= cpu;
1157	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1158
1159	err = sysdev_register(&per_cpu(mce_dev, cpu));
1160	if (err)
1161		return err;
1162
1163	for (i = 0; mce_attrs[i]; i++) {
1164		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1165		if (err)
1166			goto error;
1167	}
1168	for (i = 0; i < banks; i++) {
1169		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1170					&bank_attrs[i]);
1171		if (err)
1172			goto error2;
1173	}
1174	cpumask_set_cpu(cpu, mce_dev_initialized);
1175
1176	return 0;
1177error2:
1178	while (--i >= 0)
1179		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1180error:
1181	while (--i >= 0)
1182		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1183
1184	sysdev_unregister(&per_cpu(mce_dev, cpu));
1185
1186	return err;
1187}
1188
1189static __cpuinit void mce_remove_device(unsigned int cpu)
1190{
1191	int i;
1192
1193	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1194		return;
1195
1196	for (i = 0; mce_attrs[i]; i++)
1197		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1198
1199	for (i = 0; i < banks; i++)
1200		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1201
1202	sysdev_unregister(&per_cpu(mce_dev, cpu));
1203	cpumask_clear_cpu(cpu, mce_dev_initialized);
1204}
1205
1206/* Make sure there are no machine checks on offlined CPUs. */
1207static void mce_disable_cpu(void *h)
1208{
1209	unsigned long action = *(unsigned long *)h;
1210	int i;
1211
1212	if (!mce_available(&current_cpu_data))
1213		return;
1214	if (!(action & CPU_TASKS_FROZEN))
1215		cmci_clear();
1216	for (i = 0; i < banks; i++) {
1217		if (!skip_bank_init(i))
1218			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1219	}
1220}
1221
1222static void mce_reenable_cpu(void *h)
1223{
1224	unsigned long action = *(unsigned long *)h;
1225	int i;
1226
1227	if (!mce_available(&current_cpu_data))
1228		return;
1229
1230	if (!(action & CPU_TASKS_FROZEN))
1231		cmci_reenable();
1232	for (i = 0; i < banks; i++) {
1233		if (!skip_bank_init(i))
1234			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1235	}
1236}
1237
1238/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1239static int __cpuinit
1240mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1241{
1242	unsigned int cpu = (unsigned long)hcpu;
1243	struct timer_list *t = &per_cpu(mce_timer, cpu);
1244
1245	switch (action) {
1246	case CPU_ONLINE:
1247	case CPU_ONLINE_FROZEN:
1248		mce_create_device(cpu);
1249		if (threshold_cpu_callback)
1250			threshold_cpu_callback(action, cpu);
1251		break;
1252	case CPU_DEAD:
1253	case CPU_DEAD_FROZEN:
1254		if (threshold_cpu_callback)
1255			threshold_cpu_callback(action, cpu);
1256		mce_remove_device(cpu);
1257		break;
1258	case CPU_DOWN_PREPARE:
1259	case CPU_DOWN_PREPARE_FROZEN:
1260		del_timer_sync(t);
1261		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1262		break;
1263	case CPU_DOWN_FAILED:
1264	case CPU_DOWN_FAILED_FROZEN:
1265		t->expires = round_jiffies(jiffies +
1266						per_cpu(next_interval, cpu));
1267		add_timer_on(t, cpu);
1268		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1269		break;
1270	case CPU_POST_DEAD:
1271		/* intentionally ignoring frozen here */
1272		cmci_rediscover(cpu);
1273		break;
1274	}
1275	return NOTIFY_OK;
1276}
1277
1278static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1279	.notifier_call = mce_cpu_callback,
1280};
1281
1282static __init int mce_init_banks(void)
1283{
1284	int i;
1285
1286	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1287				GFP_KERNEL);
1288	if (!bank_attrs)
1289		return -ENOMEM;
1290
1291	for (i = 0; i < banks; i++) {
1292		struct sysdev_attribute *a = &bank_attrs[i];
1293
1294		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1295		if (!a->attr.name)
1296			goto nomem;
1297
1298		a->attr.mode	= 0644;
1299		a->show		= show_bank;
1300		a->store	= set_bank;
1301	}
1302	return 0;
1303
1304nomem:
1305	while (--i >= 0)
1306		kfree(bank_attrs[i].attr.name);
1307	kfree(bank_attrs);
1308	bank_attrs = NULL;
1309
1310	return -ENOMEM;
1311}
1312
1313static __init int mce_init_device(void)
1314{
1315	int err;
1316	int i = 0;
1317
1318	if (!mce_available(&boot_cpu_data))
1319		return -EIO;
1320
1321	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1322
1323	err = mce_init_banks();
1324	if (err)
1325		return err;
1326
1327	err = sysdev_class_register(&mce_sysclass);
1328	if (err)
1329		return err;
1330
1331	for_each_online_cpu(i) {
1332		err = mce_create_device(i);
1333		if (err)
1334			return err;
1335	}
1336
1337	register_hotcpu_notifier(&mce_cpu_notifier);
1338	misc_register(&mce_log_device);
1339
1340	return err;
1341}
1342
1343device_initcall(mce_init_device);
1344
1345#else /* CONFIG_X86_OLD_MCE: */
1346
1347int nr_mce_banks;
1348EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1349
1350/* This has to be run for each processor */
1351void mcheck_init(struct cpuinfo_x86 *c)
1352{
1353	if (mce_disabled == 1)
1354		return;
1355
1356	switch (c->x86_vendor) {
1357	case X86_VENDOR_AMD:
1358		amd_mcheck_init(c);
1359		break;
1360
1361	case X86_VENDOR_INTEL:
1362		if (c->x86 == 5)
1363			intel_p5_mcheck_init(c);
1364		if (c->x86 == 6)
1365			intel_p6_mcheck_init(c);
1366		if (c->x86 == 15)
1367			intel_p4_mcheck_init(c);
1368		break;
1369
1370	case X86_VENDOR_CENTAUR:
1371		if (c->x86 == 5)
1372			winchip_mcheck_init(c);
1373		break;
1374
1375	default:
1376		break;
1377	}
1378	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1379}
1380
1381static int __init mcheck_enable(char *str)
1382{
1383	mce_disabled = -1;
1384	return 1;
1385}
1386
1387__setup("mce", mcheck_enable);
1388
1389#endif /* CONFIG_X86_OLD_MCE */
1390
1391/*
1392 * Old style boot options parsing. Only for compatibility.
1393 */
1394static int __init mcheck_disable(char *str)
1395{
1396	mce_disabled = 1;
1397	return 1;
1398}
1399__setup("nomce", mcheck_disable);
1400