mce.c revision d620c67fb92aa11736112f9a03e31d8e3079c57a
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/ratelimit.h>
14#include <linux/kallsyms.h>
15#include <linux/rcupdate.h>
16#include <linux/kobject.h>
17#include <linux/uaccess.h>
18#include <linux/kdebug.h>
19#include <linux/kernel.h>
20#include <linux/percpu.h>
21#include <linux/string.h>
22#include <linux/sysdev.h>
23#include <linux/ctype.h>
24#include <linux/sched.h>
25#include <linux/sysfs.h>
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/poll.h>
30#include <linux/cpu.h>
31#include <linux/smp.h>
32#include <linux/fs.h>
33
34#include <asm/processor.h>
35#include <asm/idle.h>
36#include <asm/mce.h>
37#include <asm/msr.h>
38
39#include "mce.h"
40
41/* Handle unconfigured int18 (should never happen) */
42static void unexpected_machine_check(struct pt_regs *regs, long error_code)
43{
44	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
45	       smp_processor_id());
46}
47
48/* Call the installed machine check handler for this CPU setup. */
49void (*machine_check_vector)(struct pt_regs *, long error_code) =
50						unexpected_machine_check;
51
52int				mce_disabled;
53
54#ifdef CONFIG_X86_NEW_MCE
55
56#define MISC_MCELOG_MINOR	227
57
58atomic_t mce_entry;
59
60DEFINE_PER_CPU(unsigned, mce_exception_count);
61
62/*
63 * Tolerant levels:
64 *   0: always panic on uncorrected errors, log corrected errors
65 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
66 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
67 *   3: never panic or SIGBUS, log all errors (for testing only)
68 */
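/* Adjustable at run time through the machinecheck sysfs attribute "tolerant" below. */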
69static int			tolerant = 1;
70static int			banks;
71static u64			*bank;
72static unsigned long		notify_user;
73static int			rip_msr;
74static int			mce_bootlog = -1;
75
76static char			trigger[128];
77static char			*trigger_argv[2] = { trigger, NULL };
78
79static unsigned long		dont_init_banks;
80
81static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
82
83/* MCA banks polled by the period polling timer for corrected events */
84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
85	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
86};
87
88static inline int skip_bank_init(int i)
89{
90	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
91}
92
93/* Do initial initialization of a struct mce */
94void mce_setup(struct mce *m)
95{
96	memset(m, 0, sizeof(struct mce));
97	m->cpu = m->extcpu = smp_processor_id();
98	rdtscll(m->tsc);
99}
100
101DEFINE_PER_CPU(struct mce, injectm);
102EXPORT_PER_CPU_SYMBOL_GPL(injectm);
103
104/*
105 * Lockless MCE logging infrastructure.
106 * This avoids deadlocks on printk locks without having to break locks. Also
107 * separate MCEs from kernel messages to avoid bogus bug reports.
108 */
109
110static struct mce_log mcelog = {
111	.signature	= MCE_LOG_SIGNATURE,
112	.len		= MCE_LOG_LEN,
113	.recordlen	= sizeof(struct mce),
114};
115
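/*
 * Reserve a free slot by advancing mcelog.next with cmpxchg, fill it in,
 * then set the finished flag; readers only trust finished entries.
 */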
116void mce_log(struct mce *mce)
117{
118	unsigned next, entry;
119
120	mce->finished = 0;
121	wmb();
122	for (;;) {
123		entry = rcu_dereference(mcelog.next);
124		for (;;) {
125			/*
126			 * When the buffer fills up discard new entries.
127			 * Assume that the earlier errors are the more
128			 * interesting ones:
129			 */
130			if (entry >= MCE_LOG_LEN) {
131				set_bit(MCE_OVERFLOW,
132					(unsigned long *)&mcelog.flags);
133				return;
134			}
135			/* Old left over entry. Skip: */
136			if (mcelog.entry[entry].finished) {
137				entry++;
138				continue;
139			}
140			break;
141		}
142		smp_rmb();
143		next = entry + 1;
144		if (cmpxchg(&mcelog.next, entry, next) == entry)
145			break;
146	}
147	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
148	wmb();
149	mcelog.entry[entry].finished = 1;
150	wmb();
151
152	set_bit(0, &notify_user);
153}
154
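/* Print one MCE record to the console. */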
155static void print_mce(struct mce *m)
156{
157	printk(KERN_EMERG "\n"
158	       KERN_EMERG "HARDWARE ERROR\n"
159	       KERN_EMERG
160	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
161	       m->extcpu, m->mcgstatus, m->bank, m->status);
162	if (m->ip) {
163		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
164		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
165		       m->cs, m->ip);
166		if (m->cs == __KERNEL_CS)
167			print_symbol("{%s}", m->ip);
168		printk("\n");
169	}
170	printk(KERN_EMERG "TSC %llx ", m->tsc);
171	if (m->addr)
172		printk("ADDR %llx ", m->addr);
173	if (m->misc)
174		printk("MISC %llx ", m->misc);
175	printk("\n");
176	printk(KERN_EMERG "This is not a software problem!\n");
177	printk(KERN_EMERG "Run through mcelog --ascii to decode "
178	       "and contact your hardware vendor\n");
179}
180
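/*
 * Print all logged MCEs newer than 'start' plus the 'backup' record that
 * triggered the panic (unless it was already printed), then panic with 'msg'.
 */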
181static void mce_panic(char *msg, struct mce *backup, u64 start)
182{
183	int i;
184
185	bust_spinlocks(1);
186	console_verbose();
187	for (i = 0; i < MCE_LOG_LEN; i++) {
188		u64 tsc = mcelog.entry[i].tsc;
189
190		if ((s64)(tsc - start) < 0)
191			continue;
192		print_mce(&mcelog.entry[i]);
193		if (backup && mcelog.entry[i].tsc == backup->tsc)
194			backup = NULL;
195	}
196	if (backup)
197		print_mce(backup);
198	panic(msg);
199}
200
201/* Support code for software error injection */
202
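/*
 * Map an MSR number to the field of the per CPU injected struct mce that
 * emulates it; returns -1 for MSRs that are not emulated.
 */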
203static int msr_to_offset(u32 msr)
204{
205	unsigned bank = __get_cpu_var(injectm.bank);
206	if (msr == rip_msr)
207		return offsetof(struct mce, ip);
208	if (msr == MSR_IA32_MC0_STATUS + bank*4)
209		return offsetof(struct mce, status);
210	if (msr == MSR_IA32_MC0_ADDR + bank*4)
211		return offsetof(struct mce, addr);
212	if (msr == MSR_IA32_MC0_MISC + bank*4)
213		return offsetof(struct mce, misc);
214	if (msr == MSR_IA32_MCG_STATUS)
215		return offsetof(struct mce, mcgstatus);
216	return -1;
217}
218
219/* MSR access wrappers used for error injection */
220static u64 mce_rdmsrl(u32 msr)
221{
222	u64 v;
223	if (__get_cpu_var(injectm).finished) {
224		int offset = msr_to_offset(msr);
225		if (offset < 0)
226			return 0;
227		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
228	}
229	rdmsrl(msr, v);
230	return v;
231}
232
233static void mce_wrmsrl(u32 msr, u64 v)
234{
235	if (__get_cpu_var(injectm).finished) {
236		int offset = msr_to_offset(msr);
237		if (offset >= 0)
238			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
239		return;
240	}
241	wrmsrl(msr, v);
242}
243
244int mce_available(struct cpuinfo_x86 *c)
245{
246	if (mce_disabled)
247		return 0;
248	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
249}
250
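/*
 * Record the faulting IP/CS. Prefer the exact rIP from the extended MCG
 * MSR when available; otherwise use pt_regs, which is only valid when
 * MCG_STATUS_RIPV is set.
 */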
251static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
252{
253	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
254		m->ip = regs->ip;
255		m->cs = regs->cs;
256	} else {
257		m->ip = 0;
258		m->cs = 0;
259	}
260	if (rip_msr) {
261		/* Assume the RIP in the MSR is exact. Is this true? */
262		m->mcgstatus |= MCG_STATUS_EIPV;
263		m->ip = mce_rdmsrl(rip_msr);
264		m->cs = 0;
265	}
266}
267
268DEFINE_PER_CPU(unsigned, mce_poll_count);
269
270/*
271 * Poll for corrected events or events that happened before reset.
272 * Those are just logged through /dev/mcelog.
273 *
274 * This is executed in standard interrupt context.
275 */
276void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
277{
278	struct mce m;
279	int i;
280
281	__get_cpu_var(mce_poll_count)++;
282
283	mce_setup(&m);
284
285	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
286	for (i = 0; i < banks; i++) {
287		if (!bank[i] || !test_bit(i, *b))
288			continue;
289
290		m.misc = 0;
291		m.addr = 0;
292		m.bank = i;
293		m.tsc = 0;
294
295		barrier();
296		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
297		if (!(m.status & MCI_STATUS_VAL))
298			continue;
299
300		/*
301		 * Uncorrected events are handled by the exception handler
302		 * when it is enabled. But when the exception is disabled log
303		 * everything.
304		 *
305		 * TBD do the same check for MCI_STATUS_EN here?
306		 */
307		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
308			continue;
309
310		if (m.status & MCI_STATUS_MISCV)
311			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
312		if (m.status & MCI_STATUS_ADDRV)
313			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
314
315		if (!(flags & MCP_TIMESTAMP))
316			m.tsc = 0;
317		/*
318		 * Don't get the IP here because it's unlikely to
319		 * have anything to do with the actual error location.
320		 */
321		if (!(flags & MCP_DONTLOG)) {
322			mce_log(&m);
323			add_taint(TAINT_MACHINE_CHECK);
324		}
325
326		/*
327		 * Clear state for this bank.
328		 */
329		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
330	}
331
332	/*
333	 * Don't clear MCG_STATUS here because it's only defined for
334	 * exceptions.
335	 */
336
337	sync_core();
338}
339EXPORT_SYMBOL_GPL(machine_check_poll);
340
341/*
342 * The actual machine check handler. This only handles the real exception
343 * raised through int 18 when hardware reports that something got corrupted.
344 *
345 * This is executed in NMI context not subject to normal locking rules. This
346 * implies that most kernel services cannot be safely used. Don't even
347 * think about putting a printk in there!
348 */
349void do_machine_check(struct pt_regs *regs, long error_code)
350{
351	struct mce m, panicm;
352	int panicm_found = 0;
353	u64 mcestart = 0;
354	int i;
355	/*
356	 * If no_way_out gets set, there is no safe way to recover from this
357	 * MCE.  If tolerant is cranked up, we'll try anyway.
358	 */
359	int no_way_out = 0;
360	/*
361	 * If kill_it gets set, we may be able to recover from this error by
362	 * killing the affected task instead of panicking.
363	 */
364	int kill_it = 0;
365	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
366
367	atomic_inc(&mce_entry);
368
369	__get_cpu_var(mce_exception_count)++;
370
371	if (notify_die(DIE_NMI, "machine check", regs, error_code,
372			   18, SIGKILL) == NOTIFY_STOP)
373		goto out;
374	if (!banks)
375		goto out;
376
377	mce_setup(&m);
378
379	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
380
381	/* if the restart IP is not valid, we're done for */
382	if (!(m.mcgstatus & MCG_STATUS_RIPV))
383		no_way_out = 1;
384
385	rdtscll(mcestart);
386	barrier();
387
388	for (i = 0; i < banks; i++) {
389		__clear_bit(i, toclear);
390		if (!bank[i])
391			continue;
392
393		m.misc = 0;
394		m.addr = 0;
395		m.bank = i;
396
397		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
398		if ((m.status & MCI_STATUS_VAL) == 0)
399			continue;
400
401		/*
402		 * Errors without the UC bit set (i.e. corrected errors) are
403		 * handled by machine_check_poll(). Leave them alone here.
404		 */
405		if ((m.status & MCI_STATUS_UC) == 0)
406			continue;
407
408		/*
409		 * Set taint even when machine check was not enabled.
410		 */
411		add_taint(TAINT_MACHINE_CHECK);
412
413		__set_bit(i, toclear);
414
415		if (m.status & MCI_STATUS_EN) {
416			/* if PCC was set, there's no way out */
417			no_way_out |= !!(m.status & MCI_STATUS_PCC);
418			/*
419			 * If this error was uncorrectable and there was
420			 * an overflow, we're in trouble.  If no overflow,
421			 * we might get away with just killing a task.
422			 */
423			if (m.status & MCI_STATUS_UC) {
424				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
425					no_way_out = 1;
426				kill_it = 1;
427			}
428		} else {
429			/*
430			 * Machine check event was not enabled. Clear, but
431			 * ignore.
432			 */
433			continue;
434		}
435
436		if (m.status & MCI_STATUS_MISCV)
437			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
438		if (m.status & MCI_STATUS_ADDRV)
439			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
440
441		mce_get_rip(&m, regs);
442		mce_log(&m);
443
444		/*
445		 * Did this bank cause the exception?
446		 *
447		 * Assume that the bank with uncorrectable errors did it,
448		 * and that there is only a single one:
449		 */
450		if ((m.status & MCI_STATUS_UC) &&
451					(m.status & MCI_STATUS_EN)) {
452			panicm = m;
453			panicm_found = 1;
454		}
455	}
456
457	/*
458	 * If we didn't find an uncorrectable error, pick
459	 * the last one (shouldn't happen, just being safe).
460	 */
461	if (!panicm_found)
462		panicm = m;
463
464	/*
465	 * If we have decided that we just CAN'T continue, and the user
466	 * has not set tolerant to an insane level, give up and die.
467	 */
468	if (no_way_out && tolerant < 3)
469		mce_panic("Machine check", &panicm, mcestart);
470
471	/*
472	 * If the error seems to be unrecoverable, something should be
473	 * done.  Try to kill as little as possible.  If we can kill just
474	 * one task, do that.  If the user has set the tolerance very
475	 * high, don't try to do anything at all.
476	 */
477	if (kill_it && tolerant < 3) {
478		int user_space = 0;
479
480		/*
481		 * If the EIPV bit is set, it means the saved IP is the
482		 * instruction which caused the MCE.
483		 */
484		if (m.mcgstatus & MCG_STATUS_EIPV)
485			user_space = panicm.ip && (panicm.cs & 3);
486
487		/*
488		 * If we know that the error was in user space, send a
489		 * SIGBUS.  Otherwise, panic if tolerance is low.
490		 *
491		 * force_sig() takes an awful lot of locks and has a slight
492		 * risk of deadlocking.
493		 */
494		if (user_space) {
495			force_sig(SIGBUS, current);
496		} else if (panic_on_oops || tolerant < 2) {
497			mce_panic("Uncorrected machine check",
498				&panicm, mcestart);
499		}
500	}
501
502	/* notify userspace ASAP */
503	set_thread_flag(TIF_MCE_NOTIFY);
504
505	/* the last thing we do is clear state */
506	for (i = 0; i < banks; i++) {
507		if (test_bit(i, toclear))
508			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
509	}
510	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
511out:
512	atomic_dec(&mce_entry);
513	sync_core();
514}
515EXPORT_SYMBOL_GPL(do_machine_check);
516
517#ifdef CONFIG_X86_MCE_INTEL
518/**
519 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
521 * @status: Event status information
522 *
523 * This function should be called by the thermal interrupt after the
524 * event has been processed and the decision was made to log the event
525 * further.
526 *
527 * The status parameter will be saved to the 'status' field of 'struct mce'
528 * and historically has been the register value of the
529 * MSR_IA32_THERMAL_STATUS (Intel) msr.
530 */
531void mce_log_therm_throt_event(__u64 status)
532{
533	struct mce m;
534
535	mce_setup(&m);
536	m.bank = MCE_THERMAL_BANK;
537	m.status = status;
538	mce_log(&m);
539}
540#endif /* CONFIG_X86_MCE_INTEL */
541
542/*
543 * Periodic polling timer for "silent" machine check errors.  If the
544 * poller finds an MCE, poll 2x faster.  When the poller finds no more
545 * errors, poll 2x slower (up to check_interval seconds).
546 */
547static int check_interval = 5 * 60; /* 5 minutes */
548
549static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
550static DEFINE_PER_CPU(struct timer_list, mce_timer);
551
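/* Per CPU timer callback: poll the banks and rearm with the adapted interval. */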
552static void mcheck_timer(unsigned long data)
553{
554	struct timer_list *t = &per_cpu(mce_timer, data);
555	int *n;
556
557	WARN_ON(smp_processor_id() != data);
558
559	if (mce_available(&current_cpu_data)) {
560		machine_check_poll(MCP_TIMESTAMP,
561				&__get_cpu_var(mce_poll_banks));
562	}
563
564	/*
565	 * Alert userspace if needed.  If we logged an MCE, reduce the
566	 * polling interval, otherwise increase the polling interval.
567	 */
568	n = &__get_cpu_var(next_interval);
569	if (mce_notify_user())
570		*n = max(*n/2, HZ/100);
571	else
572		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
573
574	t->expires = jiffies + *n;
575	add_timer(t);
576}
577
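/* Work handler: run the user space trigger program without waiting for it. */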
578static void mce_do_trigger(struct work_struct *work)
579{
580	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
581}
582
583static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
584
585/*
586 * Notify the user(s) about new machine check events.
587 * Can be called from interrupt context, but not from machine check/NMI
588 * context.
589 */
590int mce_notify_user(void)
591{
592	/* Not more than two messages every minute */
593	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
594
595	clear_thread_flag(TIF_MCE_NOTIFY);
596
597	if (test_and_clear_bit(0, &notify_user)) {
598		wake_up_interruptible(&mce_wait);
599
600		/*
601		 * There is no risk of missing notifications because
602		 * work_pending is always cleared before the function is
603		 * executed.
604		 */
605		if (trigger[0] && !work_pending(&mce_trigger_work))
606			schedule_work(&mce_trigger_work);
607
608		if (__ratelimit(&ratelimit))
609			printk(KERN_INFO "Machine check events logged\n");
610
611		return 1;
612	}
613	return 0;
614}
615EXPORT_SYMBOL_GPL(mce_notify_user);
616
617/*
618 * Initialize Machine Checks for a CPU.
619 */
620static int mce_cap_init(void)
621{
622	unsigned b;
623	u64 cap;
624
625	rdmsrl(MSR_IA32_MCG_CAP, cap);
626
627	b = cap & MCG_BANKCNT_MASK;
628	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
629
630	if (b > MAX_NR_BANKS) {
631		printk(KERN_WARNING
632		       "MCE: Using only %u machine check banks out of %u\n",
633			MAX_NR_BANKS, b);
634		b = MAX_NR_BANKS;
635	}
636
637	/* Don't support asymmetric configurations today */
638	WARN_ON(banks != 0 && b != banks);
639	banks = b;
640	if (!bank) {
641		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
642		if (!bank)
643			return -ENOMEM;
644		memset(bank, 0xff, banks * sizeof(u64));
645	}
646
647	/* Use accurate RIP reporting if available. */
648	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
649		rip_msr = MSR_IA32_MCG_EIP;
650
651	return 0;
652}
653
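/*
 * Enable machine checks on this CPU: log leftover errors, set CR4.MCE and
 * program the per bank control and status registers.
 */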
654static void mce_init(void)
655{
656	mce_banks_t all_banks;
657	u64 cap;
658	int i;
659
660	/*
661	 * Log the machine checks left over from the previous reset.
662	 */
663	bitmap_fill(all_banks, MAX_NR_BANKS);
664	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
665
666	set_in_cr4(X86_CR4_MCE);
667
668	rdmsrl(MSR_IA32_MCG_CAP, cap);
669	if (cap & MCG_CTL_P)
670		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
671
672	for (i = 0; i < banks; i++) {
673		if (skip_bank_init(i))
674			continue;
675		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
676		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
677	}
678}
679
680/* Add per CPU specific workarounds here */
681static void mce_cpu_quirks(struct cpuinfo_x86 *c)
682{
683	/* This should be disabled by the BIOS, but isn't always */
684	if (c->x86_vendor == X86_VENDOR_AMD) {
685		if (c->x86 == 15 && banks > 4) {
686			/*
687			 * disable GART TBL walk error reporting, which
688			 * trips off incorrectly with the IOMMU & 3ware
689			 * & Cerberus:
690			 */
691			clear_bit(10, (unsigned long *)&bank[4]);
692		}
693		if (c->x86 <= 17 && mce_bootlog < 0) {
694			/*
695			 * Lots of broken BIOSes around that don't clear the
696			 * MCE banks by default and leave crap in there. Don't log:
697			 */
698			mce_bootlog = 0;
699		}
700		/*
701		 * Various K7s with broken bank 0 are around. Always disable
702		 * it by default.
703		 */
704		if (c->x86 == 6)
705			bank[0] = 0;
706	}
707
708	if (c->x86_vendor == X86_VENDOR_INTEL) {
709		/*
710		 * The SDM documents that on family 6 bank 0 should not be
711		 * written because it aliases to another special BIOS
712		 * controlled register.
713		 * It is no longer aliased on model 0x1a and later.
714		 * Don't ignore bank 0 completely because there could be a
715		 * valid event later; merely don't write CTL0.
716		 */
717
718		if (c->x86 == 6 && c->x86_model < 0x1A)
719			__set_bit(0, &dont_init_banks);
720	}
721}
722
723static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
724{
725	if (c->x86 != 5)
726		return;
727	switch (c->x86_vendor) {
728	case X86_VENDOR_INTEL:
729		if (mce_p5_enabled())
730			intel_p5_mcheck_init(c);
731		break;
732	case X86_VENDOR_CENTAUR:
733		winchip_mcheck_init(c);
734		break;
735	}
736}
737
738static void mce_cpu_features(struct cpuinfo_x86 *c)
739{
740	switch (c->x86_vendor) {
741	case X86_VENDOR_INTEL:
742		mce_intel_feature_init(c);
743		break;
744	case X86_VENDOR_AMD:
745		mce_amd_feature_init(c);
746		break;
747	default:
748		break;
749	}
750}
751
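/* Arm the per CPU polling timer for corrected errors. */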
752static void mce_init_timer(void)
753{
754	struct timer_list *t = &__get_cpu_var(mce_timer);
755	int *n = &__get_cpu_var(next_interval);
756
757	*n = check_interval * HZ;
758	if (!*n)
759		return;
760	setup_timer(t, mcheck_timer, smp_processor_id());
761	t->expires = round_jiffies(jiffies + *n);
762	add_timer(t);
763}
764
765/*
766 * Called for each booted CPU to set up machine checks.
767 * Must be called with preempt off:
768 */
769void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
770{
771	if (mce_disabled)
772		return;
773
774	mce_ancient_init(c);
775
776	if (!mce_available(c))
777		return;
778
779	if (mce_cap_init() < 0) {
780		mce_disabled = 1;
781		return;
782	}
783	mce_cpu_quirks(c);
784
785	machine_check_vector = do_machine_check;
786
787	mce_init();
788	mce_cpu_features(c);
789	mce_init_timer();
790}
791
792/*
793 * Character device to read and clear the MCE log.
794 */
795
796static DEFINE_SPINLOCK(mce_state_lock);
797static int		open_count;		/* #times opened */
798static int		open_exclu;		/* already open exclusive? */
799
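/* Serialize opens; an O_EXCL open excludes all other openers. */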
800static int mce_open(struct inode *inode, struct file *file)
801{
802	spin_lock(&mce_state_lock);
803
804	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
805		spin_unlock(&mce_state_lock);
806
807		return -EBUSY;
808	}
809
810	if (file->f_flags & O_EXCL)
811		open_exclu = 1;
812	open_count++;
813
814	spin_unlock(&mce_state_lock);
815
816	return nonseekable_open(inode, file);
817}
818
819static int mce_release(struct inode *inode, struct file *file)
820{
821	spin_lock(&mce_state_lock);
822
823	open_count--;
824	open_exclu = 0;
825
826	spin_unlock(&mce_state_lock);
827
828	return 0;
829}
830
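/*
 * Record the current TSC on each CPU; mce_read() uses these values as a
 * cutoff to pick up entries that were still being written while it read.
 */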
831static void collect_tscs(void *data)
832{
833	unsigned long *cpu_tsc = (unsigned long *)data;
834
835	rdtscll(cpu_tsc[smp_processor_id()]);
836}
837
838static DEFINE_MUTEX(mce_read_mutex);
839
840static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
841			loff_t *off)
842{
843	char __user *buf = ubuf;
844	unsigned long *cpu_tsc;
845	unsigned prev, next;
846	int i, err;
847
848	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
849	if (!cpu_tsc)
850		return -ENOMEM;
851
852	mutex_lock(&mce_read_mutex);
853	next = rcu_dereference(mcelog.next);
854
855	/* Only supports full reads right now */
856	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
857		mutex_unlock(&mce_read_mutex);
858		kfree(cpu_tsc);
859
860		return -EINVAL;
861	}
862
863	err = 0;
864	prev = 0;
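	/*
	 * Drain the main buffer: copy out entries 0..next-1, waiting briefly
	 * for any that are not yet marked finished, then atomically reset
	 * mcelog.next to 0.
	 */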
865	do {
866		for (i = prev; i < next; i++) {
867			unsigned long start = jiffies;
868
869			while (!mcelog.entry[i].finished) {
870				if (time_after_eq(jiffies, start + 2)) {
871					memset(mcelog.entry + i, 0,
872					       sizeof(struct mce));
873					goto timeout;
874				}
875				cpu_relax();
876			}
877			smp_rmb();
878			err |= copy_to_user(buf, mcelog.entry + i,
879					    sizeof(struct mce));
880			buf += sizeof(struct mce);
881timeout:
882			;
883		}
884
885		memset(mcelog.entry + prev, 0,
886		       (next - prev) * sizeof(struct mce));
887		prev = next;
888		next = cmpxchg(&mcelog.next, prev, 0);
889	} while (next != prev);
890
891	synchronize_sched();
892
893	/*
894	 * Collect entries that were still getting written before the
895	 * synchronize.
896	 */
897	on_each_cpu(collect_tscs, cpu_tsc, 1);
898
899	for (i = next; i < MCE_LOG_LEN; i++) {
900		if (mcelog.entry[i].finished &&
901		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
902			err |= copy_to_user(buf, mcelog.entry+i,
903					    sizeof(struct mce));
904			smp_rmb();
905			buf += sizeof(struct mce);
906			memset(&mcelog.entry[i], 0, sizeof(struct mce));
907		}
908	}
909	mutex_unlock(&mce_read_mutex);
910	kfree(cpu_tsc);
911
912	return err ? -EFAULT : buf - ubuf;
913}
914
915static unsigned int mce_poll(struct file *file, poll_table *wait)
916{
917	poll_wait(file, &mce_wait, wait);
918	if (rcu_dereference(mcelog.next))
919		return POLLIN | POLLRDNORM;
920	return 0;
921}
922
923static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
924{
925	int __user *p = (int __user *)arg;
926
927	if (!capable(CAP_SYS_ADMIN))
928		return -EPERM;
929
930	switch (cmd) {
931	case MCE_GET_RECORD_LEN:
932		return put_user(sizeof(struct mce), p);
933	case MCE_GET_LOG_LEN:
934		return put_user(MCE_LOG_LEN, p);
935	case MCE_GETCLEAR_FLAGS: {
936		unsigned flags;
937
938		do {
939			flags = mcelog.flags;
940		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
941
942		return put_user(flags, p);
943	}
944	default:
945		return -ENOTTY;
946	}
947}
948
949/* Modified in mce-inject.c, so not static or const */
950struct file_operations mce_chrdev_ops = {
951	.open			= mce_open,
952	.release		= mce_release,
953	.read			= mce_read,
954	.poll			= mce_poll,
955	.unlocked_ioctl		= mce_ioctl,
956};
957EXPORT_SYMBOL_GPL(mce_chrdev_ops);
958
959static struct miscdevice mce_log_device = {
960	MISC_MCELOG_MINOR,
961	"mcelog",
962	&mce_chrdev_ops,
963};
964
965/*
966 * mce=off disables machine check
967 * mce=TOLERANCELEVEL (number, see above)
968 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
969 * mce=nobootlog Don't log MCEs from before booting.
970 */
971static int __init mcheck_enable(char *str)
972{
973	if (*str == 0)
974		enable_p5_mce();
975	if (*str == '=')
976		str++;
977	if (!strcmp(str, "off"))
978		mce_disabled = 1;
979	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
980		mce_bootlog = (str[0] == 'b');
981	else if (isdigit(str[0]))
982		get_option(&str, &tolerant);
983	else {
984		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
985		       str);
986		return 0;
987	}
988	return 1;
989}
990__setup("mce", mcheck_enable);
991
992/*
993 * Sysfs support
994 */
995
996/*
997 * Disable machine checks on suspend and shutdown. We can't really handle
998 * them later.
999 */
1000static int mce_disable(void)
1001{
1002	int i;
1003
1004	for (i = 0; i < banks; i++) {
1005		if (!skip_bank_init(i))
1006			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1007	}
1008	return 0;
1009}
1010
1011static int mce_suspend(struct sys_device *dev, pm_message_t state)
1012{
1013	return mce_disable();
1014}
1015
1016static int mce_shutdown(struct sys_device *dev)
1017{
1018	return mce_disable();
1019}
1020
1021/*
1022 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1023 * Only one CPU is active at this time, the others get re-added later using
1024 * CPU hotplug:
1025 */
1026static int mce_resume(struct sys_device *dev)
1027{
1028	mce_init();
1029	mce_cpu_features(&current_cpu_data);
1030
1031	return 0;
1032}
1033
1034static void mce_cpu_restart(void *data)
1035{
1036	del_timer_sync(&__get_cpu_var(mce_timer));
1037	if (mce_available(&current_cpu_data))
1038		mce_init();
1039	mce_init_timer();
1040}
1041
1042/* Reinit MCEs after user configuration changes */
1043static void mce_restart(void)
1044{
1045	on_each_cpu(mce_cpu_restart, NULL, 1);
1046}
1047
1048static struct sysdev_class mce_sysclass = {
1049	.suspend	= mce_suspend,
1050	.shutdown	= mce_shutdown,
1051	.resume		= mce_resume,
1052	.name		= "machinecheck",
1053};
1054
1055DEFINE_PER_CPU(struct sys_device, mce_dev);
1056
1057__cpuinitdata
1058void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1059
1060static struct sysdev_attribute *bank_attrs;
1061
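/* Per bank sysfs attributes: the MCi_CTL value written during init. */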
1062static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1063			 char *buf)
1064{
1065	u64 b = bank[attr - bank_attrs];
1066
1067	return sprintf(buf, "%llx\n", b);
1068}
1069
1070static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1071			const char *buf, size_t size)
1072{
1073	u64 new;
1074
1075	if (strict_strtoull(buf, 0, &new) < 0)
1076		return -EINVAL;
1077
1078	bank[attr - bank_attrs] = new;
1079	mce_restart();
1080
1081	return size;
1082}
1083
1084static ssize_t
1085show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1086{
1087	strcpy(buf, trigger);
1088	strcat(buf, "\n");
1089	return strlen(trigger) + 1;
1090}
1091
1092static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1093				const char *buf, size_t siz)
1094{
1095	char *p;
1096	int len;
1097
1098	strncpy(trigger, buf, sizeof(trigger));
1099	trigger[sizeof(trigger)-1] = 0;
1100	len = strlen(trigger);
1101	p = strchr(trigger, '\n');
1102
1103	if (p)
1104		*p = 0;
1105
1106	return len;
1107}
1108
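/* Store an integer attribute and reinitialize MCE on all CPUs afterwards. */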
1109static ssize_t store_int_with_restart(struct sys_device *s,
1110				      struct sysdev_attribute *attr,
1111				      const char *buf, size_t size)
1112{
1113	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1114	mce_restart();
1115	return ret;
1116}
1117
1118static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1119static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1120
1121static struct sysdev_ext_attribute attr_check_interval = {
1122	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1123		     store_int_with_restart),
1124	&check_interval
1125};
1126
1127static struct sysdev_attribute *mce_attrs[] = {
1128	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1129	NULL
1130};
1131
1132static cpumask_var_t mce_dev_initialized;
1133
1134/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1135static __cpuinit int mce_create_device(unsigned int cpu)
1136{
1137	int err;
1138	int i;
1139
1140	if (!mce_available(&boot_cpu_data))
1141		return -EIO;
1142
1143	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1144	per_cpu(mce_dev, cpu).id	= cpu;
1145	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1146
1147	err = sysdev_register(&per_cpu(mce_dev, cpu));
1148	if (err)
1149		return err;
1150
1151	for (i = 0; mce_attrs[i]; i++) {
1152		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1153		if (err)
1154			goto error;
1155	}
1156	for (i = 0; i < banks; i++) {
1157		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1158					&bank_attrs[i]);
1159		if (err)
1160			goto error2;
1161	}
1162	cpumask_set_cpu(cpu, mce_dev_initialized);
1163
1164	return 0;
1165error2:
1166	while (--i >= 0)
1167		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1168error:
1169	while (--i >= 0)
1170		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1171
1172	sysdev_unregister(&per_cpu(mce_dev, cpu));
1173
1174	return err;
1175}
1176
1177static __cpuinit void mce_remove_device(unsigned int cpu)
1178{
1179	int i;
1180
1181	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1182		return;
1183
1184	for (i = 0; mce_attrs[i]; i++)
1185		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1186
1187	for (i = 0; i < banks; i++)
1188		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1189
1190	sysdev_unregister(&per_cpu(mce_dev, cpu));
1191	cpumask_clear_cpu(cpu, mce_dev_initialized);
1192}
1193
1194/* Make sure there are no machine checks on offlined CPUs. */
1195static void mce_disable_cpu(void *h)
1196{
1197	unsigned long action = *(unsigned long *)h;
1198	int i;
1199
1200	if (!mce_available(&current_cpu_data))
1201		return;
1202	if (!(action & CPU_TASKS_FROZEN))
1203		cmci_clear();
1204	for (i = 0; i < banks; i++) {
1205		if (!skip_bank_init(i))
1206			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1207	}
1208}
1209
1210static void mce_reenable_cpu(void *h)
1211{
1212	unsigned long action = *(unsigned long *)h;
1213	int i;
1214
1215	if (!mce_available(&current_cpu_data))
1216		return;
1217
1218	if (!(action & CPU_TASKS_FROZEN))
1219		cmci_reenable();
1220	for (i = 0; i < banks; i++) {
1221		if (!skip_bank_init(i))
1222			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1223	}
1224}
1225
1226/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1227static int __cpuinit
1228mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1229{
1230	unsigned int cpu = (unsigned long)hcpu;
1231	struct timer_list *t = &per_cpu(mce_timer, cpu);
1232
1233	switch (action) {
1234	case CPU_ONLINE:
1235	case CPU_ONLINE_FROZEN:
1236		mce_create_device(cpu);
1237		if (threshold_cpu_callback)
1238			threshold_cpu_callback(action, cpu);
1239		break;
1240	case CPU_DEAD:
1241	case CPU_DEAD_FROZEN:
1242		if (threshold_cpu_callback)
1243			threshold_cpu_callback(action, cpu);
1244		mce_remove_device(cpu);
1245		break;
1246	case CPU_DOWN_PREPARE:
1247	case CPU_DOWN_PREPARE_FROZEN:
1248		del_timer_sync(t);
1249		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1250		break;
1251	case CPU_DOWN_FAILED:
1252	case CPU_DOWN_FAILED_FROZEN:
1253		t->expires = round_jiffies(jiffies +
1254						__get_cpu_var(next_interval));
1255		add_timer_on(t, cpu);
1256		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1257		break;
1258	case CPU_POST_DEAD:
1259		/* intentionally ignoring frozen here */
1260		cmci_rediscover(cpu);
1261		break;
1262	}
1263	return NOTIFY_OK;
1264}
1265
1266static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1267	.notifier_call = mce_cpu_callback,
1268};
1269
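/* Allocate one sysfs attribute ("bankN") per machine check bank. */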
1270static __init int mce_init_banks(void)
1271{
1272	int i;
1273
1274	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1275				GFP_KERNEL);
1276	if (!bank_attrs)
1277		return -ENOMEM;
1278
1279	for (i = 0; i < banks; i++) {
1280		struct sysdev_attribute *a = &bank_attrs[i];
1281
1282		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1283		if (!a->attr.name)
1284			goto nomem;
1285
1286		a->attr.mode	= 0644;
1287		a->show		= show_bank;
1288		a->store	= set_bank;
1289	}
1290	return 0;
1291
1292nomem:
1293	while (--i >= 0)
1294		kfree(bank_attrs[i].attr.name);
1295	kfree(bank_attrs);
1296	bank_attrs = NULL;
1297
1298	return -ENOMEM;
1299}
1300
1301static __init int mce_init_device(void)
1302{
1303	int err;
1304	int i = 0;
1305
1306	if (!mce_available(&boot_cpu_data))
1307		return -EIO;
1308
1309	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1310
1311	err = mce_init_banks();
1312	if (err)
1313		return err;
1314
1315	err = sysdev_class_register(&mce_sysclass);
1316	if (err)
1317		return err;
1318
1319	for_each_online_cpu(i) {
1320		err = mce_create_device(i);
1321		if (err)
1322			return err;
1323	}
1324
1325	register_hotcpu_notifier(&mce_cpu_notifier);
1326	misc_register(&mce_log_device);
1327
1328	return err;
1329}
1330
1331device_initcall(mce_init_device);
1332
1333#else /* CONFIG_X86_OLD_MCE: */
1334
1335int nr_mce_banks;
1336EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1337
1338/* This has to be run for each processor */
1339void mcheck_init(struct cpuinfo_x86 *c)
1340{
1341	if (mce_disabled == 1)
1342		return;
1343
1344	switch (c->x86_vendor) {
1345	case X86_VENDOR_AMD:
1346		amd_mcheck_init(c);
1347		break;
1348
1349	case X86_VENDOR_INTEL:
1350		if (c->x86 == 5)
1351			intel_p5_mcheck_init(c);
1352		if (c->x86 == 6)
1353			intel_p6_mcheck_init(c);
1354		if (c->x86 == 15)
1355			intel_p4_mcheck_init(c);
1356		break;
1357
1358	case X86_VENDOR_CENTAUR:
1359		if (c->x86 == 5)
1360			winchip_mcheck_init(c);
1361		break;
1362
1363	default:
1364		break;
1365	}
1366	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1367}
1368
1369static int __init mcheck_enable(char *str)
1370{
1371	mce_disabled = -1;
1372	return 1;
1373}
1374
1375__setup("mce", mcheck_enable);
1376
1377#endif /* CONFIG_X86_OLD_MCE */
1378
1379/*
1380 * Old style boot options parsing. Only for compatibility.
1381 */
1382static int __init mcheck_disable(char *str)
1383{
1384	mce_disabled = 1;
1385	return 1;
1386}
1387__setup("nomce", mcheck_disable);
1388