mce.c revision 01ca79f1411eae2a45352709c838b946b1af9fbd
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/ratelimit.h>
14#include <linux/kallsyms.h>
15#include <linux/rcupdate.h>
16#include <linux/kobject.h>
17#include <linux/uaccess.h>
18#include <linux/kdebug.h>
19#include <linux/kernel.h>
20#include <linux/percpu.h>
21#include <linux/string.h>
22#include <linux/sysdev.h>
23#include <linux/ctype.h>
24#include <linux/sched.h>
25#include <linux/sysfs.h>
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/poll.h>
30#include <linux/cpu.h>
31#include <linux/smp.h>
32#include <linux/fs.h>
33
34#include <asm/processor.h>
35#include <asm/idle.h>
36#include <asm/mce.h>
37#include <asm/msr.h>
38
39#include "mce.h"
40
41/* Handle unconfigured int18 (should never happen) */
42static void unexpected_machine_check(struct pt_regs *regs, long error_code)
43{
44	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
45	       smp_processor_id());
46}
47
48/* Call the installed machine check handler for this CPU setup. */
49void (*machine_check_vector)(struct pt_regs *, long error_code) =
50						unexpected_machine_check;
51
52int				mce_disabled;
53
54#ifdef CONFIG_X86_NEW_MCE
55
56#define MISC_MCELOG_MINOR	227
57
58atomic_t mce_entry;
59
60DEFINE_PER_CPU(unsigned, mce_exception_count);
61
62/*
63 * Tolerant levels:
64 *   0: always panic on uncorrected errors, log corrected errors
65 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
66 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
67 *   3: never panic or SIGBUS, log all errors (for testing only)
68 */
69static int			tolerant = 1;
70static int			banks;
71static u64			*bank;
72static unsigned long		notify_user;
73static int			rip_msr;
74static int			mce_bootlog = -1;
75
76static char			trigger[128];
77static char			*trigger_argv[2] = { trigger, NULL };
78
79static unsigned long		dont_init_banks;
80
81static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
82
83/* MCA banks polled by the period polling timer for corrected events */
84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
85	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
86};
87
88static inline int skip_bank_init(int i)
89{
90	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
91}
92
93/* Do initial initialization of a struct mce */
94void mce_setup(struct mce *m)
95{
96	memset(m, 0, sizeof(struct mce));
97	m->cpu = smp_processor_id();
98	rdtscll(m->tsc);
99}
100
101DEFINE_PER_CPU(struct mce, injectm);
102EXPORT_PER_CPU_SYMBOL_GPL(injectm);
103
104/*
105 * Lockless MCE logging infrastructure.
106 * This avoids deadlocks on printk locks without having to break locks. Also
107 * separate MCEs from kernel messages to avoid bogus bug reports.
108 */
109
110static struct mce_log mcelog = {
111	MCE_LOG_SIGNATURE,
112	MCE_LOG_LEN,
113};
114
115void mce_log(struct mce *mce)
116{
117	unsigned next, entry;
118
119	mce->finished = 0;
120	wmb();
121	for (;;) {
122		entry = rcu_dereference(mcelog.next);
123		for (;;) {
124			/*
125			 * When the buffer fills up discard new entries.
126			 * Assume that the earlier errors are the more
127			 * interesting ones:
128			 */
129			if (entry >= MCE_LOG_LEN) {
130				set_bit(MCE_OVERFLOW,
131					(unsigned long *)&mcelog.flags);
132				return;
133			}
134			/* Old left over entry. Skip: */
135			if (mcelog.entry[entry].finished) {
136				entry++;
137				continue;
138			}
139			break;
140		}
141		smp_rmb();
142		next = entry + 1;
143		if (cmpxchg(&mcelog.next, entry, next) == entry)
144			break;
145	}
146	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
147	wmb();
148	mcelog.entry[entry].finished = 1;
149	wmb();
150
151	set_bit(0, &notify_user);
152}
153
154static void print_mce(struct mce *m)
155{
156	printk(KERN_EMERG "\n"
157	       KERN_EMERG "HARDWARE ERROR\n"
158	       KERN_EMERG
159	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
160	       m->cpu, m->mcgstatus, m->bank, m->status);
161	if (m->ip) {
162		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
163		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
164		       m->cs, m->ip);
165		if (m->cs == __KERNEL_CS)
166			print_symbol("{%s}", m->ip);
167		printk("\n");
168	}
169	printk(KERN_EMERG "TSC %llx ", m->tsc);
170	if (m->addr)
171		printk("ADDR %llx ", m->addr);
172	if (m->misc)
173		printk("MISC %llx ", m->misc);
174	printk("\n");
175	printk(KERN_EMERG "This is not a software problem!\n");
176	printk(KERN_EMERG "Run through mcelog --ascii to decode "
177	       "and contact your hardware vendor\n");
178}
179
180static void mce_panic(char *msg, struct mce *backup, u64 start)
181{
182	int i;
183
184	bust_spinlocks(1);
185	console_verbose();
186	for (i = 0; i < MCE_LOG_LEN; i++) {
187		u64 tsc = mcelog.entry[i].tsc;
188
189		if ((s64)(tsc - start) < 0)
190			continue;
191		print_mce(&mcelog.entry[i]);
192		if (backup && mcelog.entry[i].tsc == backup->tsc)
193			backup = NULL;
194	}
195	if (backup)
196		print_mce(backup);
197	panic(msg);
198}
199
200/* Support code for software error injection */
201
202static int msr_to_offset(u32 msr)
203{
204	unsigned bank = __get_cpu_var(injectm.bank);
205	if (msr == rip_msr)
206		return offsetof(struct mce, ip);
207	if (msr == MSR_IA32_MC0_STATUS + bank*4)
208		return offsetof(struct mce, status);
209	if (msr == MSR_IA32_MC0_ADDR + bank*4)
210		return offsetof(struct mce, addr);
211	if (msr == MSR_IA32_MC0_MISC + bank*4)
212		return offsetof(struct mce, misc);
213	if (msr == MSR_IA32_MCG_STATUS)
214		return offsetof(struct mce, mcgstatus);
215	return -1;
216}
217
218/* MSR access wrappers used for error injection */
219static u64 mce_rdmsrl(u32 msr)
220{
221	u64 v;
222	if (__get_cpu_var(injectm).finished) {
223		int offset = msr_to_offset(msr);
224		if (offset < 0)
225			return 0;
226		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
227	}
228	rdmsrl(msr, v);
229	return v;
230}
231
232static void mce_wrmsrl(u32 msr, u64 v)
233{
234	if (__get_cpu_var(injectm).finished) {
235		int offset = msr_to_offset(msr);
236		if (offset >= 0)
237			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
238		return;
239	}
240	wrmsrl(msr, v);
241}
242
243int mce_available(struct cpuinfo_x86 *c)
244{
245	if (mce_disabled)
246		return 0;
247	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
248}
249
250static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
251{
252	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
253		m->ip = regs->ip;
254		m->cs = regs->cs;
255	} else {
256		m->ip = 0;
257		m->cs = 0;
258	}
259	if (rip_msr) {
260		/* Assume the RIP in the MSR is exact. Is this true? */
261		m->mcgstatus |= MCG_STATUS_EIPV;
262		m->ip = mce_rdmsrl(rip_msr);
263		m->cs = 0;
264	}
265}
266
267/*
268 * Poll for corrected events or events that happened before reset.
269 * Those are just logged through /dev/mcelog.
270 *
271 * This is executed in standard interrupt context.
272 */
273void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
274{
275	struct mce m;
276	int i;
277
278	mce_setup(&m);
279
280	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
281	for (i = 0; i < banks; i++) {
282		if (!bank[i] || !test_bit(i, *b))
283			continue;
284
285		m.misc = 0;
286		m.addr = 0;
287		m.bank = i;
288		m.tsc = 0;
289
290		barrier();
291		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
292		if (!(m.status & MCI_STATUS_VAL))
293			continue;
294
295		/*
296		 * Uncorrected events are handled by the exception handler
297		 * when it is enabled. But when the exception is disabled log
298		 * everything.
299		 *
300		 * TBD do the same check for MCI_STATUS_EN here?
301		 */
302		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
303			continue;
304
305		if (m.status & MCI_STATUS_MISCV)
306			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
307		if (m.status & MCI_STATUS_ADDRV)
308			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
309
310		if (!(flags & MCP_TIMESTAMP))
311			m.tsc = 0;
312		/*
313		 * Don't get the IP here because it's unlikely to
314		 * have anything to do with the actual error location.
315		 */
316		if (!(flags & MCP_DONTLOG)) {
317			mce_log(&m);
318			add_taint(TAINT_MACHINE_CHECK);
319		}
320
321		/*
322		 * Clear state for this bank.
323		 */
324		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
325	}
326
327	/*
328	 * Don't clear MCG_STATUS here because it's only defined for
329	 * exceptions.
330	 */
331
332	sync_core();
333}
334EXPORT_SYMBOL_GPL(machine_check_poll);
335
336/*
337 * The actual machine check handler. This only handles real
338 * exceptions when something got corrupted coming in through int 18.
339 *
340 * This is executed in NMI context not subject to normal locking rules. This
341 * implies that most kernel services cannot be safely used. Don't even
342 * think about putting a printk in there!
343 */
344void do_machine_check(struct pt_regs *regs, long error_code)
345{
346	struct mce m, panicm;
347	int panicm_found = 0;
348	u64 mcestart = 0;
349	int i;
350	/*
351	 * If no_way_out gets set, there is no safe way to recover from this
352	 * MCE.  If tolerant is cranked up, we'll try anyway.
353	 */
354	int no_way_out = 0;
355	/*
356	 * If kill_it gets set, there might be a way to recover from this
357	 * error.
358	 */
359	int kill_it = 0;
360	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
361
362	atomic_inc(&mce_entry);
363
364	__get_cpu_var(mce_exception_count)++;
365
366	if (notify_die(DIE_NMI, "machine check", regs, error_code,
367			   18, SIGKILL) == NOTIFY_STOP)
368		goto out;
369	if (!banks)
370		goto out;
371
372	mce_setup(&m);
373
374	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
375
376	/* if the restart IP is not valid, we're done for */
377	if (!(m.mcgstatus & MCG_STATUS_RIPV))
378		no_way_out = 1;
379
380	rdtscll(mcestart);
381	barrier();
382
383	for (i = 0; i < banks; i++) {
384		__clear_bit(i, toclear);
385		if (!bank[i])
386			continue;
387
388		m.misc = 0;
389		m.addr = 0;
390		m.bank = i;
391
392		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
393		if ((m.status & MCI_STATUS_VAL) == 0)
394			continue;
395
396		/*
397		 * Non uncorrected errors are handled by machine_check_poll
398		 * Leave them alone.
399		 */
400		if ((m.status & MCI_STATUS_UC) == 0)
401			continue;
402
403		/*
404		 * Set taint even when machine check was not enabled.
405		 */
406		add_taint(TAINT_MACHINE_CHECK);
407
408		__set_bit(i, toclear);
409
410		if (m.status & MCI_STATUS_EN) {
411			/* if PCC was set, there's no way out */
412			no_way_out |= !!(m.status & MCI_STATUS_PCC);
413			/*
414			 * If this error was uncorrectable and there was
415			 * an overflow, we're in trouble.  If no overflow,
416			 * we might get away with just killing a task.
417			 */
418			if (m.status & MCI_STATUS_UC) {
419				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
420					no_way_out = 1;
421				kill_it = 1;
422			}
423		} else {
424			/*
425			 * Machine check event was not enabled. Clear, but
426			 * ignore.
427			 */
428			continue;
429		}
430
431		if (m.status & MCI_STATUS_MISCV)
432			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
433		if (m.status & MCI_STATUS_ADDRV)
434			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
435
436		mce_get_rip(&m, regs);
437		mce_log(&m);
438
439		/*
440		 * Did this bank cause the exception?
441		 *
442		 * Assume that the bank with uncorrectable errors did it,
443		 * and that there is only a single one:
444		 */
445		if ((m.status & MCI_STATUS_UC) &&
446					(m.status & MCI_STATUS_EN)) {
447			panicm = m;
448			panicm_found = 1;
449		}
450	}
451
452	/*
453	 * If we didn't find an uncorrectable error, pick
454	 * the last one (shouldn't happen, just being safe).
455	 */
456	if (!panicm_found)
457		panicm = m;
458
459	/*
460	 * If we have decided that we just CAN'T continue, and the user
461	 * has not set tolerant to an insane level, give up and die.
462	 */
463	if (no_way_out && tolerant < 3)
464		mce_panic("Machine check", &panicm, mcestart);
465
466	/*
467	 * If the error seems to be unrecoverable, something should be
468	 * done.  Try to kill as little as possible.  If we can kill just
469	 * one task, do that.  If the user has set the tolerance very
470	 * high, don't try to do anything at all.
471	 */
472	if (kill_it && tolerant < 3) {
473		int user_space = 0;
474
475		/*
476		 * If the EIPV bit is set, it means the saved IP is the
477		 * instruction which caused the MCE.
478		 */
479		if (m.mcgstatus & MCG_STATUS_EIPV)
480			user_space = panicm.ip && (panicm.cs & 3);
481
482		/*
483		 * If we know that the error was in user space, send a
484		 * SIGBUS.  Otherwise, panic if tolerance is low.
485		 *
486		 * force_sig() takes an awful lot of locks and has a slight
487		 * risk of deadlocking.
488		 */
489		if (user_space) {
490			force_sig(SIGBUS, current);
491		} else if (panic_on_oops || tolerant < 2) {
492			mce_panic("Uncorrected machine check",
493				&panicm, mcestart);
494		}
495	}
496
497	/* notify userspace ASAP */
498	set_thread_flag(TIF_MCE_NOTIFY);
499
500	/* the last thing we do is clear state */
501	for (i = 0; i < banks; i++) {
502		if (test_bit(i, toclear))
503			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
504	}
505	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
506out:
507	atomic_dec(&mce_entry);
508	sync_core();
509}
510EXPORT_SYMBOL_GPL(do_machine_check);
511
512#ifdef CONFIG_X86_MCE_INTEL
513/***
514 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
515 * @cpu: The CPU on which the event occurred.
516 * @status: Event status information
517 *
518 * This function should be called by the thermal interrupt after the
519 * event has been processed and the decision was made to log the event
520 * further.
521 *
522 * The status parameter will be saved to the 'status' field of 'struct mce'
523 * and historically has been the register value of the
524 * MSR_IA32_THERMAL_STATUS (Intel) msr.
525 */
526void mce_log_therm_throt_event(__u64 status)
527{
528	struct mce m;
529
530	mce_setup(&m);
531	m.bank = MCE_THERMAL_BANK;
532	m.status = status;
533	mce_log(&m);
534}
535#endif /* CONFIG_X86_MCE_INTEL */
536
537/*
538 * Periodic polling timer for "silent" machine check errors.  If the
539 * poller finds an MCE, poll 2x faster.  When the poller finds no more
540 * errors, poll 2x slower (up to check_interval seconds).
541 */
542static int check_interval = 5 * 60; /* 5 minutes */
543
544static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
545static DEFINE_PER_CPU(struct timer_list, mce_timer);
546
547static void mcheck_timer(unsigned long data)
548{
549	struct timer_list *t = &per_cpu(mce_timer, data);
550	int *n;
551
552	WARN_ON(smp_processor_id() != data);
553
554	if (mce_available(&current_cpu_data)) {
555		machine_check_poll(MCP_TIMESTAMP,
556				&__get_cpu_var(mce_poll_banks));
557	}
558
559	/*
560	 * Alert userspace if needed.  If we logged an MCE, reduce the
561	 * polling interval, otherwise increase the polling interval.
562	 */
563	n = &__get_cpu_var(next_interval);
564	if (mce_notify_user())
565		*n = max(*n/2, HZ/100);
566	else
567		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
568
569	t->expires = jiffies + *n;
570	add_timer(t);
571}
572
573static void mce_do_trigger(struct work_struct *work)
574{
575	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
576}
577
578static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
579
580/*
581 * Notify the user(s) about new machine check events.
582 * Can be called from interrupt context, but not from machine check/NMI
583 * context.
584 */
585int mce_notify_user(void)
586{
587	/* Not more than two messages every minute */
588	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
589
590	clear_thread_flag(TIF_MCE_NOTIFY);
591
592	if (test_and_clear_bit(0, &notify_user)) {
593		wake_up_interruptible(&mce_wait);
594
595		/*
596		 * There is no risk of missing notifications because
597		 * work_pending is always cleared before the function is
598		 * executed.
599		 */
600		if (trigger[0] && !work_pending(&mce_trigger_work))
601			schedule_work(&mce_trigger_work);
602
603		if (__ratelimit(&ratelimit))
604			printk(KERN_INFO "Machine check events logged\n");
605
606		return 1;
607	}
608	return 0;
609}
610EXPORT_SYMBOL_GPL(mce_notify_user);
611
612/*
613 * Initialize Machine Checks for a CPU.
614 */
615static int mce_cap_init(void)
616{
617	unsigned b;
618	u64 cap;
619
620	rdmsrl(MSR_IA32_MCG_CAP, cap);
621
622	b = cap & MCG_BANKCNT_MASK;
623	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
624
625	if (b > MAX_NR_BANKS) {
626		printk(KERN_WARNING
627		       "MCE: Using only %u machine check banks out of %u\n",
628			MAX_NR_BANKS, b);
629		b = MAX_NR_BANKS;
630	}
631
632	/* Don't support asymmetric configurations today */
633	WARN_ON(banks != 0 && b != banks);
634	banks = b;
635	if (!bank) {
636		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
637		if (!bank)
638			return -ENOMEM;
639		memset(bank, 0xff, banks * sizeof(u64));
640	}
641
642	/* Use accurate RIP reporting if available. */
643	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
644		rip_msr = MSR_IA32_MCG_EIP;
645
646	return 0;
647}
648
649static void mce_init(void)
650{
651	mce_banks_t all_banks;
652	u64 cap;
653	int i;
654
655	/*
656	 * Log the machine checks left over from the previous reset.
657	 */
658	bitmap_fill(all_banks, MAX_NR_BANKS);
659	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
660
661	set_in_cr4(X86_CR4_MCE);
662
663	rdmsrl(MSR_IA32_MCG_CAP, cap);
664	if (cap & MCG_CTL_P)
665		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
666
667	for (i = 0; i < banks; i++) {
668		if (skip_bank_init(i))
669			continue;
670		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
671		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
672	}
673}
674
675/* Add per CPU specific workarounds here */
676static void mce_cpu_quirks(struct cpuinfo_x86 *c)
677{
678	/* This should be disabled by the BIOS, but isn't always */
679	if (c->x86_vendor == X86_VENDOR_AMD) {
680		if (c->x86 == 15 && banks > 4) {
681			/*
682			 * disable GART TBL walk error reporting, which
683			 * trips off incorrectly with the IOMMU & 3ware
684			 * & Cerberus:
685			 */
686			clear_bit(10, (unsigned long *)&bank[4]);
687		}
688		if (c->x86 <= 17 && mce_bootlog < 0) {
689			/*
690			 * Lots of broken BIOS around that don't clear them
691			 * by default and leave crap in there. Don't log:
692			 */
693			mce_bootlog = 0;
694		}
695		/*
696		 * Various K7s with broken bank 0 around. Always disable
697		 * by default.
698		 */
699		 if (c->x86 == 6)
700			bank[0] = 0;
701	}
702
703	if (c->x86_vendor == X86_VENDOR_INTEL) {
704		/*
705		 * SDM documents that on family 6 bank 0 should not be written
706		 * because it aliases to another special BIOS controlled
707		 * register.
708		 * But it's not aliased anymore on model 0x1a+
709		 * Don't ignore bank 0 completely because there could be a
710		 * valid event later, merely don't write CTL0.
711		 */
712
713		if (c->x86 == 6 && c->x86_model < 0x1A)
714			__set_bit(0, &dont_init_banks);
715	}
716}
717
718static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
719{
720	if (c->x86 != 5)
721		return;
722	switch (c->x86_vendor) {
723	case X86_VENDOR_INTEL:
724		if (mce_p5_enabled())
725			intel_p5_mcheck_init(c);
726		break;
727	case X86_VENDOR_CENTAUR:
728		winchip_mcheck_init(c);
729		break;
730	}
731}
732
733static void mce_cpu_features(struct cpuinfo_x86 *c)
734{
735	switch (c->x86_vendor) {
736	case X86_VENDOR_INTEL:
737		mce_intel_feature_init(c);
738		break;
739	case X86_VENDOR_AMD:
740		mce_amd_feature_init(c);
741		break;
742	default:
743		break;
744	}
745}
746
747static void mce_init_timer(void)
748{
749	struct timer_list *t = &__get_cpu_var(mce_timer);
750	int *n = &__get_cpu_var(next_interval);
751
752	*n = check_interval * HZ;
753	if (!*n)
754		return;
755	setup_timer(t, mcheck_timer, smp_processor_id());
756	t->expires = round_jiffies(jiffies + *n);
757	add_timer(t);
758}
759
760/*
761 * Called for each booted CPU to set up machine checks.
762 * Must be called with preempt off:
763 */
764void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
765{
766	if (mce_disabled)
767		return;
768
769	mce_ancient_init(c);
770
771	if (!mce_available(c))
772		return;
773
774	if (mce_cap_init() < 0) {
775		mce_disabled = 1;
776		return;
777	}
778	mce_cpu_quirks(c);
779
780	machine_check_vector = do_machine_check;
781
782	mce_init();
783	mce_cpu_features(c);
784	mce_init_timer();
785}
786
787/*
788 * Character device to read and clear the MCE log.
789 */
790
791static DEFINE_SPINLOCK(mce_state_lock);
792static int		open_count;		/* #times opened */
793static int		open_exclu;		/* already open exclusive? */
794
795static int mce_open(struct inode *inode, struct file *file)
796{
797	spin_lock(&mce_state_lock);
798
799	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
800		spin_unlock(&mce_state_lock);
801
802		return -EBUSY;
803	}
804
805	if (file->f_flags & O_EXCL)
806		open_exclu = 1;
807	open_count++;
808
809	spin_unlock(&mce_state_lock);
810
811	return nonseekable_open(inode, file);
812}
813
814static int mce_release(struct inode *inode, struct file *file)
815{
816	spin_lock(&mce_state_lock);
817
818	open_count--;
819	open_exclu = 0;
820
821	spin_unlock(&mce_state_lock);
822
823	return 0;
824}
825
826static void collect_tscs(void *data)
827{
828	unsigned long *cpu_tsc = (unsigned long *)data;
829
830	rdtscll(cpu_tsc[smp_processor_id()]);
831}
832
833static DEFINE_MUTEX(mce_read_mutex);
834
835static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
836			loff_t *off)
837{
838	char __user *buf = ubuf;
839	unsigned long *cpu_tsc;
840	unsigned prev, next;
841	int i, err;
842
843	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
844	if (!cpu_tsc)
845		return -ENOMEM;
846
847	mutex_lock(&mce_read_mutex);
848	next = rcu_dereference(mcelog.next);
849
850	/* Only supports full reads right now */
851	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
852		mutex_unlock(&mce_read_mutex);
853		kfree(cpu_tsc);
854
855		return -EINVAL;
856	}
857
858	err = 0;
859	prev = 0;
860	do {
861		for (i = prev; i < next; i++) {
862			unsigned long start = jiffies;
863
864			while (!mcelog.entry[i].finished) {
865				if (time_after_eq(jiffies, start + 2)) {
866					memset(mcelog.entry + i, 0,
867					       sizeof(struct mce));
868					goto timeout;
869				}
870				cpu_relax();
871			}
872			smp_rmb();
873			err |= copy_to_user(buf, mcelog.entry + i,
874					    sizeof(struct mce));
875			buf += sizeof(struct mce);
876timeout:
877			;
878		}
879
880		memset(mcelog.entry + prev, 0,
881		       (next - prev) * sizeof(struct mce));
882		prev = next;
883		next = cmpxchg(&mcelog.next, prev, 0);
884	} while (next != prev);
885
886	synchronize_sched();
887
888	/*
889	 * Collect entries that were still getting written before the
890	 * synchronize.
891	 */
892	on_each_cpu(collect_tscs, cpu_tsc, 1);
893
894	for (i = next; i < MCE_LOG_LEN; i++) {
895		if (mcelog.entry[i].finished &&
896		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
897			err |= copy_to_user(buf, mcelog.entry+i,
898					    sizeof(struct mce));
899			smp_rmb();
900			buf += sizeof(struct mce);
901			memset(&mcelog.entry[i], 0, sizeof(struct mce));
902		}
903	}
904	mutex_unlock(&mce_read_mutex);
905	kfree(cpu_tsc);
906
907	return err ? -EFAULT : buf - ubuf;
908}
909
910static unsigned int mce_poll(struct file *file, poll_table *wait)
911{
912	poll_wait(file, &mce_wait, wait);
913	if (rcu_dereference(mcelog.next))
914		return POLLIN | POLLRDNORM;
915	return 0;
916}
917
918static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
919{
920	int __user *p = (int __user *)arg;
921
922	if (!capable(CAP_SYS_ADMIN))
923		return -EPERM;
924
925	switch (cmd) {
926	case MCE_GET_RECORD_LEN:
927		return put_user(sizeof(struct mce), p);
928	case MCE_GET_LOG_LEN:
929		return put_user(MCE_LOG_LEN, p);
930	case MCE_GETCLEAR_FLAGS: {
931		unsigned flags;
932
933		do {
934			flags = mcelog.flags;
935		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
936
937		return put_user(flags, p);
938	}
939	default:
940		return -ENOTTY;
941	}
942}
943
944/* Modified in mce-inject.c, so not static or const */
945struct file_operations mce_chrdev_ops = {
946	.open			= mce_open,
947	.release		= mce_release,
948	.read			= mce_read,
949	.poll			= mce_poll,
950	.unlocked_ioctl		= mce_ioctl,
951};
952EXPORT_SYMBOL_GPL(mce_chrdev_ops);
953
954static struct miscdevice mce_log_device = {
955	MISC_MCELOG_MINOR,
956	"mcelog",
957	&mce_chrdev_ops,
958};
959
960/*
961 * mce=off disables machine check
962 * mce=TOLERANCELEVEL (number, see above)
963 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
964 * mce=nobootlog Don't log MCEs from before booting.
965 */
966static int __init mcheck_enable(char *str)
967{
968	if (*str == 0)
969		enable_p5_mce();
970	if (*str == '=')
971		str++;
972	if (!strcmp(str, "off"))
973		mce_disabled = 1;
974	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
975		mce_bootlog = (str[0] == 'b');
976	else if (isdigit(str[0]))
977		get_option(&str, &tolerant);
978	else {
979		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
980		       str);
981		return 0;
982	}
983	return 1;
984}
985__setup("mce", mcheck_enable);
986
987/*
988 * Sysfs support
989 */
990
991/*
992 * Disable machine checks on suspend and shutdown. We can't really handle
993 * them later.
994 */
995static int mce_disable(void)
996{
997	int i;
998
999	for (i = 0; i < banks; i++) {
1000		if (!skip_bank_init(i))
1001			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1002	}
1003	return 0;
1004}
1005
1006static int mce_suspend(struct sys_device *dev, pm_message_t state)
1007{
1008	return mce_disable();
1009}
1010
1011static int mce_shutdown(struct sys_device *dev)
1012{
1013	return mce_disable();
1014}
1015
1016/*
1017 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1018 * Only one CPU is active at this time, the others get re-added later using
1019 * CPU hotplug:
1020 */
1021static int mce_resume(struct sys_device *dev)
1022{
1023	mce_init();
1024	mce_cpu_features(&current_cpu_data);
1025
1026	return 0;
1027}
1028
1029static void mce_cpu_restart(void *data)
1030{
1031	del_timer_sync(&__get_cpu_var(mce_timer));
1032	if (mce_available(&current_cpu_data))
1033		mce_init();
1034	mce_init_timer();
1035}
1036
1037/* Reinit MCEs after user configuration changes */
1038static void mce_restart(void)
1039{
1040	on_each_cpu(mce_cpu_restart, NULL, 1);
1041}
1042
1043static struct sysdev_class mce_sysclass = {
1044	.suspend	= mce_suspend,
1045	.shutdown	= mce_shutdown,
1046	.resume		= mce_resume,
1047	.name		= "machinecheck",
1048};
1049
1050DEFINE_PER_CPU(struct sys_device, mce_dev);
1051
1052__cpuinitdata
1053void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1054
1055static struct sysdev_attribute *bank_attrs;
1056
1057static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1058			 char *buf)
1059{
1060	u64 b = bank[attr - bank_attrs];
1061
1062	return sprintf(buf, "%llx\n", b);
1063}
1064
1065static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1066			const char *buf, size_t size)
1067{
1068	u64 new;
1069
1070	if (strict_strtoull(buf, 0, &new) < 0)
1071		return -EINVAL;
1072
1073	bank[attr - bank_attrs] = new;
1074	mce_restart();
1075
1076	return size;
1077}
1078
1079static ssize_t
1080show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1081{
1082	strcpy(buf, trigger);
1083	strcat(buf, "\n");
1084	return strlen(trigger) + 1;
1085}
1086
1087static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1088				const char *buf, size_t siz)
1089{
1090	char *p;
1091	int len;
1092
1093	strncpy(trigger, buf, sizeof(trigger));
1094	trigger[sizeof(trigger)-1] = 0;
1095	len = strlen(trigger);
1096	p = strchr(trigger, '\n');
1097
1098	if (*p)
1099		*p = 0;
1100
1101	return len;
1102}
1103
1104static ssize_t store_int_with_restart(struct sys_device *s,
1105				      struct sysdev_attribute *attr,
1106				      const char *buf, size_t size)
1107{
1108	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1109	mce_restart();
1110	return ret;
1111}
1112
1113static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1114static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1115
1116static struct sysdev_ext_attribute attr_check_interval = {
1117	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1118		     store_int_with_restart),
1119	&check_interval
1120};
1121
1122static struct sysdev_attribute *mce_attrs[] = {
1123	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1124	NULL
1125};
1126
1127static cpumask_var_t mce_dev_initialized;
1128
1129/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1130static __cpuinit int mce_create_device(unsigned int cpu)
1131{
1132	int err;
1133	int i;
1134
1135	if (!mce_available(&boot_cpu_data))
1136		return -EIO;
1137
1138	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1139	per_cpu(mce_dev, cpu).id	= cpu;
1140	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1141
1142	err = sysdev_register(&per_cpu(mce_dev, cpu));
1143	if (err)
1144		return err;
1145
1146	for (i = 0; mce_attrs[i]; i++) {
1147		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1148		if (err)
1149			goto error;
1150	}
1151	for (i = 0; i < banks; i++) {
1152		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1153					&bank_attrs[i]);
1154		if (err)
1155			goto error2;
1156	}
1157	cpumask_set_cpu(cpu, mce_dev_initialized);
1158
1159	return 0;
1160error2:
1161	while (--i >= 0)
1162		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1163error:
1164	while (--i >= 0)
1165		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1166
1167	sysdev_unregister(&per_cpu(mce_dev, cpu));
1168
1169	return err;
1170}
1171
1172static __cpuinit void mce_remove_device(unsigned int cpu)
1173{
1174	int i;
1175
1176	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1177		return;
1178
1179	for (i = 0; mce_attrs[i]; i++)
1180		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1181
1182	for (i = 0; i < banks; i++)
1183		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1184
1185	sysdev_unregister(&per_cpu(mce_dev, cpu));
1186	cpumask_clear_cpu(cpu, mce_dev_initialized);
1187}
1188
1189/* Make sure there are no machine checks on offlined CPUs. */
1190static void mce_disable_cpu(void *h)
1191{
1192	unsigned long action = *(unsigned long *)h;
1193	int i;
1194
1195	if (!mce_available(&current_cpu_data))
1196		return;
1197	if (!(action & CPU_TASKS_FROZEN))
1198		cmci_clear();
1199	for (i = 0; i < banks; i++) {
1200		if (!skip_bank_init(i))
1201			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1202	}
1203}
1204
1205static void mce_reenable_cpu(void *h)
1206{
1207	unsigned long action = *(unsigned long *)h;
1208	int i;
1209
1210	if (!mce_available(&current_cpu_data))
1211		return;
1212
1213	if (!(action & CPU_TASKS_FROZEN))
1214		cmci_reenable();
1215	for (i = 0; i < banks; i++) {
1216		if (!skip_bank_init(i))
1217			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1218	}
1219}
1220
1221/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1222static int __cpuinit
1223mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1224{
1225	unsigned int cpu = (unsigned long)hcpu;
1226	struct timer_list *t = &per_cpu(mce_timer, cpu);
1227
1228	switch (action) {
1229	case CPU_ONLINE:
1230	case CPU_ONLINE_FROZEN:
1231		mce_create_device(cpu);
1232		if (threshold_cpu_callback)
1233			threshold_cpu_callback(action, cpu);
1234		break;
1235	case CPU_DEAD:
1236	case CPU_DEAD_FROZEN:
1237		if (threshold_cpu_callback)
1238			threshold_cpu_callback(action, cpu);
1239		mce_remove_device(cpu);
1240		break;
1241	case CPU_DOWN_PREPARE:
1242	case CPU_DOWN_PREPARE_FROZEN:
1243		del_timer_sync(t);
1244		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1245		break;
1246	case CPU_DOWN_FAILED:
1247	case CPU_DOWN_FAILED_FROZEN:
1248		t->expires = round_jiffies(jiffies +
1249						__get_cpu_var(next_interval));
1250		add_timer_on(t, cpu);
1251		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1252		break;
1253	case CPU_POST_DEAD:
1254		/* intentionally ignoring frozen here */
1255		cmci_rediscover(cpu);
1256		break;
1257	}
1258	return NOTIFY_OK;
1259}
1260
1261static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1262	.notifier_call = mce_cpu_callback,
1263};
1264
1265static __init int mce_init_banks(void)
1266{
1267	int i;
1268
1269	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1270				GFP_KERNEL);
1271	if (!bank_attrs)
1272		return -ENOMEM;
1273
1274	for (i = 0; i < banks; i++) {
1275		struct sysdev_attribute *a = &bank_attrs[i];
1276
1277		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1278		if (!a->attr.name)
1279			goto nomem;
1280
1281		a->attr.mode	= 0644;
1282		a->show		= show_bank;
1283		a->store	= set_bank;
1284	}
1285	return 0;
1286
1287nomem:
1288	while (--i >= 0)
1289		kfree(bank_attrs[i].attr.name);
1290	kfree(bank_attrs);
1291	bank_attrs = NULL;
1292
1293	return -ENOMEM;
1294}
1295
1296static __init int mce_init_device(void)
1297{
1298	int err;
1299	int i = 0;
1300
1301	if (!mce_available(&boot_cpu_data))
1302		return -EIO;
1303
1304	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1305
1306	err = mce_init_banks();
1307	if (err)
1308		return err;
1309
1310	err = sysdev_class_register(&mce_sysclass);
1311	if (err)
1312		return err;
1313
1314	for_each_online_cpu(i) {
1315		err = mce_create_device(i);
1316		if (err)
1317			return err;
1318	}
1319
1320	register_hotcpu_notifier(&mce_cpu_notifier);
1321	misc_register(&mce_log_device);
1322
1323	return err;
1324}
1325
1326device_initcall(mce_init_device);
1327
1328#else /* CONFIG_X86_OLD_MCE: */
1329
1330int nr_mce_banks;
1331EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1332
1333/* This has to be run for each processor */
1334void mcheck_init(struct cpuinfo_x86 *c)
1335{
1336	if (mce_disabled == 1)
1337		return;
1338
1339	switch (c->x86_vendor) {
1340	case X86_VENDOR_AMD:
1341		amd_mcheck_init(c);
1342		break;
1343
1344	case X86_VENDOR_INTEL:
1345		if (c->x86 == 5)
1346			intel_p5_mcheck_init(c);
1347		if (c->x86 == 6)
1348			intel_p6_mcheck_init(c);
1349		if (c->x86 == 15)
1350			intel_p4_mcheck_init(c);
1351		break;
1352
1353	case X86_VENDOR_CENTAUR:
1354		if (c->x86 == 5)
1355			winchip_mcheck_init(c);
1356		break;
1357
1358	default:
1359		break;
1360	}
1361	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1362}
1363
1364static int __init mcheck_enable(char *str)
1365{
1366	mce_disabled = -1;
1367	return 1;
1368}
1369
1370__setup("mce", mcheck_enable);
1371
1372#endif /* CONFIG_X86_OLD_MCE */
1373
1374/*
1375 * Old style boot options parsing. Only for compatibility.
1376 */
1377static int __init mcheck_disable(char *str)
1378{
1379	mce_disabled = 1;
1380	return 1;
1381}
1382__setup("nomce", mcheck_disable);
1383