mce.c revision bd19a5e6b73df276e1ccedf9059e9ee70c372d7d
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/ratelimit.h>
14#include <linux/kallsyms.h>
15#include <linux/rcupdate.h>
16#include <linux/kobject.h>
17#include <linux/uaccess.h>
18#include <linux/kdebug.h>
19#include <linux/kernel.h>
20#include <linux/percpu.h>
21#include <linux/string.h>
22#include <linux/sysdev.h>
23#include <linux/ctype.h>
24#include <linux/sched.h>
25#include <linux/sysfs.h>
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/poll.h>
30#include <linux/cpu.h>
31#include <linux/smp.h>
32#include <linux/fs.h>
33
34#include <asm/processor.h>
35#include <asm/idle.h>
36#include <asm/mce.h>
37#include <asm/msr.h>
38
39#include "mce-internal.h"
40#include "mce.h"
41
42/* Handle unconfigured int18 (should never happen) */
43static void unexpected_machine_check(struct pt_regs *regs, long error_code)
44{
45	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
46	       smp_processor_id());
47}
48
49/* Call the installed machine check handler for this CPU setup. */
50void (*machine_check_vector)(struct pt_regs *, long error_code) =
51						unexpected_machine_check;
52
53int				mce_disabled;
54
55#ifdef CONFIG_X86_NEW_MCE
56
57#define MISC_MCELOG_MINOR	227
58
59atomic_t mce_entry;
60
61DEFINE_PER_CPU(unsigned, mce_exception_count);
62
63/*
64 * Tolerant levels:
65 *   0: always panic on uncorrected errors, log corrected errors
66 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
67 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
68 *   3: never panic or SIGBUS, log all errors (for testing only)
69 */
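/*
 * tolerant is consumed by mce_severity() and do_machine_check() below.  It
 * can also be changed at run time through the per-CPU sysfs file
 * /sys/devices/system/machinecheck/machinecheck<n>/tolerant.
 */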
70static int			tolerant = 1;
71static int			banks;
72static u64			*bank;
73static unsigned long		notify_user;
74static int			rip_msr;
75static int			mce_bootlog = -1;
76
77static char			trigger[128];
78static char			*trigger_argv[2] = { trigger, NULL };
79
80static unsigned long		dont_init_banks;
81
82static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
83
84/* MCA banks polled by the periodic polling timer for corrected events */
85DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
86	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
87};
88
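/*
 * Banks marked in dont_init_banks never get their MCi_CTL register
 * written (see the Intel bank 0 quirk in mce_cpu_quirks() below).
 */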
89static inline int skip_bank_init(int i)
90{
91	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
92}
93
94/* Do basic initialization of a struct mce */
95void mce_setup(struct mce *m)
96{
97	memset(m, 0, sizeof(struct mce));
98	m->cpu = m->extcpu = smp_processor_id();
99	rdtscll(m->tsc);
100	/* We hope get_seconds stays lockless */
101	m->time = get_seconds();
102	m->cpuvendor = boot_cpu_data.x86_vendor;
103	m->cpuid = cpuid_eax(1);
104#ifdef CONFIG_SMP
105	m->socketid = cpu_data(m->extcpu).phys_proc_id;
106#endif
107	m->apicid = cpu_data(m->extcpu).initial_apicid;
108	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
109}
110
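/*
 * injectm holds a fake error record for the software injector.  While
 * injectm.finished is set, the MSR wrappers below read from and write to
 * this structure instead of the real hardware registers.
 */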
111DEFINE_PER_CPU(struct mce, injectm);
112EXPORT_PER_CPU_SYMBOL_GPL(injectm);
113
114/*
115 * Lockless MCE logging infrastructure.
116 * This avoids deadlocks on printk locks without having to break locks. It
117 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
118 */
119
120static struct mce_log mcelog = {
121	.signature	= MCE_LOG_SIGNATURE,
122	.len		= MCE_LOG_LEN,
123	.recordlen	= sizeof(struct mce),
124};
125
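/*
 * Append one record to the global mcelog buffer.  A writer reserves a slot
 * with cmpxchg() on mcelog.next and only then marks the entry finished, so
 * this is safe to call from machine check/NMI context on several CPUs at
 * once.
 */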
126void mce_log(struct mce *mce)
127{
128	unsigned next, entry;
129
130	mce->finished = 0;
131	wmb();
132	for (;;) {
133		entry = rcu_dereference(mcelog.next);
134		for (;;) {
135			/*
136			 * When the buffer fills up discard new entries.
137			 * Assume that the earlier errors are the more
138			 * interesting ones:
139			 */
140			if (entry >= MCE_LOG_LEN) {
141				set_bit(MCE_OVERFLOW,
142					(unsigned long *)&mcelog.flags);
143				return;
144			}
145			/* Old left over entry. Skip: */
146			if (mcelog.entry[entry].finished) {
147				entry++;
148				continue;
149			}
150			break;
151		}
152		smp_rmb();
153		next = entry + 1;
154		if (cmpxchg(&mcelog.next, entry, next) == entry)
155			break;
156	}
157	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
158	wmb();
159	mcelog.entry[entry].finished = 1;
160	wmb();
161
162	mce->finished = 1;
163	set_bit(0, &notify_user);
164}
165
166static void print_mce(struct mce *m)
167{
168	printk(KERN_EMERG "\n"
169	       KERN_EMERG "HARDWARE ERROR\n"
170	       KERN_EMERG
171	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
172	       m->extcpu, m->mcgstatus, m->bank, m->status);
173	if (m->ip) {
174		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
175		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
176		       m->cs, m->ip);
177		if (m->cs == __KERNEL_CS)
178			print_symbol("{%s}", m->ip);
179		printk("\n");
180	}
181	printk(KERN_EMERG "TSC %llx ", m->tsc);
182	if (m->addr)
183		printk("ADDR %llx ", m->addr);
184	if (m->misc)
185		printk("MISC %llx ", m->misc);
186	printk("\n");
187	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
188			m->cpuvendor, m->cpuid, m->time, m->socketid,
189			m->apicid);
190	printk(KERN_EMERG "This is not a software problem!\n");
191	printk(KERN_EMERG "Run through mcelog --ascii to decode "
192	       "and contact your hardware vendor\n");
193}
194
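/*
 * Dump every valid record still sitting in mcelog (corrected ones first,
 * the triggering record last), print the explanation string and panic.
 */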
195static void mce_panic(char *msg, struct mce *final, char *exp)
196{
197	int i;
198
199	bust_spinlocks(1);
200	console_verbose();
201	/* First print corrected ones that are still unlogged */
202	for (i = 0; i < MCE_LOG_LEN; i++) {
203		struct mce *m = &mcelog.entry[i];
204		if ((m->status & MCI_STATUS_VAL) &&
205			!(m->status & MCI_STATUS_UC))
206			print_mce(m);
207	}
208	/* Now print uncorrected but with the final one last */
209	for (i = 0; i < MCE_LOG_LEN; i++) {
210		struct mce *m = &mcelog.entry[i];
211		if (!(m->status & MCI_STATUS_VAL))
212			continue;
213		if (!final || memcmp(m, final, sizeof(struct mce)))
214			print_mce(m);
215	}
216	if (final)
217		print_mce(final);
218	if (exp)
219		printk(KERN_EMERG "Machine check: %s\n", exp);
220	panic(msg);
221}
222
223/* Support code for software error injection */
224
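/*
 * Map an MSR number to the offset of the corresponding field in struct mce
 * so that injected values can be used in place of the real registers.
 * Returns -1 for MSRs that are not emulated.
 */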
225static int msr_to_offset(u32 msr)
226{
227	unsigned bank = __get_cpu_var(injectm.bank);
228	if (msr == rip_msr)
229		return offsetof(struct mce, ip);
230	if (msr == MSR_IA32_MC0_STATUS + bank*4)
231		return offsetof(struct mce, status);
232	if (msr == MSR_IA32_MC0_ADDR + bank*4)
233		return offsetof(struct mce, addr);
234	if (msr == MSR_IA32_MC0_MISC + bank*4)
235		return offsetof(struct mce, misc);
236	if (msr == MSR_IA32_MCG_STATUS)
237		return offsetof(struct mce, mcgstatus);
238	return -1;
239}
240
241/* MSR access wrappers used for error injection */
242static u64 mce_rdmsrl(u32 msr)
243{
244	u64 v;
245	if (__get_cpu_var(injectm).finished) {
246		int offset = msr_to_offset(msr);
247		if (offset < 0)
248			return 0;
249		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
250	}
251	rdmsrl(msr, v);
252	return v;
253}
254
255static void mce_wrmsrl(u32 msr, u64 v)
256{
257	if (__get_cpu_var(injectm).finished) {
258		int offset = msr_to_offset(msr);
259		if (offset >= 0)
260			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
261		return;
262	}
263	wrmsrl(msr, v);
264}
265
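/*
 * Machine check support requires both the MCE and MCA CPUID feature bits
 * and must not have been disabled on the command line.
 */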
266int mce_available(struct cpuinfo_x86 *c)
267{
268	if (mce_disabled)
269		return 0;
270	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
271}
272
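/*
 * Record the interrupted CS/IP in the error record.  regs->ip is only
 * trusted when MCG_STATUS.RIPV says it is valid; if the CPU provides a
 * dedicated RIP MSR, that value is used instead and EIPV is set.
 */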
273static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
274{
275	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
276		m->ip = regs->ip;
277		m->cs = regs->cs;
278	} else {
279		m->ip = 0;
280		m->cs = 0;
281	}
282	if (rip_msr) {
283		/* Assume the RIP in the MSR is exact. Is this true? */
284		m->mcgstatus |= MCG_STATUS_EIPV;
285		m->ip = mce_rdmsrl(rip_msr);
286		m->cs = 0;
287	}
288}
289
290DEFINE_PER_CPU(unsigned, mce_poll_count);
291
292/*
293 * Poll for corrected events or events that happened before reset.
294 * Those are just logged through /dev/mcelog.
295 *
296 * This is executed in standard interrupt context.
297 */
298void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
299{
300	struct mce m;
301	int i;
302
303	__get_cpu_var(mce_poll_count)++;
304
305	mce_setup(&m);
306
307	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
308	for (i = 0; i < banks; i++) {
309		if (!bank[i] || !test_bit(i, *b))
310			continue;
311
312		m.misc = 0;
313		m.addr = 0;
314		m.bank = i;
315		m.tsc = 0;
316
317		barrier();
318		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
319		if (!(m.status & MCI_STATUS_VAL))
320			continue;
321
322		/*
323		 * Uncorrected events are handled by the exception handler
324		 * when it is enabled. But when the exception is disabled log
325		 * everything.
326		 *
327		 * TBD do the same check for MCI_STATUS_EN here?
328		 */
329		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
330			continue;
331
332		if (m.status & MCI_STATUS_MISCV)
333			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
334		if (m.status & MCI_STATUS_ADDRV)
335			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
336
337		if (!(flags & MCP_TIMESTAMP))
338			m.tsc = 0;
339		/*
340		 * Don't get the IP here because it's unlikely to
341		 * have anything to do with the actual error location.
342		 */
343		if (!(flags & MCP_DONTLOG)) {
344			mce_log(&m);
345			add_taint(TAINT_MACHINE_CHECK);
346		}
347
348		/*
349		 * Clear state for this bank.
350		 */
351		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
352	}
353
354	/*
355	 * Don't clear MCG_STATUS here because it's only defined for
356	 * exceptions.
357	 */
358
359	sync_core();
360}
361EXPORT_SYMBOL_GPL(machine_check_poll);
362
363/*
364 * Do a quick check if any of the events requires a panic.
365 * This decides if we keep the events around or clear them.
366 */
367static int mce_no_way_out(struct mce *m, char **msg)
368{
369	int i;
370
371	for (i = 0; i < banks; i++) {
372		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
373		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
374			return 1;
375	}
376	return 0;
377}
378
379/*
380 * The actual machine check handler. This only handles real
381 * exceptions when something got corrupted coming in through int 18.
382 *
383 * This is executed in NMI context not subject to normal locking rules. This
384 * implies that most kernel services cannot be safely used. Don't even
385 * think about putting a printk in there!
386 */
387void do_machine_check(struct pt_regs *regs, long error_code)
388{
389	struct mce m, panicm;
390	int panicm_found = 0;
391	int i;
392	/*
393	 * If no_way_out gets set, there is no safe way to recover from this
394	 * MCE.  If tolerant is cranked up, we'll try anyway.
395	 */
396	int no_way_out = 0;
397	/*
398	 * If kill_it gets set, there might be a way to recover from this
399	 * error.
400	 */
401	int kill_it = 0;
402	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
403	char *msg = "Unknown";
404
405	atomic_inc(&mce_entry);
406
407	__get_cpu_var(mce_exception_count)++;
408
409	if (notify_die(DIE_NMI, "machine check", regs, error_code,
410			   18, SIGKILL) == NOTIFY_STOP)
411		goto out;
412	if (!banks)
413		goto out;
414
415	mce_setup(&m);
416
417	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
418	no_way_out = mce_no_way_out(&m, &msg);
419
420	barrier();
421
422	for (i = 0; i < banks; i++) {
423		__clear_bit(i, toclear);
424		if (!bank[i])
425			continue;
426
427		m.misc = 0;
428		m.addr = 0;
429		m.bank = i;
430
431		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
432		if ((m.status & MCI_STATUS_VAL) == 0)
433			continue;
434
435		/*
436		 * Corrected errors are handled by machine_check_poll().
437		 * Leave them alone here, unless no_way_out forces a panic.
438		 */
439		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
440			continue;
441
442		/*
443		 * Set taint even when machine check was not enabled.
444		 */
445		add_taint(TAINT_MACHINE_CHECK);
446
447		__set_bit(i, toclear);
448
449		if (m.status & MCI_STATUS_EN) {
450			/*
451			 * If this error was uncorrectable and there was
452			 * an overflow, we're in trouble.  If no overflow,
453			 * we might get away with just killing a task.
454			 */
455			if (m.status & MCI_STATUS_UC)
456				kill_it = 1;
457		} else {
458			/*
459			 * Machine check event was not enabled. Clear, but
460			 * ignore.
461			 */
462			continue;
463		}
464
465		if (m.status & MCI_STATUS_MISCV)
466			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
467		if (m.status & MCI_STATUS_ADDRV)
468			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
469
470		mce_get_rip(&m, regs);
471		mce_log(&m);
472
473		/*
474		 * Did this bank cause the exception?
475		 *
476		 * Assume that the bank with uncorrectable errors did it,
477		 * and that there is only a single one:
478		 */
479		if ((m.status & MCI_STATUS_UC) &&
480					(m.status & MCI_STATUS_EN)) {
481			panicm = m;
482			panicm_found = 1;
483		}
484	}
485
486	/*
487	 * If we didn't find an uncorrectable error, pick
488	 * the last one (shouldn't happen, just being safe).
489	 */
490	if (!panicm_found)
491		panicm = m;
492
493	/*
494	 * If we have decided that we just CAN'T continue, and the user
495	 * has not set tolerant to an insane level, give up and die.
496	 */
497	if (no_way_out && tolerant < 3)
498		mce_panic("Machine check", &panicm, msg);
499
500	/*
501	 * If the error seems to be unrecoverable, something should be
502	 * done.  Try to kill as little as possible.  If we can kill just
503	 * one task, do that.  If the user has set the tolerance very
504	 * high, don't try to do anything at all.
505	 */
506	if (kill_it && tolerant < 3) {
507		int user_space = 0;
508
509		/*
510		 * If the EIPV bit is set, it means the saved IP is the
511		 * instruction which caused the MCE.
512		 */
513		if (m.mcgstatus & MCG_STATUS_EIPV)
514			user_space = panicm.ip && (panicm.cs & 3);
515
516		/*
517		 * If we know that the error was in user space, send a
518		 * SIGBUS.  Otherwise, panic if tolerance is low.
519		 *
520		 * force_sig() takes an awful lot of locks and has a slight
521		 * risk of deadlocking.
522		 */
523		if (user_space) {
524			force_sig(SIGBUS, current);
525		} else if (panic_on_oops || tolerant < 2) {
526			mce_panic("Uncorrected machine check", &panicm, msg);
527		}
528	}
529
530	/* notify userspace ASAP */
531	set_thread_flag(TIF_MCE_NOTIFY);
532
533	/* the last thing we do is clear state */
534	for (i = 0; i < banks; i++) {
535		if (test_bit(i, toclear))
536			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
537	}
538	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
539out:
540	atomic_dec(&mce_entry);
541	sync_core();
542}
543EXPORT_SYMBOL_GPL(do_machine_check);
544
545#ifdef CONFIG_X86_MCE_INTEL
546/**
547 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
548 * @status: Event status information
549 *
550 * This function should be called by the thermal interrupt after the
551 * event has been processed and the decision was made to log the event
552 * further.  The CPU the event occurred on is recorded implicitly by
553 * mce_setup().
554 *
555 * The status parameter will be saved to the 'status' field of 'struct mce'
556 * and historically has been the register value of the
557 * MSR_IA32_THERMAL_STATUS (Intel) MSR.
558 */
559void mce_log_therm_throt_event(__u64 status)
560{
561	struct mce m;
562
563	mce_setup(&m);
564	m.bank = MCE_THERMAL_BANK;
565	m.status = status;
566	mce_log(&m);
567}
568#endif /* CONFIG_X86_MCE_INTEL */
569
570/*
571 * Periodic polling timer for "silent" machine check errors.  If the
572 * poller finds an MCE, poll 2x faster.  When the poller finds no more
573 * errors, poll 2x slower (up to check_interval seconds).
574 */
575static int check_interval = 5 * 60; /* 5 minutes */
576
577static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
578static DEFINE_PER_CPU(struct timer_list, mce_timer);
579
580static void mcheck_timer(unsigned long data)
581{
582	struct timer_list *t = &per_cpu(mce_timer, data);
583	int *n;
584
585	WARN_ON(smp_processor_id() != data);
586
587	if (mce_available(&current_cpu_data)) {
588		machine_check_poll(MCP_TIMESTAMP,
589				&__get_cpu_var(mce_poll_banks));
590	}
591
592	/*
593	 * Alert userspace if needed.  If we logged an MCE, reduce the
594	 * polling interval, otherwise increase the polling interval.
595	 */
596	n = &__get_cpu_var(next_interval);
597	if (mce_notify_user())
598		*n = max(*n/2, HZ/100);
599	else
600		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
601
602	t->expires = jiffies + *n;
603	add_timer(t);
604}
605
606static void mce_do_trigger(struct work_struct *work)
607{
608	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
609}
610
611static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
612
613/*
614 * Notify the user(s) about new machine check events.
615 * Can be called from interrupt context, but not from machine check/NMI
616 * context.
617 */
618int mce_notify_user(void)
619{
620	/* Not more than two messages every minute */
621	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
622
623	clear_thread_flag(TIF_MCE_NOTIFY);
624
625	if (test_and_clear_bit(0, &notify_user)) {
626		wake_up_interruptible(&mce_wait);
627
628		/*
629		 * There is no risk of missing notifications because
630		 * work_pending is always cleared before the function is
631		 * executed.
632		 */
633		if (trigger[0] && !work_pending(&mce_trigger_work))
634			schedule_work(&mce_trigger_work);
635
636		if (__ratelimit(&ratelimit))
637			printk(KERN_INFO "Machine check events logged\n");
638
639		return 1;
640	}
641	return 0;
642}
643EXPORT_SYMBOL_GPL(mce_notify_user);
644
645/*
646 * Initialize Machine Checks for a CPU.
647 */
648static int mce_cap_init(void)
649{
650	unsigned b;
651	u64 cap;
652
653	rdmsrl(MSR_IA32_MCG_CAP, cap);
654
655	b = cap & MCG_BANKCNT_MASK;
656	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
657
658	if (b > MAX_NR_BANKS) {
659		printk(KERN_WARNING
660		       "MCE: Using only %u machine check banks out of %u\n",
661			MAX_NR_BANKS, b);
662		b = MAX_NR_BANKS;
663	}
664
665	/* Don't support asymmetric configurations today */
666	WARN_ON(banks != 0 && b != banks);
667	banks = b;
668	if (!bank) {
669		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
670		if (!bank)
671			return -ENOMEM;
672		memset(bank, 0xff, banks * sizeof(u64));
673	}
674
675	/* Use accurate RIP reporting if available. */
676	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
677		rip_msr = MSR_IA32_MCG_EIP;
678
679	return 0;
680}
681
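/*
 * Program the machine check hardware: poll anything left over from before
 * the reset (logged unless bootlog is disabled), enable CR4.MCE, open up
 * MCG_CTL and the per-bank control registers, and clear stale bank status.
 */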
682static void mce_init(void)
683{
684	mce_banks_t all_banks;
685	u64 cap;
686	int i;
687
688	/*
689	 * Log the machine checks left over from the previous reset.
690	 */
691	bitmap_fill(all_banks, MAX_NR_BANKS);
692	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
693
694	set_in_cr4(X86_CR4_MCE);
695
696	rdmsrl(MSR_IA32_MCG_CAP, cap);
697	if (cap & MCG_CTL_P)
698		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
699
700	for (i = 0; i < banks; i++) {
701		if (skip_bank_init(i))
702			continue;
703		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
704		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
705	}
706}
707
708/* Add per CPU specific workarounds here */
709static void mce_cpu_quirks(struct cpuinfo_x86 *c)
710{
711	/* This should be disabled by the BIOS, but isn't always */
712	if (c->x86_vendor == X86_VENDOR_AMD) {
713		if (c->x86 == 15 && banks > 4) {
714			/*
715			 * disable GART TBL walk error reporting, which
716			 * trips off incorrectly with the IOMMU & 3ware
717			 * & Cerberus:
718			 */
719			clear_bit(10, (unsigned long *)&bank[4]);
720		}
721		if (c->x86 <= 17 && mce_bootlog < 0) {
722			/*
723			 * Lots of broken BIOSes around that don't clear them
724			 * by default and leave crap in there. Don't log:
725			 */
726			mce_bootlog = 0;
727		}
728		/*
729		 * Various K7s with broken bank 0 around. Always disable
730		 * by default.
731		 */
732		if (c->x86 == 6)
733			bank[0] = 0;
734	}
735
736	if (c->x86_vendor == X86_VENDOR_INTEL) {
737		/*
738		 * SDM documents that on family 6 bank 0 should not be written
739		 * because it aliases to another special BIOS controlled
740		 * register.
741		 * It is no longer aliased on models 0x1a and later.
742		 * Don't ignore bank 0 completely because there could be a
743		 * valid event later; merely don't write CTL0.
744		 */
745
746		if (c->x86 == 6 && c->x86_model < 0x1A)
747			__set_bit(0, &dont_init_banks);
748	}
749}
750
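/*
 * Family 5 CPUs have pre-MCA machine check implementations; hand them off
 * to the legacy P5/WinChip handlers.
 */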
751static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
752{
753	if (c->x86 != 5)
754		return;
755	switch (c->x86_vendor) {
756	case X86_VENDOR_INTEL:
757		if (mce_p5_enabled())
758			intel_p5_mcheck_init(c);
759		break;
760	case X86_VENDOR_CENTAUR:
761		winchip_mcheck_init(c);
762		break;
763	}
764}
765
766static void mce_cpu_features(struct cpuinfo_x86 *c)
767{
768	switch (c->x86_vendor) {
769	case X86_VENDOR_INTEL:
770		mce_intel_feature_init(c);
771		break;
772	case X86_VENDOR_AMD:
773		mce_amd_feature_init(c);
774		break;
775	default:
776		break;
777	}
778}
779
780static void mce_init_timer(void)
781{
782	struct timer_list *t = &__get_cpu_var(mce_timer);
783	int *n = &__get_cpu_var(next_interval);
784
785	*n = check_interval * HZ;
786	if (!*n)
787		return;
788	setup_timer(t, mcheck_timer, smp_processor_id());
789	t->expires = round_jiffies(jiffies + *n);
790	add_timer(t);
791}
792
793/*
794 * Called for each booted CPU to set up machine checks.
795 * Must be called with preempt off:
796 */
797void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
798{
799	if (mce_disabled)
800		return;
801
802	mce_ancient_init(c);
803
804	if (!mce_available(c))
805		return;
806
807	if (mce_cap_init() < 0) {
808		mce_disabled = 1;
809		return;
810	}
811	mce_cpu_quirks(c);
812
813	machine_check_vector = do_machine_check;
814
815	mce_init();
816	mce_cpu_features(c);
817	mce_init_timer();
818}
819
820/*
821 * Character device to read and clear the MCE log.
822 */
823
824static DEFINE_SPINLOCK(mce_state_lock);
825static int		open_count;		/* #times opened */
826static int		open_exclu;		/* already open exclusive? */
827
828static int mce_open(struct inode *inode, struct file *file)
829{
830	spin_lock(&mce_state_lock);
831
832	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
833		spin_unlock(&mce_state_lock);
834
835		return -EBUSY;
836	}
837
838	if (file->f_flags & O_EXCL)
839		open_exclu = 1;
840	open_count++;
841
842	spin_unlock(&mce_state_lock);
843
844	return nonseekable_open(inode, file);
845}
846
847static int mce_release(struct inode *inode, struct file *file)
848{
849	spin_lock(&mce_state_lock);
850
851	open_count--;
852	open_exclu = 0;
853
854	spin_unlock(&mce_state_lock);
855
856	return 0;
857}
858
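/*
 * Snapshot the TSC on every CPU.  mce_read() compares these timestamps
 * against each entry's TSC to decide which leftover records were written
 * early enough to be copied out safely.
 */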
859static void collect_tscs(void *data)
860{
861	unsigned long *cpu_tsc = (unsigned long *)data;
862
863	rdtscll(cpu_tsc[smp_processor_id()]);
864}
865
866static DEFINE_MUTEX(mce_read_mutex);
867
868static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
869			loff_t *off)
870{
871	char __user *buf = ubuf;
872	unsigned long *cpu_tsc;
873	unsigned prev, next;
874	int i, err;
875
876	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
877	if (!cpu_tsc)
878		return -ENOMEM;
879
880	mutex_lock(&mce_read_mutex);
881	next = rcu_dereference(mcelog.next);
882
883	/* Only supports full reads right now */
884	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
885		mutex_unlock(&mce_read_mutex);
886		kfree(cpu_tsc);
887
888		return -EINVAL;
889	}
890
891	err = 0;
892	prev = 0;
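	/*
	 * Drain the buffer in passes: copy out finished records, give
	 * in-flight writers a couple of jiffies to finish (stale entries are
	 * dropped), then atomically reset mcelog.next.  Loop in case new
	 * records were logged in the meantime.
	 */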
893	do {
894		for (i = prev; i < next; i++) {
895			unsigned long start = jiffies;
896
897			while (!mcelog.entry[i].finished) {
898				if (time_after_eq(jiffies, start + 2)) {
899					memset(mcelog.entry + i, 0,
900					       sizeof(struct mce));
901					goto timeout;
902				}
903				cpu_relax();
904			}
905			smp_rmb();
906			err |= copy_to_user(buf, mcelog.entry + i,
907					    sizeof(struct mce));
908			buf += sizeof(struct mce);
909timeout:
910			;
911		}
912
913		memset(mcelog.entry + prev, 0,
914		       (next - prev) * sizeof(struct mce));
915		prev = next;
916		next = cmpxchg(&mcelog.next, prev, 0);
917	} while (next != prev);
918
919	synchronize_sched();
920
921	/*
922	 * Collect entries that were still getting written before the
923	 * synchronize.
924	 */
925	on_each_cpu(collect_tscs, cpu_tsc, 1);
926
927	for (i = next; i < MCE_LOG_LEN; i++) {
928		if (mcelog.entry[i].finished &&
929		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
930			err |= copy_to_user(buf, mcelog.entry+i,
931					    sizeof(struct mce));
932			smp_rmb();
933			buf += sizeof(struct mce);
934			memset(&mcelog.entry[i], 0, sizeof(struct mce));
935		}
936	}
937	mutex_unlock(&mce_read_mutex);
938	kfree(cpu_tsc);
939
940	return err ? -EFAULT : buf - ubuf;
941}
942
943static unsigned int mce_poll(struct file *file, poll_table *wait)
944{
945	poll_wait(file, &mce_wait, wait);
946	if (rcu_dereference(mcelog.next))
947		return POLLIN | POLLRDNORM;
948	return 0;
949}
950
951static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
952{
953	int __user *p = (int __user *)arg;
954
955	if (!capable(CAP_SYS_ADMIN))
956		return -EPERM;
957
958	switch (cmd) {
959	case MCE_GET_RECORD_LEN:
960		return put_user(sizeof(struct mce), p);
961	case MCE_GET_LOG_LEN:
962		return put_user(MCE_LOG_LEN, p);
963	case MCE_GETCLEAR_FLAGS: {
964		unsigned flags;
965
966		do {
967			flags = mcelog.flags;
968		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
969
970		return put_user(flags, p);
971	}
972	default:
973		return -ENOTTY;
974	}
975}
976
977/* Modified in mce-inject.c, so not static or const */
978struct file_operations mce_chrdev_ops = {
979	.open			= mce_open,
980	.release		= mce_release,
981	.read			= mce_read,
982	.poll			= mce_poll,
983	.unlocked_ioctl		= mce_ioctl,
984};
985EXPORT_SYMBOL_GPL(mce_chrdev_ops);
986
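/*
 * Illustrative sketch (assumed userspace code, not part of this file) of how
 * a reader such as mcelog(8) might use this device:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recordlen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc(recordlen * loglen);
 *	read(fd, buf, recordlen * loglen);
 *
 * mce_read() only accepts reads covering the whole buffer, so partial reads
 * fail with -EINVAL.
 */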
987static struct miscdevice mce_log_device = {
988	MISC_MCELOG_MINOR,
989	"mcelog",
990	&mce_chrdev_ops,
991};
992
993/*
994 * mce=off disables machine check
995 * mce=TOLERANCELEVEL (number, see above)
996 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
997 * mce=nobootlog Don't log MCEs from before booting.
998 */
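/*
 * Illustrative examples: "mce=2" raises the tolerance level to 2, while
 * "mce=nobootlog" keeps pre-boot errors out of the log.
 */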
999static int __init mcheck_enable(char *str)
1000{
1001	if (*str == 0)
1002		enable_p5_mce();
1003	if (*str == '=')
1004		str++;
1005	if (!strcmp(str, "off"))
1006		mce_disabled = 1;
1007	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1008		mce_bootlog = (str[0] == 'b');
1009	else if (isdigit(str[0]))
1010		get_option(&str, &tolerant);
1011	else {
1012		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1013		       str);
1014		return 0;
1015	}
1016	return 1;
1017}
1018__setup("mce", mcheck_enable);
1019
1020/*
1021 * Sysfs support
1022 */
1023
1024/*
1025 * Disable machine checks on suspend and shutdown. We can't really handle
1026 * them later.
1027 */
1028static int mce_disable(void)
1029{
1030	int i;
1031
1032	for (i = 0; i < banks; i++) {
1033		if (!skip_bank_init(i))
1034			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1035	}
1036	return 0;
1037}
1038
1039static int mce_suspend(struct sys_device *dev, pm_message_t state)
1040{
1041	return mce_disable();
1042}
1043
1044static int mce_shutdown(struct sys_device *dev)
1045{
1046	return mce_disable();
1047}
1048
1049/*
1050 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1051 * Only one CPU is active at this time, the others get re-added later using
1052 * CPU hotplug:
1053 */
1054static int mce_resume(struct sys_device *dev)
1055{
1056	mce_init();
1057	mce_cpu_features(&current_cpu_data);
1058
1059	return 0;
1060}
1061
1062static void mce_cpu_restart(void *data)
1063{
1064	del_timer_sync(&__get_cpu_var(mce_timer));
1065	if (mce_available(&current_cpu_data))
1066		mce_init();
1067	mce_init_timer();
1068}
1069
1070/* Reinit MCEs after user configuration changes */
1071static void mce_restart(void)
1072{
1073	on_each_cpu(mce_cpu_restart, NULL, 1);
1074}
1075
1076static struct sysdev_class mce_sysclass = {
1077	.suspend	= mce_suspend,
1078	.shutdown	= mce_shutdown,
1079	.resume		= mce_resume,
1080	.name		= "machinecheck",
1081};
1082
1083DEFINE_PER_CPU(struct sys_device, mce_dev);
1084
1085__cpuinitdata
1086void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1087
1088static struct sysdev_attribute *bank_attrs;
1089
1090static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1091			 char *buf)
1092{
1093	u64 b = bank[attr - bank_attrs];
1094
1095	return sprintf(buf, "%llx\n", b);
1096}
1097
1098static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1099			const char *buf, size_t size)
1100{
1101	u64 new;
1102
1103	if (strict_strtoull(buf, 0, &new) < 0)
1104		return -EINVAL;
1105
1106	bank[attr - bank_attrs] = new;
1107	mce_restart();
1108
1109	return size;
1110}
1111
1112static ssize_t
1113show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1114{
1115	strcpy(buf, trigger);
1116	strcat(buf, "\n");
1117	return strlen(trigger) + 1;
1118}
1119
1120static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1121				const char *buf, size_t siz)
1122{
1123	char *p;
1124	int len;
1125
1126	strncpy(trigger, buf, sizeof(trigger));
1127	trigger[sizeof(trigger)-1] = 0;
1128	len = strlen(trigger);
1129	p = strchr(trigger, '\n');
1130
1131	if (p)
1132		*p = 0;
1133
1134	return len;
1135}
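
/*
 * Illustrative only: root could point the notification helper at a script
 * with something like
 *	echo /usr/local/sbin/mce-handler > /sys/devices/system/machinecheck/machinecheck0/trigger
 * (the helper path is an example; nothing with that name is shipped here).
 */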
1136
1137static ssize_t store_int_with_restart(struct sys_device *s,
1138				      struct sysdev_attribute *attr,
1139				      const char *buf, size_t size)
1140{
1141	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1142	mce_restart();
1143	return ret;
1144}
1145
1146static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1147static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1148
1149static struct sysdev_ext_attribute attr_check_interval = {
1150	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1151		     store_int_with_restart),
1152	&check_interval
1153};
1154
1155static struct sysdev_attribute *mce_attrs[] = {
1156	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1157	NULL
1158};
1159
1160static cpumask_var_t mce_dev_initialized;
1161
1162/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1163static __cpuinit int mce_create_device(unsigned int cpu)
1164{
1165	int err;
1166	int i;
1167
1168	if (!mce_available(&boot_cpu_data))
1169		return -EIO;
1170
1171	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1172	per_cpu(mce_dev, cpu).id	= cpu;
1173	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1174
1175	err = sysdev_register(&per_cpu(mce_dev, cpu));
1176	if (err)
1177		return err;
1178
1179	for (i = 0; mce_attrs[i]; i++) {
1180		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1181		if (err)
1182			goto error;
1183	}
1184	for (i = 0; i < banks; i++) {
1185		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1186					&bank_attrs[i]);
1187		if (err)
1188			goto error2;
1189	}
1190	cpumask_set_cpu(cpu, mce_dev_initialized);
1191
1192	return 0;
1193error2:
1194	while (--i >= 0)
1195		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1196error:
1197	while (--i >= 0)
1198		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1199
1200	sysdev_unregister(&per_cpu(mce_dev, cpu));
1201
1202	return err;
1203}
1204
1205static __cpuinit void mce_remove_device(unsigned int cpu)
1206{
1207	int i;
1208
1209	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1210		return;
1211
1212	for (i = 0; mce_attrs[i]; i++)
1213		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1214
1215	for (i = 0; i < banks; i++)
1216		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1217
1218	sysdev_unregister(&per_cpu(mce_dev, cpu));
1219	cpumask_clear_cpu(cpu, mce_dev_initialized);
1220}
1221
1222/* Make sure there are no machine checks on offlined CPUs. */
1223static void mce_disable_cpu(void *h)
1224{
1225	unsigned long action = *(unsigned long *)h;
1226	int i;
1227
1228	if (!mce_available(&current_cpu_data))
1229		return;
1230	if (!(action & CPU_TASKS_FROZEN))
1231		cmci_clear();
1232	for (i = 0; i < banks; i++) {
1233		if (!skip_bank_init(i))
1234			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1235	}
1236}
1237
1238static void mce_reenable_cpu(void *h)
1239{
1240	unsigned long action = *(unsigned long *)h;
1241	int i;
1242
1243	if (!mce_available(&current_cpu_data))
1244		return;
1245
1246	if (!(action & CPU_TASKS_FROZEN))
1247		cmci_reenable();
1248	for (i = 0; i < banks; i++) {
1249		if (!skip_bank_init(i))
1250			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1251	}
1252}
1253
1254/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1255static int __cpuinit
1256mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1257{
1258	unsigned int cpu = (unsigned long)hcpu;
1259	struct timer_list *t = &per_cpu(mce_timer, cpu);
1260
1261	switch (action) {
1262	case CPU_ONLINE:
1263	case CPU_ONLINE_FROZEN:
1264		mce_create_device(cpu);
1265		if (threshold_cpu_callback)
1266			threshold_cpu_callback(action, cpu);
1267		break;
1268	case CPU_DEAD:
1269	case CPU_DEAD_FROZEN:
1270		if (threshold_cpu_callback)
1271			threshold_cpu_callback(action, cpu);
1272		mce_remove_device(cpu);
1273		break;
1274	case CPU_DOWN_PREPARE:
1275	case CPU_DOWN_PREPARE_FROZEN:
1276		del_timer_sync(t);
1277		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1278		break;
1279	case CPU_DOWN_FAILED:
1280	case CPU_DOWN_FAILED_FROZEN:
1281		t->expires = round_jiffies(jiffies +
1282						__get_cpu_var(next_interval));
1283		add_timer_on(t, cpu);
1284		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1285		break;
1286	case CPU_POST_DEAD:
1287		/* intentionally ignoring frozen here */
1288		cmci_rediscover(cpu);
1289		break;
1290	}
1291	return NOTIFY_OK;
1292}
1293
1294static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1295	.notifier_call = mce_cpu_callback,
1296};
1297
1298static __init int mce_init_banks(void)
1299{
1300	int i;
1301
1302	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1303				GFP_KERNEL);
1304	if (!bank_attrs)
1305		return -ENOMEM;
1306
1307	for (i = 0; i < banks; i++) {
1308		struct sysdev_attribute *a = &bank_attrs[i];
1309
1310		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1311		if (!a->attr.name)
1312			goto nomem;
1313
1314		a->attr.mode	= 0644;
1315		a->show		= show_bank;
1316		a->store	= set_bank;
1317	}
1318	return 0;
1319
1320nomem:
1321	while (--i >= 0)
1322		kfree(bank_attrs[i].attr.name);
1323	kfree(bank_attrs);
1324	bank_attrs = NULL;
1325
1326	return -ENOMEM;
1327}
1328
1329static __init int mce_init_device(void)
1330{
1331	int err;
1332	int i = 0;
1333
1334	if (!mce_available(&boot_cpu_data))
1335		return -EIO;
1336
1337	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1338
1339	err = mce_init_banks();
1340	if (err)
1341		return err;
1342
1343	err = sysdev_class_register(&mce_sysclass);
1344	if (err)
1345		return err;
1346
1347	for_each_online_cpu(i) {
1348		err = mce_create_device(i);
1349		if (err)
1350			return err;
1351	}
1352
1353	register_hotcpu_notifier(&mce_cpu_notifier);
1354	misc_register(&mce_log_device);
1355
1356	return err;
1357}
1358
1359device_initcall(mce_init_device);
1360
1361#else /* CONFIG_X86_OLD_MCE: */
1362
1363int nr_mce_banks;
1364EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1365
1366/* This has to be run for each processor */
1367void mcheck_init(struct cpuinfo_x86 *c)
1368{
1369	if (mce_disabled == 1)
1370		return;
1371
1372	switch (c->x86_vendor) {
1373	case X86_VENDOR_AMD:
1374		amd_mcheck_init(c);
1375		break;
1376
1377	case X86_VENDOR_INTEL:
1378		if (c->x86 == 5)
1379			intel_p5_mcheck_init(c);
1380		if (c->x86 == 6)
1381			intel_p6_mcheck_init(c);
1382		if (c->x86 == 15)
1383			intel_p4_mcheck_init(c);
1384		break;
1385
1386	case X86_VENDOR_CENTAUR:
1387		if (c->x86 == 5)
1388			winchip_mcheck_init(c);
1389		break;
1390
1391	default:
1392		break;
1393	}
1394	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1395}
1396
1397static int __init mcheck_enable(char *str)
1398{
1399	mce_disabled = -1;
1400	return 1;
1401}
1402
1403__setup("mce", mcheck_enable);
1404
1405#endif /* CONFIG_X86_OLD_MCE */
1406
1407/*
1408 * Old style boot options parsing. Only for compatibility.
1409 */
1410static int __init mcheck_disable(char *str)
1411{
1412	mce_disabled = 1;
1413	return 1;
1414}
1415__setup("nomce", mcheck_disable);
1416