mce.c revision 450cc201038f31bd496e1b3a44a49790b8827a06
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/thread_info.h>
14#include <linux/capability.h>
15#include <linux/miscdevice.h>
16#include <linux/ratelimit.h>
17#include <linux/kallsyms.h>
18#include <linux/rcupdate.h>
19#include <linux/kobject.h>
20#include <linux/uaccess.h>
21#include <linux/kdebug.h>
22#include <linux/kernel.h>
23#include <linux/percpu.h>
24#include <linux/string.h>
25#include <linux/device.h>
26#include <linux/syscore_ops.h>
27#include <linux/delay.h>
28#include <linux/ctype.h>
29#include <linux/sched.h>
30#include <linux/sysfs.h>
31#include <linux/types.h>
32#include <linux/slab.h>
33#include <linux/init.h>
34#include <linux/kmod.h>
35#include <linux/poll.h>
36#include <linux/nmi.h>
37#include <linux/cpu.h>
38#include <linux/smp.h>
39#include <linux/fs.h>
40#include <linux/mm.h>
41#include <linux/debugfs.h>
42#include <linux/irq_work.h>
43#include <linux/export.h>
44
45#include <asm/processor.h>
46#include <asm/mce.h>
47#include <asm/msr.h>
48
49#include "mce-internal.h"
50
51static DEFINE_MUTEX(mce_chrdev_read_mutex);
52
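/*
 * Lockdep-checked dereference for mcelog fields: callers must either be
 * inside an rcu_read_lock_sched() section or hold mce_chrdev_read_mutex.
 */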
53#define rcu_dereference_check_mce(p) \
54	rcu_dereference_index_check((p), \
55			      rcu_read_lock_sched_held() || \
56			      lockdep_is_held(&mce_chrdev_read_mutex))
57
58#define CREATE_TRACE_POINTS
59#include <trace/events/mce.h>
60
61int mce_disabled __read_mostly;
62
63#define SPINUNIT 100	/* 100ns */
64
65atomic_t mce_entry;
66
67DEFINE_PER_CPU(unsigned, mce_exception_count);
68
69/*
70 * Tolerant levels:
71 *   0: always panic on uncorrected errors, log corrected errors
72 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
73 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
74 *   3: never panic or SIGBUS, log all errors (for testing only)
75 */
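/*
 * tolerant defaults to 1. It can be set at boot with the
 * "mce=TOLERANCELEVEL[,monarchtimeout]" parameter parsed in
 * mcheck_enable() below, or at run time via the sysfs "tolerant"
 * attribute (DEVICE_INT_ATTR(tolerant, ...) further down).
 */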
76static int			tolerant		__read_mostly = 1;
77static int			banks			__read_mostly;
78static int			rip_msr			__read_mostly;
79static int			mce_bootlog		__read_mostly = -1;
80static int			monarch_timeout		__read_mostly = -1;
81static int			mce_panic_timeout	__read_mostly;
82static int			mce_dont_log_ce		__read_mostly;
83int				mce_cmci_disabled	__read_mostly;
84int				mce_ignore_ce		__read_mostly;
85int				mce_ser			__read_mostly;
86int				mce_bios_cmci_threshold	__read_mostly;
87
88struct mce_bank                *mce_banks		__read_mostly;
89
90/* User mode helper program triggered by machine check event */
91static unsigned long		mce_need_notify;
92static char			mce_helper[128];
93static char			*mce_helper_argv[2] = { mce_helper, NULL };
94
95static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
96
97static DEFINE_PER_CPU(struct mce, mces_seen);
98static int			cpu_missing;
99
100/* MCA banks polled by the periodic polling timer for corrected events */
101DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
102	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
103};
104
105static DEFINE_PER_CPU(struct work_struct, mce_work);
106
107static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
108
109/*
110 * CPU/chipset specific EDAC code can register a notifier call here to print
111 * MCE errors in a human-readable form.
112 */
113ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
114
115/* Do initial initialization of a struct mce */
116void mce_setup(struct mce *m)
117{
118	memset(m, 0, sizeof(struct mce));
119	m->cpu = m->extcpu = smp_processor_id();
120	rdtscll(m->tsc);
121	/* We hope get_seconds stays lockless */
122	m->time = get_seconds();
123	m->cpuvendor = boot_cpu_data.x86_vendor;
124	m->cpuid = cpuid_eax(1);
125	m->socketid = cpu_data(m->extcpu).phys_proc_id;
126	m->apicid = cpu_data(m->extcpu).initial_apicid;
127	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
128}
129
130DEFINE_PER_CPU(struct mce, injectm);
131EXPORT_PER_CPU_SYMBOL_GPL(injectm);
132
133/*
134 * Lockless MCE logging infrastructure.
135 * This avoids deadlocks on printk locks without having to break locks. Also
136 * separate MCEs from kernel messages to avoid bogus bug reports.
137 */
138
139static struct mce_log mcelog = {
140	.signature	= MCE_LOG_SIGNATURE,
141	.len		= MCE_LOG_LEN,
142	.recordlen	= sizeof(struct mce),
143};
144
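/*
 * mce_log() is lockless so it can be called from machine check context:
 * it claims a free slot by advancing mcelog.next with cmpxchg(), copies
 * the record in, and only then sets the entry's ->finished flag, so
 * readers never observe a half-written record.
 */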
145void mce_log(struct mce *mce)
146{
147	unsigned next, entry;
148	int ret = 0;
149
150	/* Emit the trace record: */
151	trace_mce_record(mce);
152
153	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
154	if (ret == NOTIFY_STOP)
155		return;
156
157	mce->finished = 0;
158	wmb();
159	for (;;) {
160		entry = rcu_dereference_check_mce(mcelog.next);
161		for (;;) {
162
163			/*
164			 * When the buffer fills up, discard new entries.
165			 * Assume that the earlier errors are the more
166			 * interesting ones:
167			 */
168			if (entry >= MCE_LOG_LEN) {
169				set_bit(MCE_OVERFLOW,
170					(unsigned long *)&mcelog.flags);
171				return;
172			}
173			/* Old leftover entry. Skip: */
174			if (mcelog.entry[entry].finished) {
175				entry++;
176				continue;
177			}
178			break;
179		}
180		smp_rmb();
181		next = entry + 1;
182		if (cmpxchg(&mcelog.next, entry, next) == entry)
183			break;
184	}
185	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
186	wmb();
187	mcelog.entry[entry].finished = 1;
188	wmb();
189
190	mce->finished = 1;
191	set_bit(0, &mce_need_notify);
192}
193
194static void drain_mcelog_buffer(void)
195{
196	unsigned int next, i, prev = 0;
197
198	next = ACCESS_ONCE(mcelog.next);
199
200	do {
201		struct mce *m;
202
203		/* drain what was logged during boot */
204		for (i = prev; i < next; i++) {
205			unsigned long start = jiffies;
206			unsigned retries = 1;
207
208			m = &mcelog.entry[i];
209
210			while (!m->finished) {
211				if (time_after_eq(jiffies, start + 2*retries))
212					retries++;
213
214				cpu_relax();
215
216				if (!m->finished && retries >= 4) {
217					pr_err("skipping error being logged currently!\n");
218					break;
219				}
220			}
221			smp_rmb();
222			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
223		}
224
225		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
226		prev = next;
227		next = cmpxchg(&mcelog.next, prev, 0);
228	} while (next != prev);
229}
230
231
232void mce_register_decode_chain(struct notifier_block *nb)
233{
234	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
235	drain_mcelog_buffer();
236}
237EXPORT_SYMBOL_GPL(mce_register_decode_chain);
238
239void mce_unregister_decode_chain(struct notifier_block *nb)
240{
241	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
242}
243EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
244
245static void print_mce(struct mce *m)
246{
247	int ret = 0;
248
249	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
250	       m->extcpu, m->mcgstatus, m->bank, m->status);
251
252	if (m->ip) {
253		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
254			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
255				m->cs, m->ip);
256
257		if (m->cs == __KERNEL_CS)
258			print_symbol("{%s}", m->ip);
259		pr_cont("\n");
260	}
261
262	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
263	if (m->addr)
264		pr_cont("ADDR %llx ", m->addr);
265	if (m->misc)
266		pr_cont("MISC %llx ", m->misc);
267
268	pr_cont("\n");
269	/*
270	 * Note this output is parsed by external tools and old fields
271	 * should not be changed.
272	 */
273	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
274		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
275		cpu_data(m->extcpu).microcode);
276
277	/*
278	 * Print out human-readable details about the MCE error,
279	 * (if the CPU has an implementation for that)
280	 */
281	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
282	if (ret == NOTIFY_STOP)
283		return;
284
285	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
286}
287
288#define PANIC_TIMEOUT 5 /* 5 seconds */
289
290static atomic_t mce_paniced;
291
292static int fake_panic;
293static atomic_t mce_fake_paniced;
294
295/* Panic in progress. Enable interrupts and wait for final IPI */
296static void wait_for_panic(void)
297{
298	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
299
300	preempt_disable();
301	local_irq_enable();
302	while (timeout-- > 0)
303		udelay(1);
304	if (panic_timeout == 0)
305		panic_timeout = mce_panic_timeout;
306	panic("Panicking machine check CPU died");
307}
308
309static void mce_panic(char *msg, struct mce *final, char *exp)
310{
311	int i, apei_err = 0;
312
313	if (!fake_panic) {
314		/*
315		 * Make sure only one CPU runs in machine check panic
316		 */
317		if (atomic_inc_return(&mce_paniced) > 1)
318			wait_for_panic();
319		barrier();
320
321		bust_spinlocks(1);
322		console_verbose();
323	} else {
324		/* Don't log too much for fake panic */
325		if (atomic_inc_return(&mce_fake_paniced) > 1)
326			return;
327	}
328	/* First print corrected ones that are still unlogged */
329	for (i = 0; i < MCE_LOG_LEN; i++) {
330		struct mce *m = &mcelog.entry[i];
331		if (!(m->status & MCI_STATUS_VAL))
332			continue;
333		if (!(m->status & MCI_STATUS_UC)) {
334			print_mce(m);
335			if (!apei_err)
336				apei_err = apei_write_mce(m);
337		}
338	}
339	/* Now print uncorrected but with the final one last */
340	for (i = 0; i < MCE_LOG_LEN; i++) {
341		struct mce *m = &mcelog.entry[i];
342		if (!(m->status & MCI_STATUS_VAL))
343			continue;
344		if (!(m->status & MCI_STATUS_UC))
345			continue;
346		if (!final || memcmp(m, final, sizeof(struct mce))) {
347			print_mce(m);
348			if (!apei_err)
349				apei_err = apei_write_mce(m);
350		}
351	}
352	if (final) {
353		print_mce(final);
354		if (!apei_err)
355			apei_err = apei_write_mce(final);
356	}
357	if (cpu_missing)
358		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
359	if (exp)
360		pr_emerg(HW_ERR "Machine check: %s\n", exp);
361	if (!fake_panic) {
362		if (panic_timeout == 0)
363			panic_timeout = mce_panic_timeout;
364		panic(msg);
365	} else
366		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
367}
368
369/* Support code for software error injection */
370
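/*
 * While the per-CPU injectm record is marked ->finished, mce_rdmsrl()
 * and mce_wrmsrl() below redirect accesses to the MCA MSRs of the
 * injected bank into the corresponding fields of injectm instead of the
 * real MSRs; msr_to_offset() maps an MSR number to the struct mce field.
 */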
371static int msr_to_offset(u32 msr)
372{
373	unsigned bank = __this_cpu_read(injectm.bank);
374
375	if (msr == rip_msr)
376		return offsetof(struct mce, ip);
377	if (msr == MSR_IA32_MCx_STATUS(bank))
378		return offsetof(struct mce, status);
379	if (msr == MSR_IA32_MCx_ADDR(bank))
380		return offsetof(struct mce, addr);
381	if (msr == MSR_IA32_MCx_MISC(bank))
382		return offsetof(struct mce, misc);
383	if (msr == MSR_IA32_MCG_STATUS)
384		return offsetof(struct mce, mcgstatus);
385	return -1;
386}
387
388/* MSR access wrappers used for error injection */
389static u64 mce_rdmsrl(u32 msr)
390{
391	u64 v;
392
393	if (__this_cpu_read(injectm.finished)) {
394		int offset = msr_to_offset(msr);
395
396		if (offset < 0)
397			return 0;
398		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
399	}
400
401	if (rdmsrl_safe(msr, &v)) {
402		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
403		/*
404		 * Return zero in case the access faulted. This should
405		 * not happen normally but can happen if the CPU does
406		 * something weird, or if the code is buggy.
407		 */
408		v = 0;
409	}
410
411	return v;
412}
413
414static void mce_wrmsrl(u32 msr, u64 v)
415{
416	if (__this_cpu_read(injectm.finished)) {
417		int offset = msr_to_offset(msr);
418
419		if (offset >= 0)
420			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
421		return;
422	}
423	wrmsrl(msr, v);
424}
425
426/*
427 * Collect all global (w.r.t. this processor) status about this machine
428 * check into our "mce" struct so that we can use it later to assess
429 * the severity of the problem as we read per-bank specific details.
430 */
431static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
432{
433	mce_setup(m);
434
435	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
436	if (regs) {
437		/*
438		 * Get the address of the instruction at the time of
439		 * the machine check error.
440		 */
441		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
442			m->ip = regs->ip;
443			m->cs = regs->cs;
444
445			/*
446			 * When in VM86 mode make the cs look like ring 3
447			 * always. This is a lie, but it's better than passing
448			 * the additional vm86 bit around everywhere.
449			 */
450			if (v8086_mode(regs))
451				m->cs |= 3;
452		}
453		/* Use accurate RIP reporting if available. */
454		if (rip_msr)
455			m->ip = mce_rdmsrl(rip_msr);
456	}
457}
458
459/*
460 * Simple lockless ring to communicate PFNs from the exception handler to the
461 * process context work function. This is vastly simplified because there's
462 * only a single reader and a single writer.
463 */
464#define MCE_RING_SIZE 16	/* one entry is left unused to tell full from empty */
465
466struct mce_ring {
467	unsigned short start;
468	unsigned short end;
469	unsigned long ring[MCE_RING_SIZE];
470};
471static DEFINE_PER_CPU(struct mce_ring, mce_ring);
472
473/* Runs with CPU affinity in workqueue */
474static int mce_ring_empty(void)
475{
476	struct mce_ring *r = &__get_cpu_var(mce_ring);
477
478	return r->start == r->end;
479}
480
481static int mce_ring_get(unsigned long *pfn)
482{
483	struct mce_ring *r;
484	int ret = 0;
485
486	*pfn = 0;
487	get_cpu();
488	r = &__get_cpu_var(mce_ring);
489	if (r->start == r->end)
490		goto out;
491	*pfn = r->ring[r->start];
492	r->start = (r->start + 1) % MCE_RING_SIZE;
493	ret = 1;
494out:
495	put_cpu();
496	return ret;
497}
498
499/* Always runs in MCE context with preempt off */
500static int mce_ring_add(unsigned long pfn)
501{
502	struct mce_ring *r = &__get_cpu_var(mce_ring);
503	unsigned next;
504
505	next = (r->end + 1) % MCE_RING_SIZE;
506	if (next == r->start)
507		return -1;
508	r->ring[r->end] = pfn;
509	wmb();
510	r->end = next;
511	return 0;
512}
513
514int mce_available(struct cpuinfo_x86 *c)
515{
516	if (mce_disabled)
517		return 0;
518	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
519}
520
521static void mce_schedule_work(void)
522{
523	if (!mce_ring_empty()) {
524		struct work_struct *work = &__get_cpu_var(mce_work);
525		if (!work_pending(work))
526			schedule_work(work);
527	}
528}
529
530DEFINE_PER_CPU(struct irq_work, mce_irq_work);
531
532static void mce_irq_work_cb(struct irq_work *entry)
533{
534	mce_notify_irq();
535	mce_schedule_work();
536}
537
538static void mce_report_event(struct pt_regs *regs)
539{
540	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
541		mce_notify_irq();
542		/*
543		 * Triggering the work queue here is just an insurance
544		 * policy in case the syscall exit notify handler
545		 * doesn't run soon enough or ends up running on the
546		 * wrong CPU (can happen when audit sleeps)
547		 */
548		mce_schedule_work();
549		return;
550	}
551
552	irq_work_queue(&__get_cpu_var(mce_irq_work));
553}
554
555/*
556 * Read ADDR and MISC registers.
557 */
558static void mce_read_aux(struct mce *m, int i)
559{
560	if (m->status & MCI_STATUS_MISCV)
561		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
562	if (m->status & MCI_STATUS_ADDRV) {
563		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
564
565		/*
566		 * Mask the reported address by the reported granularity.
567		 */
568		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
569			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
570			m->addr >>= shift;
571			m->addr <<= shift;
572		}
573	}
574}
575
576DEFINE_PER_CPU(unsigned, mce_poll_count);
577
578/*
579 * Poll for corrected events or events that happened before reset.
580 * Those are just logged through /dev/mcelog.
581 *
582 * This is executed in standard interrupt context.
583 *
584 * Note: the spec recommends panicking for fatal unsignalled
585 * errors here. However this would be quite problematic --
586 * we would need to reimplement the Monarch handling and
587 * it would mess up the exclusion between the exception handler
588 * and the poll handler -- so we skip this for now.
589 * These cases should not happen anyway, or only when the CPU
590 * is already totally confused. In this case it's likely it will
591 * not fully execute the machine check handler either.
592 */
593void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
594{
595	struct mce m;
596	int i;
597
598	this_cpu_inc(mce_poll_count);
599
600	mce_gather_info(&m, NULL);
601
602	for (i = 0; i < banks; i++) {
603		if (!mce_banks[i].ctl || !test_bit(i, *b))
604			continue;
605
606		m.misc = 0;
607		m.addr = 0;
608		m.bank = i;
609		m.tsc = 0;
610
611		barrier();
612		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
613		if (!(m.status & MCI_STATUS_VAL))
614			continue;
615
616		/*
617		 * Uncorrected or signalled events are handled by the exception
618		 * handler when it is enabled, so don't process those here.
619		 *
620		 * TBD do the same check for MCI_STATUS_EN here?
621		 */
622		if (!(flags & MCP_UC) &&
623		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
624			continue;
625
626		mce_read_aux(&m, i);
627
628		if (!(flags & MCP_TIMESTAMP))
629			m.tsc = 0;
630		/*
631		 * Don't get the IP here because it's unlikely to
632		 * have anything to do with the actual error location.
633		 */
634		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
635			mce_log(&m);
636
637		/*
638		 * Clear state for this bank.
639		 */
640		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
641	}
642
643	/*
644	 * Don't clear MCG_STATUS here because it's only defined for
645	 * exceptions.
646	 */
647
648	sync_core();
649}
650EXPORT_SYMBOL_GPL(machine_check_poll);
651
652/*
653 * Do a quick check if any of the events requires a panic.
654 * This decides if we keep the events around or clear them.
655 */
656static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
657			  struct pt_regs *regs)
658{
659	int i, ret = 0;
660
661	for (i = 0; i < banks; i++) {
662		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
663		if (m->status & MCI_STATUS_VAL) {
664			__set_bit(i, validp);
665			if (quirk_no_way_out)
666				quirk_no_way_out(i, m, regs);
667		}
668		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
669			ret = 1;
670	}
671	return ret;
672}
673
674/*
675 * Variable to establish order between CPUs while scanning.
676 * Each CPU spins initially until mce_executing equals its number.
677 */
678static atomic_t mce_executing;
679
680/*
681 * Defines order of CPUs on entry. First CPU becomes Monarch.
682 */
683static atomic_t mce_callin;
684
685/*
686 * Check if a timeout waiting for other CPUs happened.
687 */
688static int mce_timed_out(u64 *t)
689{
690	/*
691	 * The others already did panic for some reason.
692	 * Bail out like in a timeout.
693	 * rmb() to tell the compiler that system_state
694	 * might have been modified by someone else.
695	 */
696	rmb();
697	if (atomic_read(&mce_paniced))
698		wait_for_panic();
699	if (!monarch_timeout)
700		goto out;
701	if ((s64)*t < SPINUNIT) {
702		/* CHECKME: Make panic default for 1 too? */
703		if (tolerant < 1)
704			mce_panic("Timeout synchronizing machine check over CPUs",
705				  NULL, NULL);
706		cpu_missing = 1;
707		return 1;
708	}
709	*t -= SPINUNIT;
710out:
711	touch_nmi_watchdog();
712	return 0;
713}
714
715/*
716 * The Monarch's reign.  The Monarch is the CPU who entered
717 * the machine check handler first. It waits for the others to
718 * raise the exception too and then grades them. When any
719 * error is fatal, it panics. Only then does it let the others continue.
720 *
721 * The other CPUs entering the MCE handler will be controlled by the
722 * Monarch. They are called Subjects.
723 *
724 * This way we prevent any potential data corruption in an unrecoverable case
725 * and also make sure that all CPUs' errors are always examined.
726 *
727 * Also this detects the case of a machine check event coming from outer
728 * space (not detected by any CPU). In this case some external agent wants
729 * us to shut down, so panic too.
730 *
731 * The other CPUs might still decide to panic if the handler happens
732 * in an unrecoverable place, but in this case the system is in a semi-stable
733 * state and won't corrupt anything by itself. It's ok to let the others
734 * continue for a bit first.
735 *
736 * All the spin loops have timeouts; when a timeout happens a CPU
737 * typically elects itself to be Monarch.
738 */
739static void mce_reign(void)
740{
741	int cpu;
742	struct mce *m = NULL;
743	int global_worst = 0;
744	char *msg = NULL;
745	char *nmsg = NULL;
746
747	/*
748	 * This CPU is the Monarch and the other CPUs have run
749	 * through their handlers.
750	 * Grade the severity of the errors of all the CPUs.
751	 */
752	for_each_possible_cpu(cpu) {
753		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
754					    &nmsg);
755		if (severity > global_worst) {
756			msg = nmsg;
757			global_worst = severity;
758			m = &per_cpu(mces_seen, cpu);
759		}
760	}
761
762	/*
763	 * Cannot recover? Panic here then.
764	 * This dumps all the mces in the log buffer and stops the
765	 * other CPUs.
766	 */
767	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
768		mce_panic("Fatal Machine check", m, msg);
769
770	/*
771	 * For UC somewhere we let the CPU who detects it handle it.
772	 * Also must let continue the others, otherwise the handling
773	 * CPU could deadlock on a lock.
774	 */
775
776	/*
777	 * No machine check event found. Must be some external
778	 * source or one CPU is hung. Panic.
779	 */
780	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
781		mce_panic("Machine check from unknown source", NULL, NULL);
782
783	/*
784	 * Now clear all the mces_seen so that they don't reappear on
785	 * the next mce.
786	 */
787	for_each_possible_cpu(cpu)
788		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
789}
790
791static atomic_t global_nwo;
792
793/*
794 * Start of Monarch synchronization. This waits until all CPUs have
795 * entered the exception handler and then determines if any of them
796 * saw a fatal event that requires panic. Then it executes them
797 * in the entry order.
798 * TBD double check parallel CPU hotunplug
799 */
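/*
 * Returns the 1-based callin order of this CPU (1 == Monarch), or -1 if
 * monarch_timeout is 0 or the rendezvous timed out. On success
 * *no_way_out is updated to the global consensus.
 */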
800static int mce_start(int *no_way_out)
801{
802	int order;
803	int cpus = num_online_cpus();
804	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
805
806	if (!timeout)
807		return -1;
808
809	atomic_add(*no_way_out, &global_nwo);
810	/*
811	 * global_nwo should be updated before mce_callin
812	 */
813	smp_wmb();
814	order = atomic_inc_return(&mce_callin);
815
816	/*
817	 * Wait for everyone.
818	 */
819	while (atomic_read(&mce_callin) != cpus) {
820		if (mce_timed_out(&timeout)) {
821			atomic_set(&global_nwo, 0);
822			return -1;
823		}
824		ndelay(SPINUNIT);
825	}
826
827	/*
828	 * mce_callin should be read before global_nwo
829	 */
830	smp_rmb();
831
832	if (order == 1) {
833		/*
834		 * Monarch: Starts executing now, the others wait.
835		 */
836		atomic_set(&mce_executing, 1);
837	} else {
838		/*
839		 * Subject: Now start the scanning loop one by one in
840		 * the original callin order.
841		 * This way when there are any shared banks it will be
842		 * only seen by one CPU before cleared, avoiding duplicates.
843		 */
844		while (atomic_read(&mce_executing) < order) {
845			if (mce_timed_out(&timeout)) {
846				atomic_set(&global_nwo, 0);
847				return -1;
848			}
849			ndelay(SPINUNIT);
850		}
851	}
852
853	/*
854	 * Cache the global no_way_out state.
855	 */
856	*no_way_out = atomic_read(&global_nwo);
857
858	return order;
859}
860
861/*
862 * Synchronize between CPUs after main scanning loop.
863 * This invokes the bulk of the Monarch processing.
864 */
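/*
 * Returns 0 when the rendezvous completed, -1 if it timed out or order
 * was invalid; in that case do_machine_check() falls back to its local
 * no_way_out decision.
 */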
865static int mce_end(int order)
866{
867	int ret = -1;
868	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
869
870	if (!timeout)
871		goto reset;
872	if (order < 0)
873		goto reset;
874
875	/*
876	 * Allow others to run.
877	 */
878	atomic_inc(&mce_executing);
879
880	if (order == 1) {
881		/* CHECKME: Can this race with a parallel hotplug? */
882		int cpus = num_online_cpus();
883
884		/*
885		 * Monarch: Wait for everyone to go through their scanning
886		 * loops.
887		 */
888		while (atomic_read(&mce_executing) <= cpus) {
889			if (mce_timed_out(&timeout))
890				goto reset;
891			ndelay(SPINUNIT);
892		}
893
894		mce_reign();
895		barrier();
896		ret = 0;
897	} else {
898		/*
899		 * Subject: Wait for Monarch to finish.
900		 */
901		while (atomic_read(&mce_executing) != 0) {
902			if (mce_timed_out(&timeout))
903				goto reset;
904			ndelay(SPINUNIT);
905		}
906
907		/*
908		 * Don't reset anything. That's done by the Monarch.
909		 */
910		return 0;
911	}
912
913	/*
914	 * Reset all global state.
915	 */
916reset:
917	atomic_set(&global_nwo, 0);
918	atomic_set(&mce_callin, 0);
919	barrier();
920
921	/*
922	 * Let others run again.
923	 */
924	atomic_set(&mce_executing, 0);
925	return ret;
926}
927
928/*
929 * Check if the address reported by the CPU is in a format we can parse.
930 * It would be possible to add code for most other cases, but all would
931 * be somewhat complicated (e.g. segment offset would require an instruction
932 * parser). So only support physical addresses up to page granularity for now.
933 */
934static int mce_usable_address(struct mce *m)
935{
936	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
937		return 0;
938	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
939		return 0;
940	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
941		return 0;
942	return 1;
943}
944
945static void mce_clear_state(unsigned long *toclear)
946{
947	int i;
948
949	for (i = 0; i < banks; i++) {
950		if (test_bit(i, toclear))
951			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
952	}
953}
954
955/*
956 * Need to save faulting physical address associated with a process
957 * in the machine check handler some place where we can grab it back
958 * later in mce_notify_process()
959 */
960#define	MCE_INFO_MAX	16
961
962struct mce_info {
963	atomic_t		inuse;
964	struct task_struct	*t;
965	__u64			paddr;
966	int			restartable;
967} mce_info[MCE_INFO_MAX];
968
969static void mce_save_info(__u64 addr, int c)
970{
971	struct mce_info *mi;
972
973	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
974		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
975			mi->t = current;
976			mi->paddr = addr;
977			mi->restartable = c;
978			return;
979		}
980	}
981
982	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
983}
984
985static struct mce_info *mce_find_info(void)
986{
987	struct mce_info *mi;
988
989	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
990		if (atomic_read(&mi->inuse) && mi->t == current)
991			return mi;
992	return NULL;
993}
994
995static void mce_clear_info(struct mce_info *mi)
996{
997	atomic_set(&mi->inuse, 0);
998}
999
1000/*
1001 * The actual machine check handler. This only handles real
1002 * exceptions when something got corrupted coming in through int 18.
1003 *
1004 * This is executed in NMI context not subject to normal locking rules. This
1005 * implies that most kernel services cannot be safely used. Don't even
1006 * think about putting a printk in there!
1007 *
1008 * On Intel systems this is entered on all CPUs in parallel through
1009 * MCE broadcast. However some CPUs might be broken beyond repair,
1010 * so be always careful when synchronizing with others.
1011 */
1012void do_machine_check(struct pt_regs *regs, long error_code)
1013{
1014	struct mce m, *final;
1015	int i;
1016	int worst = 0;
1017	int severity;
1018	/*
1019	 * Establish sequential order between the CPUs entering the machine
1020	 * check handler.
1021	 */
1022	int order;
1023	/*
1024	 * If no_way_out gets set, there is no safe way to recover from this
1025	 * MCE.  If tolerant is cranked up, we'll try anyway.
1026	 */
1027	int no_way_out = 0;
1028	/*
1029	 * If kill_it gets set, there might be a way to recover from this
1030	 * error.
1031	 */
1032	int kill_it = 0;
1033	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1034	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1035	char *msg = "Unknown";
1036
1037	atomic_inc(&mce_entry);
1038
1039	this_cpu_inc(mce_exception_count);
1040
1041	if (!banks)
1042		goto out;
1043
1044	mce_gather_info(&m, regs);
1045
1046	final = &__get_cpu_var(mces_seen);
1047	*final = m;
1048
1049	memset(valid_banks, 0, sizeof(valid_banks));
1050	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1051
1052	barrier();
1053
1054	/*
1055	 * When there is no restart IP we might need to kill or panic.
1056	 * Assume the worst for now, but if we find the
1057	 * severity is MCE_AR_SEVERITY we have other options.
1058	 */
1059	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1060		kill_it = 1;
1061
1062	/*
1063	 * Go through all the banks in exclusion of the other CPUs.
1064	 * This way we don't report duplicated events on shared banks
1065	 * because the first one to see it will clear it.
1066	 */
1067	order = mce_start(&no_way_out);
1068	for (i = 0; i < banks; i++) {
1069		__clear_bit(i, toclear);
1070		if (!test_bit(i, valid_banks))
1071			continue;
1072		if (!mce_banks[i].ctl)
1073			continue;
1074
1075		m.misc = 0;
1076		m.addr = 0;
1077		m.bank = i;
1078
1079		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1080		if ((m.status & MCI_STATUS_VAL) == 0)
1081			continue;
1082
1083		/*
1084		 * Non-uncorrected or non-signaled errors are handled by
1085		 * machine_check_poll(). Leave them alone, unless this panics.
1086		 */
1087		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1088			!no_way_out)
1089			continue;
1090
1091		/*
1092		 * Set taint even when machine check was not enabled.
1093		 */
1094		add_taint(TAINT_MACHINE_CHECK);
1095
1096		severity = mce_severity(&m, tolerant, NULL);
1097
1098		/*
1099		 * When the machine check was meant for the corrected-error handler,
1100		 * don't touch it here unless we're panicking.
1101		 */
1102		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1103			continue;
1104		__set_bit(i, toclear);
1105		if (severity == MCE_NO_SEVERITY) {
1106			/*
1107			 * Machine check event was not enabled. Clear, but
1108			 * ignore.
1109			 */
1110			continue;
1111		}
1112
1113		mce_read_aux(&m, i);
1114
1115		/*
1116		 * Action optional error. Queue address for later processing.
1117		 * When the ring overflows we just ignore the AO error.
1118		 * RED-PEN add some logging mechanism when
1119			 * mce_usable_address() or mce_ring_add() fails.
1120		 * RED-PEN don't ignore overflow for tolerant == 0
1121		 */
1122		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1123			mce_ring_add(m.addr >> PAGE_SHIFT);
1124
1125		mce_log(&m);
1126
1127		if (severity > worst) {
1128			*final = m;
1129			worst = severity;
1130		}
1131	}
1132
1133	/* mce_clear_state will clear *final, save locally for use later */
1134	m = *final;
1135
1136	if (!no_way_out)
1137		mce_clear_state(toclear);
1138
1139	/*
1140	 * Do most of the synchronization with other CPUs.
1141	 * When there's any problem use only local no_way_out state.
1142	 */
1143	if (mce_end(order) < 0)
1144		no_way_out = worst >= MCE_PANIC_SEVERITY;
1145
1146	/*
1147	 * At insane "tolerant" levels we take no action. Otherwise
1148	 * we only die if we have no other choice. For less serious
1149	 * issues we try to recover, or limit damage to the current
1150	 * process.
1151	 */
1152	if (tolerant < 3) {
1153		if (no_way_out)
1154			mce_panic("Fatal machine check on current CPU", &m, msg);
1155		if (worst == MCE_AR_SEVERITY) {
1156			/* schedule action before return to userland */
1157			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1158			set_thread_flag(TIF_MCE_NOTIFY);
1159		} else if (kill_it) {
1160			force_sig(SIGBUS, current);
1161		}
1162	}
1163
1164	if (worst > 0)
1165		mce_report_event(regs);
1166	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1167out:
1168	atomic_dec(&mce_entry);
1169	sync_core();
1170}
1171EXPORT_SYMBOL_GPL(do_machine_check);
1172
1173#ifndef CONFIG_MEMORY_FAILURE
1174int memory_failure(unsigned long pfn, int vector, int flags)
1175{
1176	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1177	BUG_ON(flags & MF_ACTION_REQUIRED);
1178	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1179	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1180	       pfn);
1181
1182	return 0;
1183}
1184#endif
1185
1186/*
1187 * Called in the process context that was interrupted by the MCE and marked with
1188 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
1189 * This code is allowed to sleep.
1190 * Attempt possible recovery such as calling the high level VM handler to
1191 * process any corrupted pages, and kill/signal current process if required.
1192 * Action required errors are handled here.
1193 */
1194void mce_notify_process(void)
1195{
1196	unsigned long pfn;
1197	struct mce_info *mi = mce_find_info();
1198	int flags = MF_ACTION_REQUIRED;
1199
1200	if (!mi)
1201		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1202	pfn = mi->paddr >> PAGE_SHIFT;
1203
1204	clear_thread_flag(TIF_MCE_NOTIFY);
1205
1206	pr_err("Uncorrected hardware memory error in user-access at %llx",
1207		 mi->paddr);
1208	/*
1209	 * We must call memory_failure() here even if the current process is
1210	 * doomed. We still need to mark the page as poisoned and alert any
1211	 * other users of the page.
1212	 */
1213	if (!mi->restartable)
1214		flags |= MF_MUST_KILL;
1215	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1216		pr_err("Memory error not recovered");
1217		force_sig(SIGBUS, current);
1218	}
1219	mce_clear_info(mi);
1220}
1221
1222/*
1223 * Action optional processing happens here (picking up
1224 * from the list of faulting pages that do_machine_check()
1225 * placed into the "ring").
1226 */
1227static void mce_process_work(struct work_struct *dummy)
1228{
1229	unsigned long pfn;
1230
1231	while (mce_ring_get(&pfn))
1232		memory_failure(pfn, MCE_VECTOR, 0);
1233}
1234
1235#ifdef CONFIG_X86_MCE_INTEL
1236/***
1237 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1238 * @cpu: The CPU on which the event occurred.
1239 * @status: Event status information
1240 *
1241 * This function should be called by the thermal interrupt after the
1242 * event has been processed and the decision was made to log the event
1243 * further.
1244 *
1245 * The status parameter will be saved to the 'status' field of 'struct mce'
1246 * and historically has been the register value of the
1247 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1248 */
1249void mce_log_therm_throt_event(__u64 status)
1250{
1251	struct mce m;
1252
1253	mce_setup(&m);
1254	m.bank = MCE_THERMAL_BANK;
1255	m.status = status;
1256	mce_log(&m);
1257}
1258#endif /* CONFIG_X86_MCE_INTEL */
1259
1260/*
1261 * Periodic polling timer for "silent" machine check errors.  If the
1262 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1263 * errors, poll 2x slower (up to check_interval seconds).
1264 */
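/*
 * Illustrative numbers, assuming HZ=1000: mce_timer_fn() halves the
 * interval after a poll that logged an event, down to HZ/100 = 10
 * jiffies (10ms), and doubles it after a quiet poll, up to about
 * check_interval * HZ = 300000 jiffies (5 minutes).
 */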
1265static unsigned long check_interval = 5 * 60; /* 5 minutes */
1266
1267static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1268static DEFINE_PER_CPU(struct timer_list, mce_timer);
1269
1270static unsigned long mce_adjust_timer_default(unsigned long interval)
1271{
1272	return interval;
1273}
1274
1275static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1276	mce_adjust_timer_default;
1277
1278static void mce_timer_fn(unsigned long data)
1279{
1280	struct timer_list *t = &__get_cpu_var(mce_timer);
1281	unsigned long iv;
1282
1283	WARN_ON(smp_processor_id() != data);
1284
1285	if (mce_available(__this_cpu_ptr(&cpu_info))) {
1286		machine_check_poll(MCP_TIMESTAMP,
1287				&__get_cpu_var(mce_poll_banks));
1288		mce_intel_cmci_poll();
1289	}
1290
1291	/*
1292	 * Alert userspace if needed.  If we logged an MCE, reduce the
1293	 * polling interval, otherwise increase the polling interval.
1294	 */
1295	iv = __this_cpu_read(mce_next_interval);
1296	if (mce_notify_irq()) {
1297		iv = max(iv / 2, (unsigned long) HZ/100);
1298	} else {
1299		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1300		iv = mce_adjust_timer(iv);
1301	}
1302	__this_cpu_write(mce_next_interval, iv);
1303	/* Might have become 0 after CMCI storm subsided */
1304	if (iv) {
1305		t->expires = jiffies + iv;
1306		add_timer_on(t, smp_processor_id());
1307	}
1308}
1309
1310/*
1311 * Ensure that the timer is firing in @interval from now.
1312 */
1313void mce_timer_kick(unsigned long interval)
1314{
1315	struct timer_list *t = &__get_cpu_var(mce_timer);
1316	unsigned long when = jiffies + interval;
1317	unsigned long iv = __this_cpu_read(mce_next_interval);
1318
1319	if (timer_pending(t)) {
1320		if (time_before(when, t->expires))
1321			mod_timer_pinned(t, when);
1322	} else {
1323		t->expires = round_jiffies(when);
1324		add_timer_on(t, smp_processor_id());
1325	}
1326	if (interval < iv)
1327		__this_cpu_write(mce_next_interval, interval);
1328}
1329
1330/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1331static void mce_timer_delete_all(void)
1332{
1333	int cpu;
1334
1335	for_each_online_cpu(cpu)
1336		del_timer_sync(&per_cpu(mce_timer, cpu));
1337}
1338
1339static void mce_do_trigger(struct work_struct *work)
1340{
1341	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1342}
1343
1344static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1345
1346/*
1347 * Notify the user(s) about new machine check events.
1348 * Can be called from interrupt context, but not from machine check/NMI
1349 * context.
1350 */
1351int mce_notify_irq(void)
1352{
1353	/* Not more than two messages every minute */
1354	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1355
1356	if (test_and_clear_bit(0, &mce_need_notify)) {
1357		/* wake processes polling /dev/mcelog */
1358		wake_up_interruptible(&mce_chrdev_wait);
1359
1360		/*
1361		 * There is no risk of missing notifications because
1362		 * work_pending is always cleared before the function is
1363		 * executed.
1364		 */
1365		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1366			schedule_work(&mce_trigger_work);
1367
1368		if (__ratelimit(&ratelimit))
1369			pr_info(HW_ERR "Machine check events logged\n");
1370
1371		return 1;
1372	}
1373	return 0;
1374}
1375EXPORT_SYMBOL_GPL(mce_notify_irq);
1376
1377static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1378{
1379	int i;
1380
1381	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1382	if (!mce_banks)
1383		return -ENOMEM;
1384	for (i = 0; i < banks; i++) {
1385		struct mce_bank *b = &mce_banks[i];
1386
1387		b->ctl = -1ULL;
1388		b->init = 1;
1389	}
1390	return 0;
1391}
1392
1393/*
1394 * Initialize Machine Checks for a CPU.
1395 */
1396static int __cpuinit __mcheck_cpu_cap_init(void)
1397{
1398	unsigned b;
1399	u64 cap;
1400
1401	rdmsrl(MSR_IA32_MCG_CAP, cap);
1402
1403	b = cap & MCG_BANKCNT_MASK;
1404	if (!banks)
1405		pr_info("CPU supports %d MCE banks\n", b);
1406
1407	if (b > MAX_NR_BANKS) {
1408		pr_warn("Using only %u machine check banks out of %u\n",
1409			MAX_NR_BANKS, b);
1410		b = MAX_NR_BANKS;
1411	}
1412
1413	/* Don't support asymmetric configurations today */
1414	WARN_ON(banks != 0 && b != banks);
1415	banks = b;
1416	if (!mce_banks) {
1417		int err = __mcheck_cpu_mce_banks_init();
1418
1419		if (err)
1420			return err;
1421	}
1422
1423	/* Use accurate RIP reporting if available. */
1424	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1425		rip_msr = MSR_IA32_MCG_EIP;
1426
1427	if (cap & MCG_SER_P)
1428		mce_ser = 1;
1429
1430	return 0;
1431}
1432
1433static void __mcheck_cpu_init_generic(void)
1434{
1435	mce_banks_t all_banks;
1436	u64 cap;
1437	int i;
1438
1439	/*
1440	 * Log the machine checks left over from the previous reset.
1441	 */
1442	bitmap_fill(all_banks, MAX_NR_BANKS);
1443	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1444
1445	set_in_cr4(X86_CR4_MCE);
1446
1447	rdmsrl(MSR_IA32_MCG_CAP, cap);
1448	if (cap & MCG_CTL_P)
1449		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1450
1451	for (i = 0; i < banks; i++) {
1452		struct mce_bank *b = &mce_banks[i];
1453
1454		if (!b->init)
1455			continue;
1456		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1457		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1458	}
1459}
1460
1461/*
1462 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1463 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1464 * Vol 3B Table 15-20). But this confuses both the code that determines
1465 * whether the machine check occurred in kernel or user mode, and also
1466 * the severity assessment code. Pretend that EIPV was set, and take the
1467 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1468 */
1469static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1470{
1471	if (bank != 0)
1472		return;
1473	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1474		return;
1475	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1476		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1477			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1478			  MCACOD)) !=
1479			 (MCI_STATUS_UC|MCI_STATUS_EN|
1480			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1481			  MCI_STATUS_AR|MCACOD_INSTR))
1482		return;
1483
1484	m->mcgstatus |= MCG_STATUS_EIPV;
1485	m->ip = regs->ip;
1486	m->cs = regs->cs;
1487}
1488
1489/* Add per CPU specific workarounds here */
1490static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1491{
1492	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1493		pr_info("unknown CPU type - not enabling MCE support\n");
1494		return -EOPNOTSUPP;
1495	}
1496
1497	/* This should be disabled by the BIOS, but isn't always */
1498	if (c->x86_vendor == X86_VENDOR_AMD) {
1499		if (c->x86 == 15 && banks > 4) {
1500			/*
1501			 * disable GART TBL walk error reporting, which
1502			 * trips off incorrectly with the IOMMU & 3ware
1503			 * & Cerberus:
1504			 */
1505			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1506		}
1507		if (c->x86 <= 17 && mce_bootlog < 0) {
1508			/*
1509			 * Lots of broken BIOSes around that don't clear them
1510			 * by default and leave crap in there. Don't log:
1511			 */
1512			mce_bootlog = 0;
1513		}
1514		/*
1515		 * Various K7s with broken bank 0 around. Always disable
1516		 * by default.
1517		 */
1518		 if (c->x86 == 6 && banks > 0)
1519			mce_banks[0].ctl = 0;
1520
1521		 /*
1522		  * Turn off MC4_MISC thresholding banks on those models since
1523		  * they're not supported there.
1524		  */
1525		 if (c->x86 == 0x15 &&
1526		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1527			 int i;
1528			 u64 val, hwcr;
1529			 bool need_toggle;
1530			 u32 msrs[] = {
1531				0x00000413, /* MC4_MISC0 */
1532				0xc0000408, /* MC4_MISC1 */
1533			 };
1534
1535			 rdmsrl(MSR_K7_HWCR, hwcr);
1536
1537			 /* McStatusWrEn has to be set */
1538			 need_toggle = !(hwcr & BIT(18));
1539
1540			 if (need_toggle)
1541				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1542
1543			 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1544				 rdmsrl(msrs[i], val);
1545
1546				 /* CntP bit set? */
1547				 if (val & BIT_64(62)) {
1548					val &= ~BIT_64(62);
1549					wrmsrl(msrs[i], val);
1550				 }
1551			 }
1552
1553			 /* restore old settings */
1554			 if (need_toggle)
1555				 wrmsrl(MSR_K7_HWCR, hwcr);
1556		 }
1557	}
1558
1559	if (c->x86_vendor == X86_VENDOR_INTEL) {
1560		/*
1561		 * SDM documents that on family 6 bank 0 should not be written
1562		 * because it aliases to another special BIOS controlled
1563		 * register.
1564		 * But it's not aliased anymore on model 0x1a+.
1565		 * Don't ignore bank 0 completely because there could be a
1566		 * valid event later, merely don't write CTL0.
1567		 */
1568
1569		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1570			mce_banks[0].init = 0;
1571
1572		/*
1573		 * All newer Intel systems support MCE broadcasting. Enable
1574		 * synchronization with a one second timeout.
1575		 */
1576		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1577			monarch_timeout < 0)
1578			monarch_timeout = USEC_PER_SEC;
1579
1580		/*
1581		 * There are also broken BIOSes on some Pentium M and
1582		 * earlier systems:
1583		 */
1584		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1585			mce_bootlog = 0;
1586
1587		if (c->x86 == 6 && c->x86_model == 45)
1588			quirk_no_way_out = quirk_sandybridge_ifu;
1589	}
1590	if (monarch_timeout < 0)
1591		monarch_timeout = 0;
1592	if (mce_bootlog != 0)
1593		mce_panic_timeout = 30;
1594
1595	return 0;
1596}
1597
1598static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1599{
1600	if (c->x86 != 5)
1601		return 0;
1602
1603	switch (c->x86_vendor) {
1604	case X86_VENDOR_INTEL:
1605		intel_p5_mcheck_init(c);
1606		return 1;
1607		break;
1608	case X86_VENDOR_CENTAUR:
1609		winchip_mcheck_init(c);
1610		return 1;
1611		break;
1612	}
1613
1614	return 0;
1615}
1616
1617static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1618{
1619	switch (c->x86_vendor) {
1620	case X86_VENDOR_INTEL:
1621		mce_intel_feature_init(c);
1622		mce_adjust_timer = mce_intel_adjust_timer;
1623		break;
1624	case X86_VENDOR_AMD:
1625		mce_amd_feature_init(c);
1626		break;
1627	default:
1628		break;
1629	}
1630}
1631
1632static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1633{
1634	unsigned long iv = mce_adjust_timer(check_interval * HZ);
1635
1636	__this_cpu_write(mce_next_interval, iv);
1637
1638	if (mce_ignore_ce || !iv)
1639		return;
1640
1641	t->expires = round_jiffies(jiffies + iv);
1642	add_timer_on(t, smp_processor_id());
1643}
1644
1645static void __mcheck_cpu_init_timer(void)
1646{
1647	struct timer_list *t = &__get_cpu_var(mce_timer);
1648	unsigned int cpu = smp_processor_id();
1649
1650	setup_timer(t, mce_timer_fn, cpu);
1651	mce_start_timer(cpu, t);
1652}
1653
1654/* Handle unconfigured int18 (should never happen) */
1655static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1656{
1657	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1658	       smp_processor_id());
1659}
1660
1661/* Call the installed machine check handler for this CPU setup. */
1662void (*machine_check_vector)(struct pt_regs *, long error_code) =
1663						unexpected_machine_check;
1664
1665/*
1666 * Called for each booted CPU to set up machine checks.
1667 * Must be called with preempt off:
1668 */
1669void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1670{
1671	if (mce_disabled)
1672		return;
1673
1674	if (__mcheck_cpu_ancient_init(c))
1675		return;
1676
1677	if (!mce_available(c))
1678		return;
1679
1680	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1681		mce_disabled = 1;
1682		return;
1683	}
1684
1685	machine_check_vector = do_machine_check;
1686
1687	__mcheck_cpu_init_generic();
1688	__mcheck_cpu_init_vendor(c);
1689	__mcheck_cpu_init_timer();
1690	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1691	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1692}
1693
1694/*
1695 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1696 */
1697
1698static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1699static int mce_chrdev_open_count;	/* #times opened */
1700static int mce_chrdev_open_exclu;	/* already open exclusive? */
1701
1702static int mce_chrdev_open(struct inode *inode, struct file *file)
1703{
1704	spin_lock(&mce_chrdev_state_lock);
1705
1706	if (mce_chrdev_open_exclu ||
1707	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1708		spin_unlock(&mce_chrdev_state_lock);
1709
1710		return -EBUSY;
1711	}
1712
1713	if (file->f_flags & O_EXCL)
1714		mce_chrdev_open_exclu = 1;
1715	mce_chrdev_open_count++;
1716
1717	spin_unlock(&mce_chrdev_state_lock);
1718
1719	return nonseekable_open(inode, file);
1720}
1721
1722static int mce_chrdev_release(struct inode *inode, struct file *file)
1723{
1724	spin_lock(&mce_chrdev_state_lock);
1725
1726	mce_chrdev_open_count--;
1727	mce_chrdev_open_exclu = 0;
1728
1729	spin_unlock(&mce_chrdev_state_lock);
1730
1731	return 0;
1732}
1733
1734static void collect_tscs(void *data)
1735{
1736	unsigned long *cpu_tsc = (unsigned long *)data;
1737
1738	rdtscll(cpu_tsc[smp_processor_id()]);
1739}
1740
1741static int mce_apei_read_done;
1742
1743/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1744static int __mce_read_apei(char __user **ubuf, size_t usize)
1745{
1746	int rc;
1747	u64 record_id;
1748	struct mce m;
1749
1750	if (usize < sizeof(struct mce))
1751		return -EINVAL;
1752
1753	rc = apei_read_mce(&m, &record_id);
1754	/* Error or no more MCE record */
1755	if (rc <= 0) {
1756		mce_apei_read_done = 1;
1757		/*
1758		 * When ERST is disabled, mce_chrdev_read() should return
1759		 * "no record" instead of "no device."
1760		 */
1761		if (rc == -ENODEV)
1762			return 0;
1763		return rc;
1764	}
1765	rc = -EFAULT;
1766	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1767		return rc;
1768	/*
1769	 * In fact, we should have cleared the record after it has
1770	 * been flushed to disk or sent over the network by
1771	 * /sbin/mcelog, but we have no interface to support that now,
1772	 * so just clear it to avoid duplication.
1773	 */
1774	rc = apei_clear_mce(record_id);
1775	if (rc) {
1776		mce_apei_read_done = 1;
1777		return rc;
1778	}
1779	*ubuf += sizeof(struct mce);
1780
1781	return 0;
1782}
1783
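/*
 * mce_chrdev_read() only supports reading the whole log at once (usize
 * must be at least MCE_LOG_LEN * sizeof(struct mce)). Records saved via
 * APEI/ERST from a previous boot are returned first, then the in-memory
 * mcelog entries, which are cleared and mcelog.next reset as they are
 * consumed.
 */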
1784static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1785				size_t usize, loff_t *off)
1786{
1787	char __user *buf = ubuf;
1788	unsigned long *cpu_tsc;
1789	unsigned prev, next;
1790	int i, err;
1791
1792	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1793	if (!cpu_tsc)
1794		return -ENOMEM;
1795
1796	mutex_lock(&mce_chrdev_read_mutex);
1797
1798	if (!mce_apei_read_done) {
1799		err = __mce_read_apei(&buf, usize);
1800		if (err || buf != ubuf)
1801			goto out;
1802	}
1803
1804	next = rcu_dereference_check_mce(mcelog.next);
1805
1806	/* Only supports full reads right now */
1807	err = -EINVAL;
1808	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1809		goto out;
1810
1811	err = 0;
1812	prev = 0;
1813	do {
1814		for (i = prev; i < next; i++) {
1815			unsigned long start = jiffies;
1816			struct mce *m = &mcelog.entry[i];
1817
1818			while (!m->finished) {
1819				if (time_after_eq(jiffies, start + 2)) {
1820					memset(m, 0, sizeof(*m));
1821					goto timeout;
1822				}
1823				cpu_relax();
1824			}
1825			smp_rmb();
1826			err |= copy_to_user(buf, m, sizeof(*m));
1827			buf += sizeof(*m);
1828timeout:
1829			;
1830		}
1831
1832		memset(mcelog.entry + prev, 0,
1833		       (next - prev) * sizeof(struct mce));
1834		prev = next;
1835		next = cmpxchg(&mcelog.next, prev, 0);
1836	} while (next != prev);
1837
1838	synchronize_sched();
1839
1840	/*
1841	 * Collect entries that were still getting written before the
1842	 * synchronize.
1843	 */
1844	on_each_cpu(collect_tscs, cpu_tsc, 1);
1845
1846	for (i = next; i < MCE_LOG_LEN; i++) {
1847		struct mce *m = &mcelog.entry[i];
1848
1849		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1850			err |= copy_to_user(buf, m, sizeof(*m));
1851			smp_rmb();
1852			buf += sizeof(*m);
1853			memset(m, 0, sizeof(*m));
1854		}
1855	}
1856
1857	if (err)
1858		err = -EFAULT;
1859
1860out:
1861	mutex_unlock(&mce_chrdev_read_mutex);
1862	kfree(cpu_tsc);
1863
1864	return err ? err : buf - ubuf;
1865}
1866
1867static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1868{
1869	poll_wait(file, &mce_chrdev_wait, wait);
1870	if (rcu_access_index(mcelog.next))
1871		return POLLIN | POLLRDNORM;
1872	if (!mce_apei_read_done && apei_check_mce())
1873		return POLLIN | POLLRDNORM;
1874	return 0;
1875}
1876
1877static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1878				unsigned long arg)
1879{
1880	int __user *p = (int __user *)arg;
1881
1882	if (!capable(CAP_SYS_ADMIN))
1883		return -EPERM;
1884
1885	switch (cmd) {
1886	case MCE_GET_RECORD_LEN:
1887		return put_user(sizeof(struct mce), p);
1888	case MCE_GET_LOG_LEN:
1889		return put_user(MCE_LOG_LEN, p);
1890	case MCE_GETCLEAR_FLAGS: {
1891		unsigned flags;
1892
1893		do {
1894			flags = mcelog.flags;
1895		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1896
1897		return put_user(flags, p);
1898	}
1899	default:
1900		return -ENOTTY;
1901	}
1902}
1903
1904static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1905			    size_t usize, loff_t *off);
1906
1907void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1908			     const char __user *ubuf,
1909			     size_t usize, loff_t *off))
1910{
1911	mce_write = fn;
1912}
1913EXPORT_SYMBOL_GPL(register_mce_write_callback);
1914
1915ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1916			 size_t usize, loff_t *off)
1917{
1918	if (mce_write)
1919		return mce_write(filp, ubuf, usize, off);
1920	else
1921		return -EINVAL;
1922}
1923
1924static const struct file_operations mce_chrdev_ops = {
1925	.open			= mce_chrdev_open,
1926	.release		= mce_chrdev_release,
1927	.read			= mce_chrdev_read,
1928	.write			= mce_chrdev_write,
1929	.poll			= mce_chrdev_poll,
1930	.unlocked_ioctl		= mce_chrdev_ioctl,
1931	.llseek			= no_llseek,
1932};
1933
1934static struct miscdevice mce_chrdev_device = {
1935	MISC_MCELOG_MINOR,
1936	"mcelog",
1937	&mce_chrdev_ops,
1938};
1939
1940/*
1941 * mce=off Disables machine check
1942 * mce=no_cmci Disables CMCI
1943 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1944 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1945 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1946 *	monarchtimeout is how long to wait for other CPUs on machine
1947 *	check, or 0 to not wait
1948 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1949 * mce=nobootlog Don't log MCEs from before booting.
1950 * mce=bios_cmci_threshold Don't program the CMCI threshold
1951 */
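/*
 * Example (illustrative): booting with "mce=2,500" sets tolerant=2 and
 * monarch_timeout=500 (microseconds); "mce=nobootlog" keeps machine
 * checks enabled but suppresses logging of errors left over from before
 * boot.
 */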
1952static int __init mcheck_enable(char *str)
1953{
1954	if (*str == 0) {
1955		enable_p5_mce();
1956		return 1;
1957	}
1958	if (*str == '=')
1959		str++;
1960	if (!strcmp(str, "off"))
1961		mce_disabled = 1;
1962	else if (!strcmp(str, "no_cmci"))
1963		mce_cmci_disabled = 1;
1964	else if (!strcmp(str, "dont_log_ce"))
1965		mce_dont_log_ce = 1;
1966	else if (!strcmp(str, "ignore_ce"))
1967		mce_ignore_ce = 1;
1968	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1969		mce_bootlog = (str[0] == 'b');
1970	else if (!strcmp(str, "bios_cmci_threshold"))
1971		mce_bios_cmci_threshold = 1;
1972	else if (isdigit(str[0])) {
1973		get_option(&str, &tolerant);
1974		if (*str == ',') {
1975			++str;
1976			get_option(&str, &monarch_timeout);
1977		}
1978	} else {
1979		pr_info("mce argument %s ignored. Please use /sys\n", str);
1980		return 0;
1981	}
1982	return 1;
1983}
1984__setup("mce", mcheck_enable);
1985
1986int __init mcheck_init(void)
1987{
1988	mcheck_intel_therm_init();
1989
1990	return 0;
1991}
1992
1993/*
1994 * mce_syscore: PM support
1995 */
1996
1997/*
1998 * Disable machine checks on suspend and shutdown. We can't really handle
1999 * them later.
2000 */
2001static int mce_disable_error_reporting(void)
2002{
2003	int i;
2004
2005	for (i = 0; i < banks; i++) {
2006		struct mce_bank *b = &mce_banks[i];
2007
2008		if (b->init)
2009			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2010	}
2011	return 0;
2012}
2013
2014static int mce_syscore_suspend(void)
2015{
2016	return mce_disable_error_reporting();
2017}
2018
2019static void mce_syscore_shutdown(void)
2020{
2021	mce_disable_error_reporting();
2022}
2023
2024/*
2025 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2026 * Only one CPU is active at this time, the others get re-added later using
2027 * CPU hotplug:
2028 */
2029static void mce_syscore_resume(void)
2030{
2031	__mcheck_cpu_init_generic();
2032	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
2033}
2034
2035static struct syscore_ops mce_syscore_ops = {
2036	.suspend	= mce_syscore_suspend,
2037	.shutdown	= mce_syscore_shutdown,
2038	.resume		= mce_syscore_resume,
2039};
2040
2041/*
2042 * mce_device: Sysfs support
2043 */
2044
2045static void mce_cpu_restart(void *data)
2046{
2047	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2048		return;
2049	__mcheck_cpu_init_generic();
2050	__mcheck_cpu_init_timer();
2051}
2052
2053/* Reinit MCEs after user configuration changes */
2054static void mce_restart(void)
2055{
2056	mce_timer_delete_all();
2057	on_each_cpu(mce_cpu_restart, NULL, 1);
2058}
2059
2060/* Toggle features for corrected errors */
2061static void mce_disable_cmci(void *data)
2062{
2063	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2064		return;
2065	cmci_clear();
2066}
2067
2068static void mce_enable_ce(void *all)
2069{
2070	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2071		return;
2072	cmci_reenable();
2073	cmci_recheck();
2074	if (all)
2075		__mcheck_cpu_init_timer();
2076}
2077
2078static struct bus_type mce_subsys = {
2079	.name		= "machinecheck",
2080	.dev_name	= "machinecheck",
2081};
2082
2083DEFINE_PER_CPU(struct device *, mce_device);
2084
2085__cpuinitdata
2086void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2087
2088static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2089{
2090	return container_of(attr, struct mce_bank, attr);
2091}
2092
2093static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2094			 char *buf)
2095{
2096	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2097}
2098
2099static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2100			const char *buf, size_t size)
2101{
2102	u64 new;
2103
2104	if (strict_strtoull(buf, 0, &new) < 0)
2105		return -EINVAL;
2106
2107	attr_to_bank(attr)->ctl = new;
2108	mce_restart();
2109
2110	return size;
2111}
2112
2113static ssize_t
2114show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2115{
2116	strcpy(buf, mce_helper);
2117	strcat(buf, "\n");
2118	return strlen(mce_helper) + 1;
2119}
2120
2121static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2122				const char *buf, size_t siz)
2123{
2124	char *p;
2125
2126	strncpy(mce_helper, buf, sizeof(mce_helper));
2127	mce_helper[sizeof(mce_helper)-1] = 0;
2128	p = strchr(mce_helper, '\n');
2129
2130	if (p)
2131		*p = 0;
2132
2133	return strlen(mce_helper) + !!p;
2134}
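
/*
 * The "trigger" attribute above holds the path of the user mode helper
 * (mce_helper) that is run when a machine check event is reported; writing
 * a path such as a hypothetical "/usr/sbin/mce-notify" registers it, and
 * writing an empty string clears it again.
 */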
2135
2136static ssize_t set_ignore_ce(struct device *s,
2137			     struct device_attribute *attr,
2138			     const char *buf, size_t size)
2139{
2140	u64 new;
2141
2142	if (strict_strtoull(buf, 0, &new) < 0)
2143		return -EINVAL;
2144
2145	if (mce_ignore_ce ^ !!new) {
2146		if (new) {
2147			/* disable ce features */
2148			mce_timer_delete_all();
2149			on_each_cpu(mce_disable_cmci, NULL, 1);
2150			mce_ignore_ce = 1;
2151		} else {
2152			/* enable ce features */
2153			mce_ignore_ce = 0;
2154			on_each_cpu(mce_enable_ce, (void *)1, 1);
2155		}
2156	}
2157	return size;
2158}
2159
2160static ssize_t set_cmci_disabled(struct device *s,
2161				 struct device_attribute *attr,
2162				 const char *buf, size_t size)
2163{
2164	u64 new;
2165
2166	if (strict_strtoull(buf, 0, &new) < 0)
2167		return -EINVAL;
2168
2169	if (mce_cmci_disabled ^ !!new) {
2170		if (new) {
2171			/* disable cmci */
2172			on_each_cpu(mce_disable_cmci, NULL, 1);
2173			mce_cmci_disabled = 1;
2174		} else {
2175			/* enable cmci */
2176			mce_cmci_disabled = 0;
2177			on_each_cpu(mce_enable_ce, NULL, 1);
2178		}
2179	}
2180	return size;
2181}
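
/*
 * Both toggles follow the same pattern: flipping ignore_ce tears down (or
 * re-creates) the polling timer and CMCI on every CPU, while cmci_disabled
 * only affects CMCI.  From user space this is an ordinary sysfs write,
 * e.g. (path illustrative, see the attribute list below):
 *	echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 */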
2182
2183static ssize_t store_int_with_restart(struct device *s,
2184				      struct device_attribute *attr,
2185				      const char *buf, size_t size)
2186{
2187	ssize_t ret = device_store_int(s, attr, buf, size);
2188	mce_restart();
2189	return ret;
2190}
2191
2192static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2193static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
2194static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
2195static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
2196
2197static struct dev_ext_attribute dev_attr_check_interval = {
2198	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2199	&check_interval
2200};
2201
2202static struct dev_ext_attribute dev_attr_ignore_ce = {
2203	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
2204	&mce_ignore_ce
2205};
2206
2207static struct dev_ext_attribute dev_attr_cmci_disabled = {
2208	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
2209	&mce_cmci_disabled
2210};
2211
2212static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
2213	__ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
2214	&mce_bios_cmci_threshold
2215};
2216
2217static struct device_attribute *mce_device_attrs[] = {
2218	&dev_attr_tolerant.attr,
2219	&dev_attr_check_interval.attr,
2220	&dev_attr_trigger,
2221	&dev_attr_monarch_timeout.attr,
2222	&dev_attr_dont_log_ce.attr,
2223	&dev_attr_ignore_ce.attr,
2224	&dev_attr_cmci_disabled.attr,
2225	&dev_attr_bios_cmci_threshold.attr,
2226	NULL
2227};
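
/*
 * On a typical build these attributes (plus the per-bank "bankN" control
 * files set up in mce_init_banks() below) appear per CPU under
 * /sys/devices/system/machinecheck/machinecheckN/, e.g.:
 *	cat /sys/devices/system/machinecheck/machinecheck0/check_interval
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank3
 * (paths and bank number illustrative).  Writes that need re-initialization
 * (check_interval, bankN) funnel through mce_restart().
 */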
2228
2229static cpumask_var_t mce_device_initialized;
2230
2231static void mce_device_release(struct device *dev)
2232{
2233	kfree(dev);
2234}
2235
2236/* Per cpu device init. All of the cpus still share the same ctrl bank: */
2237static __cpuinit int mce_device_create(unsigned int cpu)
2238{
2239	struct device *dev;
2240	int err;
2241	int i, j;
2242
2243	if (!mce_available(&boot_cpu_data))
2244		return -EIO;
2245
2246	dev = kzalloc(sizeof *dev, GFP_KERNEL);
2247	if (!dev)
2248		return -ENOMEM;
2249	dev->id  = cpu;
2250	dev->bus = &mce_subsys;
2251	dev->release = &mce_device_release;
2252
2253	err = device_register(dev);
2254	if (err) {
		/* device_register() took a reference; drop it on failure */
		put_device(dev);
2255		return err;
	}
2256
2257	for (i = 0; mce_device_attrs[i]; i++) {
2258		err = device_create_file(dev, mce_device_attrs[i]);
2259		if (err)
2260			goto error;
2261	}
2262	for (j = 0; j < banks; j++) {
2263		err = device_create_file(dev, &mce_banks[j].attr);
2264		if (err)
2265			goto error2;
2266	}
2267	cpumask_set_cpu(cpu, mce_device_initialized);
2268	per_cpu(mce_device, cpu) = dev;
2269
2270	return 0;
2271error2:
2272	while (--j >= 0)
2273		device_remove_file(dev, &mce_banks[j].attr);
2274error:
2275	while (--i >= 0)
2276		device_remove_file(dev, mce_device_attrs[i]);
2277
2278	device_unregister(dev);
2279
2280	return err;
2281}
2282
2283static __cpuinit void mce_device_remove(unsigned int cpu)
2284{
2285	struct device *dev = per_cpu(mce_device, cpu);
2286	int i;
2287
2288	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2289		return;
2290
2291	for (i = 0; mce_device_attrs[i]; i++)
2292		device_remove_file(dev, mce_device_attrs[i]);
2293
2294	for (i = 0; i < banks; i++)
2295		device_remove_file(dev, &mce_banks[i].attr);
2296
2297	device_unregister(dev);
2298	cpumask_clear_cpu(cpu, mce_device_initialized);
2299	per_cpu(mce_device, cpu) = NULL;
2300}
2301
2302/* Make sure there are no machine checks on offlined CPUs. */
2303static void __cpuinit mce_disable_cpu(void *h)
2304{
2305	unsigned long action = *(unsigned long *)h;
2306	int i;
2307
2308	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2309		return;
2310
2311	if (!(action & CPU_TASKS_FROZEN))
2312		cmci_clear();
2313	for (i = 0; i < banks; i++) {
2314		struct mce_bank *b = &mce_banks[i];
2315
2316		if (b->init)
2317			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2318	}
2319}
2320
2321static void __cpuinit mce_reenable_cpu(void *h)
2322{
2323	unsigned long action = *(unsigned long *)h;
2324	int i;
2325
2326	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2327		return;
2328
2329	if (!(action & CPU_TASKS_FROZEN))
2330		cmci_reenable();
2331	for (i = 0; i < banks; i++) {
2332		struct mce_bank *b = &mce_banks[i];
2333
2334		if (b->init)
2335			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2336	}
2337}
2338
2339/* Get notified when a cpu comes on/off. Be hotplug friendly. */
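/*
 * Roughly: CPU_ONLINE creates the per-CPU sysfs device (and lets the
 * threshold code hook in), CPU_DOWN_PREPARE clears the bank controls and
 * stops the polling timer, CPU_DOWN_FAILED undoes that, CPU_DEAD removes
 * the sysfs device again, and CPU_POST_DEAD lets the surviving CPUs
 * rediscover the CMCI banks the dead CPU owned.
 */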
2340static int __cpuinit
2341mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2342{
2343	unsigned int cpu = (unsigned long)hcpu;
2344	struct timer_list *t = &per_cpu(mce_timer, cpu);
2345
2346	switch (action & ~CPU_TASKS_FROZEN) {
2347	case CPU_ONLINE:
2348		mce_device_create(cpu);
2349		if (threshold_cpu_callback)
2350			threshold_cpu_callback(action, cpu);
2351		break;
2352	case CPU_DEAD:
2353		if (threshold_cpu_callback)
2354			threshold_cpu_callback(action, cpu);
2355		mce_device_remove(cpu);
2356		mce_intel_hcpu_update(cpu);
2357		break;
2358	case CPU_DOWN_PREPARE:
2359		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2360		del_timer_sync(t);
2361		break;
2362	case CPU_DOWN_FAILED:
2363		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2364		mce_start_timer(cpu, t);
2365		break;
2366	}
2367
2368	if (action == CPU_POST_DEAD) {
2369		/* intentionally ignoring frozen here */
2370		cmci_rediscover(cpu);
2371	}
2372
2373	return NOTIFY_OK;
2374}
2375
2376static struct notifier_block mce_cpu_notifier __cpuinitdata = {
2377	.notifier_call = mce_cpu_callback,
2378};
2379
2380static __init void mce_init_banks(void)
2381{
2382	int i;
2383
2384	for (i = 0; i < banks; i++) {
2385		struct mce_bank *b = &mce_banks[i];
2386		struct device_attribute *a = &b->attr;
2387
2388		sysfs_attr_init(&a->attr);
2389		a->attr.name	= b->attrname;
2390		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2391
2392		a->attr.mode	= 0644;
2393		a->show		= show_bank;
2394		a->store	= set_bank;
2395	}
2396}
2397
2398static __init int mcheck_init_device(void)
2399{
2400	int err;
2401	int i = 0;
2402
2403	if (!mce_available(&boot_cpu_data))
2404		return -EIO;
2405
2406	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL))
		return -ENOMEM;
2407
2408	mce_init_banks();
2409
2410	err = subsys_system_register(&mce_subsys, NULL);
2411	if (err)
2412		return err;
2413
2414	for_each_online_cpu(i) {
2415		err = mce_device_create(i);
2416		if (err)
2417			return err;
2418	}
2419
2420	register_syscore_ops(&mce_syscore_ops);
2421	register_hotcpu_notifier(&mce_cpu_notifier);
2422
2423	/* register character device /dev/mcelog */
2424	misc_register(&mce_chrdev_device);
2425
2426	return err;
2427}
2428device_initcall_sync(mcheck_init_device);
2429
2430/*
2431 * Old style boot options parsing. Only for compatibility.
2432 */
2433static int __init mcheck_disable(char *str)
2434{
2435	mce_disabled = 1;
2436	return 1;
2437}
2438__setup("nomce", mcheck_disable);
2439
2440#ifdef CONFIG_DEBUG_FS
2441struct dentry *mce_get_debugfs_dir(void)
2442{
2443	static struct dentry *dmce;
2444
2445	if (!dmce)
2446		dmce = debugfs_create_dir("mce", NULL);
2447
2448	return dmce;
2449}
2450
2451static void mce_reset(void)
2452{
2453	cpu_missing = 0;
2454	atomic_set(&mce_fake_paniced, 0);
2455	atomic_set(&mce_executing, 0);
2456	atomic_set(&mce_callin, 0);
2457	atomic_set(&global_nwo, 0);
2458}
2459
2460static int fake_panic_get(void *data, u64 *val)
2461{
2462	*val = fake_panic;
2463	return 0;
2464}
2465
2466static int fake_panic_set(void *data, u64 val)
2467{
2468	mce_reset();
2469	fake_panic = val;
2470	return 0;
2471}
2472
2473DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2474			fake_panic_set, "%llu\n");
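
/*
 * mcheck_debugfs_init() below exposes this as <debugfs>/mce/fake_panic
 * (usually /sys/kernel/debug/mce/fake_panic).  With fake_panic set,
 * mce_panic() is meant to only log instead of really panicking, and each
 * write resets the rendezvous counters via mce_reset() first so a test run
 * starts from a clean state.
 */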
2475
2476static int __init mcheck_debugfs_init(void)
2477{
2478	struct dentry *dmce, *ffake_panic;
2479
2480	dmce = mce_get_debugfs_dir();
2481	if (!dmce)
2482		return -ENOMEM;
2483	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2484					  &fake_panic_fops);
2485	if (!ffake_panic)
2486		return -ENOMEM;
2487
2488	return 0;
2489}
2490late_initcall(mcheck_debugfs_init);
2491#endif
2492