mce.c revision 4f75d8412792777a314ac5c1393a9ed43d695fd1
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/thread_info.h>
14#include <linux/capability.h>
15#include <linux/miscdevice.h>
16#include <linux/ratelimit.h>
17#include <linux/kallsyms.h>
18#include <linux/rcupdate.h>
19#include <linux/kobject.h>
20#include <linux/uaccess.h>
21#include <linux/kdebug.h>
22#include <linux/kernel.h>
23#include <linux/percpu.h>
24#include <linux/string.h>
25#include <linux/device.h>
26#include <linux/syscore_ops.h>
27#include <linux/delay.h>
28#include <linux/ctype.h>
29#include <linux/sched.h>
30#include <linux/sysfs.h>
31#include <linux/types.h>
32#include <linux/slab.h>
33#include <linux/init.h>
34#include <linux/kmod.h>
35#include <linux/poll.h>
36#include <linux/nmi.h>
37#include <linux/cpu.h>
38#include <linux/smp.h>
39#include <linux/fs.h>
40#include <linux/mm.h>
41#include <linux/debugfs.h>
42#include <linux/irq_work.h>
43#include <linux/export.h>
44
45#include <asm/processor.h>
46#include <asm/mce.h>
47#include <asm/msr.h>
48
49#include "mce-internal.h"
50
51static DEFINE_MUTEX(mce_chrdev_read_mutex);
52
53#define rcu_dereference_check_mce(p) \
54	rcu_dereference_index_check((p), \
55			      rcu_read_lock_sched_held() || \
56			      lockdep_is_held(&mce_chrdev_read_mutex))
57
58#define CREATE_TRACE_POINTS
59#include <trace/events/mce.h>
60
61#define SPINUNIT 100	/* 100ns */
62
63atomic_t mce_entry;
64
65DEFINE_PER_CPU(unsigned, mce_exception_count);
66
67struct mce_bank *mce_banks __read_mostly;
68
69struct mca_config mca_cfg __read_mostly = {
70	.bootlog  = -1,
71	/*
72	 * Tolerant levels:
73	 * 0: always panic on uncorrected errors, log corrected errors
74	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
75	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
76	 * 3: never panic or SIGBUS, log all errors (for testing only)
77	 */
78	.tolerant = 1,
79	.monarch_timeout = -1
80};
81
82/* User mode helper program triggered by machine check event */
83static unsigned long		mce_need_notify;
84static char			mce_helper[128];
85static char			*mce_helper_argv[2] = { mce_helper, NULL };
86
87static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
88
89static DEFINE_PER_CPU(struct mce, mces_seen);
90static int			cpu_missing;
91
92/*
93 * MCA banks polled by the periodic polling timer for corrected events.
94 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
95 */
96DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
97	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
98};
99
100/*
101 * MCA banks controlled through firmware first for corrected errors.
102 * This is a global list of banks for which we won't enable CMCI and we
103 * won't poll. Firmware controls these banks and is responsible for
104 * reporting corrected errors through GHES. Uncorrected/recoverable
105 * errors are still notified through a machine check.
106 */
107mce_banks_t mce_banks_ce_disabled;
108
109static DEFINE_PER_CPU(struct work_struct, mce_work);
110
111static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
112
113/*
114 * CPU/chipset specific EDAC code can register a notifier call here to print
115 * MCE errors in a human-readable form.
116 */
117ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
118
119/* Do initial initialization of a struct mce */
120void mce_setup(struct mce *m)
121{
122	memset(m, 0, sizeof(struct mce));
123	m->cpu = m->extcpu = smp_processor_id();
124	rdtscll(m->tsc);
125	/* We hope get_seconds stays lockless */
126	m->time = get_seconds();
127	m->cpuvendor = boot_cpu_data.x86_vendor;
128	m->cpuid = cpuid_eax(1);
129	m->socketid = cpu_data(m->extcpu).phys_proc_id;
130	m->apicid = cpu_data(m->extcpu).initial_apicid;
131	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
132}
133
134DEFINE_PER_CPU(struct mce, injectm);
135EXPORT_PER_CPU_SYMBOL_GPL(injectm);
136
137/*
138 * Lockless MCE logging infrastructure.
139 * This avoids deadlocks on printk locks without having to break locks. It
140 * also separates MCEs from kernel messages to avoid bogus bug reports.
141 */
142
143static struct mce_log mcelog = {
144	.signature	= MCE_LOG_SIGNATURE,
145	.len		= MCE_LOG_LEN,
146	.recordlen	= sizeof(struct mce),
147};
148
149void mce_log(struct mce *mce)
150{
151	unsigned next, entry;
152	int ret = 0;
153
154	/* Emit the trace record: */
155	trace_mce_record(mce);
156
157	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
158	if (ret == NOTIFY_STOP)
159		return;
160
161	mce->finished = 0;
162	wmb();
163	for (;;) {
164		entry = rcu_dereference_check_mce(mcelog.next);
165		for (;;) {
166
167			/*
168			 * When the buffer fills up discard new entries.
169			 * Assume that the earlier errors are the more
170			 * interesting ones:
171			 */
172			if (entry >= MCE_LOG_LEN) {
173				set_bit(MCE_OVERFLOW,
174					(unsigned long *)&mcelog.flags);
175				return;
176			}
177			/* Old left over entry. Skip: */
178			if (mcelog.entry[entry].finished) {
179				entry++;
180				continue;
181			}
182			break;
183		}
184		smp_rmb();
185		next = entry + 1;
186		if (cmpxchg(&mcelog.next, entry, next) == entry)
187			break;
188	}
189	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
190	wmb();
191	mcelog.entry[entry].finished = 1;
192	wmb();
193
194	mce->finished = 1;
195	set_bit(0, &mce_need_notify);
196}
197
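/*
 * Flush MCE records that were logged before any decoder was registered
 * (e.g. during early boot): feed each finished entry to the decoder chain,
 * then clear the buffer.
 */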
198static void drain_mcelog_buffer(void)
199{
200	unsigned int next, i, prev = 0;
201
202	next = ACCESS_ONCE(mcelog.next);
203
204	do {
205		struct mce *m;
206
207		/* drain what was logged during boot */
208		for (i = prev; i < next; i++) {
209			unsigned long start = jiffies;
210			unsigned retries = 1;
211
212			m = &mcelog.entry[i];
213
214			while (!m->finished) {
215				if (time_after_eq(jiffies, start + 2*retries))
216					retries++;
217
218				cpu_relax();
219
220				if (!m->finished && retries >= 4) {
221					pr_err("skipping error being logged currently!\n");
222					break;
223				}
224			}
225			smp_rmb();
226			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
227		}
228
229		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
230		prev = next;
231		next = cmpxchg(&mcelog.next, prev, 0);
232	} while (next != prev);
233}
234
235
236void mce_register_decode_chain(struct notifier_block *nb)
237{
238	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
239	drain_mcelog_buffer();
240}
241EXPORT_SYMBOL_GPL(mce_register_decode_chain);
242
243void mce_unregister_decode_chain(struct notifier_block *nb)
244{
245	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
246}
247EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
248
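/*
 * Dump a single MCE record to the console and run it through the decoder
 * chain; if nobody decodes it, point the user at 'mcelog --ascii'.
 */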
249static void print_mce(struct mce *m)
250{
251	int ret = 0;
252
253	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
254	       m->extcpu, m->mcgstatus, m->bank, m->status);
255
256	if (m->ip) {
257		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
258			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
259				m->cs, m->ip);
260
261		if (m->cs == __KERNEL_CS)
262			print_symbol("{%s}", m->ip);
263		pr_cont("\n");
264	}
265
266	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
267	if (m->addr)
268		pr_cont("ADDR %llx ", m->addr);
269	if (m->misc)
270		pr_cont("MISC %llx ", m->misc);
271
272	pr_cont("\n");
273	/*
274	 * Note this output is parsed by external tools and old fields
275	 * should not be changed.
276	 */
277	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
278		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
279		cpu_data(m->extcpu).microcode);
280
281	/*
282	 * Print out human-readable details about the MCE error,
283	 * (if the CPU has an implementation for that)
284	 */
285	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
286	if (ret == NOTIFY_STOP)
287		return;
288
289	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
290}
291
292#define PANIC_TIMEOUT 5 /* 5 seconds */
293
294static atomic_t mce_paniced;
295
296static int fake_panic;
297static atomic_t mce_fake_paniced;
298
299/* Panic in progress. Enable interrupts and wait for final IPI */
300static void wait_for_panic(void)
301{
302	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
303
304	preempt_disable();
305	local_irq_enable();
306	while (timeout-- > 0)
307		udelay(1);
308	if (panic_timeout == 0)
309		panic_timeout = mca_cfg.panic_timeout;
310	panic("Panicking machine check CPU died");
311}
312
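/*
 * Panic (or fake-panic for testing) after dumping all pending MCE records
 * to the console and, where possible, to APEI persistent storage.
 */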
313static void mce_panic(char *msg, struct mce *final, char *exp)
314{
315	int i, apei_err = 0;
316
317	if (!fake_panic) {
318		/*
319		 * Make sure only one CPU runs in machine check panic
320		 */
321		if (atomic_inc_return(&mce_paniced) > 1)
322			wait_for_panic();
323		barrier();
324
325		bust_spinlocks(1);
326		console_verbose();
327	} else {
328		/* Don't log too much for fake panic */
329		if (atomic_inc_return(&mce_fake_paniced) > 1)
330			return;
331	}
332	/* First print corrected ones that are still unlogged */
333	for (i = 0; i < MCE_LOG_LEN; i++) {
334		struct mce *m = &mcelog.entry[i];
335		if (!(m->status & MCI_STATUS_VAL))
336			continue;
337		if (!(m->status & MCI_STATUS_UC)) {
338			print_mce(m);
339			if (!apei_err)
340				apei_err = apei_write_mce(m);
341		}
342	}
343	/* Now print uncorrected but with the final one last */
344	for (i = 0; i < MCE_LOG_LEN; i++) {
345		struct mce *m = &mcelog.entry[i];
346		if (!(m->status & MCI_STATUS_VAL))
347			continue;
348		if (!(m->status & MCI_STATUS_UC))
349			continue;
350		if (!final || memcmp(m, final, sizeof(struct mce))) {
351			print_mce(m);
352			if (!apei_err)
353				apei_err = apei_write_mce(m);
354		}
355	}
356	if (final) {
357		print_mce(final);
358		if (!apei_err)
359			apei_err = apei_write_mce(final);
360	}
361	if (cpu_missing)
362		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
363	if (exp)
364		pr_emerg(HW_ERR "Machine check: %s\n", exp);
365	if (!fake_panic) {
366		if (panic_timeout == 0)
367			panic_timeout = mca_cfg.panic_timeout;
368		panic(msg);
369	} else
370		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
371}
372
373/* Support code for software error injection */
374
375static int msr_to_offset(u32 msr)
376{
377	unsigned bank = __this_cpu_read(injectm.bank);
378
379	if (msr == mca_cfg.rip_msr)
380		return offsetof(struct mce, ip);
381	if (msr == MSR_IA32_MCx_STATUS(bank))
382		return offsetof(struct mce, status);
383	if (msr == MSR_IA32_MCx_ADDR(bank))
384		return offsetof(struct mce, addr);
385	if (msr == MSR_IA32_MCx_MISC(bank))
386		return offsetof(struct mce, misc);
387	if (msr == MSR_IA32_MCG_STATUS)
388		return offsetof(struct mce, mcgstatus);
389	return -1;
390}
391
392/* MSR access wrappers used for error injection */
393static u64 mce_rdmsrl(u32 msr)
394{
395	u64 v;
396
397	if (__this_cpu_read(injectm.finished)) {
398		int offset = msr_to_offset(msr);
399
400		if (offset < 0)
401			return 0;
402		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
403	}
404
405	if (rdmsrl_safe(msr, &v)) {
406		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
407		/*
408		 * Return zero in case the access faulted. This should
409		 * not happen normally but can happen if the CPU does
410		 * something weird, or if the code is buggy.
411		 */
412		v = 0;
413	}
414
415	return v;
416}
417
418static void mce_wrmsrl(u32 msr, u64 v)
419{
420	if (__this_cpu_read(injectm.finished)) {
421		int offset = msr_to_offset(msr);
422
423		if (offset >= 0)
424			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
425		return;
426	}
427	wrmsrl(msr, v);
428}
429
430/*
431 * Collect all global (w.r.t. this processor) status about this machine
432 * check into our "mce" struct so that we can use it later to assess
433 * the severity of the problem as we read per-bank specific details.
434 */
435static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
436{
437	mce_setup(m);
438
439	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
440	if (regs) {
441		/*
442		 * Get the address of the instruction at the time of
443		 * the machine check error.
444		 */
445		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
446			m->ip = regs->ip;
447			m->cs = regs->cs;
448
449			/*
450			 * When in VM86 mode make the cs look like ring 3
451			 * always. This is a lie, but it's better than passing
452			 * the additional vm86 bit around everywhere.
453			 */
454			if (v8086_mode(regs))
455				m->cs |= 3;
456		}
457		/* Use accurate RIP reporting if available. */
458		if (mca_cfg.rip_msr)
459			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
460	}
461}
462
463/*
464 * Simple lockless ring to communicate PFNs from the exception handler with the
465 * process context work function. This is vastly simplified because there's
466 * only a single reader and a single writer.
467 */
468#define MCE_RING_SIZE 16	/* we use one entry less */
469
470struct mce_ring {
471	unsigned short start;
472	unsigned short end;
473	unsigned long ring[MCE_RING_SIZE];
474};
475static DEFINE_PER_CPU(struct mce_ring, mce_ring);
476
477/* Runs with CPU affinity in workqueue */
478static int mce_ring_empty(void)
479{
480	struct mce_ring *r = &__get_cpu_var(mce_ring);
481
482	return r->start == r->end;
483}
484
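/* Fetch the next queued PFN; returns 1 if one was retrieved, 0 if empty. */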
485static int mce_ring_get(unsigned long *pfn)
486{
487	struct mce_ring *r;
488	int ret = 0;
489
490	*pfn = 0;
491	get_cpu();
492	r = &__get_cpu_var(mce_ring);
493	if (r->start == r->end)
494		goto out;
495	*pfn = r->ring[r->start];
496	r->start = (r->start + 1) % MCE_RING_SIZE;
497	ret = 1;
498out:
499	put_cpu();
500	return ret;
501}
502
503/* Always runs in MCE context with preempt off */
504static int mce_ring_add(unsigned long pfn)
505{
506	struct mce_ring *r = &__get_cpu_var(mce_ring);
507	unsigned next;
508
509	next = (r->end + 1) % MCE_RING_SIZE;
510	if (next == r->start)
511		return -1;
512	r->ring[r->end] = pfn;
513	wmb();
514	r->end = next;
515	return 0;
516}
517
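/* Machine check support is usable if MCE/MCA are present and not disabled. */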
518int mce_available(struct cpuinfo_x86 *c)
519{
520	if (mca_cfg.disabled)
521		return 0;
522	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
523}
524
525static void mce_schedule_work(void)
526{
527	if (!mce_ring_empty())
528		schedule_work(&__get_cpu_var(mce_work));
529}
530
531DEFINE_PER_CPU(struct irq_work, mce_irq_work);
532
533static void mce_irq_work_cb(struct irq_work *entry)
534{
535	mce_notify_irq();
536	mce_schedule_work();
537}
538
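/*
 * Notify about new MCE records: directly when the interrupted context allows
 * it (interrupts enabled or vm86), otherwise deferred through irq_work.
 */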
539static void mce_report_event(struct pt_regs *regs)
540{
541	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
542		mce_notify_irq();
543		/*
544		 * Triggering the work queue here is just an insurance
545		 * policy in case the syscall exit notify handler
546		 * doesn't run soon enough or ends up running on the
547		 * wrong CPU (can happen when audit sleeps)
548		 */
549		mce_schedule_work();
550		return;
551	}
552
553	irq_work_queue(&__get_cpu_var(mce_irq_work));
554}
555
556/*
557 * Read ADDR and MISC registers.
558 */
559static void mce_read_aux(struct mce *m, int i)
560{
561	if (m->status & MCI_STATUS_MISCV)
562		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
563	if (m->status & MCI_STATUS_ADDRV) {
564		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
565
566		/*
567		 * Mask the reported address by the reported granularity.
568		 */
569		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
570			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
571			m->addr >>= shift;
572			m->addr <<= shift;
573		}
574	}
575}
576
577DEFINE_PER_CPU(unsigned, mce_poll_count);
578
579/*
580 * Poll for corrected events or events that happened before reset.
581 * Those are just logged through /dev/mcelog.
582 *
583 * This is executed in standard interrupt context.
584 *
585 * Note: the spec recommends panicking for fatal unsignalled
586 * errors here. However this would be quite problematic --
587 * we would need to reimplement the Monarch handling and
588 * it would mess up the exclusion between the exception handler
589 * and the poll handler -- so we skip this for now.
590 * These cases should not happen anyway, or only when the CPU
591 * is already totally confused. In this case it's likely it will
592 * not fully execute the machine check handler either.
593 */
594void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
595{
596	struct mce m;
597	int i;
598
599	this_cpu_inc(mce_poll_count);
600
601	mce_gather_info(&m, NULL);
602
603	for (i = 0; i < mca_cfg.banks; i++) {
604		if (!mce_banks[i].ctl || !test_bit(i, *b))
605			continue;
606
607		m.misc = 0;
608		m.addr = 0;
609		m.bank = i;
610		m.tsc = 0;
611
612		barrier();
613		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
614		if (!(m.status & MCI_STATUS_VAL))
615			continue;
616
617		/*
618		 * Uncorrected or signalled events are handled by the exception
619		 * handler when it is enabled, so don't process those here.
620		 *
621		 * TBD do the same check for MCI_STATUS_EN here?
622		 */
623		if (!(flags & MCP_UC) &&
624		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
625			continue;
626
627		mce_read_aux(&m, i);
628
629		if (!(flags & MCP_TIMESTAMP))
630			m.tsc = 0;
631		/*
632		 * Don't get the IP here because it's unlikely to
633		 * have anything to do with the actual error location.
634		 */
635		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
636			mce_log(&m);
637
638		/*
639		 * Clear state for this bank.
640		 */
641		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
642	}
643
644	/*
645	 * Don't clear MCG_STATUS here because it's only defined for
646	 * exceptions.
647	 */
648
649	sync_core();
650}
651EXPORT_SYMBOL_GPL(machine_check_poll);
652
653/*
654 * Do a quick check if any of the events requires a panic.
655 * This decides if we keep the events around or clear them.
656 */
657static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
658			  struct pt_regs *regs)
659{
660	int i, ret = 0;
661
662	for (i = 0; i < mca_cfg.banks; i++) {
663		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
664		if (m->status & MCI_STATUS_VAL) {
665			__set_bit(i, validp);
666			if (quirk_no_way_out)
667				quirk_no_way_out(i, m, regs);
668		}
669		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
670			ret = 1;
671	}
672	return ret;
673}
674
675/*
676 * Variable to establish order between CPUs while scanning.
677 * Each CPU spins initially until mce_executing equals its number.
678 */
679static atomic_t mce_executing;
680
681/*
682 * Defines order of CPUs on entry. First CPU becomes Monarch.
683 */
684static atomic_t mce_callin;
685
686/*
687 * Check if a timeout waiting for other CPUs happened.
688 */
689static int mce_timed_out(u64 *t)
690{
691	/*
692	 * The others already did panic for some reason.
693	 * Bail out like in a timeout.
694	 * rmb() to tell the compiler that system_state
695	 * might have been modified by someone else.
696	 */
697	rmb();
698	if (atomic_read(&mce_paniced))
699		wait_for_panic();
700	if (!mca_cfg.monarch_timeout)
701		goto out;
702	if ((s64)*t < SPINUNIT) {
703		/* CHECKME: Make panic default for 1 too? */
704		if (mca_cfg.tolerant < 1)
705			mce_panic("Timeout synchronizing machine check over CPUs",
706				  NULL, NULL);
707		cpu_missing = 1;
708		return 1;
709	}
710	*t -= SPINUNIT;
711out:
712	touch_nmi_watchdog();
713	return 0;
714}
715
716/*
717 * The Monarch's reign.  The Monarch is the CPU who entered
718 * the machine check handler first. It waits for the others to
719 * raise the exception too and then grades them. If any
720 * error is fatal, it panics. Only then does it let the others continue.
721 *
722 * The other CPUs entering the MCE handler will be controlled by the
723 * Monarch. They are called Subjects.
724 *
725 * This way we prevent any potential data corruption in an unrecoverable case
726 * and also make sure that all CPUs' errors are examined.
727 *
728 * This also detects the case of a machine check event coming from outer
729 * space (not detected by any CPU). In this case some external agent wants
730 * us to shut down, so panic too.
731 *
732 * The other CPUs might still decide to panic if the handler happens
733 * in an unrecoverable place, but in this case the system is in a semi-stable
734 * state and won't corrupt anything by itself. It's ok to let the others
735 * continue for a bit first.
736 *
737 * All the spin loops have timeouts; when a timeout happens a CPU
738 * typically elects itself to be Monarch.
739 */
740static void mce_reign(void)
741{
742	int cpu;
743	struct mce *m = NULL;
744	int global_worst = 0;
745	char *msg = NULL;
746	char *nmsg = NULL;
747
748	/*
749	 * This CPU is the Monarch and the other CPUs have run
750	 * through their handlers.
751	 * Grade the severity of the errors of all the CPUs.
752	 */
753	for_each_possible_cpu(cpu) {
754		int severity = mce_severity(&per_cpu(mces_seen, cpu),
755					    mca_cfg.tolerant,
756					    &nmsg);
757		if (severity > global_worst) {
758			msg = nmsg;
759			global_worst = severity;
760			m = &per_cpu(mces_seen, cpu);
761		}
762	}
763
764	/*
765	 * Cannot recover? Panic here then.
766	 * This dumps all the mces in the log buffer and stops the
767	 * other CPUs.
768	 */
769	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
770		mce_panic("Fatal Machine check", m, msg);
771
772	/*
773 * For a UC error somewhere we let the CPU that detected it handle it.
774 * We must also let the others continue, otherwise the handling
775 * CPU could deadlock on a lock.
776	 */
777
778	/*
779	 * No machine check event found. Must be some external
780	 * source or one CPU is hung. Panic.
781	 */
782	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
783		mce_panic("Machine check from unknown source", NULL, NULL);
784
785	/*
786	 * Now clear all the mces_seen so that they don't reappear on
787	 * the next mce.
788	 */
789	for_each_possible_cpu(cpu)
790		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
791}
792
793static atomic_t global_nwo;
794
795/*
796 * Start of Monarch synchronization. This waits until all CPUs have
797 * entered the exception handler and then determines if any of them
798 * saw a fatal event that requires panic. Then it executes them
799 * in the entry order.
800 * TBD double check parallel CPU hotunplug
801 */
802static int mce_start(int *no_way_out)
803{
804	int order;
805	int cpus = num_online_cpus();
806	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
807
808	if (!timeout)
809		return -1;
810
811	atomic_add(*no_way_out, &global_nwo);
812	/*
813	 * global_nwo should be updated before mce_callin
814	 */
815	smp_wmb();
816	order = atomic_inc_return(&mce_callin);
817
818	/*
819	 * Wait for everyone.
820	 */
821	while (atomic_read(&mce_callin) != cpus) {
822		if (mce_timed_out(&timeout)) {
823			atomic_set(&global_nwo, 0);
824			return -1;
825		}
826		ndelay(SPINUNIT);
827	}
828
829	/*
830	 * mce_callin should be read before global_nwo
831	 */
832	smp_rmb();
833
834	if (order == 1) {
835		/*
836		 * Monarch: Starts executing now, the others wait.
837		 */
838		atomic_set(&mce_executing, 1);
839	} else {
840		/*
841		 * Subject: Now start the scanning loop one by one in
842		 * the original callin order.
843 * This way, when there are any shared banks, an error will be
844 * seen by only one CPU before being cleared, avoiding duplicates.
845		 */
846		while (atomic_read(&mce_executing) < order) {
847			if (mce_timed_out(&timeout)) {
848				atomic_set(&global_nwo, 0);
849				return -1;
850			}
851			ndelay(SPINUNIT);
852		}
853	}
854
855	/*
856	 * Cache the global no_way_out state.
857	 */
858	*no_way_out = atomic_read(&global_nwo);
859
860	return order;
861}
862
863/*
864 * Synchronize between CPUs after main scanning loop.
865 * This invokes the bulk of the Monarch processing.
866 */
867static int mce_end(int order)
868{
869	int ret = -1;
870	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
871
872	if (!timeout)
873		goto reset;
874	if (order < 0)
875		goto reset;
876
877	/*
878	 * Allow others to run.
879	 */
880	atomic_inc(&mce_executing);
881
882	if (order == 1) {
883		/* CHECKME: Can this race with a parallel hotplug? */
884		int cpus = num_online_cpus();
885
886		/*
887		 * Monarch: Wait for everyone to go through their scanning
888		 * loops.
889		 */
890		while (atomic_read(&mce_executing) <= cpus) {
891			if (mce_timed_out(&timeout))
892				goto reset;
893			ndelay(SPINUNIT);
894		}
895
896		mce_reign();
897		barrier();
898		ret = 0;
899	} else {
900		/*
901		 * Subject: Wait for Monarch to finish.
902		 */
903		while (atomic_read(&mce_executing) != 0) {
904			if (mce_timed_out(&timeout))
905				goto reset;
906			ndelay(SPINUNIT);
907		}
908
909		/*
910		 * Don't reset anything. That's done by the Monarch.
911		 */
912		return 0;
913	}
914
915	/*
916	 * Reset all global state.
917	 */
918reset:
919	atomic_set(&global_nwo, 0);
920	atomic_set(&mce_callin, 0);
921	barrier();
922
923	/*
924	 * Let others run again.
925	 */
926	atomic_set(&mce_executing, 0);
927	return ret;
928}
929
930/*
931 * Check if the address reported by the CPU is in a format we can parse.
932 * It would be possible to add code for most other cases, but all would
933 * be somewhat complicated (e.g. segment offset would require an instruction
934 * parser). So only support physical addresses up to page granularity for now.
935 */
936static int mce_usable_address(struct mce *m)
937{
938	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
939		return 0;
940	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
941		return 0;
942	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
943		return 0;
944	return 1;
945}
946
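/* Clear the status register of every bank marked in @toclear. */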
947static void mce_clear_state(unsigned long *toclear)
948{
949	int i;
950
951	for (i = 0; i < mca_cfg.banks; i++) {
952		if (test_bit(i, toclear))
953			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
954	}
955}
956
957/*
958 * We need to save the faulting physical address associated with a process
959 * in the machine check handler somewhere we can grab it back from
960 * later in mce_notify_process().
961 */
962#define	MCE_INFO_MAX	16
963
964struct mce_info {
965	atomic_t		inuse;
966	struct task_struct	*t;
967	__u64			paddr;
968	int			restartable;
969} mce_info[MCE_INFO_MAX];
970
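/*
 * Stash the faulting physical address for the current task in a free
 * mce_info slot so mce_notify_process() can act on it later.
 */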
971static void mce_save_info(__u64 addr, int c)
972{
973	struct mce_info *mi;
974
975	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
976		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
977			mi->t = current;
978			mi->paddr = addr;
979			mi->restartable = c;
980			return;
981		}
982	}
983
984	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
985}
986
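/* Look up the mce_info slot saved for the current task, if any. */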
987static struct mce_info *mce_find_info(void)
988{
989	struct mce_info *mi;
990
991	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
992		if (atomic_read(&mi->inuse) && mi->t == current)
993			return mi;
994	return NULL;
995}
996
997static void mce_clear_info(struct mce_info *mi)
998{
999	atomic_set(&mi->inuse, 0);
1000}
1001
1002/*
1003 * The actual machine check handler. This only handles real
1004 * exceptions when something got corrupted coming in through int 18.
1005 *
1006 * This is executed in NMI context not subject to normal locking rules. This
1007 * implies that most kernel services cannot be safely used. Don't even
1008 * think about putting a printk in there!
1009 *
1010 * On Intel systems this is entered on all CPUs in parallel through
1011 * MCE broadcast. However some CPUs might be broken beyond repair,
1012 * so always be careful when synchronizing with others.
1013 */
1014void do_machine_check(struct pt_regs *regs, long error_code)
1015{
1016	struct mca_config *cfg = &mca_cfg;
1017	struct mce m, *final;
1018	int i;
1019	int worst = 0;
1020	int severity;
1021	/*
1022	 * Establish sequential order between the CPUs entering the machine
1023	 * check handler.
1024	 */
1025	int order;
1026	/*
1027	 * If no_way_out gets set, there is no safe way to recover from this
1028	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1029	 */
1030	int no_way_out = 0;
1031	/*
1032	 * If kill_it gets set, there is no safe way to return to the
1033	 * interrupted context, so we may have to kill the current process.
1034	 */
1035	int kill_it = 0;
1036	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1037	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1038	char *msg = "Unknown";
1039
1040	atomic_inc(&mce_entry);
1041
1042	this_cpu_inc(mce_exception_count);
1043
1044	if (!cfg->banks)
1045		goto out;
1046
1047	mce_gather_info(&m, regs);
1048
1049	final = &__get_cpu_var(mces_seen);
1050	*final = m;
1051
1052	memset(valid_banks, 0, sizeof(valid_banks));
1053	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1054
1055	barrier();
1056
1057	/*
1058	 * When there is no restart IP we might need to kill or panic.
1059	 * Assume the worst for now, but if we find the
1060	 * severity is MCE_AR_SEVERITY we have other options.
1061	 */
1062	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1063		kill_it = 1;
1064
1065	/*
1066	 * Go through all the banks in exclusion of the other CPUs.
1067	 * This way we don't report duplicated events on shared banks
1068	 * because the first one to see it will clear it.
1069	 */
1070	order = mce_start(&no_way_out);
1071	for (i = 0; i < cfg->banks; i++) {
1072		__clear_bit(i, toclear);
1073		if (!test_bit(i, valid_banks))
1074			continue;
1075		if (!mce_banks[i].ctl)
1076			continue;
1077
1078		m.misc = 0;
1079		m.addr = 0;
1080		m.bank = i;
1081
1082		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1083		if ((m.status & MCI_STATUS_VAL) == 0)
1084			continue;
1085
1086		/*
1087		 * Errors that are neither uncorrected nor signaled are handled
1088		 * by machine_check_poll(). Leave them alone, unless this panics.
1089		 */
1090		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1091			!no_way_out)
1092			continue;
1093
1094		/*
1095		 * Set taint even when machine check was not enabled.
1096		 */
1097		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1098
1099		severity = mce_severity(&m, cfg->tolerant, NULL);
1100
1101		/*
1102		 * When the machine check was meant for the corrected-error
1103		 * handler, don't touch it here, unless we're panicking.
1104		 */
1105		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1106			continue;
1107		__set_bit(i, toclear);
1108		if (severity == MCE_NO_SEVERITY) {
1109			/*
1110			 * Machine check event was not enabled. Clear, but
1111			 * ignore.
1112			 */
1113			continue;
1114		}
1115
1116		mce_read_aux(&m, i);
1117
1118		/*
1119		 * Action optional error. Queue address for later processing.
1120		 * When the ring overflows we just ignore the AO error.
1121		 * RED-PEN add some logging mechanism when
1122		 * usable_address or mce_add_ring fails.
1123		 * mce_usable_address() or mce_ring_add() fails.
1124		 */
1125		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1126			mce_ring_add(m.addr >> PAGE_SHIFT);
1127
1128		mce_log(&m);
1129
1130		if (severity > worst) {
1131			*final = m;
1132			worst = severity;
1133		}
1134	}
1135
1136	/* mce_clear_state will clear *final, save locally for use later */
1137	m = *final;
1138
1139	if (!no_way_out)
1140		mce_clear_state(toclear);
1141
1142	/*
1143	 * Do most of the synchronization with other CPUs.
1144	 * When there's any problem use only local no_way_out state.
1145	 */
1146	if (mce_end(order) < 0)
1147		no_way_out = worst >= MCE_PANIC_SEVERITY;
1148
1149	/*
1150	 * At insane "tolerant" levels we take no action. Otherwise
1151	 * we only die if we have no other choice. For less serious
1152	 * issues we try to recover, or limit damage to the current
1153	 * process.
1154	 */
1155	if (cfg->tolerant < 3) {
1156		if (no_way_out)
1157			mce_panic("Fatal machine check on current CPU", &m, msg);
1158		if (worst == MCE_AR_SEVERITY) {
1159			/* schedule action before return to userland */
1160			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1161			set_thread_flag(TIF_MCE_NOTIFY);
1162		} else if (kill_it) {
1163			force_sig(SIGBUS, current);
1164		}
1165	}
1166
1167	if (worst > 0)
1168		mce_report_event(regs);
1169	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1170out:
1171	atomic_dec(&mce_entry);
1172	sync_core();
1173}
1174EXPORT_SYMBOL_GPL(do_machine_check);
1175
1176#ifndef CONFIG_MEMORY_FAILURE
1177int memory_failure(unsigned long pfn, int vector, int flags)
1178{
1179	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1180	BUG_ON(flags & MF_ACTION_REQUIRED);
1181	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1182	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1183	       pfn);
1184
1185	return 0;
1186}
1187#endif
1188
1189/*
1190 * Called in the process context that was interrupted by the MCE and marked
1191 * with TIF_MCE_NOTIFY, just before returning to the erroneous userland code.
1192 * This code is allowed to sleep.
1193 * Attempt possible recovery such as calling the high level VM handler to
1194 * process any corrupted pages, and kill/signal current process if required.
1195 * Action required errors are handled here.
1196 */
1197void mce_notify_process(void)
1198{
1199	unsigned long pfn;
1200	struct mce_info *mi = mce_find_info();
1201	int flags = MF_ACTION_REQUIRED;
1202
1203	if (!mi)
1204		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1205	pfn = mi->paddr >> PAGE_SHIFT;
1206
1207	clear_thread_flag(TIF_MCE_NOTIFY);
1208
1209	pr_err("Uncorrected hardware memory error in user-access at %llx",
1210		 mi->paddr);
1211	/*
1212	 * We must call memory_failure() here even if the current process is
1213	 * doomed. We still need to mark the page as poisoned and alert any
1214	 * other users of the page.
1215	 */
1216	if (!mi->restartable)
1217		flags |= MF_MUST_KILL;
1218	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1219		pr_err("Memory error not recovered");
1220		force_sig(SIGBUS, current);
1221	}
1222	mce_clear_info(mi);
1223}
1224
1225/*
1226 * Action optional processing happens here (picking up
1227 * from the list of faulting pages that do_machine_check()
1228 * placed into the "ring").
1229 */
1230static void mce_process_work(struct work_struct *dummy)
1231{
1232	unsigned long pfn;
1233
1234	while (mce_ring_get(&pfn))
1235		memory_failure(pfn, MCE_VECTOR, 0);
1236}
1237
1238#ifdef CONFIG_X86_MCE_INTEL
1239/**
1240 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1241 *
1242 * @status: Event status information
1243 *
1244 * This function should be called by the thermal interrupt after the
1245 * event has been processed and the decision was made to log the event
1246 * further.
1247 *
1248 * The status parameter will be saved to the 'status' field of 'struct mce'
1249 * and historically has been the register value of the
1250 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1251 */
1252void mce_log_therm_throt_event(__u64 status)
1253{
1254	struct mce m;
1255
1256	mce_setup(&m);
1257	m.bank = MCE_THERMAL_BANK;
1258	m.status = status;
1259	mce_log(&m);
1260}
1261#endif /* CONFIG_X86_MCE_INTEL */
1262
1263/*
1264 * Periodic polling timer for "silent" machine check errors.  If the
1265 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1266 * errors, poll 2x slower (up to check_interval seconds).
1267 */
1268static unsigned long check_interval = 5 * 60; /* 5 minutes */
1269
1270static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1271static DEFINE_PER_CPU(struct timer_list, mce_timer);
1272
1273static unsigned long mce_adjust_timer_default(unsigned long interval)
1274{
1275	return interval;
1276}
1277
1278static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1279	mce_adjust_timer_default;
1280
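/* Polling timer callback: poll the banks and rescale the polling interval. */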
1281static void mce_timer_fn(unsigned long data)
1282{
1283	struct timer_list *t = &__get_cpu_var(mce_timer);
1284	unsigned long iv;
1285
1286	WARN_ON(smp_processor_id() != data);
1287
1288	if (mce_available(__this_cpu_ptr(&cpu_info))) {
1289		machine_check_poll(MCP_TIMESTAMP,
1290				&__get_cpu_var(mce_poll_banks));
1291		mce_intel_cmci_poll();
1292	}
1293
1294	/*
1295	 * Alert userspace if needed.  If we logged an MCE, reduce the
1296	 * polling interval, otherwise increase the polling interval.
1297	 */
1298	iv = __this_cpu_read(mce_next_interval);
1299	if (mce_notify_irq()) {
1300		iv = max(iv / 2, (unsigned long) HZ/100);
1301	} else {
1302		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1303		iv = mce_adjust_timer(iv);
1304	}
1305	__this_cpu_write(mce_next_interval, iv);
1306	/* Might have become 0 after CMCI storm subsided */
1307	if (iv) {
1308		t->expires = jiffies + iv;
1309		add_timer_on(t, smp_processor_id());
1310	}
1311}
1312
1313/*
1314 * Ensure that the timer is firing in @interval from now.
1315 */
1316void mce_timer_kick(unsigned long interval)
1317{
1318	struct timer_list *t = &__get_cpu_var(mce_timer);
1319	unsigned long when = jiffies + interval;
1320	unsigned long iv = __this_cpu_read(mce_next_interval);
1321
1322	if (timer_pending(t)) {
1323		if (time_before(when, t->expires))
1324			mod_timer_pinned(t, when);
1325	} else {
1326		t->expires = round_jiffies(when);
1327		add_timer_on(t, smp_processor_id());
1328	}
1329	if (interval < iv)
1330		__this_cpu_write(mce_next_interval, interval);
1331}
1332
1333/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1334static void mce_timer_delete_all(void)
1335{
1336	int cpu;
1337
1338	for_each_online_cpu(cpu)
1339		del_timer_sync(&per_cpu(mce_timer, cpu));
1340}
1341
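/* Workqueue callback: exec the user mode helper program (mce_helper). */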
1342static void mce_do_trigger(struct work_struct *work)
1343{
1344	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1345}
1346
1347static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1348
1349/*
1350 * Notify the user(s) about new machine check events.
1351 * Can be called from interrupt context, but not from machine check/NMI
1352 * context.
1353 */
1354int mce_notify_irq(void)
1355{
1356	/* Not more than two messages every minute */
1357	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1358
1359	if (test_and_clear_bit(0, &mce_need_notify)) {
1360		/* wake processes polling /dev/mcelog */
1361		wake_up_interruptible(&mce_chrdev_wait);
1362
1363		if (mce_helper[0])
1364			schedule_work(&mce_trigger_work);
1365
1366		if (__ratelimit(&ratelimit))
1367			pr_info(HW_ERR "Machine check events logged\n");
1368
1369		return 1;
1370	}
1371	return 0;
1372}
1373EXPORT_SYMBOL_GPL(mce_notify_irq);
1374
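/* Allocate the per-bank array and enable all error types in every bank. */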
1375static int __mcheck_cpu_mce_banks_init(void)
1376{
1377	int i;
1378	u8 num_banks = mca_cfg.banks;
1379
1380	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1381	if (!mce_banks)
1382		return -ENOMEM;
1383
1384	for (i = 0; i < num_banks; i++) {
1385		struct mce_bank *b = &mce_banks[i];
1386
1387		b->ctl = -1ULL;
1388		b->init = 1;
1389	}
1390	return 0;
1391}
1392
1393/*
1394 * Initialize Machine Checks for a CPU.
1395 */
1396static int __mcheck_cpu_cap_init(void)
1397{
1398	unsigned b;
1399	u64 cap;
1400
1401	rdmsrl(MSR_IA32_MCG_CAP, cap);
1402
1403	b = cap & MCG_BANKCNT_MASK;
1404	if (!mca_cfg.banks)
1405		pr_info("CPU supports %d MCE banks\n", b);
1406
1407	if (b > MAX_NR_BANKS) {
1408		pr_warn("Using only %u machine check banks out of %u\n",
1409			MAX_NR_BANKS, b);
1410		b = MAX_NR_BANKS;
1411	}
1412
1413	/* Don't support asymmetric configurations today */
1414	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1415	mca_cfg.banks = b;
1416
1417	if (!mce_banks) {
1418		int err = __mcheck_cpu_mce_banks_init();
1419
1420		if (err)
1421			return err;
1422	}
1423
1424	/* Use accurate RIP reporting if available. */
1425	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1426		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1427
1428	if (cap & MCG_SER_P)
1429		mca_cfg.ser = true;
1430
1431	return 0;
1432}
1433
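/*
 * Log errors left over from before the last reset, enable machine checks
 * via CR4.MCE/MCG_CTL and program each bank's control/status registers.
 */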
1434static void __mcheck_cpu_init_generic(void)
1435{
1436	enum mcp_flags m_fl = 0;
1437	mce_banks_t all_banks;
1438	u64 cap;
1439	int i;
1440
1441	if (!mca_cfg.bootlog)
1442		m_fl = MCP_DONTLOG;
1443
1444	/*
1445	 * Log the machine checks left over from the previous reset.
1446	 */
1447	bitmap_fill(all_banks, MAX_NR_BANKS);
1448	machine_check_poll(MCP_UC | m_fl, &all_banks);
1449
1450	set_in_cr4(X86_CR4_MCE);
1451
1452	rdmsrl(MSR_IA32_MCG_CAP, cap);
1453	if (cap & MCG_CTL_P)
1454		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1455
1456	for (i = 0; i < mca_cfg.banks; i++) {
1457		struct mce_bank *b = &mce_banks[i];
1458
1459		if (!b->init)
1460			continue;
1461		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1462		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1463	}
1464}
1465
1466/*
1467 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1468 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1469 * Vol 3B Table 15-20). But this confuses both the code that determines
1470 * whether the machine check occurred in kernel or user mode, and also
1471 * the severity assessment code. Pretend that EIPV was set, and take the
1472 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1473 */
1474static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1475{
1476	if (bank != 0)
1477		return;
1478	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1479		return;
1480	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1481		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1482			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1483			  MCACOD)) !=
1484			 (MCI_STATUS_UC|MCI_STATUS_EN|
1485			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1486			  MCI_STATUS_AR|MCACOD_INSTR))
1487		return;
1488
1489	m->mcgstatus |= MCG_STATUS_EIPV;
1490	m->ip = regs->ip;
1491	m->cs = regs->cs;
1492}
1493
1494/* Add per CPU specific workarounds here */
1495static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1496{
1497	struct mca_config *cfg = &mca_cfg;
1498
1499	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1500		pr_info("unknown CPU type - not enabling MCE support\n");
1501		return -EOPNOTSUPP;
1502	}
1503
1504	/* This should be disabled by the BIOS, but isn't always */
1505	if (c->x86_vendor == X86_VENDOR_AMD) {
1506		if (c->x86 == 15 && cfg->banks > 4) {
1507			/*
1508			 * disable GART TBL walk error reporting, which
1509			 * trips off incorrectly with the IOMMU & 3ware
1510			 * & Cerberus:
1511			 */
1512			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1513		}
1514		if (c->x86 <= 17 && cfg->bootlog < 0) {
1515			/*
1516			 * Lots of broken BIOSes around that don't clear them
1517			 * by default and leave crap in there. Don't log:
1518			 */
1519			cfg->bootlog = 0;
1520		}
1521		/*
1522		 * Various K7s with broken bank 0 around. Always disable
1523		 * by default.
1524		 */
1525		 if (c->x86 == 6 && cfg->banks > 0)
1526			mce_banks[0].ctl = 0;
1527
1528		 /*
1529		  * Turn off MC4_MISC thresholding banks on those models since
1530		  * they're not supported there.
1531		  */
1532		 if (c->x86 == 0x15 &&
1533		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1534			 int i;
1535			 u64 val, hwcr;
1536			 bool need_toggle;
1537			 u32 msrs[] = {
1538				0x00000413, /* MC4_MISC0 */
1539				0xc0000408, /* MC4_MISC1 */
1540			 };
1541
1542			 rdmsrl(MSR_K7_HWCR, hwcr);
1543
1544			 /* McStatusWrEn has to be set */
1545			 need_toggle = !(hwcr & BIT(18));
1546
1547			 if (need_toggle)
1548				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1549
1550			 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1551				 rdmsrl(msrs[i], val);
1552
1553				 /* CntP bit set? */
1554				 if (val & BIT_64(62)) {
1555					val &= ~BIT_64(62);
1556					wrmsrl(msrs[i], val);
1557				 }
1558			 }
1559
1560			 /* restore old settings */
1561			 if (need_toggle)
1562				 wrmsrl(MSR_K7_HWCR, hwcr);
1563		 }
1564	}
1565
1566	if (c->x86_vendor == X86_VENDOR_INTEL) {
1567		/*
1568		 * SDM documents that on family 6 bank 0 should not be written
1569		 * because it aliases to another special BIOS controlled
1570		 * register.
1571		 * But it's not aliased anymore on model 0x1a+.
1572		 * Don't ignore bank 0 completely because there could be a
1573		 * valid event later, merely don't write CTL0.
1574		 */
1575
1576		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1577			mce_banks[0].init = 0;
1578
1579		/*
1580		 * All newer Intel systems support MCE broadcasting. Enable
1581		 * synchronization with a one second timeout.
1582		 */
1583		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1584			cfg->monarch_timeout < 0)
1585			cfg->monarch_timeout = USEC_PER_SEC;
1586
1587		/*
1588		 * There are also broken BIOSes on some Pentium M and
1589		 * earlier systems:
1590		 */
1591		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1592			cfg->bootlog = 0;
1593
1594		if (c->x86 == 6 && c->x86_model == 45)
1595			quirk_no_way_out = quirk_sandybridge_ifu;
1596	}
1597	if (cfg->monarch_timeout < 0)
1598		cfg->monarch_timeout = 0;
1599	if (cfg->bootlog != 0)
1600		cfg->panic_timeout = 30;
1601
1602	return 0;
1603}
1604
1605static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1606{
1607	if (c->x86 != 5)
1608		return 0;
1609
1610	switch (c->x86_vendor) {
1611	case X86_VENDOR_INTEL:
1612		intel_p5_mcheck_init(c);
1613		return 1;
1614		break;
1615	case X86_VENDOR_CENTAUR:
1616		winchip_mcheck_init(c);
1617		return 1;
1618		break;
1619	}
1620
1621	return 0;
1622}
1623
1624static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1625{
1626	switch (c->x86_vendor) {
1627	case X86_VENDOR_INTEL:
1628		mce_intel_feature_init(c);
1629		mce_adjust_timer = mce_intel_adjust_timer;
1630		break;
1631	case X86_VENDOR_AMD:
1632		mce_amd_feature_init(c);
1633		break;
1634	default:
1635		break;
1636	}
1637}
1638
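/* Arm the polling timer for @cpu unless corrected-error polling is disabled. */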
1639static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1640{
1641	unsigned long iv = check_interval * HZ;
1642
1643	if (mca_cfg.ignore_ce || !iv)
1644		return;
1645
1646	per_cpu(mce_next_interval, cpu) = iv;
1647
1648	t->expires = round_jiffies(jiffies + iv);
1649	add_timer_on(t, cpu);
1650}
1651
1652static void __mcheck_cpu_init_timer(void)
1653{
1654	struct timer_list *t = &__get_cpu_var(mce_timer);
1655	unsigned int cpu = smp_processor_id();
1656
1657	setup_timer(t, mce_timer_fn, cpu);
1658	mce_start_timer(cpu, t);
1659}
1660
1661/* Handle unconfigured int18 (should never happen) */
1662static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1663{
1664	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1665	       smp_processor_id());
1666}
1667
1668/* Call the installed machine check handler for this CPU setup. */
1669void (*machine_check_vector)(struct pt_regs *, long error_code) =
1670						unexpected_machine_check;
1671
1672/*
1673 * Called for each booted CPU to set up machine checks.
1674 * Must be called with preempt off:
1675 */
1676void mcheck_cpu_init(struct cpuinfo_x86 *c)
1677{
1678	if (mca_cfg.disabled)
1679		return;
1680
1681	if (__mcheck_cpu_ancient_init(c))
1682		return;
1683
1684	if (!mce_available(c))
1685		return;
1686
1687	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1688		mca_cfg.disabled = true;
1689		return;
1690	}
1691
1692	machine_check_vector = do_machine_check;
1693
1694	__mcheck_cpu_init_generic();
1695	__mcheck_cpu_init_vendor(c);
1696	__mcheck_cpu_init_timer();
1697	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1698	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1699}
1700
1701/*
1702 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1703 */
1704
1705static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1706static int mce_chrdev_open_count;	/* #times opened */
1707static int mce_chrdev_open_exclu;	/* already open exclusive? */
1708
1709static int mce_chrdev_open(struct inode *inode, struct file *file)
1710{
1711	spin_lock(&mce_chrdev_state_lock);
1712
1713	if (mce_chrdev_open_exclu ||
1714	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1715		spin_unlock(&mce_chrdev_state_lock);
1716
1717		return -EBUSY;
1718	}
1719
1720	if (file->f_flags & O_EXCL)
1721		mce_chrdev_open_exclu = 1;
1722	mce_chrdev_open_count++;
1723
1724	spin_unlock(&mce_chrdev_state_lock);
1725
1726	return nonseekable_open(inode, file);
1727}
1728
1729static int mce_chrdev_release(struct inode *inode, struct file *file)
1730{
1731	spin_lock(&mce_chrdev_state_lock);
1732
1733	mce_chrdev_open_count--;
1734	mce_chrdev_open_exclu = 0;
1735
1736	spin_unlock(&mce_chrdev_state_lock);
1737
1738	return 0;
1739}
1740
1741static void collect_tscs(void *data)
1742{
1743	unsigned long *cpu_tsc = (unsigned long *)data;
1744
1745	rdtscll(cpu_tsc[smp_processor_id()]);
1746}
1747
1748static int mce_apei_read_done;
1749
1750/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1751static int __mce_read_apei(char __user **ubuf, size_t usize)
1752{
1753	int rc;
1754	u64 record_id;
1755	struct mce m;
1756
1757	if (usize < sizeof(struct mce))
1758		return -EINVAL;
1759
1760	rc = apei_read_mce(&m, &record_id);
1761	/* Error or no more MCE record */
1762	if (rc <= 0) {
1763		mce_apei_read_done = 1;
1764		/*
1765		 * When ERST is disabled, mce_chrdev_read() should return
1766		 * "no record" instead of "no device."
1767		 */
1768		if (rc == -ENODEV)
1769			return 0;
1770		return rc;
1771	}
1772	rc = -EFAULT;
1773	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1774		return rc;
1775	/*
1776	 * In fact, we should clear the record only after it has
1777	 * been flushed to disk or sent over the network by
1778	 * /sbin/mcelog, but we have no interface to support that now,
1779	 * so just clear it here to avoid duplication.
1780	 */
1781	rc = apei_clear_mce(record_id);
1782	if (rc) {
1783		mce_apei_read_done = 1;
1784		return rc;
1785	}
1786	*ubuf += sizeof(struct mce);
1787
1788	return 0;
1789}
1790
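/*
 * read() handler for /dev/mcelog: return records persisted via APEI from a
 * previous boot first, then drain the in-memory log (full reads only).
 */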
1791static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1792				size_t usize, loff_t *off)
1793{
1794	char __user *buf = ubuf;
1795	unsigned long *cpu_tsc;
1796	unsigned prev, next;
1797	int i, err;
1798
1799	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1800	if (!cpu_tsc)
1801		return -ENOMEM;
1802
1803	mutex_lock(&mce_chrdev_read_mutex);
1804
1805	if (!mce_apei_read_done) {
1806		err = __mce_read_apei(&buf, usize);
1807		if (err || buf != ubuf)
1808			goto out;
1809	}
1810
1811	next = rcu_dereference_check_mce(mcelog.next);
1812
1813	/* Only supports full reads right now */
1814	err = -EINVAL;
1815	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1816		goto out;
1817
1818	err = 0;
1819	prev = 0;
1820	do {
1821		for (i = prev; i < next; i++) {
1822			unsigned long start = jiffies;
1823			struct mce *m = &mcelog.entry[i];
1824
1825			while (!m->finished) {
1826				if (time_after_eq(jiffies, start + 2)) {
1827					memset(m, 0, sizeof(*m));
1828					goto timeout;
1829				}
1830				cpu_relax();
1831			}
1832			smp_rmb();
1833			err |= copy_to_user(buf, m, sizeof(*m));
1834			buf += sizeof(*m);
1835timeout:
1836			;
1837		}
1838
1839		memset(mcelog.entry + prev, 0,
1840		       (next - prev) * sizeof(struct mce));
1841		prev = next;
1842		next = cmpxchg(&mcelog.next, prev, 0);
1843	} while (next != prev);
1844
1845	synchronize_sched();
1846
1847	/*
1848	 * Collect entries that were still getting written before the
1849	 * synchronize.
1850	 */
1851	on_each_cpu(collect_tscs, cpu_tsc, 1);
1852
1853	for (i = next; i < MCE_LOG_LEN; i++) {
1854		struct mce *m = &mcelog.entry[i];
1855
1856		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1857			err |= copy_to_user(buf, m, sizeof(*m));
1858			smp_rmb();
1859			buf += sizeof(*m);
1860			memset(m, 0, sizeof(*m));
1861		}
1862	}
1863
1864	if (err)
1865		err = -EFAULT;
1866
1867out:
1868	mutex_unlock(&mce_chrdev_read_mutex);
1869	kfree(cpu_tsc);
1870
1871	return err ? err : buf - ubuf;
1872}
1873
1874static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1875{
1876	poll_wait(file, &mce_chrdev_wait, wait);
1877	if (rcu_access_index(mcelog.next))
1878		return POLLIN | POLLRDNORM;
1879	if (!mce_apei_read_done && apei_check_mce())
1880		return POLLIN | POLLRDNORM;
1881	return 0;
1882}
1883
1884static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1885				unsigned long arg)
1886{
1887	int __user *p = (int __user *)arg;
1888
1889	if (!capable(CAP_SYS_ADMIN))
1890		return -EPERM;
1891
1892	switch (cmd) {
1893	case MCE_GET_RECORD_LEN:
1894		return put_user(sizeof(struct mce), p);
1895	case MCE_GET_LOG_LEN:
1896		return put_user(MCE_LOG_LEN, p);
1897	case MCE_GETCLEAR_FLAGS: {
1898		unsigned flags;
1899
1900		do {
1901			flags = mcelog.flags;
1902		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1903
1904		return put_user(flags, p);
1905	}
1906	default:
1907		return -ENOTTY;
1908	}
1909}
1910
1911static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1912			    size_t usize, loff_t *off);
1913
1914void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1915			     const char __user *ubuf,
1916			     size_t usize, loff_t *off))
1917{
1918	mce_write = fn;
1919}
1920EXPORT_SYMBOL_GPL(register_mce_write_callback);
1921
1922ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1923			 size_t usize, loff_t *off)
1924{
1925	if (mce_write)
1926		return mce_write(filp, ubuf, usize, off);
1927	else
1928		return -EINVAL;
1929}
1930
1931static const struct file_operations mce_chrdev_ops = {
1932	.open			= mce_chrdev_open,
1933	.release		= mce_chrdev_release,
1934	.read			= mce_chrdev_read,
1935	.write			= mce_chrdev_write,
1936	.poll			= mce_chrdev_poll,
1937	.unlocked_ioctl		= mce_chrdev_ioctl,
1938	.llseek			= no_llseek,
1939};
1940
1941static struct miscdevice mce_chrdev_device = {
1942	MISC_MCELOG_MINOR,
1943	"mcelog",
1944	&mce_chrdev_ops,
1945};
1946
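/* Per-CPU helper: stop polling the given bank and disable CMCI for it. */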
1947static void __mce_disable_bank(void *arg)
1948{
1949	int bank = *((int *)arg);
1950	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
1951	cmci_disable_bank(bank);
1952}
1953
1954void mce_disable_bank(int bank)
1955{
1956	if (bank >= mca_cfg.banks) {
1957		pr_warn(FW_BUG
1958			"Ignoring request to disable invalid MCA bank %d.\n",
1959			bank);
1960		return;
1961	}
1962	set_bit(bank, mce_banks_ce_disabled);
1963	on_each_cpu(__mce_disable_bank, &bank, 1);
1964}
1965
1966/*
1967 * mce=off Disables machine check
1968 * mce=no_cmci Disables CMCI
1969 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1970 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1971 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1972 *	monarchtimeout is how long to wait for other CPUs on machine
1973 *	check, or 0 to not wait
1974 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1975 * mce=nobootlog Don't log MCEs from before booting.
1976 * mce=bios_cmci_threshold Don't program the CMCI threshold
1977 */
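/*
 * Example: "mce=2,500000" on the kernel command line sets tolerant to 2 and
 * the monarch timeout to 500000 us; "mce=off" disables machine checks.
 */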
1978static int __init mcheck_enable(char *str)
1979{
1980	struct mca_config *cfg = &mca_cfg;
1981
1982	if (*str == 0) {
1983		enable_p5_mce();
1984		return 1;
1985	}
1986	if (*str == '=')
1987		str++;
1988	if (!strcmp(str, "off"))
1989		cfg->disabled = true;
1990	else if (!strcmp(str, "no_cmci"))
1991		cfg->cmci_disabled = true;
1992	else if (!strcmp(str, "dont_log_ce"))
1993		cfg->dont_log_ce = true;
1994	else if (!strcmp(str, "ignore_ce"))
1995		cfg->ignore_ce = true;
1996	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1997		cfg->bootlog = (str[0] == 'b');
1998	else if (!strcmp(str, "bios_cmci_threshold"))
1999		cfg->bios_cmci_threshold = true;
2000	else if (isdigit(str[0])) {
2001		get_option(&str, &(cfg->tolerant));
2002		if (*str == ',') {
2003			++str;
2004			get_option(&str, &(cfg->monarch_timeout));
2005		}
2006	} else {
2007		pr_info("mce argument %s ignored. Please use /sys\n", str);
2008		return 0;
2009	}
2010	return 1;
2011}
2012__setup("mce", mcheck_enable);
2013
2014int __init mcheck_init(void)
2015{
2016	mcheck_intel_therm_init();
2017
2018	return 0;
2019}
2020
2021/*
2022 * mce_syscore: PM support
2023 */
2024
2025/*
2026 * Disable machine checks on suspend and shutdown. We can't really handle
2027 * them later.
2028 */
2029static int mce_disable_error_reporting(void)
2030{
2031	int i;
2032
2033	for (i = 0; i < mca_cfg.banks; i++) {
2034		struct mce_bank *b = &mce_banks[i];
2035
2036		if (b->init)
2037			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2038	}
2039	return 0;
2040}
2041
2042static int mce_syscore_suspend(void)
2043{
2044	return mce_disable_error_reporting();
2045}
2046
2047static void mce_syscore_shutdown(void)
2048{
2049	mce_disable_error_reporting();
2050}
2051
2052/*
2053 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2054 * Only one CPU is active at this time, the others get re-added later using
2055 * CPU hotplug:
2056 */
2057static void mce_syscore_resume(void)
2058{
2059	__mcheck_cpu_init_generic();
2060	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
2061}
2062
2063static struct syscore_ops mce_syscore_ops = {
2064	.suspend	= mce_syscore_suspend,
2065	.shutdown	= mce_syscore_shutdown,
2066	.resume		= mce_syscore_resume,
2067};
2068
2069/*
2070 * mce_device: Sysfs support
2071 */
2072
2073static void mce_cpu_restart(void *data)
2074{
2075	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2076		return;
2077	__mcheck_cpu_init_generic();
2078	__mcheck_cpu_init_timer();
2079}
2080
2081/* Reinit MCEs after user configuration changes */
2082static void mce_restart(void)
2083{
2084	mce_timer_delete_all();
2085	on_each_cpu(mce_cpu_restart, NULL, 1);
2086}
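/*
 * mce_restart() is reached from the sysfs store handlers further down:
 * writing check_interval goes through store_int_with_restart() and
 * writing a per-bank control file goes through set_bank().
 */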
2087
2088/* Toggle features for corrected errors */
2089static void mce_disable_cmci(void *data)
2090{
2091	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2092		return;
2093	cmci_clear();
2094}
2095
2096static void mce_enable_ce(void *all)
2097{
2098	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2099		return;
2100	cmci_reenable();
2101	cmci_recheck();
2102	if (all)
2103		__mcheck_cpu_init_timer();
2104}
2105
2106static struct bus_type mce_subsys = {
2107	.name		= "machinecheck",
2108	.dev_name	= "machinecheck",
2109};
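/*
 * The per CPU devices registered on this subsystem in mce_device_create()
 * below show up in sysfs as (paths shown for CPU 0 on a typical system):
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/trigger
 *	/sys/devices/system/machinecheck/machinecheck0/bank0 ... bankN
 *
 * plus the remaining attributes from mce_device_attrs[] below.
 */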
2110
2111DEFINE_PER_CPU(struct device *, mce_device);
2112
2113void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2114
2115static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2116{
2117	return container_of(attr, struct mce_bank, attr);
2118}
2119
2120static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2121			 char *buf)
2122{
2123	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2124}
2125
2126static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2127			const char *buf, size_t size)
2128{
2129	u64 new;
2130
2131	if (strict_strtoull(buf, 0, &new) < 0)
2132		return -EINVAL;
2133
2134	attr_to_bank(attr)->ctl = new;
2135	mce_restart();
2136
2137	return size;
2138}
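/*
 * Illustrative use (bank number and mask are arbitrary): clearing the two
 * low enable bits of bank 2 and then restoring the full mask. The bank
 * controls are global, so a write through any machinecheckN directory
 * affects all CPUs once mce_restart() has run:
 *
 *	# echo 0xfffffffffffffffc > /sys/devices/system/machinecheck/machinecheck0/bank2
 *	# echo 0xffffffffffffffff > /sys/devices/system/machinecheck/machinecheck0/bank2
 */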
2139
2140static ssize_t
2141show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2142{
2143	strcpy(buf, mce_helper);
2144	strcat(buf, "\n");
2145	return strlen(mce_helper) + 1;
2146}
2147
2148static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2149				const char *buf, size_t siz)
2150{
2151	char *p;
2152
2153	strncpy(mce_helper, buf, sizeof(mce_helper));
2154	mce_helper[sizeof(mce_helper)-1] = 0;
2155	p = strchr(mce_helper, '\n');
2156
2157	if (p)
2158		*p = 0;
2159
2160	return strlen(mce_helper) + !!p;
2161}
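/*
 * Example (the helper path is hypothetical): installing and clearing the
 * notification helper through sysfs. A trailing newline from echo is
 * stripped by the code above, and an empty write clears the helper:
 *
 *	# echo /usr/local/sbin/mce-notify > /sys/devices/system/machinecheck/machinecheck0/trigger
 *	# echo > /sys/devices/system/machinecheck/machinecheck0/trigger
 */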
2162
2163static ssize_t set_ignore_ce(struct device *s,
2164			     struct device_attribute *attr,
2165			     const char *buf, size_t size)
2166{
2167	u64 new;
2168
2169	if (strict_strtoull(buf, 0, &new) < 0)
2170		return -EINVAL;
2171
2172	if (mca_cfg.ignore_ce ^ !!new) {
2173		if (new) {
2174			/* disable ce features */
2175			mce_timer_delete_all();
2176			on_each_cpu(mce_disable_cmci, NULL, 1);
2177			mca_cfg.ignore_ce = true;
2178		} else {
2179			/* enable ce features */
2180			mca_cfg.ignore_ce = false;
2181			on_each_cpu(mce_enable_ce, (void *)1, 1);
2182		}
2183	}
2184	return size;
2185}
2186
2187static ssize_t set_cmci_disabled(struct device *s,
2188				 struct device_attribute *attr,
2189				 const char *buf, size_t size)
2190{
2191	u64 new;
2192
2193	if (strict_strtoull(buf, 0, &new) < 0)
2194		return -EINVAL;
2195
2196	if (mca_cfg.cmci_disabled ^ !!new) {
2197		if (new) {
2198			/* disable cmci */
2199			on_each_cpu(mce_disable_cmci, NULL, 1);
2200			mca_cfg.cmci_disabled = true;
2201		} else {
2202			/* enable cmci */
2203			mca_cfg.cmci_disabled = false;
2204			on_each_cpu(mce_enable_ce, NULL, 1);
2205		}
2206	}
2207	return size;
2208}
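/*
 * Example: toggling corrected-error handling at run time. These files map
 * onto the global mca_cfg, so it does not matter which machinecheckN
 * directory is used. ignore_ce=1 stops both the poll timer and CMCI;
 * cmci_disabled=1 stops only CMCI while polling continues:
 *
 *	# echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *	# echo 0 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *	# echo 1 > /sys/devices/system/machinecheck/machinecheck0/cmci_disabled
 */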
2209
2210static ssize_t store_int_with_restart(struct device *s,
2211				      struct device_attribute *attr,
2212				      const char *buf, size_t size)
2213{
2214	ssize_t ret = device_store_int(s, attr, buf, size);
2215	mce_restart();
2216	return ret;
2217}
2218
2219static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2220static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2221static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2222static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2223
2224static struct dev_ext_attribute dev_attr_check_interval = {
2225	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2226	&check_interval
2227};
2228
2229static struct dev_ext_attribute dev_attr_ignore_ce = {
2230	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2231	&mca_cfg.ignore_ce
2232};
2233
2234static struct dev_ext_attribute dev_attr_cmci_disabled = {
2235	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2236	&mca_cfg.cmci_disabled
2237};
2238
2239static struct device_attribute *mce_device_attrs[] = {
2240	&dev_attr_tolerant.attr,
2241	&dev_attr_check_interval.attr,
2242	&dev_attr_trigger,
2243	&dev_attr_monarch_timeout.attr,
2244	&dev_attr_dont_log_ce.attr,
2245	&dev_attr_ignore_ce.attr,
2246	&dev_attr_cmci_disabled.attr,
2247	NULL
2248};
2249
2250static cpumask_var_t mce_device_initialized;
2251
2252static void mce_device_release(struct device *dev)
2253{
2254	kfree(dev);
2255}
2256
2257/* Per CPU device init. All CPUs still share the same global bank controls (mce_banks[]): */
2258static int mce_device_create(unsigned int cpu)
2259{
2260	struct device *dev;
2261	int err;
2262	int i, j;
2263
2264	if (!mce_available(&boot_cpu_data))
2265		return -EIO;
2266
2267	dev = kzalloc(sizeof *dev, GFP_KERNEL);
2268	if (!dev)
2269		return -ENOMEM;
2270	dev->id  = cpu;
2271	dev->bus = &mce_subsys;
2272	dev->release = &mce_device_release;
2273
2274	err = device_register(dev);
2275	if (err) {
2276		put_device(dev);
2277		return err;
2278	}
2279
2280	for (i = 0; mce_device_attrs[i]; i++) {
2281		err = device_create_file(dev, mce_device_attrs[i]);
2282		if (err)
2283			goto error;
2284	}
2285	for (j = 0; j < mca_cfg.banks; j++) {
2286		err = device_create_file(dev, &mce_banks[j].attr);
2287		if (err)
2288			goto error2;
2289	}
2290	cpumask_set_cpu(cpu, mce_device_initialized);
2291	per_cpu(mce_device, cpu) = dev;
2292
2293	return 0;
2294error2:
2295	while (--j >= 0)
2296		device_remove_file(dev, &mce_banks[j].attr);
2297error:
2298	while (--i >= 0)
2299		device_remove_file(dev, mce_device_attrs[i]);
2300
2301	device_unregister(dev);
2302
2303	return err;
2304}
2305
2306static void mce_device_remove(unsigned int cpu)
2307{
2308	struct device *dev = per_cpu(mce_device, cpu);
2309	int i;
2310
2311	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2312		return;
2313
2314	for (i = 0; mce_device_attrs[i]; i++)
2315		device_remove_file(dev, mce_device_attrs[i]);
2316
2317	for (i = 0; i < mca_cfg.banks; i++)
2318		device_remove_file(dev, &mce_banks[i].attr);
2319
2320	device_unregister(dev);
2321	cpumask_clear_cpu(cpu, mce_device_initialized);
2322	per_cpu(mce_device, cpu) = NULL;
2323}
2324
2325/* Make sure there are no machine checks on offlined CPUs. */
2326static void mce_disable_cpu(void *h)
2327{
2328	unsigned long action = *(unsigned long *)h;
2329	int i;
2330
2331	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2332		return;
2333
2334	if (!(action & CPU_TASKS_FROZEN))
2335		cmci_clear();
2336	for (i = 0; i < mca_cfg.banks; i++) {
2337		struct mce_bank *b = &mce_banks[i];
2338
2339		if (b->init)
2340			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2341	}
2342}
2343
2344static void mce_reenable_cpu(void *h)
2345{
2346	unsigned long action = *(unsigned long *)h;
2347	int i;
2348
2349	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2350		return;
2351
2352	if (!(action & CPU_TASKS_FROZEN))
2353		cmci_reenable();
2354	for (i = 0; i < mca_cfg.banks; i++) {
2355		struct mce_bank *b = &mce_banks[i];
2356
2357		if (b->init)
2358			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2359	}
2360}
2361
2362/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2363static int
2364mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2365{
2366	unsigned int cpu = (unsigned long)hcpu;
2367	struct timer_list *t = &per_cpu(mce_timer, cpu);
2368
2369	switch (action & ~CPU_TASKS_FROZEN) {
2370	case CPU_ONLINE:
2371		mce_device_create(cpu);
2372		if (threshold_cpu_callback)
2373			threshold_cpu_callback(action, cpu);
2374		break;
2375	case CPU_DEAD:
2376		if (threshold_cpu_callback)
2377			threshold_cpu_callback(action, cpu);
2378		mce_device_remove(cpu);
2379		mce_intel_hcpu_update(cpu);
2380		break;
2381	case CPU_DOWN_PREPARE:
2382		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2383		del_timer_sync(t);
2384		break;
2385	case CPU_DOWN_FAILED:
2386		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2387		mce_start_timer(cpu, t);
2388		break;
2389	}
2390
2391	if (action == CPU_POST_DEAD) {
2392		/* intentionally ignoring frozen here */
2393		cmci_rediscover();
2394	}
2395
2396	return NOTIFY_OK;
2397}
2398
2399static struct notifier_block mce_cpu_notifier = {
2400	.notifier_call = mce_cpu_callback,
2401};
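/*
 * Rough sequence, following the generic hotplug notifier ordering: taking
 * a CPU offline ("echo 0 > /sys/devices/system/cpu/cpuN/online") arrives
 * here as CPU_DOWN_PREPARE (banks disabled, poll timer stopped), then
 * CPU_DEAD (sysfs device removed), then CPU_POST_DEAD (CMCI ownership
 * rediscovered on the remaining CPUs). CPU_DOWN_FAILED undoes
 * CPU_DOWN_PREPARE when the offline attempt is aborted.
 */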
2402
2403static __init void mce_init_banks(void)
2404{
2405	int i;
2406
2407	for (i = 0; i < mca_cfg.banks; i++) {
2408		struct mce_bank *b = &mce_banks[i];
2409		struct device_attribute *a = &b->attr;
2410
2411		sysfs_attr_init(&a->attr);
2412		a->attr.name	= b->attrname;
2413		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2414
2415		a->attr.mode	= 0644;
2416		a->show		= show_bank;
2417		a->store	= set_bank;
2418	}
2419}
2420
2421static __init int mcheck_init_device(void)
2422{
2423	int err;
2424	int i = 0;
2425
2426	if (!mce_available(&boot_cpu_data))
2427		return -EIO;
2428
2429	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL))
		return -ENOMEM;
2430
2431	mce_init_banks();
2432
2433	err = subsys_system_register(&mce_subsys, NULL);
2434	if (err)
2435		return err;
2436
2437	for_each_online_cpu(i) {
2438		err = mce_device_create(i);
2439		if (err)
2440			return err;
2441	}
2442
2443	register_syscore_ops(&mce_syscore_ops);
2444	register_hotcpu_notifier(&mce_cpu_notifier);
2445
2446	/* register character device /dev/mcelog */
2447	err = misc_register(&mce_chrdev_device);
2448
2449	return err;
2450}
2451device_initcall_sync(mcheck_init_device);
2452
2453/*
2454 * Old style boot options parsing. Only for compatibility.
2455 */
2456static int __init mcheck_disable(char *str)
2457{
2458	mca_cfg.disabled = true;
2459	return 1;
2460}
2461__setup("nomce", mcheck_disable);
2462
2463#ifdef CONFIG_DEBUG_FS
2464struct dentry *mce_get_debugfs_dir(void)
2465{
2466	static struct dentry *dmce;
2467
2468	if (!dmce)
2469		dmce = debugfs_create_dir("mce", NULL);
2470
2471	return dmce;
2472}
2473
2474static void mce_reset(void)
2475{
2476	cpu_missing = 0;
2477	atomic_set(&mce_fake_paniced, 0);
2478	atomic_set(&mce_executing, 0);
2479	atomic_set(&mce_callin, 0);
2480	atomic_set(&global_nwo, 0);
2481}
2482
2483static int fake_panic_get(void *data, u64 *val)
2484{
2485	*val = fake_panic;
2486	return 0;
2487}
2488
2489static int fake_panic_set(void *data, u64 val)
2490{
2491	mce_reset();
2492	fake_panic = val;
2493	return 0;
2494}
2495
2496DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2497			fake_panic_set, "%llu\n");
2498
2499static int __init mcheck_debugfs_init(void)
2500{
2501	struct dentry *dmce, *ffake_panic;
2502
2503	dmce = mce_get_debugfs_dir();
2504	if (!dmce)
2505		return -ENOMEM;
2506	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2507					  &fake_panic_fops);
2508	if (!ffake_panic)
2509		return -ENOMEM;
2510
2511	return 0;
2512}
2513late_initcall(mcheck_debugfs_init);
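/*
 * Example (assuming debugfs is mounted at /sys/kernel/debug): with
 * fake_panic set, the MCE panic path only logs instead of actually
 * panicking, which is meant for testing:
 *
 *	# echo 1 > /sys/kernel/debug/mce/fake_panic
 *	# cat /sys/kernel/debug/mce/fake_panic
 *	1
 */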
2514#endif
2515