mce.c revision 55babd8f41f122f5f4c7cebf520c766c983282c6
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/thread_info.h>
14#include <linux/capability.h>
15#include <linux/miscdevice.h>
16#include <linux/ratelimit.h>
17#include <linux/kallsyms.h>
18#include <linux/rcupdate.h>
19#include <linux/kobject.h>
20#include <linux/uaccess.h>
21#include <linux/kdebug.h>
22#include <linux/kernel.h>
23#include <linux/percpu.h>
24#include <linux/string.h>
25#include <linux/device.h>
26#include <linux/syscore_ops.h>
27#include <linux/delay.h>
28#include <linux/ctype.h>
29#include <linux/sched.h>
30#include <linux/sysfs.h>
31#include <linux/types.h>
32#include <linux/slab.h>
33#include <linux/init.h>
34#include <linux/kmod.h>
35#include <linux/poll.h>
36#include <linux/nmi.h>
37#include <linux/cpu.h>
38#include <linux/smp.h>
39#include <linux/fs.h>
40#include <linux/mm.h>
41#include <linux/debugfs.h>
42#include <linux/irq_work.h>
43#include <linux/export.h>
44
45#include <asm/processor.h>
46#include <asm/mce.h>
47#include <asm/msr.h>
48
49#include "mce-internal.h"
50
51static DEFINE_MUTEX(mce_chrdev_read_mutex);
52
53#define rcu_dereference_check_mce(p) \
54	rcu_dereference_index_check((p), \
55			      rcu_read_lock_sched_held() || \
56			      lockdep_is_held(&mce_chrdev_read_mutex))
57
58#define CREATE_TRACE_POINTS
59#include <trace/events/mce.h>
60
61int mce_disabled __read_mostly;
62
63#define SPINUNIT 100	/* 100ns */
64
65atomic_t mce_entry;
66
67DEFINE_PER_CPU(unsigned, mce_exception_count);
68
69/*
70 * Tolerant levels:
71 *   0: always panic on uncorrected errors, log corrected errors
72 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
73 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
74 *   3: never panic or SIGBUS, log all errors (for testing only)
75 */
76static int			tolerant		__read_mostly = 1;
77static int			banks			__read_mostly;
78static int			rip_msr			__read_mostly;
79static int			mce_bootlog		__read_mostly = -1;
80static int			monarch_timeout		__read_mostly = -1;
81static int			mce_panic_timeout	__read_mostly;
82static int			mce_dont_log_ce		__read_mostly;
83int				mce_cmci_disabled	__read_mostly;
84int				mce_ignore_ce		__read_mostly;
85int				mce_ser			__read_mostly;
86
87struct mce_bank                *mce_banks		__read_mostly;
88
89/* User mode helper program triggered by machine check event */
90static unsigned long		mce_need_notify;
91static char			mce_helper[128];
92static char			*mce_helper_argv[2] = { mce_helper, NULL };
93
94static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
95
96static DEFINE_PER_CPU(struct mce, mces_seen);
97static int			cpu_missing;
98
99/* MCA banks polled by the period polling timer for corrected events */
100DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
101	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
102};
103
104static DEFINE_PER_CPU(struct work_struct, mce_work);
105
106/*
107 * CPU/chipset specific EDAC code can register a notifier call here to print
108 * MCE errors in a human-readable form.
109 */
110ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
111
112/* Do initial initialization of a struct mce */
113void mce_setup(struct mce *m)
114{
115	memset(m, 0, sizeof(struct mce));
116	m->cpu = m->extcpu = smp_processor_id();
117	rdtscll(m->tsc);
118	/* We hope get_seconds stays lockless */
119	m->time = get_seconds();
120	m->cpuvendor = boot_cpu_data.x86_vendor;
121	m->cpuid = cpuid_eax(1);
122	m->socketid = cpu_data(m->extcpu).phys_proc_id;
123	m->apicid = cpu_data(m->extcpu).initial_apicid;
124	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
125}
126
127DEFINE_PER_CPU(struct mce, injectm);
128EXPORT_PER_CPU_SYMBOL_GPL(injectm);
129
130/*
131 * Lockless MCE logging infrastructure.
132 * This avoids deadlocks on printk locks without having to break locks. It also
133 * separates MCEs from kernel messages to avoid bogus bug reports.
134 */
135
136static struct mce_log mcelog = {
137	.signature	= MCE_LOG_SIGNATURE,
138	.len		= MCE_LOG_LEN,
139	.recordlen	= sizeof(struct mce),
140};
141
142void mce_log(struct mce *mce)
143{
144	unsigned next, entry;
145	int ret = 0;
146
147	/* Emit the trace record: */
148	trace_mce_record(mce);
149
150	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
151	if (ret == NOTIFY_STOP)
152		return;
153
154	mce->finished = 0;
155	wmb();
156	for (;;) {
157		entry = rcu_dereference_check_mce(mcelog.next);
158		for (;;) {
159
160			/*
161			 * When the buffer fills up discard new entries.
162			 * Assume that the earlier errors are the more
163			 * interesting ones:
164			 */
165			if (entry >= MCE_LOG_LEN) {
166				set_bit(MCE_OVERFLOW,
167					(unsigned long *)&mcelog.flags);
168				return;
169			}
170			/* Old left over entry. Skip: */
171			if (mcelog.entry[entry].finished) {
172				entry++;
173				continue;
174			}
175			break;
176		}
177		smp_rmb();
178		next = entry + 1;
179		if (cmpxchg(&mcelog.next, entry, next) == entry)
180			break;
181	}
182	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
183	wmb();
184	mcelog.entry[entry].finished = 1;
185	wmb();
186
187	mce->finished = 1;
188	set_bit(0, &mce_need_notify);
189}
190
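/*
 * Called when a new decoder registers: replay the records already
 * sitting in the mcelog buffer (e.g. logged during early boot) through
 * the decoder chain, then clear them so they are not decoded twice.
 */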
191static void drain_mcelog_buffer(void)
192{
193	unsigned int next, i, prev = 0;
194
195	next = ACCESS_ONCE(mcelog.next);
196
197	do {
198		struct mce *m;
199
200		/* drain what was logged during boot */
201		for (i = prev; i < next; i++) {
202			unsigned long start = jiffies;
203			unsigned retries = 1;
204
205			m = &mcelog.entry[i];
206
207			while (!m->finished) {
208				if (time_after_eq(jiffies, start + 2*retries))
209					retries++;
210
211				cpu_relax();
212
213				if (!m->finished && retries >= 4) {
214					pr_err("skipping error being logged currently!\n");
215					break;
216				}
217			}
218			smp_rmb();
219			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
220		}
221
222		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
223		prev = next;
224		next = cmpxchg(&mcelog.next, prev, 0);
225	} while (next != prev);
226}
227
228
229void mce_register_decode_chain(struct notifier_block *nb)
230{
231	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
232	drain_mcelog_buffer();
233}
234EXPORT_SYMBOL_GPL(mce_register_decode_chain);
235
236void mce_unregister_decode_chain(struct notifier_block *nb)
237{
238	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
239}
240EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
241
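/* Dump one MCE record to the console and run it through the decoder chain. */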
242static void print_mce(struct mce *m)
243{
244	int ret = 0;
245
246	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
247	       m->extcpu, m->mcgstatus, m->bank, m->status);
248
249	if (m->ip) {
250		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
251			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
252				m->cs, m->ip);
253
254		if (m->cs == __KERNEL_CS)
255			print_symbol("{%s}", m->ip);
256		pr_cont("\n");
257	}
258
259	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
260	if (m->addr)
261		pr_cont("ADDR %llx ", m->addr);
262	if (m->misc)
263		pr_cont("MISC %llx ", m->misc);
264
265	pr_cont("\n");
266	/*
267	 * Note this output is parsed by external tools and old fields
268	 * should not be changed.
269	 */
270	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
271		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
272		cpu_data(m->extcpu).microcode);
273
274	/*
275	 * Print out human-readable details about the MCE error
276	 * (if the CPU has an implementation for that).
277	 */
278	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
279	if (ret == NOTIFY_STOP)
280		return;
281
282	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
283}
284
285#define PANIC_TIMEOUT 5 /* 5 seconds */
286
287static atomic_t mce_paniced;
288
289static int fake_panic;
290static atomic_t mce_fake_paniced;
291
292/* Panic in progress. Enable interrupts and wait for final IPI */
293static void wait_for_panic(void)
294{
295	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
296
297	preempt_disable();
298	local_irq_enable();
299	while (timeout-- > 0)
300		udelay(1);
301	if (panic_timeout == 0)
302		panic_timeout = mce_panic_timeout;
303	panic("Panicking machine check CPU died");
304}
305
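/*
 * Dump all still-unlogged records from the mcelog buffer, print @final
 * last together with the optional explanation @exp, and panic. When
 * fake_panic is set, only the logging is done.
 */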
306static void mce_panic(char *msg, struct mce *final, char *exp)
307{
308	int i, apei_err = 0;
309
310	if (!fake_panic) {
311		/*
312		 * Make sure only one CPU runs in machine check panic
313		 */
314		if (atomic_inc_return(&mce_paniced) > 1)
315			wait_for_panic();
316		barrier();
317
318		bust_spinlocks(1);
319		console_verbose();
320	} else {
321		/* Don't log too much for fake panic */
322		if (atomic_inc_return(&mce_fake_paniced) > 1)
323			return;
324	}
325	/* First print corrected ones that are still unlogged */
326	for (i = 0; i < MCE_LOG_LEN; i++) {
327		struct mce *m = &mcelog.entry[i];
328		if (!(m->status & MCI_STATUS_VAL))
329			continue;
330		if (!(m->status & MCI_STATUS_UC)) {
331			print_mce(m);
332			if (!apei_err)
333				apei_err = apei_write_mce(m);
334		}
335	}
336	/* Now print uncorrected but with the final one last */
337	for (i = 0; i < MCE_LOG_LEN; i++) {
338		struct mce *m = &mcelog.entry[i];
339		if (!(m->status & MCI_STATUS_VAL))
340			continue;
341		if (!(m->status & MCI_STATUS_UC))
342			continue;
343		if (!final || memcmp(m, final, sizeof(struct mce))) {
344			print_mce(m);
345			if (!apei_err)
346				apei_err = apei_write_mce(m);
347		}
348	}
349	if (final) {
350		print_mce(final);
351		if (!apei_err)
352			apei_err = apei_write_mce(final);
353	}
354	if (cpu_missing)
355		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
356	if (exp)
357		pr_emerg(HW_ERR "Machine check: %s\n", exp);
358	if (!fake_panic) {
359		if (panic_timeout == 0)
360			panic_timeout = mce_panic_timeout;
361		panic(msg);
362	} else
363		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
364}
365
366/* Support code for software error injection */
367
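/*
 * Map an MCA-related MSR to the offset of the corresponding field in
 * struct mce, so reads and writes can be redirected to the injected
 * record. Returns -1 for MSRs that are not emulated.
 */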
368static int msr_to_offset(u32 msr)
369{
370	unsigned bank = __this_cpu_read(injectm.bank);
371
372	if (msr == rip_msr)
373		return offsetof(struct mce, ip);
374	if (msr == MSR_IA32_MCx_STATUS(bank))
375		return offsetof(struct mce, status);
376	if (msr == MSR_IA32_MCx_ADDR(bank))
377		return offsetof(struct mce, addr);
378	if (msr == MSR_IA32_MCx_MISC(bank))
379		return offsetof(struct mce, misc);
380	if (msr == MSR_IA32_MCG_STATUS)
381		return offsetof(struct mce, mcgstatus);
382	return -1;
383}
384
385/* MSR access wrappers used for error injection */
386static u64 mce_rdmsrl(u32 msr)
387{
388	u64 v;
389
390	if (__this_cpu_read(injectm.finished)) {
391		int offset = msr_to_offset(msr);
392
393		if (offset < 0)
394			return 0;
395		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
396	}
397
398	if (rdmsrl_safe(msr, &v)) {
399		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
400		/*
401		 * Return zero in case the access faulted. This should
402		 * not happen normally but can happen if the CPU does
403		 * something weird, or if the code is buggy.
404		 */
405		v = 0;
406	}
407
408	return v;
409}
410
411static void mce_wrmsrl(u32 msr, u64 v)
412{
413	if (__this_cpu_read(injectm.finished)) {
414		int offset = msr_to_offset(msr);
415
416		if (offset >= 0)
417			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
418		return;
419	}
420	wrmsrl(msr, v);
421}
422
423/*
424 * Collect all global (w.r.t. this processor) status about this machine
425 * check into our "mce" struct so that we can use it later to assess
426 * the severity of the problem as we read per-bank specific details.
427 */
428static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
429{
430	mce_setup(m);
431
432	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
433	if (regs) {
434		/*
435		 * Get the address of the instruction at the time of
436		 * the machine check error.
437		 */
438		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
439			m->ip = regs->ip;
440			m->cs = regs->cs;
441
442			/*
443			 * When in VM86 mode make the cs look like ring 3
444			 * always. This is a lie, but it's better than passing
445			 * the additional vm86 bit around everywhere.
446			 */
447			if (v8086_mode(regs))
448				m->cs |= 3;
449		}
450		/* Use accurate RIP reporting if available. */
451		if (rip_msr)
452			m->ip = mce_rdmsrl(rip_msr);
453	}
454}
455
456/*
457 * Simple lockless ring to communicate PFNs from the exception handler to the
458 * process context work function. This is vastly simplified because there's
459 * only a single reader and a single writer.
460 */
461#define MCE_RING_SIZE 16	/* we use one entry less */
462
463struct mce_ring {
464	unsigned short start;
465	unsigned short end;
466	unsigned long ring[MCE_RING_SIZE];
467};
468static DEFINE_PER_CPU(struct mce_ring, mce_ring);
469
470/* Runs with CPU affinity in workqueue */
471static int mce_ring_empty(void)
472{
473	struct mce_ring *r = &__get_cpu_var(mce_ring);
474
475	return r->start == r->end;
476}
477
478static int mce_ring_get(unsigned long *pfn)
479{
480	struct mce_ring *r;
481	int ret = 0;
482
483	*pfn = 0;
484	get_cpu();
485	r = &__get_cpu_var(mce_ring);
486	if (r->start == r->end)
487		goto out;
488	*pfn = r->ring[r->start];
489	r->start = (r->start + 1) % MCE_RING_SIZE;
490	ret = 1;
491out:
492	put_cpu();
493	return ret;
494}
495
496/* Always runs in MCE context with preempt off */
497static int mce_ring_add(unsigned long pfn)
498{
499	struct mce_ring *r = &__get_cpu_var(mce_ring);
500	unsigned next;
501
502	next = (r->end + 1) % MCE_RING_SIZE;
503	if (next == r->start)
504		return -1;
505	r->ring[r->end] = pfn;
506	wmb();
507	r->end = next;
508	return 0;
509}
510
511int mce_available(struct cpuinfo_x86 *c)
512{
513	if (mce_disabled)
514		return 0;
515	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
516}
517
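/* Kick the process context work if the PFN ring has entries queued. */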
518static void mce_schedule_work(void)
519{
520	if (!mce_ring_empty()) {
521		struct work_struct *work = &__get_cpu_var(mce_work);
522		if (!work_pending(work))
523			schedule_work(work);
524	}
525}
526
527DEFINE_PER_CPU(struct irq_work, mce_irq_work);
528
529static void mce_irq_work_cb(struct irq_work *entry)
530{
531	mce_notify_irq();
532	mce_schedule_work();
533}
534
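/*
 * Signal userspace about the event: directly when the interrupted
 * context had interrupts enabled (or was in vm86 mode), otherwise
 * deferred through irq_work.
 */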
535static void mce_report_event(struct pt_regs *regs)
536{
537	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
538		mce_notify_irq();
539		/*
540		 * Triggering the work queue here is just an insurance
541		 * policy in case the syscall exit notify handler
542		 * doesn't run soon enough or ends up running on the
543		 * wrong CPU (can happen when audit sleeps)
544		 */
545		mce_schedule_work();
546		return;
547	}
548
549	irq_work_queue(&__get_cpu_var(mce_irq_work));
550}
551
552/*
553 * Read ADDR and MISC registers.
554 */
555static void mce_read_aux(struct mce *m, int i)
556{
557	if (m->status & MCI_STATUS_MISCV)
558		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
559	if (m->status & MCI_STATUS_ADDRV) {
560		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
561
562		/*
563		 * Mask the reported address by the reported granularity.
564		 */
565		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
566			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
567			m->addr >>= shift;
568			m->addr <<= shift;
569		}
570	}
571}
572
573DEFINE_PER_CPU(unsigned, mce_poll_count);
574
575/*
576 * Poll for corrected events or events that happened before reset.
577 * Those are just logged through /dev/mcelog.
578 *
579 * This is executed in standard interrupt context.
580 *
581 * Note: the spec recommends panicking for fatal unsignalled
582 * errors here. However this would be quite problematic --
583 * we would need to reimplement the Monarch handling and
584 * it would mess up the exclusion between the exception handler
585 * and the poll handler -- so we skip this for now.
586 * These cases should not happen anyway, or only when the CPU
587 * is already totally confused. In this case it's likely it will
588 * not fully execute the machine check handler either.
589 */
590void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
591{
592	struct mce m;
593	int i;
594
595	this_cpu_inc(mce_poll_count);
596
597	mce_gather_info(&m, NULL);
598
599	for (i = 0; i < banks; i++) {
600		if (!mce_banks[i].ctl || !test_bit(i, *b))
601			continue;
602
603		m.misc = 0;
604		m.addr = 0;
605		m.bank = i;
606		m.tsc = 0;
607
608		barrier();
609		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
610		if (!(m.status & MCI_STATUS_VAL))
611			continue;
612
613		/*
614		 * Uncorrected or signalled events are handled by the exception
615		 * handler when it is enabled, so don't process those here.
616		 *
617		 * TBD do the same check for MCI_STATUS_EN here?
618		 */
619		if (!(flags & MCP_UC) &&
620		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
621			continue;
622
623		mce_read_aux(&m, i);
624
625		if (!(flags & MCP_TIMESTAMP))
626			m.tsc = 0;
627		/*
628		 * Don't get the IP here because it's unlikely to
629		 * have anything to do with the actual error location.
630		 */
631		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
632			mce_log(&m);
633
634		/*
635		 * Clear state for this bank.
636		 */
637		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
638	}
639
640	/*
641	 * Don't clear MCG_STATUS here because it's only defined for
642	 * exceptions.
643	 */
644
645	sync_core();
646}
647EXPORT_SYMBOL_GPL(machine_check_poll);
648
649/*
650 * Do a quick check if any of the events requires a panic.
651 * This decides if we keep the events around or clear them.
652 */
653static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
654{
655	int i, ret = 0;
656
657	for (i = 0; i < banks; i++) {
658		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
659		if (m->status & MCI_STATUS_VAL)
660			__set_bit(i, validp);
661		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
662			ret = 1;
663	}
664	return ret;
665}
666
667/*
668 * Variable to establish order between CPUs while scanning.
669 * Each CPU initially spins until mce_executing equals its number.
670 */
671static atomic_t mce_executing;
672
673/*
674 * Defines order of CPUs on entry. First CPU becomes Monarch.
675 */
676static atomic_t mce_callin;
677
678/*
679 * Check if a timeout waiting for other CPUs happened.
680 */
681static int mce_timed_out(u64 *t)
682{
683	/*
684	 * The others already did panic for some reason.
685	 * Bail out like in a timeout.
686	 * rmb() to tell the compiler that system_state
687	 * might have been modified by someone else.
688	 */
689	rmb();
690	if (atomic_read(&mce_paniced))
691		wait_for_panic();
692	if (!monarch_timeout)
693		goto out;
694	if ((s64)*t < SPINUNIT) {
695		/* CHECKME: Make panic default for 1 too? */
696		if (tolerant < 1)
697			mce_panic("Timeout synchronizing machine check over CPUs",
698				  NULL, NULL);
699		cpu_missing = 1;
700		return 1;
701	}
702	*t -= SPINUNIT;
703out:
704	touch_nmi_watchdog();
705	return 0;
706}
707
708/*
709 * The Monarch's reign.  The Monarch is the CPU who entered
710 * the machine check handler first. It waits for the others to
711 * raise the exception too and then grades them. If any
712 * error is fatal, it panics. Only then does it let the others continue.
713 *
714 * The other CPUs entering the MCE handler will be controlled by the
715 * Monarch. They are called Subjects.
716 *
717 * This way we prevent any potential data corruption in an unrecoverable case
718 * and also make sure that all CPUs' errors are always examined.
719 *
720 * Also this detects the case of a machine check event coming from outer
721 * space (not detected by any CPU). In this case some external agent wants
722 * us to shut down, so panic too.
723 *
724 * The other CPUs might still decide to panic if the handler happens
725 * in an unrecoverable place, but in this case the system is in a semi-stable
726 * state and won't corrupt anything by itself. It's ok to let the others
727 * continue for a bit first.
728 *
729 * All the spin loops have timeouts; when a timeout happens a CPU
730 * typically elects itself to be Monarch.
731 */
732static void mce_reign(void)
733{
734	int cpu;
735	struct mce *m = NULL;
736	int global_worst = 0;
737	char *msg = NULL;
738	char *nmsg = NULL;
739
740	/*
741	 * This CPU is the Monarch and the other CPUs have run
742	 * through their handlers.
743	 * Grade the severity of the errors of all the CPUs.
744	 */
745	for_each_possible_cpu(cpu) {
746		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
747					    &nmsg);
748		if (severity > global_worst) {
749			msg = nmsg;
750			global_worst = severity;
751			m = &per_cpu(mces_seen, cpu);
752		}
753	}
754
755	/*
756	 * Cannot recover? Panic here then.
757	 * This dumps all the mces in the log buffer and stops the
758	 * other CPUs.
759	 */
760	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
761		mce_panic("Fatal Machine check", m, msg);
762
763	/*
764	 * For a UC error somewhere we let the CPU that detected it handle it.
765	 * We also must let the others continue, otherwise the handling
766	 * CPU could deadlock on a lock.
767	 */
768
769	/*
770	 * No machine check event found. Must be some external
771	 * source or one CPU is hung. Panic.
772	 */
773	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
774		mce_panic("Machine check from unknown source", NULL, NULL);
775
776	/*
777	 * Now clear all the mces_seen so that they don't reappear on
778	 * the next mce.
779	 */
780	for_each_possible_cpu(cpu)
781		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
782}
783
784static atomic_t global_nwo;
785
786/*
787 * Start of Monarch synchronization. This waits until all CPUs have
788 * entered the exception handler and then determines if any of them
789 * saw a fatal event that requires panic. Then it executes them
790 * in the entry order.
791 * TBD double check parallel CPU hotunplug
792 */
793static int mce_start(int *no_way_out)
794{
795	int order;
796	int cpus = num_online_cpus();
797	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
798
799	if (!timeout)
800		return -1;
801
802	atomic_add(*no_way_out, &global_nwo);
803	/*
804	 * global_nwo should be updated before mce_callin
805	 */
806	smp_wmb();
807	order = atomic_inc_return(&mce_callin);
808
809	/*
810	 * Wait for everyone.
811	 */
812	while (atomic_read(&mce_callin) != cpus) {
813		if (mce_timed_out(&timeout)) {
814			atomic_set(&global_nwo, 0);
815			return -1;
816		}
817		ndelay(SPINUNIT);
818	}
819
820	/*
821	 * mce_callin should be read before global_nwo
822	 */
823	smp_rmb();
824
825	if (order == 1) {
826		/*
827		 * Monarch: Starts executing now, the others wait.
828		 */
829		atomic_set(&mce_executing, 1);
830	} else {
831		/*
832		 * Subject: Now start the scanning loop one by one in
833		 * the original callin order.
834		 * This way, when there are any shared banks, an event is
835		 * seen by only one CPU before it is cleared, avoiding duplicates.
836		 */
837		while (atomic_read(&mce_executing) < order) {
838			if (mce_timed_out(&timeout)) {
839				atomic_set(&global_nwo, 0);
840				return -1;
841			}
842			ndelay(SPINUNIT);
843		}
844	}
845
846	/*
847	 * Cache the global no_way_out state.
848	 */
849	*no_way_out = atomic_read(&global_nwo);
850
851	return order;
852}
853
854/*
855 * Synchronize between CPUs after main scanning loop.
856 * This invokes the bulk of the Monarch processing.
857 */
858static int mce_end(int order)
859{
860	int ret = -1;
861	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
862
863	if (!timeout)
864		goto reset;
865	if (order < 0)
866		goto reset;
867
868	/*
869	 * Allow others to run.
870	 */
871	atomic_inc(&mce_executing);
872
873	if (order == 1) {
874		/* CHECKME: Can this race with a parallel hotplug? */
875		int cpus = num_online_cpus();
876
877		/*
878		 * Monarch: Wait for everyone to go through their scanning
879		 * loops.
880		 */
881		while (atomic_read(&mce_executing) <= cpus) {
882			if (mce_timed_out(&timeout))
883				goto reset;
884			ndelay(SPINUNIT);
885		}
886
887		mce_reign();
888		barrier();
889		ret = 0;
890	} else {
891		/*
892		 * Subject: Wait for Monarch to finish.
893		 */
894		while (atomic_read(&mce_executing) != 0) {
895			if (mce_timed_out(&timeout))
896				goto reset;
897			ndelay(SPINUNIT);
898		}
899
900		/*
901		 * Don't reset anything. That's done by the Monarch.
902		 */
903		return 0;
904	}
905
906	/*
907	 * Reset all global state.
908	 */
909reset:
910	atomic_set(&global_nwo, 0);
911	atomic_set(&mce_callin, 0);
912	barrier();
913
914	/*
915	 * Let others run again.
916	 */
917	atomic_set(&mce_executing, 0);
918	return ret;
919}
920
921/*
922 * Check if the address reported by the CPU is in a format we can parse.
923 * It would be possible to add code for most other cases, but all would
924 * be somewhat complicated (e.g. segment offset would require an instruction
925 * parser). So only support physical addresses up to page granularity for now.
926 */
927static int mce_usable_address(struct mce *m)
928{
929	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
930		return 0;
931	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
932		return 0;
933	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
934		return 0;
935	return 1;
936}
937
938static void mce_clear_state(unsigned long *toclear)
939{
940	int i;
941
942	for (i = 0; i < banks; i++) {
943		if (test_bit(i, toclear))
944			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
945	}
946}
947
948/*
949 * We need to save the faulting physical address associated with a process
950 * in the machine check handler somewhere we can grab it back
951 * later in mce_notify_process().
952 */
953#define	MCE_INFO_MAX	16
954
955struct mce_info {
956	atomic_t		inuse;
957	struct task_struct	*t;
958	__u64			paddr;
959	int			restartable;
960} mce_info[MCE_INFO_MAX];
961
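/*
 * Stash the faulting physical address for 'current' in a free slot;
 * panic if all slots are already in use.
 */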
962static void mce_save_info(__u64 addr, int c)
963{
964	struct mce_info *mi;
965
966	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
967		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
968			mi->t = current;
969			mi->paddr = addr;
970			mi->restartable = c;
971			return;
972		}
973	}
974
975	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
976}
977
978static struct mce_info *mce_find_info(void)
979{
980	struct mce_info *mi;
981
982	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
983		if (atomic_read(&mi->inuse) && mi->t == current)
984			return mi;
985	return NULL;
986}
987
988static void mce_clear_info(struct mce_info *mi)
989{
990	atomic_set(&mi->inuse, 0);
991}
992
993/*
994 * The actual machine check handler. This only handles real
995 * exceptions when something got corrupted coming in through int 18.
996 *
997 * This is executed in NMI context not subject to normal locking rules. This
998 * implies that most kernel services cannot be safely used. Don't even
999 * think about putting a printk in there!
1000 *
1001 * On Intel systems this is entered on all CPUs in parallel through
1002 * MCE broadcast. However some CPUs might be broken beyond repair,
1003 * so always be careful when synchronizing with others.
1004 */
1005void do_machine_check(struct pt_regs *regs, long error_code)
1006{
1007	struct mce m, *final;
1008	int i;
1009	int worst = 0;
1010	int severity;
1011	/*
1012	 * Establish sequential order between the CPUs entering the machine
1013	 * check handler.
1014	 */
1015	int order;
1016	/*
1017	 * If no_way_out gets set, there is no safe way to recover from this
1018	 * MCE.  If tolerant is cranked up, we'll try anyway.
1019	 */
1020	int no_way_out = 0;
1021	/*
1022	 * If kill_it gets set, we may have to kill the current process with
1023	 * SIGBUS to limit the damage from this error.
1024	 */
1025	int kill_it = 0;
1026	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1027	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1028	char *msg = "Unknown";
1029
1030	atomic_inc(&mce_entry);
1031
1032	this_cpu_inc(mce_exception_count);
1033
1034	if (!banks)
1035		goto out;
1036
1037	mce_gather_info(&m, regs);
1038
1039	final = &__get_cpu_var(mces_seen);
1040	*final = m;
1041
1042	memset(valid_banks, 0, sizeof(valid_banks));
1043	no_way_out = mce_no_way_out(&m, &msg, valid_banks);
1044
1045	barrier();
1046
1047	/*
1048	 * When there is no restart IP we might need to kill or panic.
1049	 * Assume the worst for now, but if we find the
1050	 * severity is MCE_AR_SEVERITY we have other options.
1051	 */
1052	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1053		kill_it = 1;
1054
1055	/*
1056	 * Go through all the banks in exclusion of the other CPUs.
1057	 * This way we don't report duplicated events on shared banks
1058	 * because the first one to see it will clear it.
1059	 */
1060	order = mce_start(&no_way_out);
1061	for (i = 0; i < banks; i++) {
1062		__clear_bit(i, toclear);
1063		if (!test_bit(i, valid_banks))
1064			continue;
1065		if (!mce_banks[i].ctl)
1066			continue;
1067
1068		m.misc = 0;
1069		m.addr = 0;
1070		m.bank = i;
1071
1072		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1073		if ((m.status & MCI_STATUS_VAL) == 0)
1074			continue;
1075
1076		/*
1077		 * Errors that are neither uncorrected nor signaled are handled by
1078		 * machine_check_poll(). Leave them alone, unless this panics.
1079		 */
1080		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1081			!no_way_out)
1082			continue;
1083
1084		/*
1085		 * Set taint even when machine check was not enabled.
1086		 */
1087		add_taint(TAINT_MACHINE_CHECK);
1088
1089		severity = mce_severity(&m, tolerant, NULL);
1090
1091		/*
1092		 * When the machine check was for a corrected error, leave it to the
1093		 * poll handler and don't touch it, unless we're panicking.
1094		 */
1095		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1096			continue;
1097		__set_bit(i, toclear);
1098		if (severity == MCE_NO_SEVERITY) {
1099			/*
1100			 * Machine check event was not enabled. Clear, but
1101			 * ignore.
1102			 */
1103			continue;
1104		}
1105
1106		mce_read_aux(&m, i);
1107
1108		/*
1109		 * Action optional error. Queue address for later processing.
1110		 * When the ring overflows we just ignore the AO error.
1111		 * RED-PEN add some logging mechanism when
1112		 * mce_usable_address() or mce_ring_add() fails.
1113		 * RED-PEN don't ignore overflow for tolerant == 0
1114		 */
1115		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1116			mce_ring_add(m.addr >> PAGE_SHIFT);
1117
1118		mce_log(&m);
1119
1120		if (severity > worst) {
1121			*final = m;
1122			worst = severity;
1123		}
1124	}
1125
1126	/* mce_clear_state will clear *final, save locally for use later */
1127	m = *final;
1128
1129	if (!no_way_out)
1130		mce_clear_state(toclear);
1131
1132	/*
1133	 * Do most of the synchronization with other CPUs.
1134	 * When there's any problem use only local no_way_out state.
1135	 */
1136	if (mce_end(order) < 0)
1137		no_way_out = worst >= MCE_PANIC_SEVERITY;
1138
1139	/*
1140	 * At insane "tolerant" levels we take no action. Otherwise
1141	 * we only die if we have no other choice. For less serious
1142	 * issues we try to recover, or limit damage to the current
1143	 * process.
1144	 */
1145	if (tolerant < 3) {
1146		if (no_way_out)
1147			mce_panic("Fatal machine check on current CPU", &m, msg);
1148		if (worst == MCE_AR_SEVERITY) {
1149			/* schedule action before return to userland */
1150			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1151			set_thread_flag(TIF_MCE_NOTIFY);
1152		} else if (kill_it) {
1153			force_sig(SIGBUS, current);
1154		}
1155	}
1156
1157	if (worst > 0)
1158		mce_report_event(regs);
1159	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1160out:
1161	atomic_dec(&mce_entry);
1162	sync_core();
1163}
1164EXPORT_SYMBOL_GPL(do_machine_check);
1165
1166#ifndef CONFIG_MEMORY_FAILURE
1167int memory_failure(unsigned long pfn, int vector, int flags)
1168{
1169	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1170	BUG_ON(flags & MF_ACTION_REQUIRED);
1171	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1172	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1173	       pfn);
1174
1175	return 0;
1176}
1177#endif
1178
1179/*
1180 * Called in the process context that was interrupted by the MCE and marked
1181 * with TIF_MCE_NOTIFY, just before returning to the erroneous userland.
1182 * This code is allowed to sleep.
1183 * Attempt possible recovery such as calling the high level VM handler to
1184 * process any corrupted pages, and kill/signal the current process if required.
1185 * Action required errors are handled here.
1186 */
1187void mce_notify_process(void)
1188{
1189	unsigned long pfn;
1190	struct mce_info *mi = mce_find_info();
1191	int flags = MF_ACTION_REQUIRED;
1192
1193	if (!mi)
1194		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1195	pfn = mi->paddr >> PAGE_SHIFT;
1196
1197	clear_thread_flag(TIF_MCE_NOTIFY);
1198
1199	pr_err("Uncorrected hardware memory error in user-access at %llx",
1200		 mi->paddr);
1201	/*
1202	 * We must call memory_failure() here even if the current process is
1203	 * doomed. We still need to mark the page as poisoned and alert any
1204	 * other users of the page.
1205	 */
1206	if (!mi->restartable)
1207		flags |= MF_MUST_KILL;
1208	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1209		pr_err("Memory error not recovered");
1210		force_sig(SIGBUS, current);
1211	}
1212	mce_clear_info(mi);
1213}
1214
1215/*
1216 * Action optional processing happens here (picking up
1217 * from the list of faulting pages that do_machine_check()
1218 * placed into the "ring").
1219 */
1220static void mce_process_work(struct work_struct *dummy)
1221{
1222	unsigned long pfn;
1223
1224	while (mce_ring_get(&pfn))
1225		memory_failure(pfn, MCE_VECTOR, 0);
1226}
1227
1228#ifdef CONFIG_X86_MCE_INTEL
1229/**
1230 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1231 * @cpu: The CPU on which the event occurred.
1232 * @status: Event status information
1233 *
1234 * This function should be called by the thermal interrupt after the
1235 * event has been processed and the decision was made to log the event
1236 * further.
1237 *
1238 * The status parameter will be saved to the 'status' field of 'struct mce'
1239 * and historically has been the register value of the
1240 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1241 */
1242void mce_log_therm_throt_event(__u64 status)
1243{
1244	struct mce m;
1245
1246	mce_setup(&m);
1247	m.bank = MCE_THERMAL_BANK;
1248	m.status = status;
1249	mce_log(&m);
1250}
1251#endif /* CONFIG_X86_MCE_INTEL */
1252
1253/*
1254 * Periodic polling timer for "silent" machine check errors.  If the
1255 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1256 * errors, poll 2x slower (up to check_interval seconds).
1257 */
1258static unsigned long check_interval = 5 * 60; /* 5 minutes */
1259
1260static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1261static DEFINE_PER_CPU(struct timer_list, mce_timer);
1262
1263static unsigned long mce_adjust_timer_default(unsigned long interval)
1264{
1265	return interval;
1266}
1267
1268static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1269	mce_adjust_timer_default;
1270
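/*
 * Timer callback: poll the configured banks for corrected errors,
 * notify userspace and re-arm the timer with the adjusted interval.
 */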
1271static void mce_timer_fn(unsigned long data)
1272{
1273	struct timer_list *t = &__get_cpu_var(mce_timer);
1274	unsigned long iv;
1275
1276	WARN_ON(smp_processor_id() != data);
1277
1278	if (mce_available(__this_cpu_ptr(&cpu_info))) {
1279		machine_check_poll(MCP_TIMESTAMP,
1280				&__get_cpu_var(mce_poll_banks));
1281		mce_intel_cmci_poll();
1282	}
1283
1284	/*
1285	 * Alert userspace if needed.  If we logged an MCE, reduce the
1286	 * polling interval, otherwise increase the polling interval.
1287	 */
1288	iv = __this_cpu_read(mce_next_interval);
1289	if (mce_notify_irq()) {
1290		iv = max(iv / 2, (unsigned long) HZ/100);
1291	} else {
1292		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1293		iv = mce_adjust_timer(iv);
1294	}
1295	__this_cpu_write(mce_next_interval, iv);
1296	/* Might have become 0 after CMCI storm subsided */
1297	if (iv) {
1298		t->expires = jiffies + iv;
1299		add_timer_on(t, smp_processor_id());
1300	}
1301}
1302
1303/*
1304 * Ensure that the timer is firing in @interval from now.
1305 */
1306void mce_timer_kick(unsigned long interval)
1307{
1308	struct timer_list *t = &__get_cpu_var(mce_timer);
1309	unsigned long when = jiffies + interval;
1310	unsigned long iv = __this_cpu_read(mce_next_interval);
1311
1312	if (timer_pending(t)) {
1313		if (time_before(when, t->expires))
1314			mod_timer_pinned(t, when);
1315	} else {
1316		t->expires = round_jiffies(when);
1317		add_timer_on(t, smp_processor_id());
1318	}
1319	if (interval < iv)
1320		__this_cpu_write(mce_next_interval, interval);
1321}
1322
1323/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1324static void mce_timer_delete_all(void)
1325{
1326	int cpu;
1327
1328	for_each_online_cpu(cpu)
1329		del_timer_sync(&per_cpu(mce_timer, cpu));
1330}
1331
1332static void mce_do_trigger(struct work_struct *work)
1333{
1334	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1335}
1336
1337static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1338
1339/*
1340 * Notify the user(s) about new machine check events.
1341 * Can be called from interrupt context, but not from machine check/NMI
1342 * context.
1343 */
1344int mce_notify_irq(void)
1345{
1346	/* Not more than two messages every minute */
1347	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1348
1349	if (test_and_clear_bit(0, &mce_need_notify)) {
1350		/* wake processes polling /dev/mcelog */
1351		wake_up_interruptible(&mce_chrdev_wait);
1352
1353		/*
1354		 * There is no risk of missing notifications because
1355		 * work_pending is always cleared before the function is
1356		 * executed.
1357		 */
1358		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1359			schedule_work(&mce_trigger_work);
1360
1361		if (__ratelimit(&ratelimit))
1362			pr_info(HW_ERR "Machine check events logged\n");
1363
1364		return 1;
1365	}
1366	return 0;
1367}
1368EXPORT_SYMBOL_GPL(mce_notify_irq);
1369
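/* Allocate the per-bank control array; all banks start fully enabled. */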
1370static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1371{
1372	int i;
1373
1374	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1375	if (!mce_banks)
1376		return -ENOMEM;
1377	for (i = 0; i < banks; i++) {
1378		struct mce_bank *b = &mce_banks[i];
1379
1380		b->ctl = -1ULL;
1381		b->init = 1;
1382	}
1383	return 0;
1384}
1385
1386/*
1387 * Initialize Machine Checks for a CPU.
1388 */
1389static int __cpuinit __mcheck_cpu_cap_init(void)
1390{
1391	unsigned b;
1392	u64 cap;
1393
1394	rdmsrl(MSR_IA32_MCG_CAP, cap);
1395
1396	b = cap & MCG_BANKCNT_MASK;
1397	if (!banks)
1398		pr_info("CPU supports %d MCE banks\n", b);
1399
1400	if (b > MAX_NR_BANKS) {
1401		pr_warn("Using only %u machine check banks out of %u\n",
1402			MAX_NR_BANKS, b);
1403		b = MAX_NR_BANKS;
1404	}
1405
1406	/* Don't support asymmetric configurations today */
1407	WARN_ON(banks != 0 && b != banks);
1408	banks = b;
1409	if (!mce_banks) {
1410		int err = __mcheck_cpu_mce_banks_init();
1411
1412		if (err)
1413			return err;
1414	}
1415
1416	/* Use accurate RIP reporting if available. */
1417	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1418		rip_msr = MSR_IA32_MCG_EIP;
1419
1420	if (cap & MCG_SER_P)
1421		mce_ser = 1;
1422
1423	return 0;
1424}
1425
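/*
 * Enable machine check reporting on this CPU: scan for anything left
 * over from before the reset, set CR4.MCE and program MCG_CTL plus the
 * per-bank CTL/STATUS registers.
 */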
1426static void __mcheck_cpu_init_generic(void)
1427{
1428	mce_banks_t all_banks;
1429	u64 cap;
1430	int i;
1431
1432	/*
1433	 * Log the machine checks left over from the previous reset.
1434	 */
1435	bitmap_fill(all_banks, MAX_NR_BANKS);
1436	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1437
1438	set_in_cr4(X86_CR4_MCE);
1439
1440	rdmsrl(MSR_IA32_MCG_CAP, cap);
1441	if (cap & MCG_CTL_P)
1442		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1443
1444	for (i = 0; i < banks; i++) {
1445		struct mce_bank *b = &mce_banks[i];
1446
1447		if (!b->init)
1448			continue;
1449		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1450		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1451	}
1452}
1453
1454/* Add per CPU specific workarounds here */
1455static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1456{
1457	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1458		pr_info("unknown CPU type - not enabling MCE support\n");
1459		return -EOPNOTSUPP;
1460	}
1461
1462	/* This should be disabled by the BIOS, but isn't always */
1463	if (c->x86_vendor == X86_VENDOR_AMD) {
1464		if (c->x86 == 15 && banks > 4) {
1465			/*
1466			 * disable GART TBL walk error reporting, which
1467			 * trips off incorrectly with the IOMMU & 3ware
1468			 * & Cerberus:
1469			 */
1470			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1471		}
1472		if (c->x86 <= 17 && mce_bootlog < 0) {
1473			/*
1474			 * Lots of broken BIOSes around that don't clear them
1475			 * by default and leave crap in there. Don't log:
1476			 */
1477			mce_bootlog = 0;
1478		}
1479		/*
1480		 * There are various K7s around with a broken bank 0. Always
1481		 * disable it by default.
1482		 */
1483		 if (c->x86 == 6 && banks > 0)
1484			mce_banks[0].ctl = 0;
1485
1486		 /*
1487		  * Turn off MC4_MISC thresholding banks on those models since
1488		  * they're not supported there.
1489		  */
1490		 if (c->x86 == 0x15 &&
1491		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1492			 int i;
1493			 u64 val, hwcr;
1494			 bool need_toggle;
1495			 u32 msrs[] = {
1496				0x00000413, /* MC4_MISC0 */
1497				0xc0000408, /* MC4_MISC1 */
1498			 };
1499
1500			 rdmsrl(MSR_K7_HWCR, hwcr);
1501
1502			 /* McStatusWrEn has to be set */
1503			 need_toggle = !(hwcr & BIT(18));
1504
1505			 if (need_toggle)
1506				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1507
1508			 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1509				 rdmsrl(msrs[i], val);
1510
1511				 /* CntP bit set? */
1512				 if (val & BIT_64(62)) {
1513					val &= ~BIT_64(62);
1514					wrmsrl(msrs[i], val);
1515				 }
1516			 }
1517
1518			 /* restore old settings */
1519			 if (need_toggle)
1520				 wrmsrl(MSR_K7_HWCR, hwcr);
1521		 }
1522	}
1523
1524	if (c->x86_vendor == X86_VENDOR_INTEL) {
1525		/*
1526		 * The SDM documents that on family 6 bank 0 should not be written
1527		 * because it aliases to another special BIOS-controlled
1528		 * register.
1529		 * But it's not aliased anymore on model 0x1a+.
1530		 * Don't ignore bank 0 completely because there could be a
1531		 * valid event later, merely don't write CTL0.
1532		 */
1533
1534		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1535			mce_banks[0].init = 0;
1536
1537		/*
1538		 * All newer Intel systems support MCE broadcasting. Enable
1539		 * synchronization with a one second timeout.
1540		 */
1541		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1542			monarch_timeout < 0)
1543			monarch_timeout = USEC_PER_SEC;
1544
1545		/*
1546		 * There are also broken BIOSes on some Pentium M and
1547		 * earlier systems:
1548		 */
1549		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1550			mce_bootlog = 0;
1551	}
1552	if (monarch_timeout < 0)
1553		monarch_timeout = 0;
1554	if (mce_bootlog != 0)
1555		mce_panic_timeout = 30;
1556
1557	return 0;
1558}
1559
1560static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1561{
1562	if (c->x86 != 5)
1563		return 0;
1564
1565	switch (c->x86_vendor) {
1566	case X86_VENDOR_INTEL:
1567		intel_p5_mcheck_init(c);
1568		return 1;
1569		break;
1570	case X86_VENDOR_CENTAUR:
1571		winchip_mcheck_init(c);
1572		return 1;
1573		break;
1574	}
1575
1576	return 0;
1577}
1578
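/*
 * Vendor specific feature initialization; the Intel path also installs
 * its own polling-interval adjustment hook.
 */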
1579static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1580{
1581	switch (c->x86_vendor) {
1582	case X86_VENDOR_INTEL:
1583		mce_intel_feature_init(c);
1584		mce_adjust_timer = mce_intel_adjust_timer;
1585		break;
1586	case X86_VENDOR_AMD:
1587		mce_amd_feature_init(c);
1588		break;
1589	default:
1590		break;
1591	}
1592}
1593
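/* Arm the per-CPU polling timer, unless mce=ignore_ce is set or the interval is zero. */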
1594static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1595{
1596	unsigned long iv = mce_adjust_timer(check_interval * HZ);
1597
1598	__this_cpu_write(mce_next_interval, iv);
1599
1600	if (mce_ignore_ce || !iv)
1601		return;
1602
1603	t->expires = round_jiffies(jiffies + iv);
1604	add_timer_on(t, smp_processor_id());
1605}
1606
1607static void __mcheck_cpu_init_timer(void)
1608{
1609	struct timer_list *t = &__get_cpu_var(mce_timer);
1610	unsigned int cpu = smp_processor_id();
1611
1612	setup_timer(t, mce_timer_fn, cpu);
1613	mce_start_timer(cpu, t);
1614}
1615
1616/* Handle unconfigured int18 (should never happen) */
1617static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1618{
1619	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1620	       smp_processor_id());
1621}
1622
1623/* Call the installed machine check handler for this CPU setup. */
1624void (*machine_check_vector)(struct pt_regs *, long error_code) =
1625						unexpected_machine_check;
1626
1627/*
1628 * Called for each booted CPU to set up machine checks.
1629 * Must be called with preempt off:
1630 */
1631void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1632{
1633	if (mce_disabled)
1634		return;
1635
1636	if (__mcheck_cpu_ancient_init(c))
1637		return;
1638
1639	if (!mce_available(c))
1640		return;
1641
1642	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1643		mce_disabled = 1;
1644		return;
1645	}
1646
1647	machine_check_vector = do_machine_check;
1648
1649	__mcheck_cpu_init_generic();
1650	__mcheck_cpu_init_vendor(c);
1651	__mcheck_cpu_init_timer();
1652	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1653	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1654}
1655
1656/*
1657 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1658 */
1659
1660static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1661static int mce_chrdev_open_count;	/* #times opened */
1662static int mce_chrdev_open_exclu;	/* already open exclusive? */
1663
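/* Serialize opens; O_EXCL grants exclusive access to /dev/mcelog. */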
1664static int mce_chrdev_open(struct inode *inode, struct file *file)
1665{
1666	spin_lock(&mce_chrdev_state_lock);
1667
1668	if (mce_chrdev_open_exclu ||
1669	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1670		spin_unlock(&mce_chrdev_state_lock);
1671
1672		return -EBUSY;
1673	}
1674
1675	if (file->f_flags & O_EXCL)
1676		mce_chrdev_open_exclu = 1;
1677	mce_chrdev_open_count++;
1678
1679	spin_unlock(&mce_chrdev_state_lock);
1680
1681	return nonseekable_open(inode, file);
1682}
1683
1684static int mce_chrdev_release(struct inode *inode, struct file *file)
1685{
1686	spin_lock(&mce_chrdev_state_lock);
1687
1688	mce_chrdev_open_count--;
1689	mce_chrdev_open_exclu = 0;
1690
1691	spin_unlock(&mce_chrdev_state_lock);
1692
1693	return 0;
1694}
1695
1696static void collect_tscs(void *data)
1697{
1698	unsigned long *cpu_tsc = (unsigned long *)data;
1699
1700	rdtscll(cpu_tsc[smp_processor_id()]);
1701}
1702
1703static int mce_apei_read_done;
1704
1705/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1706static int __mce_read_apei(char __user **ubuf, size_t usize)
1707{
1708	int rc;
1709	u64 record_id;
1710	struct mce m;
1711
1712	if (usize < sizeof(struct mce))
1713		return -EINVAL;
1714
1715	rc = apei_read_mce(&m, &record_id);
1716	/* Error or no more MCE record */
1717	if (rc <= 0) {
1718		mce_apei_read_done = 1;
1719		/*
1720		 * When ERST is disabled, mce_chrdev_read() should return
1721		 * "no record" instead of "no device."
1722		 */
1723		if (rc == -ENODEV)
1724			return 0;
1725		return rc;
1726	}
1727	rc = -EFAULT;
1728	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1729		return rc;
1730	/*
1731	 * In fact, we should have cleared the record after it has
1732	 * been flushed to disk or sent over the network by
1733	 * /sbin/mcelog, but we have no interface to support that now,
1734	 * so just clear it here to avoid duplication.
1735	 */
1736	rc = apei_clear_mce(record_id);
1737	if (rc) {
1738		mce_apei_read_done = 1;
1739		return rc;
1740	}
1741	*ubuf += sizeof(struct mce);
1742
1743	return 0;
1744}
1745
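/*
 * Read the MCE log: records saved via APEI/ERST from a previous boot
 * first, then the in-memory buffer. Only whole-buffer reads are
 * supported, and entries are cleared as they are copied out.
 */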
1746static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1747				size_t usize, loff_t *off)
1748{
1749	char __user *buf = ubuf;
1750	unsigned long *cpu_tsc;
1751	unsigned prev, next;
1752	int i, err;
1753
1754	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1755	if (!cpu_tsc)
1756		return -ENOMEM;
1757
1758	mutex_lock(&mce_chrdev_read_mutex);
1759
1760	if (!mce_apei_read_done) {
1761		err = __mce_read_apei(&buf, usize);
1762		if (err || buf != ubuf)
1763			goto out;
1764	}
1765
1766	next = rcu_dereference_check_mce(mcelog.next);
1767
1768	/* Only supports full reads right now */
1769	err = -EINVAL;
1770	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1771		goto out;
1772
1773	err = 0;
1774	prev = 0;
1775	do {
1776		for (i = prev; i < next; i++) {
1777			unsigned long start = jiffies;
1778			struct mce *m = &mcelog.entry[i];
1779
1780			while (!m->finished) {
1781				if (time_after_eq(jiffies, start + 2)) {
1782					memset(m, 0, sizeof(*m));
1783					goto timeout;
1784				}
1785				cpu_relax();
1786			}
1787			smp_rmb();
1788			err |= copy_to_user(buf, m, sizeof(*m));
1789			buf += sizeof(*m);
1790timeout:
1791			;
1792		}
1793
1794		memset(mcelog.entry + prev, 0,
1795		       (next - prev) * sizeof(struct mce));
1796		prev = next;
1797		next = cmpxchg(&mcelog.next, prev, 0);
1798	} while (next != prev);
1799
1800	synchronize_sched();
1801
1802	/*
1803	 * Collect entries that were still getting written before the
1804	 * synchronize_sched() above.
1805	 */
1806	on_each_cpu(collect_tscs, cpu_tsc, 1);
1807
1808	for (i = next; i < MCE_LOG_LEN; i++) {
1809		struct mce *m = &mcelog.entry[i];
1810
1811		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1812			err |= copy_to_user(buf, m, sizeof(*m));
1813			smp_rmb();
1814			buf += sizeof(*m);
1815			memset(m, 0, sizeof(*m));
1816		}
1817	}
1818
1819	if (err)
1820		err = -EFAULT;
1821
1822out:
1823	mutex_unlock(&mce_chrdev_read_mutex);
1824	kfree(cpu_tsc);
1825
1826	return err ? err : buf - ubuf;
1827}
1828
1829static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1830{
1831	poll_wait(file, &mce_chrdev_wait, wait);
1832	if (rcu_access_index(mcelog.next))
1833		return POLLIN | POLLRDNORM;
1834	if (!mce_apei_read_done && apei_check_mce())
1835		return POLLIN | POLLRDNORM;
1836	return 0;
1837}
1838
1839static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1840				unsigned long arg)
1841{
1842	int __user *p = (int __user *)arg;
1843
1844	if (!capable(CAP_SYS_ADMIN))
1845		return -EPERM;
1846
1847	switch (cmd) {
1848	case MCE_GET_RECORD_LEN:
1849		return put_user(sizeof(struct mce), p);
1850	case MCE_GET_LOG_LEN:
1851		return put_user(MCE_LOG_LEN, p);
1852	case MCE_GETCLEAR_FLAGS: {
1853		unsigned flags;
1854
1855		do {
1856			flags = mcelog.flags;
1857		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1858
1859		return put_user(flags, p);
1860	}
1861	default:
1862		return -ENOTTY;
1863	}
1864}
1865
1866static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1867			    size_t usize, loff_t *off);
1868
1869void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1870			     const char __user *ubuf,
1871			     size_t usize, loff_t *off))
1872{
1873	mce_write = fn;
1874}
1875EXPORT_SYMBOL_GPL(register_mce_write_callback);
1876
1877ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1878			 size_t usize, loff_t *off)
1879{
1880	if (mce_write)
1881		return mce_write(filp, ubuf, usize, off);
1882	else
1883		return -EINVAL;
1884}
1885
1886static const struct file_operations mce_chrdev_ops = {
1887	.open			= mce_chrdev_open,
1888	.release		= mce_chrdev_release,
1889	.read			= mce_chrdev_read,
1890	.write			= mce_chrdev_write,
1891	.poll			= mce_chrdev_poll,
1892	.unlocked_ioctl		= mce_chrdev_ioctl,
1893	.llseek			= no_llseek,
1894};
1895
1896static struct miscdevice mce_chrdev_device = {
1897	MISC_MCELOG_MINOR,
1898	"mcelog",
1899	&mce_chrdev_ops,
1900};
1901
1902/*
1903 * mce=off Disables machine check
1904 * mce=no_cmci Disables CMCI
1905 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1906 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1907 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1908 *	monarchtimeout is how long to wait for other CPUs on machine
1909 *	check, or 0 to not wait
1910 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1911 * mce=nobootlog Don't log MCEs from before booting.
1912 */
1913static int __init mcheck_enable(char *str)
1914{
1915	if (*str == 0) {
1916		enable_p5_mce();
1917		return 1;
1918	}
1919	if (*str == '=')
1920		str++;
1921	if (!strcmp(str, "off"))
1922		mce_disabled = 1;
1923	else if (!strcmp(str, "no_cmci"))
1924		mce_cmci_disabled = 1;
1925	else if (!strcmp(str, "dont_log_ce"))
1926		mce_dont_log_ce = 1;
1927	else if (!strcmp(str, "ignore_ce"))
1928		mce_ignore_ce = 1;
1929	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1930		mce_bootlog = (str[0] == 'b');
1931	else if (isdigit(str[0])) {
1932		get_option(&str, &tolerant);
1933		if (*str == ',') {
1934			++str;
1935			get_option(&str, &monarch_timeout);
1936		}
1937	} else {
1938		pr_info("mce argument %s ignored. Please use /sys\n", str);
1939		return 0;
1940	}
1941	return 1;
1942}
1943__setup("mce", mcheck_enable);
1944
1945int __init mcheck_init(void)
1946{
1947	mcheck_intel_therm_init();
1948
1949	return 0;
1950}
1951
1952/*
1953 * mce_syscore: PM support
1954 */
1955
1956/*
1957 * Disable machine checks on suspend and shutdown. We can't really handle
1958 * them later.
1959 */
1960static int mce_disable_error_reporting(void)
1961{
1962	int i;
1963
1964	for (i = 0; i < banks; i++) {
1965		struct mce_bank *b = &mce_banks[i];
1966
1967		if (b->init)
1968			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1969	}
1970	return 0;
1971}
1972
1973static int mce_syscore_suspend(void)
1974{
1975	return mce_disable_error_reporting();
1976}
1977
1978static void mce_syscore_shutdown(void)
1979{
1980	mce_disable_error_reporting();
1981}
1982
1983/*
1984 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1985 * Only one CPU is active at this time, the others get re-added later using
1986 * CPU hotplug:
1987 */
1988static void mce_syscore_resume(void)
1989{
1990	__mcheck_cpu_init_generic();
1991	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1992}
1993
1994static struct syscore_ops mce_syscore_ops = {
1995	.suspend	= mce_syscore_suspend,
1996	.shutdown	= mce_syscore_shutdown,
1997	.resume		= mce_syscore_resume,
1998};
1999
2000/*
2001 * mce_device: Sysfs support
2002 */
2003
2004static void mce_cpu_restart(void *data)
2005{
2006	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2007		return;
2008	__mcheck_cpu_init_generic();
2009	__mcheck_cpu_init_timer();
2010}
2011
2012/* Reinit MCEs after user configuration changes */
2013static void mce_restart(void)
2014{
2015	mce_timer_delete_all();
2016	on_each_cpu(mce_cpu_restart, NULL, 1);
2017}
2018
2019/* Toggle features for corrected errors */
2020static void mce_disable_cmci(void *data)
2021{
2022	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2023		return;
2024	cmci_clear();
2025}
2026
2027static void mce_enable_ce(void *all)
2028{
2029	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2030		return;
2031	cmci_reenable();
2032	cmci_recheck();
2033	if (all)
2034		__mcheck_cpu_init_timer();
2035}
2036
2037static struct bus_type mce_subsys = {
2038	.name		= "machinecheck",
2039	.dev_name	= "machinecheck",
2040};
2041
2042DEFINE_PER_CPU(struct device *, mce_device);
2043
2044__cpuinitdata
2045void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2046
2047static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2048{
2049	return container_of(attr, struct mce_bank, attr);
2050}
2051
2052static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2053			 char *buf)
2054{
2055	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2056}
2057
2058static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2059			const char *buf, size_t size)
2060{
2061	u64 new;
2062
2063	if (strict_strtoull(buf, 0, &new) < 0)
2064		return -EINVAL;
2065
2066	attr_to_bank(attr)->ctl = new;
2067	mce_restart();
2068
2069	return size;
2070}
2071
2072static ssize_t
2073show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2074{
2075	strcpy(buf, mce_helper);
2076	strcat(buf, "\n");
2077	return strlen(mce_helper) + 1;
2078}
2079
2080static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2081				const char *buf, size_t siz)
2082{
2083	char *p;
2084
2085	strncpy(mce_helper, buf, sizeof(mce_helper));
2086	mce_helper[sizeof(mce_helper)-1] = 0;
2087	p = strchr(mce_helper, '\n');
2088
2089	if (p)
2090		*p = 0;
2091
2092	return strlen(mce_helper) + !!p;
2093}
2094
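/* sysfs handler: toggle all corrected error handling (polling timer and CMCI). */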
2095static ssize_t set_ignore_ce(struct device *s,
2096			     struct device_attribute *attr,
2097			     const char *buf, size_t size)
2098{
2099	u64 new;
2100
2101	if (strict_strtoull(buf, 0, &new) < 0)
2102		return -EINVAL;
2103
2104	if (mce_ignore_ce ^ !!new) {
2105		if (new) {
2106			/* disable ce features */
2107			mce_timer_delete_all();
2108			on_each_cpu(mce_disable_cmci, NULL, 1);
2109			mce_ignore_ce = 1;
2110		} else {
2111			/* enable ce features */
2112			mce_ignore_ce = 0;
2113			on_each_cpu(mce_enable_ce, (void *)1, 1);
2114		}
2115	}
2116	return size;
2117}
2118
2119static ssize_t set_cmci_disabled(struct device *s,
2120				 struct device_attribute *attr,
2121				 const char *buf, size_t size)
2122{
2123	u64 new;
2124
2125	if (strict_strtoull(buf, 0, &new) < 0)
2126		return -EINVAL;
2127
2128	if (mce_cmci_disabled ^ !!new) {
2129		if (new) {
2130			/* disable cmci */
2131			on_each_cpu(mce_disable_cmci, NULL, 1);
2132			mce_cmci_disabled = 1;
2133		} else {
2134			/* enable cmci */
2135			mce_cmci_disabled = 0;
2136			on_each_cpu(mce_enable_ce, NULL, 1);
2137		}
2138	}
2139	return size;
2140}
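/*
 * In short: ignore_ce=1 stops corrected-error handling completely (both
 * CMCI and the polling timer), while cmci_disabled=1 only turns off CMCI
 * and keeps polling for corrected errors.
 */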

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};
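/*
 * The per-bank bank<i> attributes are not in this list: their number
 * depends on the CPU, so mce_device_create() adds them separately below.
 */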

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/*
 * Per-CPU device init. All CPUs still share the same set of bank control
 * values (mce_banks[]):
 */
static __cpuinit int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof *dev, GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id  = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		/* Drop the reference so mce_device_release() frees dev. */
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static __cpuinit void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		mce_intel_hcpu_update(cpu);
		break;
	case CPU_DOWN_PREPARE:
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		del_timer_sync(t);
		break;
	case CPU_DOWN_FAILED:
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		mce_start_timer(cpu, t);
		break;
	}

	if (action == CPU_POST_DEAD) {
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}
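/*
 * mce_init_banks() only fills in the attribute descriptors; the bank<i>
 * files appear in sysfs once mce_device_create() runs for each CPU.
 */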

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);

	/* register character device /dev/mcelog */
	misc_register(&mce_chrdev_device);

	return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);
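/*
 * Example: booting with "nomce" on the kernel command line turns machine
 * check handling off completely; the newer "mce=off" parameter has the
 * same effect.
 */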

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

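/*
 * Reset the global rendezvous/panic bookkeeping so that a (fake) panic can
 * be triggered again from a clean state; used by fake_panic_set() below.
 */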
static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
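/*
 * With CONFIG_DEBUG_FS this creates <debugfs>/mce/fake_panic (debugfs is
 * typically mounted at /sys/kernel/debug).  Setting it to 1 makes a
 * subsequent MCE panic only print the panic message instead of really
 * panicking, which is useful together with error injection testing.
 */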
#endif