mce.c revision b659294b779565c60f5e12ef505328e2b974eb62
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/ratelimit.h>
14#include <linux/kallsyms.h>
15#include <linux/rcupdate.h>
16#include <linux/smp_lock.h>
17#include <linux/kobject.h>
18#include <linux/kdebug.h>
19#include <linux/kernel.h>
20#include <linux/percpu.h>
21#include <linux/string.h>
22#include <linux/sysdev.h>
23#include <linux/ctype.h>
24#include <linux/sched.h>
25#include <linux/sysfs.h>
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/poll.h>
30#include <linux/cpu.h>
31#include <linux/fs.h>
32
33#include <asm/processor.h>
34#include <asm/uaccess.h>
35#include <asm/idle.h>
36#include <asm/mce.h>
37#include <asm/msr.h>
38#include <asm/smp.h>
39
40#include "mce.h"
41
42#ifdef CONFIG_X86_64
43
44#define MISC_MCELOG_MINOR	227
45
46atomic_t mce_entry;
47
48static int			mce_dont_init;
49
50/*
51 * Tolerant levels:
52 *   0: always panic on uncorrected errors, log corrected errors
53 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
54 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
55 *   3: never panic or SIGBUS, log all errors (for testing only)
56 */
57static int			tolerant = 1;
58static int			banks;
59static u64			*bank;
60static unsigned long		notify_user;
61static int			rip_msr;
62static int			mce_bootlog = -1;
63static atomic_t			mce_events;
64
65static char			trigger[128];
66static char			*trigger_argv[2] = { trigger, NULL };
67
68static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
69
70/* MCA banks polled by the periodic polling timer for corrected events */
71DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
72	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
73};
74
75/* Do basic initialization of a struct mce */
76void mce_setup(struct mce *m)
77{
78	memset(m, 0, sizeof(struct mce));
79	m->cpu = smp_processor_id();
80	rdtscll(m->tsc);
81}
82
83/*
84 * Lockless MCE logging infrastructure.
85 * This avoids deadlocks on printk locks without having to break locks. It
86 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
87 */
88
89static struct mce_log mcelog = {
90	MCE_LOG_SIGNATURE,
91	MCE_LOG_LEN,
92};
93
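/*
 * mce_log() reserves a slot by advancing mcelog.next with cmpxchg, copies
 * the record in and only then sets the entry's 'finished' flag, so readers
 * never see a partially written record.
 */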
94void mce_log(struct mce *mce)
95{
96	unsigned next, entry;
97
98	atomic_inc(&mce_events);
99	mce->finished = 0;
100	wmb();
101	for (;;) {
102		entry = rcu_dereference(mcelog.next);
103		for (;;) {
104			/*
105			 * When the buffer fills up, discard new entries.
106			 * Assume that the earlier errors are the more
107			 * interesting ones:
108			 */
109			if (entry >= MCE_LOG_LEN) {
110				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
111				return;
112			}
113			/* Old left over entry. Skip: */
114			if (mcelog.entry[entry].finished) {
115				entry++;
116				continue;
117			}
118			break;
119		}
120		smp_rmb();
121		next = entry + 1;
122		if (cmpxchg(&mcelog.next, entry, next) == entry)
123			break;
124	}
125	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
126	wmb();
127	mcelog.entry[entry].finished = 1;
128	wmb();
129
130	set_bit(0, &notify_user);
131}
132
133static void print_mce(struct mce *m)
134{
135	printk(KERN_EMERG "\n"
136	       KERN_EMERG "HARDWARE ERROR\n"
137	       KERN_EMERG
138	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
139	       m->cpu, m->mcgstatus, m->bank, m->status);
140	if (m->ip) {
141		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
142		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
143		       m->cs, m->ip);
144		if (m->cs == __KERNEL_CS)
145			print_symbol("{%s}", m->ip);
146		printk("\n");
147	}
148	printk(KERN_EMERG "TSC %llx ", m->tsc);
149	if (m->addr)
150		printk("ADDR %llx ", m->addr);
151	if (m->misc)
152		printk("MISC %llx ", m->misc);
153	printk("\n");
154	printk(KERN_EMERG "This is not a software problem!\n");
155	printk(KERN_EMERG "Run through mcelog --ascii to decode "
156	       "and contact your hardware vendor\n");
157}
158
159static void mce_panic(char *msg, struct mce *backup, unsigned long start)
160{
161	int i;
162
163	oops_begin();
164	for (i = 0; i < MCE_LOG_LEN; i++) {
165		unsigned long tsc = mcelog.entry[i].tsc;
166
167		if (time_before(tsc, start))
168			continue;
169		print_mce(&mcelog.entry[i]);
170		if (backup && mcelog.entry[i].tsc == backup->tsc)
171			backup = NULL;
172	}
173	if (backup)
174		print_mce(backup);
175	panic(msg);
176}
177
178int mce_available(struct cpuinfo_x86 *c)
179{
180	if (mce_dont_init)
181		return 0;
182	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
183}
184
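/*
 * Record the IP/CS the machine check should be attributed to: take them
 * from pt_regs when RIPV says they are valid, and prefer the dedicated
 * RIP MSR when the CPU provides one.
 */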
185static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
186{
187	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
188		m->ip = regs->ip;
189		m->cs = regs->cs;
190	} else {
191		m->ip = 0;
192		m->cs = 0;
193	}
194	if (rip_msr) {
195		/* Assume the RIP in the MSR is exact. Is this true? */
196		m->mcgstatus |= MCG_STATUS_EIPV;
197		rdmsrl(rip_msr, m->ip);
198		m->cs = 0;
199	}
200}
201
202/*
203 * Poll for corrected events or events that happened before reset.
204 * Those are just logged through /dev/mcelog.
205 *
206 * This is executed in standard interrupt context.
207 */
208void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
209{
210	struct mce m;
211	int i;
212
213	mce_setup(&m);
214
215	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
216	for (i = 0; i < banks; i++) {
217		if (!bank[i] || !test_bit(i, *b))
218			continue;
219
220		m.misc = 0;
221		m.addr = 0;
222		m.bank = i;
223		m.tsc = 0;
224
225		barrier();
226		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
227		if (!(m.status & MCI_STATUS_VAL))
228			continue;
229
230		/*
231		 * Uncorrected events are handled by the exception handler
232		 * when it is enabled. But when the exception is disabled, log
233		 * everything.
234		 *
235		 * TBD do the same check for MCI_STATUS_EN here?
236		 */
237		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
238			continue;
239
240		if (m.status & MCI_STATUS_MISCV)
241			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
242		if (m.status & MCI_STATUS_ADDRV)
243			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244
245		if (!(flags & MCP_TIMESTAMP))
246			m.tsc = 0;
247		/*
248		 * Don't get the IP here because it's unlikely to
249		 * have anything to do with the actual error location.
250		 */
251		if (!(flags & MCP_DONTLOG)) {
252			mce_log(&m);
253			add_taint(TAINT_MACHINE_CHECK);
254		}
255
256		/*
257		 * Clear state for this bank.
258		 */
259		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
260	}
261
262	/*
263	 * Don't clear MCG_STATUS here because it's only defined for
264	 * exceptions.
265	 */
266}
267
268/*
269 * The actual machine check handler. This only handles real
270 * exceptions when something got corrupted coming in through int 18.
271 *
272 * This is executed in NMI context not subject to normal locking rules. This
273 * implies that most kernel services cannot be safely used. Don't even
274 * think about putting a printk in there!
275 */
276void do_machine_check(struct pt_regs *regs, long error_code)
277{
278	struct mce m, panicm;
279	int panicm_found = 0;
280	u64 mcestart = 0;
281	int i;
282	/*
283	 * If no_way_out gets set, there is no safe way to recover from this
284	 * MCE.  If tolerant is cranked up, we'll try anyway.
285	 */
286	int no_way_out = 0;
287	/*
288	 * If kill_it gets set, there might be a way to recover from this
289	 * error.
290	 */
291	int kill_it = 0;
292	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
293
294	atomic_inc(&mce_entry);
295
296	if (notify_die(DIE_NMI, "machine check", regs, error_code,
297			   18, SIGKILL) == NOTIFY_STOP)
298		goto out2;
299	if (!banks)
300		goto out2;
301
302	mce_setup(&m);
303
304	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
305
306	/* if the restart IP is not valid, we're done for */
307	if (!(m.mcgstatus & MCG_STATUS_RIPV))
308		no_way_out = 1;
309
310	rdtscll(mcestart);
311	barrier();
312
313	for (i = 0; i < banks; i++) {
314		__clear_bit(i, toclear);
315		if (!bank[i])
316			continue;
317
318		m.misc = 0;
319		m.addr = 0;
320		m.bank = i;
321
322		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
323		if ((m.status & MCI_STATUS_VAL) == 0)
324			continue;
325
326		/*
327		 * Non-uncorrected errors are handled by machine_check_poll().
328		 * Leave them alone.
329		 */
330		if ((m.status & MCI_STATUS_UC) == 0)
331			continue;
332
333		/*
334		 * Set taint even when machine check was not enabled.
335		 */
336		add_taint(TAINT_MACHINE_CHECK);
337
338		__set_bit(i, toclear);
339
340		if (m.status & MCI_STATUS_EN) {
341			/* if PCC was set, there's no way out */
342			no_way_out |= !!(m.status & MCI_STATUS_PCC);
343			/*
344			 * If this error was uncorrectable and there was
345			 * an overflow, we're in trouble.  If no overflow,
346			 * we might get away with just killing a task.
347			 */
348			if (m.status & MCI_STATUS_UC) {
349				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
350					no_way_out = 1;
351				kill_it = 1;
352			}
353		} else {
354			/*
355			 * Machine check event was not enabled. Clear, but
356			 * ignore.
357			 */
358			continue;
359		}
360
361		if (m.status & MCI_STATUS_MISCV)
362			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
363		if (m.status & MCI_STATUS_ADDRV)
364			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
365
366		mce_get_rip(&m, regs);
367		mce_log(&m);
368
369		/*
370		 * Did this bank cause the exception?
371		 *
372		 * Assume that the bank with uncorrectable errors did it,
373		 * and that there is only a single one:
374		 */
375		if ((m.status & MCI_STATUS_UC) &&
376					(m.status & MCI_STATUS_EN)) {
377			panicm = m;
378			panicm_found = 1;
379		}
380	}
381
382	/*
383	 * If we didn't find an uncorrectable error, pick
384	 * the last one (shouldn't happen, just being safe).
385	 */
386	if (!panicm_found)
387		panicm = m;
388
389	/*
390	 * If we have decided that we just CAN'T continue, and the user
391	 * has not set tolerant to an insane level, give up and die.
392	 */
393	if (no_way_out && tolerant < 3)
394		mce_panic("Machine check", &panicm, mcestart);
395
396	/*
397	 * If the error seems to be unrecoverable, something should be
398	 * done.  Try to kill as little as possible.  If we can kill just
399	 * one task, do that.  If the user has set the tolerance very
400	 * high, don't try to do anything at all.
401	 */
402	if (kill_it && tolerant < 3) {
403		int user_space = 0;
404
405		/*
406		 * If the EIPV bit is set, it means the saved IP is the
407		 * instruction which caused the MCE.
408		 */
409		if (m.mcgstatus & MCG_STATUS_EIPV)
410			user_space = panicm.ip && (panicm.cs & 3);
411
412		/*
413		 * If we know that the error was in user space, send a
414		 * SIGBUS.  Otherwise, panic if tolerance is low.
415		 *
416		 * force_sig() takes an awful lot of locks and has a slight
417		 * risk of deadlocking.
418		 */
419		if (user_space) {
420			force_sig(SIGBUS, current);
421		} else if (panic_on_oops || tolerant < 2) {
422			mce_panic("Uncorrected machine check",
423				&panicm, mcestart);
424		}
425	}
426
427	/* notify userspace ASAP */
428	set_thread_flag(TIF_MCE_NOTIFY);
429
430	/* the last thing we do is clear state */
431	for (i = 0; i < banks; i++) {
432		if (test_bit(i, toclear))
433			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
434	}
435	wrmsrl(MSR_IA32_MCG_STATUS, 0);
436 out2:
437	atomic_dec(&mce_entry);
438}
439
440#ifdef CONFIG_X86_MCE_INTEL
441/**
442 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
444 * @status: Event status information
445 *
446 * This function should be called by the thermal interrupt after the
447 * event has been processed and the decision was made to log the event
448 * further.
449 *
450 * The status parameter will be saved to the 'status' field of 'struct mce'
451 * and historically has been the register value of the
452 * MSR_IA32_THERMAL_STATUS (Intel) MSR.
453 */
454void mce_log_therm_throt_event(__u64 status)
455{
456	struct mce m;
457
458	mce_setup(&m);
459	m.bank = MCE_THERMAL_BANK;
460	m.status = status;
461	mce_log(&m);
462}
463#endif /* CONFIG_X86_MCE_INTEL */
464
465/*
466 * Periodic polling timer for "silent" machine check errors.  If the
467 * poller finds an MCE, poll 2x faster.  When the poller finds no more
468 * errors, poll 2x slower (up to check_interval seconds).
469 */
470static int check_interval = 5 * 60; /* 5 minutes */
471
472static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
473static DEFINE_PER_CPU(struct timer_list, mce_timer);
474
475static void mcheck_timer(unsigned long data)
476{
477	struct timer_list *t = &per_cpu(mce_timer, data);
478	int *n;
479
480	WARN_ON(smp_processor_id() != data);
481
482	if (mce_available(&current_cpu_data)) {
483		machine_check_poll(MCP_TIMESTAMP,
484				&__get_cpu_var(mce_poll_banks));
485	}
486
487	/*
488	 * Alert userspace if needed.  If we logged an MCE, reduce the
489	 * polling interval, otherwise increase the polling interval.
490	 */
491	n = &__get_cpu_var(next_interval);
492	if (mce_notify_user()) {
493		*n = max(*n/2, HZ/100);
494	} else {
495		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
496	}
497
498	t->expires = jiffies + *n;
499	add_timer(t);
500}
501
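/* Run the user space trigger program from process context. */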
502static void mce_do_trigger(struct work_struct *work)
503{
504	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
505}
506
507static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
508
509/*
510 * Notify the user(s) about new machine check events.
511 * Can be called from interrupt context, but not from machine check/NMI
512 * context.
513 */
514int mce_notify_user(void)
515{
516	/* Not more than two messages every minute */
517	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
518
519	clear_thread_flag(TIF_MCE_NOTIFY);
520
521	if (test_and_clear_bit(0, &notify_user)) {
522		wake_up_interruptible(&mce_wait);
523
524		/*
525		 * There is no risk of missing notifications because
526		 * work_pending is always cleared before the function is
527		 * executed.
528		 */
529		if (trigger[0] && !work_pending(&mce_trigger_work))
530			schedule_work(&mce_trigger_work);
531
532		if (__ratelimit(&ratelimit))
533			printk(KERN_INFO "Machine check events logged\n");
534
535		return 1;
536	}
537	return 0;
538}
539
540/* see if the idle task needs to notify userspace: */
541static int
542mce_idle_callback(struct notifier_block *nfb, unsigned long action,
543		  void *unused)
544{
545	/* IDLE_END should be safe - interrupts are back on */
546	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
547		mce_notify_user();
548
549	return NOTIFY_OK;
550}
551
552static struct notifier_block mce_idle_notifier = {
553	.notifier_call		= mce_idle_callback,
554};
555
556static __init int periodic_mcheck_init(void)
557{
558	idle_notifier_register(&mce_idle_notifier);
559	return 0;
560}
561__initcall(periodic_mcheck_init);
562
563/*
564 * Initialize Machine Checks for a CPU.
565 */
566static int mce_cap_init(void)
567{
568	unsigned b;
569	u64 cap;
570
571	rdmsrl(MSR_IA32_MCG_CAP, cap);
572	b = cap & 0xff;
573	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
574
575	if (b > MAX_NR_BANKS) {
576		printk(KERN_WARNING
577		       "MCE: Using only %u machine check banks out of %u\n",
578			MAX_NR_BANKS, b);
579		b = MAX_NR_BANKS;
580	}
581
582	/* Don't support asymmetric configurations today */
583	WARN_ON(banks != 0 && b != banks);
584	banks = b;
585	if (!bank) {
586		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
587		if (!bank)
588			return -ENOMEM;
589		memset(bank, 0xff, banks * sizeof(u64));
590	}
591
592	/* Use accurate RIP reporting if available. */
593	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
594		rip_msr = MSR_IA32_MCG_EIP;
595
596	return 0;
597}
598
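/*
 * Per-CPU hardware init: log errors left over from before the reset,
 * enable machine checks in CR4 and program the MCG/bank control MSRs.
 */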
599static void mce_init(void *dummy)
600{
601	mce_banks_t all_banks;
602	u64 cap;
603	int i;
604
605	/*
606	 * Log the machine checks left over from the previous reset.
607	 */
608	bitmap_fill(all_banks, MAX_NR_BANKS);
609	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
610
611	set_in_cr4(X86_CR4_MCE);
612
613	rdmsrl(MSR_IA32_MCG_CAP, cap);
614	if (cap & MCG_CTL_P)
615		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
616
617	for (i = 0; i < banks; i++) {
618		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
619		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
620	}
621}
622
623/* Add per CPU specific workarounds here */
624static void mce_cpu_quirks(struct cpuinfo_x86 *c)
625{
626	/* This should be disabled by the BIOS, but isn't always */
627	if (c->x86_vendor == X86_VENDOR_AMD) {
628		if (c->x86 == 15 && banks > 4) {
629			/*
630			 * disable GART TBL walk error reporting, which
631			 * trips off incorrectly with the IOMMU & 3ware
632			 * & Cerberus:
633			 */
634			clear_bit(10, (unsigned long *)&bank[4]);
635		}
636		if (c->x86 <= 17 && mce_bootlog < 0) {
637			/*
638			 * Lots of broken BIOSes around that don't clear them
639			 * by default and leave crap in there. Don't log:
640			 */
641			mce_bootlog = 0;
642		}
643	}
644
645}
646
647static void mce_cpu_features(struct cpuinfo_x86 *c)
648{
649	switch (c->x86_vendor) {
650	case X86_VENDOR_INTEL:
651		mce_intel_feature_init(c);
652		break;
653	case X86_VENDOR_AMD:
654		mce_amd_feature_init(c);
655		break;
656	default:
657		break;
658	}
659}
660
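/* Arm the per-CPU polling timer on the current CPU. */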
661static void mce_init_timer(void)
662{
663	struct timer_list *t = &__get_cpu_var(mce_timer);
664	int *n = &__get_cpu_var(next_interval);
665
666	*n = check_interval * HZ;
667	if (!*n)
668		return;
669	setup_timer(t, mcheck_timer, smp_processor_id());
670	t->expires = round_jiffies(jiffies + *n);
671	add_timer(t);
672}
673
674/*
675 * Called for each booted CPU to set up machine checks.
676 * Must be called with preempt off:
677 */
678void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
679{
680	if (!mce_available(c))
681		return;
682
683	if (mce_cap_init() < 0) {
684		mce_dont_init = 1;
685		return;
686	}
687	mce_cpu_quirks(c);
688
689	mce_init(NULL);
690	mce_cpu_features(c);
691	mce_init_timer();
692}
693
694/*
695 * Character device to read and clear the MCE log.
696 */
697
698static DEFINE_SPINLOCK(mce_state_lock);
699static int		open_count;		/* #times opened */
700static int		open_exclu;		/* already open exclusive? */
701
702static int mce_open(struct inode *inode, struct file *file)
703{
704	lock_kernel();
705	spin_lock(&mce_state_lock);
706
707	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
708		spin_unlock(&mce_state_lock);
709		unlock_kernel();
710
711		return -EBUSY;
712	}
713
714	if (file->f_flags & O_EXCL)
715		open_exclu = 1;
716	open_count++;
717
718	spin_unlock(&mce_state_lock);
719	unlock_kernel();
720
721	return nonseekable_open(inode, file);
722}
723
724static int mce_release(struct inode *inode, struct file *file)
725{
726	spin_lock(&mce_state_lock);
727
728	open_count--;
729	open_exclu = 0;
730
731	spin_unlock(&mce_state_lock);
732
733	return 0;
734}
735
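/* Read the TSC on the calling CPU; run on every CPU by mce_read(). */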
736static void collect_tscs(void *data)
737{
738	unsigned long *cpu_tsc = (unsigned long *)data;
739
740	rdtscll(cpu_tsc[smp_processor_id()]);
741}
742
743static DEFINE_MUTEX(mce_read_mutex);
744
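/*
 * Read out the whole MCE log in one go. Entries still being written get a
 * short grace period; stragglers from racing writers are picked up in a
 * second pass after synchronize_sched(), using the per-CPU TSCs to skip
 * records logged after the reader reset mcelog.next.
 */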
745static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
746			loff_t *off)
747{
748	char __user *buf = ubuf;
749	unsigned long *cpu_tsc;
750	unsigned prev, next;
751	int i, err;
752
753	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
754	if (!cpu_tsc)
755		return -ENOMEM;
756
757	mutex_lock(&mce_read_mutex);
758	next = rcu_dereference(mcelog.next);
759
760	/* Only supports full reads right now */
761	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
762		mutex_unlock(&mce_read_mutex);
763		kfree(cpu_tsc);
764
765		return -EINVAL;
766	}
767
768	err = 0;
769	prev = 0;
770	do {
771		for (i = prev; i < next; i++) {
772			unsigned long start = jiffies;
773
774			while (!mcelog.entry[i].finished) {
775				if (time_after_eq(jiffies, start + 2)) {
776					memset(mcelog.entry + i, 0,
777					       sizeof(struct mce));
778					goto timeout;
779				}
780				cpu_relax();
781			}
782			smp_rmb();
783			err |= copy_to_user(buf, mcelog.entry + i,
784					    sizeof(struct mce));
785			buf += sizeof(struct mce);
786timeout:
787			;
788		}
789
790		memset(mcelog.entry + prev, 0,
791		       (next - prev) * sizeof(struct mce));
792		prev = next;
793		next = cmpxchg(&mcelog.next, prev, 0);
794	} while (next != prev);
795
796	synchronize_sched();
797
798	/*
799	 * Collect entries that were still getting written before the
800	 * synchronize.
801	 */
802	on_each_cpu(collect_tscs, cpu_tsc, 1);
803
804	for (i = next; i < MCE_LOG_LEN; i++) {
805		if (mcelog.entry[i].finished &&
806		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
807			err |= copy_to_user(buf, mcelog.entry+i,
808					    sizeof(struct mce));
809			smp_rmb();
810			buf += sizeof(struct mce);
811			memset(&mcelog.entry[i], 0, sizeof(struct mce));
812		}
813	}
814	mutex_unlock(&mce_read_mutex);
815	kfree(cpu_tsc);
816
817	return err ? -EFAULT : buf - ubuf;
818}
819
820static unsigned int mce_poll(struct file *file, poll_table *wait)
821{
822	poll_wait(file, &mce_wait, wait);
823	if (rcu_dereference(mcelog.next))
824		return POLLIN | POLLRDNORM;
825	return 0;
826}
827
828static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
829{
830	int __user *p = (int __user *)arg;
831
832	if (!capable(CAP_SYS_ADMIN))
833		return -EPERM;
834
835	switch (cmd) {
836	case MCE_GET_RECORD_LEN:
837		return put_user(sizeof(struct mce), p);
838	case MCE_GET_LOG_LEN:
839		return put_user(MCE_LOG_LEN, p);
840	case MCE_GETCLEAR_FLAGS: {
841		unsigned flags;
842
843		do {
844			flags = mcelog.flags;
845		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
846
847		return put_user(flags, p);
848	}
849	default:
850		return -ENOTTY;
851	}
852}
853
854static const struct file_operations mce_chrdev_ops = {
855	.open			= mce_open,
856	.release		= mce_release,
857	.read			= mce_read,
858	.poll			= mce_poll,
859	.unlocked_ioctl		= mce_ioctl,
860};
861
862static struct miscdevice mce_log_device = {
863	MISC_MCELOG_MINOR,
864	"mcelog",
865	&mce_chrdev_ops,
866};
867
868/*
869 * Old style boot options parsing. Only for compatibility.
870 */
871static int __init mcheck_disable(char *str)
872{
873	mce_dont_init = 1;
874	return 1;
875}
876__setup("nomce", mcheck_disable);
877
878/*
879 * mce=off disables machine check
880 * mce=TOLERANCELEVEL (number, see above)
881 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
882 * mce=nobootlog Don't log MCEs from before booting.
883 */
884static int __init mcheck_enable(char *str)
885{
886	if (!strcmp(str, "off"))
887		mce_dont_init = 1;
888	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
889		mce_bootlog = (str[0] == 'b');
890	else if (isdigit(str[0]))
891		get_option(&str, &tolerant);
892	else {
893		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
894		       str);
895		return 0;
896	}
897	return 1;
898}
899__setup("mce=", mcheck_enable);
900
901/*
902 * Sysfs support
903 */
904
905/*
906 * Disable machine checks on suspend and shutdown. We can't really handle
907 * them later.
908 */
909static int mce_disable(void)
910{
911	int i;
912
913	for (i = 0; i < banks; i++)
914		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
915	return 0;
916}
917
918static int mce_suspend(struct sys_device *dev, pm_message_t state)
919{
920	return mce_disable();
921}
922
923static int mce_shutdown(struct sys_device *dev)
924{
925	return mce_disable();
926}
927
928/*
929 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
930 * Only one CPU is active at this time, the others get re-added later using
931 * CPU hotplug:
932 */
933static int mce_resume(struct sys_device *dev)
934{
935	mce_init(NULL);
936	mce_cpu_features(&current_cpu_data);
937
938	return 0;
939}
940
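/* Per-CPU part of mce_restart(): reprogram the banks and rearm the timer. */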
941static void mce_cpu_restart(void *data)
942{
943	del_timer_sync(&__get_cpu_var(mce_timer));
944	if (mce_available(&current_cpu_data))
945		mce_init(NULL);
946	mce_init_timer();
947}
948
949/* Reinit MCEs after user configuration changes */
950static void mce_restart(void)
951{
952	on_each_cpu(mce_cpu_restart, NULL, 1);
953}
954
955static struct sysdev_class mce_sysclass = {
956	.suspend	= mce_suspend,
957	.shutdown	= mce_shutdown,
958	.resume		= mce_resume,
959	.name		= "machinecheck",
960};
961
962DEFINE_PER_CPU(struct sys_device, mce_dev);
963
964__cpuinitdata
965void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
966
967/* Why are there no generic functions for this? */
968#define ACCESSOR(name, var, start) \
969	static ssize_t show_ ## name(struct sys_device *s,		\
970				     struct sysdev_attribute *attr,	\
971				     char *buf) {			\
972		return sprintf(buf, "%lx\n", (unsigned long)var);	\
973	}								\
974	static ssize_t set_ ## name(struct sys_device *s,		\
975				    struct sysdev_attribute *attr,	\
976				    const char *buf, size_t siz) {	\
977		char *end;						\
978		unsigned long new = simple_strtoul(buf, &end, 0);	\
979									\
980		if (end == buf)						\
981			return -EINVAL;					\
982		var = new;						\
983		start;							\
984									\
985		return end-buf;						\
986	}								\
987	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
988
989static struct sysdev_attribute *bank_attrs;
990
991static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
992			 char *buf)
993{
994	u64 b = bank[attr - bank_attrs];
995
996	return sprintf(buf, "%llx\n", b);
997}
998
999static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1000			const char *buf, size_t siz)
1001{
1002	char *end;
1003	u64 new = simple_strtoull(buf, &end, 0);
1004
1005	if (end == buf)
1006		return -EINVAL;
1007
1008	bank[attr - bank_attrs] = new;
1009	mce_restart();
1010
1011	return end-buf;
1012}
1013
1014static ssize_t
1015show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1016{
1017	strcpy(buf, trigger);
1018	strcat(buf, "\n");
1019	return strlen(trigger) + 1;
1020}
1021
1022static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1023				const char *buf, size_t siz)
1024{
1025	char *p;
1026	int len;
1027
1028	strncpy(trigger, buf, sizeof(trigger));
1029	trigger[sizeof(trigger)-1] = 0;
1030	len = strlen(trigger);
1031	p = strchr(trigger, '\n');
1032
1033	if (p)
1034		*p = 0;
1035
1036	return len;
1037}
1038
1039static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1040static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1041
1042ACCESSOR(check_interval, check_interval, mce_restart())
1043
1044static struct sysdev_attribute *mce_attrs[] = {
1045	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
1046	NULL
1047};
1048
1049static cpumask_var_t mce_dev_initialized;
1050
1051/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1052static __cpuinit int mce_create_device(unsigned int cpu)
1053{
1054	int err;
1055	int i;
1056
1057	if (!mce_available(&boot_cpu_data))
1058		return -EIO;
1059
1060	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1061	per_cpu(mce_dev, cpu).id	= cpu;
1062	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1063
1064	err = sysdev_register(&per_cpu(mce_dev, cpu));
1065	if (err)
1066		return err;
1067
1068	for (i = 0; mce_attrs[i]; i++) {
1069		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1070		if (err)
1071			goto error;
1072	}
1073	for (i = 0; i < banks; i++) {
1074		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1075					&bank_attrs[i]);
1076		if (err)
1077			goto error2;
1078	}
1079	cpumask_set_cpu(cpu, mce_dev_initialized);
1080
1081	return 0;
1082error2:
1083	while (--i >= 0)
1084		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1085error:
1086	while (--i >= 0)
1087		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1088
1089	sysdev_unregister(&per_cpu(mce_dev, cpu));
1090
1091	return err;
1092}
1093
1094static __cpuinit void mce_remove_device(unsigned int cpu)
1095{
1096	int i;
1097
1098	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1099		return;
1100
1101	for (i = 0; mce_attrs[i]; i++)
1102		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1103
1104	for (i = 0; i < banks; i++)
1105		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1106
1107	sysdev_unregister(&per_cpu(mce_dev, cpu));
1108	cpumask_clear_cpu(cpu, mce_dev_initialized);
1109}
1110
1111/* Make sure there are no machine checks on offlined CPUs. */
1112static void mce_disable_cpu(void *h)
1113{
1114	unsigned long action = *(unsigned long *)h;
1115	int i;
1116
1117	if (!mce_available(&current_cpu_data))
1118		return;
1119	if (!(action & CPU_TASKS_FROZEN))
1120		cmci_clear();
1121	for (i = 0; i < banks; i++)
1122		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1123}
1124
1125static void mce_reenable_cpu(void *h)
1126{
1127	unsigned long action = *(unsigned long *)h;
1128	int i;
1129
1130	if (!mce_available(&current_cpu_data))
1131		return;
1132
1133	if (!(action & CPU_TASKS_FROZEN))
1134		cmci_reenable();
1135	for (i = 0; i < banks; i++)
1136		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1137}
1138
1139/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1140static int __cpuinit
1141mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1142{
1143	unsigned int cpu = (unsigned long)hcpu;
1144	struct timer_list *t = &per_cpu(mce_timer, cpu);
1145
1146	switch (action) {
1147	case CPU_ONLINE:
1148	case CPU_ONLINE_FROZEN:
1149		mce_create_device(cpu);
1150		if (threshold_cpu_callback)
1151			threshold_cpu_callback(action, cpu);
1152		break;
1153	case CPU_DEAD:
1154	case CPU_DEAD_FROZEN:
1155		if (threshold_cpu_callback)
1156			threshold_cpu_callback(action, cpu);
1157		mce_remove_device(cpu);
1158		break;
1159	case CPU_DOWN_PREPARE:
1160	case CPU_DOWN_PREPARE_FROZEN:
1161		del_timer_sync(t);
1162		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1163		break;
1164	case CPU_DOWN_FAILED:
1165	case CPU_DOWN_FAILED_FROZEN:
1166		t->expires = round_jiffies(jiffies +
1167					per_cpu(next_interval, cpu));
1168		add_timer_on(t, cpu);
1169		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1170		break;
1171	case CPU_POST_DEAD:
1172		/* intentionally ignoring frozen here */
1173		cmci_rediscover(cpu);
1174		break;
1175	}
1176	return NOTIFY_OK;
1177}
1178
1179static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1180	.notifier_call = mce_cpu_callback,
1181};
1182
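/*
 * Allocate one sysfs attribute ("bank0" ... "bankN") per MCE bank so the
 * per-bank control value can be read and changed from userspace.
 */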
1183static __init int mce_init_banks(void)
1184{
1185	int i;
1186
1187	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1188				GFP_KERNEL);
1189	if (!bank_attrs)
1190		return -ENOMEM;
1191
1192	for (i = 0; i < banks; i++) {
1193		struct sysdev_attribute *a = &bank_attrs[i];
1194
1195		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1196		if (!a->attr.name)
1197			goto nomem;
1198
1199		a->attr.mode	= 0644;
1200		a->show		= show_bank;
1201		a->store	= set_bank;
1202	}
1203	return 0;
1204
1205nomem:
1206	while (--i >= 0)
1207		kfree(bank_attrs[i].attr.name);
1208	kfree(bank_attrs);
1209	bank_attrs = NULL;
1210
1211	return -ENOMEM;
1212}
1213
1214static __init int mce_init_device(void)
1215{
1216	int err;
1217	int i = 0;
1218
1219	if (!mce_available(&boot_cpu_data))
1220		return -EIO;
1221
1222	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1223
1224	err = mce_init_banks();
1225	if (err)
1226		return err;
1227
1228	err = sysdev_class_register(&mce_sysclass);
1229	if (err)
1230		return err;
1231
1232	for_each_online_cpu(i) {
1233		err = mce_create_device(i);
1234		if (err)
1235			return err;
1236	}
1237
1238	register_hotcpu_notifier(&mce_cpu_notifier);
1239	misc_register(&mce_log_device);
1240
1241	return err;
1242}
1243
1244device_initcall(mce_init_device);
1245
1246#else /* CONFIG_X86_32: */
1247
1248int mce_disabled;
1249
1250int nr_mce_banks;
1251EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1252
1253/* Handle unconfigured int18 (should never happen) */
1254static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1255{
1256	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1257	       smp_processor_id());
1258}
1259
1260/* Call the installed machine check handler for this CPU setup. */
1261void (*machine_check_vector)(struct pt_regs *, long error_code) =
1262						unexpected_machine_check;
1263
1264/* This has to be run for each processor */
1265void mcheck_init(struct cpuinfo_x86 *c)
1266{
1267	if (mce_disabled == 1)
1268		return;
1269
1270	switch (c->x86_vendor) {
1271	case X86_VENDOR_AMD:
1272		amd_mcheck_init(c);
1273		break;
1274
1275	case X86_VENDOR_INTEL:
1276		if (c->x86 == 5)
1277			intel_p5_mcheck_init(c);
1278		if (c->x86 == 6)
1279			intel_p6_mcheck_init(c);
1280		if (c->x86 == 15)
1281			intel_p4_mcheck_init(c);
1282		break;
1283
1284	case X86_VENDOR_CENTAUR:
1285		if (c->x86 == 5)
1286			winchip_mcheck_init(c);
1287		break;
1288
1289	default:
1290		break;
1291	}
1292	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1293}
1294
1295static int __init mcheck_disable(char *str)
1296{
1297	mce_disabled = 1;
1298	return 1;
1299}
1300
1301static int __init mcheck_enable(char *str)
1302{
1303	mce_disabled = -1;
1304	return 1;
1305}
1306
1307__setup("nomce", mcheck_disable);
1308__setup("mce", mcheck_enable);
1309
1310#endif /* CONFIG_X86_32 */
1311