mce.c revision 8e97aef5f43ec715f394bc15015ff263b80c3ad6
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/ratelimit.h>
14#include <linux/kallsyms.h>
15#include <linux/rcupdate.h>
16#include <linux/smp_lock.h>
17#include <linux/kobject.h>
18#include <linux/kdebug.h>
19#include <linux/kernel.h>
20#include <linux/percpu.h>
21#include <linux/string.h>
22#include <linux/sysdev.h>
23#include <linux/ctype.h>
24#include <linux/sched.h>
25#include <linux/sysfs.h>
26#include <linux/types.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/poll.h>
30#include <linux/cpu.h>
31#include <linux/fs.h>
32
33#include <asm/processor.h>
34#include <asm/uaccess.h>
35#include <asm/idle.h>
36#include <asm/mce.h>
37#include <asm/msr.h>
38#include <asm/smp.h>
39
40#include "mce.h"
41
42/* Handle unconfigured int18 (should never happen) */
43static void unexpected_machine_check(struct pt_regs *regs, long error_code)
44{
45	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
46	       smp_processor_id());
47}
48
49/* Call the installed machine check handler for this CPU setup. */
50void (*machine_check_vector)(struct pt_regs *, long error_code) =
51						unexpected_machine_check;
52
53int				mce_disabled;
54
55#ifdef CONFIG_X86_64
56
57#define MISC_MCELOG_MINOR	227
58
59atomic_t mce_entry;
60
61/*
62 * Tolerant levels:
63 *   0: always panic on uncorrected errors, log corrected errors
64 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
65 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
66 *   3: never panic or SIGBUS, log all errors (for testing only)
67 */
68static int			tolerant = 1;
69static int			banks;
70static u64			*bank;
71static unsigned long		notify_user;
72static int			rip_msr;
73static int			mce_bootlog = -1;
74static atomic_t			mce_events;
75
76static char			trigger[128];
77static char			*trigger_argv[2] = { trigger, NULL };
78
79static unsigned long		dont_init_banks;
80
81static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
82
83/* MCA banks polled by the periodic polling timer for corrected events */
84DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
85	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
86};
87
88static inline int skip_bank_init(int i)
89{
90	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
91}
92
93/* Do the initial setup of a struct mce */
94void mce_setup(struct mce *m)
95{
96	memset(m, 0, sizeof(struct mce));
97	m->cpu = smp_processor_id();
98	rdtscll(m->tsc);
99}
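
/*
 * Illustrative sketch, not part of this file: callers are expected to
 * initialize a record with mce_setup(), fill in the event specific fields
 * and hand it to mce_log().  machine_check_poll() and
 * mce_log_therm_throt_event() below follow exactly this pattern:
 *
 *	struct mce m;
 *
 *	mce_setup(&m);			(cpu and tsc are filled in)
 *	m.bank   = some_bank;		(hypothetical values)
 *	m.status = some_status;
 *	mce_log(&m);
 */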
100
101/*
102 * Lockless MCE logging infrastructure.
103 * This avoids deadlocks on printk locks without having to break locks. It also
104 * keeps MCEs separate from other kernel messages to avoid bogus bug reports.
105 */
106
107static struct mce_log mcelog = {
108	MCE_LOG_SIGNATURE,
109	MCE_LOG_LEN,
110};
111
112void mce_log(struct mce *mce)
113{
114	unsigned next, entry;
115
116	atomic_inc(&mce_events);
117	mce->finished = 0;
118	wmb();
119	for (;;) {
120		entry = rcu_dereference(mcelog.next);
121		for (;;) {
122			/*
123			 * When the buffer fills up, discard new entries.
124			 * Assume that the earlier errors are the more
125			 * interesting ones:
126			 */
127			if (entry >= MCE_LOG_LEN) {
128				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
129				return;
130			}
131			/* Old left over entry. Skip: */
132			if (mcelog.entry[entry].finished) {
133				entry++;
134				continue;
135			}
136			break;
137		}
138		smp_rmb();
139		next = entry + 1;
140		if (cmpxchg(&mcelog.next, entry, next) == entry)
141			break;
142	}
143	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
144	wmb();
145	mcelog.entry[entry].finished = 1;
146	wmb();
147
148	set_bit(0, &notify_user);
149}
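
/*
 * Summary of the lockless protocol above, added for clarity:
 *
 *  - a writer reserves a free slot by advancing mcelog.next with cmpxchg(),
 *    so concurrent machine checks never claim the same entry;
 *  - the record is copied first and ->finished is set to 1 afterwards,
 *    with wmb() in between, so a reader that observes finished == 1 is
 *    guaranteed to see a completely written record;
 *  - readers (see mce_read() below) therefore have to check ->finished
 *    and issue a read barrier before trusting the other fields.
 */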
150
151static void print_mce(struct mce *m)
152{
153	printk(KERN_EMERG "\n"
154	       KERN_EMERG "HARDWARE ERROR\n"
155	       KERN_EMERG
156	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
157	       m->cpu, m->mcgstatus, m->bank, m->status);
158	if (m->ip) {
159		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
160		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
161		       m->cs, m->ip);
162		if (m->cs == __KERNEL_CS)
163			print_symbol("{%s}", m->ip);
164		printk("\n");
165	}
166	printk(KERN_EMERG "TSC %llx ", m->tsc);
167	if (m->addr)
168		printk("ADDR %llx ", m->addr);
169	if (m->misc)
170		printk("MISC %llx ", m->misc);
171	printk("\n");
172	printk(KERN_EMERG "This is not a software problem!\n");
173	printk(KERN_EMERG "Run through mcelog --ascii to decode "
174	       "and contact your hardware vendor\n");
175}
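
/*
 * For illustration only, the console output produced above looks roughly
 * like this (all register values are made up):
 *
 *	HARDWARE ERROR
 *	CPU 2: Machine Check Exception:                4 Bank 4: b200000000070f0f
 *	RIP 10:<ffffffff8024accd> {do_machine_check+0x0/0x4f0}
 *	TSC 27ae84fa8b12a ADDR 3f2a8000
 *	This is not a software problem!
 *	Run through mcelog --ascii to decode and contact your hardware vendor
 */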
176
177static void mce_panic(char *msg, struct mce *backup, u64 start)
178{
179	int i;
180
181	oops_begin();
182	for (i = 0; i < MCE_LOG_LEN; i++) {
183		u64 tsc = mcelog.entry[i].tsc;
184
185		if ((s64)(tsc - start) < 0)
186			continue;
187		print_mce(&mcelog.entry[i]);
188		if (backup && mcelog.entry[i].tsc == backup->tsc)
189			backup = NULL;
190	}
191	if (backup)
192		print_mce(backup);
193	panic(msg);
194}
195
196int mce_available(struct cpuinfo_x86 *c)
197{
198	if (mce_disabled)
199		return 0;
200	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
201}
202
203static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
204{
205	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
206		m->ip = regs->ip;
207		m->cs = regs->cs;
208	} else {
209		m->ip = 0;
210		m->cs = 0;
211	}
212	if (rip_msr) {
213		/* Assume the RIP in the MSR is exact. Is this true? */
214		m->mcgstatus |= MCG_STATUS_EIPV;
215		rdmsrl(rip_msr, m->ip);
216		m->cs = 0;
217	}
218}
219
220/*
221 * Poll for corrected events or events that happened before reset.
222 * Those are just logged through /dev/mcelog.
223 *
224 * This is executed in standard interrupt context.
225 */
226void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
227{
228	struct mce m;
229	int i;
230
231	mce_setup(&m);
232
233	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
234	for (i = 0; i < banks; i++) {
235		if (!bank[i] || !test_bit(i, *b))
236			continue;
237
238		m.misc = 0;
239		m.addr = 0;
240		m.bank = i;
241		m.tsc = 0;
242
243		barrier();
244		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
245		if (!(m.status & MCI_STATUS_VAL))
246			continue;
247
248		/*
249		 * Uncorrected events are handled by the exception handler
250		 * when it is enabled. But when the exception handler is
251		 * disabled, log everything.
252		 *
253		 * TBD do the same check for MCI_STATUS_EN here?
254		 */
255		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
256			continue;
257
258		if (m.status & MCI_STATUS_MISCV)
259			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
260		if (m.status & MCI_STATUS_ADDRV)
261			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
262
263		if (!(flags & MCP_TIMESTAMP))
264			m.tsc = 0;
265		/*
266		 * Don't get the IP here because it's unlikely to
267		 * have anything to do with the actual error location.
268		 */
269		if (!(flags & MCP_DONTLOG)) {
270			mce_log(&m);
271			add_taint(TAINT_MACHINE_CHECK);
272		}
273
274		/*
275		 * Clear state for this bank.
276		 */
277		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
278	}
279
280	/*
281	 * Don't clear MCG_STATUS here because it's only defined for
282	 * exceptions.
283	 */
284}
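
/*
 * Illustrative call patterns; both appear verbatim later in this file.
 *
 * Periodic polling from mcheck_timer(), timestamped, against the per-CPU
 * bank mask:
 *
 *	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));
 *
 * Boot-time sweep from mce_init(), which also picks up uncorrected events
 * left over from before the reset and may suppress logging:
 *
 *	bitmap_fill(all_banks, MAX_NR_BANKS);
 *	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 */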
285
286/*
287 * The actual machine check handler. This only handles real
288 * exceptions when something got corrupted coming in through int 18.
289 *
290 * This is executed in NMI context not subject to normal locking rules. This
291 * implies that most kernel services cannot be safely used. Don't even
292 * think about putting a printk in there!
293 */
294void do_machine_check(struct pt_regs *regs, long error_code)
295{
296	struct mce m, panicm;
297	int panicm_found = 0;
298	u64 mcestart = 0;
299	int i;
300	/*
301	 * If no_way_out gets set, there is no safe way to recover from this
302	 * MCE.  If tolerant is cranked up, we'll try anyway.
303	 */
304	int no_way_out = 0;
305	/*
306	 * If kill_it gets set, we may be able to recover from this error
307	 * by killing the affected task instead of panicking.
308	 */
309	int kill_it = 0;
310	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
311
312	atomic_inc(&mce_entry);
313
314	if (notify_die(DIE_NMI, "machine check", regs, error_code,
315			   18, SIGKILL) == NOTIFY_STOP)
316		goto out2;
317	if (!banks)
318		goto out2;
319
320	mce_setup(&m);
321
322	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
323
324	/* if the restart IP is not valid, we're done for */
325	if (!(m.mcgstatus & MCG_STATUS_RIPV))
326		no_way_out = 1;
327
328	rdtscll(mcestart);
329	barrier();
330
331	for (i = 0; i < banks; i++) {
332		__clear_bit(i, toclear);
333		if (!bank[i])
334			continue;
335
336		m.misc = 0;
337		m.addr = 0;
338		m.bank = i;
339
340		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
341		if ((m.status & MCI_STATUS_VAL) == 0)
342			continue;
343
344		/*
345		 * Errors that are not flagged as uncorrected are handled by
346		 * machine_check_poll(). Leave them alone.
347		 */
348		if ((m.status & MCI_STATUS_UC) == 0)
349			continue;
350
351		/*
352		 * Set taint even when machine check was not enabled.
353		 */
354		add_taint(TAINT_MACHINE_CHECK);
355
356		__set_bit(i, toclear);
357
358		if (m.status & MCI_STATUS_EN) {
359			/* if PCC was set, there's no way out */
360			no_way_out |= !!(m.status & MCI_STATUS_PCC);
361			/*
362			 * If this error was uncorrectable and there was
363			 * an overflow, we're in trouble.  If no overflow,
364			 * we might get away with just killing a task.
365			 */
366			if (m.status & MCI_STATUS_UC) {
367				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
368					no_way_out = 1;
369				kill_it = 1;
370			}
371		} else {
372			/*
373			 * Machine check event was not enabled. Clear the bank,
374			 * but otherwise ignore the event.
375			 */
376			continue;
377		}
378
379		if (m.status & MCI_STATUS_MISCV)
380			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
381		if (m.status & MCI_STATUS_ADDRV)
382			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
383
384		mce_get_rip(&m, regs);
385		mce_log(&m);
386
387		/*
388		 * Did this bank cause the exception?
389		 *
390		 * Assume that the bank with uncorrectable errors did it,
391		 * and that there is only a single one:
392		 */
393		if ((m.status & MCI_STATUS_UC) &&
394					(m.status & MCI_STATUS_EN)) {
395			panicm = m;
396			panicm_found = 1;
397		}
398	}
399
400	/*
401	 * If we didn't find an uncorrectable error, pick
402	 * the last one (shouldn't happen, just being safe).
403	 */
404	if (!panicm_found)
405		panicm = m;
406
407	/*
408	 * If we have decided that we just CAN'T continue, and the user
409	 * has not set tolerant to an insane level, give up and die.
410	 */
411	if (no_way_out && tolerant < 3)
412		mce_panic("Machine check", &panicm, mcestart);
413
414	/*
415	 * If the error seems to be unrecoverable, something should be
416	 * done.  Try to kill as little as possible.  If we can kill just
417	 * one task, do that.  If the user has set the tolerance very
418	 * high, don't try to do anything at all.
419	 */
420	if (kill_it && tolerant < 3) {
421		int user_space = 0;
422
423		/*
424		 * If the EIPV bit is set, it means the saved IP is the
425		 * instruction which caused the MCE.
426		 */
427		if (m.mcgstatus & MCG_STATUS_EIPV)
428			user_space = panicm.ip && (panicm.cs & 3);
429
430		/*
431		 * If we know that the error was in user space, send a
432		 * SIGBUS.  Otherwise, panic if tolerance is low.
433		 *
434		 * force_sig() takes an awful lot of locks and has a slight
435		 * risk of deadlocking.
436		 */
437		if (user_space) {
438			force_sig(SIGBUS, current);
439		} else if (panic_on_oops || tolerant < 2) {
440			mce_panic("Uncorrected machine check",
441				&panicm, mcestart);
442		}
443	}
444
445	/* notify userspace ASAP */
446	set_thread_flag(TIF_MCE_NOTIFY);
447
448	/* the last thing we do is clear state */
449	for (i = 0; i < banks; i++) {
450		if (test_bit(i, toclear))
451			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
452	}
453	wrmsrl(MSR_IA32_MCG_STATUS, 0);
454 out2:
455	atomic_dec(&mce_entry);
456}
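
/*
 * Descriptive summary of the decisions above:
 *
 *  - no_way_out is set when the restart IP is invalid (no MCG_STATUS_RIPV),
 *    when an enabled bank reports processor context corrupt
 *    (MCI_STATUS_PCC), or when an uncorrected error overflowed
 *    (MCI_STATUS_OVER); the handler then panics unless tolerant >= 3.
 *  - kill_it is set for enabled uncorrected errors; if the error can be
 *    attributed to user space (MCG_STATUS_EIPV with a user code segment)
 *    the offending task gets SIGBUS, otherwise the handler panics when
 *    panic_on_oops is set or tolerant < 2.
 */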
457
458#ifdef CONFIG_X86_MCE_INTEL
459/**
460 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
461 * @status: Event status information
463 *
464 * This function should be called by the thermal interrupt after the
465 * event has been processed and the decision was made to log the event
466 * further.
467 *
468 * The status parameter will be saved to the 'status' field of 'struct mce'
469 * and historically has been the register value of the
470 * MSR_IA32_THERMAL_STATUS (Intel) msr.
471 */
472void mce_log_therm_throt_event(__u64 status)
473{
474	struct mce m;
475
476	mce_setup(&m);
477	m.bank = MCE_THERMAL_BANK;
478	m.status = status;
479	mce_log(&m);
480}
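
/*
 * Minimal caller sketch, assuming a thermal interrupt handler that has
 * already read MSR_IA32_THERM_STATUS and decided to log the event (the
 * helper name below is hypothetical):
 *
 *	u64 therm_status;
 *
 *	rdmsrl(MSR_IA32_THERM_STATUS, therm_status);
 *	if (thermal_event_worth_logging(therm_status))
 *		mce_log_therm_throt_event(therm_status);
 */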
481#endif /* CONFIG_X86_MCE_INTEL */
482
483/*
484 * Periodic polling timer for "silent" machine check errors.  If the
485 * poller finds an MCE, poll 2x faster.  When the poller finds no more
486 * errors, poll 2x slower (up to check_interval seconds).
487 */
488static int check_interval = 5 * 60; /* 5 minutes */
489
490static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
491static DEFINE_PER_CPU(struct timer_list, mce_timer);
492
493static void mcheck_timer(unsigned long data)
494{
495	struct timer_list *t = &per_cpu(mce_timer, data);
496	int *n;
497
498	WARN_ON(smp_processor_id() != data);
499
500	if (mce_available(&current_cpu_data)) {
501		machine_check_poll(MCP_TIMESTAMP,
502				&__get_cpu_var(mce_poll_banks));
503	}
504
505	/*
506	 * Alert userspace if needed.  If we logged an MCE, reduce the
507	 * polling interval, otherwise increase the polling interval.
508	 */
509	n = &__get_cpu_var(next_interval);
510	if (mce_notify_user()) {
511		*n = max(*n/2, HZ/100);
512	} else {
513		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
514	}
515
516	t->expires = jiffies + *n;
517	add_timer(t);
518}
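
/*
 * Worked example of the adaptive interval, assuming HZ == 1000 and the
 * default check_interval of 300 seconds: polling starts at 300s; every
 * run that logged an event halves the interval (down to the HZ/100 floor,
 * i.e. 10ms), and every quiet run doubles it again, capped at roughly
 * round_jiffies_relative(check_interval * HZ), i.e. about 300s.
 */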
519
520static void mce_do_trigger(struct work_struct *work)
521{
522	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
523}
524
525static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
526
527/*
528 * Notify the user(s) about new machine check events.
529 * Can be called from interrupt context, but not from machine check/NMI
530 * context.
531 */
532int mce_notify_user(void)
533{
534	/* Not more than two messages every minute */
535	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
536
537	clear_thread_flag(TIF_MCE_NOTIFY);
538
539	if (test_and_clear_bit(0, &notify_user)) {
540		wake_up_interruptible(&mce_wait);
541
542		/*
543		 * There is no risk of missing notifications because the
544		 * work's pending flag is always cleared before the work
545		 * function is executed.
546		 */
547		if (trigger[0] && !work_pending(&mce_trigger_work))
548			schedule_work(&mce_trigger_work);
549
550		if (__ratelimit(&ratelimit))
551			printk(KERN_INFO "Machine check events logged\n");
552
553		return 1;
554	}
555	return 0;
556}
557
558/*
559 * Initialize Machine Checks for a CPU.
560 */
561static int mce_cap_init(void)
562{
563	unsigned b;
564	u64 cap;
565
566	rdmsrl(MSR_IA32_MCG_CAP, cap);
567
568	b = cap & MCG_BANKCNT_MASK;
569	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
570
571	if (b > MAX_NR_BANKS) {
572		printk(KERN_WARNING
573		       "MCE: Using only %u machine check banks out of %u\n",
574			MAX_NR_BANKS, b);
575		b = MAX_NR_BANKS;
576	}
577
578	/* Don't support asymmetric configurations today */
579	WARN_ON(banks != 0 && b != banks);
580	banks = b;
581	if (!bank) {
582		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
583		if (!bank)
584			return -ENOMEM;
585		memset(bank, 0xff, banks * sizeof(u64));
586	}
587
588	/* Use accurate RIP reporting if available. */
589	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
590		rip_msr = MSR_IA32_MCG_EIP;
591
592	return 0;
593}
594
595static void mce_init(void *dummy)
596{
597	mce_banks_t all_banks;
598	u64 cap;
599	int i;
600
601	/*
602	 * Log the machine checks left over from the previous reset.
603	 */
604	bitmap_fill(all_banks, MAX_NR_BANKS);
605	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
606
607	set_in_cr4(X86_CR4_MCE);
608
609	rdmsrl(MSR_IA32_MCG_CAP, cap);
610	if (cap & MCG_CTL_P)
611		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
612
613	for (i = 0; i < banks; i++) {
614		if (skip_bank_init(i))
615			continue;
616		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
617		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
618	}
619}
620
621/* Add per CPU specific workarounds here */
622static void mce_cpu_quirks(struct cpuinfo_x86 *c)
623{
624	/* This should be disabled by the BIOS, but isn't always */
625	if (c->x86_vendor == X86_VENDOR_AMD) {
626		if (c->x86 == 15 && banks > 4) {
627			/*
628			 * disable GART TBL walk error reporting, which
629			 * trips off incorrectly with the IOMMU & 3ware
630			 * & Cerberus:
631			 */
632			clear_bit(10, (unsigned long *)&bank[4]);
633		}
634		if (c->x86 <= 17 && mce_bootlog < 0) {
635			/*
636			 * Many broken BIOSes don't clear the banks by default
637			 * and leave stale data behind. Don't log those:
638			 */
639			mce_bootlog = 0;
640		}
641		/*
642		 * Various K7s have a broken bank 0. Always disable it
643		 * by default.
644		 */
645		if (c->x86 == 6)
646			bank[0] = 0;
647	}
648
649	if (c->x86_vendor == X86_VENDOR_INTEL) {
650		/*
651		 * The SDM documents that on family 6, bank 0 should not be
652		 * written because it aliases another special BIOS-controlled
653		 * register.
654		 * It is no longer aliased on model 0x1a and later.
655		 * Don't ignore bank 0 completely, because there could be a
656		 * valid event later; merely don't write CTL0.
657		 */
658
659		if (c->x86 == 6 && c->x86_model < 0x1A)
660			__set_bit(0, &dont_init_banks);
661	}
662}
663
664static void mce_cpu_features(struct cpuinfo_x86 *c)
665{
666	switch (c->x86_vendor) {
667	case X86_VENDOR_INTEL:
668		mce_intel_feature_init(c);
669		break;
670	case X86_VENDOR_AMD:
671		mce_amd_feature_init(c);
672		break;
673	default:
674		break;
675	}
676}
677
678static void mce_init_timer(void)
679{
680	struct timer_list *t = &__get_cpu_var(mce_timer);
681	int *n = &__get_cpu_var(next_interval);
682
683	*n = check_interval * HZ;
684	if (!*n)
685		return;
686	setup_timer(t, mcheck_timer, smp_processor_id());
687	t->expires = round_jiffies(jiffies + *n);
688	add_timer(t);
689}
690
691/*
692 * Called for each booted CPU to set up machine checks.
693 * Must be called with preempt off:
694 */
695void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
696{
697	if (!mce_available(c))
698		return;
699
700	if (mce_cap_init() < 0) {
701		mce_disabled = 1;
702		return;
703	}
704	mce_cpu_quirks(c);
705
706	machine_check_vector = do_machine_check;
707
708	mce_init(NULL);
709	mce_cpu_features(c);
710	mce_init_timer();
711}
712
713/*
714 * Character device to read and clear the MCE log.
715 */
716
717static DEFINE_SPINLOCK(mce_state_lock);
718static int		open_count;		/* #times opened */
719static int		open_exclu;		/* already open exclusive? */
720
721static int mce_open(struct inode *inode, struct file *file)
722{
723	lock_kernel();
724	spin_lock(&mce_state_lock);
725
726	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
727		spin_unlock(&mce_state_lock);
728		unlock_kernel();
729
730		return -EBUSY;
731	}
732
733	if (file->f_flags & O_EXCL)
734		open_exclu = 1;
735	open_count++;
736
737	spin_unlock(&mce_state_lock);
738	unlock_kernel();
739
740	return nonseekable_open(inode, file);
741}
742
743static int mce_release(struct inode *inode, struct file *file)
744{
745	spin_lock(&mce_state_lock);
746
747	open_count--;
748	open_exclu = 0;
749
750	spin_unlock(&mce_state_lock);
751
752	return 0;
753}
754
755static void collect_tscs(void *data)
756{
757	unsigned long *cpu_tsc = (unsigned long *)data;
758
759	rdtscll(cpu_tsc[smp_processor_id()]);
760}
761
762static DEFINE_MUTEX(mce_read_mutex);
763
764static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
765			loff_t *off)
766{
767	char __user *buf = ubuf;
768	unsigned long *cpu_tsc;
769	unsigned prev, next;
770	int i, err;
771
772	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
773	if (!cpu_tsc)
774		return -ENOMEM;
775
776	mutex_lock(&mce_read_mutex);
777	next = rcu_dereference(mcelog.next);
778
779	/* Only supports full reads right now */
780	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
781		mutex_unlock(&mce_read_mutex);
782		kfree(cpu_tsc);
783
784		return -EINVAL;
785	}
786
787	err = 0;
788	prev = 0;
789	do {
790		for (i = prev; i < next; i++) {
791			unsigned long start = jiffies;
792
793			while (!mcelog.entry[i].finished) {
794				if (time_after_eq(jiffies, start + 2)) {
795					memset(mcelog.entry + i, 0,
796					       sizeof(struct mce));
797					goto timeout;
798				}
799				cpu_relax();
800			}
801			smp_rmb();
802			err |= copy_to_user(buf, mcelog.entry + i,
803					    sizeof(struct mce));
804			buf += sizeof(struct mce);
805timeout:
806			;
807		}
808
809		memset(mcelog.entry + prev, 0,
810		       (next - prev) * sizeof(struct mce));
811		prev = next;
812		next = cmpxchg(&mcelog.next, prev, 0);
813	} while (next != prev);
814
815	synchronize_sched();
816
817	/*
818	 * Collect the current TSC on each CPU so that entries which were
819	 * still being written before the synchronize can be picked up below.
820	 */
821	on_each_cpu(collect_tscs, cpu_tsc, 1);
822
823	for (i = next; i < MCE_LOG_LEN; i++) {
824		if (mcelog.entry[i].finished &&
825		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
826			err |= copy_to_user(buf, mcelog.entry+i,
827					    sizeof(struct mce));
828			smp_rmb();
829			buf += sizeof(struct mce);
830			memset(&mcelog.entry[i], 0, sizeof(struct mce));
831		}
832	}
833	mutex_unlock(&mce_read_mutex);
834	kfree(cpu_tsc);
835
836	return err ? -EFAULT : buf - ubuf;
837}
838
839static unsigned int mce_poll(struct file *file, poll_table *wait)
840{
841	poll_wait(file, &mce_wait, wait);
842	if (rcu_dereference(mcelog.next))
843		return POLLIN | POLLRDNORM;
844	return 0;
845}
846
847static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
848{
849	int __user *p = (int __user *)arg;
850
851	if (!capable(CAP_SYS_ADMIN))
852		return -EPERM;
853
854	switch (cmd) {
855	case MCE_GET_RECORD_LEN:
856		return put_user(sizeof(struct mce), p);
857	case MCE_GET_LOG_LEN:
858		return put_user(MCE_LOG_LEN, p);
859	case MCE_GETCLEAR_FLAGS: {
860		unsigned flags;
861
862		do {
863			flags = mcelog.flags;
864		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
865
866		return put_user(flags, p);
867	}
868	default:
869		return -ENOTTY;
870	}
871}
872
873static const struct file_operations mce_chrdev_ops = {
874	.open			= mce_open,
875	.release		= mce_release,
876	.read			= mce_read,
877	.poll			= mce_poll,
878	.unlocked_ioctl		= mce_ioctl,
879};
880
881static struct miscdevice mce_log_device = {
882	MISC_MCELOG_MINOR,
883	"mcelog",
884	&mce_chrdev_ops,
885};
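
/*
 * Userspace usage sketch, illustrative only (error handling omitted).
 * Reads must cover the whole log; see the "Only supports full reads"
 * check in mce_read() above:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	struct mce records[MCE_LOG_LEN];
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	int recordlen, loglen;
 *
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	poll(&pfd, 1, -1);	(blocks until new events are logged)
 *	read(fd, records, sizeof(records));
 */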
886
887/*
888 * mce=off disables machine check
889 * mce=TOLERANCELEVEL (number, see above)
890 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
891 * mce=nobootlog Don't log MCEs from before booting.
892 */
893static int __init mcheck_enable(char *str)
894{
895	if (!strcmp(str, "off"))
896		mce_disabled = 1;
897	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
898		mce_bootlog = (str[0] == 'b');
899	else if (isdigit(str[0]))
900		get_option(&str, &tolerant);
901	else {
902		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
903		       str);
904		return 0;
905	}
906	return 1;
907}
908__setup("mce=", mcheck_enable);
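
/*
 * For example, as parsed by mcheck_enable() above:
 *
 *	mce=off		disable machine checks completely
 *	mce=2		set tolerant to 2 (SIGBUS or log uncorrected errors)
 *	mce=bootlog	log MCEs left over from before booting
 *	mce=nobootlog	do not log those boot-time leftovers
 */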
909
910/*
911 * Sysfs support
912 */
913
914/*
915 * Disable machine checks on suspend and shutdown. We can't really handle
916 * them later.
917 */
918static int mce_disable(void)
919{
920	int i;
921
922	for (i = 0; i < banks; i++) {
923		if (!skip_bank_init(i))
924			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
925	}
926	return 0;
927}
928
929static int mce_suspend(struct sys_device *dev, pm_message_t state)
930{
931	return mce_disable();
932}
933
934static int mce_shutdown(struct sys_device *dev)
935{
936	return mce_disable();
937}
938
939/*
940 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
941 * Only one CPU is active at this time, the others get re-added later using
942 * CPU hotplug:
943 */
944static int mce_resume(struct sys_device *dev)
945{
946	mce_init(NULL);
947	mce_cpu_features(&current_cpu_data);
948
949	return 0;
950}
951
952static void mce_cpu_restart(void *data)
953{
954	del_timer_sync(&__get_cpu_var(mce_timer));
955	if (mce_available(&current_cpu_data))
956		mce_init(NULL);
957	mce_init_timer();
958}
959
960/* Reinit MCEs after user configuration changes */
961static void mce_restart(void)
962{
963	on_each_cpu(mce_cpu_restart, NULL, 1);
964}
965
966static struct sysdev_class mce_sysclass = {
967	.suspend	= mce_suspend,
968	.shutdown	= mce_shutdown,
969	.resume		= mce_resume,
970	.name		= "machinecheck",
971};
972
973DEFINE_PER_CPU(struct sys_device, mce_dev);
974
975__cpuinitdata
976void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
977
978/* Why are there no generic functions for this? */
979#define ACCESSOR(name, var, start) \
980	static ssize_t show_ ## name(struct sys_device *s,		\
981				     struct sysdev_attribute *attr,	\
982				     char *buf) {			\
983		return sprintf(buf, "%Lx\n", (u64)var);			\
984	}								\
985	static ssize_t set_ ## name(struct sys_device *s,		\
986				    struct sysdev_attribute *attr,	\
987				    const char *buf, size_t siz) {	\
988		char *end;						\
989		u64 new = simple_strtoull(buf, &end, 0);		\
990									\
991		if (end == buf)						\
992			return -EINVAL;					\
993		var = new;						\
994		start;							\
995									\
996		return end-buf;						\
997	}								\
998	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
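
/*
 * For example, the ACCESSOR(check_interval, check_interval, mce_restart())
 * invocation further down expands to show_check_interval() and
 * set_check_interval() plus a SYSDEV_ATTR named attr_check_interval;
 * a write parses a 64-bit value into check_interval and then runs
 * mce_restart().
 */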
999
1000static struct sysdev_attribute *bank_attrs;
1001
1002static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1003			 char *buf)
1004{
1005	u64 b = bank[attr - bank_attrs];
1006
1007	return sprintf(buf, "%llx\n", b);
1008}
1009
1010static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1011			const char *buf, size_t siz)
1012{
1013	char *end;
1014	u64 new = simple_strtoull(buf, &end, 0);
1015
1016	if (end == buf)
1017		return -EINVAL;
1018
1019	bank[attr - bank_attrs] = new;
1020	mce_restart();
1021
1022	return end-buf;
1023}
1024
1025static ssize_t
1026show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1027{
1028	strcpy(buf, trigger);
1029	strcat(buf, "\n");
1030	return strlen(trigger) + 1;
1031}
1032
1033static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1034				const char *buf, size_t siz)
1035{
1036	char *p;
1037	int len;
1038
1039	strncpy(trigger, buf, sizeof(trigger));
1040	trigger[sizeof(trigger)-1] = 0;
1041	len = strlen(trigger);
1042	p = strchr(trigger, '\n');
1043
1044	if (p)
1045		*p = 0;
1046
1047	return len;
1048}
1049
1050static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1051static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1052
1053ACCESSOR(check_interval, check_interval, mce_restart())
1054
1055static struct sysdev_attribute *mce_attrs[] = {
1056	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
1057	NULL
1058};
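
/*
 * The resulting layout, assuming the usual sysfs mount point, is one
 * directory per CPU, e.g.:
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/trigger
 *	/sys/devices/system/machinecheck/machinecheck0/bank0 ... bankN
 *
 * The per-bank files are created in mce_create_device() below.
 */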
1059
1060static cpumask_var_t mce_dev_initialized;
1061
1062/* Per CPU sysdev init. All of the CPUs still share the same bank control values: */
1063static __cpuinit int mce_create_device(unsigned int cpu)
1064{
1065	int err;
1066	int i;
1067
1068	if (!mce_available(&boot_cpu_data))
1069		return -EIO;
1070
1071	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1072	per_cpu(mce_dev, cpu).id	= cpu;
1073	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1074
1075	err = sysdev_register(&per_cpu(mce_dev, cpu));
1076	if (err)
1077		return err;
1078
1079	for (i = 0; mce_attrs[i]; i++) {
1080		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1081		if (err)
1082			goto error;
1083	}
1084	for (i = 0; i < banks; i++) {
1085		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1086					&bank_attrs[i]);
1087		if (err)
1088			goto error2;
1089	}
1090	cpumask_set_cpu(cpu, mce_dev_initialized);
1091
1092	return 0;
1093error2:
1094	while (--i >= 0)
1095		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1096error:
1097	while (--i >= 0)
1098		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1099
1100	sysdev_unregister(&per_cpu(mce_dev, cpu));
1101
1102	return err;
1103}
1104
1105static __cpuinit void mce_remove_device(unsigned int cpu)
1106{
1107	int i;
1108
1109	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1110		return;
1111
1112	for (i = 0; mce_attrs[i]; i++)
1113		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1114
1115	for (i = 0; i < banks; i++)
1116		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1117
1118	sysdev_unregister(&per_cpu(mce_dev, cpu));
1119	cpumask_clear_cpu(cpu, mce_dev_initialized);
1120}
1121
1122/* Make sure there are no machine checks on offlined CPUs. */
1123static void mce_disable_cpu(void *h)
1124{
1125	unsigned long action = *(unsigned long *)h;
1126	int i;
1127
1128	if (!mce_available(&current_cpu_data))
1129		return;
1130	if (!(action & CPU_TASKS_FROZEN))
1131		cmci_clear();
1132	for (i = 0; i < banks; i++) {
1133		if (!skip_bank_init(i))
1134			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1135	}
1136}
1137
1138static void mce_reenable_cpu(void *h)
1139{
1140	unsigned long action = *(unsigned long *)h;
1141	int i;
1142
1143	if (!mce_available(&current_cpu_data))
1144		return;
1145
1146	if (!(action & CPU_TASKS_FROZEN))
1147		cmci_reenable();
1148	for (i = 0; i < banks; i++) {
1149		if (!skip_bank_init(i))
1150			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1151	}
1152}
1153
1154/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1155static int __cpuinit
1156mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1157{
1158	unsigned int cpu = (unsigned long)hcpu;
1159	struct timer_list *t = &per_cpu(mce_timer, cpu);
1160
1161	switch (action) {
1162	case CPU_ONLINE:
1163	case CPU_ONLINE_FROZEN:
1164		mce_create_device(cpu);
1165		if (threshold_cpu_callback)
1166			threshold_cpu_callback(action, cpu);
1167		break;
1168	case CPU_DEAD:
1169	case CPU_DEAD_FROZEN:
1170		if (threshold_cpu_callback)
1171			threshold_cpu_callback(action, cpu);
1172		mce_remove_device(cpu);
1173		break;
1174	case CPU_DOWN_PREPARE:
1175	case CPU_DOWN_PREPARE_FROZEN:
1176		del_timer_sync(t);
1177		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1178		break;
1179	case CPU_DOWN_FAILED:
1180	case CPU_DOWN_FAILED_FROZEN:
1181		t->expires = round_jiffies(jiffies +
1182						__get_cpu_var(next_interval));
1183		add_timer_on(t, cpu);
1184		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1185		break;
1186	case CPU_POST_DEAD:
1187		/* intentionally ignoring frozen here */
1188		cmci_rediscover(cpu);
1189		break;
1190	}
1191	return NOTIFY_OK;
1192}
1193
1194static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1195	.notifier_call = mce_cpu_callback,
1196};
1197
1198static __init int mce_init_banks(void)
1199{
1200	int i;
1201
1202	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1203				GFP_KERNEL);
1204	if (!bank_attrs)
1205		return -ENOMEM;
1206
1207	for (i = 0; i < banks; i++) {
1208		struct sysdev_attribute *a = &bank_attrs[i];
1209
1210		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
1211		if (!a->attr.name)
1212			goto nomem;
1213
1214		a->attr.mode	= 0644;
1215		a->show		= show_bank;
1216		a->store	= set_bank;
1217	}
1218	return 0;
1219
1220nomem:
1221	while (--i >= 0)
1222		kfree(bank_attrs[i].attr.name);
1223	kfree(bank_attrs);
1224	bank_attrs = NULL;
1225
1226	return -ENOMEM;
1227}
1228
1229static __init int mce_init_device(void)
1230{
1231	int err;
1232	int i = 0;
1233
1234	if (!mce_available(&boot_cpu_data))
1235		return -EIO;
1236
1237	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1238
1239	err = mce_init_banks();
1240	if (err)
1241		return err;
1242
1243	err = sysdev_class_register(&mce_sysclass);
1244	if (err)
1245		return err;
1246
1247	for_each_online_cpu(i) {
1248		err = mce_create_device(i);
1249		if (err)
1250			return err;
1251	}
1252
1253	register_hotcpu_notifier(&mce_cpu_notifier);
1254	misc_register(&mce_log_device);
1255
1256	return err;
1257}
1258
1259device_initcall(mce_init_device);
1260
1261#else /* CONFIG_X86_32: */
1262
1263int nr_mce_banks;
1264EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1265
1266/* This has to be run for each processor */
1267void mcheck_init(struct cpuinfo_x86 *c)
1268{
1269	if (mce_disabled == 1)
1270		return;
1271
1272	switch (c->x86_vendor) {
1273	case X86_VENDOR_AMD:
1274		amd_mcheck_init(c);
1275		break;
1276
1277	case X86_VENDOR_INTEL:
1278		if (c->x86 == 5)
1279			intel_p5_mcheck_init(c);
1280		if (c->x86 == 6)
1281			intel_p6_mcheck_init(c);
1282		if (c->x86 == 15)
1283			intel_p4_mcheck_init(c);
1284		break;
1285
1286	case X86_VENDOR_CENTAUR:
1287		if (c->x86 == 5)
1288			winchip_mcheck_init(c);
1289		break;
1290
1291	default:
1292		break;
1293	}
1294	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1295}
1296
1297static int __init mcheck_enable(char *str)
1298{
1299	mce_disabled = -1;
1300	return 1;
1301}
1302
1303__setup("mce", mcheck_enable);
1304
1305#endif /* CONFIG_X86_64 */
1306
1307/*
1308 * Old style boot options parsing. Only for compatibility.
1309 */
1310static int __init mcheck_disable(char *str)
1311{
1312	mce_disabled = 1;
1313	return 1;
1314}
1315__setup("nomce", mcheck_disable);
1316