mce.c revision 32561696c23028596f24b353d98f2e23b58f91f7
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/smp_lock.h>
#include <linux/kobject.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
#include <linux/fs.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/smp.h>

#include "mce.h"

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

int				mce_disabled;

#ifdef CONFIG_X86_NEW_MCE

#define MISC_MCELOG_MINOR	227

atomic_t mce_entry;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int			tolerant = 1;
static int			banks;
static u64			*bank;
static unsigned long		notify_user;
static int			rip_msr;
static int			mce_bootlog = -1;

static char			trigger[128];
static char			*trigger_argv[2] = { trigger, NULL };

static unsigned long		dont_init_banks;

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static inline int skip_bank_init(int i)
{
	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
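		/* Try to claim this slot; if another CPU raced us here, start over. */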
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, u64 start)
{
	int i;

	bust_spinlocks(1);
	console_verbose();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		u64 tsc = mcelog.entry[i].tsc;

		if ((s64)(tsc - start) < 0)
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);
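	/*
	 * Each MCA bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC),
	 * hence the bank*4 stride below.
	 */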
	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		m->ip = mce_rdmsrl(rip_msr);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int panicm_found = 0;
	u64 mcestart = 0;
	int i;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

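	/*
	 * Note the TSC of this exception so that mce_panic() can skip
	 * older, stale log entries.
	 */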
	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Corrected errors are handled by machine_check_poll().
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/*
		 * Did this bank cause the exception?
		 *
		 * Assume that the bank with uncorrectable errors did it,
		 * and that there is only a single one:
		 */
		if ((m.status & MCI_STATUS_UC) &&
					(m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/*
	 * If we didn't find an uncorrectable error, pick
	 * the last one (shouldn't happen, just being safe).
	 */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
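		/*
		 * Default to all 1s in each MCi_CTL word, i.e. report
		 * every error type in every bank.
		 */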
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

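	/*
	 * Enable the machine check exception in CR4; before this point
	 * an MCE would have forced a shutdown.
	 */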
	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		 if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);
	}
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int		open_count;		/* #times opened */
static int		open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
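	/*
	 * Copy out entries in batches; once no new records have been
	 * appended since the last batch, reset mcelog.next to 0.
	 */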
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

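		/* Atomically snapshot the flags word and clear it. */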
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);

	if (end == buf)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return end-buf;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	if (p)
		*p = 0;

	return len;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					&bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
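		/* Offline failed: re-arm the poll timer and re-enable the banks. */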
		t->expires = round_jiffies(jiffies +
						__get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
				GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

#else /* CONFIG_X86_OLD_MCE: */

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}

static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);