/* mce_amd.c revision 86039cd401e1780573733870f9c0bd458fc96ea2 */
1#include <linux/module.h>
2#include <linux/slab.h>
3
4#include "mce_amd.h"
5
/* Per-family decode callbacks, selected once at init time in mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask applied to the extended error code (STATUS[20:16]); widened to 0x1f on family 0x15. */
static u8 xec_mask	 = 0xf;
/* Mask for the error-CPU bits in the NB status high word; narrowed to 0x3 on family 0x14. */
static u8 nb_err_cpumask = 0xf;

/* When false, NB GART TLB errors are filtered out entirely (see amd_filter_mce()). */
static bool report_gart_errors;
/* Optional DRAM ECC decoder hook, registered via amd_register_ecc_decoder(). */
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
13
/* Enable/disable reporting of NB GART TLB errors (disabled by default). */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
19
/*
 * Register a callback which is invoked for NB DRAM ECC errors so a driver
 * (e.g. the EDAC module) can perform the detailed decoding.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
25
26void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
27{
28	if (nb_bus_decoder) {
29		WARN_ON(nb_bus_decoder != f);
30
31		nb_bus_decoder = NULL;
32	}
33}
34EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
35
/*
 * String representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 *
 * Each table is indexed by the corresponding bit-field of the low 16 bits of
 * MCi_STATUS (the error code "ec"), as extracted by the *_MSG() macros.
 */

/* transaction type, ec[3:2] */
const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level, ec[1:0] */
const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type, ec[7:4] */
const char *rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
const char *to_msgs[] = { "no timeout",	"timed out" };
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o */
const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
EXPORT_SYMBOL_GPL(ii_msgs);
66
/*
 * F10h NB extended error descriptions. Indexed in f10h_nb_mce() as
 * f10h_nb_mce_desc[xec - offset], where the offset maps the xec ranges
 * 0xa-0xc -> [0..2], 0xe -> [3] and 0x1c-0x1f -> [4..7].
 */
static const char *f10h_nb_mce_desc[] = {
	"HT link data error",
	"Protocol error (link, L3, probe filter, etc.)",
	"Parity error in NB-internal arrays",
	"Link Retry due to IO link transmission error",
	"L3 ECC data cache error",
	"ECC error in L3 cache tag",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
77
/*
 * F15h IC extended error descriptions. Indexed in f15h_ic_mce():
 * xec 0x0-0xa map 1:1 to entries [0..10], xec 0xd maps to [11], and
 * xec 0x10-0x14 map to the decoder-parity entries [12..16].
 */
static const char * const f15h_ic_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"patch RAM",					/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO"
};
97
98static bool f12h_dc_mce(u16 ec, u8 xec)
99{
100	bool ret = false;
101
102	if (MEM_ERROR(ec)) {
103		u8 ll = ec & 0x3;
104		ret = true;
105
106		if (ll == LL_L2)
107			pr_cont("during L1 linefill from L2.\n");
108		else if (ll == LL_L1)
109			pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
110		else
111			ret = false;
112	}
113	return ret;
114}
115
116static bool f10h_dc_mce(u16 ec, u8 xec)
117{
118	u8 r4  = (ec >> 4) & 0xf;
119	u8 ll  = ec & 0x3;
120
121	if (r4 == R4_GEN && ll == LL_L1) {
122		pr_cont("during data scrub.\n");
123		return true;
124	}
125	return f12h_dc_mce(ec, xec);
126}
127
128static bool k8_dc_mce(u16 ec, u8 xec)
129{
130	if (BUS_ERROR(ec)) {
131		pr_cont("during system linefill.\n");
132		return true;
133	}
134
135	return f10h_dc_mce(ec, xec);
136}
137
138static bool f14h_dc_mce(u16 ec, u8 xec)
139{
140	u8 r4	 = (ec >> 4) & 0xf;
141	u8 ll	 = ec & 0x3;
142	u8 tt	 = (ec >> 2) & 0x3;
143	u8 ii	 = tt;
144	bool ret = true;
145
146	if (MEM_ERROR(ec)) {
147
148		if (tt != TT_DATA || ll != LL_L1)
149			return false;
150
151		switch (r4) {
152		case R4_DRD:
153		case R4_DWR:
154			pr_cont("Data/Tag parity error due to %s.\n",
155				(r4 == R4_DRD ? "load/hw prf" : "store"));
156			break;
157		case R4_EVICT:
158			pr_cont("Copyback parity error on a tag miss.\n");
159			break;
160		case R4_SNOOP:
161			pr_cont("Tag parity error during snoop.\n");
162			break;
163		default:
164			ret = false;
165		}
166	} else if (BUS_ERROR(ec)) {
167
168		if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
169			return false;
170
171		pr_cont("System read data error on a ");
172
173		switch (r4) {
174		case R4_RD:
175			pr_cont("TLB reload.\n");
176			break;
177		case R4_DWR:
178			pr_cont("store.\n");
179			break;
180		case R4_DRD:
181			pr_cont("load.\n");
182			break;
183		default:
184			ret = false;
185		}
186	} else {
187		ret = false;
188	}
189
190	return ret;
191}
192
193static bool f15h_dc_mce(u16 ec, u8 xec)
194{
195	bool ret = true;
196
197	if (MEM_ERROR(ec)) {
198
199		switch (xec) {
200		case 0x0:
201			pr_cont("Data Array access error.\n");
202			break;
203
204		case 0x1:
205			pr_cont("UC error during a linefill from L2/NB.\n");
206			break;
207
208		case 0x2:
209		case 0x11:
210			pr_cont("STQ access error.\n");
211			break;
212
213		case 0x3:
214			pr_cont("SCB access error.\n");
215			break;
216
217		case 0x10:
218			pr_cont("Tag error.\n");
219			break;
220
221		case 0x12:
222			pr_cont("LDQ access error.\n");
223			break;
224
225		default:
226			ret = false;
227		}
228	} else if (BUS_ERROR(ec)) {
229
230		if (!xec)
231			pr_cont("during system linefill.\n");
232		else
233			pr_cont(" Internal %s condition.\n",
234				((xec == 1) ? "livelock" : "deadlock"));
235	} else
236		ret = false;
237
238	return ret;
239}
240
241static void amd_decode_dc_mce(struct mce *m)
242{
243	u16 ec = m->status & 0xffff;
244	u8 xec = (m->status >> 16) & xec_mask;
245
246	pr_emerg(HW_ERR "Data Cache Error: ");
247
248	/* TLB error signatures are the same across families */
249	if (TLB_ERROR(ec)) {
250		u8 tt = (ec >> 2) & 0x3;
251
252		if (tt == TT_DATA) {
253			pr_cont("%s TLB %s.\n", LL_MSG(ec),
254				((xec == 2) ? "locked miss"
255					    : (xec ? "multimatch" : "parity")));
256			return;
257		}
258	} else if (fam_ops->dc_mce(ec, xec))
259		;
260	else
261		pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
262}
263
264static bool k8_ic_mce(u16 ec, u8 xec)
265{
266	u8 ll	 = ec & 0x3;
267	u8 r4	 = (ec >> 4) & 0xf;
268	bool ret = true;
269
270	if (!MEM_ERROR(ec))
271		return false;
272
273	if (ll == 0x2)
274		pr_cont("during a linefill from L2.\n");
275	else if (ll == 0x1) {
276		switch (r4) {
277		case R4_IRD:
278			pr_cont("Parity error during data load.\n");
279			break;
280
281		case R4_EVICT:
282			pr_cont("Copyback Parity/Victim error.\n");
283			break;
284
285		case R4_SNOOP:
286			pr_cont("Tag Snoop error.\n");
287			break;
288
289		default:
290			ret = false;
291			break;
292		}
293	} else
294		ret = false;
295
296	return ret;
297}
298
299static bool f14h_ic_mce(u16 ec, u8 xec)
300{
301	u8 ll    = ec & 0x3;
302	u8 tt    = (ec >> 2) & 0x3;
303	u8 r4  = (ec >> 4) & 0xf;
304	bool ret = true;
305
306	if (MEM_ERROR(ec)) {
307		if (tt != 0 || ll != 1)
308			ret = false;
309
310		if (r4 == R4_IRD)
311			pr_cont("Data/tag array parity error for a tag hit.\n");
312		else if (r4 == R4_SNOOP)
313			pr_cont("Tag error during snoop/victimization.\n");
314		else
315			ret = false;
316	}
317	return ret;
318}
319
/* F15h IC MCE decoder: maps the extended error code into f15h_ic_mce_desc[]. */
static bool f15h_ic_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		/* direct 1:1 mapping into the description table */
		pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
		break;

	case 0xd:
		/* xec 0xb and 0xc are undefined, so skip over that gap */
		pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
		break;

	case 0x10 ... 0x14:
		/* entries [12..16] of the table are the decoder parity errors */
		pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
345
346static void amd_decode_ic_mce(struct mce *m)
347{
348	u16 ec = m->status & 0xffff;
349	u8 xec = (m->status >> 16) & xec_mask;
350
351	pr_emerg(HW_ERR "Instruction Cache Error: ");
352
353	if (TLB_ERROR(ec))
354		pr_cont("%s TLB %s.\n", LL_MSG(ec),
355			(xec ? "multimatch" : "parity error"));
356	else if (BUS_ERROR(ec)) {
357		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
358
359		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
360	} else if (fam_ops->ic_mce(ec, xec))
361		;
362	else
363		pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
364}
365
366static void amd_decode_bu_mce(struct mce *m)
367{
368	u32 ec = m->status & 0xffff;
369	u32 xec = (m->status >> 16) & xec_mask;
370
371	pr_emerg(HW_ERR "Bus Unit Error");
372
373	if (xec == 0x1)
374		pr_cont(" in the write data buffers.\n");
375	else if (xec == 0x3)
376		pr_cont(" in the victim data buffers.\n");
377	else if (xec == 0x2 && MEM_ERROR(ec))
378		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
379	else if (xec == 0x0) {
380		if (TLB_ERROR(ec))
381			pr_cont(": %s error in a Page Descriptor Cache or "
382				"Guest TLB.\n", TT_MSG(ec));
383		else if (BUS_ERROR(ec))
384			pr_cont(": %s/ECC error in data read from NB: %s.\n",
385				RRRR_MSG(ec), PP_MSG(ec));
386		else if (MEM_ERROR(ec)) {
387			u8 rrrr = (ec >> 4) & 0xf;
388
389			if (rrrr >= 0x7)
390				pr_cont(": %s error during data copyback.\n",
391					RRRR_MSG(ec));
392			else if (rrrr <= 0x1)
393				pr_cont(": %s parity/ECC error during data "
394					"access from L2.\n", RRRR_MSG(ec));
395			else
396				goto wrong_bu_mce;
397		} else
398			goto wrong_bu_mce;
399	} else
400		goto wrong_bu_mce;
401
402	return;
403
404wrong_bu_mce:
405	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
406}
407
408static void amd_decode_ls_mce(struct mce *m)
409{
410	u16 ec = m->status & 0xffff;
411	u8 xec = (m->status >> 16) & xec_mask;
412
413	if (boot_cpu_data.x86 == 0x14) {
414		pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
415			 " please report on LKML.\n");
416		return;
417	}
418
419	pr_emerg(HW_ERR "Load Store Error");
420
421	if (xec == 0x0) {
422		u8 r4 = (ec >> 4) & 0xf;
423
424		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
425			goto wrong_ls_mce;
426
427		pr_cont(" during %s.\n", RRRR_MSG(ec));
428	} else
429		goto wrong_ls_mce;
430
431	return;
432
433wrong_ls_mce:
434	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
435}
436
437static bool k8_nb_mce(u16 ec, u8 xec)
438{
439	bool ret = true;
440
441	switch (xec) {
442	case 0x1:
443		pr_cont("CRC error detected on HT link.\n");
444		break;
445
446	case 0x5:
447		pr_cont("Invalid GART PTE entry during GART table walk.\n");
448		break;
449
450	case 0x6:
451		pr_cont("Unsupported atomic RMW received from an IO link.\n");
452		break;
453
454	case 0x0:
455	case 0x8:
456		if (boot_cpu_data.x86 == 0x11)
457			return false;
458
459		pr_cont("DRAM ECC error detected on the NB.\n");
460		break;
461
462	case 0xd:
463		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
464		break;
465
466	default:
467		ret = false;
468		break;
469	}
470
471	return ret;
472}
473
474static bool f10h_nb_mce(u16 ec, u8 xec)
475{
476	bool ret = true;
477	u8 offset = 0;
478
479	if (k8_nb_mce(ec, xec))
480		return true;
481
482	switch(xec) {
483	case 0xa ... 0xc:
484		offset = 10;
485		break;
486
487	case 0xe:
488		offset = 11;
489		break;
490
491	case 0xf:
492		if (TLB_ERROR(ec))
493			pr_cont("GART Table Walk data error.\n");
494		else if (BUS_ERROR(ec))
495			pr_cont("DMA Exclusion Vector Table Walk error.\n");
496		else
497			ret = false;
498
499		goto out;
500		break;
501
502	case 0x1c ... 0x1f:
503		offset = 24;
504		break;
505
506	default:
507		ret = false;
508
509		goto out;
510		break;
511	}
512
513	pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
514
515out:
516	return ret;
517}
518
/* Stub NB handler for families with no decodable NB extended error codes. */
static bool nb_noop_mce(u16 ec, u8 xec)
{
	return false;
}
523
/*
 * Decode a Northbridge (bank 4) MCE on node @node_id. @nbcfg is passed
 * through to the registered DRAM ECC decoder callback, if any.
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	u8 xec   = (m->status >> 16) & 0x1f;
	u16 ec   = m->status & 0xffff;
	u32 nbsh = (u32)(m->status >> 32);	/* NB status high word */

	pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);

	/*
	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		/* new encoding: the masked value is the core number itself */
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
	} else {
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		/*
		 * older encoding: presumably a per-core bitmask — the
		 * highest set bit is reported (TODO confirm against BKDG)
		 */
		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	/* extended error codes with a fixed, family-independent meaning */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	/* remaining codes are decoded per-family */
	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	/* hand DRAM ECC errors (xec 0x0/0x8) to the registered ECC decoder */
	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
585
586static void amd_decode_fr_mce(struct mce *m)
587{
588	if (boot_cpu_data.x86 == 0xf ||
589	    boot_cpu_data.x86 == 0x11)
590		goto wrong_fr_mce;
591
592	/* we have only one error signature so match all fields at once. */
593	if ((m->status & 0xffff) == 0x0f0f) {
594		pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
595		return;
596	}
597
598wrong_fr_mce:
599	pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
600}
601
602static inline void amd_decode_err_code(u16 ec)
603{
604	if (TLB_ERROR(ec)) {
605		pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
606			 TT_MSG(ec), LL_MSG(ec));
607	} else if (MEM_ERROR(ec)) {
608		pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
609			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
610	} else if (BUS_ERROR(ec)) {
611		pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
612			 "Participating Processor: %s\n",
613			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
614			  PP_MSG(ec));
615	} else
616		pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
617}
618
619/*
620 * Filter out unwanted MCE signatures here.
621 */
622static bool amd_filter_mce(struct mce *m)
623{
624	u8 xec = (m->status >> 16) & 0x1f;
625
626	/*
627	 * NB GART TLB error reporting is disabled by default.
628	 */
629	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
630		return true;
631
632	return false;
633}
634
635int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
636{
637	struct mce *m = (struct mce *)data;
638	int node, ecc;
639
640	if (amd_filter_mce(m))
641		return NOTIFY_STOP;
642
643	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
644
645	pr_cont("%sorrected error, other errors lost: %s, "
646		 "CPU context corrupt: %s",
647		 ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
648		 ((m->status & MCI_STATUS_OVER) ? "yes"  : "no"),
649		 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
650
651	/* do the two bits[14:13] together */
652	ecc = (m->status >> 45) & 0x3;
653	if (ecc)
654		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
655
656	pr_cont("\n");
657
658	switch (m->bank) {
659	case 0:
660		amd_decode_dc_mce(m);
661		break;
662
663	case 1:
664		amd_decode_ic_mce(m);
665		break;
666
667	case 2:
668		amd_decode_bu_mce(m);
669		break;
670
671	case 3:
672		amd_decode_ls_mce(m);
673		break;
674
675	case 4:
676		node = amd_get_nb_id(m->extcpu);
677		amd_decode_nb_mce(node, m, 0);
678		break;
679
680	case 5:
681		amd_decode_fr_mce(m);
682		break;
683
684	default:
685		break;
686	}
687
688	amd_decode_err_code(m->status & 0xffff);
689
690	return NOTIFY_STOP;
691}
692EXPORT_SYMBOL_GPL(amd_decode_mce);
693
/* Hooked into the x86 MCE decoder chain in mce_amd_init(). */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
697
698static int __init mce_amd_init(void)
699{
700	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
701		return 0;
702
703	if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
704	    (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
705		return 0;
706
707	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
708	if (!fam_ops)
709		return -ENOMEM;
710
711	switch (boot_cpu_data.x86) {
712	case 0xf:
713		fam_ops->dc_mce = k8_dc_mce;
714		fam_ops->ic_mce = k8_ic_mce;
715		fam_ops->nb_mce = k8_nb_mce;
716		break;
717
718	case 0x10:
719		fam_ops->dc_mce = f10h_dc_mce;
720		fam_ops->ic_mce = k8_ic_mce;
721		fam_ops->nb_mce = f10h_nb_mce;
722		break;
723
724	case 0x11:
725		fam_ops->dc_mce = k8_dc_mce;
726		fam_ops->ic_mce = k8_ic_mce;
727		fam_ops->nb_mce = f10h_nb_mce;
728		break;
729
730	case 0x12:
731		fam_ops->dc_mce = f12h_dc_mce;
732		fam_ops->ic_mce = k8_ic_mce;
733		fam_ops->nb_mce = nb_noop_mce;
734		break;
735
736	case 0x14:
737		nb_err_cpumask  = 0x3;
738		fam_ops->dc_mce = f14h_dc_mce;
739		fam_ops->ic_mce = f14h_ic_mce;
740		fam_ops->nb_mce = nb_noop_mce;
741		break;
742
743	case 0x15:
744		xec_mask = 0x1f;
745		fam_ops->dc_mce = f15h_dc_mce;
746		fam_ops->ic_mce = f15h_ic_mce;
747		break;
748
749	default:
750		printk(KERN_WARNING "Huh? What family is that: %d?!\n",
751				    boot_cpu_data.x86);
752		kfree(fam_ops);
753		return -EINVAL;
754	}
755
756	pr_info("MCE: In-kernel MCE decoding enabled.\n");
757
758	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
759
760	return 0;
761}
762early_initcall(mce_amd_init);
763
#ifdef MODULE
/* Module unload: unhook from the decoder chain, then free the ops. */
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
776