mce_amd.c revision fda7561f438aeddf074e2db0890e283195aa7779
#include <linux/module.h>
#include <linux/slab.h>

#include "mce_amd.h"

static struct amd_decoder_ops *fam_ops;

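/* mask of ErrCpu bits taken from NBSH; mce_amd_init() narrows it to 0x3 on F14h */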
static u8 nb_err_cpumask = 0xf;

static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);

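/*
 * GART TLB walk errors are usually benign, so their decoding is off by
 * default; callers such as the EDAC drivers can opt in here.
 */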
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
{
	if (nb_bus_decoder) {
		WARN_ON(nb_bus_decoder != f);

		nb_bus_decoder = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */
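
/*
 * Layout of the low word of MCi_STATUS these decoders parse (illustrative,
 * per the BKDG):
 *
 *   TLB errors:	0000 0000 0001 TTLL
 *   memory errors:	0000 0001 RRRR TTLL
 *   bus errors:	0000 1PPT RRRR IILL
 *
 * TT = transaction type, LL = cache level, RRRR = memory transaction type,
 * PP = participating processor, T = timeout, II = memory or I/O.
 */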

/* transaction type */
const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level */
const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type */
const char *rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
const char *to_msgs[] = { "no timeout", "timed out" };
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o */
const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
EXPORT_SYMBOL_GPL(ii_msgs);

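/*
 * Indexed from f10h_nb_mce(): extended error codes 0xa-0xc, 0xe and
 * 0x1c-0x1f are mapped onto this table by subtracting a per-range offset.
 */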
static const char *f10h_nb_mce_desc[] = {
	"HT link data error",
	"Protocol error (link, L3, probe filter, etc.)",
	"Parity error in NB-internal arrays",
	"Link Retry due to IO link transmission error",
	"L3 ECC data cache error",
	"ECC error in L3 cache tag",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

static bool f12h_dc_mce(u16 ec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = ec & 0x3;
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
		else
			ret = false;
	}
	return ret;
}

static bool f10h_dc_mce(u16 ec)
{
	u8 r4  = (ec >> 4) & 0xf;
	u8 ll  = ec & 0x3;

	if (r4 == R4_GEN && ll == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_dc_mce(ec);
}

static bool k8_dc_mce(u16 ec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_dc_mce(ec);
}

static bool f14h_dc_mce(u16 ec)
{
	u8 r4	 = (ec >> 4) & 0xf;
	u8 ll	 = ec & 0x3;
	u8 tt	 = (ec >> 2) & 0x3;
	u8 ii	 = tt;	/* for bus errors, II shares bits [3:2] with TT */
	bool ret = true;

	if (MEM_ERROR(ec)) {
		if (tt != TT_DATA || ll != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {
		if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

static void amd_decode_dc_mce(struct mce *m)
{
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & 0xf;

	pr_emerg(HW_ERR "Data Cache Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		u8 tt = (ec >> 2) & 0x3;

		if (tt != TT_DATA)
			goto wrong_dc_mce;

		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
		return;
	}

	if (!fam_ops->dc_mce(ec))
		goto wrong_dc_mce;

	return;

wrong_dc_mce:
	pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
}

static bool k8_ic_mce(u16 ec)
{
	u8 ll	 = ec & 0x3;
	u8 r4	 = (ec >> 4) & 0xf;
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == LL_L2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == LL_L1) {
		switch (r4) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

static bool f14h_ic_mce(u16 ec)
{
	u8 ll    = ec & 0x3;
	u8 tt    = (ec >> 2) & 0x3;
	u8 r4    = (ec >> 4) & 0xf;
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (tt != TT_INSTR || ll != LL_L1)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else
		ret = false;

	return ret;
}

static void amd_decode_ic_mce(struct mce *m)
{
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & 0xf;

	pr_emerg(HW_ERR "Instruction Cache Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (!fam_ops->ic_mce(ec))
		pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
}

static void amd_decode_bu_mce(struct mce *m)
{
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & 0xf;

	pr_emerg(HW_ERR "Bus Unit Error");

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont(": %s error in a Page Descriptor Cache or "
				"Guest TLB.\n", TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				RRRR_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 rrrr = (ec >> 4) & 0xf;

			if (rrrr >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					RRRR_MSG(ec));
			else if (rrrr <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", RRRR_MSG(ec));
			else
				goto wrong_bu_mce;
		} else
			goto wrong_bu_mce;
	} else
		goto wrong_bu_mce;

	return;

wrong_bu_mce:
	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
}

static void amd_decode_ls_mce(struct mce *m)
{
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & 0xf;

	if (boot_cpu_data.x86 == 0x14) {
		pr_emerg(HW_ERR "You shouldn't be seeing an LS MCE on this"
			 " CPU family, please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "Load Store Error");

	if (xec == 0x0) {
		u8 r4 = (ec >> 4) & 0xf;

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_ls_mce;

		pr_cont(" during %s.\n", RRRR_MSG(ec));
	} else
		goto wrong_ls_mce;

	return;

wrong_ls_mce:
	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
}

static bool k8_nb_mce(u16 ec, u8 xec)
{
	bool ret = true;

	switch (xec) {
	case 0x1:
		pr_cont("CRC error detected on HT link.\n");
		break;

	case 0x5:
		pr_cont("Invalid GART PTE entry during GART table walk.\n");
		break;

	case 0x6:
		pr_cont("Unsupported atomic RMW received from an IO link.\n");
		break;

	case 0x0:
	case 0x8:
		if (boot_cpu_data.x86 == 0x11)
			return false;

		pr_cont("DRAM ECC error detected on the NB.\n");
		break;

	case 0xd:
		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
		break;

	default:
		ret = false;
		break;
	}

	return ret;
}

static bool f10h_nb_mce(u16 ec, u8 xec)
{
	bool ret = true;
	u8 offset = 0;

	if (k8_nb_mce(ec, xec))
		return true;

	switch (xec) {
	case 0xa ... 0xc:
		offset = 10;
		break;

	case 0xe:
		offset = 11;
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			ret = false;

		goto out;

	case 0x1c ... 0x1f:
		offset = 24;
		break;

	default:
		ret = false;

		goto out;
	}

	pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);

out:
	return ret;
}

static bool nb_noop_mce(u16 ec, u8 xec)
{
	return false;
}

void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	u8 xec   = (m->status >> 16) & 0x1f;
	u16 ec   = m->status & 0xffff;
	u32 nbsh = (u32)(m->status >> 32);

	pr_emerg(HW_ERR "Northbridge Error, node %d", node_id);

	/*
	 * F10h revD and later can disable ErrCpu[3:0], so check the valid bit
	 * first; the field's encoding changed as well, so interpret it
	 * differently on those parts.
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
	} else {
		u8 assoc_cpus = nbsh & nb_err_cpumask;

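		/* report the highest-numbered core flagged in the ErrCpu mask */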
		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	pr_cont(": ");

	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);

static void amd_decode_fr_mce(struct mce *m)
{
	if (boot_cpu_data.x86 == 0xf ||
	    boot_cpu_data.x86 == 0x11)
		goto wrong_fr_mce;

	/* we have only one error signature so match all fields at once. */
	if ((m->status & 0xffff) == 0x0f0f) {
		pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
		return;
	}

wrong_fr_mce:
	pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
}

static inline void amd_decode_err_code(u16 ec)
{
	if (TLB_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
			 TT_MSG(ec), LL_MSG(ec));
	} else if (MEM_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
	} else if (BUS_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
			 "Participating Processor: %s\n",
			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
			  PP_MSG(ec));
	} else
		pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
}
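
/*
 * Worked example with a hypothetical status value: ec = 0x0145 is a memory
 * error with RRRR = 0x4 ("DWR"), TT = 0x1 ("DATA") and LL = 0x1 ("L1"), so
 * amd_decode_err_code() prints:
 *
 *   Transaction: DWR, Type: DATA, Cache Level: L1
 */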

/*
 * Filter out unwanted MCE signatures here.
 */
static bool amd_filter_mce(struct mce *m)
{
	u8 xec = (m->status >> 16) & 0x1f;

	/*
	 * NB GART TLB error reporting is disabled by default.
	 */
	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
		return true;

	return false;
}

int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	int node, ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);

	pr_cont("%sorrected error, other errors lost: %s, "
		 "CPU context corrupt: %s",
		 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
		 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
		 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));

	/* UECC (bit 45) and CECC (bit 46) of MCi_STATUS, decoded together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));

	pr_cont("\n");

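	/* bank assignment on these families: 0 DC, 1 IC, 2 BU, 3 LS, 4 NB, 5 FR */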
	switch (m->bank) {
	case 0:
		amd_decode_dc_mce(m);
		break;

	case 1:
		amd_decode_ic_mce(m);
		break;

	case 2:
		amd_decode_bu_mce(m);
		break;

	case 3:
		amd_decode_ls_mce(m);
		break;

	case 4:
		node = amd_get_nb_id(m->extcpu);
		amd_decode_nb_mce(node, m, 0);
		break;

	case 5:
		amd_decode_fr_mce(m);
		break;

	default:
		break;
	}

	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};

static int __init mce_amd_init(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
		return 0;

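	/*
	 * Decoding is only implemented for families 0xf-0x12 and for F14h
	 * models up to 0xf.
	 */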
	if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
	    (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
		return 0;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (boot_cpu_data.x86) {
	case 0xf:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = k8_nb_mce;
		break;

	case 0x10:
		fam_ops->dc_mce = f10h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x11:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x12:
		fam_ops->dc_mce = f12h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	case 0x14:
		nb_err_cpumask  = 0x3;
		fam_ops->dc_mce = f14h_dc_mce;
		fam_ops->ic_mce = f14h_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	default:
		printk(KERN_WARNING "Huh? What family is that: 0x%x?!\n",
				    boot_cpu_data.x86);
		kfree(fam_ops);
		return -EINVAL;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
