/* mce_amd.c — at revision 5ce88f6ea6bef929f59f9468413f922c9a486fa4 */
1#include <linux/module.h>
2#include <linux/slab.h>
3
4#include "mce_amd.h"
5
/* Per-family decode callbacks, selected once at init in mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask extracting the reporting core from NBSH; narrowed to 0x3 on F14h. */
static u8 nb_err_cpumask = 0xf;

/* When false, NB GART TLB errors (bank 4, xec 0x5) are filtered out. */
static bool report_gart_errors;
/* Optional external (EDAC) decoder invoked for NB DRAM ECC errors. */
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
12
/*
 * Enable/disable reporting of NB GART TLB errors (bank 4, extended error
 * code 0x5), which amd_filter_mce() drops by default.
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18
/*
 * Register an external decoder (the EDAC driver) which is called for NB
 * DRAM ECC errors in addition to the decoding done here.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24
25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
26{
27	if (nb_bus_decoder) {
28		WARN_ON(nb_bus_decoder != f);
29
30		nb_bus_decoder = NULL;
31	}
32}
33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34
35/*
36 * string representation for the different MCA reported error types, see F3x48
37 * or MSR0000_0411.
38 */
39
40/* transaction type */
41const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
42EXPORT_SYMBOL_GPL(tt_msgs);
43
44/* cache level */
45const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
46EXPORT_SYMBOL_GPL(ll_msgs);
47
48/* memory transaction type */
49const char *rrrr_msgs[] = {
50       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
51};
52EXPORT_SYMBOL_GPL(rrrr_msgs);
53
54/* participating processor */
55const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
56EXPORT_SYMBOL_GPL(pp_msgs);
57
58/* request timeout */
59const char *to_msgs[] = { "no timeout",	"timed out" };
60EXPORT_SYMBOL_GPL(to_msgs);
61
62/* memory or i/o */
63const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
64EXPORT_SYMBOL_GPL(ii_msgs);
65
/*
 * Descriptions for the F10h-specific NB extended error codes; indexed by
 * (xec - offset) in f10h_nb_mce().
 */
static const char *f10h_nb_mce_desc[] = {
	"HT link data error",
	"Protocol error (link, L3, probe filter, etc.)",
	"Parity error in NB-internal arrays",
	"Link Retry due to IO link transmission error",
	"L3 ECC data cache error",
	"ECC error in L3 cache tag",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
76
77static bool f10h_dc_mce(u16 ec)
78{
79	u8 r4  = (ec >> 4) & 0xf;
80	bool ret = false;
81
82	if (r4 == R4_GEN) {
83		pr_cont("during data scrub.\n");
84		return true;
85	}
86
87	if (MEM_ERROR(ec)) {
88		u8 ll = ec & 0x3;
89		ret = true;
90
91		if (ll == LL_L2)
92			pr_cont("during L1 linefill from L2.\n");
93		else if (ll == LL_L1)
94			pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
95		else
96			ret = false;
97	}
98	return ret;
99}
100
101static bool k8_dc_mce(u16 ec)
102{
103	if (BUS_ERROR(ec)) {
104		pr_cont("during system linefill.\n");
105		return true;
106	}
107
108	return f10h_dc_mce(ec);
109}
110
111static bool f14h_dc_mce(u16 ec)
112{
113	u8 r4	 = (ec >> 4) & 0xf;
114	u8 ll	 = ec & 0x3;
115	u8 tt	 = (ec >> 2) & 0x3;
116	u8 ii	 = tt;
117	bool ret = true;
118
119	if (MEM_ERROR(ec)) {
120
121		if (tt != TT_DATA || ll != LL_L1)
122			return false;
123
124		switch (r4) {
125		case R4_DRD:
126		case R4_DWR:
127			pr_cont("Data/Tag parity error due to %s.\n",
128				(r4 == R4_DRD ? "load/hw prf" : "store"));
129			break;
130		case R4_EVICT:
131			pr_cont("Copyback parity error on a tag miss.\n");
132			break;
133		case R4_SNOOP:
134			pr_cont("Tag parity error during snoop.\n");
135			break;
136		default:
137			ret = false;
138		}
139	} else if (BUS_ERROR(ec)) {
140
141		if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
142			return false;
143
144		pr_cont("System read data error on a ");
145
146		switch (r4) {
147		case R4_RD:
148			pr_cont("TLB reload.\n");
149			break;
150		case R4_DWR:
151			pr_cont("store.\n");
152			break;
153		case R4_DRD:
154			pr_cont("load.\n");
155			break;
156		default:
157			ret = false;
158		}
159	} else {
160		ret = false;
161	}
162
163	return ret;
164}
165
166static void amd_decode_dc_mce(struct mce *m)
167{
168	u16 ec = m->status & 0xffff;
169	u8 xec = (m->status >> 16) & 0xf;
170
171	pr_emerg(HW_ERR "Data Cache Error: ");
172
173	/* TLB error signatures are the same across families */
174	if (TLB_ERROR(ec)) {
175		u8 tt = (ec >> 2) & 0x3;
176
177		if (tt == TT_DATA) {
178			pr_cont("%s TLB %s.\n", LL_MSG(ec),
179				(xec ? "multimatch" : "parity error"));
180			return;
181		}
182		else
183			goto wrong_dc_mce;
184	}
185
186	if (!fam_ops->dc_mce(ec))
187		goto wrong_dc_mce;
188
189	return;
190
191wrong_dc_mce:
192	pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
193}
194
195static bool k8_ic_mce(u16 ec)
196{
197	u8 ll	 = ec & 0x3;
198	u8 r4	 = (ec >> 4) & 0xf;
199	bool ret = true;
200
201	if (!MEM_ERROR(ec))
202		return false;
203
204	if (ll == 0x2)
205		pr_cont("during a linefill from L2.\n");
206	else if (ll == 0x1) {
207		switch (r4) {
208		case R4_IRD:
209			pr_cont("Parity error during data load.\n");
210			break;
211
212		case R4_EVICT:
213			pr_cont("Copyback Parity/Victim error.\n");
214			break;
215
216		case R4_SNOOP:
217			pr_cont("Tag Snoop error.\n");
218			break;
219
220		default:
221			ret = false;
222			break;
223		}
224	} else
225		ret = false;
226
227	return ret;
228}
229
230static bool f14h_ic_mce(u16 ec)
231{
232	u8 ll    = ec & 0x3;
233	u8 tt    = (ec >> 2) & 0x3;
234	u8 r4  = (ec >> 4) & 0xf;
235	bool ret = true;
236
237	if (MEM_ERROR(ec)) {
238		if (tt != 0 || ll != 1)
239			ret = false;
240
241		if (r4 == R4_IRD)
242			pr_cont("Data/tag array parity error for a tag hit.\n");
243		else if (r4 == R4_SNOOP)
244			pr_cont("Tag error during snoop/victimization.\n");
245		else
246			ret = false;
247	}
248	return ret;
249}
250
251static void amd_decode_ic_mce(struct mce *m)
252{
253	u16 ec = m->status & 0xffff;
254	u8 xec = (m->status >> 16) & 0xf;
255
256	pr_emerg(HW_ERR "Instruction Cache Error: ");
257
258	if (TLB_ERROR(ec))
259		pr_cont("%s TLB %s.\n", LL_MSG(ec),
260			(xec ? "multimatch" : "parity error"));
261	else if (BUS_ERROR(ec)) {
262		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58)));
263
264		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
265	} else if (fam_ops->ic_mce(ec))
266		;
267	else
268		pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
269}
270
271static void amd_decode_bu_mce(struct mce *m)
272{
273	u32 ec = m->status & 0xffff;
274	u32 xec = (m->status >> 16) & 0xf;
275
276	pr_emerg(HW_ERR "Bus Unit Error");
277
278	if (xec == 0x1)
279		pr_cont(" in the write data buffers.\n");
280	else if (xec == 0x3)
281		pr_cont(" in the victim data buffers.\n");
282	else if (xec == 0x2 && MEM_ERROR(ec))
283		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
284	else if (xec == 0x0) {
285		if (TLB_ERROR(ec))
286			pr_cont(": %s error in a Page Descriptor Cache or "
287				"Guest TLB.\n", TT_MSG(ec));
288		else if (BUS_ERROR(ec))
289			pr_cont(": %s/ECC error in data read from NB: %s.\n",
290				RRRR_MSG(ec), PP_MSG(ec));
291		else if (MEM_ERROR(ec)) {
292			u8 rrrr = (ec >> 4) & 0xf;
293
294			if (rrrr >= 0x7)
295				pr_cont(": %s error during data copyback.\n",
296					RRRR_MSG(ec));
297			else if (rrrr <= 0x1)
298				pr_cont(": %s parity/ECC error during data "
299					"access from L2.\n", RRRR_MSG(ec));
300			else
301				goto wrong_bu_mce;
302		} else
303			goto wrong_bu_mce;
304	} else
305		goto wrong_bu_mce;
306
307	return;
308
309wrong_bu_mce:
310	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
311}
312
313static void amd_decode_ls_mce(struct mce *m)
314{
315	u16 ec = m->status & 0xffff;
316	u8 xec = (m->status >> 16) & 0xf;
317
318	if (boot_cpu_data.x86 == 0x14) {
319		pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
320			 " please report on LKML.\n");
321		return;
322	}
323
324	pr_emerg(HW_ERR "Load Store Error");
325
326	if (xec == 0x0) {
327		u8 r4 = (ec >> 4) & 0xf;
328
329		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
330			goto wrong_ls_mce;
331
332		pr_cont(" during %s.\n", RRRR_MSG(ec));
333	} else
334		goto wrong_ls_mce;
335
336	return;
337
338wrong_ls_mce:
339	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
340}
341
342static bool k8_nb_mce(u16 ec, u8 xec)
343{
344	bool ret = true;
345
346	switch (xec) {
347	case 0x1:
348		pr_cont("CRC error detected on HT link.\n");
349		break;
350
351	case 0x5:
352		pr_cont("Invalid GART PTE entry during GART table walk.\n");
353		break;
354
355	case 0x6:
356		pr_cont("Unsupported atomic RMW received from an IO link.\n");
357		break;
358
359	case 0x0:
360	case 0x8:
361		pr_cont("DRAM ECC error detected on the NB.\n");
362		break;
363
364	case 0xd:
365		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
366		break;
367
368	default:
369		ret = false;
370		break;
371	}
372
373	return ret;
374}
375
376static bool f10h_nb_mce(u16 ec, u8 xec)
377{
378	bool ret = true;
379	u8 offset = 0;
380
381	if (k8_nb_mce(ec, xec))
382		return true;
383
384	switch(xec) {
385	case 0xa ... 0xc:
386		offset = 10;
387		break;
388
389	case 0xe:
390		offset = 11;
391		break;
392
393	case 0xf:
394		if (TLB_ERROR(ec))
395			pr_cont("GART Table Walk data error.\n");
396		else if (BUS_ERROR(ec))
397			pr_cont("DMA Exclusion Vector Table Walk error.\n");
398		else
399			ret = false;
400
401		goto out;
402		break;
403
404	case 0x1c ... 0x1f:
405		offset = 24;
406		break;
407
408	default:
409		ret = false;
410
411		goto out;
412		break;
413	}
414
415	pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
416
417out:
418	return ret;
419}
420
/*
 * No NB extended error codes are decoded for F14h here — presumably none
 * are reported in this format on that family (confirm against the F14h
 * BKDG); reject everything.
 */
static bool f14h_nb_mce(u16 ec, u8 xec)
{
	return false;
}
425
/*
 * Decode a Northbridge MCE. Exported because the EDAC driver calls it
 * directly as well.
 *
 * @node_id: NUMA node the error was reported on
 * @m:       the MCE to decode
 * @nbcfg:   NB configuration register value, passed through to the
 *           registered DRAM ECC decoder
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	u8 xec   = (m->status >> 16) & 0x1f;
	u16 ec   = m->status & 0xffff;
	u32 nbsh = (u32)(m->status >> 32);	/* NB status: high status half */

	pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);

	/*
	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
	} else {
		/* older encoding: one bit per associated core */
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	/* extended error codes common to all families */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	/* family-specific extended error codes */
	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	/* hand DRAM ECC errors (xec 0x0/0x8) to the registered EDAC decoder */
	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
487
488static void amd_decode_fr_mce(struct mce *m)
489{
490	/* we have only one error signature so match all fields at once. */
491	if ((m->status & 0xffff) == 0x0f0f)
492		pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n");
493	else
494		pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
495}
496
497static inline void amd_decode_err_code(u16 ec)
498{
499	if (TLB_ERROR(ec)) {
500		pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
501			 TT_MSG(ec), LL_MSG(ec));
502	} else if (MEM_ERROR(ec)) {
503		pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
504			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
505	} else if (BUS_ERROR(ec)) {
506		pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
507			 "Participating Processor: %s\n",
508			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
509			  PP_MSG(ec));
510	} else
511		pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
512}
513
514/*
515 * Filter out unwanted MCE signatures here.
516 */
517static bool amd_filter_mce(struct mce *m)
518{
519	u8 xec = (m->status >> 16) & 0x1f;
520
521	/*
522	 * NB GART TLB error reporting is disabled by default.
523	 */
524	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
525		return true;
526
527	return false;
528}
529
/*
 * Main MCE decoder, registered on the x86 MCE decoder notifier chain.
 * Prints a human-readable decoding of the struct mce passed in @data and
 * always returns NOTIFY_STOP.
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	int node, ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);

	pr_cont("%sorrected error, other errors lost: %s, "
		 "CPU context corrupt: %s",
		 ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
		 ((m->status & MCI_STATUS_OVER) ? "yes"  : "no"),
		 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));

	/* CECC/UECC: status bits 46:45 (bits 14:13 of the high half), together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));

	pr_cont("\n");

	/* one decoder per MCA bank */
	switch (m->bank) {
	case 0:
		amd_decode_dc_mce(m);
		break;

	case 1:
		amd_decode_ic_mce(m);
		break;

	case 2:
		amd_decode_bu_mce(m);
		break;

	case 3:
		amd_decode_ls_mce(m);
		break;

	case 4:
		node = amd_get_nb_id(m->extcpu);
		amd_decode_nb_mce(node, m, 0);
		break;

	case 5:
		amd_decode_fr_mce(m);
		break;

	default:
		break;
	}

	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);
588
/* Hooked into the MCE decoder chain at init; removed again on module exit. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
592
593static int __init mce_amd_init(void)
594{
595	/*
596	 * We can decode MCEs for K8, F10h and F11h CPUs:
597	 */
598	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
599		return 0;
600
601	if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
602		return 0;
603
604	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
605	if (!fam_ops)
606		return -ENOMEM;
607
608	switch (boot_cpu_data.x86) {
609	case 0xf:
610		fam_ops->dc_mce = k8_dc_mce;
611		fam_ops->ic_mce = k8_ic_mce;
612		fam_ops->nb_mce = k8_nb_mce;
613		break;
614
615	case 0x10:
616		fam_ops->dc_mce = f10h_dc_mce;
617		fam_ops->ic_mce = k8_ic_mce;
618		fam_ops->nb_mce = f10h_nb_mce;
619		break;
620
621	case 0x14:
622		nb_err_cpumask  = 0x3;
623		fam_ops->dc_mce = f14h_dc_mce;
624		fam_ops->ic_mce = f14h_ic_mce;
625		fam_ops->nb_mce = f14h_nb_mce;
626		break;
627
628	default:
629		printk(KERN_WARNING "Huh? What family is that: %d?!\n",
630				    boot_cpu_data.x86);
631		kfree(fam_ops);
632		return -EINVAL;
633	}
634
635	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
636
637	return 0;
638}
639early_initcall(mce_amd_init);
640
#ifdef MODULE
/* Undo mce_amd_init(): drop off the decoder chain and free the ops. */
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
653