edac_mc.c revision 53f2d02898755d1b24bde1975e202815d29fdb81
1/*
2 * edac_mc kernel module
3 * (C) 2005, 2006 Linux Networx (http://lnxi.com)
4 * This file may be distributed under the terms of the
5 * GNU General Public License.
6 *
7 * Written by Thayne Harbaugh
8 * Based on work by Dan Hollis <goemon at anime dot net> and others.
9 *	http://www.anime.net/~goemon/linux-ecc/
10 *
11 * Modified by Dave Peterson and Doug Thompson
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/proc_fs.h>
17#include <linux/kernel.h>
18#include <linux/types.h>
19#include <linux/smp.h>
20#include <linux/init.h>
21#include <linux/sysctl.h>
22#include <linux/highmem.h>
23#include <linux/timer.h>
24#include <linux/slab.h>
25#include <linux/jiffies.h>
26#include <linux/spinlock.h>
27#include <linux/list.h>
28#include <linux/ctype.h>
29#include <linux/edac.h>
30#include <linux/bitops.h>
31#include <asm/uaccess.h>
32#include <asm/page.h>
33#include <asm/edac.h>
34#include "edac_core.h"
35#include "edac_module.h"
36
37#define CREATE_TRACE_POINTS
38#define TRACE_INCLUDE_PATH ../../include/ras
39#include <ras/ras_event.h>
40
41/* lock protecting the list of memory controllers */
42static DEFINE_MUTEX(mem_ctls_mutex);
43static LIST_HEAD(mc_devices);
44
45#ifdef CONFIG_EDAC_DEBUG
46
47static void edac_mc_dump_channel(struct rank_info *chan)
48{
49	debugf4("\tchannel = %p\n", chan);
50	debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
51	debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
52	debugf4("\tchannel->dimm = %p\n", chan->dimm);
53}
54
55static void edac_mc_dump_dimm(struct dimm_info *dimm)
56{
57	int i;
58
59	debugf4("\tdimm = %p\n", dimm);
60	debugf4("\tdimm->label = '%s'\n", dimm->label);
61	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
62	debugf4("\tdimm location ");
63	for (i = 0; i < dimm->mci->n_layers; i++) {
64		printk(KERN_CONT "%d", dimm->location[i]);
65		if (i < dimm->mci->n_layers - 1)
66			printk(KERN_CONT ".");
67	}
68	printk(KERN_CONT "\n");
69	debugf4("\tdimm->grain = %d\n", dimm->grain);
70	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
71}
72
73static void edac_mc_dump_csrow(struct csrow_info *csrow)
74{
75	debugf4("\tcsrow = %p\n", csrow);
76	debugf4("\tcsrow->csrow_idx = %d\n", csrow->csrow_idx);
77	debugf4("\tcsrow->first_page = 0x%lx\n", csrow->first_page);
78	debugf4("\tcsrow->last_page = 0x%lx\n", csrow->last_page);
79	debugf4("\tcsrow->page_mask = 0x%lx\n", csrow->page_mask);
80	debugf4("\tcsrow->nr_channels = %d\n", csrow->nr_channels);
81	debugf4("\tcsrow->channels = %p\n", csrow->channels);
82	debugf4("\tcsrow->mci = %p\n\n", csrow->mci);
83}
84
85static void edac_mc_dump_mci(struct mem_ctl_info *mci)
86{
87	debugf3("\tmci = %p\n", mci);
88	debugf3("\tmci->mtype_cap = %lx\n", mci->mtype_cap);
89	debugf3("\tmci->edac_ctl_cap = %lx\n", mci->edac_ctl_cap);
90	debugf3("\tmci->edac_cap = %lx\n", mci->edac_cap);
91	debugf4("\tmci->edac_check = %p\n", mci->edac_check);
92	debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
93		mci->nr_csrows, mci->csrows);
94	debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
95		mci->tot_dimms, mci->dimms);
96	debugf3("\tdev = %p\n", mci->dev);
97	debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
98	debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
99}
100
101#endif				/* CONFIG_EDAC_DEBUG */
102
103/*
104 * keep those in sync with the enum mem_type
105 */
106const char *edac_mem_types[] = {
107	"Empty csrow",
108	"Reserved csrow type",
109	"Unknown csrow type",
110	"Fast page mode RAM",
111	"Extended data out RAM",
112	"Burst Extended data out RAM",
113	"Single data rate SDRAM",
114	"Registered single data rate SDRAM",
115	"Double data rate SDRAM",
116	"Registered Double data rate SDRAM",
117	"Rambus DRAM",
118	"Unbuffered DDR2 RAM",
119	"Fully buffered DDR2",
120	"Registered DDR2 RAM",
121	"Rambus XDR",
122	"Unbuffered DDR3 RAM",
123	"Registered DDR3 RAM",
124};
125EXPORT_SYMBOL_GPL(edac_mem_types);
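
/*
 * Illustrative sketch of how the table above is meant to be indexed (the
 * DIMM and the chosen type are hypothetical; MEM_DDR3 is one of the
 * enum mem_type values from <linux/edac.h>):
 *
 *	dimm->mtype = MEM_DDR3;
 *	edac_mc_printk(mci, KERN_INFO, "DIMM type: %s\n",
 *		       edac_mem_types[dimm->mtype]);
 */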
126
127/**
128 * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
129 * @p:		pointer to a pointer with the memory offset to be used. At
130 *		return, this will be incremented to point to the next offset
131 * @size:	Size of the data structure to be reserved
132 * @n_elems:	Number of elements that should be reserved
133 *
134 * If 'size' is a constant, the compiler will optimize this whole function
135 * down to either a no-op or the addition of a constant to the value of '*p'.
136 *
137 * The 'p' pointer is needed to keep advancing further in memory to the
138 * proper offsets when allocating a struct together with all of its
139 * embedded structs, as edac_device_alloc_ctl_info() and edac_mc_alloc()
140 * do, for example.
141 *
142 * At return, the pointer 'p' will be incremented to be used on a next call
143 * to this function.
144 */
145void *edac_align_ptr(void **p, unsigned size, int n_elems)
146{
147	unsigned align, r;
148	void *ptr = *p;
149
150	*p += size * n_elems;
151
152	/*
153	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
154	 * 'size'.  Adjust 'p' so that its alignment is at least as
155	 * stringent as what the compiler would provide for X and return
156	 * the aligned result.
157	 * Here we assume that the alignment of a "long long" is the most
158	 * stringent alignment that the compiler will ever provide by default.
159	 * As far as I know, this is a reasonable assumption.
160	 */
161	if (size > sizeof(long))
162		align = sizeof(long long);
163	else if (size > sizeof(int))
164		align = sizeof(long);
165	else if (size > sizeof(short))
166		align = sizeof(int);
167	else if (size > sizeof(char))
168		align = sizeof(short);
169	else
170		return (char *)ptr;
171
172	r = size % align;
173
174	if (r == 0)
175		return (char *)ptr;
176
177	*p += align - r;
178
179	return (void *)(((unsigned long)ptr) + align - r);
180}
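
/*
 * Illustrative sketch of the intended two-pass use of edac_align_ptr()
 * (the struct names below are hypothetical).  A first pass over a NULL
 * base computes the aligned offsets and the total size, the chunk is
 * allocated once, and each offset is then rebased inside it:
 *
 *	void *ptr = NULL, *base;
 *	struct foo *foo;
 *	struct bar *bar;
 *	unsigned size;
 *
 *	foo = edac_align_ptr(&ptr, sizeof(*foo), 1);
 *	bar = edac_align_ptr(&ptr, sizeof(*bar), n_bars);
 *	size = (unsigned long)ptr;
 *
 *	base = kzalloc(size, GFP_KERNEL);
 *	if (!base)
 *		return NULL;
 *
 *	foo = (struct foo *)((char *)base + (unsigned long)foo);
 *	bar = (struct bar *)((char *)base + (unsigned long)bar);
 *
 * edac_mc_alloc() below follows exactly this pattern for its mci, layer,
 * csrow, channel, dimm and per-layer error counter arrays.
 */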
181
182/**
183 * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
184 * @mc_num:		Memory controller number
185 * @n_layers:		Number of MC hierarchy layers
186 * @layers:		Describes each layer as seen by the Memory Controller
187 * @sz_pvt:		Size of private storage needed
188 *
189 *
190 * Everything is kmalloc'ed as one big chunk - more efficient.
191 * It can only be used if all structures have the same lifetime - otherwise
192 * you have to allocate and initialize your own structures.
193 *
194 * Use edac_mc_free() to free mc structures allocated by this function.
195 *
196 * NOTE: drivers handle multi-rank memories in different ways: in some
197 * drivers, one multi-rank memory stick is mapped as one entry, while, in
198 * others, a single multi-rank memory stick would be mapped into several
199 * entries. Currently, this function will allocate multiple struct dimm_info
200 * in such scenarios, as grouping multiple ranks would require driver changes.
201 *
202 * Returns:
203 *	On failure: NULL
204 *	On success: struct mem_ctl_info pointer
205 */
206struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
207				   unsigned n_layers,
208				   struct edac_mc_layer *layers,
209				   unsigned sz_pvt)
210{
211	struct mem_ctl_info *mci;
212	struct edac_mc_layer *layer;
213	struct csrow_info *csi, *csr;
214	struct rank_info *chi, *chp, *chan;
215	struct dimm_info *dimm;
216	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
217	unsigned pos[EDAC_MAX_LAYERS];
218	unsigned size, tot_dimms = 1, count = 1;
219	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
220	void *pvt, *p, *ptr = NULL;
221	int i, j, err, row, chn, n, len;
222	bool per_rank = false;
223
224	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
225	/*
226	 * Calculate the total amount of dimms and csrows/cschannels while
227	 * in the old API emulation mode
228	 */
229	for (i = 0; i < n_layers; i++) {
230		tot_dimms *= layers[i].size;
231		if (layers[i].is_virt_csrow)
232			tot_csrows *= layers[i].size;
233		else
234			tot_channels *= layers[i].size;
235
236		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
237			per_rank = true;
238	}
239
240	/* Figure out the offsets of the various items from the start of an mc
241	 * structure.  We want the alignment of each item to be at least as
242	 * stringent as what the compiler would provide if we could simply
243	 * hardcode everything into a single struct.
244	 */
245	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
246	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
247	csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows);
248	chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels);
249	dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms);
250	for (i = 0; i < n_layers; i++) {
251		count *= layers[i].size;
252		debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
253		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
254		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
255		tot_errcount += 2 * count;
256	}
257
258	debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
259	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
260	size = ((unsigned long)pvt) + sz_pvt;
261
262	debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
263		__func__, size,
264		tot_dimms,
265		per_rank ? "ranks" : "dimms",
266		tot_csrows * tot_channels);
267	mci = kzalloc(size, GFP_KERNEL);
268	if (mci == NULL)
269		return NULL;
270
271	/* Adjust pointers so they point within the memory we just allocated
272	 * rather than an imaginary chunk of memory located at address 0.
273	 */
274	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
275	csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
276	chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
277	dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
278	for (i = 0; i < n_layers; i++) {
279		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
280		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
281	}
282	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
283
284	/* setup index and various internal pointers */
285	mci->mc_idx = mc_num;
286	mci->csrows = csi;
287	mci->dimms  = dimm;
288	mci->tot_dimms = tot_dimms;
289	mci->pvt_info = pvt;
290	mci->n_layers = n_layers;
291	mci->layers = layer;
292	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
293	mci->nr_csrows = tot_csrows;
294	mci->num_cschannel = tot_channels;
295	mci->mem_is_per_rank = per_rank;
296
297	/*
298	 * Fill the csrow struct
299	 */
300	for (row = 0; row < tot_csrows; row++) {
301		csr = &csi[row];
302		csr->csrow_idx = row;
303		csr->mci = mci;
304		csr->nr_channels = tot_channels;
305		chp = &chi[row * tot_channels];
306		csr->channels = chp;
307
308		for (chn = 0; chn < tot_channels; chn++) {
309			chan = &chp[chn];
310			chan->chan_idx = chn;
311			chan->csrow = csr;
312		}
313	}
314
315	/*
316	 * Fill the dimm struct
317	 */
318	memset(&pos, 0, sizeof(pos));
319	row = 0;
320	chn = 0;
321	debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
322		per_rank ? "ranks" : "dimms");
323	for (i = 0; i < tot_dimms; i++) {
324		chan = &csi[row].channels[chn];
325		dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers,
326			       pos[0], pos[1], pos[2]);
327		dimm->mci = mci;
328
329		debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
330			i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
331			pos[0], pos[1], pos[2], row, chn);
332
333		/*
334		 * Initialize the default DIMM label and copy its location.
335		 */
336		len = sizeof(dimm->label);
337		p = dimm->label;
338		n = snprintf(p, len, "mc#%u", mc_num);
339		p += n;
340		len -= n;
341		for (j = 0; j < n_layers; j++) {
342			n = snprintf(p, len, "%s#%u",
343				     edac_layer_name[layers[j].type],
344				     pos[j]);
345			p += n;
346			len -= n;
347			dimm->location[j] = pos[j];
348
349			if (len <= 0)
350				break;
351		}
352
353		/* Link it to the csrows old API data */
354		chan->dimm = dimm;
355		dimm->csrow = row;
356		dimm->cschannel = chn;
357
358		/* Increment csrow location */
359		row++;
360		if (row == tot_csrows) {
361			row = 0;
362			chn++;
363		}
364
365		/* Increment dimm location */
366		for (j = n_layers - 1; j >= 0; j--) {
367			pos[j]++;
368			if (pos[j] < layers[j].size)
369				break;
370			pos[j] = 0;
371		}
372	}
373
374	mci->op_state = OP_ALLOC;
375	INIT_LIST_HEAD(&mci->grp_kobj_list);
376
377	/*
378	 * Initialize the 'root' kobj for the edac_mc controller
379	 */
380	err = edac_mc_register_sysfs_main_kobj(mci);
381	if (err) {
382		kfree(mci);
383		return NULL;
384	}
385
386	/* At this point the root kobj is valid.  In order to 'free' the
387	 * object, the function
388	 *	edac_mc_unregister_sysfs_main_kobj()
389	 * must be called; it unregisters the kobj and the actual free
390	 * then happens in the kobject release callback.
391	 */
392
393	return mci;
394}
395EXPORT_SYMBOL_GPL(edac_mc_alloc);
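
/*
 * Illustrative sketch of an edac_mc_alloc() call for a controller with a
 * csrow x channel hierarchy (the sizes and the private struct are
 * hypothetical, not taken from any particular driver):
 *
 *	struct edac_mc_layer layers[2];
 *	struct mem_ctl_info *mci;
 *
 *	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
 *	layers[0].size = 4;
 *	layers[0].is_virt_csrow = true;
 *	layers[1].type = EDAC_MC_LAYER_CHANNEL;
 *	layers[1].size = 2;
 *	layers[1].is_virt_csrow = false;
 *
 *	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
 *			    sizeof(struct my_pvt_info));
 *	if (!mci)
 *		return -ENOMEM;
 */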
396
397/**
398 * edac_mc_free
399 *	'Free' a previously allocated 'mci' structure
400 * @mci: pointer to a struct mem_ctl_info structure
401 */
402void edac_mc_free(struct mem_ctl_info *mci)
403{
404	debugf1("%s()\n", __func__);
405
406	edac_mc_unregister_sysfs_main_kobj(mci);
407
408	/* free the mci instance memory here */
409	kfree(mci);
410}
411EXPORT_SYMBOL_GPL(edac_mc_free);
412
413
414/**
415 * find_mci_by_dev
416 *
417 *	scan list of controllers looking for the one that manages
418 *	the 'dev' device
419 * @dev: pointer to a struct device related to the MCI
420 */
421struct mem_ctl_info *find_mci_by_dev(struct device *dev)
422{
423	struct mem_ctl_info *mci;
424	struct list_head *item;
425
426	debugf3("%s()\n", __func__);
427
428	list_for_each(item, &mc_devices) {
429		mci = list_entry(item, struct mem_ctl_info, link);
430
431		if (mci->dev == dev)
432			return mci;
433	}
434
435	return NULL;
436}
437EXPORT_SYMBOL_GPL(find_mci_by_dev);
438
439/*
440 * handler for EDAC to check if NMI type handler has asserted interrupt
441 */
442static int edac_mc_assert_error_check_and_clear(void)
443{
444	int old_state;
445
446	if (edac_op_state == EDAC_OPSTATE_POLL)
447		return 1;
448
449	old_state = edac_err_assert;
450	edac_err_assert = 0;
451
452	return old_state;
453}
454
455/*
456 * edac_mc_workq_function
457 *	performs the operation scheduled by a workq request
458 */
459static void edac_mc_workq_function(struct work_struct *work_req)
460{
461	struct delayed_work *d_work = to_delayed_work(work_req);
462	struct mem_ctl_info *mci = to_edac_mem_ctl_work(d_work);
463
464	mutex_lock(&mem_ctls_mutex);
465
466	/* if this control struct has moved to the offline state, we are done */
467	if (mci->op_state == OP_OFFLINE) {
468		mutex_unlock(&mem_ctls_mutex);
469		return;
470	}
471
472	/* Only poll controllers that are running polled and have a check */
473	if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
474		mci->edac_check(mci);
475
476	mutex_unlock(&mem_ctls_mutex);
477
478	/* Reschedule */
479	queue_delayed_work(edac_workqueue, &mci->work,
480			msecs_to_jiffies(edac_mc_get_poll_msec()));
481}
482
483/*
484 * edac_mc_workq_setup
485 *	initialize a workq item for this mci
486 *	passing in the new delay period in msec
487 *
488 *	locking model:
489 *
490 *		called with the mem_ctls_mutex held
491 */
492static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
493{
494	debugf0("%s()\n", __func__);
495
496	/* if this instance is not in the POLL state, then simply return */
497	if (mci->op_state != OP_RUNNING_POLL)
498		return;
499
500	INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
501	queue_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
502}
503
504/*
505 * edac_mc_workq_teardown
506 *	stop the workq processing on this mci
507 *
508 *	locking model:
509 *
510 *		called WITHOUT lock held
511 */
512static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
513{
514	int status;
515
516	if (mci->op_state != OP_RUNNING_POLL)
517		return;
518
519	status = cancel_delayed_work(&mci->work);
520	if (status == 0) {
521		debugf0("%s() not canceled, flush the queue\n",
522			__func__);
523
524		/* workq instance might be running, wait for it */
525		flush_workqueue(edac_workqueue);
526	}
527}
528
529/*
530 * edac_mc_reset_delay_period(int value)
531 *
532 *	user space has updated our poll period value, need to
533 *	reset our workq delays
534 */
535void edac_mc_reset_delay_period(int value)
536{
537	struct mem_ctl_info *mci;
538	struct list_head *item;
539
540	mutex_lock(&mem_ctls_mutex);
541
542	/* scan the list and turn off all workq timers, doing so under lock
543	 */
544	list_for_each(item, &mc_devices) {
545		mci = list_entry(item, struct mem_ctl_info, link);
546
547		if (mci->op_state == OP_RUNNING_POLL)
548			cancel_delayed_work(&mci->work);
549	}
550
551	mutex_unlock(&mem_ctls_mutex);
552
553
554	/* re-walk the list, and reset the poll delay */
555	mutex_lock(&mem_ctls_mutex);
556
557	list_for_each(item, &mc_devices) {
558		mci = list_entry(item, struct mem_ctl_info, link);
559
560		edac_mc_workq_setup(mci, (unsigned long) value);
561	}
562
563	mutex_unlock(&mem_ctls_mutex);
564}
565
566
567
568/* Return 0 on success, 1 on failure.
569 * Before calling this function, caller must
570 * assign a unique value to mci->mc_idx.
571 *
572 *	locking model:
573 *
574 *		called with the mem_ctls_mutex lock held
575 */
576static int add_mc_to_global_list(struct mem_ctl_info *mci)
577{
578	struct list_head *item, *insert_before;
579	struct mem_ctl_info *p;
580
581	insert_before = &mc_devices;
582
583	p = find_mci_by_dev(mci->dev);
584	if (unlikely(p != NULL))
585		goto fail0;
586
587	list_for_each(item, &mc_devices) {
588		p = list_entry(item, struct mem_ctl_info, link);
589
590		if (p->mc_idx >= mci->mc_idx) {
591			if (unlikely(p->mc_idx == mci->mc_idx))
592				goto fail1;
593
594			insert_before = item;
595			break;
596		}
597	}
598
599	list_add_tail_rcu(&mci->link, insert_before);
600	atomic_inc(&edac_handlers);
601	return 0;
602
603fail0:
604	edac_printk(KERN_WARNING, EDAC_MC,
605		"%s (%s) %s %s already assigned %d\n", dev_name(p->dev),
606		edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx);
607	return 1;
608
609fail1:
610	edac_printk(KERN_WARNING, EDAC_MC,
611		"bug in low-level driver: attempt to assign\n"
612		"    duplicate mc_idx %d in %s()\n", p->mc_idx, __func__);
613	return 1;
614}
615
616static void del_mc_from_global_list(struct mem_ctl_info *mci)
617{
618	atomic_dec(&edac_handlers);
619	list_del_rcu(&mci->link);
620
621	/* these are for safe removal of devices from global list while
622	 * NMI handlers may be traversing list
623	 */
624	synchronize_rcu();
625	INIT_LIST_HEAD(&mci->link);
626}
627
628/**
629 * edac_mc_find: Search for a mem_ctl_info structure whose index is 'idx'.
630 *
631 * If found, return a pointer to the structure.
632 * Else return NULL.
633 *
634 * Caller must hold mem_ctls_mutex.
635 */
636struct mem_ctl_info *edac_mc_find(int idx)
637{
638	struct list_head *item;
639	struct mem_ctl_info *mci;
640
641	list_for_each(item, &mc_devices) {
642		mci = list_entry(item, struct mem_ctl_info, link);
643
644		if (mci->mc_idx >= idx) {
645			if (mci->mc_idx == idx)
646				return mci;
647
648			break;
649		}
650	}
651
652	return NULL;
653}
654EXPORT_SYMBOL(edac_mc_find);
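
/*
 * Illustrative sketch of a lookup by index (the index and the error
 * handling are hypothetical):
 *
 *	struct mem_ctl_info *mci = edac_mc_find(0);
 *
 *	if (!mci)
 *		return -ENODEV;
 */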
655
656/**
657 * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
658 *                 create sysfs entries associated with mci structure
659 * @mci: pointer to the mci structure to be added to the list
660 *
661 * Return:
662 *	0	Success
663 *	!0	Failure
664 */
665
666/* FIXME - should a warning be printed if no error detection? correction? */
667int edac_mc_add_mc(struct mem_ctl_info *mci)
668{
669	debugf0("%s()\n", __func__);
670
671#ifdef CONFIG_EDAC_DEBUG
672	if (edac_debug_level >= 3)
673		edac_mc_dump_mci(mci);
674
675	if (edac_debug_level >= 4) {
676		int i;
677
678		for (i = 0; i < mci->nr_csrows; i++) {
679			int j;
680
681			edac_mc_dump_csrow(&mci->csrows[i]);
682			for (j = 0; j < mci->csrows[i].nr_channels; j++)
683				edac_mc_dump_channel(&mci->csrows[i].
684						channels[j]);
685		}
686		for (i = 0; i < mci->tot_dimms; i++)
687			edac_mc_dump_dimm(&mci->dimms[i]);
688	}
689#endif
690	mutex_lock(&mem_ctls_mutex);
691
692	if (add_mc_to_global_list(mci))
693		goto fail0;
694
695	/* set load time so that error rate can be tracked */
696	mci->start_time = jiffies;
697
698	if (edac_create_sysfs_mci_device(mci)) {
699		edac_mc_printk(mci, KERN_WARNING,
700			"failed to create sysfs device\n");
701		goto fail1;
702	}
703
704	/* If there IS a check routine, then we are running POLLED */
705	if (mci->edac_check != NULL) {
706		/* This instance is NOW RUNNING */
707		mci->op_state = OP_RUNNING_POLL;
708
709		edac_mc_workq_setup(mci, edac_mc_get_poll_msec());
710	} else {
711		mci->op_state = OP_RUNNING_INTERRUPT;
712	}
713
714	/* Report action taken */
715	edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
716		" DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));
717
718	mutex_unlock(&mem_ctls_mutex);
719	return 0;
720
721fail1:
722	del_mc_from_global_list(mci);
723
724fail0:
725	mutex_unlock(&mem_ctls_mutex);
726	return 1;
727}
728EXPORT_SYMBOL_GPL(edac_mc_add_mc);
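
/*
 * Illustrative sketch of the usual probe-time sequence around
 * edac_mc_add_mc(); my_check() and "my_edac" are hypothetical, and a real
 * driver also fills the csrow/dimm information before registering:
 *
 *	mci->dev = &pdev->dev;
 *	mci->mtype_cap = MEM_FLAG_DDR3;
 *	mci->edac_ctl_cap = EDAC_FLAG_SECDED;
 *	mci->edac_cap = EDAC_FLAG_SECDED;
 *	mci->mod_name = "my_edac";
 *	mci->ctl_name = "my_controller";
 *	mci->edac_check = my_check;	(set only when the device is polled)
 *
 *	if (edac_mc_add_mc(mci)) {
 *		edac_mc_free(mci);
 *		return -ENODEV;
 *	}
 */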
729
730/**
731 * edac_mc_del_mc: Remove sysfs entries for specified mci structure and
732 *                 remove mci structure from global list
733 * @dev: Pointer to 'struct device' representing the mci structure to remove.
734 *
735 * Return pointer to removed mci structure, or NULL if device not found.
736 */
737struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
738{
739	struct mem_ctl_info *mci;
740
741	debugf0("%s()\n", __func__);
742
743	mutex_lock(&mem_ctls_mutex);
744
745	/* find the requested mci struct in the global list */
746	mci = find_mci_by_dev(dev);
747	if (mci == NULL) {
748		mutex_unlock(&mem_ctls_mutex);
749		return NULL;
750	}
751
752	del_mc_from_global_list(mci);
753	mutex_unlock(&mem_ctls_mutex);
754
755	/* flush workq processes */
756	edac_mc_workq_teardown(mci);
757
758	/* marking MCI offline */
759	mci->op_state = OP_OFFLINE;
760
761	/* remove from sysfs */
762	edac_remove_sysfs_mci_device(mci);
763
764	edac_printk(KERN_INFO, EDAC_MC,
765		"Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
766		mci->mod_name, mci->ctl_name, edac_dev_name(mci));
767
768	return mci;
769}
770EXPORT_SYMBOL_GPL(edac_mc_del_mc);
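
/*
 * Illustrative sketch of the matching remove path ('pdev' stands for
 * whatever device was stored in mci->dev at probe time):
 *
 *	mci = edac_mc_del_mc(&pdev->dev);
 *	if (mci)
 *		edac_mc_free(mci);
 */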
771
772static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
773				u32 size)
774{
775	struct page *pg;
776	void *virt_addr;
777	unsigned long flags = 0;
778
779	debugf3("%s()\n", __func__);
780
781	/* ECC error page was not in our memory. Ignore it. */
782	if (!pfn_valid(page))
783		return;
784
785	/* Find the actual page structure then map it and fix */
786	pg = pfn_to_page(page);
787
788	if (PageHighMem(pg))
789		local_irq_save(flags);
790
791	virt_addr = kmap_atomic(pg);
792
793	/* Perform architecture specific atomic scrub operation */
794	atomic_scrub(virt_addr + offset, size);
795
796	/* Unmap and complete */
797	kunmap_atomic(virt_addr);
798
799	if (PageHighMem(pg))
800		local_irq_restore(flags);
801}
802
803/* FIXME - should return -1 */
804int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
805{
806	struct csrow_info *csrows = mci->csrows;
807	int row, i, j, n;
808
809	debugf1("MC%d: %s(): 0x%lx\n", mci->mc_idx, __func__, page);
810	row = -1;
811
812	for (i = 0; i < mci->nr_csrows; i++) {
813		struct csrow_info *csrow = &csrows[i];
814		n = 0;
815		for (j = 0; j < csrow->nr_channels; j++) {
816			struct dimm_info *dimm = csrow->channels[j].dimm;
817			n += dimm->nr_pages;
818		}
819		if (n == 0)
820			continue;
821
822		debugf3("MC%d: %s(): first(0x%lx) page(0x%lx) last(0x%lx) "
823			"mask(0x%lx)\n", mci->mc_idx, __func__,
824			csrow->first_page, page, csrow->last_page,
825			csrow->page_mask);
826
827		if ((page >= csrow->first_page) &&
828		    (page <= csrow->last_page) &&
829		    ((page & csrow->page_mask) ==
830		     (csrow->first_page & csrow->page_mask))) {
831			row = i;
832			break;
833		}
834	}
835
836	if (row == -1)
837		edac_mc_printk(mci, KERN_ERR,
838			"could not look up page error address %lx\n",
839			(unsigned long)page);
840
841	return row;
842}
843EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
844
845const char *edac_layer_name[] = {
846	[EDAC_MC_LAYER_BRANCH] = "branch",
847	[EDAC_MC_LAYER_CHANNEL] = "channel",
848	[EDAC_MC_LAYER_SLOT] = "slot",
849	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
850};
851EXPORT_SYMBOL_GPL(edac_layer_name);
852
853static void edac_inc_ce_error(struct mem_ctl_info *mci,
854				    bool enable_per_layer_report,
855				    const int pos[EDAC_MAX_LAYERS])
856{
857	int i, index = 0;
858
859	mci->ce_mc++;
860
861	if (!enable_per_layer_report) {
862		mci->ce_noinfo_count++;
863		return;
864	}
865
866	for (i = 0; i < mci->n_layers; i++) {
867		if (pos[i] < 0)
868			break;
869		index += pos[i];
870		mci->ce_per_layer[i][index]++;
871
872		if (i < mci->n_layers - 1)
873			index *= mci->layers[i + 1].size;
874	}
875}
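
/*
 * Worked example of the per-layer index arithmetic above (the sizes and
 * positions are made up): with three layers of sizes {2, 4, 8} and
 * pos = {1, 2, 3}, the loop increments
 *
 *	ce_per_layer[0][1]			(index = 1)
 *	ce_per_layer[1][1 * 4 + 2]		(index = 6)
 *	ce_per_layer[2][(1 * 4 + 2) * 8 + 3]	(index = 51)
 *
 * i.e. each layer's counter array is indexed by the row-major flattening
 * of the positions down to and including that layer.  edac_inc_ue_error()
 * below uses the same scheme for the UE counters.
 */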
876
877static void edac_inc_ue_error(struct mem_ctl_info *mci,
878				    bool enable_per_layer_report,
879				    const int pos[EDAC_MAX_LAYERS])
880{
881	int i, index = 0;
882
883	mci->ue_mc++;
884
885	if (!enable_per_layer_report) {
886		mci->ue_noinfo_count++;
887		return;
888	}
889
890	for (i = 0; i < mci->n_layers; i++) {
891		if (pos[i] < 0)
892			break;
893		index += pos[i];
894		mci->ue_per_layer[i][index]++;
895
896		if (i < mci->n_layers - 1)
897			index *= mci->layers[i + 1].size;
898	}
899}
900
901static void edac_ce_error(struct mem_ctl_info *mci,
902			  const int pos[EDAC_MAX_LAYERS],
903			  const char *msg,
904			  const char *location,
905			  const char *label,
906			  const char *detail,
907			  const char *other_detail,
908			  const bool enable_per_layer_report,
909			  const unsigned long page_frame_number,
910			  const unsigned long offset_in_page,
911			  long grain)
912{
913	unsigned long remapped_page;
914
915	if (edac_mc_get_log_ce()) {
916		if (other_detail && *other_detail)
917			edac_mc_printk(mci, KERN_WARNING,
918				       "CE %s on %s (%s %s - %s)\n",
919				       msg, label, location,
920				       detail, other_detail);
921		else
922			edac_mc_printk(mci, KERN_WARNING,
923				       "CE %s on %s (%s %s)\n",
924				       msg, label, location,
925				       detail);
926	}
927	edac_inc_ce_error(mci, enable_per_layer_report, pos);
928
929	if (mci->scrub_mode & SCRUB_SW_SRC) {
930		/*
931		 * Some memory controllers (called MCs below) can remap
932		 * memory so that it is still available at a different
933		 * address when PCI devices map into memory.
934		 * MCs that can't do this lose the memory where PCI
935		 * devices are mapped.  This mapping is MC-dependent
936		 * and so we call back into the MC driver for it to
937		 * map the MC page to a physical (CPU) page which can
938		 * then be mapped to a virtual page - which can then
939		 * be scrubbed.
940		 */
941		remapped_page = mci->ctl_page_to_phys ?
942			mci->ctl_page_to_phys(mci, page_frame_number) :
943			page_frame_number;
944
945		edac_mc_scrub_block(remapped_page,
946					offset_in_page, grain);
947	}
948}
949
950static void edac_ue_error(struct mem_ctl_info *mci,
951			  const int pos[EDAC_MAX_LAYERS],
952			  const char *msg,
953			  const char *location,
954			  const char *label,
955			  const char *detail,
956			  const char *other_detail,
957			  const bool enable_per_layer_report)
958{
959	if (edac_mc_get_log_ue()) {
960		if (other_detail && *other_detail)
961			edac_mc_printk(mci, KERN_WARNING,
962				       "UE %s on %s (%s %s - %s)\n",
963			               msg, label, location, detail,
964				       other_detail);
965		else
966			edac_mc_printk(mci, KERN_WARNING,
967				       "UE %s on %s (%s %s)\n",
968			               msg, label, location, detail);
969	}
970
971	if (edac_mc_get_panic_on_ue()) {
972		if (other_detail && *other_detail)
973			panic("UE %s on %s (%s %s - %s)\n",
974			      msg, label, location, detail, other_detail);
975		else
976			panic("UE %s on %s (%s %s)\n",
977			      msg, label, location, detail);
978	}
979
980	edac_inc_ue_error(mci, enable_per_layer_report, pos);
981}
982
983#define OTHER_LABEL " or "
984
985/**
986 * edac_mc_handle_error - reports a memory event to userspace
987 *
988 * @type:		severity of the error (CE/UE/Fatal)
989 * @mci:		a struct mem_ctl_info pointer
990 * @page_frame_number:	mem page where the error occurred
991 * @offset_in_page:	offset of the error inside the page
992 * @syndrome:		ECC syndrome
993 * @top_layer:		Memory layer[0] position
994 * @mid_layer:		Memory layer[1] position
995 * @low_layer:		Memory layer[2] position
996 * @msg:		Message meaningful to the end users that
997 *			explains the event
998 * @other_detail:	Technical details about the event that
999 *			may help hardware manufacturers and
1000 *			EDAC developers to analyse the event
1001 * @arch_log:		Architecture-specific struct that can
1002 *			be used to add extended information to the
1003 *			tracepoint, like dumping MCE registers.
1004 */
1005void edac_mc_handle_error(const enum hw_event_mc_err_type type,
1006			  struct mem_ctl_info *mci,
1007			  const unsigned long page_frame_number,
1008			  const unsigned long offset_in_page,
1009			  const unsigned long syndrome,
1010			  const int top_layer,
1011			  const int mid_layer,
1012			  const int low_layer,
1013			  const char *msg,
1014			  const char *other_detail,
1015			  const void *arch_log)
1016{
1017	/* FIXME: too much for stack: move it to some pre-allocated area */
1018	char detail[80], location[80];
1019	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
1020	char *p;
1021	int row = -1, chan = -1;
1022	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
1023	int i;
1024	long grain;
1025	bool enable_per_layer_report = false;
1026	u16 error_count;	/* FIXME: make it a parameter */
1027	u8 grain_bits;
1028
1029	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
1030
1031	/*
1032	 * Check if the event report is consistent and if the memory
1033	 * location is known. If it is known, enable_per_layer_report will be
1034	 * true, the DIMM(s) label info will be filled and the per-layer
1035	 * error counters will be incremented.
1036	 */
1037	for (i = 0; i < mci->n_layers; i++) {
1038		if (pos[i] >= (int)mci->layers[i].size) {
1039			if (type == HW_EVENT_ERR_CORRECTED)
1040				p = "CE";
1041			else
1042				p = "UE";
1043
1044			edac_mc_printk(mci, KERN_ERR,
1045				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
1046				       edac_layer_name[mci->layers[i].type],
1047				       pos[i], mci->layers[i].size);
1048			/*
1049			 * Instead of just returning, let's use what's known
1050			 * about the error.  The increment routines and the
1051			 * DIMM filter logic below will do the right thing by
1052			 * pointing at the likely damaged DIMMs.
1053			 */
1054			pos[i] = -1;
1055		}
1056		if (pos[i] >= 0)
1057			enable_per_layer_report = true;
1058	}
1059
1060	/*
1061	 * Get the dimm label/grain that applies to the match criteria.
1062	 * As the error algorithm may not be able to point to just one memory
1063	 * stick, the logic here will get all possible labels that could
1064	 * potentially be affected by the error.
1065	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
1066	 * to have only the MC channel and the MC dimm (also called "branch")
1067	 * but the channel is not known, as the memory is arranged in pairs,
1068	 * where each memory belongs to a separate channel within the same
1069	 * branch.
1070	 */
1071	grain = 0;
1072	p = label;
1073	*p = '\0';
1074	for (i = 0; i < mci->tot_dimms; i++) {
1075		struct dimm_info *dimm = &mci->dimms[i];
1076
1077		if (top_layer >= 0 && top_layer != dimm->location[0])
1078			continue;
1079		if (mid_layer >= 0 && mid_layer != dimm->location[1])
1080			continue;
1081		if (low_layer >= 0 && low_layer != dimm->location[2])
1082			continue;
1083
1084		/* get the max grain, over the error match range */
1085		if (dimm->grain > grain)
1086			grain = dimm->grain;
1087
1088		/*
1089		 * If the error is memory-controller wide, there's no need to
1090		 * seek for the affected DIMMs because the whole
1091		 * channel/memory controller/...  may be affected.
1092		 * Also, don't show errors for empty DIMM slots.
1093		 */
1094		if (enable_per_layer_report && dimm->nr_pages) {
1095			if (p != label) {
1096				strcpy(p, OTHER_LABEL);
1097				p += strlen(OTHER_LABEL);
1098			}
1099			strcpy(p, dimm->label);
1100			p += strlen(p);
1101			*p = '\0';
1102
1103			/*
1104			 * get csrow/channel of the DIMM, in order to allow
1105			 * incrementing the compat API counters
1106			 */
1107			debugf4("%s: %s csrows map: (%d,%d)\n",
1108				__func__,
1109				mci->mem_is_per_rank ? "rank" : "dimm",
1110				dimm->csrow, dimm->cschannel);
1111
1112			if (row == -1)
1113				row = dimm->csrow;
1114			else if (row >= 0 && row != dimm->csrow)
1115				row = -2;
1116
1117			if (chan == -1)
1118				chan = dimm->cschannel;
1119			else if (chan >= 0 && chan != dimm->cschannel)
1120				chan = -2;
1121		}
1122	}
1123
1124	if (!enable_per_layer_report) {
1125		strcpy(label, "any memory");
1126	} else {
1127		debugf4("%s: csrow/channel to increment: (%d,%d)\n",
1128			__func__, row, chan);
1129		if (p == label)
1130			strcpy(label, "unknown memory");
1131		if (type == HW_EVENT_ERR_CORRECTED) {
1132			if (row >= 0) {
1133				mci->csrows[row].ce_count++;
1134				if (chan >= 0)
1135					mci->csrows[row].channels[chan].ce_count++;
1136			}
1137		} else
1138			if (row >= 0)
1139				mci->csrows[row].ue_count++;
1140	}
1141
1142	/* Fill the RAM location data */
1143	p = location;
1144	for (i = 0; i < mci->n_layers; i++) {
1145		if (pos[i] < 0)
1146			continue;
1147
1148		p += sprintf(p, "%s:%d ",
1149			     edac_layer_name[mci->layers[i].type],
1150			     pos[i]);
1151	}
1152	if (p > location)
1153		*(p - 1) = '\0';
1154
1155	/* Report the error via the trace interface */
1156
1157	error_count = 1;	/* FIXME: allow changing it */
1158	grain_bits = fls_long(grain) + 1;
1159	trace_mc_event(type, msg, label, error_count,
1160		       mci->mc_idx, top_layer, mid_layer, low_layer,
1161		       PAGES_TO_MiB(page_frame_number) | offset_in_page,
1162		       grain_bits, syndrome, other_detail);
1163
1164	/* Memory type dependent details about the error */
1165	if (type == HW_EVENT_ERR_CORRECTED) {
1166		snprintf(detail, sizeof(detail),
1167			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
1168			page_frame_number, offset_in_page,
1169			grain, syndrome);
1170		edac_ce_error(mci, pos, msg, location, label, detail,
1171			      other_detail, enable_per_layer_report,
1172			      page_frame_number, offset_in_page, grain);
1173	} else {
1174		snprintf(detail, sizeof(detail),
1175			"page:0x%lx offset:0x%lx grain:%ld",
1176			page_frame_number, offset_in_page, grain);
1177
1178		edac_ue_error(mci, pos, msg, location, label, detail,
1179			      other_detail, enable_per_layer_report);
1180	}
1181}
1182EXPORT_SYMBOL_GPL(edac_mc_handle_error);
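
/*
 * Illustrative sketch of a driver reporting a corrected error through the
 * API above (all values are hypothetical; a driver that does not know a
 * given layer position passes -1 for it):
 *
 *	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
 *			     pfn, offset, syndrome,
 *			     chan, rank, -1,
 *			     "read error", "", NULL);
 */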
1183