edac_mc.c revision 93e4fe64ece4eccf0ff4ac69bceb389290b8ab7c
/*
 * edac_mc kernel module
 * (C) 2005, 2006 Linux Networx (http://lnxi.com)
 * This file may be distributed under the terms of the
 * GNU General Public License.
 *
 * Written by Thayne Harbaugh
 * Based on work by Dan Hollis <goemon at anime dot net> and others.
 * http://www.anime.net/~goemon/linux-ecc/
 *
 * Modified by Dave Peterson and Doug Thompson
 *
 */

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/edac.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <asm/edac.h>
#include "edac_core.h"
#include "edac_module.h"

/* lock to memory controller's control array */
static DEFINE_MUTEX(mem_ctls_mutex);
static LIST_HEAD(mc_devices);

#ifdef CONFIG_EDAC_DEBUG

/* Dump one channel (rank) and the DIMM currently attached to it. */
static void edac_mc_dump_channel(struct rank_info *chan)
{
	debugf4("\tchannel = %p\n", chan);
	debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
	debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
	debugf4("\tdimm->ce_count = %d\n", chan->dimm->ce_count);
	debugf4("\tdimm->label = '%s'\n", chan->dimm->label);
	debugf4("\tdimm->nr_pages = 0x%x\n", chan->dimm->nr_pages);
}

/* Dump the geometry/address fields of one chip-select row. */
static void edac_mc_dump_csrow(struct csrow_info *csrow)
{
	debugf4("\tcsrow = %p\n", csrow);
	debugf4("\tcsrow->csrow_idx = %d\n", csrow->csrow_idx);
	debugf4("\tcsrow->first_page = 0x%lx\n", csrow->first_page);
	debugf4("\tcsrow->last_page = 0x%lx\n", csrow->last_page);
	debugf4("\tcsrow->page_mask = 0x%lx\n", csrow->page_mask);
	debugf4("\tcsrow->nr_channels = %d\n", csrow->nr_channels);
	debugf4("\tcsrow->channels = %p\n", csrow->channels);
	debugf4("\tcsrow->mci = %p\n\n", csrow->mci);
}

/* Dump the top-level fields of a memory controller instance. */
static void edac_mc_dump_mci(struct mem_ctl_info *mci)
{
	debugf3("\tmci = %p\n", mci);
	debugf3("\tmci->mtype_cap = %lx\n", mci->mtype_cap);
	debugf3("\tmci->edac_ctl_cap = %lx\n", mci->edac_ctl_cap);
	debugf3("\tmci->edac_cap = %lx\n", mci->edac_cap);
	debugf4("\tmci->edac_check = %p\n", mci->edac_check);
	debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
		mci->nr_csrows, mci->csrows);
	debugf3("\tdev = %p\n", mci->dev);
	debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
	debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
}

#endif				/* CONFIG_EDAC_DEBUG */

/*
 * Human-readable names for memory technologies.
 * Keep these in sync with the enum mem_type (order is the index).
 */
const char *edac_mem_types[] = {
	"Empty csrow",
	"Reserved csrow type",
	"Unknown csrow type",
	"Fast page mode RAM",
	"Extended data out RAM",
	"Burst Extended data out RAM",
	"Single data rate SDRAM",
	"Registered single data rate SDRAM",
	"Double data rate SDRAM",
	"Registered Double data rate SDRAM",
	"Rambus DRAM",
	"Unbuffered DDR2 RAM",
	"Fully buffered DDR2",
	"Registered DDR2 RAM",
	"Rambus XDR",
	"Unbuffered DDR3 RAM",
	"Registered DDR3 RAM",
};
EXPORT_SYMBOL_GPL(edac_mem_types);

/**
 * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
 * @p: pointer to a pointer with the memory offset to be used. At
 *	return, this will be incremented to point to the next offset
 * @size: Size of the data structure to be reserved
 * @n_elems: Number of elements that should be reserved
 *
 * If 'size' is a constant, the compiler will optimize this whole function
 * down to either a no-op or the addition of a constant to the value of '*p'.
 *
 * The 'p' pointer is absolutely needed to keep the proper advancing
 * further in memory to the proper offsets when allocating the struct along
 * with its embedded structs, as edac_device_alloc_ctl_info() does it
 * above, for example.
 *
 * At return, the pointer 'p' will be incremented to be used on a next call
 * to this function.
 */
void *edac_align_ptr(void **p, unsigned size, int n_elems)
{
	unsigned align, r;
	void *ptr = *p;

	/*
	 * NOTE(review): arithmetic on a void pointer is a GCC extension
	 * (treated as char-sized) — fine for kernel code, non-portable
	 * elsewhere.
	 */
	*p += size * n_elems;

	/*
	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
	 * 'size'.  Adjust 'p' so that its alignment is at least as
	 * stringent as what the compiler would provide for X and return
	 * the aligned result.
	 * Here we assume that the alignment of a "long long" is the most
	 * stringent alignment that the compiler will ever provide by default.
	 * As far as I know, this is a reasonable assumption.
	 */
	if (size > sizeof(long))
		align = sizeof(long long);
	else if (size > sizeof(int))
		align = sizeof(long);
	else if (size > sizeof(short))
		align = sizeof(int);
	else if (size > sizeof(char))
		align = sizeof(short);
	else
		return (char *)ptr;

	r = size % align;

	/* Already aligned: hand back the original offset untouched. */
	if (r == 0)
		return (char *)ptr;

	/* Pad both the running offset and the returned pointer. */
	*p += align - r;

	return (void *)(((unsigned long)ptr) + align - r);
}

/**
 * edac_mc_alloc: Allocate a struct mem_ctl_info structure
 * @sz_pvt: size of private storage needed
 * @nr_csrows: Number of CSROWS needed for this MC
 * @nr_chans: Number of channels for the MC
 *
 * Everything is kmalloc'ed as one big chunk - more efficient.
 * Only can be used if all structures have the same lifetime - otherwise
 * you have to allocate and initialize your own structures.
 *
 * Use edac_mc_free() to free mc structures allocated by this function.
 *
 * Returns:
 *	NULL	allocation failed
 *	struct mem_ctl_info pointer
 */
struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
				unsigned nr_chans, int edac_index)
{
	void *ptr = NULL;
	struct mem_ctl_info *mci;
	struct csrow_info *csi, *csrow;
	struct rank_info *chi, *chp, *chan;
	struct dimm_info *dimm;
	void *pvt;
	unsigned size;
	int row, chn;
	int err;

	/* Figure out the offsets of the various items from the start of an mc
	 * structure.  We want the alignment of each item to be at least as
	 * stringent as what the compiler would provide if we could simply
	 * hardcode everything into a single struct.
	 *
	 * Pass 1: each edac_align_ptr() call returns the (relative) offset
	 * of that sub-array within the future single allocation; 'ptr'
	 * accumulates the running size.
	 */
	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
	csi = edac_align_ptr(&ptr, sizeof(*csi), nr_csrows);
	chi = edac_align_ptr(&ptr, sizeof(*chi), nr_csrows * nr_chans);
	dimm = edac_align_ptr(&ptr, sizeof(*dimm), nr_csrows * nr_chans);
	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
	/* total size = offset of the last (private) area + its size */
	size = ((unsigned long)pvt) + sz_pvt;

	mci = kzalloc(size, GFP_KERNEL);
	if (mci == NULL)
		return NULL;

	/* Adjust pointers so they point within the memory we just allocated
	 * rather than an imaginary chunk of memory located at address 0.
	 */
	csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
	chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
	dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;

	/* setup index and various internal pointers */
	mci->mc_idx = edac_index;
	mci->csrows = csi;
	mci->dimms = dimm;
	mci->pvt_info = pvt;
	mci->nr_csrows = nr_csrows;

	/*
	 * For now, assumes that a per-csrow arrangement for dimms.
	 * This will be latter changed.
	 */
	dimm = mci->dimms;

	/* Wire up csrow -> channel -> dimm back-pointers. */
	for (row = 0; row < nr_csrows; row++) {
		csrow = &csi[row];
		csrow->csrow_idx = row;
		csrow->mci = mci;
		csrow->nr_channels = nr_chans;
		chp = &chi[row * nr_chans];
		csrow->channels = chp;

		for (chn = 0; chn < nr_chans; chn++) {
			chan = &chp[chn];
			chan->chan_idx = chn;
			chan->csrow = csrow;

			/* one dimm per (row, channel) in this layout */
			mci->csrows[row].channels[chn].dimm = dimm;
			dimm->csrow = row;
			dimm->csrow_channel = chn;
			dimm++;
			mci->nr_dimms++;
		}
	}

	mci->op_state = OP_ALLOC;
	INIT_LIST_HEAD(&mci->grp_kobj_list);

	/*
	 * Initialize the 'root' kobj for the edac_mc controller
	 */
	err = edac_mc_register_sysfs_main_kobj(mci);
	if (err) {
		kfree(mci);
		return NULL;
	}

	/* at this point, the root kobj is valid, and in order to
	 * 'free' the object, then the function:
	 *	edac_mc_unregister_sysfs_main_kobj() must be called
	 * which will perform kobj unregistration and the actual free
	 * will occur during the kobject callback operation
	 */
	return mci;
}
EXPORT_SYMBOL_GPL(edac_mc_alloc);

/**
 * edac_mc_free
 *	'Free' a previously allocated 'mci' structure
 * @mci: pointer to a struct mem_ctl_info structure
 *
 * Note: the actual kfree() happens here only as the final step after the
 * sysfs kobject for the instance has been unregistered.
 */
void edac_mc_free(struct mem_ctl_info *mci)
{
	debugf1("%s()\n", __func__);

	edac_mc_unregister_sysfs_main_kobj(mci);

	/* free the mci instance memory here */
	kfree(mci);
}
EXPORT_SYMBOL_GPL(edac_mc_free);


/**
 * find_mci_by_dev
 *
 *	scan list of controllers looking for the one that manages
 *	the 'dev' device
 * @dev: pointer to a struct device related with the MCI
 *
 * NOTE(review): walks the global mc_devices list without taking
 * mem_ctls_mutex itself — callers appear responsible for locking; verify
 * against each call site.
 */
struct mem_ctl_info *find_mci_by_dev(struct device *dev)
{
	struct mem_ctl_info *mci;
	struct list_head *item;

	debugf3("%s()\n", __func__);

	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		if (mci->dev == dev)
			return mci;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(find_mci_by_dev);

/*
 * handler for EDAC to check if NMI type handler has asserted interrupt
 *
 * In POLL mode this always reports "check needed" (returns 1); otherwise
 * it returns and clears the pending edac_err_assert flag.
 */
static int edac_mc_assert_error_check_and_clear(void)
{
	int old_state;

	if (edac_op_state == EDAC_OPSTATE_POLL)
		return 1;

	old_state = edac_err_assert;
	edac_err_assert = 0;

	return old_state;
}

/*
 * edac_mc_workq_function
 *	performs the operation scheduled by a workq request
 *
 * Runs the instance's edac_check() under mem_ctls_mutex, then re-arms
 * itself with the current poll period.
 */
static void edac_mc_workq_function(struct work_struct *work_req)
{
	struct delayed_work *d_work = to_delayed_work(work_req);
	struct mem_ctl_info *mci = to_edac_mem_ctl_work(d_work);

	mutex_lock(&mem_ctls_mutex);

	/* if this control struct has moved to offline state, we are done */
	if (mci->op_state == OP_OFFLINE) {
		mutex_unlock(&mem_ctls_mutex);
		return;
	}

	/* Only poll controllers that are running polled and have a check */
	if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
		mci->edac_check(mci);

	mutex_unlock(&mem_ctls_mutex);

	/* Reschedule */
	queue_delayed_work(edac_workqueue, &mci->work,
			msecs_to_jiffies(edac_mc_get_poll_msec()));
}

/*
 * edac_mc_workq_setup
 *	initialize a workq item for this mci
 *	passing in the new delay period in msec
 *
 *	locking model:
 *
 *		called with the mem_ctls_mutex held
 */
static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
{
	debugf0("%s()\n", __func__);

	/* if this instance is not in the POLL state, then simply return */
	if (mci->op_state != OP_RUNNING_POLL)
		return;

	INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
	queue_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
}

/*
 * edac_mc_workq_teardown
 *	stop the workq processing on this mci
 *
 *	locking model:
 *
 *		called WITHOUT lock
 held
 */
static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
{
	int status;

	if (mci->op_state != OP_RUNNING_POLL)
		return;

	status = cancel_delayed_work(&mci->work);
	if (status == 0) {
		debugf0("%s() not canceled, flush the queue\n",
			__func__);

		/*
		 * cancel_delayed_work() does not wait for an already-running
		 * callback; flush the whole workqueue so the work item is
		 * guaranteed finished before we return.
		 */
		flush_workqueue(edac_workqueue);
	}
}

/*
 * edac_mc_reset_delay_period(unsigned long value)
 *
 *	user space has updated our poll period value, need to
 *	reset our workq delays
 *
 * Two passes, each under mem_ctls_mutex: first stop every polled timer,
 * then re-arm each instance with the new period via edac_mc_workq_setup()
 * (which itself expects the mutex held).  The mutex is deliberately
 * dropped between the passes.
 */
void edac_mc_reset_delay_period(int value)
{
	struct mem_ctl_info *mci;
	struct list_head *item;

	mutex_lock(&mem_ctls_mutex);

	/* scan the list and turn off all workq timers, doing so under lock
	 */
	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		if (mci->op_state == OP_RUNNING_POLL)
			cancel_delayed_work(&mci->work);
	}

	mutex_unlock(&mem_ctls_mutex);


	/* re-walk the list, and reset the poll delay */
	mutex_lock(&mem_ctls_mutex);

	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		edac_mc_workq_setup(mci, (unsigned long) value);
	}

	mutex_unlock(&mem_ctls_mutex);
}



/* Return 0 on success, 1 on failure.
 * Before calling this function, caller must
 * assign a unique value to mci->mc_idx.
442 * 443 * locking model: 444 * 445 * called with the mem_ctls_mutex lock held 446 */ 447static int add_mc_to_global_list(struct mem_ctl_info *mci) 448{ 449 struct list_head *item, *insert_before; 450 struct mem_ctl_info *p; 451 452 insert_before = &mc_devices; 453 454 p = find_mci_by_dev(mci->dev); 455 if (unlikely(p != NULL)) 456 goto fail0; 457 458 list_for_each(item, &mc_devices) { 459 p = list_entry(item, struct mem_ctl_info, link); 460 461 if (p->mc_idx >= mci->mc_idx) { 462 if (unlikely(p->mc_idx == mci->mc_idx)) 463 goto fail1; 464 465 insert_before = item; 466 break; 467 } 468 } 469 470 list_add_tail_rcu(&mci->link, insert_before); 471 atomic_inc(&edac_handlers); 472 return 0; 473 474fail0: 475 edac_printk(KERN_WARNING, EDAC_MC, 476 "%s (%s) %s %s already assigned %d\n", dev_name(p->dev), 477 edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx); 478 return 1; 479 480fail1: 481 edac_printk(KERN_WARNING, EDAC_MC, 482 "bug in low-level driver: attempt to assign\n" 483 " duplicate mc_idx %d in %s()\n", p->mc_idx, __func__); 484 return 1; 485} 486 487static void del_mc_from_global_list(struct mem_ctl_info *mci) 488{ 489 atomic_dec(&edac_handlers); 490 list_del_rcu(&mci->link); 491 492 /* these are for safe removal of devices from global list while 493 * NMI handlers may be traversing list 494 */ 495 synchronize_rcu(); 496 INIT_LIST_HEAD(&mci->link); 497} 498 499/** 500 * edac_mc_find: Search for a mem_ctl_info structure whose index is 'idx'. 501 * 502 * If found, return a pointer to the structure. 503 * Else return NULL. 504 * 505 * Caller must hold mem_ctls_mutex. 
 */
struct mem_ctl_info *edac_mc_find(int idx)
{
	struct list_head *item;
	struct mem_ctl_info *mci;

	/* list is sorted by mc_idx, so we can stop at the first larger one */
	list_for_each(item, &mc_devices) {
		mci = list_entry(item, struct mem_ctl_info, link);

		if (mci->mc_idx >= idx) {
			if (mci->mc_idx == idx)
				return mci;

			break;
		}
	}

	return NULL;
}
EXPORT_SYMBOL(edac_mc_find);

/**
 * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
 *	create sysfs entries associated with mci structure
 * @mci: pointer to the mci structure to be added to the list
 *
 * On success the instance is placed in the OP_RUNNING_POLL state (and the
 * poll workqueue armed) when it has an edac_check routine, otherwise in
 * OP_RUNNING_INTERRUPT.
 *
 * Return:
 *	0	Success
 *	!0	Failure
 */

/* FIXME - should a warning be printed if no error detection? correction? */
int edac_mc_add_mc(struct mem_ctl_info *mci)
{
	debugf0("%s()\n", __func__);

#ifdef CONFIG_EDAC_DEBUG
	if (edac_debug_level >= 3)
		edac_mc_dump_mci(mci);

	if (edac_debug_level >= 4) {
		int i;

		for (i = 0; i < mci->nr_csrows; i++) {
			int j;

			edac_mc_dump_csrow(&mci->csrows[i]);
			for (j = 0; j < mci->csrows[i].nr_channels; j++)
				edac_mc_dump_channel(&mci->csrows[i].
						channels[j]);
		}
	}
#endif
	mutex_lock(&mem_ctls_mutex);

	if (add_mc_to_global_list(mci))
		goto fail0;

	/* set load time so that error rate can be tracked */
	mci->start_time = jiffies;

	if (edac_create_sysfs_mci_device(mci)) {
		edac_mc_printk(mci, KERN_WARNING,
			"failed to create sysfs device\n");
		goto fail1;
	}

	/* If there IS a check routine, then we are running POLLED */
	if (mci->edac_check != NULL) {
		/* This instance is NOW RUNNING */
		mci->op_state = OP_RUNNING_POLL;

		edac_mc_workq_setup(mci, edac_mc_get_poll_msec());
	} else {
		mci->op_state = OP_RUNNING_INTERRUPT;
	}

	/* Report action taken */
	edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
		" DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));

	mutex_unlock(&mem_ctls_mutex);
	return 0;

fail1:
	/* undo the global-list insertion before reporting failure */
	del_mc_from_global_list(mci);

fail0:
	mutex_unlock(&mem_ctls_mutex);
	return 1;
}
EXPORT_SYMBOL_GPL(edac_mc_add_mc);

/**
 * edac_mc_del_mc: Remove sysfs entries for specified mci structure and
 *	remove mci structure from global list
 * @pdev: Pointer to 'struct device' representing mci structure to remove.
 *
 * Return pointer to removed mci structure, or NULL if device not found.
 */
struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
{
	struct mem_ctl_info *mci;

	debugf0("%s()\n", __func__);

	mutex_lock(&mem_ctls_mutex);

	/* find the requested mci struct in the global list */
	mci = find_mci_by_dev(dev);
	if (mci == NULL) {
		mutex_unlock(&mem_ctls_mutex);
		return NULL;
	}

	del_mc_from_global_list(mci);
	mutex_unlock(&mem_ctls_mutex);

	/* flush workq processes */
	edac_mc_workq_teardown(mci);

	/* marking MCI offline */
	mci->op_state = OP_OFFLINE;

	/* remove from sysfs */
	edac_remove_sysfs_mci_device(mci);

	edac_printk(KERN_INFO, EDAC_MC,
		"Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
		mci->mod_name, mci->ctl_name, edac_dev_name(mci));

	return mci;
}
EXPORT_SYMBOL_GPL(edac_mc_del_mc);

/*
 * Scrub (rewrite in place) 'size' bytes at the given page/offset, using
 * the architecture's atomic read-modify-write primitive, so that a
 * correctable error is cleared from memory.  Handles highmem pages by
 * disabling interrupts around the atomic kmap.
 */
static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
				u32 size)
{
	struct page *pg;
	void *virt_addr;
	unsigned long flags = 0;

	debugf3("%s()\n", __func__);

	/* ECC error page was not in our memory. Ignore it. */
	if (!pfn_valid(page))
		return;

	/* Find the actual page structure then map it and fix */
	pg = pfn_to_page(page);

	if (PageHighMem(pg))
		local_irq_save(flags);

	virt_addr = kmap_atomic(pg);

	/* Perform architecture specific atomic scrub operation */
	atomic_scrub(virt_addr + offset, size);

	/* Unmap and complete */
	kunmap_atomic(virt_addr);

	if (PageHighMem(pg))
		local_irq_restore(flags);
}

/*
 * Map a physical page number to the csrow that contains it.  Rows whose
 * dimms report zero pages are skipped.  Returns the row index, or -1 (with
 * a log message) when no row matches.
 */
/* FIXME - should return -1 */
int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
{
	struct csrow_info *csrows = mci->csrows;
	int row, i, j, n;

	debugf1("MC%d: %s(): 0x%lx\n", mci->mc_idx, __func__, page);
	row = -1;

	for (i = 0; i < mci->nr_csrows; i++) {
		struct csrow_info *csrow = &csrows[i];
		n = 0;
		/* total pages across all channels of this row */
		for (j = 0; j < csrow->nr_channels; j++) {
			struct dimm_info *dimm = csrow->channels[j].dimm;
			n += dimm->nr_pages;
		}
		if (n == 0)
			continue;

		debugf3("MC%d: %s(): first(0x%lx) page(0x%lx) last(0x%lx) "
			"mask(0x%lx)\n", mci->mc_idx, __func__,
			csrow->first_page, page, csrow->last_page,
			csrow->page_mask);

		if ((page >= csrow->first_page) &&
		    (page <= csrow->last_page) &&
		    ((page & csrow->page_mask) ==
		     (csrow->first_page & csrow->page_mask))) {
			row = i;
			break;
		}
	}

	if (row == -1)
		edac_mc_printk(mci, KERN_ERR,
			"could not look up page error address %lx\n",
			(unsigned long)page);

	return row;
}
EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);

/*
 * Record (and optionally log) a correctable error at (row, channel),
 * bumping the mci/csrow/channel/dimm CE counters.  Out-of-range row or
 * channel is accounted as a no-info CE instead.  When SCRUB_SW_SRC is
 * enabled, the offending page is software-scrubbed.
 */
/* FIXME - setable log (warning/emerg) levels */
/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */
void edac_mc_handle_ce(struct mem_ctl_info *mci,
		unsigned long page_frame_number,
		unsigned long offset_in_page, unsigned long syndrome,
		int row, int channel, const char *msg)
{
	unsigned long remapped_page;
	char *label = NULL;
	u32 grain;

	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);

	/* FIXME - maybe make panic on INTERNAL ERROR an option */
	if (row >= mci->nr_csrows || row < 0) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: row out of range "
			"(%d >= %d)\n", row, mci->nr_csrows);
		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
		return;
	}

	if (channel >= mci->csrows[row].nr_channels || channel < 0) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: channel out of range "
			"(%d >= %d)\n", channel,
			mci->csrows[row].nr_channels);
		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
		return;
	}

	label = mci->csrows[row].channels[channel].dimm->label;
	grain = mci->csrows[row].channels[channel].dimm->grain;

	if (edac_mc_get_log_ce())
		/* FIXME - put in DIMM location */
		edac_mc_printk(mci, KERN_WARNING,
			"CE page 0x%lx, offset 0x%lx, grain %d, syndrome "
			"0x%lx, row %d, channel %d, label \"%s\": %s\n",
			page_frame_number, offset_in_page,
			grain, syndrome, row, channel,
			label, msg);

	mci->ce_count++;
	mci->csrows[row].ce_count++;
	mci->csrows[row].channels[channel].dimm->ce_count++;
	mci->csrows[row].channels[channel].ce_count++;

	if (mci->scrub_mode & SCRUB_SW_SRC) {
		/*
		 * Some MC's can remap memory so that it is still available
		 * at a different address when PCI devices map into memory.
		 * MC's that can't do this lose the memory where PCI devices
		 * are mapped.  This mapping is MC dependent and so we call
		 * back into the MC driver for it to map the MC page to
		 * a physical (CPU) page which can then be mapped to a virtual
		 * page - which can then be scrubbed.
		 */
		remapped_page = mci->ctl_page_to_phys ?
			mci->ctl_page_to_phys(mci, page_frame_number) :
			page_frame_number;

		edac_mc_scrub_block(remapped_page, offset_in_page, grain);
	}
}
EXPORT_SYMBOL_GPL(edac_mc_handle_ce);

/* Account a correctable error whose location could not be determined. */
void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
{
	if (edac_mc_get_log_ce())
		edac_mc_printk(mci, KERN_WARNING,
			"CE - no information available: %s\n", msg);

	mci->ce_noinfo_count++;
	mci->ce_count++;
}
EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info);

/*
 * Record (and optionally log or panic on) an uncorrectable error on 'row'.
 * The log line carries a ":"-joined list of the labels of every dimm on
 * the row, built into a stack buffer of 4 * EDAC_MC_LABEL_LEN bytes.
 */
void edac_mc_handle_ue(struct mem_ctl_info *mci,
		unsigned long page_frame_number,
		unsigned long offset_in_page, int row, const char *msg)
{
	int len = EDAC_MC_LABEL_LEN * 4;
	char labels[len + 1];
	char *pos = labels;
	int chan;
	int chars;
	char *label = NULL;
	u32 grain;

	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);

	/* FIXME - maybe make panic on INTERNAL ERROR an option */
	if (row >= mci->nr_csrows || row < 0) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: row out of range "
			"(%d >= %d)\n", row, mci->nr_csrows);
		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
		return;
	}

	grain = mci->csrows[row].channels[0].dimm->grain;
	label = mci->csrows[row].channels[0].dimm->label;
	/*
	 * NOTE(review): snprintf() returns the would-be length, so on
	 * truncation 'len' can go negative; the (len > 0) loop guard below
	 * relies on that.  Labels are fixed-size arrays, so in practice the
	 * buffer is large enough — TODO confirm against EDAC_MC_LABEL_LEN.
	 */
	chars = snprintf(pos, len + 1, "%s", label);
	len -= chars;
	pos += chars;

	for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0);
	     chan++) {
		label = mci->csrows[row].channels[chan].dimm->label;
		chars = snprintf(pos, len + 1, ":%s", label);
		len -= chars;
		pos += chars;
	}

	if (edac_mc_get_log_ue())
		edac_mc_printk(mci, KERN_EMERG,
			"UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
			"labels \"%s\": %s\n", page_frame_number,
			offset_in_page, grain, row, labels, msg);

	if (edac_mc_get_panic_on_ue())
		panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, "
			"row %d, labels \"%s\": %s\n", mci->mc_idx,
			page_frame_number, offset_in_page,
			grain, row, labels, msg);

	mci->ue_count++;
	mci->csrows[row].ue_count++;
}
EXPORT_SYMBOL_GPL(edac_mc_handle_ue);

/* Account an uncorrectable error whose location could not be determined. */
void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
{
	if (edac_mc_get_panic_on_ue())
		panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);

	if (edac_mc_get_log_ue())
		edac_mc_printk(mci, KERN_WARNING,
			"UE - no information available: %s\n", msg);
	mci->ue_noinfo_count++;
	mci->ue_count++;
}
EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info);

/*************************************************************
 * On Fully Buffered DIMM modules, this help function is
 * called to process UE events
 */
void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
			unsigned int csrow,
			unsigned int channela,
			unsigned int channelb, char *msg)
{
	int len = EDAC_MC_LABEL_LEN * 4;
	char labels[len + 1];
	char *pos = labels;
	int chars;
	char *label;

	if (csrow >= mci->nr_csrows) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: row out of range (%d >= %d)\n",
			csrow, mci->nr_csrows);
		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
		return;
	}

	if (channela >= mci->csrows[csrow].nr_channels) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: channel-a out of range "
			"(%d >= %d)\n",
			channela, mci->csrows[csrow].nr_channels);
		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
		return;
	}

	if (channelb >= mci->csrows[csrow].nr_channels) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: channel-b out of range "
			"(%d >= %d)\n",
			channelb, mci->csrows[csrow].nr_channels);
		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
		return;
	}

	mci->ue_count++;
	mci->csrows[csrow].ue_count++;

	/* Generate the DIMM labels from the specified channels */
	label = mci->csrows[csrow].channels[channela].dimm->label;
	chars = snprintf(pos, len + 1, "%s", label);
	len -= chars;
	pos += chars;

	/* 'chars' intentionally unused after this final append */
	chars = snprintf(pos, len + 1, "-%s",
		mci->csrows[csrow].channels[channelb].dimm->label);

	if (edac_mc_get_log_ue())
		edac_mc_printk(mci, KERN_EMERG,
			"UE row %d, channel-a= %d channel-b= %d "
			"labels \"%s\": %s\n", csrow, channela, channelb,
			labels, msg);

	if (edac_mc_get_panic_on_ue())
		panic("UE row %d, channel-a= %d channel-b= %d "
				"labels \"%s\": %s\n", csrow, channela,
				channelb, labels, msg);
}
EXPORT_SYMBOL(edac_mc_handle_fbd_ue);

/*************************************************************
 * On Fully Buffered DIMM modules, this help function is
 * called to process CE events
 */
void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
			unsigned int csrow, unsigned int channel, char *msg)
{
	char *label = NULL;

	/* Ensure boundary values */
	if (csrow >= mci->nr_csrows) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: row out of range (%d >= %d)\n",
			csrow, mci->nr_csrows);
		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
		return;
	}
	if (channel >= mci->csrows[csrow].nr_channels) {
		/* something is wrong */
		edac_mc_printk(mci, KERN_ERR,
			"INTERNAL ERROR: channel out of range (%d >= %d)\n",
			channel, mci->csrows[csrow].nr_channels);
		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
		return;
	}

	label = mci->csrows[csrow].channels[channel].dimm->label;

	if (edac_mc_get_log_ce())
		/* FIXME - put in DIMM location */
		edac_mc_printk(mci, KERN_WARNING,
			"CE row %d, channel %d, label \"%s\": %s\n",
			csrow, channel, label, msg);

	mci->ce_count++;
	mci->csrows[csrow].ce_count++;
	mci->csrows[csrow].channels[channel].dimm->ce_count++;
	mci->csrows[csrow].channels[channel].ce_count++;
}
EXPORT_SYMBOL(edac_mc_handle_fbd_ce);