ghes.c revision 32c361f574f85fa47600d84900598e2efc99082e
1/* 2 * APEI Generic Hardware Error Source support 3 * 4 * Generic Hardware Error Source provides a way to report platform 5 * hardware errors (such as that from chipset). It works in so called 6 * "Firmware First" mode, that is, hardware errors are reported to 7 * firmware firstly, then reported to Linux by firmware. This way, 8 * some non-standard hardware error registers or non-standard hardware 9 * link can be checked by firmware to produce more hardware error 10 * information for Linux. 11 * 12 * For more information about Generic Hardware Error Source, please 13 * refer to ACPI Specification version 4.0, section 17.3.2.6 14 * 15 * Now, only SCI notification type and memory errors are 16 * supported. More notification type and hardware error type will be 17 * added later. 18 * 19 * Copyright 2010 Intel Corp. 20 * Author: Huang Ying <ying.huang@intel.com> 21 * 22 * This program is free software; you can redistribute it and/or 23 * modify it under the terms of the GNU General Public License version 24 * 2 as published by the Free Software Foundation; 25 * 26 * This program is distributed in the hope that it will be useful, 27 * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 * GNU General Public License for more details. 30 * 31 * You should have received a copy of the GNU General Public License 32 * along with this program; if not, write to the Free Software 33 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 34 */ 35 36#include <linux/kernel.h> 37#include <linux/module.h> 38#include <linux/init.h> 39#include <linux/acpi.h> 40#include <linux/io.h> 41#include <linux/interrupt.h> 42#include <linux/cper.h> 43#include <linux/kdebug.h> 44#include <linux/platform_device.h> 45#include <linux/mutex.h> 46#include <linux/ratelimit.h> 47#include <acpi/apei.h> 48#include <acpi/atomicio.h> 49#include <acpi/hed.h> 50#include <asm/mce.h> 51 52#include "apei-internal.h" 53 54#define GHES_PFX "GHES: " 55 56#define GHES_ESTATUS_MAX_SIZE 65536 57 58/* 59 * One struct ghes is created for each generic hardware error 60 * source. 61 * 62 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI 63 * handler. Handler for one generic hardware error source is only 64 * triggered after the previous one is done. So handler can uses 65 * struct ghes without locking. 66 * 67 * estatus: memory buffer for error status block, allocated during 68 * HEST parsing. 69 */ 70#define GHES_TO_CLEAR 0x0001 71 72struct ghes { 73 struct acpi_hest_generic *generic; 74 struct acpi_hest_generic_status *estatus; 75 struct list_head list; 76 u64 buffer_paddr; 77 unsigned long flags; 78}; 79 80/* 81 * Error source lists, one list for each notification method. The 82 * members in lists are struct ghes. 83 * 84 * The list members are only added in HEST parsing and deleted during 85 * module_exit, that is, single-threaded. So no lock is needed for 86 * that. 87 * 88 * But the mutual exclusion is needed between members adding/deleting 89 * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is 90 * used for that. 91 */ 92static LIST_HEAD(ghes_sci); 93static DEFINE_MUTEX(ghes_list_mutex); 94 95static struct ghes *ghes_new(struct acpi_hest_generic *generic) 96{ 97 struct ghes *ghes; 98 unsigned int error_block_length; 99 int rc; 100 101 ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); 102 if (!ghes) 103 return ERR_PTR(-ENOMEM); 104 ghes->generic = generic; 105 INIT_LIST_HEAD(&ghes->list); 106 rc = acpi_pre_map_gar(&generic->error_status_address); 107 if (rc) 108 goto err_free; 109 error_block_length = generic->error_block_length; 110 if (error_block_length > GHES_ESTATUS_MAX_SIZE) { 111 pr_warning(FW_WARN GHES_PFX 112 "Error status block length is too long: %u for " 113 "generic hardware error source: %d.\n", 114 error_block_length, generic->header.source_id); 115 error_block_length = GHES_ESTATUS_MAX_SIZE; 116 } 117 ghes->estatus = kmalloc(error_block_length, GFP_KERNEL); 118 if (!ghes->estatus) { 119 rc = -ENOMEM; 120 goto err_unmap; 121 } 122 123 return ghes; 124 125err_unmap: 126 acpi_post_unmap_gar(&generic->error_status_address); 127err_free: 128 kfree(ghes); 129 return ERR_PTR(rc); 130} 131 132static void ghes_fini(struct ghes *ghes) 133{ 134 kfree(ghes->estatus); 135 acpi_post_unmap_gar(&ghes->generic->error_status_address); 136} 137 138enum { 139 GHES_SEV_NO = 0x0, 140 GHES_SEV_CORRECTED = 0x1, 141 GHES_SEV_RECOVERABLE = 0x2, 142 GHES_SEV_PANIC = 0x3, 143}; 144 145static inline int ghes_severity(int severity) 146{ 147 switch (severity) { 148 case CPER_SEV_INFORMATIONAL: 149 return GHES_SEV_NO; 150 case CPER_SEV_CORRECTED: 151 return GHES_SEV_CORRECTED; 152 case CPER_SEV_RECOVERABLE: 153 return GHES_SEV_RECOVERABLE; 154 case CPER_SEV_FATAL: 155 return GHES_SEV_PANIC; 156 default: 157 /* Unkown, go panic */ 158 return GHES_SEV_PANIC; 159 } 160} 161 162/* SCI handler run in work queue, so ioremap can be used here */ 163static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, 164 int from_phys) 165{ 166 void *vaddr; 167 168 vaddr = ioremap_cache(paddr, len); 169 if (!vaddr) 170 return -ENOMEM; 171 if (from_phys) 172 memcpy(buffer, vaddr, len); 173 else 174 memcpy(vaddr, buffer, len); 175 iounmap(vaddr); 176 177 return 0; 178} 179 180static int ghes_read_estatus(struct ghes *ghes, int silent) 181{ 182 struct acpi_hest_generic *g = ghes->generic; 183 u64 buf_paddr; 184 u32 len; 185 int rc; 186 187 rc = acpi_atomic_read(&buf_paddr, &g->error_status_address); 188 if (rc) { 189 if (!silent && printk_ratelimit()) 190 pr_warning(FW_WARN GHES_PFX 191"Failed to read error status block address for hardware error source: %d.\n", 192 g->header.source_id); 193 return -EIO; 194 } 195 if (!buf_paddr) 196 return -ENOENT; 197 198 rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, 199 sizeof(*ghes->estatus), 1); 200 if (rc) 201 return rc; 202 if (!ghes->estatus->block_status) 203 return -ENOENT; 204 205 ghes->buffer_paddr = buf_paddr; 206 ghes->flags |= GHES_TO_CLEAR; 207 208 rc = -EIO; 209 len = apei_estatus_len(ghes->estatus); 210 if (len < sizeof(*ghes->estatus)) 211 goto err_read_block; 212 if (len > ghes->generic->error_block_length) 213 goto err_read_block; 214 if (apei_estatus_check_header(ghes->estatus)) 215 goto err_read_block; 216 rc = ghes_copy_tofrom_phys(ghes->estatus + 1, 217 buf_paddr + sizeof(*ghes->estatus), 218 len - sizeof(*ghes->estatus), 1); 219 if (rc) 220 return rc; 221 if (apei_estatus_check(ghes->estatus)) 222 goto err_read_block; 223 rc = 0; 224 225err_read_block: 226 if (rc && !silent) 227 pr_warning(FW_WARN GHES_PFX 228 "Failed to read error status block!\n"); 229 return rc; 230} 231 232static void ghes_clear_estatus(struct ghes *ghes) 233{ 234 ghes->estatus->block_status = 0; 235 if (!(ghes->flags & GHES_TO_CLEAR)) 236 return; 237 ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr, 238 sizeof(ghes->estatus->block_status), 0); 239 ghes->flags &= ~GHES_TO_CLEAR; 240} 241 242static void ghes_do_proc(struct ghes *ghes) 243{ 244 int sev, processed = 0; 245 struct acpi_hest_generic_data *gdata; 246 247 sev = ghes_severity(ghes->estatus->error_severity); 248 apei_estatus_for_each_section(ghes->estatus, gdata) { 249#ifdef CONFIG_X86_MCE 250 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, 251 CPER_SEC_PLATFORM_MEM)) { 252 apei_mce_report_mem_error( 253 sev == GHES_SEV_CORRECTED, 254 (struct cper_sec_mem_err *)(gdata+1)); 255 processed = 1; 256 } 257#endif 258 } 259} 260 261static void ghes_print_estatus(const char *pfx, struct ghes *ghes) 262{ 263 /* Not more than 2 messages every 5 seconds */ 264 static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); 265 266 if (pfx == NULL) { 267 if (ghes_severity(ghes->estatus->error_severity) <= 268 GHES_SEV_CORRECTED) 269 pfx = KERN_WARNING HW_ERR; 270 else 271 pfx = KERN_ERR HW_ERR; 272 } 273 if (__ratelimit(&ratelimit)) { 274 printk( 275 "%s""Hardware error from APEI Generic Hardware Error Source: %d\n", 276 pfx, ghes->generic->header.source_id); 277 apei_estatus_print(pfx, ghes->estatus); 278 } 279} 280 281static int ghes_proc(struct ghes *ghes) 282{ 283 int rc; 284 285 rc = ghes_read_estatus(ghes, 0); 286 if (rc) 287 goto out; 288 ghes_print_estatus(NULL, ghes); 289 ghes_do_proc(ghes); 290 291out: 292 ghes_clear_estatus(ghes); 293 return 0; 294} 295 296static int ghes_notify_sci(struct notifier_block *this, 297 unsigned long event, void *data) 298{ 299 struct ghes *ghes; 300 int ret = NOTIFY_DONE; 301 302 rcu_read_lock(); 303 list_for_each_entry_rcu(ghes, &ghes_sci, list) { 304 if (!ghes_proc(ghes)) 305 ret = NOTIFY_OK; 306 } 307 rcu_read_unlock(); 308 309 return ret; 310} 311 312static struct notifier_block ghes_notifier_sci = { 313 .notifier_call = ghes_notify_sci, 314}; 315 316static int __devinit ghes_probe(struct platform_device *ghes_dev) 317{ 318 struct acpi_hest_generic *generic; 319 struct ghes *ghes = NULL; 320 int rc = -EINVAL; 321 322 generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; 323 if (!generic->enabled) 324 return -ENODEV; 325 326 if (generic->error_block_length < 327 sizeof(struct acpi_hest_generic_status)) { 328 pr_warning(FW_BUG GHES_PFX 329"Invalid error block length: %u for generic hardware error source: %d\n", 330 generic->error_block_length, 331 generic->header.source_id); 332 goto err; 333 } 334 if (generic->records_to_preallocate == 0) { 335 pr_warning(FW_BUG GHES_PFX 336"Invalid records to preallocate: %u for generic hardware error source: %d\n", 337 generic->records_to_preallocate, 338 generic->header.source_id); 339 goto err; 340 } 341 ghes = ghes_new(generic); 342 if (IS_ERR(ghes)) { 343 rc = PTR_ERR(ghes); 344 ghes = NULL; 345 goto err; 346 } 347 if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) { 348 mutex_lock(&ghes_list_mutex); 349 if (list_empty(&ghes_sci)) 350 register_acpi_hed_notifier(&ghes_notifier_sci); 351 list_add_rcu(&ghes->list, &ghes_sci); 352 mutex_unlock(&ghes_list_mutex); 353 } else { 354 unsigned char *notify = NULL; 355 356 switch (generic->notify.type) { 357 case ACPI_HEST_NOTIFY_POLLED: 358 notify = "POLL"; 359 break; 360 case ACPI_HEST_NOTIFY_EXTERNAL: 361 case ACPI_HEST_NOTIFY_LOCAL: 362 notify = "IRQ"; 363 break; 364 case ACPI_HEST_NOTIFY_NMI: 365 notify = "NMI"; 366 break; 367 } 368 if (notify) { 369 pr_warning(GHES_PFX 370"Generic hardware error source: %d notified via %s is not supported!\n", 371 generic->header.source_id, notify); 372 } else { 373 pr_warning(FW_WARN GHES_PFX 374"Unknown notification type: %u for generic hardware error source: %d\n", 375 generic->notify.type, generic->header.source_id); 376 } 377 rc = -ENODEV; 378 goto err; 379 } 380 platform_set_drvdata(ghes_dev, ghes); 381 382 return 0; 383err: 384 if (ghes) { 385 ghes_fini(ghes); 386 kfree(ghes); 387 } 388 return rc; 389} 390 391static int __devexit ghes_remove(struct platform_device *ghes_dev) 392{ 393 struct ghes *ghes; 394 struct acpi_hest_generic *generic; 395 396 ghes = platform_get_drvdata(ghes_dev); 397 generic = ghes->generic; 398 399 switch (generic->notify.type) { 400 case ACPI_HEST_NOTIFY_SCI: 401 mutex_lock(&ghes_list_mutex); 402 list_del_rcu(&ghes->list); 403 if (list_empty(&ghes_sci)) 404 unregister_acpi_hed_notifier(&ghes_notifier_sci); 405 mutex_unlock(&ghes_list_mutex); 406 break; 407 default: 408 BUG(); 409 break; 410 } 411 412 synchronize_rcu(); 413 ghes_fini(ghes); 414 kfree(ghes); 415 416 platform_set_drvdata(ghes_dev, NULL); 417 418 return 0; 419} 420 421static struct platform_driver ghes_platform_driver = { 422 .driver = { 423 .name = "GHES", 424 .owner = THIS_MODULE, 425 }, 426 .probe = ghes_probe, 427 .remove = ghes_remove, 428}; 429 430static int __init ghes_init(void) 431{ 432 if (acpi_disabled) 433 return -ENODEV; 434 435 if (hest_disable) { 436 pr_info(GHES_PFX "HEST is not enabled!\n"); 437 return -EINVAL; 438 } 439 440 return platform_driver_register(&ghes_platform_driver); 441} 442 443static void __exit ghes_exit(void) 444{ 445 platform_driver_unregister(&ghes_platform_driver); 446} 447 448module_init(ghes_init); 449module_exit(ghes_exit); 450 451MODULE_AUTHOR("Huang Ying"); 452MODULE_DESCRIPTION("APEI Generic Hardware Error Source support"); 453MODULE_LICENSE("GPL"); 454MODULE_ALIAS("platform:GHES"); 455