/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

/*
 * register_memory - set up a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;

	error = device_register(&memory->dev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
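
/*
 * Example of how a client subscribes to the chains above.  This is an
 * illustrative sketch only, not code from this file: the callback and
 * notifier_block names are hypothetical, while MEM_GOING_OFFLINE,
 * MEM_ONLINE, struct memory_notify and NOTIFY_OK come from
 * <linux/memory.h> and <linux/notifier.h>:
 *
 *	static int example_mem_callback(struct notifier_block *nb,
 *					unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			... prepare for pfns [mn->start_pfn,
 *			    mn->start_pfn + mn->nr_pages) going away ...
 *			break;
 *		case MEM_ONLINE:
 *			... react to newly onlined memory ...
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_callback,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 */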

/*
 * Show the first physical section index (the memory block id) that
 * this memory block covers.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP.  We look up the page once per section
	 * and assume memmap is contiguous within each section.
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}
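
/*
 * Example of what the show routines above produce in sysfs (the block
 * number and values below are hypothetical; phys_index and
 * end_phys_index print the block id in %08lx format):
 *
 *	$ cat /sys/devices/system/memory/memory32/phys_index
 *	00000020
 *	$ cat /sys/devices/system/memory/memory32/end_phys_index
 *	00000020
 *	$ cat /sys/devices/system/memory/memory32/removable
 *	1
 *	$ cat /sys/devices/system/memory/memory32/state
 *	online
 */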

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
	unsigned long start_pfn, start_paddr;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

	switch (action) {
	case MEM_ONLINE:
		start_pfn = page_to_pfn(first_page);

		if (!pages_correctly_reserved(start_pfn, nr_pages))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages);
		break;
	case MEM_OFFLINE:
		start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
		ret = remove_memory(start_paddr,
				    nr_pages << PAGE_SHIFT);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state);

	if (ret) {
		mem->state = from_state_req;
		goto out;
	}

	mem->state = to_state;
	switch (mem->state) {
	case MEM_OFFLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
		break;
	case MEM_ONLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
		break;
	default:
		break;
	}
out:
	mutex_unlock(&mem->state_mutex);
	return ret;
}

static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, dev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if (!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);

	if (ret)
		return ret;
	return count;
}
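
/*
 * Example of driving the state machine above from userspace (the block
 * number is hypothetical).  Offlining transitions MEM_ONLINE ->
 * MEM_GOING_OFFLINE -> MEM_OFFLINE; writing a state the block cannot
 * reach from its current one fails with -EINVAL:
 *
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *	# echo online > /sys/devices/system/memory/memory32/state
 */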

/*
 * phys_device is a bad name for this.  What we really want is a way to
 * differentiate between memory ranges that are part of physical devices
 * that constitute a complete, removable unit or FRU (field replaceable
 * unit): i.e. do these ranges belong to the same physical device, such
 * that if we offline all of these sections we can then remove the
 * physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name)	\
	device_create_file(&mem->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	device_remove_file(&mem->dev, &dev_attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return device_create_file(memory_subsys.dev_root,
				  &dev_attr_block_size_bytes);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = device_create_file(memory_subsys.dev_root,
				&dev_attr_soft_offline_page);
	if (!err)
		err = device_create_file(memory_subsys.dev_root,
				&dev_attr_hard_offline_page);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif
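
/*
 * Example usage of the two optional interfaces above (the addresses are
 * hypothetical and platform-dependent).  probe wants a physical address
 * aligned to the memory block size; the *_offline_page files take a
 * physical address and derive the pfn internally:
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 *	# echo 0x7f5ea000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x7f5ea000 > /sys/devices/system/memory/hard_offline_page
 */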

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return container_of(dev, struct memory_block, dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index.  If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}
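
/*
 * Illustrative use of the hinted lookup above (a sketch, not code from
 * this file): when walking sections in order, feeding the previous
 * result back in as the hint avoids restarting the subsystem scan from
 * the beginning for every section; the hand-over of references is
 * handled by find_memory_block_hinted() itself:
 *
 *	struct memory_block *mem = NULL;
 *	int i;
 *
 *	for (i = 0; i < NR_MEM_SECTIONS; i++) {
 *		if (!present_section_nr(i))
 *			continue;
 *		mem = find_memory_block_hinted(__nr_to_section(i), mem);
 *		if (mem)
 *			... use mem ...
 *	}
 *	if (mem)
 *		kobject_put(&mem->dev.kobj);
 */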

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			struct memory_block **mem_p,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem = NULL;
	int scn_nr = __section_nr(section);
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	if (context == BOOT) {
		/* same memory block? */
		if (mem_p && *mem_p)
			if (scn_nr >= (*mem_p)->start_section_nr &&
			    scn_nr <= (*mem_p)->end_section_nr) {
				mem = *mem_p;
				kobject_get(&mem->dev.kobj);
			}
	} else
		mem = find_memory_block(section);

	if (mem) {
		mem->section_count++;
		kobject_put(&mem->dev.kobj);
	} else {
		ret = init_memory_block(&mem, section, state);
		/* store memory_block pointer for next loop */
		if (!ret && context == BOOT)
			if (mem_p)
				*mem_p = mem;
	}

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->dev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining them.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;
	struct memory_block *mem = NULL;

	ret = subsys_system_register(&memory_subsys, NULL);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		/* don't need to reuse memory_block if only one per block */
		err = add_memory_section(0, __nr_to_section(i),
					 (sections_per_block == 1) ? NULL : &mem,
					 MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}
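
/*
 * memory_dev_init() is expected to run exactly once at boot (called
 * from driver_init() in drivers/base/init.c), before any hotplug
 * activity.  The resulting sysfs tree looks roughly like this,
 * assuming a hypothetical 128MB (0x8000000) block size:
 *
 *	/sys/devices/system/memory/block_size_bytes	-> 8000000
 *	/sys/devices/system/memory/memory0/
 *	/sys/devices/system/memory/memory1/
 *	...
 */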