hugetlb.c revision f412c97abef71026d8192ca8efca231f1e3906b3
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/page-isolation.h>
#include <linux/jhash.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
unsigned long hugepages_treat_as_movable;

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, free the subpool. */
	if (free)
		kfree(spool);
}

struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
{
	struct hugepage_subpool *spool;

	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = nr_blocks;
	spool->used_hpages = 0;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}

static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				      long delta)
{
	int ret = 0;

	if (!spool)
		return 0;

	spin_lock(&spool->lock);
	if ((spool->used_hpages + delta) <= spool->max_hpages) {
		spool->used_hpages += delta;
	} else {
		ret = -ENOMEM;
	}
	spin_unlock(&spool->lock);

	return ret;
}

static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	if (!spool)
		return;

	spin_lock(&spool->lock);
	spool->used_hpages -= delta;
	/* If hugetlbfs_put_super couldn't free spool due to
	 * an outstanding quota reference, free it now. */
	unlock_or_release_subpool(spool);
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

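/*
 * Illustrative sketch (not part of the original file): the usual
 * accounting sequence on the subpool helpers above, assuming a caller
 * such as hugetlbfs, which creates one subpool per mount when a size
 * limit is given:
 *
 *	struct hugepage_subpool *spool = hugepage_new_subpool(max_hpages);
 *
 *	if (hugepage_subpool_get_pages(spool, 1))
 *		goto fail;			charge one huge page
 *	...
 *	hugepage_subpool_put_pages(spool, 1);	uncharge it again
 *	hugepage_put_subpool(spool);		drop the handle; the subpool
 *						is kfree()d once no pages and
 *						no handles remain
 */
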
/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 * across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and
 * protected by a resv_map's lock
 */
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg, *trg;

	spin_lock(&resv->lock);
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	spin_unlock(&resv->lock);
	return 0;
}

static long region_chg(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg = NULL;
	long chg = 0;

retry:
	spin_lock(&resv->lock);
	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		if (!nrg) {
			spin_unlock(&resv->lock);
			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
			if (!nrg)
				return -ENOMEM;

			nrg->from = f;
			nrg->to = f;
			INIT_LIST_HEAD(&nrg->link);
			goto retry;
		}

		list_add(&nrg->link, rg->link.prev);
		chg = t - f;
		goto out_nrg;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			goto out;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation.
*/ 243 if (rg->to > t) { 244 chg += rg->to - t; 245 t = rg->to; 246 } 247 chg -= rg->to - rg->from; 248 } 249 250out: 251 spin_unlock(&resv->lock); 252 /* We already know we raced and no longer need the new region */ 253 kfree(nrg); 254 return chg; 255out_nrg: 256 spin_unlock(&resv->lock); 257 return chg; 258} 259 260static long region_truncate(struct resv_map *resv, long end) 261{ 262 struct list_head *head = &resv->regions; 263 struct file_region *rg, *trg; 264 long chg = 0; 265 266 spin_lock(&resv->lock); 267 /* Locate the region we are either in or before. */ 268 list_for_each_entry(rg, head, link) 269 if (end <= rg->to) 270 break; 271 if (&rg->link == head) 272 goto out; 273 274 /* If we are in the middle of a region then adjust it. */ 275 if (end > rg->from) { 276 chg = rg->to - end; 277 rg->to = end; 278 rg = list_entry(rg->link.next, typeof(*rg), link); 279 } 280 281 /* Drop any remaining regions. */ 282 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 283 if (&rg->link == head) 284 break; 285 chg += rg->to - rg->from; 286 list_del(&rg->link); 287 kfree(rg); 288 } 289 290out: 291 spin_unlock(&resv->lock); 292 return chg; 293} 294 295static long region_count(struct resv_map *resv, long f, long t) 296{ 297 struct list_head *head = &resv->regions; 298 struct file_region *rg; 299 long chg = 0; 300 301 spin_lock(&resv->lock); 302 /* Locate each segment we overlap with, and count that overlap. */ 303 list_for_each_entry(rg, head, link) { 304 long seg_from; 305 long seg_to; 306 307 if (rg->to <= f) 308 continue; 309 if (rg->from >= t) 310 break; 311 312 seg_from = max(rg->from, f); 313 seg_to = min(rg->to, t); 314 315 chg += seg_to - seg_from; 316 } 317 spin_unlock(&resv->lock); 318 319 return chg; 320} 321 322/* 323 * Convert the address within this vma to the page offset within 324 * the mapping, in pagecache page units; huge pages here. 325 */ 326static pgoff_t vma_hugecache_offset(struct hstate *h, 327 struct vm_area_struct *vma, unsigned long address) 328{ 329 return ((address - vma->vm_start) >> huge_page_shift(h)) + 330 (vma->vm_pgoff >> huge_page_order(h)); 331} 332 333pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 334 unsigned long address) 335{ 336 return vma_hugecache_offset(hstate_vma(vma), vma, address); 337} 338 339/* 340 * Return the size of the pages allocated when backing a VMA. In the majority 341 * cases this will be same size as used by the page table entries. 342 */ 343unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 344{ 345 struct hstate *hstate; 346 347 if (!is_vm_hugetlb_page(vma)) 348 return PAGE_SIZE; 349 350 hstate = hstate_vma(vma); 351 352 return 1UL << huge_page_shift(hstate); 353} 354EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 355 356/* 357 * Return the page size being used by the MMU to back a VMA. In the majority 358 * of cases, the page size used by the kernel matches the MMU size. On 359 * architectures where it differs, an architecture-specific version of this 360 * function is required. 361 */ 362#ifndef vma_mmu_pagesize 363unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 364{ 365 return vma_kernel_pagesize(vma); 366} 367#endif 368 369/* 370 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 371 * bits of the reservation map pointer, which are always clear due to 372 * alignment. 
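 *
 * For example, a private VMA's vm_private_data holds the resv_map pointer
 * with HPAGE_RESV_OWNER or'd into bit 0; vma_resv_map() recovers the
 * pointer by masking with ~HPAGE_RESV_MASK, and is_vma_resv_set() tests
 * the flag bits (see the helpers below).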
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK     (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and these entries
 * persist even after the page is instantiated.  A private mapping has a
 * region map associated with the original mmap which is attached to all
 * VMAs which reference it; this region map represents those offsets which
 * have consumed reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	if (!resv_map)
		return NULL;

	kref_init(&resv_map->refs);
	spin_lock_init(&resv_map->lock);
	INIT_LIST_HEAD(&resv_map->regions);

	return resv_map;
}

void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);

	/* Clear out any active regions before we release the map.
*/ 426 region_truncate(resv_map, 0); 427 kfree(resv_map); 428} 429 430static inline struct resv_map *inode_resv_map(struct inode *inode) 431{ 432 return inode->i_mapping->private_data; 433} 434 435static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 436{ 437 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 438 if (vma->vm_flags & VM_MAYSHARE) { 439 struct address_space *mapping = vma->vm_file->f_mapping; 440 struct inode *inode = mapping->host; 441 442 return inode_resv_map(inode); 443 444 } else { 445 return (struct resv_map *)(get_vma_private_data(vma) & 446 ~HPAGE_RESV_MASK); 447 } 448} 449 450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 451{ 452 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 453 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 454 455 set_vma_private_data(vma, (get_vma_private_data(vma) & 456 HPAGE_RESV_MASK) | (unsigned long)map); 457} 458 459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 460{ 461 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 462 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 463 464 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 465} 466 467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 468{ 469 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 470 471 return (get_vma_private_data(vma) & flag) != 0; 472} 473 474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 475void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 476{ 477 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 478 if (!(vma->vm_flags & VM_MAYSHARE)) 479 vma->vm_private_data = (void *)0; 480} 481 482/* Returns true if the VMA has associated reserve pages */ 483static int vma_has_reserves(struct vm_area_struct *vma, long chg) 484{ 485 if (vma->vm_flags & VM_NORESERVE) { 486 /* 487 * This address is already reserved by other process(chg == 0), 488 * so, we should decrement reserved count. Without decrementing, 489 * reserve count remains after releasing inode, because this 490 * allocated page will go into page cache and is regarded as 491 * coming from reserved pool in releasing step. Currently, we 492 * don't have any other solution to deal with this situation 493 * properly, so add work-around here. 494 */ 495 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 496 return 1; 497 else 498 return 0; 499 } 500 501 /* Shared mappings always use reserves */ 502 if (vma->vm_flags & VM_MAYSHARE) 503 return 1; 504 505 /* 506 * Only the process that called mmap() has reserves for 507 * private mappings. 508 */ 509 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 510 return 1; 511 512 return 0; 513} 514 515static void enqueue_huge_page(struct hstate *h, struct page *page) 516{ 517 int nid = page_to_nid(page); 518 list_move(&page->lru, &h->hugepage_freelists[nid]); 519 h->free_huge_pages++; 520 h->free_huge_pages_node[nid]++; 521} 522 523static struct page *dequeue_huge_page_node(struct hstate *h, int nid) 524{ 525 struct page *page; 526 527 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) 528 if (!is_migrate_isolate_page(page)) 529 break; 530 /* 531 * if 'non-isolated free hugepage' not found on the list, 532 * the allocation fails. 533 */ 534 if (&h->hugepage_freelists[nid] == &page->lru) 535 return NULL; 536 list_move(&page->lru, &h->hugepage_activelist); 537 set_page_refcounted(page); 538 h->free_huge_pages--; 539 h->free_huge_pages_node[nid]--; 540 return page; 541} 542 543/* Movability of hugepages depends on migration support. 
 */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
	if (hugepages_treat_as_movable || hugepage_migration_support(h))
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve,
				long chg)
{
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	unsigned int cpuset_mems_cookie;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed.
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask(h), &mpol, &nodemask);

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
			page = dequeue_huge_page_node(h, zone_to_nid(zone));
			if (page) {
				if (avoid_reserve)
					break;
				if (!vma_has_reserves(vma, chg))
					break;

				SetPagePrivate(page);
				h->resv_huge_pages--;
				break;
			}
		}
	}

	mpol_cond_put(mpol);
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	return page;

err:
	return NULL;
}

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	VM_BUG_ON(h->order >= MAX_ORDER);

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, huge_page_order(h));
}

struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

static void free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct hugepage_subpool *spool =
		(struct hugepage_subpool *)page_private(page);
	bool restore_reserve;

	set_page_private(page, 0);
	page->mapping = NULL;
	BUG_ON(page_count(page));
	BUG_ON(page_mapcount(page));
	restore_reserve = PagePrivate(page);
	ClearPagePrivate(page);

	spin_lock(&hugetlb_lock);
	hugetlb_cgroup_uncharge_page(hstate_index(h),
				     pages_per_huge_page(h), page);
	if (restore_reserve)
		h->resv_huge_pages++;

	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
		/* remove the page from active list */
		list_del(&page->lru);
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		arch_clear_hugepage_flags(page);
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
	hugepage_subpool_put_pages(spool, 1);
}

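/*
 * Illustrative sketch (not part of the original file): free_huge_page()
 * above is reached through the ordinary compound-page put path, because
 * prep_new_huge_page() below installs it as the compound destructor:
 *
 *	page = alloc_huge_page(vma, address, 0);
 *	...
 *	put_page(page);		the final reference ends up in
 *				free_huge_page(), which re-enqueues the
 *				page on the free list or, if it is a
 *				surplus page, releases it back to the
 *				buddy allocator
 */
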
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, free_huge_page);
	spin_lock(&hugetlb_lock);
	set_hugetlb_cgroup(page, NULL);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
	put_page(page); /* free it into the hugepage allocator */
}

static void __init prep_compound_gigantic_page(struct page *page,
					       unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__SetPageHead(page);
	__ClearPageReserved(page);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		__SetPageTail(p);
		/*
		 * For gigantic hugepages allocated through bootmem at
		 * boot, it's safer to be consistent with the not-gigantic
		 * hugepages and clear the PG_reserved bit from all tail pages
		 * too.  Otherwise drivers using get_user_pages() to access
		 * tail pages may get the reference counting wrong if they see
		 * PG_reserved set on a tail page (despite the head page not
		 * having PG_reserved set).  Enforcing this consistency between
		 * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
		 */
		__ClearPageReserved(p);
		set_page_count(p, 0);
		p->first_page = page;
	}
}

/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	return get_compound_page_dtor(page) == free_huge_page;
}
EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() only returns true for hugetlbfs head pages, but not for
 * normal or transparent huge pages.
741 */ 742int PageHeadHuge(struct page *page_head) 743{ 744 if (!PageHead(page_head)) 745 return 0; 746 747 return get_compound_page_dtor(page_head) == free_huge_page; 748} 749 750pgoff_t __basepage_index(struct page *page) 751{ 752 struct page *page_head = compound_head(page); 753 pgoff_t index = page_index(page_head); 754 unsigned long compound_idx; 755 756 if (!PageHuge(page_head)) 757 return page_index(page); 758 759 if (compound_order(page_head) >= MAX_ORDER) 760 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 761 else 762 compound_idx = page - page_head; 763 764 return (index << compound_order(page_head)) + compound_idx; 765} 766 767static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 768{ 769 struct page *page; 770 771 if (h->order >= MAX_ORDER) 772 return NULL; 773 774 page = alloc_pages_exact_node(nid, 775 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 776 __GFP_REPEAT|__GFP_NOWARN, 777 huge_page_order(h)); 778 if (page) { 779 if (arch_prepare_hugepage(page)) { 780 __free_pages(page, huge_page_order(h)); 781 return NULL; 782 } 783 prep_new_huge_page(h, page, nid); 784 } 785 786 return page; 787} 788 789/* 790 * common helper functions for hstate_next_node_to_{alloc|free}. 791 * We may have allocated or freed a huge page based on a different 792 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 793 * be outside of *nodes_allowed. Ensure that we use an allowed 794 * node for alloc or free. 795 */ 796static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 797{ 798 nid = next_node(nid, *nodes_allowed); 799 if (nid == MAX_NUMNODES) 800 nid = first_node(*nodes_allowed); 801 VM_BUG_ON(nid >= MAX_NUMNODES); 802 803 return nid; 804} 805 806static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 807{ 808 if (!node_isset(nid, *nodes_allowed)) 809 nid = next_node_allowed(nid, nodes_allowed); 810 return nid; 811} 812 813/* 814 * returns the previously saved node ["this node"] from which to 815 * allocate a persistent huge page for the pool and advance the 816 * next node from which to allocate, handling wrap at end of node 817 * mask. 818 */ 819static int hstate_next_node_to_alloc(struct hstate *h, 820 nodemask_t *nodes_allowed) 821{ 822 int nid; 823 824 VM_BUG_ON(!nodes_allowed); 825 826 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 827 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 828 829 return nid; 830} 831 832/* 833 * helper for free_pool_huge_page() - return the previously saved 834 * node ["this node"] from which to free a huge page. Advance the 835 * next node id whether or not we find a free huge page to free so 836 * that the next attempt to free addresses the next node. 
837 */ 838static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 839{ 840 int nid; 841 842 VM_BUG_ON(!nodes_allowed); 843 844 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 845 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 846 847 return nid; 848} 849 850#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 851 for (nr_nodes = nodes_weight(*mask); \ 852 nr_nodes > 0 && \ 853 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 854 nr_nodes--) 855 856#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 857 for (nr_nodes = nodes_weight(*mask); \ 858 nr_nodes > 0 && \ 859 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 860 nr_nodes--) 861 862static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 863{ 864 struct page *page; 865 int nr_nodes, node; 866 int ret = 0; 867 868 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 869 page = alloc_fresh_huge_page_node(h, node); 870 if (page) { 871 ret = 1; 872 break; 873 } 874 } 875 876 if (ret) 877 count_vm_event(HTLB_BUDDY_PGALLOC); 878 else 879 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 880 881 return ret; 882} 883 884/* 885 * Free huge page from pool from next node to free. 886 * Attempt to keep persistent huge pages more or less 887 * balanced over allowed nodes. 888 * Called with hugetlb_lock locked. 889 */ 890static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 891 bool acct_surplus) 892{ 893 int nr_nodes, node; 894 int ret = 0; 895 896 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 897 /* 898 * If we're returning unused surplus pages, only examine 899 * nodes with surplus pages. 900 */ 901 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 902 !list_empty(&h->hugepage_freelists[node])) { 903 struct page *page = 904 list_entry(h->hugepage_freelists[node].next, 905 struct page, lru); 906 list_del(&page->lru); 907 h->free_huge_pages--; 908 h->free_huge_pages_node[node]--; 909 if (acct_surplus) { 910 h->surplus_huge_pages--; 911 h->surplus_huge_pages_node[node]--; 912 } 913 update_and_free_page(h, page); 914 ret = 1; 915 break; 916 } 917 } 918 919 return ret; 920} 921 922/* 923 * Dissolve a given free hugepage into free buddy pages. This function does 924 * nothing for in-use (including surplus) hugepages. 925 */ 926static void dissolve_free_huge_page(struct page *page) 927{ 928 spin_lock(&hugetlb_lock); 929 if (PageHuge(page) && !page_count(page)) { 930 struct hstate *h = page_hstate(page); 931 int nid = page_to_nid(page); 932 list_del(&page->lru); 933 h->free_huge_pages--; 934 h->free_huge_pages_node[nid]--; 935 update_and_free_page(h, page); 936 } 937 spin_unlock(&hugetlb_lock); 938} 939 940/* 941 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 942 * make specified memory blocks removable from the system. 943 * Note that start_pfn should aligned with (minimum) hugepage size. 
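 * The scan step used below is the smallest configured huge page order,
 * e.g. 1 << 9 == 512 pfns when only 2 MB pages exist on x86-64 with a
 * 4 KB base page size.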
944 */ 945void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 946{ 947 unsigned int order = 8 * sizeof(void *); 948 unsigned long pfn; 949 struct hstate *h; 950 951 /* Set scan step to minimum hugepage size */ 952 for_each_hstate(h) 953 if (order > huge_page_order(h)) 954 order = huge_page_order(h); 955 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); 956 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) 957 dissolve_free_huge_page(pfn_to_page(pfn)); 958} 959 960static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 961{ 962 struct page *page; 963 unsigned int r_nid; 964 965 if (h->order >= MAX_ORDER) 966 return NULL; 967 968 /* 969 * Assume we will successfully allocate the surplus page to 970 * prevent racing processes from causing the surplus to exceed 971 * overcommit 972 * 973 * This however introduces a different race, where a process B 974 * tries to grow the static hugepage pool while alloc_pages() is 975 * called by process A. B will only examine the per-node 976 * counters in determining if surplus huge pages can be 977 * converted to normal huge pages in adjust_pool_surplus(). A 978 * won't be able to increment the per-node counter, until the 979 * lock is dropped by B, but B doesn't drop hugetlb_lock until 980 * no more huge pages can be converted from surplus to normal 981 * state (and doesn't try to convert again). Thus, we have a 982 * case where a surplus huge page exists, the pool is grown, and 983 * the surplus huge page still exists after, even though it 984 * should just have been converted to a normal huge page. This 985 * does not leak memory, though, as the hugepage will be freed 986 * once it is out of use. It also does not allow the counters to 987 * go out of whack in adjust_pool_surplus() as we don't modify 988 * the node values until we've gotten the hugepage and only the 989 * per-node value is checked there. 990 */ 991 spin_lock(&hugetlb_lock); 992 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 993 spin_unlock(&hugetlb_lock); 994 return NULL; 995 } else { 996 h->nr_huge_pages++; 997 h->surplus_huge_pages++; 998 } 999 spin_unlock(&hugetlb_lock); 1000 1001 if (nid == NUMA_NO_NODE) 1002 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| 1003 __GFP_REPEAT|__GFP_NOWARN, 1004 huge_page_order(h)); 1005 else 1006 page = alloc_pages_exact_node(nid, 1007 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1008 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 1009 1010 if (page && arch_prepare_hugepage(page)) { 1011 __free_pages(page, huge_page_order(h)); 1012 page = NULL; 1013 } 1014 1015 spin_lock(&hugetlb_lock); 1016 if (page) { 1017 INIT_LIST_HEAD(&page->lru); 1018 r_nid = page_to_nid(page); 1019 set_compound_page_dtor(page, free_huge_page); 1020 set_hugetlb_cgroup(page, NULL); 1021 /* 1022 * We incremented the global counters already 1023 */ 1024 h->nr_huge_pages_node[r_nid]++; 1025 h->surplus_huge_pages_node[r_nid]++; 1026 __count_vm_event(HTLB_BUDDY_PGALLOC); 1027 } else { 1028 h->nr_huge_pages--; 1029 h->surplus_huge_pages--; 1030 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1031 } 1032 spin_unlock(&hugetlb_lock); 1033 1034 return page; 1035} 1036 1037/* 1038 * This allocation function is useful in the context where vma is irrelevant. 1039 * E.g. soft-offlining uses this function because it only cares physical 1040 * address of error page. 
1041 */ 1042struct page *alloc_huge_page_node(struct hstate *h, int nid) 1043{ 1044 struct page *page = NULL; 1045 1046 spin_lock(&hugetlb_lock); 1047 if (h->free_huge_pages - h->resv_huge_pages > 0) 1048 page = dequeue_huge_page_node(h, nid); 1049 spin_unlock(&hugetlb_lock); 1050 1051 if (!page) 1052 page = alloc_buddy_huge_page(h, nid); 1053 1054 return page; 1055} 1056 1057/* 1058 * Increase the hugetlb pool such that it can accommodate a reservation 1059 * of size 'delta'. 1060 */ 1061static int gather_surplus_pages(struct hstate *h, int delta) 1062{ 1063 struct list_head surplus_list; 1064 struct page *page, *tmp; 1065 int ret, i; 1066 int needed, allocated; 1067 bool alloc_ok = true; 1068 1069 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 1070 if (needed <= 0) { 1071 h->resv_huge_pages += delta; 1072 return 0; 1073 } 1074 1075 allocated = 0; 1076 INIT_LIST_HEAD(&surplus_list); 1077 1078 ret = -ENOMEM; 1079retry: 1080 spin_unlock(&hugetlb_lock); 1081 for (i = 0; i < needed; i++) { 1082 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1083 if (!page) { 1084 alloc_ok = false; 1085 break; 1086 } 1087 list_add(&page->lru, &surplus_list); 1088 } 1089 allocated += i; 1090 1091 /* 1092 * After retaking hugetlb_lock, we need to recalculate 'needed' 1093 * because either resv_huge_pages or free_huge_pages may have changed. 1094 */ 1095 spin_lock(&hugetlb_lock); 1096 needed = (h->resv_huge_pages + delta) - 1097 (h->free_huge_pages + allocated); 1098 if (needed > 0) { 1099 if (alloc_ok) 1100 goto retry; 1101 /* 1102 * We were not able to allocate enough pages to 1103 * satisfy the entire reservation so we free what 1104 * we've allocated so far. 1105 */ 1106 goto free; 1107 } 1108 /* 1109 * The surplus_list now contains _at_least_ the number of extra pages 1110 * needed to accommodate the reservation. Add the appropriate number 1111 * of pages to the hugetlb pool and free the extras back to the buddy 1112 * allocator. Commit the entire reservation here to prevent another 1113 * process from stealing the pages as they are added to the pool but 1114 * before they are reserved. 1115 */ 1116 needed += allocated; 1117 h->resv_huge_pages += delta; 1118 ret = 0; 1119 1120 /* Free the needed pages to the hugetlb pool */ 1121 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1122 if ((--needed) < 0) 1123 break; 1124 /* 1125 * This page is now managed by the hugetlb allocator and has 1126 * no users -- drop the buddy allocator's reference. 1127 */ 1128 put_page_testzero(page); 1129 VM_BUG_ON_PAGE(page_count(page), page); 1130 enqueue_huge_page(h, page); 1131 } 1132free: 1133 spin_unlock(&hugetlb_lock); 1134 1135 /* Free unnecessary surplus pages to the buddy allocator */ 1136 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 1137 put_page(page); 1138 spin_lock(&hugetlb_lock); 1139 1140 return ret; 1141} 1142 1143/* 1144 * When releasing a hugetlb pool reservation, any surplus pages that were 1145 * allocated to satisfy the reservation must be explicitly freed if they were 1146 * never used. 1147 * Called with hugetlb_lock held. 
 */
static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	unsigned long nr_pages;

	/* Uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;

	/* Cannot return gigantic pages currently */
	if (h->order >= MAX_ORDER)
		return;

	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes with memory. Iterate across these nodes
	 * until we can no longer free unreserved surplus pages. This occurs
	 * when the nodes with surplus pages have no free pages.
	 * free_pool_huge_page() will balance the freed pages across the
	 * on-line nodes with memory and will handle the hstate accounting.
	 */
	while (nr_pages--) {
		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
			break;
	}
}

/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation.  Where it does not we will need to logically increase
 * reservation and actually increase subpool usage before an allocation
 * can occur.  Where any new reservation would be required the
 * reservation change is prepared, but not committed.  Once the page
 * has been allocated from the subpool and instantiated the change should
 * be committed via vma_commit_reservation.  No action is required on
 * failure.
 */
static long vma_needs_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct resv_map *resv;
	pgoff_t idx;
	long chg;

	resv = vma_resv_map(vma);
	if (!resv)
		return 1;

	idx = vma_hugecache_offset(h, vma, addr);
	chg = region_chg(resv, idx, idx + 1);

	if (vma->vm_flags & VM_MAYSHARE)
		return chg;
	else
		return chg < 0 ? chg : 0;
}

static void vma_commit_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct resv_map *resv;
	pgoff_t idx;

	resv = vma_resv_map(vma);
	if (!resv)
		return;

	idx = vma_hugecache_offset(h, vma, addr);
	region_add(resv, idx, idx + 1);
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr, int avoid_reserve)
{
	struct hugepage_subpool *spool = subpool_vma(vma);
	struct hstate *h = hstate_vma(vma);
	struct page *page;
	long chg;
	int ret, idx;
	struct hugetlb_cgroup *h_cg;

	idx = hstate_index(h);
	/*
	 * Processes that did not create the mapping will have no
	 * reserves and will not have accounted against the subpool
	 * limit. Check that the subpool limit can be charged before
	 * satisfying the allocation. MAP_NORESERVE mappings may also
	 * need pages and the subpool limit charged if no reserve
	 * mapping overlaps.
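	 *
	 * For example, chg == 1 below means no existing reservation covers
	 * this index, so one page is charged against the subpool before the
	 * allocation proceeds; chg == 0 means an earlier reservation already
	 * accounted for it and no extra subpool charge is taken unless
	 * avoid_reserve is set.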
1238 */ 1239 chg = vma_needs_reservation(h, vma, addr); 1240 if (chg < 0) 1241 return ERR_PTR(-ENOMEM); 1242 if (chg || avoid_reserve) 1243 if (hugepage_subpool_get_pages(spool, 1)) 1244 return ERR_PTR(-ENOSPC); 1245 1246 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1247 if (ret) { 1248 if (chg || avoid_reserve) 1249 hugepage_subpool_put_pages(spool, 1); 1250 return ERR_PTR(-ENOSPC); 1251 } 1252 spin_lock(&hugetlb_lock); 1253 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); 1254 if (!page) { 1255 spin_unlock(&hugetlb_lock); 1256 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1257 if (!page) { 1258 hugetlb_cgroup_uncharge_cgroup(idx, 1259 pages_per_huge_page(h), 1260 h_cg); 1261 if (chg || avoid_reserve) 1262 hugepage_subpool_put_pages(spool, 1); 1263 return ERR_PTR(-ENOSPC); 1264 } 1265 spin_lock(&hugetlb_lock); 1266 list_move(&page->lru, &h->hugepage_activelist); 1267 /* Fall through */ 1268 } 1269 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 1270 spin_unlock(&hugetlb_lock); 1271 1272 set_page_private(page, (unsigned long)spool); 1273 1274 vma_commit_reservation(h, vma, addr); 1275 return page; 1276} 1277 1278/* 1279 * alloc_huge_page()'s wrapper which simply returns the page if allocation 1280 * succeeds, otherwise NULL. This function is called from new_vma_page(), 1281 * where no ERR_VALUE is expected to be returned. 1282 */ 1283struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, 1284 unsigned long addr, int avoid_reserve) 1285{ 1286 struct page *page = alloc_huge_page(vma, addr, avoid_reserve); 1287 if (IS_ERR(page)) 1288 page = NULL; 1289 return page; 1290} 1291 1292int __weak alloc_bootmem_huge_page(struct hstate *h) 1293{ 1294 struct huge_bootmem_page *m; 1295 int nr_nodes, node; 1296 1297 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1298 void *addr; 1299 1300 addr = memblock_virt_alloc_try_nid_nopanic( 1301 huge_page_size(h), huge_page_size(h), 1302 0, BOOTMEM_ALLOC_ACCESSIBLE, node); 1303 if (addr) { 1304 /* 1305 * Use the beginning of the huge page to store the 1306 * huge_bootmem_page struct (until gather_bootmem 1307 * puts them into the mem_map). 
1308 */ 1309 m = addr; 1310 goto found; 1311 } 1312 } 1313 return 0; 1314 1315found: 1316 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); 1317 /* Put them into a private list first because mem_map is not up yet */ 1318 list_add(&m->list, &huge_boot_pages); 1319 m->hstate = h; 1320 return 1; 1321} 1322 1323static void __init prep_compound_huge_page(struct page *page, int order) 1324{ 1325 if (unlikely(order > (MAX_ORDER - 1))) 1326 prep_compound_gigantic_page(page, order); 1327 else 1328 prep_compound_page(page, order); 1329} 1330 1331/* Put bootmem huge pages into the standard lists after mem_map is up */ 1332static void __init gather_bootmem_prealloc(void) 1333{ 1334 struct huge_bootmem_page *m; 1335 1336 list_for_each_entry(m, &huge_boot_pages, list) { 1337 struct hstate *h = m->hstate; 1338 struct page *page; 1339 1340#ifdef CONFIG_HIGHMEM 1341 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1342 memblock_free_late(__pa(m), 1343 sizeof(struct huge_bootmem_page)); 1344#else 1345 page = virt_to_page(m); 1346#endif 1347 WARN_ON(page_count(page) != 1); 1348 prep_compound_huge_page(page, h->order); 1349 WARN_ON(PageReserved(page)); 1350 prep_new_huge_page(h, page, page_to_nid(page)); 1351 /* 1352 * If we had gigantic hugepages allocated at boot time, we need 1353 * to restore the 'stolen' pages to totalram_pages in order to 1354 * fix confusing memory reports from free(1) and another 1355 * side-effects, like CommitLimit going negative. 1356 */ 1357 if (h->order > (MAX_ORDER - 1)) 1358 adjust_managed_page_count(page, 1 << h->order); 1359 } 1360} 1361 1362static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 1363{ 1364 unsigned long i; 1365 1366 for (i = 0; i < h->max_huge_pages; ++i) { 1367 if (h->order >= MAX_ORDER) { 1368 if (!alloc_bootmem_huge_page(h)) 1369 break; 1370 } else if (!alloc_fresh_huge_page(h, 1371 &node_states[N_MEMORY])) 1372 break; 1373 } 1374 h->max_huge_pages = i; 1375} 1376 1377static void __init hugetlb_init_hstates(void) 1378{ 1379 struct hstate *h; 1380 1381 for_each_hstate(h) { 1382 /* oversize hugepages were init'ed in early boot */ 1383 if (h->order < MAX_ORDER) 1384 hugetlb_hstate_alloc_pages(h); 1385 } 1386} 1387 1388static char * __init memfmt(char *buf, unsigned long n) 1389{ 1390 if (n >= (1UL << 30)) 1391 sprintf(buf, "%lu GB", n >> 30); 1392 else if (n >= (1UL << 20)) 1393 sprintf(buf, "%lu MB", n >> 20); 1394 else 1395 sprintf(buf, "%lu KB", n >> 10); 1396 return buf; 1397} 1398 1399static void __init report_hugepages(void) 1400{ 1401 struct hstate *h; 1402 1403 for_each_hstate(h) { 1404 char buf[32]; 1405 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 1406 memfmt(buf, huge_page_size(h)), 1407 h->free_huge_pages); 1408 } 1409} 1410 1411#ifdef CONFIG_HIGHMEM 1412static void try_to_free_low(struct hstate *h, unsigned long count, 1413 nodemask_t *nodes_allowed) 1414{ 1415 int i; 1416 1417 if (h->order >= MAX_ORDER) 1418 return; 1419 1420 for_each_node_mask(i, *nodes_allowed) { 1421 struct page *page, *next; 1422 struct list_head *freel = &h->hugepage_freelists[i]; 1423 list_for_each_entry_safe(page, next, freel, lru) { 1424 if (count >= h->nr_huge_pages) 1425 return; 1426 if (PageHighMem(page)) 1427 continue; 1428 list_del(&page->lru); 1429 update_and_free_page(h, page); 1430 h->free_huge_pages--; 1431 h->free_huge_pages_node[page_to_nid(page)]--; 1432 } 1433 } 1434} 1435#else 1436static inline void try_to_free_low(struct hstate *h, unsigned long count, 1437 nodemask_t *nodes_allowed) 1438{ 1439} 1440#endif 1441 
1442/* 1443 * Increment or decrement surplus_huge_pages. Keep node-specific counters 1444 * balanced by operating on them in a round-robin fashion. 1445 * Returns 1 if an adjustment was made. 1446 */ 1447static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 1448 int delta) 1449{ 1450 int nr_nodes, node; 1451 1452 VM_BUG_ON(delta != -1 && delta != 1); 1453 1454 if (delta < 0) { 1455 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1456 if (h->surplus_huge_pages_node[node]) 1457 goto found; 1458 } 1459 } else { 1460 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 1461 if (h->surplus_huge_pages_node[node] < 1462 h->nr_huge_pages_node[node]) 1463 goto found; 1464 } 1465 } 1466 return 0; 1467 1468found: 1469 h->surplus_huge_pages += delta; 1470 h->surplus_huge_pages_node[node] += delta; 1471 return 1; 1472} 1473 1474#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1475static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 1476 nodemask_t *nodes_allowed) 1477{ 1478 unsigned long min_count, ret; 1479 1480 if (h->order >= MAX_ORDER) 1481 return h->max_huge_pages; 1482 1483 /* 1484 * Increase the pool size 1485 * First take pages out of surplus state. Then make up the 1486 * remaining difference by allocating fresh huge pages. 1487 * 1488 * We might race with alloc_buddy_huge_page() here and be unable 1489 * to convert a surplus huge page to a normal huge page. That is 1490 * not critical, though, it just means the overall size of the 1491 * pool might be one hugepage larger than it needs to be, but 1492 * within all the constraints specified by the sysctls. 1493 */ 1494 spin_lock(&hugetlb_lock); 1495 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1496 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 1497 break; 1498 } 1499 1500 while (count > persistent_huge_pages(h)) { 1501 /* 1502 * If this allocation races such that we no longer need the 1503 * page, free_huge_page will handle it by freeing the page 1504 * and reducing the surplus. 1505 */ 1506 spin_unlock(&hugetlb_lock); 1507 ret = alloc_fresh_huge_page(h, nodes_allowed); 1508 spin_lock(&hugetlb_lock); 1509 if (!ret) 1510 goto out; 1511 1512 /* Bail for signals. Probably ctrl-c from user */ 1513 if (signal_pending(current)) 1514 goto out; 1515 } 1516 1517 /* 1518 * Decrease the pool size 1519 * First return free pages to the buddy allocator (being careful 1520 * to keep enough around to satisfy reservations). Then place 1521 * pages into surplus state as needed so the pool will shrink 1522 * to the desired size as pages become free. 1523 * 1524 * By placing pages into the surplus state independent of the 1525 * overcommit value, we are allowing the surplus pool size to 1526 * exceed overcommit. There are few sane options here. Since 1527 * alloc_buddy_huge_page() is checking the global counter, 1528 * though, we'll note that we're not allowed to exceed surplus 1529 * and won't grow the pool anywhere else. Not until one of the 1530 * sysctls are changed, or the surplus pages go out of use. 
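 *
 * As a worked example: with nr_huge_pages == 10, free_huge_pages == 6
 * and resv_huge_pages == 2, min_count below is 2 + 10 - 6 = 6, so the
 * four in-use pages plus the two reserved ones stay in the pool and at
 * most four free pages can be returned to the buddy allocator right away.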
1531 */ 1532 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1533 min_count = max(count, min_count); 1534 try_to_free_low(h, min_count, nodes_allowed); 1535 while (min_count < persistent_huge_pages(h)) { 1536 if (!free_pool_huge_page(h, nodes_allowed, 0)) 1537 break; 1538 } 1539 while (count < persistent_huge_pages(h)) { 1540 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 1541 break; 1542 } 1543out: 1544 ret = persistent_huge_pages(h); 1545 spin_unlock(&hugetlb_lock); 1546 return ret; 1547} 1548 1549#define HSTATE_ATTR_RO(_name) \ 1550 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 1551 1552#define HSTATE_ATTR(_name) \ 1553 static struct kobj_attribute _name##_attr = \ 1554 __ATTR(_name, 0644, _name##_show, _name##_store) 1555 1556static struct kobject *hugepages_kobj; 1557static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1558 1559static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 1560 1561static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 1562{ 1563 int i; 1564 1565 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1566 if (hstate_kobjs[i] == kobj) { 1567 if (nidp) 1568 *nidp = NUMA_NO_NODE; 1569 return &hstates[i]; 1570 } 1571 1572 return kobj_to_node_hstate(kobj, nidp); 1573} 1574 1575static ssize_t nr_hugepages_show_common(struct kobject *kobj, 1576 struct kobj_attribute *attr, char *buf) 1577{ 1578 struct hstate *h; 1579 unsigned long nr_huge_pages; 1580 int nid; 1581 1582 h = kobj_to_hstate(kobj, &nid); 1583 if (nid == NUMA_NO_NODE) 1584 nr_huge_pages = h->nr_huge_pages; 1585 else 1586 nr_huge_pages = h->nr_huge_pages_node[nid]; 1587 1588 return sprintf(buf, "%lu\n", nr_huge_pages); 1589} 1590 1591static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1592 struct kobject *kobj, struct kobj_attribute *attr, 1593 const char *buf, size_t len) 1594{ 1595 int err; 1596 int nid; 1597 unsigned long count; 1598 struct hstate *h; 1599 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1600 1601 err = kstrtoul(buf, 10, &count); 1602 if (err) 1603 goto out; 1604 1605 h = kobj_to_hstate(kobj, &nid); 1606 if (h->order >= MAX_ORDER) { 1607 err = -EINVAL; 1608 goto out; 1609 } 1610 1611 if (nid == NUMA_NO_NODE) { 1612 /* 1613 * global hstate attribute 1614 */ 1615 if (!(obey_mempolicy && 1616 init_nodemask_of_mempolicy(nodes_allowed))) { 1617 NODEMASK_FREE(nodes_allowed); 1618 nodes_allowed = &node_states[N_MEMORY]; 1619 } 1620 } else if (nodes_allowed) { 1621 /* 1622 * per node hstate attribute: adjust count to global, 1623 * but restrict alloc/free to the specified node. 
1624 */ 1625 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1626 init_nodemask_of_node(nodes_allowed, nid); 1627 } else 1628 nodes_allowed = &node_states[N_MEMORY]; 1629 1630 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1631 1632 if (nodes_allowed != &node_states[N_MEMORY]) 1633 NODEMASK_FREE(nodes_allowed); 1634 1635 return len; 1636out: 1637 NODEMASK_FREE(nodes_allowed); 1638 return err; 1639} 1640 1641static ssize_t nr_hugepages_show(struct kobject *kobj, 1642 struct kobj_attribute *attr, char *buf) 1643{ 1644 return nr_hugepages_show_common(kobj, attr, buf); 1645} 1646 1647static ssize_t nr_hugepages_store(struct kobject *kobj, 1648 struct kobj_attribute *attr, const char *buf, size_t len) 1649{ 1650 return nr_hugepages_store_common(false, kobj, attr, buf, len); 1651} 1652HSTATE_ATTR(nr_hugepages); 1653 1654#ifdef CONFIG_NUMA 1655 1656/* 1657 * hstate attribute for optionally mempolicy-based constraint on persistent 1658 * huge page alloc/free. 1659 */ 1660static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 1661 struct kobj_attribute *attr, char *buf) 1662{ 1663 return nr_hugepages_show_common(kobj, attr, buf); 1664} 1665 1666static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 1667 struct kobj_attribute *attr, const char *buf, size_t len) 1668{ 1669 return nr_hugepages_store_common(true, kobj, attr, buf, len); 1670} 1671HSTATE_ATTR(nr_hugepages_mempolicy); 1672#endif 1673 1674 1675static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1676 struct kobj_attribute *attr, char *buf) 1677{ 1678 struct hstate *h = kobj_to_hstate(kobj, NULL); 1679 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1680} 1681 1682static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1683 struct kobj_attribute *attr, const char *buf, size_t count) 1684{ 1685 int err; 1686 unsigned long input; 1687 struct hstate *h = kobj_to_hstate(kobj, NULL); 1688 1689 if (h->order >= MAX_ORDER) 1690 return -EINVAL; 1691 1692 err = kstrtoul(buf, 10, &input); 1693 if (err) 1694 return err; 1695 1696 spin_lock(&hugetlb_lock); 1697 h->nr_overcommit_huge_pages = input; 1698 spin_unlock(&hugetlb_lock); 1699 1700 return count; 1701} 1702HSTATE_ATTR(nr_overcommit_hugepages); 1703 1704static ssize_t free_hugepages_show(struct kobject *kobj, 1705 struct kobj_attribute *attr, char *buf) 1706{ 1707 struct hstate *h; 1708 unsigned long free_huge_pages; 1709 int nid; 1710 1711 h = kobj_to_hstate(kobj, &nid); 1712 if (nid == NUMA_NO_NODE) 1713 free_huge_pages = h->free_huge_pages; 1714 else 1715 free_huge_pages = h->free_huge_pages_node[nid]; 1716 1717 return sprintf(buf, "%lu\n", free_huge_pages); 1718} 1719HSTATE_ATTR_RO(free_hugepages); 1720 1721static ssize_t resv_hugepages_show(struct kobject *kobj, 1722 struct kobj_attribute *attr, char *buf) 1723{ 1724 struct hstate *h = kobj_to_hstate(kobj, NULL); 1725 return sprintf(buf, "%lu\n", h->resv_huge_pages); 1726} 1727HSTATE_ATTR_RO(resv_hugepages); 1728 1729static ssize_t surplus_hugepages_show(struct kobject *kobj, 1730 struct kobj_attribute *attr, char *buf) 1731{ 1732 struct hstate *h; 1733 unsigned long surplus_huge_pages; 1734 int nid; 1735 1736 h = kobj_to_hstate(kobj, &nid); 1737 if (nid == NUMA_NO_NODE) 1738 surplus_huge_pages = h->surplus_huge_pages; 1739 else 1740 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 1741 1742 return sprintf(buf, "%lu\n", surplus_huge_pages); 1743} 1744HSTATE_ATTR_RO(surplus_hugepages); 1745 1746static struct attribute *hstate_attrs[] = { 1747 
&nr_hugepages_attr.attr, 1748 &nr_overcommit_hugepages_attr.attr, 1749 &free_hugepages_attr.attr, 1750 &resv_hugepages_attr.attr, 1751 &surplus_hugepages_attr.attr, 1752#ifdef CONFIG_NUMA 1753 &nr_hugepages_mempolicy_attr.attr, 1754#endif 1755 NULL, 1756}; 1757 1758static struct attribute_group hstate_attr_group = { 1759 .attrs = hstate_attrs, 1760}; 1761 1762static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 1763 struct kobject **hstate_kobjs, 1764 struct attribute_group *hstate_attr_group) 1765{ 1766 int retval; 1767 int hi = hstate_index(h); 1768 1769 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1770 if (!hstate_kobjs[hi]) 1771 return -ENOMEM; 1772 1773 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 1774 if (retval) 1775 kobject_put(hstate_kobjs[hi]); 1776 1777 return retval; 1778} 1779 1780static void __init hugetlb_sysfs_init(void) 1781{ 1782 struct hstate *h; 1783 int err; 1784 1785 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 1786 if (!hugepages_kobj) 1787 return; 1788 1789 for_each_hstate(h) { 1790 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 1791 hstate_kobjs, &hstate_attr_group); 1792 if (err) 1793 pr_err("Hugetlb: Unable to add hstate %s", h->name); 1794 } 1795} 1796 1797#ifdef CONFIG_NUMA 1798 1799/* 1800 * node_hstate/s - associate per node hstate attributes, via their kobjects, 1801 * with node devices in node_devices[] using a parallel array. The array 1802 * index of a node device or _hstate == node id. 1803 * This is here to avoid any static dependency of the node device driver, in 1804 * the base kernel, on the hugetlb module. 1805 */ 1806struct node_hstate { 1807 struct kobject *hugepages_kobj; 1808 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1809}; 1810struct node_hstate node_hstates[MAX_NUMNODES]; 1811 1812/* 1813 * A subset of global hstate attributes for node devices 1814 */ 1815static struct attribute *per_node_hstate_attrs[] = { 1816 &nr_hugepages_attr.attr, 1817 &free_hugepages_attr.attr, 1818 &surplus_hugepages_attr.attr, 1819 NULL, 1820}; 1821 1822static struct attribute_group per_node_hstate_attr_group = { 1823 .attrs = per_node_hstate_attrs, 1824}; 1825 1826/* 1827 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 1828 * Returns node id via non-NULL nidp. 1829 */ 1830static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 1831{ 1832 int nid; 1833 1834 for (nid = 0; nid < nr_node_ids; nid++) { 1835 struct node_hstate *nhs = &node_hstates[nid]; 1836 int i; 1837 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1838 if (nhs->hstate_kobjs[i] == kobj) { 1839 if (nidp) 1840 *nidp = nid; 1841 return &hstates[i]; 1842 } 1843 } 1844 1845 BUG(); 1846 return NULL; 1847} 1848 1849/* 1850 * Unregister hstate attributes from a single node device. 1851 * No-op if no hstate attributes attached. 1852 */ 1853static void hugetlb_unregister_node(struct node *node) 1854{ 1855 struct hstate *h; 1856 struct node_hstate *nhs = &node_hstates[node->dev.id]; 1857 1858 if (!nhs->hugepages_kobj) 1859 return; /* no hstate attributes */ 1860 1861 for_each_hstate(h) { 1862 int idx = hstate_index(h); 1863 if (nhs->hstate_kobjs[idx]) { 1864 kobject_put(nhs->hstate_kobjs[idx]); 1865 nhs->hstate_kobjs[idx] = NULL; 1866 } 1867 } 1868 1869 kobject_put(nhs->hugepages_kobj); 1870 nhs->hugepages_kobj = NULL; 1871} 1872 1873/* 1874 * hugetlb module exit: unregister hstate attributes from node devices 1875 * that have them. 
1876 */ 1877static void hugetlb_unregister_all_nodes(void) 1878{ 1879 int nid; 1880 1881 /* 1882 * disable node device registrations. 1883 */ 1884 register_hugetlbfs_with_node(NULL, NULL); 1885 1886 /* 1887 * remove hstate attributes from any nodes that have them. 1888 */ 1889 for (nid = 0; nid < nr_node_ids; nid++) 1890 hugetlb_unregister_node(node_devices[nid]); 1891} 1892 1893/* 1894 * Register hstate attributes for a single node device. 1895 * No-op if attributes already registered. 1896 */ 1897static void hugetlb_register_node(struct node *node) 1898{ 1899 struct hstate *h; 1900 struct node_hstate *nhs = &node_hstates[node->dev.id]; 1901 int err; 1902 1903 if (nhs->hugepages_kobj) 1904 return; /* already allocated */ 1905 1906 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 1907 &node->dev.kobj); 1908 if (!nhs->hugepages_kobj) 1909 return; 1910 1911 for_each_hstate(h) { 1912 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 1913 nhs->hstate_kobjs, 1914 &per_node_hstate_attr_group); 1915 if (err) { 1916 pr_err("Hugetlb: Unable to add hstate %s for node %d\n", 1917 h->name, node->dev.id); 1918 hugetlb_unregister_node(node); 1919 break; 1920 } 1921 } 1922} 1923 1924/* 1925 * hugetlb init time: register hstate attributes for all registered node 1926 * devices of nodes that have memory. All on-line nodes should have 1927 * registered their associated device by this time. 1928 */ 1929static void hugetlb_register_all_nodes(void) 1930{ 1931 int nid; 1932 1933 for_each_node_state(nid, N_MEMORY) { 1934 struct node *node = node_devices[nid]; 1935 if (node->dev.id == nid) 1936 hugetlb_register_node(node); 1937 } 1938 1939 /* 1940 * Let the node device driver know we're here so it can 1941 * [un]register hstate attributes on node hotplug. 1942 */ 1943 register_hugetlbfs_with_node(hugetlb_register_node, 1944 hugetlb_unregister_node); 1945} 1946#else /* !CONFIG_NUMA */ 1947 1948static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 1949{ 1950 BUG(); 1951 if (nidp) 1952 *nidp = -1; 1953 return NULL; 1954} 1955 1956static void hugetlb_unregister_all_nodes(void) { } 1957 1958static void hugetlb_register_all_nodes(void) { } 1959 1960#endif 1961 1962static void __exit hugetlb_exit(void) 1963{ 1964 struct hstate *h; 1965 1966 hugetlb_unregister_all_nodes(); 1967 1968 for_each_hstate(h) { 1969 kobject_put(hstate_kobjs[hstate_index(h)]); 1970 } 1971 1972 kobject_put(hugepages_kobj); 1973 kfree(htlb_fault_mutex_table); 1974} 1975module_exit(hugetlb_exit); 1976 1977static int __init hugetlb_init(void) 1978{ 1979 int i; 1980 1981 /* Some platform decide whether they support huge pages at boot 1982 * time. 
On these, such as powerpc, HPAGE_SHIFT is set to 0 when 1983 * there is no such support 1984 */ 1985 if (HPAGE_SHIFT == 0) 1986 return 0; 1987 1988 if (!size_to_hstate(default_hstate_size)) { 1989 default_hstate_size = HPAGE_SIZE; 1990 if (!size_to_hstate(default_hstate_size)) 1991 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1992 } 1993 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); 1994 if (default_hstate_max_huge_pages) 1995 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1996 1997 hugetlb_init_hstates(); 1998 gather_bootmem_prealloc(); 1999 report_hugepages(); 2000 2001 hugetlb_sysfs_init(); 2002 hugetlb_register_all_nodes(); 2003 hugetlb_cgroup_file_init(); 2004 2005#ifdef CONFIG_SMP 2006 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 2007#else 2008 num_fault_mutexes = 1; 2009#endif 2010 htlb_fault_mutex_table = 2011 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); 2012 BUG_ON(!htlb_fault_mutex_table); 2013 2014 for (i = 0; i < num_fault_mutexes; i++) 2015 mutex_init(&htlb_fault_mutex_table[i]); 2016 return 0; 2017} 2018module_init(hugetlb_init); 2019 2020/* Should be called on processing a hugepagesz=... option */ 2021void __init hugetlb_add_hstate(unsigned order) 2022{ 2023 struct hstate *h; 2024 unsigned long i; 2025 2026 if (size_to_hstate(PAGE_SIZE << order)) { 2027 pr_warning("hugepagesz= specified twice, ignoring\n"); 2028 return; 2029 } 2030 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 2031 BUG_ON(order == 0); 2032 h = &hstates[hugetlb_max_hstate++]; 2033 h->order = order; 2034 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 2035 h->nr_huge_pages = 0; 2036 h->free_huge_pages = 0; 2037 for (i = 0; i < MAX_NUMNODES; ++i) 2038 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 2039 INIT_LIST_HEAD(&h->hugepage_activelist); 2040 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); 2041 h->next_nid_to_free = first_node(node_states[N_MEMORY]); 2042 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 2043 huge_page_size(h)/1024); 2044 2045 parsed_hstate = h; 2046} 2047 2048static int __init hugetlb_nrpages_setup(char *s) 2049{ 2050 unsigned long *mhp; 2051 static unsigned long *last_mhp; 2052 2053 /* 2054 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, 2055 * so this hugepages= parameter goes to the "default hstate". 2056 */ 2057 if (!hugetlb_max_hstate) 2058 mhp = &default_hstate_max_huge_pages; 2059 else 2060 mhp = &parsed_hstate->max_huge_pages; 2061 2062 if (mhp == last_mhp) { 2063 pr_warning("hugepages= specified twice without " 2064 "interleaving hugepagesz=, ignoring\n"); 2065 return 1; 2066 } 2067 2068 if (sscanf(s, "%lu", mhp) <= 0) 2069 *mhp = 0; 2070 2071 /* 2072 * Global state is always initialized later in hugetlb_init. 2073 * But we need to allocate >= MAX_ORDER hstates here early to still 2074 * use the bootmem allocator. 
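 * For example, booting with "hugepagesz=1G hugepages=2" on x86_64 selects
 * an hstate whose order is >= MAX_ORDER, so its pages are carved out of
 * bootmem right here at parse time.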
2075 */ 2076 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 2077 hugetlb_hstate_alloc_pages(parsed_hstate); 2078 2079 last_mhp = mhp; 2080 2081 return 1; 2082} 2083__setup("hugepages=", hugetlb_nrpages_setup); 2084 2085static int __init hugetlb_default_setup(char *s) 2086{ 2087 default_hstate_size = memparse(s, &s); 2088 return 1; 2089} 2090__setup("default_hugepagesz=", hugetlb_default_setup); 2091 2092static unsigned int cpuset_mems_nr(unsigned int *array) 2093{ 2094 int node; 2095 unsigned int nr = 0; 2096 2097 for_each_node_mask(node, cpuset_current_mems_allowed) 2098 nr += array[node]; 2099 2100 return nr; 2101} 2102 2103#ifdef CONFIG_SYSCTL 2104static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 2105 struct ctl_table *table, int write, 2106 void __user *buffer, size_t *length, loff_t *ppos) 2107{ 2108 struct hstate *h = &default_hstate; 2109 unsigned long tmp; 2110 int ret; 2111 2112 tmp = h->max_huge_pages; 2113 2114 if (write && h->order >= MAX_ORDER) 2115 return -EINVAL; 2116 2117 table->data = &tmp; 2118 table->maxlen = sizeof(unsigned long); 2119 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2120 if (ret) 2121 goto out; 2122 2123 if (write) { 2124 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 2125 GFP_KERNEL | __GFP_NORETRY); 2126 if (!(obey_mempolicy && 2127 init_nodemask_of_mempolicy(nodes_allowed))) { 2128 NODEMASK_FREE(nodes_allowed); 2129 nodes_allowed = &node_states[N_MEMORY]; 2130 } 2131 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 2132 2133 if (nodes_allowed != &node_states[N_MEMORY]) 2134 NODEMASK_FREE(nodes_allowed); 2135 } 2136out: 2137 return ret; 2138} 2139 2140int hugetlb_sysctl_handler(struct ctl_table *table, int write, 2141 void __user *buffer, size_t *length, loff_t *ppos) 2142{ 2143 2144 return hugetlb_sysctl_handler_common(false, table, write, 2145 buffer, length, ppos); 2146} 2147 2148#ifdef CONFIG_NUMA 2149int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 2150 void __user *buffer, size_t *length, loff_t *ppos) 2151{ 2152 return hugetlb_sysctl_handler_common(true, table, write, 2153 buffer, length, ppos); 2154} 2155#endif /* CONFIG_NUMA */ 2156 2157int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2158 void __user *buffer, 2159 size_t *length, loff_t *ppos) 2160{ 2161 struct hstate *h = &default_hstate; 2162 unsigned long tmp; 2163 int ret; 2164 2165 tmp = h->nr_overcommit_huge_pages; 2166 2167 if (write && h->order >= MAX_ORDER) 2168 return -EINVAL; 2169 2170 table->data = &tmp; 2171 table->maxlen = sizeof(unsigned long); 2172 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2173 if (ret) 2174 goto out; 2175 2176 if (write) { 2177 spin_lock(&hugetlb_lock); 2178 h->nr_overcommit_huge_pages = tmp; 2179 spin_unlock(&hugetlb_lock); 2180 } 2181out: 2182 return ret; 2183} 2184 2185#endif /* CONFIG_SYSCTL */ 2186 2187void hugetlb_report_meminfo(struct seq_file *m) 2188{ 2189 struct hstate *h = &default_hstate; 2190 seq_printf(m, 2191 "HugePages_Total: %5lu\n" 2192 "HugePages_Free: %5lu\n" 2193 "HugePages_Rsvd: %5lu\n" 2194 "HugePages_Surp: %5lu\n" 2195 "Hugepagesize: %8lu kB\n", 2196 h->nr_huge_pages, 2197 h->free_huge_pages, 2198 h->resv_huge_pages, 2199 h->surplus_huge_pages, 2200 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2201} 2202 2203int hugetlb_report_node_meminfo(int nid, char *buf) 2204{ 2205 struct hstate *h = &default_hstate; 2206 return sprintf(buf, 2207 "Node %d HugePages_Total: %5u\n" 2208 "Node %d HugePages_Free: %5u\n" 2209 "Node 
%d HugePages_Surp: %5u\n", 2210 nid, h->nr_huge_pages_node[nid], 2211 nid, h->free_huge_pages_node[nid], 2212 nid, h->surplus_huge_pages_node[nid]); 2213} 2214 2215void hugetlb_show_meminfo(void) 2216{ 2217 struct hstate *h; 2218 int nid; 2219 2220 for_each_node_state(nid, N_MEMORY) 2221 for_each_hstate(h) 2222 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 2223 nid, 2224 h->nr_huge_pages_node[nid], 2225 h->free_huge_pages_node[nid], 2226 h->surplus_huge_pages_node[nid], 2227 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2228} 2229 2230/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2231unsigned long hugetlb_total_pages(void) 2232{ 2233 struct hstate *h; 2234 unsigned long nr_total_pages = 0; 2235 2236 for_each_hstate(h) 2237 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 2238 return nr_total_pages; 2239} 2240 2241static int hugetlb_acct_memory(struct hstate *h, long delta) 2242{ 2243 int ret = -ENOMEM; 2244 2245 spin_lock(&hugetlb_lock); 2246 /* 2247 * When cpuset is configured, it breaks the strict hugetlb page 2248 * reservation as the accounting is done on a global variable. Such 2249 * reservation is completely rubbish in the presence of cpuset because 2250 * the reservation is not checked against page availability for the 2251 * current cpuset. Application can still potentially OOM'ed by kernel 2252 * with lack of free htlb page in cpuset that the task is in. 2253 * Attempt to enforce strict accounting with cpuset is almost 2254 * impossible (or too ugly) because cpuset is too fluid that 2255 * task or memory node can be dynamically moved between cpusets. 2256 * 2257 * The change of semantics for shared hugetlb mapping with cpuset is 2258 * undesirable. However, in order to preserve some of the semantics, 2259 * we fall back to check against current free page availability as 2260 * a best attempt and hopefully to minimize the impact of changing 2261 * semantics that cpuset has. 2262 */ 2263 if (delta > 0) { 2264 if (gather_surplus_pages(h, delta) < 0) 2265 goto out; 2266 2267 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 2268 return_unused_surplus_pages(h, delta); 2269 goto out; 2270 } 2271 } 2272 2273 ret = 0; 2274 if (delta < 0) 2275 return_unused_surplus_pages(h, (unsigned long) -delta); 2276 2277out: 2278 spin_unlock(&hugetlb_lock); 2279 return ret; 2280} 2281 2282static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2283{ 2284 struct resv_map *resv = vma_resv_map(vma); 2285 2286 /* 2287 * This new VMA should share its siblings reservation map if present. 2288 * The VMA will only ever have a valid reservation map pointer where 2289 * it is being copied for another still existing VMA. As that VMA 2290 * has a reference to the reservation map it cannot disappear until 2291 * after this open call completes. It is therefore safe to take a 2292 * new reference here without additional locking. 
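 * (For example, __split_vma() and copy_vma() duplicate an existing VMA and
 * then call its ->open() method while the original VMA still holds its
 * reference on the resv_map.)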
2293 */ 2294 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2295 kref_get(&resv->refs); 2296} 2297 2298static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2299{ 2300 struct hstate *h = hstate_vma(vma); 2301 struct resv_map *resv = vma_resv_map(vma); 2302 struct hugepage_subpool *spool = subpool_vma(vma); 2303 unsigned long reserve, start, end; 2304 2305 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2306 return; 2307 2308 start = vma_hugecache_offset(h, vma, vma->vm_start); 2309 end = vma_hugecache_offset(h, vma, vma->vm_end); 2310 2311 reserve = (end - start) - region_count(resv, start, end); 2312 2313 kref_put(&resv->refs, resv_map_release); 2314 2315 if (reserve) { 2316 hugetlb_acct_memory(h, -reserve); 2317 hugepage_subpool_put_pages(spool, reserve); 2318 } 2319} 2320 2321/* 2322 * We cannot handle pagefaults against hugetlb pages at all. They cause 2323 * handle_mm_fault() to try to instantiate regular-sized pages in the 2324 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 2325 * this far. 2326 */ 2327static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2328{ 2329 BUG(); 2330 return 0; 2331} 2332 2333const struct vm_operations_struct hugetlb_vm_ops = { 2334 .fault = hugetlb_vm_op_fault, 2335 .open = hugetlb_vm_op_open, 2336 .close = hugetlb_vm_op_close, 2337}; 2338 2339static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 2340 int writable) 2341{ 2342 pte_t entry; 2343 2344 if (writable) { 2345 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 2346 vma->vm_page_prot))); 2347 } else { 2348 entry = huge_pte_wrprotect(mk_huge_pte(page, 2349 vma->vm_page_prot)); 2350 } 2351 entry = pte_mkyoung(entry); 2352 entry = pte_mkhuge(entry); 2353 entry = arch_make_huge_pte(entry, vma, page, writable); 2354 2355 return entry; 2356} 2357 2358static void set_huge_ptep_writable(struct vm_area_struct *vma, 2359 unsigned long address, pte_t *ptep) 2360{ 2361 pte_t entry; 2362 2363 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 2364 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 2365 update_mmu_cache(vma, address, ptep); 2366} 2367 2368 2369int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 2370 struct vm_area_struct *vma) 2371{ 2372 pte_t *src_pte, *dst_pte, entry; 2373 struct page *ptepage; 2374 unsigned long addr; 2375 int cow; 2376 struct hstate *h = hstate_vma(vma); 2377 unsigned long sz = huge_page_size(h); 2378 unsigned long mmun_start; /* For mmu_notifiers */ 2379 unsigned long mmun_end; /* For mmu_notifiers */ 2380 int ret = 0; 2381 2382 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2383 2384 mmun_start = vma->vm_start; 2385 mmun_end = vma->vm_end; 2386 if (cow) 2387 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 2388 2389 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2390 spinlock_t *src_ptl, *dst_ptl; 2391 src_pte = huge_pte_offset(src, addr); 2392 if (!src_pte) 2393 continue; 2394 dst_pte = huge_pte_alloc(dst, addr, sz); 2395 if (!dst_pte) { 2396 ret = -ENOMEM; 2397 break; 2398 } 2399 2400 /* If the pagetables are shared don't copy or take references */ 2401 if (dst_pte == src_pte) 2402 continue; 2403 2404 dst_ptl = huge_pte_lock(h, dst, dst_pte); 2405 src_ptl = huge_pte_lockptr(h, src, src_pte); 2406 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 2407 if (!huge_pte_none(huge_ptep_get(src_pte))) { 2408 if (cow) 2409 huge_ptep_set_wrprotect(src, addr, src_pte); 2410 entry = 
huge_ptep_get(src_pte); 2411 ptepage = pte_page(entry); 2412 get_page(ptepage); 2413 page_dup_rmap(ptepage); 2414 set_huge_pte_at(dst, addr, dst_pte, entry); 2415 } 2416 spin_unlock(src_ptl); 2417 spin_unlock(dst_ptl); 2418 } 2419 2420 if (cow) 2421 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 2422 2423 return ret; 2424} 2425 2426static int is_hugetlb_entry_migration(pte_t pte) 2427{ 2428 swp_entry_t swp; 2429 2430 if (huge_pte_none(pte) || pte_present(pte)) 2431 return 0; 2432 swp = pte_to_swp_entry(pte); 2433 if (non_swap_entry(swp) && is_migration_entry(swp)) 2434 return 1; 2435 else 2436 return 0; 2437} 2438 2439static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2440{ 2441 swp_entry_t swp; 2442 2443 if (huge_pte_none(pte) || pte_present(pte)) 2444 return 0; 2445 swp = pte_to_swp_entry(pte); 2446 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) 2447 return 1; 2448 else 2449 return 0; 2450} 2451 2452void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 2453 unsigned long start, unsigned long end, 2454 struct page *ref_page) 2455{ 2456 int force_flush = 0; 2457 struct mm_struct *mm = vma->vm_mm; 2458 unsigned long address; 2459 pte_t *ptep; 2460 pte_t pte; 2461 spinlock_t *ptl; 2462 struct page *page; 2463 struct hstate *h = hstate_vma(vma); 2464 unsigned long sz = huge_page_size(h); 2465 const unsigned long mmun_start = start; /* For mmu_notifiers */ 2466 const unsigned long mmun_end = end; /* For mmu_notifiers */ 2467 2468 WARN_ON(!is_vm_hugetlb_page(vma)); 2469 BUG_ON(start & ~huge_page_mask(h)); 2470 BUG_ON(end & ~huge_page_mask(h)); 2471 2472 tlb_start_vma(tlb, vma); 2473 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2474again: 2475 for (address = start; address < end; address += sz) { 2476 ptep = huge_pte_offset(mm, address); 2477 if (!ptep) 2478 continue; 2479 2480 ptl = huge_pte_lock(h, mm, ptep); 2481 if (huge_pmd_unshare(mm, &address, ptep)) 2482 goto unlock; 2483 2484 pte = huge_ptep_get(ptep); 2485 if (huge_pte_none(pte)) 2486 goto unlock; 2487 2488 /* 2489 * HWPoisoned hugepage is already unmapped and dropped reference 2490 */ 2491 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 2492 huge_pte_clear(mm, address, ptep); 2493 goto unlock; 2494 } 2495 2496 page = pte_page(pte); 2497 /* 2498 * If a reference page is supplied, it is because a specific 2499 * page is being unmapped, not a range. Ensure the page we 2500 * are about to unmap is the actual page of interest. 2501 */ 2502 if (ref_page) { 2503 if (page != ref_page) 2504 goto unlock; 2505 2506 /* 2507 * Mark the VMA as having unmapped its page so that 2508 * future faults in this VMA will fail rather than 2509 * looking like data was lost 2510 */ 2511 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 2512 } 2513 2514 pte = huge_ptep_get_and_clear(mm, address, ptep); 2515 tlb_remove_tlb_entry(tlb, ptep, address); 2516 if (huge_pte_dirty(pte)) 2517 set_page_dirty(page); 2518 2519 page_remove_rmap(page); 2520 force_flush = !__tlb_remove_page(tlb, page); 2521 if (force_flush) { 2522 spin_unlock(ptl); 2523 break; 2524 } 2525 /* Bail out after unmapping reference page if supplied */ 2526 if (ref_page) { 2527 spin_unlock(ptl); 2528 break; 2529 } 2530unlock: 2531 spin_unlock(ptl); 2532 } 2533 /* 2534 * mmu_gather ran out of room to batch pages, we break out of 2535 * the PTE lock to avoid doing the potential expensive TLB invalidate 2536 * and page-free while holding it. 
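 * tlb_flush_mmu() below drains the pending batch; the walk then restarts
 * at the current address unless only a single ref_page was being unmapped.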
2537 */ 2538 if (force_flush) { 2539 force_flush = 0; 2540 tlb_flush_mmu(tlb); 2541 if (address < end && !ref_page) 2542 goto again; 2543 } 2544 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2545 tlb_end_vma(tlb, vma); 2546} 2547 2548void __unmap_hugepage_range_final(struct mmu_gather *tlb, 2549 struct vm_area_struct *vma, unsigned long start, 2550 unsigned long end, struct page *ref_page) 2551{ 2552 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 2553 2554 /* 2555 * Clear this flag so that x86's huge_pmd_share page_table_shareable 2556 * test will fail on a vma being torn down, and not grab a page table 2557 * on its way out. We're lucky that the flag has such an appropriate 2558 * name, and can in fact be safely cleared here. We could clear it 2559 * before the __unmap_hugepage_range above, but all that's necessary 2560 * is to clear it before releasing the i_mmap_mutex. This works 2561 * because in the context this is called, the VMA is about to be 2562 * destroyed and the i_mmap_mutex is held. 2563 */ 2564 vma->vm_flags &= ~VM_MAYSHARE; 2565} 2566 2567void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2568 unsigned long end, struct page *ref_page) 2569{ 2570 struct mm_struct *mm; 2571 struct mmu_gather tlb; 2572 2573 mm = vma->vm_mm; 2574 2575 tlb_gather_mmu(&tlb, mm, start, end); 2576 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 2577 tlb_finish_mmu(&tlb, start, end); 2578} 2579 2580/* 2581 * This is called when the original mapper is failing to COW a MAP_PRIVATE 2582 * mappping it owns the reserve page for. The intention is to unmap the page 2583 * from other VMAs and let the children be SIGKILLed if they are faulting the 2584 * same region. 2585 */ 2586static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 2587 struct page *page, unsigned long address) 2588{ 2589 struct hstate *h = hstate_vma(vma); 2590 struct vm_area_struct *iter_vma; 2591 struct address_space *mapping; 2592 pgoff_t pgoff; 2593 2594 /* 2595 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 2596 * from page cache lookup which is in HPAGE_SIZE units. 2597 */ 2598 address = address & huge_page_mask(h); 2599 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 2600 vma->vm_pgoff; 2601 mapping = file_inode(vma->vm_file)->i_mapping; 2602 2603 /* 2604 * Take the mapping lock for the duration of the table walk. As 2605 * this mapping should be shared between all the VMAs, 2606 * __unmap_hugepage_range() is called as the lock is already held 2607 */ 2608 mutex_lock(&mapping->i_mmap_mutex); 2609 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 2610 /* Do not unmap the current VMA */ 2611 if (iter_vma == vma) 2612 continue; 2613 2614 /* 2615 * Unmap the page from other VMAs without their own reserves. 2616 * They get marked to be SIGKILLed if they fault in these 2617 * areas. This is because a future no-page fault on this VMA 2618 * could insert a zeroed page instead of the data existing 2619 * from the time of fork. This would look like data corruption 2620 */ 2621 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2622 unmap_hugepage_range(iter_vma, address, 2623 address + huge_page_size(h), page); 2624 } 2625 mutex_unlock(&mapping->i_mmap_mutex); 2626 2627 return 1; 2628} 2629 2630/* 2631 * Hugetlb_cow() should be called with page lock of the original hugepage held. 2632 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 2633 * cannot race with other handlers or page migration. 
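 * (The per-page serialization is provided by the htlb_fault_mutex_table
 * entry taken in hugetlb_fault().)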
2634 * Keep the pte_same checks anyway to make transition from the mutex easier. 2635 */ 2636static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2637 unsigned long address, pte_t *ptep, pte_t pte, 2638 struct page *pagecache_page, spinlock_t *ptl) 2639{ 2640 struct hstate *h = hstate_vma(vma); 2641 struct page *old_page, *new_page; 2642 int outside_reserve = 0; 2643 unsigned long mmun_start; /* For mmu_notifiers */ 2644 unsigned long mmun_end; /* For mmu_notifiers */ 2645 2646 old_page = pte_page(pte); 2647 2648retry_avoidcopy: 2649 /* If no-one else is actually using this page, avoid the copy 2650 * and just make the page writable */ 2651 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 2652 page_move_anon_rmap(old_page, vma, address); 2653 set_huge_ptep_writable(vma, address, ptep); 2654 return 0; 2655 } 2656 2657 /* 2658 * If the process that created a MAP_PRIVATE mapping is about to 2659 * perform a COW due to a shared page count, attempt to satisfy 2660 * the allocation without using the existing reserves. The pagecache 2661 * page is used to determine if the reserve at this address was 2662 * consumed or not. If reserves were used, a partial faulted mapping 2663 * at the time of fork() could consume its reserves on COW instead 2664 * of the full address range. 2665 */ 2666 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 2667 old_page != pagecache_page) 2668 outside_reserve = 1; 2669 2670 page_cache_get(old_page); 2671 2672 /* Drop page table lock as buddy allocator may be called */ 2673 spin_unlock(ptl); 2674 new_page = alloc_huge_page(vma, address, outside_reserve); 2675 2676 if (IS_ERR(new_page)) { 2677 long err = PTR_ERR(new_page); 2678 page_cache_release(old_page); 2679 2680 /* 2681 * If a process owning a MAP_PRIVATE mapping fails to COW, 2682 * it is due to references held by a child and an insufficient 2683 * huge page pool. To guarantee the original mappers 2684 * reliability, unmap the page from child processes. The child 2685 * may get SIGKILLed if it later faults. 2686 */ 2687 if (outside_reserve) { 2688 BUG_ON(huge_pte_none(pte)); 2689 if (unmap_ref_private(mm, vma, old_page, address)) { 2690 BUG_ON(huge_pte_none(pte)); 2691 spin_lock(ptl); 2692 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2693 if (likely(pte_same(huge_ptep_get(ptep), pte))) 2694 goto retry_avoidcopy; 2695 /* 2696 * race occurs while re-acquiring page table 2697 * lock, and our job is done. 2698 */ 2699 return 0; 2700 } 2701 WARN_ON_ONCE(1); 2702 } 2703 2704 /* Caller expects lock to be held */ 2705 spin_lock(ptl); 2706 if (err == -ENOMEM) 2707 return VM_FAULT_OOM; 2708 else 2709 return VM_FAULT_SIGBUS; 2710 } 2711 2712 /* 2713 * When the original hugepage is shared one, it does not have 2714 * anon_vma prepared. 
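 * (For instance, a MAP_PRIVATE area that has only ever faulted pages in
 * from the hugetlbfs page cache has no anon_vma yet.)  Make sure the VMA
 * has one before the new copy is added to the anon rmap below.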
2715 */ 2716 if (unlikely(anon_vma_prepare(vma))) { 2717 page_cache_release(new_page); 2718 page_cache_release(old_page); 2719 /* Caller expects lock to be held */ 2720 spin_lock(ptl); 2721 return VM_FAULT_OOM; 2722 } 2723 2724 copy_user_huge_page(new_page, old_page, address, vma, 2725 pages_per_huge_page(h)); 2726 __SetPageUptodate(new_page); 2727 2728 mmun_start = address & huge_page_mask(h); 2729 mmun_end = mmun_start + huge_page_size(h); 2730 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2731 /* 2732 * Retake the page table lock to check for racing updates 2733 * before the page tables are altered 2734 */ 2735 spin_lock(ptl); 2736 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2737 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2738 ClearPagePrivate(new_page); 2739 2740 /* Break COW */ 2741 huge_ptep_clear_flush(vma, address, ptep); 2742 set_huge_pte_at(mm, address, ptep, 2743 make_huge_pte(vma, new_page, 1)); 2744 page_remove_rmap(old_page); 2745 hugepage_add_new_anon_rmap(new_page, vma, address); 2746 /* Make the old page be freed below */ 2747 new_page = old_page; 2748 } 2749 spin_unlock(ptl); 2750 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2751 page_cache_release(new_page); 2752 page_cache_release(old_page); 2753 2754 /* Caller expects lock to be held */ 2755 spin_lock(ptl); 2756 return 0; 2757} 2758 2759/* Return the pagecache page at a given address within a VMA */ 2760static struct page *hugetlbfs_pagecache_page(struct hstate *h, 2761 struct vm_area_struct *vma, unsigned long address) 2762{ 2763 struct address_space *mapping; 2764 pgoff_t idx; 2765 2766 mapping = vma->vm_file->f_mapping; 2767 idx = vma_hugecache_offset(h, vma, address); 2768 2769 return find_lock_page(mapping, idx); 2770} 2771 2772/* 2773 * Return whether there is a pagecache page to back given address within VMA. 2774 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 2775 */ 2776static bool hugetlbfs_pagecache_present(struct hstate *h, 2777 struct vm_area_struct *vma, unsigned long address) 2778{ 2779 struct address_space *mapping; 2780 pgoff_t idx; 2781 struct page *page; 2782 2783 mapping = vma->vm_file->f_mapping; 2784 idx = vma_hugecache_offset(h, vma, address); 2785 2786 page = find_get_page(mapping, idx); 2787 if (page) 2788 put_page(page); 2789 return page != NULL; 2790} 2791 2792static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2793 struct address_space *mapping, pgoff_t idx, 2794 unsigned long address, pte_t *ptep, unsigned int flags) 2795{ 2796 struct hstate *h = hstate_vma(vma); 2797 int ret = VM_FAULT_SIGBUS; 2798 int anon_rmap = 0; 2799 unsigned long size; 2800 struct page *page; 2801 pte_t new_pte; 2802 spinlock_t *ptl; 2803 2804 /* 2805 * Currently, we are forced to kill the process in the event the 2806 * original mapper has unmapped pages from the child due to a failed 2807 * COW. Warn that such a situation has occurred as it may not be obvious 2808 */ 2809 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2810 pr_warning("PID %d killed due to inadequate hugepage pool\n", 2811 current->pid); 2812 return ret; 2813 } 2814 2815 /* 2816 * Use page lock to guard against racing truncation 2817 * before we get page_table_lock. 
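 * If add_to_page_cache() below loses the race with another fault and
 * returns -EEXIST, the lookup is simply retried.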
2818 */ 2819retry: 2820 page = find_lock_page(mapping, idx); 2821 if (!page) { 2822 size = i_size_read(mapping->host) >> huge_page_shift(h); 2823 if (idx >= size) 2824 goto out; 2825 page = alloc_huge_page(vma, address, 0); 2826 if (IS_ERR(page)) { 2827 ret = PTR_ERR(page); 2828 if (ret == -ENOMEM) 2829 ret = VM_FAULT_OOM; 2830 else 2831 ret = VM_FAULT_SIGBUS; 2832 goto out; 2833 } 2834 clear_huge_page(page, address, pages_per_huge_page(h)); 2835 __SetPageUptodate(page); 2836 2837 if (vma->vm_flags & VM_MAYSHARE) { 2838 int err; 2839 struct inode *inode = mapping->host; 2840 2841 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 2842 if (err) { 2843 put_page(page); 2844 if (err == -EEXIST) 2845 goto retry; 2846 goto out; 2847 } 2848 ClearPagePrivate(page); 2849 2850 spin_lock(&inode->i_lock); 2851 inode->i_blocks += blocks_per_huge_page(h); 2852 spin_unlock(&inode->i_lock); 2853 } else { 2854 lock_page(page); 2855 if (unlikely(anon_vma_prepare(vma))) { 2856 ret = VM_FAULT_OOM; 2857 goto backout_unlocked; 2858 } 2859 anon_rmap = 1; 2860 } 2861 } else { 2862 /* 2863 * If memory error occurs between mmap() and fault, some process 2864 * don't have hwpoisoned swap entry for errored virtual address. 2865 * So we need to block hugepage fault by PG_hwpoison bit check. 2866 */ 2867 if (unlikely(PageHWPoison(page))) { 2868 ret = VM_FAULT_HWPOISON | 2869 VM_FAULT_SET_HINDEX(hstate_index(h)); 2870 goto backout_unlocked; 2871 } 2872 } 2873 2874 /* 2875 * If we are going to COW a private mapping later, we examine the 2876 * pending reservations for this page now. This will ensure that 2877 * any allocations necessary to record that reservation occur outside 2878 * the spinlock. 2879 */ 2880 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 2881 if (vma_needs_reservation(h, vma, address) < 0) { 2882 ret = VM_FAULT_OOM; 2883 goto backout_unlocked; 2884 } 2885 2886 ptl = huge_pte_lockptr(h, mm, ptep); 2887 spin_lock(ptl); 2888 size = i_size_read(mapping->host) >> huge_page_shift(h); 2889 if (idx >= size) 2890 goto backout; 2891 2892 ret = 0; 2893 if (!huge_pte_none(huge_ptep_get(ptep))) 2894 goto backout; 2895 2896 if (anon_rmap) { 2897 ClearPagePrivate(page); 2898 hugepage_add_new_anon_rmap(page, vma, address); 2899 } 2900 else 2901 page_dup_rmap(page); 2902 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 2903 && (vma->vm_flags & VM_SHARED))); 2904 set_huge_pte_at(mm, address, ptep, new_pte); 2905 2906 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 2907 /* Optimization, do the COW without a second fault */ 2908 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); 2909 } 2910 2911 spin_unlock(ptl); 2912 unlock_page(page); 2913out: 2914 return ret; 2915 2916backout: 2917 spin_unlock(ptl); 2918backout_unlocked: 2919 unlock_page(page); 2920 put_page(page); 2921 goto out; 2922} 2923 2924#ifdef CONFIG_SMP 2925static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 2926 struct vm_area_struct *vma, 2927 struct address_space *mapping, 2928 pgoff_t idx, unsigned long address) 2929{ 2930 unsigned long key[2]; 2931 u32 hash; 2932 2933 if (vma->vm_flags & VM_SHARED) { 2934 key[0] = (unsigned long) mapping; 2935 key[1] = idx; 2936 } else { 2937 key[0] = (unsigned long) mm; 2938 key[1] = address >> huge_page_shift(h); 2939 } 2940 2941 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); 2942 2943 return hash & (num_fault_mutexes - 1); 2944} 2945#else 2946/* 2947 * For uniprocesor systems we always use a single mutex, so just 2948 * return 0 
and avoid the hashing overhead. 2949 */ 2950static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 2951 struct vm_area_struct *vma, 2952 struct address_space *mapping, 2953 pgoff_t idx, unsigned long address) 2954{ 2955 return 0; 2956} 2957#endif 2958 2959int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2960 unsigned long address, unsigned int flags) 2961{ 2962 pte_t *ptep, entry; 2963 spinlock_t *ptl; 2964 int ret; 2965 u32 hash; 2966 pgoff_t idx; 2967 struct page *page = NULL; 2968 struct page *pagecache_page = NULL; 2969 struct hstate *h = hstate_vma(vma); 2970 struct address_space *mapping; 2971 2972 address &= huge_page_mask(h); 2973 2974 ptep = huge_pte_offset(mm, address); 2975 if (ptep) { 2976 entry = huge_ptep_get(ptep); 2977 if (unlikely(is_hugetlb_entry_migration(entry))) { 2978 migration_entry_wait_huge(vma, mm, ptep); 2979 return 0; 2980 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2981 return VM_FAULT_HWPOISON_LARGE | 2982 VM_FAULT_SET_HINDEX(hstate_index(h)); 2983 } 2984 2985 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2986 if (!ptep) 2987 return VM_FAULT_OOM; 2988 2989 mapping = vma->vm_file->f_mapping; 2990 idx = vma_hugecache_offset(h, vma, address); 2991 2992 /* 2993 * Serialize hugepage allocation and instantiation, so that we don't 2994 * get spurious allocation failures if two CPUs race to instantiate 2995 * the same page in the page cache. 2996 */ 2997 hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); 2998 mutex_lock(&htlb_fault_mutex_table[hash]); 2999 3000 entry = huge_ptep_get(ptep); 3001 if (huge_pte_none(entry)) { 3002 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 3003 goto out_mutex; 3004 } 3005 3006 ret = 0; 3007 3008 /* 3009 * If we are going to COW the mapping later, we examine the pending 3010 * reservations for this page now. This will ensure that any 3011 * allocations necessary to record that reservation occur outside the 3012 * spinlock. For private mappings, we also lookup the pagecache 3013 * page now as it is used to determine if a reservation has been 3014 * consumed. 3015 */ 3016 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 3017 if (vma_needs_reservation(h, vma, address) < 0) { 3018 ret = VM_FAULT_OOM; 3019 goto out_mutex; 3020 } 3021 3022 if (!(vma->vm_flags & VM_MAYSHARE)) 3023 pagecache_page = hugetlbfs_pagecache_page(h, 3024 vma, address); 3025 } 3026 3027 /* 3028 * hugetlb_cow() requires page locks of pte_page(entry) and 3029 * pagecache_page, so here we need take the former one 3030 * when page != pagecache_page or !pagecache_page. 3031 * Note that locking order is always pagecache_page -> page, 3032 * so no worry about deadlock. 
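 * (pagecache_page, when present, is already locked: it was returned by
 * hugetlbfs_pagecache_page() via find_lock_page() above.)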
3033 */ 3034 page = pte_page(entry); 3035 get_page(page); 3036 if (page != pagecache_page) 3037 lock_page(page); 3038 3039 ptl = huge_pte_lockptr(h, mm, ptep); 3040 spin_lock(ptl); 3041 /* Check for a racing update before calling hugetlb_cow */ 3042 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 3043 goto out_ptl; 3044 3045 3046 if (flags & FAULT_FLAG_WRITE) { 3047 if (!huge_pte_write(entry)) { 3048 ret = hugetlb_cow(mm, vma, address, ptep, entry, 3049 pagecache_page, ptl); 3050 goto out_ptl; 3051 } 3052 entry = huge_pte_mkdirty(entry); 3053 } 3054 entry = pte_mkyoung(entry); 3055 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 3056 flags & FAULT_FLAG_WRITE)) 3057 update_mmu_cache(vma, address, ptep); 3058 3059out_ptl: 3060 spin_unlock(ptl); 3061 3062 if (pagecache_page) { 3063 unlock_page(pagecache_page); 3064 put_page(pagecache_page); 3065 } 3066 if (page != pagecache_page) 3067 unlock_page(page); 3068 put_page(page); 3069 3070out_mutex: 3071 mutex_unlock(&htlb_fault_mutex_table[hash]); 3072 return ret; 3073} 3074 3075long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 3076 struct page **pages, struct vm_area_struct **vmas, 3077 unsigned long *position, unsigned long *nr_pages, 3078 long i, unsigned int flags) 3079{ 3080 unsigned long pfn_offset; 3081 unsigned long vaddr = *position; 3082 unsigned long remainder = *nr_pages; 3083 struct hstate *h = hstate_vma(vma); 3084 3085 while (vaddr < vma->vm_end && remainder) { 3086 pte_t *pte; 3087 spinlock_t *ptl = NULL; 3088 int absent; 3089 struct page *page; 3090 3091 /* 3092 * Some archs (sparc64, sh*) have multiple pte_ts to 3093 * each hugepage. We have to make sure we get the 3094 * first, for the page indexing below to work. 3095 * 3096 * Note that page table lock is not held when pte is null. 3097 */ 3098 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 3099 if (pte) 3100 ptl = huge_pte_lock(h, mm, pte); 3101 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 3102 3103 /* 3104 * When coredumping, it suits get_dump_page if we just return 3105 * an error where there's an empty slot with no huge pagecache 3106 * to back it. This way, we avoid allocating a hugepage, and 3107 * the sparse dumpfile avoids allocating disk blocks, but its 3108 * huge holes still show up with zeroes where they need to be. 3109 */ 3110 if (absent && (flags & FOLL_DUMP) && 3111 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 3112 if (pte) 3113 spin_unlock(ptl); 3114 remainder = 0; 3115 break; 3116 } 3117 3118 /* 3119 * We need call hugetlb_fault for both hugepages under migration 3120 * (in which case hugetlb_fault waits for the migration,) and 3121 * hwpoisoned hugepages (in which case we need to prevent the 3122 * caller from accessing to them.) In order to do this, we use 3123 * here is_swap_pte instead of is_hugetlb_entry_migration and 3124 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 3125 * both cases, and because we can't follow correct pages 3126 * directly from any kind of swap entries. 3127 */ 3128 if (absent || is_swap_pte(huge_ptep_get(pte)) || 3129 ((flags & FOLL_WRITE) && 3130 !huge_pte_write(huge_ptep_get(pte)))) { 3131 int ret; 3132 3133 if (pte) 3134 spin_unlock(ptl); 3135 ret = hugetlb_fault(mm, vma, vaddr, 3136 (flags & FOLL_WRITE) ? 
FAULT_FLAG_WRITE : 0); 3137 if (!(ret & VM_FAULT_ERROR)) 3138 continue; 3139 3140 remainder = 0; 3141 break; 3142 } 3143 3144 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 3145 page = pte_page(huge_ptep_get(pte)); 3146same_page: 3147 if (pages) { 3148 pages[i] = mem_map_offset(page, pfn_offset); 3149 get_page_foll(pages[i]); 3150 } 3151 3152 if (vmas) 3153 vmas[i] = vma; 3154 3155 vaddr += PAGE_SIZE; 3156 ++pfn_offset; 3157 --remainder; 3158 ++i; 3159 if (vaddr < vma->vm_end && remainder && 3160 pfn_offset < pages_per_huge_page(h)) { 3161 /* 3162 * We use pfn_offset to avoid touching the pageframes 3163 * of this compound page. 3164 */ 3165 goto same_page; 3166 } 3167 spin_unlock(ptl); 3168 } 3169 *nr_pages = remainder; 3170 *position = vaddr; 3171 3172 return i ? i : -EFAULT; 3173} 3174 3175unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 3176 unsigned long address, unsigned long end, pgprot_t newprot) 3177{ 3178 struct mm_struct *mm = vma->vm_mm; 3179 unsigned long start = address; 3180 pte_t *ptep; 3181 pte_t pte; 3182 struct hstate *h = hstate_vma(vma); 3183 unsigned long pages = 0; 3184 3185 BUG_ON(address >= end); 3186 flush_cache_range(vma, address, end); 3187 3188 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 3189 for (; address < end; address += huge_page_size(h)) { 3190 spinlock_t *ptl; 3191 ptep = huge_pte_offset(mm, address); 3192 if (!ptep) 3193 continue; 3194 ptl = huge_pte_lock(h, mm, ptep); 3195 if (huge_pmd_unshare(mm, &address, ptep)) { 3196 pages++; 3197 spin_unlock(ptl); 3198 continue; 3199 } 3200 if (!huge_pte_none(huge_ptep_get(ptep))) { 3201 pte = huge_ptep_get_and_clear(mm, address, ptep); 3202 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 3203 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3204 set_huge_pte_at(mm, address, ptep, pte); 3205 pages++; 3206 } 3207 spin_unlock(ptl); 3208 } 3209 /* 3210 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare 3211 * may have cleared our pud entry and done put_page on the page table: 3212 * once we release i_mmap_mutex, another task can do the final put_page 3213 * and that page table be reused and filled with junk. 3214 */ 3215 flush_tlb_range(vma, start, end); 3216 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3217 3218 return pages << h->order; 3219} 3220 3221int hugetlb_reserve_pages(struct inode *inode, 3222 long from, long to, 3223 struct vm_area_struct *vma, 3224 vm_flags_t vm_flags) 3225{ 3226 long ret, chg; 3227 struct hstate *h = hstate_inode(inode); 3228 struct hugepage_subpool *spool = subpool_inode(inode); 3229 struct resv_map *resv_map; 3230 3231 /* 3232 * Only apply hugepage reservation if asked. At fault time, an 3233 * attempt will be made for VM_NORESERVE to allocate a page 3234 * without using reserves 3235 */ 3236 if (vm_flags & VM_NORESERVE) 3237 return 0; 3238 3239 /* 3240 * Shared mappings base their reservation on the number of pages that 3241 * are already allocated on behalf of the file. Private mappings need 3242 * to reserve the full area even if read-only as mprotect() may be 3243 * called to make the mapping read-write. 
Assume !vma is a shm mapping 3244 */ 3245 if (!vma || vma->vm_flags & VM_MAYSHARE) { 3246 resv_map = inode_resv_map(inode); 3247 3248 chg = region_chg(resv_map, from, to); 3249 3250 } else { 3251 resv_map = resv_map_alloc(); 3252 if (!resv_map) 3253 return -ENOMEM; 3254 3255 chg = to - from; 3256 3257 set_vma_resv_map(vma, resv_map); 3258 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3259 } 3260 3261 if (chg < 0) { 3262 ret = chg; 3263 goto out_err; 3264 } 3265 3266 /* There must be enough pages in the subpool for the mapping */ 3267 if (hugepage_subpool_get_pages(spool, chg)) { 3268 ret = -ENOSPC; 3269 goto out_err; 3270 } 3271 3272 /* 3273 * Check enough hugepages are available for the reservation. 3274 * Hand the pages back to the subpool if there are not 3275 */ 3276 ret = hugetlb_acct_memory(h, chg); 3277 if (ret < 0) { 3278 hugepage_subpool_put_pages(spool, chg); 3279 goto out_err; 3280 } 3281 3282 /* 3283 * Account for the reservations made. Shared mappings record regions 3284 * that have reservations as they are shared by multiple VMAs. 3285 * When the last VMA disappears, the region map says how much 3286 * the reservation was and the page cache tells how much of 3287 * the reservation was consumed. Private mappings are per-VMA and 3288 * only the consumed reservations are tracked. When the VMA 3289 * disappears, the original reservation is the VMA size and the 3290 * consumed reservations are stored in the map. Hence, nothing 3291 * else has to be done for private mappings here 3292 */ 3293 if (!vma || vma->vm_flags & VM_MAYSHARE) 3294 region_add(resv_map, from, to); 3295 return 0; 3296out_err: 3297 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3298 kref_put(&resv_map->refs, resv_map_release); 3299 return ret; 3300} 3301 3302void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3303{ 3304 struct hstate *h = hstate_inode(inode); 3305 struct resv_map *resv_map = inode_resv_map(inode); 3306 long chg = 0; 3307 struct hugepage_subpool *spool = subpool_inode(inode); 3308 3309 if (resv_map) 3310 chg = region_truncate(resv_map, offset); 3311 spin_lock(&inode->i_lock); 3312 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3313 spin_unlock(&inode->i_lock); 3314 3315 hugepage_subpool_put_pages(spool, (chg - freed)); 3316 hugetlb_acct_memory(h, -(chg - freed)); 3317} 3318 3319#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3320static unsigned long page_table_shareable(struct vm_area_struct *svma, 3321 struct vm_area_struct *vma, 3322 unsigned long addr, pgoff_t idx) 3323{ 3324 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 3325 svma->vm_start; 3326 unsigned long sbase = saddr & PUD_MASK; 3327 unsigned long s_end = sbase + PUD_SIZE; 3328 3329 /* Allow segments to share if only one is marked locked */ 3330 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; 3331 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; 3332 3333 /* 3334 * match the virtual addresses, permission and the alignment of the 3335 * page table page. 
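 * (The two VMAs must place the same file offset at the same position
 * inside a PUD_SIZE-aligned region, carry identical flags apart from
 * VM_LOCKED, and the candidate VMA must span that entire region.)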
3336 */ 3337 if (pmd_index(addr) != pmd_index(saddr) || 3338 vm_flags != svm_flags || 3339 sbase < svma->vm_start || svma->vm_end < s_end) 3340 return 0; 3341 3342 return saddr; 3343} 3344 3345static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) 3346{ 3347 unsigned long base = addr & PUD_MASK; 3348 unsigned long end = base + PUD_SIZE; 3349 3350 /* 3351 * check on proper vm_flags and page table alignment 3352 */ 3353 if (vma->vm_flags & VM_MAYSHARE && 3354 vma->vm_start <= base && end <= vma->vm_end) 3355 return 1; 3356 return 0; 3357} 3358 3359/* 3360 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 3361 * and returns the corresponding pte. While this is not necessary for the 3362 * !shared pmd case because we can allocate the pmd later as well, it makes the 3363 * code much cleaner. pmd allocation is essential for the shared case because 3364 * pud has to be populated inside the same i_mmap_mutex section - otherwise 3365 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 3366 * bad pmd for sharing. 3367 */ 3368pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 3369{ 3370 struct vm_area_struct *vma = find_vma(mm, addr); 3371 struct address_space *mapping = vma->vm_file->f_mapping; 3372 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 3373 vma->vm_pgoff; 3374 struct vm_area_struct *svma; 3375 unsigned long saddr; 3376 pte_t *spte = NULL; 3377 pte_t *pte; 3378 spinlock_t *ptl; 3379 3380 if (!vma_shareable(vma, addr)) 3381 return (pte_t *)pmd_alloc(mm, pud, addr); 3382 3383 mutex_lock(&mapping->i_mmap_mutex); 3384 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 3385 if (svma == vma) 3386 continue; 3387 3388 saddr = page_table_shareable(svma, vma, addr, idx); 3389 if (saddr) { 3390 spte = huge_pte_offset(svma->vm_mm, saddr); 3391 if (spte) { 3392 get_page(virt_to_page(spte)); 3393 break; 3394 } 3395 } 3396 } 3397 3398 if (!spte) 3399 goto out; 3400 3401 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 3402 spin_lock(ptl); 3403 if (pud_none(*pud)) 3404 pud_populate(mm, pud, 3405 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3406 else 3407 put_page(virt_to_page(spte)); 3408 spin_unlock(ptl); 3409out: 3410 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3411 mutex_unlock(&mapping->i_mmap_mutex); 3412 return pte; 3413} 3414 3415/* 3416 * unmap huge page backed by shared pte. 3417 * 3418 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 3419 * indicated by page_count > 1, unmap is achieved by clearing pud and 3420 * decrementing the ref count. If count == 1, the pte page is not shared. 3421 * 3422 * called with page table lock held. 
3423 * 3424 * returns: 1 successfully unmapped a shared pte page 3425 * 0 the underlying pte page is not shared, or it is the last user 3426 */ 3427int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 3428{ 3429 pgd_t *pgd = pgd_offset(mm, *addr); 3430 pud_t *pud = pud_offset(pgd, *addr); 3431 3432 BUG_ON(page_count(virt_to_page(ptep)) == 0); 3433 if (page_count(virt_to_page(ptep)) == 1) 3434 return 0; 3435 3436 pud_clear(pud); 3437 put_page(virt_to_page(ptep)); 3438 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 3439 return 1; 3440} 3441#define want_pmd_share() (1) 3442#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3443pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 3444{ 3445 return NULL; 3446} 3447#define want_pmd_share() (0) 3448#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3449 3450#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 3451pte_t *huge_pte_alloc(struct mm_struct *mm, 3452 unsigned long addr, unsigned long sz) 3453{ 3454 pgd_t *pgd; 3455 pud_t *pud; 3456 pte_t *pte = NULL; 3457 3458 pgd = pgd_offset(mm, addr); 3459 pud = pud_alloc(mm, pgd, addr); 3460 if (pud) { 3461 if (sz == PUD_SIZE) { 3462 pte = (pte_t *)pud; 3463 } else { 3464 BUG_ON(sz != PMD_SIZE); 3465 if (want_pmd_share() && pud_none(*pud)) 3466 pte = huge_pmd_share(mm, addr, pud); 3467 else 3468 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3469 } 3470 } 3471 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 3472 3473 return pte; 3474} 3475 3476pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 3477{ 3478 pgd_t *pgd; 3479 pud_t *pud; 3480 pmd_t *pmd = NULL; 3481 3482 pgd = pgd_offset(mm, addr); 3483 if (pgd_present(*pgd)) { 3484 pud = pud_offset(pgd, addr); 3485 if (pud_present(*pud)) { 3486 if (pud_huge(*pud)) 3487 return (pte_t *)pud; 3488 pmd = pmd_offset(pud, addr); 3489 } 3490 } 3491 return (pte_t *) pmd; 3492} 3493 3494struct page * 3495follow_huge_pmd(struct mm_struct *mm, unsigned long address, 3496 pmd_t *pmd, int write) 3497{ 3498 struct page *page; 3499 3500 page = pte_page(*(pte_t *)pmd); 3501 if (page) 3502 page += ((address & ~PMD_MASK) >> PAGE_SHIFT); 3503 return page; 3504} 3505 3506struct page * 3507follow_huge_pud(struct mm_struct *mm, unsigned long address, 3508 pud_t *pud, int write) 3509{ 3510 struct page *page; 3511 3512 page = pte_page(*(pte_t *)pud); 3513 if (page) 3514 page += ((address & ~PUD_MASK) >> PAGE_SHIFT); 3515 return page; 3516} 3517 3518#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3519 3520/* Can be overriden by architectures */ 3521__attribute__((weak)) struct page * 3522follow_huge_pud(struct mm_struct *mm, unsigned long address, 3523 pud_t *pud, int write) 3524{ 3525 BUG(); 3526 return NULL; 3527} 3528 3529#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3530 3531#ifdef CONFIG_MEMORY_FAILURE 3532 3533/* Should be called in hugetlb_lock */ 3534static int is_hugepage_on_freelist(struct page *hpage) 3535{ 3536 struct page *page; 3537 struct page *tmp; 3538 struct hstate *h = page_hstate(hpage); 3539 int nid = page_to_nid(hpage); 3540 3541 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) 3542 if (page == hpage) 3543 return 1; 3544 return 0; 3545} 3546 3547/* 3548 * This function is called from memory failure code. 3549 * Assume the caller holds page lock of the head page. 
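 * (The memory-failure code uses this to pull a hwpoisoned huge page off
 * its free list so that it can never be handed out again.)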
3550 */ 3551int dequeue_hwpoisoned_huge_page(struct page *hpage) 3552{ 3553 struct hstate *h = page_hstate(hpage); 3554 int nid = page_to_nid(hpage); 3555 int ret = -EBUSY; 3556 3557 spin_lock(&hugetlb_lock); 3558 if (is_hugepage_on_freelist(hpage)) { 3559 /* 3560 * Hwpoisoned hugepage isn't linked to activelist or freelist, 3561 * but dangling hpage->lru can trigger list-debug warnings 3562 * (this happens when we call unpoison_memory() on it), 3563 * so let it point to itself with list_del_init(). 3564 */ 3565 list_del_init(&hpage->lru); 3566 set_page_refcounted(hpage); 3567 h->free_huge_pages--; 3568 h->free_huge_pages_node[nid]--; 3569 ret = 0; 3570 } 3571 spin_unlock(&hugetlb_lock); 3572 return ret; 3573} 3574#endif 3575 3576bool isolate_huge_page(struct page *page, struct list_head *list) 3577{ 3578 VM_BUG_ON_PAGE(!PageHead(page), page); 3579 if (!get_page_unless_zero(page)) 3580 return false; 3581 spin_lock(&hugetlb_lock); 3582 list_move_tail(&page->lru, list); 3583 spin_unlock(&hugetlb_lock); 3584 return true; 3585} 3586 3587void putback_active_hugepage(struct page *page) 3588{ 3589 VM_BUG_ON_PAGE(!PageHead(page), page); 3590 spin_lock(&hugetlb_lock); 3591 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3592 spin_unlock(&hugetlb_lock); 3593 put_page(page); 3594} 3595 3596bool is_hugepage_active(struct page *page) 3597{ 3598 VM_BUG_ON_PAGE(!PageHuge(page), page); 3599 /* 3600 * This function can be called for a tail page because the caller, 3601 * scan_movable_pages, scans through a given pfn-range which typically 3602 * covers one memory block. In systems using gigantic hugepage (1GB 3603 * for x86_64,) a hugepage is larger than a memory block, and we don't 3604 * support migrating such large hugepages for now, so return false 3605 * when called for tail pages. 3606 */ 3607 if (PageTail(page)) 3608 return false; 3609 /* 3610 * Refcount of a hwpoisoned hugepages is 1, but they are not active, 3611 * so we should return false for them. 3612 */ 3613 if (unlikely(PageHWPoison(page))) 3614 return false; 3615 return page_count(page) > 0; 3616} 3617