hugetlb.c revision 8c6c2ecb44667f7204e9d2b89c4c1f42edc5a196
1/* 2 * Generic hugetlb support. 3 * (C) William Irwin, April 2004 4 */ 5#include <linux/list.h> 6#include <linux/init.h> 7#include <linux/module.h> 8#include <linux/mm.h> 9#include <linux/seq_file.h> 10#include <linux/sysctl.h> 11#include <linux/highmem.h> 12#include <linux/mmu_notifier.h> 13#include <linux/nodemask.h> 14#include <linux/pagemap.h> 15#include <linux/mempolicy.h> 16#include <linux/cpuset.h> 17#include <linux/mutex.h> 18#include <linux/bootmem.h> 19#include <linux/sysfs.h> 20#include <linux/slab.h> 21#include <linux/rmap.h> 22#include <linux/swap.h> 23#include <linux/swapops.h> 24 25#include <asm/page.h> 26#include <asm/pgtable.h> 27#include <asm/io.h> 28 29#include <linux/hugetlb.h> 30#include <linux/node.h> 31#include "internal.h" 32 33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 35unsigned long hugepages_treat_as_movable; 36 37static int max_hstate; 38unsigned int default_hstate_idx; 39struct hstate hstates[HUGE_MAX_HSTATE]; 40 41__initdata LIST_HEAD(huge_boot_pages); 42 43/* for command line parsing */ 44static struct hstate * __initdata parsed_hstate; 45static unsigned long __initdata default_hstate_max_huge_pages; 46static unsigned long __initdata default_hstate_size; 47 48#define for_each_hstate(h) \ 49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) 50 51/* 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 53 */ 54static DEFINE_SPINLOCK(hugetlb_lock); 55 56/* 57 * Region tracking -- allows tracking of reservations and instantiated pages 58 * across the pages in a mapping. 59 * 60 * The region data structures are protected by a combination of the mmap_sem 61 * and the hugetlb_instantion_mutex. To access or modify a region the caller 62 * must either hold the mmap_sem for write, or the mmap_sem for read and 63 * the hugetlb_instantiation mutex: 64 * 65 * down_write(&mm->mmap_sem); 66 * or 67 * down_read(&mm->mmap_sem); 68 * mutex_lock(&hugetlb_instantiation_mutex); 69 */ 70struct file_region { 71 struct list_head link; 72 long from; 73 long to; 74}; 75 76static long region_add(struct list_head *head, long f, long t) 77{ 78 struct file_region *rg, *nrg, *trg; 79 80 /* Locate the region we are either in or before. */ 81 list_for_each_entry(rg, head, link) 82 if (f <= rg->to) 83 break; 84 85 /* Round our left edge to the current segment if it encloses us. */ 86 if (f > rg->from) 87 f = rg->from; 88 89 /* Check for and consume any regions we now overlap with. */ 90 nrg = rg; 91 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 92 if (&rg->link == head) 93 break; 94 if (rg->from > t) 95 break; 96 97 /* If this area reaches higher then extend our area to 98 * include it completely. If this is not the first area 99 * which we intend to reuse, free it. */ 100 if (rg->to > t) 101 t = rg->to; 102 if (rg != nrg) { 103 list_del(&rg->link); 104 kfree(rg); 105 } 106 } 107 nrg->from = f; 108 nrg->to = t; 109 return 0; 110} 111 112static long region_chg(struct list_head *head, long f, long t) 113{ 114 struct file_region *rg, *nrg; 115 long chg = 0; 116 117 /* Locate the region we are before or in. */ 118 list_for_each_entry(rg, head, link) 119 if (f <= rg->to) 120 break; 121 122 /* If we are below the current region then a new region is required. 123 * Subtle, allocate a new region at the position but make it zero 124 * size such that we can guarantee to record the reservation. 
*/ 125 if (&rg->link == head || t < rg->from) { 126 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 127 if (!nrg) 128 return -ENOMEM; 129 nrg->from = f; 130 nrg->to = f; 131 INIT_LIST_HEAD(&nrg->link); 132 list_add(&nrg->link, rg->link.prev); 133 134 return t - f; 135 } 136 137 /* Round our left edge to the current segment if it encloses us. */ 138 if (f > rg->from) 139 f = rg->from; 140 chg = t - f; 141 142 /* Check for and consume any regions we now overlap with. */ 143 list_for_each_entry(rg, rg->link.prev, link) { 144 if (&rg->link == head) 145 break; 146 if (rg->from > t) 147 return chg; 148 149 /* We overlap with this area, if it extends further than 150 * us then we must extend ourselves. Account for its 151 * existing reservation. */ 152 if (rg->to > t) { 153 chg += rg->to - t; 154 t = rg->to; 155 } 156 chg -= rg->to - rg->from; 157 } 158 return chg; 159} 160 161static long region_truncate(struct list_head *head, long end) 162{ 163 struct file_region *rg, *trg; 164 long chg = 0; 165 166 /* Locate the region we are either in or before. */ 167 list_for_each_entry(rg, head, link) 168 if (end <= rg->to) 169 break; 170 if (&rg->link == head) 171 return 0; 172 173 /* If we are in the middle of a region then adjust it. */ 174 if (end > rg->from) { 175 chg = rg->to - end; 176 rg->to = end; 177 rg = list_entry(rg->link.next, typeof(*rg), link); 178 } 179 180 /* Drop any remaining regions. */ 181 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 182 if (&rg->link == head) 183 break; 184 chg += rg->to - rg->from; 185 list_del(&rg->link); 186 kfree(rg); 187 } 188 return chg; 189} 190 191static long region_count(struct list_head *head, long f, long t) 192{ 193 struct file_region *rg; 194 long chg = 0; 195 196 /* Locate each segment we overlap with, and count that overlap. */ 197 list_for_each_entry(rg, head, link) { 198 int seg_from; 199 int seg_to; 200 201 if (rg->to <= f) 202 continue; 203 if (rg->from >= t) 204 break; 205 206 seg_from = max(rg->from, f); 207 seg_to = min(rg->to, t); 208 209 chg += seg_to - seg_from; 210 } 211 212 return chg; 213} 214 215/* 216 * Convert the address within this vma to the page offset within 217 * the mapping, in pagecache page units; huge pages here. 218 */ 219static pgoff_t vma_hugecache_offset(struct hstate *h, 220 struct vm_area_struct *vma, unsigned long address) 221{ 222 return ((address - vma->vm_start) >> huge_page_shift(h)) + 223 (vma->vm_pgoff >> huge_page_order(h)); 224} 225 226pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 227 unsigned long address) 228{ 229 return vma_hugecache_offset(hstate_vma(vma), vma, address); 230} 231 232/* 233 * Return the size of the pages allocated when backing a VMA. In the majority 234 * of cases this will be the same size as used by the page table entries. 235 */ 236unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 237{ 238 struct hstate *hstate; 239 240 if (!is_vm_hugetlb_page(vma)) 241 return PAGE_SIZE; 242 243 hstate = hstate_vma(vma); 244 245 return 1UL << (hstate->order + PAGE_SHIFT); 246} 247EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 248 249/* 250 * Return the page size being used by the MMU to back a VMA. In the majority 251 * of cases, the page size used by the kernel matches the MMU size. On 252 * architectures where it differs, an architecture-specific version of this 253 * function is required.
254 */ 255#ifndef vma_mmu_pagesize 256unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 257{ 258 return vma_kernel_pagesize(vma); 259} 260#endif 261 262/* 263 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 264 * bits of the reservation map pointer, which are always clear due to 265 * alignment. 266 */ 267#define HPAGE_RESV_OWNER (1UL << 0) 268#define HPAGE_RESV_UNMAPPED (1UL << 1) 269#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 270 271/* 272 * These helpers are used to track how many pages are reserved for 273 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 274 * is guaranteed to have their future faults succeed. 275 * 276 * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 277 * the reserve counters are updated with the hugetlb_lock held. It is safe 278 * to reset the VMA at fork() time as it is not in use yet and there is no 279 * chance of the global counters getting corrupted as a result of the values. 280 * 281 * The private mapping reservation is represented in a subtly different 282 * manner to a shared mapping. A shared mapping has a region map associated 283 * with the underlying file, this region map represents the backing file 284 * pages which have ever had a reservation assigned which this persists even 285 * after the page is instantiated. A private mapping has a region map 286 * associated with the original mmap which is attached to all VMAs which 287 * reference it, this region map represents those offsets which have consumed 288 * reservation ie. where pages have been instantiated. 289 */ 290static unsigned long get_vma_private_data(struct vm_area_struct *vma) 291{ 292 return (unsigned long)vma->vm_private_data; 293} 294 295static void set_vma_private_data(struct vm_area_struct *vma, 296 unsigned long value) 297{ 298 vma->vm_private_data = (void *)value; 299} 300 301struct resv_map { 302 struct kref refs; 303 struct list_head regions; 304}; 305 306static struct resv_map *resv_map_alloc(void) 307{ 308 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 309 if (!resv_map) 310 return NULL; 311 312 kref_init(&resv_map->refs); 313 INIT_LIST_HEAD(&resv_map->regions); 314 315 return resv_map; 316} 317 318static void resv_map_release(struct kref *ref) 319{ 320 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 321 322 /* Clear out any active regions before we release the map. 
*/ 323 region_truncate(&resv_map->regions, 0); 324 kfree(resv_map); 325} 326 327static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 328{ 329 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 330 if (!(vma->vm_flags & VM_MAYSHARE)) 331 return (struct resv_map *)(get_vma_private_data(vma) & 332 ~HPAGE_RESV_MASK); 333 return NULL; 334} 335 336static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 337{ 338 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 339 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 340 341 set_vma_private_data(vma, (get_vma_private_data(vma) & 342 HPAGE_RESV_MASK) | (unsigned long)map); 343} 344 345static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 346{ 347 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 348 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 349 350 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 351} 352 353static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 354{ 355 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 356 357 return (get_vma_private_data(vma) & flag) != 0; 358} 359 360/* Decrement the reserved pages in the hugepage pool by one */ 361static void decrement_hugepage_resv_vma(struct hstate *h, 362 struct vm_area_struct *vma) 363{ 364 if (vma->vm_flags & VM_NORESERVE) 365 return; 366 367 if (vma->vm_flags & VM_MAYSHARE) { 368 /* Shared mappings always use reserves */ 369 h->resv_huge_pages--; 370 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 371 /* 372 * Only the process that called mmap() has reserves for 373 * private mappings. 374 */ 375 h->resv_huge_pages--; 376 } 377} 378 379/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 380void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 381{ 382 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 383 if (!(vma->vm_flags & VM_MAYSHARE)) 384 vma->vm_private_data = (void *)0; 385} 386 387/* Returns true if the VMA has associated reserve pages */ 388static int vma_has_reserves(struct vm_area_struct *vma) 389{ 390 if (vma->vm_flags & VM_MAYSHARE) 391 return 1; 392 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 393 return 1; 394 return 0; 395} 396 397static void clear_gigantic_page(struct page *page, 398 unsigned long addr, unsigned long sz) 399{ 400 int i; 401 struct page *p = page; 402 403 might_sleep(); 404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { 405 cond_resched(); 406 clear_user_highpage(p, addr + i * PAGE_SIZE); 407 } 408} 409static void clear_huge_page(struct page *page, 410 unsigned long addr, unsigned long sz) 411{ 412 int i; 413 414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { 415 clear_gigantic_page(page, addr, sz); 416 return; 417 } 418 419 might_sleep(); 420 for (i = 0; i < sz/PAGE_SIZE; i++) { 421 cond_resched(); 422 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 423 } 424} 425 426static void copy_user_gigantic_page(struct page *dst, struct page *src, 427 unsigned long addr, struct vm_area_struct *vma) 428{ 429 int i; 430 struct hstate *h = hstate_vma(vma); 431 struct page *dst_base = dst; 432 struct page *src_base = src; 433 434 for (i = 0; i < pages_per_huge_page(h); ) { 435 cond_resched(); 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 437 438 i++; 439 dst = mem_map_next(dst, dst_base, i); 440 src = mem_map_next(src, src_base, i); 441 } 442} 443 444static void copy_user_huge_page(struct page *dst, struct page *src, 445 unsigned long addr, struct vm_area_struct *vma) 446{ 447 int i; 448 struct hstate *h = hstate_vma(vma); 449 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 451 
copy_user_gigantic_page(dst, src, addr, vma); 452 return; 453 } 454 455 might_sleep(); 456 for (i = 0; i < pages_per_huge_page(h); i++) { 457 cond_resched(); 458 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 459 } 460} 461 462static void copy_gigantic_page(struct page *dst, struct page *src) 463{ 464 int i; 465 struct hstate *h = page_hstate(src); 466 struct page *dst_base = dst; 467 struct page *src_base = src; 468 469 for (i = 0; i < pages_per_huge_page(h); ) { 470 cond_resched(); 471 copy_highpage(dst, src); 472 473 i++; 474 dst = mem_map_next(dst, dst_base, i); 475 src = mem_map_next(src, src_base, i); 476 } 477} 478 479void copy_huge_page(struct page *dst, struct page *src) 480{ 481 int i; 482 struct hstate *h = page_hstate(src); 483 484 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 485 copy_gigantic_page(dst, src); 486 return; 487 } 488 489 might_sleep(); 490 for (i = 0; i < pages_per_huge_page(h); i++) { 491 cond_resched(); 492 copy_highpage(dst + i, src + i); 493 } 494} 495 496static void enqueue_huge_page(struct hstate *h, struct page *page) 497{ 498 int nid = page_to_nid(page); 499 list_add(&page->lru, &h->hugepage_freelists[nid]); 500 h->free_huge_pages++; 501 h->free_huge_pages_node[nid]++; 502} 503 504static struct page *dequeue_huge_page_node(struct hstate *h, int nid) 505{ 506 struct page *page; 507 508 if (list_empty(&h->hugepage_freelists[nid])) 509 return NULL; 510 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 511 list_del(&page->lru); 512 set_page_refcounted(page); 513 h->free_huge_pages--; 514 h->free_huge_pages_node[nid]--; 515 return page; 516} 517 518static struct page *dequeue_huge_page_vma(struct hstate *h, 519 struct vm_area_struct *vma, 520 unsigned long address, int avoid_reserve) 521{ 522 struct page *page = NULL; 523 struct mempolicy *mpol; 524 nodemask_t *nodemask; 525 struct zonelist *zonelist; 526 struct zone *zone; 527 struct zoneref *z; 528 529 get_mems_allowed(); 530 zonelist = huge_zonelist(vma, address, 531 htlb_alloc_mask, &mpol, &nodemask); 532 /* 533 * A child process with MAP_PRIVATE mappings created by their parent 534 * have no page reserves. This check ensures that reservations are 535 * not "stolen". 
The child may still get SIGKILLed 536 */ 537 if (!vma_has_reserves(vma) && 538 h->free_huge_pages - h->resv_huge_pages == 0) 539 goto err; 540 541 /* If reserves cannot be used, ensure enough pages are in the pool */ 542 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 543 goto err;; 544 545 for_each_zone_zonelist_nodemask(zone, z, zonelist, 546 MAX_NR_ZONES - 1, nodemask) { 547 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { 548 page = dequeue_huge_page_node(h, zone_to_nid(zone)); 549 if (page) { 550 if (!avoid_reserve) 551 decrement_hugepage_resv_vma(h, vma); 552 break; 553 } 554 } 555 } 556err: 557 mpol_cond_put(mpol); 558 put_mems_allowed(); 559 return page; 560} 561 562static void update_and_free_page(struct hstate *h, struct page *page) 563{ 564 int i; 565 566 VM_BUG_ON(h->order >= MAX_ORDER); 567 568 h->nr_huge_pages--; 569 h->nr_huge_pages_node[page_to_nid(page)]--; 570 for (i = 0; i < pages_per_huge_page(h); i++) { 571 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 572 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 573 1 << PG_private | 1<< PG_writeback); 574 } 575 set_compound_page_dtor(page, NULL); 576 set_page_refcounted(page); 577 arch_release_hugepage(page); 578 __free_pages(page, huge_page_order(h)); 579} 580 581struct hstate *size_to_hstate(unsigned long size) 582{ 583 struct hstate *h; 584 585 for_each_hstate(h) { 586 if (huge_page_size(h) == size) 587 return h; 588 } 589 return NULL; 590} 591 592static void free_huge_page(struct page *page) 593{ 594 /* 595 * Can't pass hstate in here because it is called from the 596 * compound page destructor. 597 */ 598 struct hstate *h = page_hstate(page); 599 int nid = page_to_nid(page); 600 struct address_space *mapping; 601 602 mapping = (struct address_space *) page_private(page); 603 set_page_private(page, 0); 604 page->mapping = NULL; 605 BUG_ON(page_count(page)); 606 BUG_ON(page_mapcount(page)); 607 INIT_LIST_HEAD(&page->lru); 608 609 spin_lock(&hugetlb_lock); 610 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 611 update_and_free_page(h, page); 612 h->surplus_huge_pages--; 613 h->surplus_huge_pages_node[nid]--; 614 } else { 615 enqueue_huge_page(h, page); 616 } 617 spin_unlock(&hugetlb_lock); 618 if (mapping) 619 hugetlb_put_quota(mapping, 1); 620} 621 622static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 623{ 624 set_compound_page_dtor(page, free_huge_page); 625 spin_lock(&hugetlb_lock); 626 h->nr_huge_pages++; 627 h->nr_huge_pages_node[nid]++; 628 spin_unlock(&hugetlb_lock); 629 put_page(page); /* free it into the hugepage allocator */ 630} 631 632static void prep_compound_gigantic_page(struct page *page, unsigned long order) 633{ 634 int i; 635 int nr_pages = 1 << order; 636 struct page *p = page + 1; 637 638 /* we rely on prep_new_huge_page to set the destructor */ 639 set_compound_order(page, order); 640 __SetPageHead(page); 641 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 642 __SetPageTail(p); 643 p->first_page = page; 644 } 645} 646 647int PageHuge(struct page *page) 648{ 649 compound_page_dtor *dtor; 650 651 if (!PageCompound(page)) 652 return 0; 653 654 page = compound_head(page); 655 dtor = get_compound_page_dtor(page); 656 657 return dtor == free_huge_page; 658} 659 660EXPORT_SYMBOL_GPL(PageHuge); 661 662static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 663{ 664 struct page *page; 665 666 if (h->order >= MAX_ORDER) 667 return NULL; 668 669 page = 
alloc_pages_exact_node(nid, 670 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 671 __GFP_REPEAT|__GFP_NOWARN, 672 huge_page_order(h)); 673 if (page) { 674 if (arch_prepare_hugepage(page)) { 675 __free_pages(page, huge_page_order(h)); 676 return NULL; 677 } 678 prep_new_huge_page(h, page, nid); 679 } 680 681 return page; 682} 683 684/* 685 * common helper functions for hstate_next_node_to_{alloc|free}. 686 * We may have allocated or freed a huge page based on a different 687 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 688 * be outside of *nodes_allowed. Ensure that we use an allowed 689 * node for alloc or free. 690 */ 691static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 692{ 693 nid = next_node(nid, *nodes_allowed); 694 if (nid == MAX_NUMNODES) 695 nid = first_node(*nodes_allowed); 696 VM_BUG_ON(nid >= MAX_NUMNODES); 697 698 return nid; 699} 700 701static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 702{ 703 if (!node_isset(nid, *nodes_allowed)) 704 nid = next_node_allowed(nid, nodes_allowed); 705 return nid; 706} 707 708/* 709 * returns the previously saved node ["this node"] from which to 710 * allocate a persistent huge page for the pool and advance the 711 * next node from which to allocate, handling wrap at end of node 712 * mask. 713 */ 714static int hstate_next_node_to_alloc(struct hstate *h, 715 nodemask_t *nodes_allowed) 716{ 717 int nid; 718 719 VM_BUG_ON(!nodes_allowed); 720 721 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 722 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 723 724 return nid; 725} 726 727static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 728{ 729 struct page *page; 730 int start_nid; 731 int next_nid; 732 int ret = 0; 733 734 start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 735 next_nid = start_nid; 736 737 do { 738 page = alloc_fresh_huge_page_node(h, next_nid); 739 if (page) { 740 ret = 1; 741 break; 742 } 743 next_nid = hstate_next_node_to_alloc(h, nodes_allowed); 744 } while (next_nid != start_nid); 745 746 if (ret) 747 count_vm_event(HTLB_BUDDY_PGALLOC); 748 else 749 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 750 751 return ret; 752} 753 754/* 755 * helper for free_pool_huge_page() - return the previously saved 756 * node ["this node"] from which to free a huge page. Advance the 757 * next node id whether or not we find a free huge page to free so 758 * that the next attempt to free addresses the next node. 759 */ 760static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 761{ 762 int nid; 763 764 VM_BUG_ON(!nodes_allowed); 765 766 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 767 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 768 769 return nid; 770} 771 772/* 773 * Free huge page from pool from next node to free. 774 * Attempt to keep persistent huge pages more or less 775 * balanced over allowed nodes. 776 * Called with hugetlb_lock locked. 777 */ 778static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 779 bool acct_surplus) 780{ 781 int start_nid; 782 int next_nid; 783 int ret = 0; 784 785 start_nid = hstate_next_node_to_free(h, nodes_allowed); 786 next_nid = start_nid; 787 788 do { 789 /* 790 * If we're returning unused surplus pages, only examine 791 * nodes with surplus pages. 
792 */ 793 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && 794 !list_empty(&h->hugepage_freelists[next_nid])) { 795 struct page *page = 796 list_entry(h->hugepage_freelists[next_nid].next, 797 struct page, lru); 798 list_del(&page->lru); 799 h->free_huge_pages--; 800 h->free_huge_pages_node[next_nid]--; 801 if (acct_surplus) { 802 h->surplus_huge_pages--; 803 h->surplus_huge_pages_node[next_nid]--; 804 } 805 update_and_free_page(h, page); 806 ret = 1; 807 break; 808 } 809 next_nid = hstate_next_node_to_free(h, nodes_allowed); 810 } while (next_nid != start_nid); 811 812 return ret; 813} 814 815static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 816{ 817 struct page *page; 818 unsigned int r_nid; 819 820 if (h->order >= MAX_ORDER) 821 return NULL; 822 823 /* 824 * Assume we will successfully allocate the surplus page to 825 * prevent racing processes from causing the surplus to exceed 826 * overcommit 827 * 828 * This however introduces a different race, where a process B 829 * tries to grow the static hugepage pool while alloc_pages() is 830 * called by process A. B will only examine the per-node 831 * counters in determining if surplus huge pages can be 832 * converted to normal huge pages in adjust_pool_surplus(). A 833 * won't be able to increment the per-node counter, until the 834 * lock is dropped by B, but B doesn't drop hugetlb_lock until 835 * no more huge pages can be converted from surplus to normal 836 * state (and doesn't try to convert again). Thus, we have a 837 * case where a surplus huge page exists, the pool is grown, and 838 * the surplus huge page still exists after, even though it 839 * should just have been converted to a normal huge page. This 840 * does not leak memory, though, as the hugepage will be freed 841 * once it is out of use. It also does not allow the counters to 842 * go out of whack in adjust_pool_surplus() as we don't modify 843 * the node values until we've gotten the hugepage and only the 844 * per-node value is checked there. 845 */ 846 spin_lock(&hugetlb_lock); 847 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 848 spin_unlock(&hugetlb_lock); 849 return NULL; 850 } else { 851 h->nr_huge_pages++; 852 h->surplus_huge_pages++; 853 } 854 spin_unlock(&hugetlb_lock); 855 856 if (nid == NUMA_NO_NODE) 857 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 858 __GFP_REPEAT|__GFP_NOWARN, 859 huge_page_order(h)); 860 else 861 page = alloc_pages_exact_node(nid, 862 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 863 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 864 865 if (page && arch_prepare_hugepage(page)) { 866 __free_pages(page, huge_page_order(h)); 867 return NULL; 868 } 869 870 spin_lock(&hugetlb_lock); 871 if (page) { 872 r_nid = page_to_nid(page); 873 set_compound_page_dtor(page, free_huge_page); 874 /* 875 * We incremented the global counters already 876 */ 877 h->nr_huge_pages_node[r_nid]++; 878 h->surplus_huge_pages_node[r_nid]++; 879 __count_vm_event(HTLB_BUDDY_PGALLOC); 880 } else { 881 h->nr_huge_pages--; 882 h->surplus_huge_pages--; 883 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 884 } 885 spin_unlock(&hugetlb_lock); 886 887 return page; 888} 889 890/* 891 * This allocation function is useful in the context where vma is irrelevant. 892 * E.g. soft-offlining uses this function because it only cares physical 893 * address of error page. 
894 */ 895struct page *alloc_huge_page_node(struct hstate *h, int nid) 896{ 897 struct page *page; 898 899 spin_lock(&hugetlb_lock); 900 page = dequeue_huge_page_node(h, nid); 901 spin_unlock(&hugetlb_lock); 902 903 if (!page) 904 page = alloc_buddy_huge_page(h, nid); 905 906 return page; 907} 908 909/* 910 * Increase the hugetlb pool such that it can accommodate a reservation 911 * of size 'delta'. 912 */ 913static int gather_surplus_pages(struct hstate *h, int delta) 914{ 915 struct list_head surplus_list; 916 struct page *page, *tmp; 917 int ret, i; 918 int needed, allocated; 919 920 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 921 if (needed <= 0) { 922 h->resv_huge_pages += delta; 923 return 0; 924 } 925 926 allocated = 0; 927 INIT_LIST_HEAD(&surplus_list); 928 929 ret = -ENOMEM; 930retry: 931 spin_unlock(&hugetlb_lock); 932 for (i = 0; i < needed; i++) { 933 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 934 if (!page) 935 /* 936 * We were not able to allocate enough pages to 937 * satisfy the entire reservation so we free what 938 * we've allocated so far. 939 */ 940 goto free; 941 942 list_add(&page->lru, &surplus_list); 943 } 944 allocated += needed; 945 946 /* 947 * After retaking hugetlb_lock, we need to recalculate 'needed' 948 * because either resv_huge_pages or free_huge_pages may have changed. 949 */ 950 spin_lock(&hugetlb_lock); 951 needed = (h->resv_huge_pages + delta) - 952 (h->free_huge_pages + allocated); 953 if (needed > 0) 954 goto retry; 955 956 /* 957 * The surplus_list now contains _at_least_ the number of extra pages 958 * needed to accommodate the reservation. Add the appropriate number 959 * of pages to the hugetlb pool and free the extras back to the buddy 960 * allocator. Commit the entire reservation here to prevent another 961 * process from stealing the pages as they are added to the pool but 962 * before they are reserved. 963 */ 964 needed += allocated; 965 h->resv_huge_pages += delta; 966 ret = 0; 967 968 spin_unlock(&hugetlb_lock); 969 /* Free the needed pages to the hugetlb pool */ 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 971 if ((--needed) < 0) 972 break; 973 list_del(&page->lru); 974 /* 975 * This page is now managed by the hugetlb allocator and has 976 * no users -- drop the buddy allocator's reference. 977 */ 978 put_page_testzero(page); 979 VM_BUG_ON(page_count(page)); 980 enqueue_huge_page(h, page); 981 } 982 983 /* Free unnecessary surplus pages to the buddy allocator */ 984free: 985 if (!list_empty(&surplus_list)) { 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 987 list_del(&page->lru); 988 put_page(page); 989 } 990 } 991 spin_lock(&hugetlb_lock); 992 993 return ret; 994} 995 996/* 997 * When releasing a hugetlb pool reservation, any surplus pages that were 998 * allocated to satisfy the reservation must be explicitly freed if they were 999 * never used. 1000 * Called with hugetlb_lock held. 1001 */ 1002static void return_unused_surplus_pages(struct hstate *h, 1003 unsigned long unused_resv_pages) 1004{ 1005 unsigned long nr_pages; 1006 1007 /* Uncommit the reservation */ 1008 h->resv_huge_pages -= unused_resv_pages; 1009 1010 /* Cannot return gigantic pages currently */ 1011 if (h->order >= MAX_ORDER) 1012 return; 1013 1014 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 1015 1016 /* 1017 * We want to release as many surplus pages as possible, spread 1018 * evenly across all nodes with memory. Iterate across these nodes 1019 * until we can no longer free unreserved surplus pages.
This occurs 1020 * when the nodes with surplus pages have no free pages. 1021 * free_pool_huge_page() will balance the freed pages across the 1022 * on-line nodes with memory and will handle the hstate accounting. 1023 */ 1024 while (nr_pages--) { 1025 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) 1026 break; 1027 } 1028} 1029 1030/* 1031 * Determine if the huge page at addr within the vma has an associated 1032 * reservation. Where it does not we will need to logically increase 1033 * reservation and actually increase quota before an allocation can occur. 1034 * Where any new reservation would be required the reservation change is 1035 * prepared, but not committed. Once the page has been quota'd, allocated 1036 * and instantiated, the change should be committed via vma_commit_reservation. 1037 * No action is required on failure. 1038 */ 1039static long vma_needs_reservation(struct hstate *h, 1040 struct vm_area_struct *vma, unsigned long addr) 1041{ 1042 struct address_space *mapping = vma->vm_file->f_mapping; 1043 struct inode *inode = mapping->host; 1044 1045 if (vma->vm_flags & VM_MAYSHARE) { 1046 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1047 return region_chg(&inode->i_mapping->private_list, 1048 idx, idx + 1); 1049 1050 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1051 return 1; 1052 1053 } else { 1054 long err; 1055 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1056 struct resv_map *reservations = vma_resv_map(vma); 1057 1058 err = region_chg(&reservations->regions, idx, idx + 1); 1059 if (err < 0) 1060 return err; 1061 return 0; 1062 } 1063} 1064static void vma_commit_reservation(struct hstate *h, 1065 struct vm_area_struct *vma, unsigned long addr) 1066{ 1067 struct address_space *mapping = vma->vm_file->f_mapping; 1068 struct inode *inode = mapping->host; 1069 1070 if (vma->vm_flags & VM_MAYSHARE) { 1071 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1072 region_add(&inode->i_mapping->private_list, idx, idx + 1); 1073 1074 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1075 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1076 struct resv_map *reservations = vma_resv_map(vma); 1077 1078 /* Mark this page used in the map. */ 1079 region_add(&reservations->regions, idx, idx + 1); 1080 } 1081} 1082 1083static struct page *alloc_huge_page(struct vm_area_struct *vma, 1084 unsigned long addr, int avoid_reserve) 1085{ 1086 struct hstate *h = hstate_vma(vma); 1087 struct page *page; 1088 struct address_space *mapping = vma->vm_file->f_mapping; 1089 struct inode *inode = mapping->host; 1090 long chg; 1091 1092 /* 1093 * Processes that did not create the mapping will have no reserves and 1094 * will not have accounted against quota. Check that the quota can be 1095 * made before satisfying the allocation. 1096 * MAP_NORESERVE mappings may also need pages and quota allocated 1097 * if no reserve mapping overlaps.
1098 */ 1099 chg = vma_needs_reservation(h, vma, addr); 1100 if (chg < 0) 1101 return ERR_PTR(chg); 1102 if (chg) 1103 if (hugetlb_get_quota(inode->i_mapping, chg)) 1104 return ERR_PTR(-ENOSPC); 1105 1106 spin_lock(&hugetlb_lock); 1107 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1108 spin_unlock(&hugetlb_lock); 1109 1110 if (!page) { 1111 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1112 if (!page) { 1113 hugetlb_put_quota(inode->i_mapping, chg); 1114 return ERR_PTR(-VM_FAULT_SIGBUS); 1115 } 1116 } 1117 1118 set_page_private(page, (unsigned long) mapping); 1119 1120 vma_commit_reservation(h, vma, addr); 1121 1122 return page; 1123} 1124 1125int __weak alloc_bootmem_huge_page(struct hstate *h) 1126{ 1127 struct huge_bootmem_page *m; 1128 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 1129 1130 while (nr_nodes) { 1131 void *addr; 1132 1133 addr = __alloc_bootmem_node_nopanic( 1134 NODE_DATA(hstate_next_node_to_alloc(h, 1135 &node_states[N_HIGH_MEMORY])), 1136 huge_page_size(h), huge_page_size(h), 0); 1137 1138 if (addr) { 1139 /* 1140 * Use the beginning of the huge page to store the 1141 * huge_bootmem_page struct (until gather_bootmem 1142 * puts them into the mem_map). 1143 */ 1144 m = addr; 1145 goto found; 1146 } 1147 nr_nodes--; 1148 } 1149 return 0; 1150 1151found: 1152 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); 1153 /* Put them into a private list first because mem_map is not up yet */ 1154 list_add(&m->list, &huge_boot_pages); 1155 m->hstate = h; 1156 return 1; 1157} 1158 1159static void prep_compound_huge_page(struct page *page, int order) 1160{ 1161 if (unlikely(order > (MAX_ORDER - 1))) 1162 prep_compound_gigantic_page(page, order); 1163 else 1164 prep_compound_page(page, order); 1165} 1166 1167/* Put bootmem huge pages into the standard lists after mem_map is up */ 1168static void __init gather_bootmem_prealloc(void) 1169{ 1170 struct huge_bootmem_page *m; 1171 1172 list_for_each_entry(m, &huge_boot_pages, list) { 1173 struct page *page = virt_to_page(m); 1174 struct hstate *h = m->hstate; 1175 __ClearPageReserved(page); 1176 WARN_ON(page_count(page) != 1); 1177 prep_compound_huge_page(page, h->order); 1178 prep_new_huge_page(h, page, page_to_nid(page)); 1179 } 1180} 1181 1182static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 1183{ 1184 unsigned long i; 1185 1186 for (i = 0; i < h->max_huge_pages; ++i) { 1187 if (h->order >= MAX_ORDER) { 1188 if (!alloc_bootmem_huge_page(h)) 1189 break; 1190 } else if (!alloc_fresh_huge_page(h, 1191 &node_states[N_HIGH_MEMORY])) 1192 break; 1193 } 1194 h->max_huge_pages = i; 1195} 1196 1197static void __init hugetlb_init_hstates(void) 1198{ 1199 struct hstate *h; 1200 1201 for_each_hstate(h) { 1202 /* oversize hugepages were init'ed in early boot */ 1203 if (h->order < MAX_ORDER) 1204 hugetlb_hstate_alloc_pages(h); 1205 } 1206} 1207 1208static char * __init memfmt(char *buf, unsigned long n) 1209{ 1210 if (n >= (1UL << 30)) 1211 sprintf(buf, "%lu GB", n >> 30); 1212 else if (n >= (1UL << 20)) 1213 sprintf(buf, "%lu MB", n >> 20); 1214 else 1215 sprintf(buf, "%lu KB", n >> 10); 1216 return buf; 1217} 1218 1219static void __init report_hugepages(void) 1220{ 1221 struct hstate *h; 1222 1223 for_each_hstate(h) { 1224 char buf[32]; 1225 printk(KERN_INFO "HugeTLB registered %s page size, " 1226 "pre-allocated %ld pages\n", 1227 memfmt(buf, huge_page_size(h)), 1228 h->free_huge_pages); 1229 } 1230} 1231 1232#ifdef CONFIG_HIGHMEM 1233static void try_to_free_low(struct hstate *h, unsigned 
long count, 1234 nodemask_t *nodes_allowed) 1235{ 1236 int i; 1237 1238 if (h->order >= MAX_ORDER) 1239 return; 1240 1241 for_each_node_mask(i, *nodes_allowed) { 1242 struct page *page, *next; 1243 struct list_head *freel = &h->hugepage_freelists[i]; 1244 list_for_each_entry_safe(page, next, freel, lru) { 1245 if (count >= h->nr_huge_pages) 1246 return; 1247 if (PageHighMem(page)) 1248 continue; 1249 list_del(&page->lru); 1250 update_and_free_page(h, page); 1251 h->free_huge_pages--; 1252 h->free_huge_pages_node[page_to_nid(page)]--; 1253 } 1254 } 1255} 1256#else 1257static inline void try_to_free_low(struct hstate *h, unsigned long count, 1258 nodemask_t *nodes_allowed) 1259{ 1260} 1261#endif 1262 1263/* 1264 * Increment or decrement surplus_huge_pages. Keep node-specific counters 1265 * balanced by operating on them in a round-robin fashion. 1266 * Returns 1 if an adjustment was made. 1267 */ 1268static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 1269 int delta) 1270{ 1271 int start_nid, next_nid; 1272 int ret = 0; 1273 1274 VM_BUG_ON(delta != -1 && delta != 1); 1275 1276 if (delta < 0) 1277 start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 1278 else 1279 start_nid = hstate_next_node_to_free(h, nodes_allowed); 1280 next_nid = start_nid; 1281 1282 do { 1283 int nid = next_nid; 1284 if (delta < 0) { 1285 /* 1286 * To shrink on this node, there must be a surplus page 1287 */ 1288 if (!h->surplus_huge_pages_node[nid]) { 1289 next_nid = hstate_next_node_to_alloc(h, 1290 nodes_allowed); 1291 continue; 1292 } 1293 } 1294 if (delta > 0) { 1295 /* 1296 * Surplus cannot exceed the total number of pages 1297 */ 1298 if (h->surplus_huge_pages_node[nid] >= 1299 h->nr_huge_pages_node[nid]) { 1300 next_nid = hstate_next_node_to_free(h, 1301 nodes_allowed); 1302 continue; 1303 } 1304 } 1305 1306 h->surplus_huge_pages += delta; 1307 h->surplus_huge_pages_node[nid] += delta; 1308 ret = 1; 1309 break; 1310 } while (next_nid != start_nid); 1311 1312 return ret; 1313} 1314 1315#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1316static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 1317 nodemask_t *nodes_allowed) 1318{ 1319 unsigned long min_count, ret; 1320 1321 if (h->order >= MAX_ORDER) 1322 return h->max_huge_pages; 1323 1324 /* 1325 * Increase the pool size 1326 * First take pages out of surplus state. Then make up the 1327 * remaining difference by allocating fresh huge pages. 1328 * 1329 * We might race with alloc_buddy_huge_page() here and be unable 1330 * to convert a surplus huge page to a normal huge page. That is 1331 * not critical, though, it just means the overall size of the 1332 * pool might be one hugepage larger than it needs to be, but 1333 * within all the constraints specified by the sysctls. 1334 */ 1335 spin_lock(&hugetlb_lock); 1336 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1337 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 1338 break; 1339 } 1340 1341 while (count > persistent_huge_pages(h)) { 1342 /* 1343 * If this allocation races such that we no longer need the 1344 * page, free_huge_page will handle it by freeing the page 1345 * and reducing the surplus. 1346 */ 1347 spin_unlock(&hugetlb_lock); 1348 ret = alloc_fresh_huge_page(h, nodes_allowed); 1349 spin_lock(&hugetlb_lock); 1350 if (!ret) 1351 goto out; 1352 1353 /* Bail for signals. 
Probably ctrl-c from user */ 1354 if (signal_pending(current)) 1355 goto out; 1356 } 1357 1358 /* 1359 * Decrease the pool size 1360 * First return free pages to the buddy allocator (being careful 1361 * to keep enough around to satisfy reservations). Then place 1362 * pages into surplus state as needed so the pool will shrink 1363 * to the desired size as pages become free. 1364 * 1365 * By placing pages into the surplus state independent of the 1366 * overcommit value, we are allowing the surplus pool size to 1367 * exceed overcommit. There are few sane options here. Since 1368 * alloc_buddy_huge_page() is checking the global counter, 1369 * though, we'll note that we're not allowed to exceed surplus 1370 * and won't grow the pool anywhere else. Not until one of the 1371 * sysctls are changed, or the surplus pages go out of use. 1372 */ 1373 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1374 min_count = max(count, min_count); 1375 try_to_free_low(h, min_count, nodes_allowed); 1376 while (min_count < persistent_huge_pages(h)) { 1377 if (!free_pool_huge_page(h, nodes_allowed, 0)) 1378 break; 1379 } 1380 while (count < persistent_huge_pages(h)) { 1381 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 1382 break; 1383 } 1384out: 1385 ret = persistent_huge_pages(h); 1386 spin_unlock(&hugetlb_lock); 1387 return ret; 1388} 1389 1390#define HSTATE_ATTR_RO(_name) \ 1391 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 1392 1393#define HSTATE_ATTR(_name) \ 1394 static struct kobj_attribute _name##_attr = \ 1395 __ATTR(_name, 0644, _name##_show, _name##_store) 1396 1397static struct kobject *hugepages_kobj; 1398static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1399 1400static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 1401 1402static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 1403{ 1404 int i; 1405 1406 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1407 if (hstate_kobjs[i] == kobj) { 1408 if (nidp) 1409 *nidp = NUMA_NO_NODE; 1410 return &hstates[i]; 1411 } 1412 1413 return kobj_to_node_hstate(kobj, nidp); 1414} 1415 1416static ssize_t nr_hugepages_show_common(struct kobject *kobj, 1417 struct kobj_attribute *attr, char *buf) 1418{ 1419 struct hstate *h; 1420 unsigned long nr_huge_pages; 1421 int nid; 1422 1423 h = kobj_to_hstate(kobj, &nid); 1424 if (nid == NUMA_NO_NODE) 1425 nr_huge_pages = h->nr_huge_pages; 1426 else 1427 nr_huge_pages = h->nr_huge_pages_node[nid]; 1428 1429 return sprintf(buf, "%lu\n", nr_huge_pages); 1430} 1431static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1432 struct kobject *kobj, struct kobj_attribute *attr, 1433 const char *buf, size_t len) 1434{ 1435 int err; 1436 int nid; 1437 unsigned long count; 1438 struct hstate *h; 1439 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1440 1441 err = strict_strtoul(buf, 10, &count); 1442 if (err) 1443 return 0; 1444 1445 h = kobj_to_hstate(kobj, &nid); 1446 if (nid == NUMA_NO_NODE) { 1447 /* 1448 * global hstate attribute 1449 */ 1450 if (!(obey_mempolicy && 1451 init_nodemask_of_mempolicy(nodes_allowed))) { 1452 NODEMASK_FREE(nodes_allowed); 1453 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1454 } 1455 } else if (nodes_allowed) { 1456 /* 1457 * per node hstate attribute: adjust count to global, 1458 * but restrict alloc/free to the specified node. 
1459 */ 1460 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1461 init_nodemask_of_node(nodes_allowed, nid); 1462 } else 1463 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1464 1465 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1466 1467 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1468 NODEMASK_FREE(nodes_allowed); 1469 1470 return len; 1471} 1472 1473static ssize_t nr_hugepages_show(struct kobject *kobj, 1474 struct kobj_attribute *attr, char *buf) 1475{ 1476 return nr_hugepages_show_common(kobj, attr, buf); 1477} 1478 1479static ssize_t nr_hugepages_store(struct kobject *kobj, 1480 struct kobj_attribute *attr, const char *buf, size_t len) 1481{ 1482 return nr_hugepages_store_common(false, kobj, attr, buf, len); 1483} 1484HSTATE_ATTR(nr_hugepages); 1485 1486#ifdef CONFIG_NUMA 1487 1488/* 1489 * hstate attribute for optionally mempolicy-based constraint on persistent 1490 * huge page alloc/free. 1491 */ 1492static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 1493 struct kobj_attribute *attr, char *buf) 1494{ 1495 return nr_hugepages_show_common(kobj, attr, buf); 1496} 1497 1498static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 1499 struct kobj_attribute *attr, const char *buf, size_t len) 1500{ 1501 return nr_hugepages_store_common(true, kobj, attr, buf, len); 1502} 1503HSTATE_ATTR(nr_hugepages_mempolicy); 1504#endif 1505 1506 1507static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1508 struct kobj_attribute *attr, char *buf) 1509{ 1510 struct hstate *h = kobj_to_hstate(kobj, NULL); 1511 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1512} 1513static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1514 struct kobj_attribute *attr, const char *buf, size_t count) 1515{ 1516 int err; 1517 unsigned long input; 1518 struct hstate *h = kobj_to_hstate(kobj, NULL); 1519 1520 err = strict_strtoul(buf, 10, &input); 1521 if (err) 1522 return 0; 1523 1524 spin_lock(&hugetlb_lock); 1525 h->nr_overcommit_huge_pages = input; 1526 spin_unlock(&hugetlb_lock); 1527 1528 return count; 1529} 1530HSTATE_ATTR(nr_overcommit_hugepages); 1531 1532static ssize_t free_hugepages_show(struct kobject *kobj, 1533 struct kobj_attribute *attr, char *buf) 1534{ 1535 struct hstate *h; 1536 unsigned long free_huge_pages; 1537 int nid; 1538 1539 h = kobj_to_hstate(kobj, &nid); 1540 if (nid == NUMA_NO_NODE) 1541 free_huge_pages = h->free_huge_pages; 1542 else 1543 free_huge_pages = h->free_huge_pages_node[nid]; 1544 1545 return sprintf(buf, "%lu\n", free_huge_pages); 1546} 1547HSTATE_ATTR_RO(free_hugepages); 1548 1549static ssize_t resv_hugepages_show(struct kobject *kobj, 1550 struct kobj_attribute *attr, char *buf) 1551{ 1552 struct hstate *h = kobj_to_hstate(kobj, NULL); 1553 return sprintf(buf, "%lu\n", h->resv_huge_pages); 1554} 1555HSTATE_ATTR_RO(resv_hugepages); 1556 1557static ssize_t surplus_hugepages_show(struct kobject *kobj, 1558 struct kobj_attribute *attr, char *buf) 1559{ 1560 struct hstate *h; 1561 unsigned long surplus_huge_pages; 1562 int nid; 1563 1564 h = kobj_to_hstate(kobj, &nid); 1565 if (nid == NUMA_NO_NODE) 1566 surplus_huge_pages = h->surplus_huge_pages; 1567 else 1568 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 1569 1570 return sprintf(buf, "%lu\n", surplus_huge_pages); 1571} 1572HSTATE_ATTR_RO(surplus_hugepages); 1573 1574static struct attribute *hstate_attrs[] = { 1575 &nr_hugepages_attr.attr, 1576 &nr_overcommit_hugepages_attr.attr, 1577 &free_hugepages_attr.attr, 1578 
&resv_hugepages_attr.attr, 1579 &surplus_hugepages_attr.attr, 1580#ifdef CONFIG_NUMA 1581 &nr_hugepages_mempolicy_attr.attr, 1582#endif 1583 NULL, 1584}; 1585 1586static struct attribute_group hstate_attr_group = { 1587 .attrs = hstate_attrs, 1588}; 1589 1590static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 1591 struct kobject **hstate_kobjs, 1592 struct attribute_group *hstate_attr_group) 1593{ 1594 int retval; 1595 int hi = h - hstates; 1596 1597 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1598 if (!hstate_kobjs[hi]) 1599 return -ENOMEM; 1600 1601 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 1602 if (retval) 1603 kobject_put(hstate_kobjs[hi]); 1604 1605 return retval; 1606} 1607 1608static void __init hugetlb_sysfs_init(void) 1609{ 1610 struct hstate *h; 1611 int err; 1612 1613 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 1614 if (!hugepages_kobj) 1615 return; 1616 1617 for_each_hstate(h) { 1618 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 1619 hstate_kobjs, &hstate_attr_group); 1620 if (err) 1621 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1622 h->name); 1623 } 1624} 1625 1626#ifdef CONFIG_NUMA 1627 1628/* 1629 * node_hstate/s - associate per node hstate attributes, via their kobjects, 1630 * with node sysdevs in node_devices[] using a parallel array. The array 1631 * index of a node sysdev or _hstate == node id. 1632 * This is here to avoid any static dependency of the node sysdev driver, in 1633 * the base kernel, on the hugetlb module. 1634 */ 1635struct node_hstate { 1636 struct kobject *hugepages_kobj; 1637 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1638}; 1639struct node_hstate node_hstates[MAX_NUMNODES]; 1640 1641/* 1642 * A subset of global hstate attributes for node sysdevs 1643 */ 1644static struct attribute *per_node_hstate_attrs[] = { 1645 &nr_hugepages_attr.attr, 1646 &free_hugepages_attr.attr, 1647 &surplus_hugepages_attr.attr, 1648 NULL, 1649}; 1650 1651static struct attribute_group per_node_hstate_attr_group = { 1652 .attrs = per_node_hstate_attrs, 1653}; 1654 1655/* 1656 * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. 1657 * Returns node id via non-NULL nidp. 1658 */ 1659static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 1660{ 1661 int nid; 1662 1663 for (nid = 0; nid < nr_node_ids; nid++) { 1664 struct node_hstate *nhs = &node_hstates[nid]; 1665 int i; 1666 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1667 if (nhs->hstate_kobjs[i] == kobj) { 1668 if (nidp) 1669 *nidp = nid; 1670 return &hstates[i]; 1671 } 1672 } 1673 1674 BUG(); 1675 return NULL; 1676} 1677 1678/* 1679 * Unregister hstate attributes from a single node sysdev. 1680 * No-op if no hstate attributes attached. 1681 */ 1682void hugetlb_unregister_node(struct node *node) 1683{ 1684 struct hstate *h; 1685 struct node_hstate *nhs = &node_hstates[node->sysdev.id]; 1686 1687 if (!nhs->hugepages_kobj) 1688 return; /* no hstate attributes */ 1689 1690 for_each_hstate(h) 1691 if (nhs->hstate_kobjs[h - hstates]) { 1692 kobject_put(nhs->hstate_kobjs[h - hstates]); 1693 nhs->hstate_kobjs[h - hstates] = NULL; 1694 } 1695 1696 kobject_put(nhs->hugepages_kobj); 1697 nhs->hugepages_kobj = NULL; 1698} 1699 1700/* 1701 * hugetlb module exit: unregister hstate attributes from node sysdevs 1702 * that have them. 1703 */ 1704static void hugetlb_unregister_all_nodes(void) 1705{ 1706 int nid; 1707 1708 /* 1709 * disable node sysdev registrations. 
1710 */ 1711 register_hugetlbfs_with_node(NULL, NULL); 1712 1713 /* 1714 * remove hstate attributes from any nodes that have them. 1715 */ 1716 for (nid = 0; nid < nr_node_ids; nid++) 1717 hugetlb_unregister_node(&node_devices[nid]); 1718} 1719 1720/* 1721 * Register hstate attributes for a single node sysdev. 1722 * No-op if attributes already registered. 1723 */ 1724void hugetlb_register_node(struct node *node) 1725{ 1726 struct hstate *h; 1727 struct node_hstate *nhs = &node_hstates[node->sysdev.id]; 1728 int err; 1729 1730 if (nhs->hugepages_kobj) 1731 return; /* already allocated */ 1732 1733 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 1734 &node->sysdev.kobj); 1735 if (!nhs->hugepages_kobj) 1736 return; 1737 1738 for_each_hstate(h) { 1739 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 1740 nhs->hstate_kobjs, 1741 &per_node_hstate_attr_group); 1742 if (err) { 1743 printk(KERN_ERR "Hugetlb: Unable to add hstate %s" 1744 " for node %d\n", 1745 h->name, node->sysdev.id); 1746 hugetlb_unregister_node(node); 1747 break; 1748 } 1749 } 1750} 1751 1752/* 1753 * hugetlb init time: register hstate attributes for all registered node 1754 * sysdevs of nodes that have memory. All on-line nodes should have 1755 * registered their associated sysdev by this time. 1756 */ 1757static void hugetlb_register_all_nodes(void) 1758{ 1759 int nid; 1760 1761 for_each_node_state(nid, N_HIGH_MEMORY) { 1762 struct node *node = &node_devices[nid]; 1763 if (node->sysdev.id == nid) 1764 hugetlb_register_node(node); 1765 } 1766 1767 /* 1768 * Let the node sysdev driver know we're here so it can 1769 * [un]register hstate attributes on node hotplug. 1770 */ 1771 register_hugetlbfs_with_node(hugetlb_register_node, 1772 hugetlb_unregister_node); 1773} 1774#else /* !CONFIG_NUMA */ 1775 1776static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 1777{ 1778 BUG(); 1779 if (nidp) 1780 *nidp = -1; 1781 return NULL; 1782} 1783 1784static void hugetlb_unregister_all_nodes(void) { } 1785 1786static void hugetlb_register_all_nodes(void) { } 1787 1788#endif 1789 1790static void __exit hugetlb_exit(void) 1791{ 1792 struct hstate *h; 1793 1794 hugetlb_unregister_all_nodes(); 1795 1796 for_each_hstate(h) { 1797 kobject_put(hstate_kobjs[h - hstates]); 1798 } 1799 1800 kobject_put(hugepages_kobj); 1801} 1802module_exit(hugetlb_exit); 1803 1804static int __init hugetlb_init(void) 1805{ 1806 /* Some platform decide whether they support huge pages at boot 1807 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when 1808 * there is no such support 1809 */ 1810 if (HPAGE_SHIFT == 0) 1811 return 0; 1812 1813 if (!size_to_hstate(default_hstate_size)) { 1814 default_hstate_size = HPAGE_SIZE; 1815 if (!size_to_hstate(default_hstate_size)) 1816 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1817 } 1818 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1819 if (default_hstate_max_huge_pages) 1820 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1821 1822 hugetlb_init_hstates(); 1823 1824 gather_bootmem_prealloc(); 1825 1826 report_hugepages(); 1827 1828 hugetlb_sysfs_init(); 1829 1830 hugetlb_register_all_nodes(); 1831 1832 return 0; 1833} 1834module_init(hugetlb_init); 1835 1836/* Should be called on processing a hugepagesz=... 
option */ 1837void __init hugetlb_add_hstate(unsigned order) 1838{ 1839 struct hstate *h; 1840 unsigned long i; 1841 1842 if (size_to_hstate(PAGE_SIZE << order)) { 1843 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1844 return; 1845 } 1846 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1847 BUG_ON(order == 0); 1848 h = &hstates[max_hstate++]; 1849 h->order = order; 1850 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1851 h->nr_huge_pages = 0; 1852 h->free_huge_pages = 0; 1853 for (i = 0; i < MAX_NUMNODES; ++i) 1854 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1855 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1856 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1857 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1858 huge_page_size(h)/1024); 1859 1860 parsed_hstate = h; 1861} 1862 1863static int __init hugetlb_nrpages_setup(char *s) 1864{ 1865 unsigned long *mhp; 1866 static unsigned long *last_mhp; 1867 1868 /* 1869 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1870 * so this hugepages= parameter goes to the "default hstate". 1871 */ 1872 if (!max_hstate) 1873 mhp = &default_hstate_max_huge_pages; 1874 else 1875 mhp = &parsed_hstate->max_huge_pages; 1876 1877 if (mhp == last_mhp) { 1878 printk(KERN_WARNING "hugepages= specified twice without " 1879 "interleaving hugepagesz=, ignoring\n"); 1880 return 1; 1881 } 1882 1883 if (sscanf(s, "%lu", mhp) <= 0) 1884 *mhp = 0; 1885 1886 /* 1887 * Global state is always initialized later in hugetlb_init. 1888 * But we need to allocate >= MAX_ORDER hstates here early to still 1889 * use the bootmem allocator. 1890 */ 1891 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1892 hugetlb_hstate_alloc_pages(parsed_hstate); 1893 1894 last_mhp = mhp; 1895 1896 return 1; 1897} 1898__setup("hugepages=", hugetlb_nrpages_setup); 1899 1900static int __init hugetlb_default_setup(char *s) 1901{ 1902 default_hstate_size = memparse(s, &s); 1903 return 1; 1904} 1905__setup("default_hugepagesz=", hugetlb_default_setup); 1906 1907static unsigned int cpuset_mems_nr(unsigned int *array) 1908{ 1909 int node; 1910 unsigned int nr = 0; 1911 1912 for_each_node_mask(node, cpuset_current_mems_allowed) 1913 nr += array[node]; 1914 1915 return nr; 1916} 1917 1918#ifdef CONFIG_SYSCTL 1919static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 1920 struct ctl_table *table, int write, 1921 void __user *buffer, size_t *length, loff_t *ppos) 1922{ 1923 struct hstate *h = &default_hstate; 1924 unsigned long tmp; 1925 1926 if (!write) 1927 tmp = h->max_huge_pages; 1928 1929 table->data = &tmp; 1930 table->maxlen = sizeof(unsigned long); 1931 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1932 1933 if (write) { 1934 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1935 GFP_KERNEL | __GFP_NORETRY); 1936 if (!(obey_mempolicy && 1937 init_nodemask_of_mempolicy(nodes_allowed))) { 1938 NODEMASK_FREE(nodes_allowed); 1939 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1940 } 1941 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 1942 1943 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1944 NODEMASK_FREE(nodes_allowed); 1945 } 1946 1947 return 0; 1948} 1949 1950int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1951 void __user *buffer, size_t *length, loff_t *ppos) 1952{ 1953 1954 return hugetlb_sysctl_handler_common(false, table, write, 1955 buffer, length, ppos); 1956} 1957 1958#ifdef CONFIG_NUMA 1959int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 1960 void 
__user *buffer, size_t *length, loff_t *ppos) 1961{ 1962 return hugetlb_sysctl_handler_common(true, table, write, 1963 buffer, length, ppos); 1964} 1965#endif /* CONFIG_NUMA */ 1966 1967int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1968 void __user *buffer, 1969 size_t *length, loff_t *ppos) 1970{ 1971 proc_dointvec(table, write, buffer, length, ppos); 1972 if (hugepages_treat_as_movable) 1973 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1974 else 1975 htlb_alloc_mask = GFP_HIGHUSER; 1976 return 0; 1977} 1978 1979int hugetlb_overcommit_handler(struct ctl_table *table, int write, 1980 void __user *buffer, 1981 size_t *length, loff_t *ppos) 1982{ 1983 struct hstate *h = &default_hstate; 1984 unsigned long tmp; 1985 1986 if (!write) 1987 tmp = h->nr_overcommit_huge_pages; 1988 1989 table->data = &tmp; 1990 table->maxlen = sizeof(unsigned long); 1991 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1992 1993 if (write) { 1994 spin_lock(&hugetlb_lock); 1995 h->nr_overcommit_huge_pages = tmp; 1996 spin_unlock(&hugetlb_lock); 1997 } 1998 1999 return 0; 2000} 2001 2002#endif /* CONFIG_SYSCTL */ 2003 2004void hugetlb_report_meminfo(struct seq_file *m) 2005{ 2006 struct hstate *h = &default_hstate; 2007 seq_printf(m, 2008 "HugePages_Total: %5lu\n" 2009 "HugePages_Free: %5lu\n" 2010 "HugePages_Rsvd: %5lu\n" 2011 "HugePages_Surp: %5lu\n" 2012 "Hugepagesize: %8lu kB\n", 2013 h->nr_huge_pages, 2014 h->free_huge_pages, 2015 h->resv_huge_pages, 2016 h->surplus_huge_pages, 2017 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2018} 2019 2020int hugetlb_report_node_meminfo(int nid, char *buf) 2021{ 2022 struct hstate *h = &default_hstate; 2023 return sprintf(buf, 2024 "Node %d HugePages_Total: %5u\n" 2025 "Node %d HugePages_Free: %5u\n" 2026 "Node %d HugePages_Surp: %5u\n", 2027 nid, h->nr_huge_pages_node[nid], 2028 nid, h->free_huge_pages_node[nid], 2029 nid, h->surplus_huge_pages_node[nid]); 2030} 2031 2032/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 2033unsigned long hugetlb_total_pages(void) 2034{ 2035 struct hstate *h = &default_hstate; 2036 return h->nr_huge_pages * pages_per_huge_page(h); 2037} 2038 2039static int hugetlb_acct_memory(struct hstate *h, long delta) 2040{ 2041 int ret = -ENOMEM; 2042 2043 spin_lock(&hugetlb_lock); 2044 /* 2045 * When cpuset is configured, it breaks the strict hugetlb page 2046 * reservation as the accounting is done on a global variable. Such 2047 * reservation is completely rubbish in the presence of cpuset because 2048 * the reservation is not checked against page availability for the 2049 * current cpuset. Applications can still potentially be OOM'ed by the kernel 2050 * with a lack of free htlb pages in the cpuset that the task is in. 2051 * Attempt to enforce strict accounting with cpuset is almost 2052 * impossible (or too ugly) because cpuset is so fluid that 2053 * tasks or memory nodes can be dynamically moved between cpusets. 2054 * 2055 * The change of semantics for shared hugetlb mapping with cpuset is 2056 * undesirable. However, in order to preserve some of the semantics, 2057 * we fall back to check against current free page availability as 2058 * a best attempt and hopefully to minimize the impact of changing 2059 * semantics that cpuset has.
2039static int hugetlb_acct_memory(struct hstate *h, long delta) 2040{ 2041 int ret = -ENOMEM; 2042 2043 spin_lock(&hugetlb_lock); 2044 /* 2045 * When cpuset is configured, it breaks the strict hugetlb page 2046 * reservation as the accounting is done on a global variable. Such 2047 * a reservation is completely rubbish in the presence of cpusets because 2048 * the reservation is not checked against page availability for the 2049 * current cpuset. The application can still potentially be OOM'ed by the 2050 * kernel for lack of free hugetlb pages in the cpuset the task is in. 2051 * Attempting to enforce strict accounting with cpusets is almost 2052 * impossible (or too ugly) because cpusets are so fluid that 2053 * tasks or memory nodes can be dynamically moved between them. 2054 * 2055 * The change of semantics for shared hugetlb mappings with cpusets is 2056 * undesirable. However, in order to preserve some of the semantics, 2057 * we fall back to checking against the current free page availability as 2058 * a best attempt, hopefully minimizing the impact of the changed 2059 * semantics that cpusets bring. 2060 */ 2061 if (delta > 0) { 2062 if (gather_surplus_pages(h, delta) < 0) 2063 goto out; 2064 2065 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 2066 return_unused_surplus_pages(h, delta); 2067 goto out; 2068 } 2069 } 2070 2071 ret = 0; 2072 if (delta < 0) 2073 return_unused_surplus_pages(h, (unsigned long) -delta); 2074 2075out: 2076 spin_unlock(&hugetlb_lock); 2077 return ret; 2078} 2079 2080static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2081{ 2082 struct resv_map *reservations = vma_resv_map(vma); 2083 2084 /* 2085 * This new VMA should share its sibling's reservation map if present. 2086 * The VMA will only ever have a valid reservation map pointer where 2087 * it is being copied for another still existing VMA. As that VMA 2088 * has a reference to the reservation map it cannot disappear until 2089 * after this open call completes. It is therefore safe to take a 2090 * new reference here without additional locking. 2091 */ 2092 if (reservations) 2093 kref_get(&reservations->refs); 2094} 2095 2096static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2097{ 2098 struct hstate *h = hstate_vma(vma); 2099 struct resv_map *reservations = vma_resv_map(vma); 2100 unsigned long reserve; 2101 unsigned long start; 2102 unsigned long end; 2103 2104 if (reservations) { 2105 start = vma_hugecache_offset(h, vma, vma->vm_start); 2106 end = vma_hugecache_offset(h, vma, vma->vm_end); 2107 2108 reserve = (end - start) - 2109 region_count(&reservations->regions, start, end); 2110 2111 kref_put(&reservations->refs, resv_map_release); 2112 2113 if (reserve) { 2114 hugetlb_acct_memory(h, -reserve); 2115 hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 2116 } 2117 } 2118} 2119 2120/* 2121 * We cannot handle pagefaults against hugetlb pages at all. They cause 2122 * handle_mm_fault() to try to instantiate regular-sized pages in the 2123 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get 2124 * this far.
2125 */ 2126static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2127{ 2128 BUG(); 2129 return 0; 2130} 2131 2132const struct vm_operations_struct hugetlb_vm_ops = { 2133 .fault = hugetlb_vm_op_fault, 2134 .open = hugetlb_vm_op_open, 2135 .close = hugetlb_vm_op_close, 2136}; 2137 2138static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 2139 int writable) 2140{ 2141 pte_t entry; 2142 2143 if (writable) { 2144 entry = 2145 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 2146 } else { 2147 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 2148 } 2149 entry = pte_mkyoung(entry); 2150 entry = pte_mkhuge(entry); 2151 2152 return entry; 2153} 2154 2155static void set_huge_ptep_writable(struct vm_area_struct *vma, 2156 unsigned long address, pte_t *ptep) 2157{ 2158 pte_t entry; 2159 2160 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2161 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2162 update_mmu_cache(vma, address, ptep); 2163 } 2164} 2165 2166 2167int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 2168 struct vm_area_struct *vma) 2169{ 2170 pte_t *src_pte, *dst_pte, entry; 2171 struct page *ptepage; 2172 unsigned long addr; 2173 int cow; 2174 struct hstate *h = hstate_vma(vma); 2175 unsigned long sz = huge_page_size(h); 2176 2177 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2178 2179 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2180 src_pte = huge_pte_offset(src, addr); 2181 if (!src_pte) 2182 continue; 2183 dst_pte = huge_pte_alloc(dst, addr, sz); 2184 if (!dst_pte) 2185 goto nomem; 2186 2187 /* If the pagetables are shared don't copy or take references */ 2188 if (dst_pte == src_pte) 2189 continue; 2190 2191 spin_lock(&dst->page_table_lock); 2192 spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); 2193 if (!huge_pte_none(huge_ptep_get(src_pte))) { 2194 if (cow) 2195 huge_ptep_set_wrprotect(src, addr, src_pte); 2196 entry = huge_ptep_get(src_pte); 2197 ptepage = pte_page(entry); 2198 get_page(ptepage); 2199 page_dup_rmap(ptepage); 2200 set_huge_pte_at(dst, addr, dst_pte, entry); 2201 } 2202 spin_unlock(&src->page_table_lock); 2203 spin_unlock(&dst->page_table_lock); 2204 } 2205 return 0; 2206 2207nomem: 2208 return -ENOMEM; 2209} 2210 2211static int is_hugetlb_entry_migration(pte_t pte) 2212{ 2213 swp_entry_t swp; 2214 2215 if (huge_pte_none(pte) || pte_present(pte)) 2216 return 0; 2217 swp = pte_to_swp_entry(pte); 2218 if (non_swap_entry(swp) && is_migration_entry(swp)) { 2219 return 1; 2220 } else 2221 return 0; 2222} 2223 2224static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2225{ 2226 swp_entry_t swp; 2227 2228 if (huge_pte_none(pte) || pte_present(pte)) 2229 return 0; 2230 swp = pte_to_swp_entry(pte); 2231 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2232 return 1; 2233 } else 2234 return 0; 2235} 2236 2237void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2238 unsigned long end, struct page *ref_page) 2239{ 2240 struct mm_struct *mm = vma->vm_mm; 2241 unsigned long address; 2242 pte_t *ptep; 2243 pte_t pte; 2244 struct page *page; 2245 struct page *tmp; 2246 struct hstate *h = hstate_vma(vma); 2247 unsigned long sz = huge_page_size(h); 2248 2249 /* 2250 * A page gathering list, protected by per file i_mmap_lock. The 2251 * lock is used to avoid list corruption from multiple unmapping 2252 * of the same page since we are using page->lru. 
2253 */ 2254 LIST_HEAD(page_list); 2255 2256 WARN_ON(!is_vm_hugetlb_page(vma)); 2257 BUG_ON(start & ~huge_page_mask(h)); 2258 BUG_ON(end & ~huge_page_mask(h)); 2259 2260 mmu_notifier_invalidate_range_start(mm, start, end); 2261 spin_lock(&mm->page_table_lock); 2262 for (address = start; address < end; address += sz) { 2263 ptep = huge_pte_offset(mm, address); 2264 if (!ptep) 2265 continue; 2266 2267 if (huge_pmd_unshare(mm, &address, ptep)) 2268 continue; 2269 2270 /* 2271 * If a reference page is supplied, it is because a specific 2272 * page is being unmapped, not a range. Ensure the page we 2273 * are about to unmap is the actual page of interest. 2274 */ 2275 if (ref_page) { 2276 pte = huge_ptep_get(ptep); 2277 if (huge_pte_none(pte)) 2278 continue; 2279 page = pte_page(pte); 2280 if (page != ref_page) 2281 continue; 2282 2283 /* 2284 * Mark the VMA as having unmapped its page so that 2285 * future faults in this VMA will fail rather than 2286 * looking like data was lost 2287 */ 2288 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 2289 } 2290 2291 pte = huge_ptep_get_and_clear(mm, address, ptep); 2292 if (huge_pte_none(pte)) 2293 continue; 2294 2295 /* 2296 * An HWPoisoned hugepage is already unmapped and has had its reference dropped 2297 */ 2298 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2299 continue; 2300 2301 page = pte_page(pte); 2302 if (pte_dirty(pte)) 2303 set_page_dirty(page); 2304 list_add(&page->lru, &page_list); 2305 } 2306 spin_unlock(&mm->page_table_lock); 2307 flush_tlb_range(vma, start, end); 2308 mmu_notifier_invalidate_range_end(mm, start, end); 2309 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2310 page_remove_rmap(page); 2311 list_del(&page->lru); 2312 put_page(page); 2313 } 2314} 2315 2316void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2317 unsigned long end, struct page *ref_page) 2318{ 2319 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2320 __unmap_hugepage_range(vma, start, end, ref_page); 2321 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2322} 2323 2324/* 2325 * This is called when the original mapper fails to COW a MAP_PRIVATE 2326 * mapping for which it owns the reserve page. The intention is to unmap the page 2327 * from other VMAs and let the children be SIGKILLed if they are faulting the 2328 * same region. 2329 */ 2330static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 2331 struct page *page, unsigned long address) 2332{ 2333 struct hstate *h = hstate_vma(vma); 2334 struct vm_area_struct *iter_vma; 2335 struct address_space *mapping; 2336 struct prio_tree_iter iter; 2337 pgoff_t pgoff; 2338 2339 /* 2340 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 2341 * from the page cache lookup, which is in HPAGE_SIZE units. 2342 */ 2343 address = address & huge_page_mask(h); 2344 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) 2345 + (vma->vm_pgoff >> PAGE_SHIFT); 2346 mapping = (struct address_space *)page_private(page); 2347 2348 /* 2349 * Take the mapping lock for the duration of the table walk. As 2350 * this mapping should be shared between all the VMAs, 2351 * __unmap_hugepage_range() is called with the lock already held 2352 */ 2353 spin_lock(&mapping->i_mmap_lock); 2354 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2355 /* Do not unmap the current VMA */ 2356 if (iter_vma == vma) 2357 continue; 2358 2359 /* 2360 * Unmap the page from other VMAs without their own reserves. 2361 * They get marked to be SIGKILLed if they fault in these 2362 * areas.
This is because a future no-page fault on this VMA 2363 * could insert a zeroed page instead of the data existing 2364 * from the time of fork. This would look like data corruption. 2365 */ 2366 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2367 __unmap_hugepage_range(iter_vma, 2368 address, address + huge_page_size(h), 2369 page); 2370 } 2371 spin_unlock(&mapping->i_mmap_lock); 2372 2373 return 1; 2374} 2375 2376/* 2377 * hugetlb_cow() should be called with the page lock of the original hugepage held. 2378 */ 2379static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2380 unsigned long address, pte_t *ptep, pte_t pte, 2381 struct page *pagecache_page) 2382{ 2383 struct hstate *h = hstate_vma(vma); 2384 struct page *old_page, *new_page; 2385 int avoidcopy; 2386 int outside_reserve = 0; 2387 2388 old_page = pte_page(pte); 2389 2390retry_avoidcopy: 2391 /* If no-one else is actually using this page, avoid the copy 2392 * and just make the page writable */ 2393 avoidcopy = (page_mapcount(old_page) == 1); 2394 if (avoidcopy) { 2395 if (PageAnon(old_page)) 2396 page_move_anon_rmap(old_page, vma, address); 2397 set_huge_ptep_writable(vma, address, ptep); 2398 return 0; 2399 } 2400 2401 /* 2402 * If the process that created a MAP_PRIVATE mapping is about to 2403 * perform a COW due to a shared page count, attempt to satisfy 2404 * the allocation without using the existing reserves. The pagecache 2405 * page is used to determine if the reserve at this address was 2406 * consumed or not. If reserves were used, a partial faulted mapping 2407 * at the time of fork() could consume its reserves on COW instead 2408 * of the full address range. 2409 */ 2410 if (!(vma->vm_flags & VM_MAYSHARE) && 2411 is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 2412 old_page != pagecache_page) 2413 outside_reserve = 1; 2414 2415 page_cache_get(old_page); 2416 2417 /* Drop page_table_lock as buddy allocator may be called */ 2418 spin_unlock(&mm->page_table_lock); 2419 new_page = alloc_huge_page(vma, address, outside_reserve); 2420 2421 if (IS_ERR(new_page)) { 2422 page_cache_release(old_page); 2423 2424 /* 2425 * If a process owning a MAP_PRIVATE mapping fails to COW, 2426 * it is due to references held by a child and an insufficient 2427 * huge page pool. To guarantee the original mapper's 2428 * reliability, unmap the page from child processes. The child 2429 * may get SIGKILLed if it later faults. 2430 */ 2431 if (outside_reserve) { 2432 BUG_ON(huge_pte_none(pte)); 2433 if (unmap_ref_private(mm, vma, old_page, address)) { 2434 BUG_ON(page_count(old_page) != 1); 2435 BUG_ON(huge_pte_none(pte)); 2436 spin_lock(&mm->page_table_lock); 2437 goto retry_avoidcopy; 2438 } 2439 WARN_ON_ONCE(1); 2440 } 2441 2442 /* Caller expects lock to be held */ 2443 spin_lock(&mm->page_table_lock); 2444 return -PTR_ERR(new_page); 2445 } 2446 2447 /* 2448 * When the original hugepage is a shared one, it does not have 2449 * an anon_vma prepared.
2450 */ 2451 if (unlikely(anon_vma_prepare(vma))) 2452 return VM_FAULT_OOM; 2453 2454 copy_user_huge_page(new_page, old_page, address, vma); 2455 __SetPageUptodate(new_page); 2456 2457 /* 2458 * Retake the page_table_lock to check for racing updates 2459 * before the page tables are altered 2460 */ 2461 spin_lock(&mm->page_table_lock); 2462 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2463 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2464 /* Break COW */ 2465 mmu_notifier_invalidate_range_start(mm, 2466 address & huge_page_mask(h), 2467 (address & huge_page_mask(h)) + huge_page_size(h)); 2468 huge_ptep_clear_flush(vma, address, ptep); 2469 set_huge_pte_at(mm, address, ptep, 2470 make_huge_pte(vma, new_page, 1)); 2471 page_remove_rmap(old_page); 2472 hugepage_add_new_anon_rmap(new_page, vma, address); 2473 /* Make the old page be freed below */ 2474 new_page = old_page; 2475 mmu_notifier_invalidate_range_end(mm, 2476 address & huge_page_mask(h), 2477 (address & huge_page_mask(h)) + huge_page_size(h)); 2478 } 2479 page_cache_release(new_page); 2480 page_cache_release(old_page); 2481 return 0; 2482} 2483 2484/* Return the pagecache page at a given address within a VMA */ 2485static struct page *hugetlbfs_pagecache_page(struct hstate *h, 2486 struct vm_area_struct *vma, unsigned long address) 2487{ 2488 struct address_space *mapping; 2489 pgoff_t idx; 2490 2491 mapping = vma->vm_file->f_mapping; 2492 idx = vma_hugecache_offset(h, vma, address); 2493 2494 return find_lock_page(mapping, idx); 2495} 2496 2497/* 2498 * Return whether there is a pagecache page to back the given address within the VMA. 2499 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 2500 */ 2501static bool hugetlbfs_pagecache_present(struct hstate *h, 2502 struct vm_area_struct *vma, unsigned long address) 2503{ 2504 struct address_space *mapping; 2505 pgoff_t idx; 2506 struct page *page; 2507 2508 mapping = vma->vm_file->f_mapping; 2509 idx = vma_hugecache_offset(h, vma, address); 2510 2511 page = find_get_page(mapping, idx); 2512 if (page) 2513 put_page(page); 2514 return page != NULL; 2515} 2516 2517static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2518 unsigned long address, pte_t *ptep, unsigned int flags) 2519{ 2520 struct hstate *h = hstate_vma(vma); 2521 int ret = VM_FAULT_SIGBUS; 2522 pgoff_t idx; 2523 unsigned long size; 2524 struct page *page; 2525 struct address_space *mapping; 2526 pte_t new_pte; 2527 2528 /* 2529 * Currently, we are forced to kill the process in the event the 2530 * original mapper has unmapped pages from the child due to a failed 2531 * COW. Warn that such a situation has occurred as it may not be obvious 2532 */ 2533 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2534 printk(KERN_WARNING 2535 "PID %d killed due to inadequate hugepage pool\n", 2536 current->pid); 2537 return ret; 2538 } 2539 2540 mapping = vma->vm_file->f_mapping; 2541 idx = vma_hugecache_offset(h, vma, address); 2542 2543 /* 2544 * Use the page lock to guard against racing truncation 2545 * before we get page_table_lock.
2546 */ 2547retry: 2548 page = find_lock_page(mapping, idx); 2549 if (!page) { 2550 size = i_size_read(mapping->host) >> huge_page_shift(h); 2551 if (idx >= size) 2552 goto out; 2553 page = alloc_huge_page(vma, address, 0); 2554 if (IS_ERR(page)) { 2555 ret = -PTR_ERR(page); 2556 goto out; 2557 } 2558 clear_huge_page(page, address, huge_page_size(h)); 2559 __SetPageUptodate(page); 2560 2561 if (vma->vm_flags & VM_MAYSHARE) { 2562 int err; 2563 struct inode *inode = mapping->host; 2564 2565 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 2566 if (err) { 2567 put_page(page); 2568 if (err == -EEXIST) 2569 goto retry; 2570 goto out; 2571 } 2572 2573 spin_lock(&inode->i_lock); 2574 inode->i_blocks += blocks_per_huge_page(h); 2575 spin_unlock(&inode->i_lock); 2576 page_dup_rmap(page); 2577 } else { 2578 lock_page(page); 2579 if (unlikely(anon_vma_prepare(vma))) { 2580 ret = VM_FAULT_OOM; 2581 goto backout_unlocked; 2582 } 2583 hugepage_add_new_anon_rmap(page, vma, address); 2584 } 2585 } else { 2586 /* 2587 * If a memory error occurs between mmap() and fault, some processes 2588 * don't have a hwpoisoned swap entry for the errored virtual address. 2589 * So we need to block hugepage faults with a PG_hwpoison bit check. 2590 */ 2591 if (unlikely(PageHWPoison(page))) { 2592 ret = VM_FAULT_HWPOISON; 2593 goto backout_unlocked; 2594 } 2595 page_dup_rmap(page); 2596 } 2597 2598 /* 2599 * If we are going to COW a private mapping later, we examine the 2600 * pending reservations for this page now. This will ensure that 2601 * any allocations necessary to record that reservation occur outside 2602 * the spinlock. 2603 */ 2604 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 2605 if (vma_needs_reservation(h, vma, address) < 0) { 2606 ret = VM_FAULT_OOM; 2607 goto backout_unlocked; 2608 } 2609 2610 spin_lock(&mm->page_table_lock); 2611 size = i_size_read(mapping->host) >> huge_page_shift(h); 2612 if (idx >= size) 2613 goto backout; 2614 2615 ret = 0; 2616 if (!huge_pte_none(huge_ptep_get(ptep))) 2617 goto backout; 2618 2619 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 2620 && (vma->vm_flags & VM_SHARED))); 2621 set_huge_pte_at(mm, address, ptep, new_pte); 2622 2623 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 2624 /* Optimization, do the COW without a second fault */ 2625 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); 2626 } 2627 2628 spin_unlock(&mm->page_table_lock); 2629 unlock_page(page); 2630out: 2631 return ret; 2632 2633backout: 2634 spin_unlock(&mm->page_table_lock); 2635backout_unlocked: 2636 unlock_page(page); 2637 put_page(page); 2638 goto out; 2639} 2640 2641int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2642 unsigned long address, unsigned int flags) 2643{ 2644 pte_t *ptep; 2645 pte_t entry; 2646 int ret; 2647 struct page *page = NULL; 2648 struct page *pagecache_page = NULL; 2649 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2650 struct hstate *h = hstate_vma(vma); 2651 2652 ptep = huge_pte_offset(mm, address); 2653 if (ptep) { 2654 entry = huge_ptep_get(ptep); 2655 if (unlikely(is_hugetlb_entry_migration(entry))) { 2656 migration_entry_wait(mm, (pmd_t *)ptep, address); 2657 return 0; 2658 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2659 return VM_FAULT_HWPOISON; 2660 } 2661 2662 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2663 if (!ptep) 2664 return VM_FAULT_OOM; 2665 2666 /* 2667 * Serialize hugepage allocation and instantiation, so that we don't 2668 * get spurious allocation
failures if two CPUs race to instantiate 2669 * the same page in the page cache. 2670 */ 2671 mutex_lock(&hugetlb_instantiation_mutex); 2672 entry = huge_ptep_get(ptep); 2673 if (huge_pte_none(entry)) { 2674 ret = hugetlb_no_page(mm, vma, address, ptep, flags); 2675 goto out_mutex; 2676 } 2677 2678 ret = 0; 2679 2680 /* 2681 * If we are going to COW the mapping later, we examine the pending 2682 * reservations for this page now. This will ensure that any 2683 * allocations necessary to record that reservation occur outside the 2684 * spinlock. For private mappings, we also lookup the pagecache 2685 * page now as it is used to determine if a reservation has been 2686 * consumed. 2687 */ 2688 if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { 2689 if (vma_needs_reservation(h, vma, address) < 0) { 2690 ret = VM_FAULT_OOM; 2691 goto out_mutex; 2692 } 2693 2694 if (!(vma->vm_flags & VM_MAYSHARE)) 2695 pagecache_page = hugetlbfs_pagecache_page(h, 2696 vma, address); 2697 } 2698 2699 /* 2700 * hugetlb_cow() requires page locks of pte_page(entry) and 2701 * pagecache_page, so here we need to take the former one 2702 * when page != pagecache_page or !pagecache_page. 2703 * Note that the locking order is always pagecache_page -> page, 2704 * so there is no worry about deadlock. 2705 */ 2706 page = pte_page(entry); 2707 if (page != pagecache_page) 2708 lock_page(page); 2709 2710 spin_lock(&mm->page_table_lock); 2711 /* Check for a racing update before calling hugetlb_cow */ 2712 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2713 goto out_page_table_lock; 2714 2715 2716 if (flags & FAULT_FLAG_WRITE) { 2717 if (!pte_write(entry)) { 2718 ret = hugetlb_cow(mm, vma, address, ptep, entry, 2719 pagecache_page); 2720 goto out_page_table_lock; 2721 } 2722 entry = pte_mkdirty(entry); 2723 } 2724 entry = pte_mkyoung(entry); 2725 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2726 flags & FAULT_FLAG_WRITE)) 2727 update_mmu_cache(vma, address, ptep); 2728 2729out_page_table_lock: 2730 spin_unlock(&mm->page_table_lock); 2731 2732 if (pagecache_page) { 2733 unlock_page(pagecache_page); 2734 put_page(pagecache_page); 2735 } 2736 unlock_page(page); 2737 2738out_mutex: 2739 mutex_unlock(&hugetlb_instantiation_mutex); 2740 2741 return ret; 2742} 2743 2744/* Can be overridden by architectures */ 2745__attribute__((weak)) struct page * 2746follow_huge_pud(struct mm_struct *mm, unsigned long address, 2747 pud_t *pud, int write) 2748{ 2749 BUG(); 2750 return NULL; 2751} 2752 2753int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2754 struct page **pages, struct vm_area_struct **vmas, 2755 unsigned long *position, int *length, int i, 2756 unsigned int flags) 2757{ 2758 unsigned long pfn_offset; 2759 unsigned long vaddr = *position; 2760 int remainder = *length; 2761 struct hstate *h = hstate_vma(vma); 2762 2763 spin_lock(&mm->page_table_lock); 2764 while (vaddr < vma->vm_end && remainder) { 2765 pte_t *pte; 2766 int absent; 2767 struct page *page; 2768 2769 /* 2770 * Some archs (sparc64, sh*) have multiple pte_ts for 2771 * each hugepage. We have to make sure we get the 2772 * first, for the page indexing below to work. 2773 */ 2774 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2775 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 2776 2777 /* 2778 * When coredumping, it suits get_dump_page if we just return 2779 * an error where there's an empty slot with no huge pagecache 2780 * to back it.
This way, we avoid allocating a hugepage, and 2781 * the sparse dumpfile avoids allocating disk blocks, but its 2782 * huge holes still show up with zeroes where they need to be. 2783 */ 2784 if (absent && (flags & FOLL_DUMP) && 2785 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 2786 remainder = 0; 2787 break; 2788 } 2789 2790 if (absent || 2791 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { 2792 int ret; 2793 2794 spin_unlock(&mm->page_table_lock); 2795 ret = hugetlb_fault(mm, vma, vaddr, 2796 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); 2797 spin_lock(&mm->page_table_lock); 2798 if (!(ret & VM_FAULT_ERROR)) 2799 continue; 2800 2801 remainder = 0; 2802 break; 2803 } 2804 2805 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 2806 page = pte_page(huge_ptep_get(pte)); 2807same_page: 2808 if (pages) { 2809 pages[i] = mem_map_offset(page, pfn_offset); 2810 get_page(pages[i]); 2811 } 2812 2813 if (vmas) 2814 vmas[i] = vma; 2815 2816 vaddr += PAGE_SIZE; 2817 ++pfn_offset; 2818 --remainder; 2819 ++i; 2820 if (vaddr < vma->vm_end && remainder && 2821 pfn_offset < pages_per_huge_page(h)) { 2822 /* 2823 * We use pfn_offset to avoid touching the pageframes 2824 * of this compound page. 2825 */ 2826 goto same_page; 2827 } 2828 } 2829 spin_unlock(&mm->page_table_lock); 2830 *length = remainder; 2831 *position = vaddr; 2832 2833 return i ? i : -EFAULT; 2834} 2835 2836void hugetlb_change_protection(struct vm_area_struct *vma, 2837 unsigned long address, unsigned long end, pgprot_t newprot) 2838{ 2839 struct mm_struct *mm = vma->vm_mm; 2840 unsigned long start = address; 2841 pte_t *ptep; 2842 pte_t pte; 2843 struct hstate *h = hstate_vma(vma); 2844 2845 BUG_ON(address >= end); 2846 flush_cache_range(vma, address, end); 2847 2848 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2849 spin_lock(&mm->page_table_lock); 2850 for (; address < end; address += huge_page_size(h)) { 2851 ptep = huge_pte_offset(mm, address); 2852 if (!ptep) 2853 continue; 2854 if (huge_pmd_unshare(mm, &address, ptep)) 2855 continue; 2856 if (!huge_pte_none(huge_ptep_get(ptep))) { 2857 pte = huge_ptep_get_and_clear(mm, address, ptep); 2858 pte = pte_mkhuge(pte_modify(pte, newprot)); 2859 set_huge_pte_at(mm, address, ptep, pte); 2860 } 2861 } 2862 spin_unlock(&mm->page_table_lock); 2863 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2864 2865 flush_tlb_range(vma, start, end); 2866} 2867 2868int hugetlb_reserve_pages(struct inode *inode, 2869 long from, long to, 2870 struct vm_area_struct *vma, 2871 int acctflag) 2872{ 2873 long ret, chg; 2874 struct hstate *h = hstate_inode(inode); 2875 2876 /* 2877 * Only apply hugepage reservation if asked. At fault time, an 2878 * attempt will be made for VM_NORESERVE to allocate a page 2879 * and filesystem quota without using reserves 2880 */ 2881 if (acctflag & VM_NORESERVE) 2882 return 0; 2883 2884 /* 2885 * Shared mappings base their reservation on the number of pages that 2886 * are already allocated on behalf of the file. Private mappings need 2887 * to reserve the full area even if read-only as mprotect() may be 2888 * called to make the mapping read-write. 
Assume !vma is a shm mapping 2889 */ 2890 if (!vma || vma->vm_flags & VM_MAYSHARE) 2891 chg = region_chg(&inode->i_mapping->private_list, from, to); 2892 else { 2893 struct resv_map *resv_map = resv_map_alloc(); 2894 if (!resv_map) 2895 return -ENOMEM; 2896 2897 chg = to - from; 2898 2899 set_vma_resv_map(vma, resv_map); 2900 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 2901 } 2902 2903 if (chg < 0) 2904 return chg; 2905 2906 /* There must be enough filesystem quota for the mapping */ 2907 if (hugetlb_get_quota(inode->i_mapping, chg)) 2908 return -ENOSPC; 2909 2910 /* 2911 * Check that enough hugepages are available for the reservation. 2912 * Hand back the quota if there are not enough. 2913 */ 2914 ret = hugetlb_acct_memory(h, chg); 2915 if (ret < 0) { 2916 hugetlb_put_quota(inode->i_mapping, chg); 2917 return ret; 2918 } 2919 2920 /* 2921 * Account for the reservations made. Shared mappings record regions 2922 * that have reservations as they are shared by multiple VMAs. 2923 * When the last VMA disappears, the region map says how much 2924 * the reservation was and the page cache tells how much of 2925 * the reservation was consumed. Private mappings are per-VMA and 2926 * only the consumed reservations are tracked. When the VMA 2927 * disappears, the original reservation is the VMA size and the 2928 * consumed reservations are stored in the map. Hence, nothing 2929 * else has to be done for private mappings here 2930 */ 2931 if (!vma || vma->vm_flags & VM_MAYSHARE) 2932 region_add(&inode->i_mapping->private_list, from, to); 2933 return 0; 2934} 2935 2936void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2937{ 2938 struct hstate *h = hstate_inode(inode); 2939 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2940 2941 spin_lock(&inode->i_lock); 2942 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 2943 spin_unlock(&inode->i_lock); 2944 2945 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2946 hugetlb_acct_memory(h, -(chg - freed)); 2947} 2948 2949/* Should be called with hugetlb_lock held */ 2950static int is_hugepage_on_freelist(struct page *hpage) 2951{ 2952 struct page *page; 2953 struct page *tmp; 2954 struct hstate *h = page_hstate(hpage); 2955 int nid = page_to_nid(hpage); 2956 2957 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) 2958 if (page == hpage) 2959 return 1; 2960 return 0; 2961} 2962 2963#ifdef CONFIG_MEMORY_FAILURE 2964/* 2965 * This function is called from memory failure code. 2966 * Assume the caller holds the page lock of the head page. 2967 */ 2968int dequeue_hwpoisoned_huge_page(struct page *hpage) 2969{ 2970 struct hstate *h = page_hstate(hpage); 2971 int nid = page_to_nid(hpage); 2972 int ret = -EBUSY; 2973 2974 spin_lock(&hugetlb_lock); 2975 if (is_hugepage_on_freelist(hpage)) { 2976 list_del(&hpage->lru); 2977 set_page_refcounted(hpage); 2978 h->free_huge_pages--; 2979 h->free_huge_pages_node[nid]--; 2980 ret = 0; 2981 } 2982 spin_unlock(&hugetlb_lock); 2983 return ret; 2984} 2985#endif 2986
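To show how the paths above fit together from the user's side: mmap() of a file on a hugetlbfs mount reserves huge pages up front through hugetlb_reserve_pages(), and the first touch of each page is instantiated through hugetlb_fault() and hugetlb_no_page(). The sketch below is illustrative only and not part of hugetlb.c; it assumes a hugetlbfs mount at /dev/hugepages and 2 MB huge pages, both conventional but not guaranteed on every system:

/* Illustrative userspace sketch (not kernel code): map and touch one huge
 * page backed by hugetlbfs.  The mmap() call goes through
 * hugetlb_reserve_pages(); the memset() faults the page in through
 * hugetlb_fault()/hugetlb_no_page() above. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumes 2 MB huge pages */

int main(void)
{
	int fd = open("/dev/hugepages/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0) {
		perror("open");		/* no hugetlbfs mount at the assumed path? */
		return EXIT_FAILURE;
	}
	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");		/* e.g. empty pool: the reservation failed */
		close(fd);
		return EXIT_FAILURE;
	}

	memset(p, 0, HPAGE_SIZE);	/* first touch instantiates the huge page */

	munmap(p, HPAGE_SIZE);
	close(fd);
	unlink("/dev/hugepages/example");
	return EXIT_SUCCESS;
}

If the pool cannot cover the reservation, hugetlb_acct_memory() fails and the mmap() itself returns an error; with VM_NORESERVE the mapping succeeds and any shortfall only surfaces later, when the fault path cannot allocate a page.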