hugetlb.c revision c79fb75e5a514a5a35f22c229042aa29f4237e3a
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
int hugetlb_dynamic_pool;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
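		/*
		 * Surplus pages were allocated dynamically from the buddy
		 * allocator to back a temporary reservation, so give them
		 * back to the buddy allocator rather than parking them on
		 * the hugepage free lists; this is how the pool shrinks
		 * back to its configured size as pages become free.
		 */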
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping);
	set_page_private(page, 0);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;

	/* Check if the dynamic pool is enabled */
	if (!hugetlb_dynamic_pool)
		return NULL;

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		surplus_huge_pages++;
		surplus_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
	}

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
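 * Called with hugetlb_lock held; the lock is dropped while surplus pages
 * are allocated from the buddy allocator and retaken before the counters
 * are rechecked, which is why 'needed' is recomputed on each pass.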
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0)
		return 0;

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.
	 */
	needed += allocated;
	ret = 0;
free:
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		list_del(&page->lru);
		if ((--needed) >= 0)
			enqueue_huge_page(page);
		else {
			/*
			 * Decrement the refcount and free the page using its
			 * destructor.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			spin_unlock(&hugetlb_lock);
			put_page(page);
			spin_lock(&hugetlb_lock);
		}
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
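 * Called with hugetlb_lock held.  Surplus pages are freed round-robin
 * across the online nodes, keeping the per-node surplus counts balanced
 * in the same way adjust_pool_surplus() does.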
 */
void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
		}
	}
}


static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page;
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page)
		page = alloc_buddy_huge_page(vma, addr);
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);
	if (page) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) vma->vm_file->f_mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

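/*
 * "Persistent" huge pages are those requested by the administrator
 * (via the nr_hugepages sysctl or the "hugepages=" boot parameter);
 * surplus pages are excluded because they return themselves to the
 * buddy allocator once freed (see free_huge_page()).
 */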
#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)

static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
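 * (In practice, hugetlb faults are dispatched to hugetlb_fault() before
 * the generic fault path would call this handler, so this is a safety
 * net rather than a working fault handler.)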
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
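	 *
	 * Taking i_mmap_lock here also serializes concurrent unmappers,
	 * which is what protects the page gathering list used by
	 * __unmap_hugepage_range() above.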
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}
	if (hugetlb_get_quota(vma->vm_file->f_mapping))
		return VM_FAULT_SIGBUS;

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
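	 *
	 * If add_to_page_cache() below fails with -EEXIST, another thread
	 * instantiated this index first and we retry so that
	 * find_lock_page() picks up that page instead.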
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
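		 * That is why the lookup below masks vaddr with HPAGE_MASK;
		 * pfn_offset then selects the correct subpage within the
		 * huge page.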
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation.
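	 * Doing the allocation here means the later region_add() call for
	 * the same range only needs to extend an existing entry, so it
	 * cannot fail and the reservation recorded here cannot be lost.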
	 */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. Applications can still potentially be OOM'ed by the
	 * kernel for lack of free htlb pages in the cpuset that the task is in.
	 * Attempting to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpuset is so fluid that a task or
	 * memory node can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
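	 *
	 * Note that gather_surplus_pages() may drop and retake
	 * hugetlb_lock while it grows the pool from the buddy allocator.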
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node))
			goto out;
	}

	ret = 0;
	resv_huge_pages += delta;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}