hugetlb.c revision ac09b3a15154af5f081fed509c6c3662e79de785
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus().  A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again).  Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page.  This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use.  It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		list_del(&page->lru);
		if ((--needed) >= 0)
			enqueue_huge_page(page);
		else {
			/*
			 * Decrement the refcount and free the page using its
			 * destructor.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			spin_unlock(&hugetlb_lock);
			put_page(page);
			spin_lock(&hugetlb_lock);
		}
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
		}
	}
}


static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still potentially be OOM'ed by
	 * the kernel for lack of free htlb pages in the cpuset that the task
	 * is in. Attempting to enforce strict accounting with cpuset is
	 * almost impossible (or too ugly) because cpusets are so fluid that
	 * tasks or memory nodes can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}