hugetlb.c revision a1de09195b294c6a4c5dec8c8defd0a2688d3f75
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}
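
/*
 * For illustration: the final put_page() on a hugepage lands here via the
 * compound-page destructor installed by alloc_fresh_huge_page_node() or
 * alloc_buddy_huge_page().  If the page's node still has surplus pages
 * outstanding, the page goes straight back to the buddy allocator and the
 * pool shrinks; otherwise it is requeued on the per-node free list.  For
 * example, with surplus_huge_pages_node[0] == 1, the next hugepage freed
 * on node 0 is released to the buddy allocator and the surplus counters
 * drop to zero; pages freed after that are requeued normally.
 */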

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}
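
/*
 * Illustration of the round-robin walk above: each call allocates at most
 * one hugepage and then advances hugetlb_next_nid, so on a two-node system
 * repeated calls try node 0, 1, 0, 1, ...  A request such as
 * "echo 4 > /proc/sys/vm/nr_hugepages" therefore ends up with two pages on
 * each node, assuming every allocation succeeds.  If allocation fails on
 * every online node in one full pass, the function returns 0.
 */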

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		list_del(&page->lru);
		if ((--needed) >= 0)
			enqueue_huge_page(page);
		else {
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			spin_unlock(&hugetlb_lock);
			free_huge_page(page);
			spin_lock(&hugetlb_lock);
		}
	}

	return ret;
}
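
/*
 * Worked example for the arithmetic above: with resv_huge_pages == 2,
 * free_huge_pages == 4 and delta == 3, needed = (2 + 3) - 4 = 1, so one
 * surplus page is requested from the buddy allocator.  If another thread
 * consumed two free pages while the lock was dropped, the recalculation
 * gives needed = (2 + 3) - (2 + 1) = 2 and we retry for the shortfall.
 * Any over-allocation is trimmed in the 'free' loop, where pages beyond
 * 'needed' are handed back through free_huge_page().
 */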

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
		}
	}
}


static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}
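
/*
 * In short: for VM_MAYSHARE mappings both the quota and the pages were
 * reserved up front by hugetlb_reserve_pages(), so the shared path only
 * dequeues a page (dequeue_huge_page_vma() also drops resv_huge_pages for
 * such VMAs).  Private mappings are charged quota here, at fault time, and
 * may fall back to alloc_buddy_huge_page() when the free pool is fully
 * reserved.  page->private remembers the mapping so that free_huge_page()
 * can return the quota later.
 */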

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls is changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}
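
/*
 * Example of the resize logic above: with 8 persistent and 2 surplus pages,
 * writing 12 to vm.nr_hugepages first converts the 2 surplus pages into
 * persistent ones (two adjust_pool_surplus(-1) calls) and then allocates 2
 * fresh hugepages.  Writing 5 instead frees unreserved free pages down to
 * the target where possible and marks the remainder as surplus, so in-use
 * pages are only returned to the buddy allocator as they become free.
 */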

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}
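
/*
 * The gathered pages are only put_page()'d after flush_tlb_range(): dropping
 * the references first could free a hugepage for reuse while another CPU
 * still holds a stale TLB entry pointing at it.  Batching the pages on
 * page_list and deferring the puts until after the flush closes that window.
 */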

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}
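
/*
 * Note on the pte_same() check above: page_table_lock is dropped around
 * copy_huge_page() because copying a whole hugepage may sleep.  The PTE can
 * change in that window (for instance if the page is unmapped), so it is
 * looked up again and compared with the value we faulted on; the new page
 * is only installed if nothing changed, otherwise it is simply released.
 */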

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}
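
/*
 * Worked example for the region bookkeeping: with an existing reservation
 * covering [0, 4), region_chg(head, 2, 6) finds that region, rounds the
 * requested range's left edge down to 0, and returns (6 - 0) - (4 - 0) = 2,
 * the number of additional pages to charge.  Once the accounting succeeds,
 * region_add(head, 2, 6) merges everything into a single [0, 6) entry.
 * region_chg() only computes the charge (inserting a zero-sized placeholder
 * when the requested range does not overlap any existing region); the map
 * itself is grown by region_add().
 */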

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still potentially be OOM'ed by
	 * the kernel for lack of free hugetlb pages in the cpuset that the
	 * task is in. Attempting to enforce strict accounting with cpuset is
	 * almost impossible (or too ugly) because cpuset is so fluid that
	 * a task or memory node can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}
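
/*
 * Typical reservation lifecycle (sketch; the callers live in hugetlbfs):
 * mmap() of a hugetlbfs file calls hugetlb_reserve_pages(), which charges
 * the filesystem quota, grows the pool via hugetlb_acct_memory() and
 * gather_surplus_pages() when needed, and records the range in the inode's
 * region list.  Truncating or removing the file undoes this through
 * hugetlb_unreserve_pages().
 */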