hugetlb.c revision fc1b8a73dd71226902a11928dd5500326e101df9
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_cond_put(mpol);
	return page;
}

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 <<
				PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		if (arch_prepare_hugepage(page)) {
			__free_pages(page, HUGETLB_PAGE_ORDER);
			return NULL;
		}
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}
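
/*
 * Example of the fresh-page handoff above: alloc_pages_node() confines the
 * allocation to the requested node with __GFP_THISNODE, retries with
 * __GFP_REPEAT and stays quiet on failure with __GFP_NOWARN.  The final
 * put_page() drops the only reference, so the compound destructor,
 * free_huge_page(), runs and places the new page on that node's free list
 * via enqueue_huge_page().
 */
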
static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
					__GFP_REPEAT|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	}
	spin_unlock(&hugetlb_lock);

	return page;
}
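
/*
 * Worked example of the overcommit check above: with
 * nr_overcommit_huge_pages = 2 and surplus_huge_pages already at 2, the
 * request is refused before the buddy allocator is even called.  With
 * surplus_huge_pages = 1 the global counters are bumped to 2 optimistically
 * and are rolled back only if alloc_pages() fails.
 */
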
/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they
 * were never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
			remaining_iterations = num_online_nodes();
		}
	}
}
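
/*
 * Example of the surplus accounting above: with free_huge_pages = 4,
 * resv_huge_pages = 1 and a new reservation of delta = 6,
 * gather_surplus_pages() computes needed = (1 + 6) - 4 = 3 and allocates
 * three surplus pages from the buddy allocator.  If that reservation is
 * later released unused, return_unused_surplus_pages() hands back at most
 * min(unused, surplus_huge_pages) of them, one node at a time.
 */
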
static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
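
/*
 * Example: with nr_huge_pages = 10 and surplus_huge_pages = 2,
 * persistent_huge_pages is 8.  set_max_huge_pages() below grows or shrinks
 * the pool until the number of persistent (non-surplus) pages matches the
 * requested count: a request for 12 first reabsorbs the two surplus pages
 * and then allocates two fresh huge pages, while a request for 6 returns
 * free pages to the buddy allocator or marks pages surplus until only six
 * persistent pages remain.
 */
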
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}
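
/*
 * The counters above are what the administrator sees: writing a value to
 * /proc/sys/vm/nr_hugepages ends up in hugetlb_sysctl_handler() and resizes
 * the pool via set_max_huge_pages(), /proc/sys/vm/nr_overcommit_hugepages
 * feeds hugetlb_overcommit_handler(), and the HugePages_* lines in
 * /proc/meminfo are produced by hugetlb_report_meminfo().
 */
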
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * a reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still potentially be OOM'ed by
	 * the kernel for lack of free htlb pages in the cpuset the task is in.
	 * Attempting to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpusets are too fluid: tasks or
	 * memory nodes can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to checking against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}
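
/*
 * Example of the accounting flow above: a new reservation of delta = 5
 * calls gather_surplus_pages(5) so that at least resv_huge_pages + 5 free
 * pages exist; if the subsequent cpuset check fails, the extra pages are
 * handed straight back with return_unused_surplus_pages(5) and -ENOMEM is
 * returned.  Releasing a reservation passes a negative delta and only
 * returns unused surplus pages.
 */
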
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
		if (!huge_pte_none(huge_ptep_get(src_pte))) {
			if (cow)
				huge_ptep_set_wrprotect(src, addr, src_pte);
			entry = huge_ptep_get(src_pte);
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (huge_pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for a valid hugetlb area.
	 * However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
		/* Break COW */
		huge_ptep_clear_flush(vma, address, ptep);
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}
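
/*
 * Example of the copy-on-write path above: if the faulting task holds the
 * only reference to the huge page (page_count() == 1), the PTE is simply
 * made writable.  Otherwise a fresh huge page is allocated, the whole
 * HPAGE_SIZE of data is copied with copy_huge_page(), and the PTE is
 * swapped under page_table_lock, provided it still matches the value seen
 * at fault time.
 */
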
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!huge_pte_none(huge_ptep_get(ptep)))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = huge_ptep_get(ptep);
	if (huge_pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, huge_ptep_get(ptep))))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}
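
/*
 * Example of the index arithmetic used by hugetlb_no_page() above: with
 * 2MB huge pages (HPAGE_SHIFT = 21, PAGE_SHIFT = 12), a fault 4MB past
 * vma->vm_start in a mapping whose vm_pgoff is 1024 small pages gives
 * idx = (4MB >> 21) + (1024 >> 9) = 2 + 2 = 4, i.e. the fifth huge page of
 * the file, which is then looked up or inserted in the page cache.
 */
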
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
		    (write && !pte_write(huge_ptep_get(pte)))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(huge_ptep_get(pte));
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!huge_pte_none(huge_ptep_get(ptep))) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}
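
/*
 * Example of the reservation map above: if the list already contains the
 * region [0, 10) and a mapping needs [5, 15), region_chg() reports that
 * only 5 additional pages must be reserved, and the following region_add()
 * merges the two into a single [0, 15) entry.  region_chg() is expected to
 * be called first for the same range, so that region_add() never needs to
 * allocate and therefore cannot fail.
 */
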
static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}
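
/*
 * Typical usage of the reservation interface above: hugetlbfs is expected
 * to call hugetlb_reserve_pages() when a shared mapping of a hugetlbfs file
 * is set up, so that the pages backing it are guaranteed to be available,
 * and hugetlb_unreserve_pages() when the file is truncated, at which point
 * any reservation beyond the pages actually faulted in is returned to the
 * pool.
 */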