hugetlb.c revision 52cd3b074050dd664380b5e8cfc85d4a6ed8ad48
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_cond_put(mpol);
	return page;
}

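/*
 * Return a hugepage to the buddy allocator: clear the subpages' flags,
 * drop the compound destructor, restore a normal reference count and
 * free the HUGETLB_PAGE_ORDER block.  Called with hugetlb_lock held.
 */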
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

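/*
 * Allocate a fresh hugepage for the static pool, trying each online node
 * in turn starting at hugetlb_next_nid.  Returns 1 on success, 0 if no
 * node could satisfy the allocation.
 */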
static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

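/*
 * Allocate a surplus hugepage directly from the buddy allocator, for use
 * when the static pool is exhausted.  The allocation is refused once
 * surplus_huge_pages would exceed nr_overcommit_huge_pages.
 */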
static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit.
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
			remaining_iterations = num_online_nodes();
		}
	}
}


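/*
 * Shared (VM_MAYSHARE) mappings have their pages and quota reserved up
 * front in hugetlb_reserve_pages(), so a shared fault only needs to take
 * a page off the free lists.  Private mappings charge quota here and may
 * fall back to a surplus page from the buddy allocator.
 */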
static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

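/*
 * "Persistent" huge pages are those in the pool that are not surplus,
 * i.e. the pages allocated at boot or via the nr_hugepages sysctl.
 */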
#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

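/*
 * Unmap the hugepages in [start, end) and drop their references.  The
 * caller must hold the mapping's i_mmap_lock; unmap_hugepage_range()
 * below is the wrapper that takes it.
 */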
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

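/*
 * Handle a fault on a hugepage that is not yet mapped: look the page up
 * in (or add it to) the page cache for shared mappings, or allocate an
 * anonymous hugepage for private ones, then install the pte.  For a
 * private write fault the COW is done immediately to avoid a second fault.
 */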
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

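/*
 * get_user_pages() backend for hugetlb VMAs: walk the range, faulting in
 * missing pages as needed, and fill pages[]/vmas[] one PAGE_SIZE subpage
 * at a time.  Returns the number of entries filled, or -EFAULT if the
 * first page could not be faulted in.
 */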
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

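/*
 * The reservations for a hugetlbfs file are tracked as a list of
 * [from, to) file_region entries hung off the mapping's private_list.
 * region_chg() reports how many new pages a reservation would need,
 * region_add() commits it, and region_truncate() drops reservations
 * beyond a given offset.
 */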
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

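/*
 * Adjust the pool for a reservation change of 'delta' huge pages: grow it
 * with surplus pages when delta is positive, and give back any surplus
 * pages that are no longer needed when delta is negative.
 */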
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpusets are configured, they break the strict hugetlb page
	 * reservation as the accounting is done on a global variable.  Such
	 * a reservation is completely rubbish in the presence of cpusets
	 * because it is not checked against page availability for the
	 * current cpuset.  An application can still be OOM-killed by the
	 * kernel when there are no free hugetlb pages left in the cpuset the
	 * task runs in.  Enforcing strict accounting with cpusets is almost
	 * impossible (or too ugly) because cpusets are so fluid that tasks
	 * and memory nodes can be moved between them dynamically.
	 *
	 * The change of semantics for shared hugetlb mappings with cpusets
	 * is undesirable.  However, in order to preserve some of the
	 * semantics, we fall back to checking against the current free page
	 * availability as a best attempt, hopefully minimizing the impact of
	 * the semantics change that cpusets introduce.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}