hugetlb.c revision 19fc3f0acde32636529969570055c7e2a744787c
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

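/*
 * Compound-page destructor, installed via set_compound_page_dtor() when a
 * huge page is created.  put_page() on the last reference ends up here:
 * surplus pages are handed back to the buddy allocator, while persistent
 * pages are returned to the per-node free lists.
 */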
static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}

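/*
 * Allocate a surplus huge page on demand, directly from the buddy allocator,
 * as opposed to the persistent pages set aside by hugetlb_init() or
 * set_max_huge_pages().  The allocation is only attempted while the number
 * of surplus pages stays below nr_overcommit_huge_pages.
 */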
static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

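/*
 * Example of the bookkeeping in gather_surplus_pages() below: with
 * resv_huge_pages == 2, free_huge_pages == 3 and delta == 5, needed is
 * (2 + 5) - 3 = 4, so four surplus pages are requested from
 * alloc_buddy_huge_page().  The counters are then re-read under
 * hugetlb_lock; if another thread consumed free pages in the meantime,
 * 'needed' becomes positive again and we retry, otherwise any
 * over-allocation is released back to the buddy allocator.
 */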
/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
			remaining_iterations = num_online_nodes();
		}
	}
}

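/*
 * Huge page allocation is split by mapping type, see alloc_huge_page()
 * below.  Shared (VM_MAYSHARE) mappings are already covered by a
 * reservation and quota taken in hugetlb_reserve_pages(), so they only
 * dequeue a free page here.  Private mappings charge quota now and may
 * fall back to a surplus page from alloc_buddy_huge_page() when no
 * unreserved free page is available.
 */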
static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page_vma(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
	return page;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

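/*
 * "Persistent" huge pages are the pages in the pool that are not surplus
 * pages, i.e. the pool size requested via max_huge_pages.
 * set_max_huge_pages() resizes this persistent pool, converting pages to
 * and from surplus state where possible before allocating or freeing.
 */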
#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls is changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for a valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

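/*
 * Handle a fault on a not-present huge pte.  The faulting address is
 * converted to a huge-page-sized index into the backing file (idx below),
 * which is used both to look up or insert the page in the page cache and
 * to check against i_size so that we do not instantiate pages beyond a
 * truncation point.
 */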
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

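/*
 * The remainder of this file tracks file-level reservations.  Each inode's
 * mapping->private_list holds a sorted list of file_region structures, each
 * describing a reserved range [from, to).  region_chg() reports how many
 * additional pages a new range would reserve, region_add() merges the range
 * into the list, and region_truncate() drops everything from a given offset
 * onwards.
 *
 * For example, with [0, 2) already reserved, region_chg(head, 1, 4) returns
 * 2 (offsets 2 and 3 are new) and a subsequent region_add(head, 1, 4) leaves
 * a single region [0, 4).
 */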
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

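/*
 * Adjust the hugetlb pool for a reservation change of 'delta' pages: a
 * positive delta gathers surplus pages as needed (failing with -ENOMEM if
 * that is not possible), a negative delta releases unused surplus pages
 * back to the buddy allocator.
 */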
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still potentially be OOM-killed
	 * by the kernel if there is a lack of free hugetlb pages in the
	 * cpuset that the task is in.
	 * Attempting to enforce strict accounting with cpusets is almost
	 * impossible (or too ugly) because cpusets are too fluid: tasks or
	 * memory nodes can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}