hugetlb.c revision d1c3fb1f8f29c41b0d098d7cfb3c32939043631f
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
int hugetlb_dynamic_pool;
unsigned long nr_overcommit_huge_pages;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
	set_page_private(page, 0);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/* Check if the dynamic pool is enabled */
	if (!hugetlb_dynamic_pool)
		return NULL;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0)
		return 0;

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.
	 */
	needed += allocated;
	ret = 0;
free:
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		list_del(&page->lru);
		if ((--needed) >= 0)
			enqueue_huge_page(page);
		else {
			/*
			 * Decrement the refcount and free the page using its
			 * destructor.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			spin_unlock(&hugetlb_lock);
			put_page(page);
			spin_lock(&hugetlb_lock);
		}
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
		}
	}
}


static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
		return ERR_PTR(-VM_FAULT_SIGBUS);

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page)
		page = alloc_buddy_huge_page(vma, addr);
	return page ? page : ERR_PTR(-VM_FAULT_OOM);
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);

	if (!IS_ERR(page)) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by the per-file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for a valid hugetlb area. However, vm_file will be NULL in the
	 * error cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap
	 * method fails, do_mmap_pgoff() nullifies vma->vm_file before
	 * calling this function to clean up. Since no pte has actually
	 * been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);
		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. An application can still potentially be OOM'ed by
	 * the kernel for lack of free hugetlb pages in the cpuset that the
	 * task is in. Attempting to enforce strict accounting with cpuset is
	 * almost impossible (or too ugly) because cpuset is so fluid that
	 * tasks or memory nodes can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to checking against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node))
			goto out;
	}

	ret = 0;
	resv_huge_pages += delta;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}