hugetlb.c revision 04f2cbe35699d22dbf428373682ead85ca1240f5
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

#define HPAGE_RESV_OWNER	(1UL << (BITS_PER_LONG - 1))
#define HPAGE_RESV_UNMAPPED	(1UL << (BITS_PER_LONG - 2))
#define HPAGE_RESV_MASK		(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 */
static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_SHARED))
		return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
	return 0;
}

static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
							unsigned long reserve)
{
	unsigned long flags;
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	VM_BUG_ON(vma->vm_flags & VM_SHARED);

	flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
	vma->vm_private_data = (void *)(reserve | flags);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	unsigned long reserveflags = (unsigned long)vma->vm_private_data;
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	vma->vm_private_data = (void *)(reserveflags | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	return ((unsigned long)vma->vm_private_data & flag) != 0;
}
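
/*
 * Illustrative example of the vm_private_data encoding above: on a
 * 64-bit kernel, a MAP_PRIVATE mapping that owns a reservation of
 * three huge pages stores
 *
 *	vma->vm_private_data == (void *)(3UL | HPAGE_RESV_OWNER)
 *
 * so vma_resv_huge_pages() masks off the two flag bits and returns 3,
 * while is_vma_resv_set(vma, HPAGE_RESV_OWNER) returns true.
 */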

/* Decrement the reserved pages in the hugepage pool by one */
static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHARED) {
		/* Shared mappings always use reserves */
		resv_huge_pages--;
	} else {
		/*
		 * Only the process that called mmap() has reserves for
		 * private mappings.
		 */
		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
			unsigned long flags, reserve;
			resv_huge_pages--;
			flags = (unsigned long)vma->vm_private_data &
							HPAGE_RESV_MASK;
			reserve = (unsigned long)vma->vm_private_data - 1;
			vma->vm_private_data = (void *)(reserve | flags);
		}
	}
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_SHARED))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static int vma_has_private_reserves(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHARED)
		return 0;
	if (!vma_resv_huge_pages(vma))
		return 0;
	return 1;
}

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed.
	 */
	if (!vma_has_private_reserves(vma) &&
			free_huge_pages - resv_huge_pages == 0)
		return NULL;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
		return NULL;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;

			if (!avoid_reserve)
				decrement_hugepage_resv_vma(vma);

			break;
		}
	}
	mpol_cond_put(mpol);
	return page;
}
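
/*
 * Worked example of the reserve check above: with free_huge_pages == 5
 * and resv_huge_pages == 5, every free page is already spoken for. A
 * faulting VMA without private reserves (e.g. a child after fork())
 * gets NULL here rather than consuming a page that a reservation
 * holder is guaranteed; the owner of a reserve proceeds and then
 * releases one unit of reservation via decrement_hugepage_resv_vma().
 */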

static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages. Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
				__GFP_REPEAT|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		if (arch_prepare_hugepage(page)) {
			__free_pages(page, HUGETLB_PAGE_ORDER);
			return NULL;
		}
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do. Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}
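
/*
 * The round-robin pattern above recurs throughout this file: advance
 * nid with next_node() and wrap with first_node() once MAX_NUMNODES is
 * reached. For example, with only nodes 0 and 2 online, successive
 * calls to alloc_fresh_huge_page() attempt node 0, then node 2, then
 * node 0 again, spreading the persistent pool evenly across nodes.
 */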

static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
	unsigned int nid;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
					__GFP_REPEAT|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);

	spin_lock(&hugetlb_lock);
	if (page) {
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
		nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	}
	spin_unlock(&hugetlb_lock);

	return page;
}
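
/*
 * Counter-accounting sketch for the optimistic scheme above: the
 * global counters are bumped *before* alloc_pages() and rolled back on
 * failure, so a concurrent allocator that already sees
 * surplus_huge_pages == nr_overcommit_huge_pages backs off immediately
 * instead of both racing past the limit. Only the per-node counters
 * wait until the page's node is actually known.
 */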

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0) {
		resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation. Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator. Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page. This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}
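
/*
 * Worked example for gather_surplus_pages(): with free_huge_pages == 2,
 * resv_huge_pages == 1 and delta == 4, needed starts as (1 + 4) - 2 = 3
 * surplus pages. If, while the lock was dropped, another thread freed a
 * huge page (free_huge_pages becomes 3), the recalculation gives
 * (1 + 4) - (3 + 3) = -1: one of the three freshly allocated pages is
 * surplus to requirements and is handed back to the buddy allocator.
 */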

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
			remaining_iterations = num_online_nodes();
		}
	}
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr, int avoid_reserve)
{
	struct page *page;
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned int chg = 0;

	/*
	 * Processes that did not create the mapping will have no reserves
	 * and will not have been accounted against quota. Check that the
	 * quota can be charged before satisfying the allocation.
	 */
	if (!(vma->vm_flags & VM_SHARED) &&
			!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		chg = 1;
		if (hugetlb_get_quota(inode->i_mapping, chg))
			return ERR_PTR(-ENOSPC);
	}

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
	spin_unlock(&hugetlb_lock);

	if (!page) {
		page = alloc_buddy_huge_page(vma, addr);
		if (!page) {
			hugetlb_put_quota(inode->i_mapping, chg);
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}

	set_page_refcounted(page);
	set_page_private(page, (unsigned long) mapping);

	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);
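
/*
 * Boot-time sizing example: booting with "hugepages=64" makes
 * hugetlb_setup() set max_huge_pages to 64, and hugetlb_init() then
 * attempts 64 fresh huge page allocations, trimming max_huge_pages
 * down to however many actually succeeded.
 */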

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state. Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;
	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations). Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	spin_lock(&hugetlb_lock);
	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
	spin_unlock(&hugetlb_lock);
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n"
		"Node %d HugePages_Surp:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid],
		nid, surplus_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
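
/*
 * Example of the shrink floor in set_max_huge_pages(): with
 * nr_huge_pages == 10, free_huge_pages == 4 and resv_huge_pages == 2,
 * min_count is 2 + 10 - 4 = 8, i.e. the 6 pages in active use plus the
 * 2 reserved ones. Even if the admin requests count == 0, only free,
 * unreserved pages are released immediately; the remainder are pushed
 * into surplus state and trickle out as they are freed.
 */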

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * a reservation is completely rubbish in the presence of cpusets
	 * because the reservation is not checked against page availability
	 * for the current cpuset. An application can still potentially be
	 * OOM'ed by the kernel for lack of free htlb pages in the cpuset
	 * the task is in. Attempting to enforce strict accounting with
	 * cpuset is almost impossible (or too ugly) because cpusets are
	 * too fluid: a task or memory node can be dynamically moved
	 * between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
			return_unused_surplus_pages(delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
	unsigned long reserve = vma_resv_huge_pages(vma);
	if (reserve)
		hugetlb_acct_memory(-reserve);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all. They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
	.close = hugetlb_vm_op_close,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
		if (!huge_pte_none(huge_ptep_get(src_pte))) {
			if (cow)
				huge_ptep_set_wrprotect(src, addr, src_pte);
			entry = huge_ptep_get(src_pte);
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}
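
/*
 * Example for the 'cow' test above: a MAP_PRIVATE PROT_READ|PROT_WRITE
 * mapping has VM_MAYWRITE set but VM_SHARED clear, so
 * (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE holds and the
 * parent's PTEs are write-protected at fork(). A MAP_SHARED mapping
 * fails the test and keeps its PTEs writable, since both mms
 * legitimately write the same page.
 */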

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, struct page *ref_page)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by the per-file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		/*
		 * If a reference page is supplied, it is because a specific
		 * page is being unmapped, not a range. Ensure the page we
		 * are about to unmap is the actual page of interest.
		 */
		if (ref_page) {
			pte = huge_ptep_get(ptep);
			if (huge_pte_none(pte))
				continue;
			page = pte_page(pte);
			if (page != ref_page)
				continue;

			/*
			 * Mark the VMA as having unmapped its page so that
			 * future faults in this VMA will fail rather than
			 * looking like data was lost
			 */
			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
		}

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (huge_pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end, struct page *ref_page)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-NULL
	 * for a valid hugetlb area. However, vm_file will be NULL in the
	 * error cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap
	 * method fails, do_mmap_pgoff() nullifies vma->vm_file before
	 * calling this function to clean up. Since no pte has actually
	 * been setup, it is safe to do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end, ref_page);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}
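
/*
 * Usage note: callers tearing down a whole mapping (munmap, truncate)
 * pass ref_page == NULL so that every page in [start, end) is
 * unmapped, while unmap_ref_private() below passes the one page it
 * needs back, turning the range walk into a targeted single-page
 * unmap.
 */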

/*
 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 * mapping it owns the reserve page for. The intention is to unmap the page
 * from other VMAs and let the children be SIGKILLed if they are faulting the
 * same region.
 */
int unmap_ref_private(struct mm_struct *mm,
					struct vm_area_struct *vma,
					struct page *page,
					unsigned long address)
{
	struct vm_area_struct *iter_vma;
	struct address_space *mapping;
	struct prio_tree_iter iter;
	pgoff_t pgoff;

	/*
	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
	 * from the page cache lookup, which is in HPAGE_SIZE units.
	 */
	address = address & huge_page_mask(hstate_vma(vma));
	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
		+ (vma->vm_pgoff >> PAGE_SHIFT);
	mapping = (struct address_space *)page_private(page);

	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* Do not unmap the current VMA */
		if (iter_vma == vma)
			continue;

		/*
		 * Unmap the page from other VMAs without their own reserves.
		 * They get marked to be SIGKILLed if they fault in these
		 * areas. This is because a future no-page fault on this VMA
		 * could insert a zeroed page instead of the data existing
		 * from the time of fork. This would look like data corruption
		 */
		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
			unmap_hugepage_range(iter_vma,
				address, address + HPAGE_SIZE,
				page);
	}

	return 1;
}
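
/*
 * Fork scenario illustrating the walk above: parent P mmap()s a
 * MAP_PRIVATE hugetlb region (and so holds HPAGE_RESV_OWNER), then
 * forks child C. When P must COW a page but no huge page is free, the
 * prio tree walk over mapping->i_mmap finds C's VMA, which lacks
 * HPAGE_RESV_OWNER, and unmaps the shared page there. The COW then
 * retries and, with the page count back to one, simply makes the
 * existing page writable; C is flagged HPAGE_RESV_UNMAPPED and will be
 * killed if it faults that page again.
 */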

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte,
			struct page *pagecache_page)
{
	struct page *old_page, *new_page;
	int avoidcopy;
	int outside_reserve = 0;

	old_page = pte_page(pte);

retry_avoidcopy:
	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	/*
	 * If the process that created a MAP_PRIVATE mapping is about to
	 * perform a COW due to a shared page count, attempt to satisfy
	 * the allocation without using the existing reserves. The pagecache
	 * page is used to determine if the reserve at this address was
	 * consumed or not. If reserves were used, a partial faulted mapping
	 * at the time of fork() could consume its reserves on COW instead
	 * of the full address range.
	 */
	if (!(vma->vm_flags & VM_SHARED) &&
			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
			old_page != pagecache_page)
		outside_reserve = 1;

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address, outside_reserve);

	if (IS_ERR(new_page)) {
		page_cache_release(old_page);

		/*
		 * If a process owning a MAP_PRIVATE mapping fails to COW,
		 * it is due to references held by a child and an insufficient
		 * huge page pool. To guarantee the original mapper's
		 * reliability, unmap the page from child processes. The child
		 * may get SIGKILLed if it later faults.
		 */
		if (outside_reserve) {
			BUG_ON(huge_pte_none(pte));
			if (unmap_ref_private(mm, vma, old_page, address)) {
				BUG_ON(page_count(old_page) != 1);
				BUG_ON(huge_pte_none(pte));
				goto retry_avoidcopy;
			}
			WARN_ON_ONCE(1);
		}

		return -PTR_ERR(new_page);
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
		/* Break COW */
		huge_ptep_clear_flush(vma, address, ptep);
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

/* Return the pagecache page at a given address within a VMA */
static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
			unsigned long address)
{
	struct address_space *mapping;
	unsigned long idx;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	return find_lock_page(mapping, idx);
}
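
/*
 * Index arithmetic example for hugetlbfs_pagecache_page(): with 4K base
 * pages (PAGE_SHIFT == 12) and 2M huge pages (HPAGE_SHIFT == 21), a
 * fault at vma->vm_start + 4M in a mapping with vm_pgoff == 0 yields
 * idx = (4M >> 21) + 0 = 2, i.e. the third huge page of the file. The
 * vm_pgoff term is scaled by HPAGE_SHIFT - PAGE_SHIFT because vm_pgoff
 * counts base pages while the hugetlbfs page cache is indexed in huge
 * pages.
 */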

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	/*
	 * Currently, we are forced to kill the process in the event the
	 * original mapper has unmapped pages from the child due to a failed
	 * COW. Warn that such a situation has occurred as it may not be
	 * obvious.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
		printk(KERN_WARNING
			"PID %d killed due to inadequate hugepage pool\n",
			current->pid);
		return ret;
	}

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		page = alloc_huge_page(vma, address, 0);
		if (IS_ERR(page)) {
			ret = -PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, address);
		__SetPageUptodate(page);

		if (vma->vm_flags & VM_SHARED) {
			int err;
			struct inode *inode = mapping->host;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}

			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
			spin_unlock(&inode->i_lock);
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!huge_pte_none(huge_ptep_get(ptep)))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = huge_ptep_get(ptep);
	if (huge_pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, huge_ptep_get(ptep))))
		if (write_access && !pte_write(entry)) {
			struct page *page;
			page = hugetlbfs_pagecache_page(vma, address);
			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
			if (page) {
				unlock_page(page);
				put_page(page);
			}
		}
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}
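
/*
 * Race sketch for the instantiation mutex above: without it, two CPUs
 * faulting the same file offset could each call alloc_huge_page(); one
 * add_to_page_cache() would fail with -EEXIST and its page be thrown
 * away, but on a nearly-full pool the first allocation could consume
 * the last huge page and make the second fault fail spuriously even
 * though only one page was ever needed.
 */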

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage. We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
		    (write && !pte_write(huge_ptep_get(pte)))) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(huge_ptep_get(pte));
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!huge_pte_none(huge_ptep_get(ptep))) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely. If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}
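
/*
 * Worked example of the reserve map: suppose the list holds one region
 * [0, 4). region_chg(head, 2, 6) (below) finds that [2, 4) is already
 * reserved and returns a charge of 2 for the uncovered [4, 6); a later
 * region_add(head, 2, 6) then merges the span into a single [0, 6)
 * region. region_chg()/region_add() are always used as such a pair:
 * first cost the reservation, then commit it.
 */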

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves. Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

int hugetlb_reserve_pages(struct inode *inode,
					long from, long to,
					struct vm_area_struct *vma)
{
	long ret, chg;

	/*
	 * Shared mappings base their reservation on the number of pages that
	 * are already allocated on behalf of the file. Private mappings need
	 * to reserve the full area even if read-only as mprotect() may be
	 * called to make the mapping read-write. Assume !vma is a shm mapping
	 */
	if (!vma || vma->vm_flags & VM_SHARED)
		chg = region_chg(&inode->i_mapping->private_list, from, to);
	else {
		chg = to - from;
		set_vma_resv_huge_pages(vma, chg);
		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
	}

	if (chg < 0)
		return chg;

	if (hugetlb_get_quota(inode->i_mapping, chg))
		return -ENOSPC;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0) {
		hugetlb_put_quota(inode->i_mapping, chg);
		return ret;
	}
	if (!vma || vma->vm_flags & VM_SHARED)
		region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	spin_lock(&inode->i_lock);
	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
	spin_unlock(&inode->i_lock);

	hugetlb_put_quota(inode->i_mapping, (chg - freed));
	hugetlb_acct_memory(-(chg - freed));
}
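
/*
 * Reservation lifecycle in brief: hugetlbfs mmap() calls
 * hugetlb_reserve_pages(), which costs the request with region_chg()
 * (shared) or to - from (private), charges quota, accounts the pages
 * via hugetlb_acct_memory(), and finally commits shared reservations
 * with region_add(). Truncation undoes this through
 * hugetlb_unreserve_pages(), where region_truncate() reports how many
 * reserved-but-unfaulted pages can be uncharged.
 */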