hugetlb.c revision 396faf0303d273219db5d7eb4a2879ad977ed185
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
        int i;

        might_sleep();
        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
                cond_resched();
                clear_user_highpage(page + i, addr);
        }
}

static void copy_huge_page(struct page *dst, struct page *src,
                           unsigned long addr, struct vm_area_struct *vma)
{
        int i;

        might_sleep();
        for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
                cond_resched();
                copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
        }
}

static void enqueue_huge_page(struct page *page)
{
        int nid = page_to_nid(page);
        list_add(&page->lru, &hugepage_freelists[nid]);
        free_huge_pages++;
        free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(struct vm_area_struct *vma,
                                unsigned long address)
{
        int nid;
        struct page *page = NULL;
        struct zonelist *zonelist = huge_zonelist(vma, address,
                                                htlb_alloc_mask);
        struct zone **z;

        for (z = zonelist->zones; *z; z++) {
                nid = zone_to_nid(*z);
                if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
                    !list_empty(&hugepage_freelists[nid]))
                        break;
        }

        if (*z) {
                page = list_entry(hugepage_freelists[nid].next,
                                  struct page, lru);
                list_del(&page->lru);
                free_huge_pages--;
                free_huge_pages_node[nid]--;
        }
        return page;
}

static void free_huge_page(struct page *page)
{
        BUG_ON(page_count(page));

        INIT_LIST_HEAD(&page->lru);

        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
        spin_unlock(&hugetlb_lock);
}

static int alloc_fresh_huge_page(void)
{
        static int prev_nid;
        struct page *page;
        static DEFINE_SPINLOCK(nid_lock);
        int nid;

        spin_lock(&nid_lock);
        nid = next_node(prev_nid, node_online_map);
        if (nid == MAX_NUMNODES)
                nid = first_node(node_online_map);
        prev_nid = nid;
        spin_unlock(&nid_lock);

        page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
                                        HUGETLB_PAGE_ORDER);
        if (page) {
                set_compound_page_dtor(page, free_huge_page);
                spin_lock(&hugetlb_lock);
                nr_huge_pages++;
                nr_huge_pages_node[page_to_nid(page)]++;
                spin_unlock(&hugetlb_lock);
                put_page(page); /* free it into the hugepage allocator */
                return 1;
        }
        return 0;
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr)
{
        struct page *page;

        spin_lock(&hugetlb_lock);
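        /*
         * Shared mappings take their reservation up front (see
         * hugetlb_reserve_pages() below), so a VM_MAYSHARE fault consumes
         * one page from the reserved pool here.  Private mappings hold no
         * reservation and may only use pages above the reserved count.
         */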
        if (vma->vm_flags & VM_MAYSHARE)
                resv_huge_pages--;
        else if (free_huge_pages <= resv_huge_pages)
                goto fail;

        page = dequeue_huge_page(vma, addr);
        if (!page)
                goto fail;

        spin_unlock(&hugetlb_lock);
        set_page_refcounted(page);
        return page;

fail:
        if (vma->vm_flags & VM_MAYSHARE)
                resv_huge_pages++;
        spin_unlock(&hugetlb_lock);
        return NULL;
}

static int __init hugetlb_init(void)
{
        unsigned long i;

        if (HPAGE_SHIFT == 0)
                return 0;

        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&hugepage_freelists[i]);

        for (i = 0; i < max_huge_pages; ++i) {
                if (!alloc_fresh_huge_page())
                        break;
        }
        max_huge_pages = free_huge_pages = nr_huge_pages = i;
        printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
        return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
        if (sscanf(s, "%lu", &max_huge_pages) <= 0)
                max_huge_pages = 0;
        return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
        int node;
        unsigned int nr = 0;

        for_each_node_mask(node, cpuset_current_mems_allowed)
                nr += array[node];

        return nr;
}

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
        int i;
        nr_huge_pages--;
        nr_huge_pages_node[page_to_nid(page)]--;
        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_reserved |
                                1 << PG_private | 1 << PG_writeback);
        }
        page[1].lru.next = NULL;
        set_page_refcounted(page);
        __free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
        int i;

        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
                        if (PageHighMem(page))
                                continue;
                        list_del(&page->lru);
                        update_and_free_page(page);
                        free_huge_pages--;
                        free_huge_pages_node[page_to_nid(page)]--;
                        if (count >= nr_huge_pages)
                                return;
                }
        }
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

static unsigned long set_max_huge_pages(unsigned long count)
{
        while (count > nr_huge_pages) {
                if (!alloc_fresh_huge_page())
                        return nr_huge_pages;
        }
        if (count >= nr_huge_pages)
                return nr_huge_pages;

        spin_lock(&hugetlb_lock);
        count = max(count, resv_huge_pages);
        try_to_free_low(count);
        while (count < nr_huge_pages) {
                struct page *page = dequeue_huge_page(NULL, 0);
                if (!page)
                        break;
                update_and_free_page(page);
        }
        spin_unlock(&hugetlb_lock);
        return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
{
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
        max_huge_pages = set_max_huge_pages(max_huge_pages);
        return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
                        struct file *file, void __user *buffer,
                        size_t *length, loff_t *ppos)
{
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (hugepages_treat_as_movable)
                htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
        else
                htlb_alloc_mask = GFP_HIGHUSER;
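        /*
         * The updated mask only affects huge pages allocated or dequeued
         * after this point (alloc_fresh_huge_page() and the zonelist used
         * by dequeue_huge_page()); pages already in the pool stay where
         * they are.
         */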
        return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
        return sprintf(buf,
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
                        "HugePages_Rsvd:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
                        resv_huge_pages,
                        HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
        return sprintf(buf,
                "Node %d HugePages_Total: %5u\n"
                "Node %d HugePages_Free:  %5u\n",
                nid, nr_huge_pages_node[nid],
                nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
                                unsigned long address, int *unused)
{
        BUG();
        return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
        .nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                                int writable)
{
        pte_t entry;

        if (writable) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry;

        entry = pte_mkwrite(pte_mkdirty(*ptep));
        if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
                update_mmu_cache(vma, address, entry);
                lazy_mmu_prot_update(entry);
        }
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;

        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
                spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
                if (!pte_none(*src_pte)) {
                        if (cow)
                                ptep_set_wrprotect(src, addr, src_pte);
                        entry = *src_pte;
                        ptepage = pte_page(entry);
                        get_page(ptepage);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
                spin_unlock(&src->page_table_lock);
                spin_unlock(&dst->page_table_lock);
        }
        return 0;

nomem:
        return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                            unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        struct page *page;
        struct page *tmp;
        /*
         * A page gathering list, protected by per file i_mmap_lock.  The
         * lock is used to avoid list corruption from multiple unmapping
         * of the same page since we are using page->lru.
         */
        LIST_HEAD(page_list);

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);

        spin_lock(&mm->page_table_lock);
        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;

                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;

                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (pte_dirty(pte))
                        set_page_dirty(page);
                list_add(&page->lru, &page_list);
        }
        spin_unlock(&mm->page_table_lock);
        flush_tlb_range(vma, start, end);
        list_for_each_entry_safe(page, tmp, &page_list, lru) {
                list_del(&page->lru);
                put_page(page);
        }
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end)
{
        /*
         * It is undesirable to test vma->vm_file as it should be non-NULL
         * for a valid hugetlb area.  However, vm_file will be NULL in the
         * error cleanup path of do_mmap_pgoff.  When the hugetlbfs ->mmap
         * method fails, do_mmap_pgoff() nullifies vma->vm_file before
         * calling this function to clean up.  Since no pte has actually
         * been set up, it is safe to do nothing in this case.
         */
        if (vma->vm_file) {
                spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
                __unmap_hugepage_range(vma, start, end);
                spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
        }
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte)
{
        struct page *old_page, *new_page;
        int avoidcopy;

        old_page = pte_page(pte);

        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable. */
        avoidcopy = (page_count(old_page) == 1);
        if (avoidcopy) {
                set_huge_ptep_writable(vma, address, ptep);
                return VM_FAULT_MINOR;
        }

        page_cache_get(old_page);
        new_page = alloc_huge_page(vma, address);

        if (!new_page) {
                page_cache_release(old_page);
                return VM_FAULT_OOM;
        }

        spin_unlock(&mm->page_table_lock);
        copy_huge_page(new_page, old_page, address, vma);
        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
        if (likely(pte_same(*ptep, pte))) {
                /* Break COW */
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
                /* Make the old page be freed below */
                new_page = old_page;
        }
        page_cache_release(new_page);
        page_cache_release(old_page);
        return VM_FAULT_MINOR;
}

int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
{
        int ret = VM_FAULT_SIGBUS;
        unsigned long idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;

        mapping = vma->vm_file->f_mapping;
        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

        /*
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
                size = i_size_read(mapping->host) >> HPAGE_SHIFT;
                if (idx >= size)
                        goto out;
                if (hugetlb_get_quota(mapping))
                        goto out;
                page = alloc_huge_page(vma, address);
                if (!page) {
                        hugetlb_put_quota(mapping);
                        ret = VM_FAULT_OOM;
                        goto out;
                }
                clear_huge_page(page, address);

                if (vma->vm_flags & VM_SHARED) {
                        int err;

                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
                        if (err) {
                                put_page(page);
                                hugetlb_put_quota(mapping);
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
                        }
                } else
                        lock_page(page);
        }

        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
        if (idx >= size)
                goto backout;

        ret = VM_FAULT_MINOR;
        if (!pte_none(*ptep))
                goto backout;

        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);

        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
        }

        spin_unlock(&mm->page_table_lock);
        unlock_page(page);
out:
        return ret;

backout:
        spin_unlock(&mm->page_table_lock);
        hugetlb_put_quota(mapping);
        unlock_page(page);
        put_page(page);
        goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access)
{
        pte_t *ptep;
        pte_t entry;
        int ret;
        static DEFINE_MUTEX(hugetlb_instantiation_mutex);

        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
                return VM_FAULT_OOM;

        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
        mutex_lock(&hugetlb_instantiation_mutex);
        entry = *ptep;
        if (pte_none(entry)) {
                ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
                mutex_unlock(&hugetlb_instantiation_mutex);
                return ret;
        }

        ret = VM_FAULT_MINOR;

        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, *ptep)))
                if (write_access && !pte_write(entry))
                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
        spin_unlock(&mm->page_table_lock);
        mutex_unlock(&hugetlb_instantiation_mutex);

        return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i)
{
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
        int remainder = *length;

        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                struct page *page;

                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
                 * first, for the page indexing below to work.
                 */
                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

                if (!pte || pte_none(*pte)) {
                        int ret;

                        spin_unlock(&mm->page_table_lock);
                        ret = hugetlb_fault(mm, vma, vaddr, 0);
                        spin_lock(&mm->page_table_lock);
                        if (ret == VM_FAULT_MINOR)
                                continue;

                        remainder = 0;
                        if (!i)
                                i = -EFAULT;
                        break;
                }

                pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
                page = pte_page(*pte);
same_page:
                if (pages) {
                        get_page(page);
                        pages[i] = page + pfn_offset;
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++pfn_offset;
                --remainder;
                ++i;
                if (vaddr < vma->vm_end && remainder &&
                                pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
                        /*
                         * We use pfn_offset to avoid touching the pageframes
                         * of this compound page.
                         */
                        goto same_page;
                }
        }
        spin_unlock(&mm->page_table_lock);
        *length = remainder;
        *position = vaddr;

        return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;

        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);

        spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
        spin_lock(&mm->page_table_lock);
        for (; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;
                if (!pte_none(*ptep)) {
                        pte = huge_ptep_get_and_clear(mm, address, ptep);
                        pte = pte_mkhuge(pte_modify(pte, newprot));
                        set_huge_pte_at(mm, address, ptep, pte);
                        lazy_mmu_prot_update(pte);
                }
        }
        spin_unlock(&mm->page_table_lock);
        spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

        flush_tlb_range(vma, start, end);
}

struct file_region {
        struct list_head link;
        long from;
        long to;
};

static long region_add(struct list_head *head, long f, long t)
{
        struct file_region *rg, *nrg, *trg;

        /* Locate the region we are either in or before. */
        list_for_each_entry(rg, head, link)
                if (f <= rg->to)
                        break;

        /* Round our left edge to the current segment if it encloses us. */
        if (f > rg->from)
                f = rg->from;

        /* Check for and consume any regions we now overlap with. */
        nrg = rg;
        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
                if (&rg->link == head)
                        break;
                if (rg->from > t)
                        break;

                /* If this area reaches higher, then extend our area to
                 * include it completely.  If this is not the first area
                 * which we intend to reuse, free it. */
                if (rg->to > t)
                        t = rg->to;
                if (rg != nrg) {
                        list_del(&rg->link);
                        kfree(rg);
                }
        }
        nrg->from = f;
        nrg->to = t;
        return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
        struct file_region *rg, *nrg;
        long chg = 0;

        /* Locate the region we are before or in. */
        list_for_each_entry(rg, head, link)
                if (f <= rg->to)
                        break;

        /* If we are below the current region then a new region is required.
         * Subtle: allocate a new region at the position but make it zero
         * size, such that we can guarantee to record the reservation. */
        if (&rg->link == head || t < rg->from) {
                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                if (!nrg)
                        return -ENOMEM;
                nrg->from = f;
                nrg->to = f;
                INIT_LIST_HEAD(&nrg->link);
                list_add(&nrg->link, rg->link.prev);

                return t - f;
        }

        /* Round our left edge to the current segment if it encloses us. */
        if (f > rg->from)
                f = rg->from;
        chg = t - f;

        /* Check for and consume any regions we now overlap with. */
        list_for_each_entry(rg, rg->link.prev, link) {
                if (&rg->link == head)
                        break;
                if (rg->from > t)
                        return chg;

                /* We overlap with this area; if it extends further than
                 * us then we must extend ourselves.  Account for its
                 * existing reservation. */
                if (rg->to > t) {
                        chg += rg->to - t;
                        t = rg->to;
                }
                chg -= rg->to - rg->from;
        }
        return chg;
}

static long region_truncate(struct list_head *head, long end)
{
        struct file_region *rg, *trg;
        long chg = 0;

        /* Locate the region we are either in or before. */
        list_for_each_entry(rg, head, link)
                if (end <= rg->to)
                        break;
        if (&rg->link == head)
                return 0;

        /* If we are in the middle of a region then adjust it. */
        if (end > rg->from) {
                chg = rg->to - end;
                rg->to = end;
                rg = list_entry(rg->link.next, typeof(*rg), link);
        }

        /* Drop any remaining regions. */
        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
                if (&rg->link == head)
                        break;
                chg += rg->to - rg->from;
                list_del(&rg->link);
                kfree(rg);
        }
        return chg;
}

static int hugetlb_acct_memory(long delta)
{
        int ret = -ENOMEM;

        spin_lock(&hugetlb_lock);
        if ((delta + resv_huge_pages) <= free_huge_pages) {
                resv_huge_pages += delta;
                ret = 0;
        }
        spin_unlock(&hugetlb_lock);
        return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
        long ret, chg;

        chg = region_chg(&inode->i_mapping->private_list, from, to);
        if (chg < 0)
                return chg;
        /*
         * When cpusets are configured, they break the strict hugetlb page
         * reservation because the accounting is done on a global variable.
         * Such a reservation is completely rubbish in the presence of
         * cpusets, since it is not checked against page availability for
         * the current cpuset.  An application can still potentially be
         * OOM'ed by the kernel for lack of free hugetlb pages in the
         * cpuset the task is in.  Attempting to enforce strict accounting
         * with cpusets is almost impossible (or too ugly), because cpusets
         * are too fluid: tasks and memory nodes can be moved between
         * cpusets dynamically.
         *
         * The change of semantics for shared hugetlb mappings with cpusets
         * is undesirable.  However, in order to preserve some of the
         * semantics, we fall back to checking against the current free
         * page count as a best attempt, hopefully minimizing the impact
         * of the semantics change that cpusets bring.
         */
        if (chg > cpuset_mems_nr(free_huge_pages_node))
                return -ENOMEM;

        ret = hugetlb_acct_memory(chg);
        if (ret < 0)
                return ret;
        region_add(&inode->i_mapping->private_list, from, to);
        return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
        long chg = region_truncate(&inode->i_mapping->private_list, offset);
        hugetlb_acct_memory(freed - chg);
}