hugetlb.c revision 480eccf9ae1073b87bb4fe118971fbf134a5bc61
1/*
2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
4 */
5#include <linux/gfp.h>
6#include <linux/list.h>
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
12#include <linux/nodemask.h>
13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
16#include <linux/mutex.h>
17
18#include <asm/page.h>
19#include <asm/pgtable.h>
20
21#include <linux/hugetlb.h>
22#include "internal.h"
23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
29static unsigned int free_huge_pages_node[MAX_NUMNODES];
30static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
31unsigned long hugepages_treat_as_movable;
32
33/*
34 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
35 */
36static DEFINE_SPINLOCK(hugetlb_lock);
37
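/*
 * Clearing or copying a huge page touches HPAGE_SIZE/PAGE_SIZE base pages,
 * which can amount to several megabytes, so cond_resched() between pages
 * to keep scheduling latency down.
 */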
38static void clear_huge_page(struct page *page, unsigned long addr)
39{
40	int i;
41
42	might_sleep();
43	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
44		cond_resched();
45		clear_user_highpage(page + i, addr);
46	}
47}
48
49static void copy_huge_page(struct page *dst, struct page *src,
50			   unsigned long addr, struct vm_area_struct *vma)
51{
52	int i;
53
54	might_sleep();
55	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
56		cond_resched();
57		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
58	}
59}
60
61static void enqueue_huge_page(struct page *page)
62{
63	int nid = page_to_nid(page);
64	list_add(&page->lru, &hugepage_freelists[nid]);
65	free_huge_pages++;
66	free_huge_pages_node[nid]++;
67}
68
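/*
 * Take a free huge page off the per-node free lists, walking the zonelist
 * selected by the mempolicy of @vma at @address and skipping zones that the
 * current cpuset does not allow.  Returns NULL if no suitable page is free.
 * Called with hugetlb_lock held.
 */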
69static struct page *dequeue_huge_page(struct vm_area_struct *vma,
70				unsigned long address)
71{
72	int nid;
73	struct page *page = NULL;
74	struct mempolicy *mpol;
75	struct zonelist *zonelist = huge_zonelist(vma, address,
76					htlb_alloc_mask, &mpol);
77	struct zone **z;
78
79	for (z = zonelist->zones; *z; z++) {
80		nid = zone_to_nid(*z);
81		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
82		    !list_empty(&hugepage_freelists[nid])) {
83			page = list_entry(hugepage_freelists[nid].next,
84					  struct page, lru);
85			list_del(&page->lru);
86			free_huge_pages--;
87			free_huge_pages_node[nid]--;
88			break;
89		}
90	}
91	mpol_free(mpol);	/* unref if mpol !NULL */
92	return page;
93}
94
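/*
 * Compound-page destructor installed by alloc_fresh_huge_page(): invoked
 * when the last reference to a huge page is dropped, it returns the page
 * to the hugepage free lists rather than to the buddy allocator.
 */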
95static void free_huge_page(struct page *page)
96{
97	BUG_ON(page_count(page));
98
99	INIT_LIST_HEAD(&page->lru);
100
101	spin_lock(&hugetlb_lock);
102	enqueue_huge_page(page);
103	spin_unlock(&hugetlb_lock);
104}
105
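/*
 * Allocate one fresh huge page from the buddy allocator, spreading
 * allocations across the online nodes in round-robin fashion, and release
 * it straight into the hugepage free lists via its compound destructor.
 */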
106static int alloc_fresh_huge_page(void)
107{
108	static int prev_nid;
109	struct page *page;
110	int nid;
111
112	/*
113	 * Copy static prev_nid to local nid, work on that, then copy it
114	 * back to prev_nid afterwards: otherwise there's a window in which
115	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
116	 * But we don't need to use a spin_lock here: it really doesn't
117	 * matter if occasionally a racer chooses the same nid as we do.
118	 */
119	nid = next_node(prev_nid, node_online_map);
120	if (nid == MAX_NUMNODES)
121		nid = first_node(node_online_map);
122	prev_nid = nid;
123
124	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
125					HUGETLB_PAGE_ORDER);
126	if (page) {
127		set_compound_page_dtor(page, free_huge_page);
128		spin_lock(&hugetlb_lock);
129		nr_huge_pages++;
130		nr_huge_pages_node[page_to_nid(page)]++;
131		spin_unlock(&hugetlb_lock);
132		put_page(page); /* free it into the hugepage allocator */
133		return 1;
134	}
135	return 0;
136}
137
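/*
 * Allocate a huge page for a fault in @vma.  Shared (VM_MAYSHARE) mappings
 * consume one of the pages set aside in resv_huge_pages; private mappings
 * may only take pages that are not needed to back an existing reservation
 * (i.e. while free_huge_pages > resv_huge_pages).
 */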
138static struct page *alloc_huge_page(struct vm_area_struct *vma,
139				    unsigned long addr)
140{
141	struct page *page;
142
143	spin_lock(&hugetlb_lock);
144	if (vma->vm_flags & VM_MAYSHARE)
145		resv_huge_pages--;
146	else if (free_huge_pages <= resv_huge_pages)
147		goto fail;
148
149	page = dequeue_huge_page(vma, addr);
150	if (!page)
151		goto fail;
152
153	spin_unlock(&hugetlb_lock);
154	set_page_refcounted(page);
155	return page;
156
157fail:
158	if (vma->vm_flags & VM_MAYSHARE)
159		resv_huge_pages++;
160	spin_unlock(&hugetlb_lock);
161	return NULL;
162}
163
164static int __init hugetlb_init(void)
165{
166	unsigned long i;
167
168	if (HPAGE_SHIFT == 0)
169		return 0;
170
171	for (i = 0; i < MAX_NUMNODES; ++i)
172		INIT_LIST_HEAD(&hugepage_freelists[i]);
173
174	for (i = 0; i < max_huge_pages; ++i) {
175		if (!alloc_fresh_huge_page())
176			break;
177	}
178	max_huge_pages = free_huge_pages = nr_huge_pages = i;
179	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
180	return 0;
181}
182module_init(hugetlb_init);
183
184static int __init hugetlb_setup(char *s)
185{
186	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
187		max_huge_pages = 0;
188	return 1;
189}
190__setup("hugepages=", hugetlb_setup);
191
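/*
 * Sum a per-node counter array over the nodes allowed to the current
 * task's cpuset; used by hugetlb_reserve_pages() to check a reservation
 * against the pages actually reachable from this cpuset.
 */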
192static unsigned int cpuset_mems_nr(unsigned int *array)
193{
194	int node;
195	unsigned int nr = 0;
196
197	for_each_node_mask(node, cpuset_current_mems_allowed)
198		nr += array[node];
199
200	return nr;
201}
202
203#ifdef CONFIG_SYSCTL
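/*
 * Return a huge page to the buddy allocator: adjust the counters, clear
 * any lingering page flags on each constituent base page, drop the
 * compound destructor and free the whole HUGETLB_PAGE_ORDER block.
 * Called with hugetlb_lock held.
 */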
204static void update_and_free_page(struct page *page)
205{
206	int i;
207	nr_huge_pages--;
208	nr_huge_pages_node[page_to_nid(page)]--;
209	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
210		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
211				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
212				1 << PG_private | 1<< PG_writeback);
213	}
214	set_compound_page_dtor(page, NULL);
215	set_page_refcounted(page);
216	__free_pages(page, HUGETLB_PAGE_ORDER);
217}
218
219#ifdef CONFIG_HIGHMEM
220static void try_to_free_low(unsigned long count)
221{
222	int i;
223
224	for (i = 0; i < MAX_NUMNODES; ++i) {
225		struct page *page, *next;
226		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
227			if (PageHighMem(page))
228				continue;
229			list_del(&page->lru);
230			update_and_free_page(page);
231			free_huge_pages--;
232			free_huge_pages_node[page_to_nid(page)]--;
233			if (count >= nr_huge_pages)
234				return;
235		}
236	}
237}
238#else
239static inline void try_to_free_low(unsigned long count)
240{
241}
242#endif
243
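/*
 * Grow or shrink the pool to @count huge pages.  Growing simply allocates
 * fresh pages until the target (or an allocation failure) is reached;
 * shrinking never goes below the reserved page count, and on HIGHMEM
 * configurations releases lowmem pages first via try_to_free_low().
 */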
244static unsigned long set_max_huge_pages(unsigned long count)
245{
246	while (count > nr_huge_pages) {
247		if (!alloc_fresh_huge_page())
248			return nr_huge_pages;
249	}
250	if (count >= nr_huge_pages)
251		return nr_huge_pages;
252
253	spin_lock(&hugetlb_lock);
254	count = max(count, resv_huge_pages);
255	try_to_free_low(count);
256	while (count < nr_huge_pages) {
257		struct page *page = dequeue_huge_page(NULL, 0);
258		if (!page)
259			break;
260		update_and_free_page(page);
261	}
262	spin_unlock(&hugetlb_lock);
263	return nr_huge_pages;
264}
265
266int hugetlb_sysctl_handler(struct ctl_table *table, int write,
267			   struct file *file, void __user *buffer,
268			   size_t *length, loff_t *ppos)
269{
270	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
271	max_huge_pages = set_max_huge_pages(max_huge_pages);
272	return 0;
273}
274
275int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
276			struct file *file, void __user *buffer,
277			size_t *length, loff_t *ppos)
278{
279	proc_dointvec(table, write, file, buffer, length, ppos);
280	if (hugepages_treat_as_movable)
281		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
282	else
283		htlb_alloc_mask = GFP_HIGHUSER;
284	return 0;
285}
286
287#endif /* CONFIG_SYSCTL */
288
289int hugetlb_report_meminfo(char *buf)
290{
291	return sprintf(buf,
292			"HugePages_Total: %5lu\n"
293			"HugePages_Free:  %5lu\n"
294			"HugePages_Rsvd:  %5lu\n"
295			"Hugepagesize:    %5lu kB\n",
296			nr_huge_pages,
297			free_huge_pages,
298			resv_huge_pages,
299			HPAGE_SIZE/1024);
300}
301
302int hugetlb_report_node_meminfo(int nid, char *buf)
303{
304	return sprintf(buf,
305		"Node %d HugePages_Total: %5u\n"
306		"Node %d HugePages_Free:  %5u\n",
307		nid, nr_huge_pages_node[nid],
308		nid, free_huge_pages_node[nid]);
309}
310
311/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
312unsigned long hugetlb_total_pages(void)
313{
314	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
315}
316
317/*
318 * We cannot handle pagefaults against hugetlb pages at all.  They cause
319 * handle_mm_fault() to try to instantiate regular-sized pages in the
320 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
321 * this far.
322 */
323static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
324{
325	BUG();
326	return 0;
327}
328
329struct vm_operations_struct hugetlb_vm_ops = {
330	.fault = hugetlb_vm_op_fault,
331};
332
333static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
334				int writable)
335{
336	pte_t entry;
337
338	if (writable) {
339		entry =
340		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
341	} else {
342		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
343	}
344	entry = pte_mkyoung(entry);
345	entry = pte_mkhuge(entry);
346
347	return entry;
348}
349
350static void set_huge_ptep_writable(struct vm_area_struct *vma,
351				   unsigned long address, pte_t *ptep)
352{
353	pte_t entry;
354
355	entry = pte_mkwrite(pte_mkdirty(*ptep));
356	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
357		update_mmu_cache(vma, address, entry);
358		lazy_mmu_prot_update(entry);
359	}
360}
361
362
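/*
 * Duplicate the huge ptes of @vma from the parent page tables (@src) into
 * the child's (@dst) at fork time.  For private, writable mappings the
 * source pte is write-protected first, so both processes will fault into
 * hugetlb_cow() on their next write.
 */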
363int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
364			    struct vm_area_struct *vma)
365{
366	pte_t *src_pte, *dst_pte, entry;
367	struct page *ptepage;
368	unsigned long addr;
369	int cow;
370
371	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
372
373	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
374		src_pte = huge_pte_offset(src, addr);
375		if (!src_pte)
376			continue;
377		dst_pte = huge_pte_alloc(dst, addr);
378		if (!dst_pte)
379			goto nomem;
380		spin_lock(&dst->page_table_lock);
381		spin_lock(&src->page_table_lock);
382		if (!pte_none(*src_pte)) {
383			if (cow)
384				ptep_set_wrprotect(src, addr, src_pte);
385			entry = *src_pte;
386			ptepage = pte_page(entry);
387			get_page(ptepage);
388			set_huge_pte_at(dst, addr, dst_pte, entry);
389		}
390		spin_unlock(&src->page_table_lock);
391		spin_unlock(&dst->page_table_lock);
392	}
393	return 0;
394
395nomem:
396	return -ENOMEM;
397}
398
399void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
400			    unsigned long end)
401{
402	struct mm_struct *mm = vma->vm_mm;
403	unsigned long address;
404	pte_t *ptep;
405	pte_t pte;
406	struct page *page;
407	struct page *tmp;
408	/*
409	 * A page gathering list, protected by the per-file i_mmap_lock. The
410	 * lock is used to avoid list corruption from multiple unmappings
411	 * of the same page, since we are using page->lru.
412	 */
413	LIST_HEAD(page_list);
414
415	WARN_ON(!is_vm_hugetlb_page(vma));
416	BUG_ON(start & ~HPAGE_MASK);
417	BUG_ON(end & ~HPAGE_MASK);
418
419	spin_lock(&mm->page_table_lock);
420	for (address = start; address < end; address += HPAGE_SIZE) {
421		ptep = huge_pte_offset(mm, address);
422		if (!ptep)
423			continue;
424
425		if (huge_pmd_unshare(mm, &address, ptep))
426			continue;
427
428		pte = huge_ptep_get_and_clear(mm, address, ptep);
429		if (pte_none(pte))
430			continue;
431
432		page = pte_page(pte);
433		if (pte_dirty(pte))
434			set_page_dirty(page);
435		list_add(&page->lru, &page_list);
436	}
437	spin_unlock(&mm->page_table_lock);
438	flush_tlb_range(vma, start, end);
439	list_for_each_entry_safe(page, tmp, &page_list, lru) {
440		list_del(&page->lru);
441		put_page(page);
442	}
443}
444
445void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
446			  unsigned long end)
447{
448	/*
449	 * It is undesirable to test vma->vm_file, as it should be non-NULL
450	 * for a valid hugetlb area. However, vm_file will be NULL in the error
451	 * cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap method fails,
452	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
453	 * to clean up. Since no pte has actually been set up, it is safe to
454	 * do nothing in this case.
455	 */
456	if (vma->vm_file) {
457		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
458		__unmap_hugepage_range(vma, start, end);
459		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
460	}
461}
462
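/*
 * Handle a write fault on a huge page that is mapped read-only.  If we hold
 * the only reference, simply make the existing pte writable; otherwise
 * allocate a fresh huge page, copy the contents into it, and install it
 * once we have re-checked that the pte did not change while we slept.
 * Called with mm->page_table_lock held.
 */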
463static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
464			unsigned long address, pte_t *ptep, pte_t pte)
465{
466	struct page *old_page, *new_page;
467	int avoidcopy;
468
469	old_page = pte_page(pte);
470
471	/* If no-one else is actually using this page, avoid the copy
472	 * and just make the page writable */
473	avoidcopy = (page_count(old_page) == 1);
474	if (avoidcopy) {
475		set_huge_ptep_writable(vma, address, ptep);
476		return 0;
477	}
478
479	page_cache_get(old_page);
480	new_page = alloc_huge_page(vma, address);
481
482	if (!new_page) {
483		page_cache_release(old_page);
484		return VM_FAULT_OOM;
485	}
486
487	spin_unlock(&mm->page_table_lock);
488	copy_huge_page(new_page, old_page, address, vma);
489	spin_lock(&mm->page_table_lock);
490
491	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
492	if (likely(pte_same(*ptep, pte))) {
493		/* Break COW */
494		set_huge_pte_at(mm, address, ptep,
495				make_huge_pte(vma, new_page, 1));
496		/* Make the old page be freed below */
497		new_page = old_page;
498	}
499	page_cache_release(new_page);
500	page_cache_release(old_page);
501	return 0;
502}
503
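/*
 * Fault in a huge page that has no pte yet: look it up in (or add it to)
 * the page cache for shared mappings, or allocate an anonymous copy for
 * private mappings.  The page lock and a re-check of i_size guard against
 * racing truncation; -EEXIST from add_to_page_cache() means another thread
 * inserted the page first, so retry the lookup.
 */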
504static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
505			unsigned long address, pte_t *ptep, int write_access)
506{
507	int ret = VM_FAULT_SIGBUS;
508	unsigned long idx;
509	unsigned long size;
510	struct page *page;
511	struct address_space *mapping;
512	pte_t new_pte;
513
514	mapping = vma->vm_file->f_mapping;
515	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
516		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
517
518	/*
519	 * Use the page lock to guard against racing truncation
520	 * before we take the page_table_lock.
521	 */
522retry:
523	page = find_lock_page(mapping, idx);
524	if (!page) {
525		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
526		if (idx >= size)
527			goto out;
528		if (hugetlb_get_quota(mapping))
529			goto out;
530		page = alloc_huge_page(vma, address);
531		if (!page) {
532			hugetlb_put_quota(mapping);
533			ret = VM_FAULT_OOM;
534			goto out;
535		}
536		clear_huge_page(page, address);
537
538		if (vma->vm_flags & VM_SHARED) {
539			int err;
540
541			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
542			if (err) {
543				put_page(page);
544				hugetlb_put_quota(mapping);
545				if (err == -EEXIST)
546					goto retry;
547				goto out;
548			}
549		} else
550			lock_page(page);
551	}
552
553	spin_lock(&mm->page_table_lock);
554	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
555	if (idx >= size)
556		goto backout;
557
558	ret = 0;
559	if (!pte_none(*ptep))
560		goto backout;
561
562	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
563				&& (vma->vm_flags & VM_SHARED)));
564	set_huge_pte_at(mm, address, ptep, new_pte);
565
566	if (write_access && !(vma->vm_flags & VM_SHARED)) {
567		/* Optimization, do the COW without a second fault */
568		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
569	}
570
571	spin_unlock(&mm->page_table_lock);
572	unlock_page(page);
573out:
574	return ret;
575
576backout:
577	spin_unlock(&mm->page_table_lock);
578	hugetlb_put_quota(mapping);
579	unlock_page(page);
580	put_page(page);
581	goto out;
582}
583
584int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
585			unsigned long address, int write_access)
586{
587	pte_t *ptep;
588	pte_t entry;
589	int ret;
590	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
591
592	ptep = huge_pte_alloc(mm, address);
593	if (!ptep)
594		return VM_FAULT_OOM;
595
596	/*
597	 * Serialize hugepage allocation and instantiation, so that we don't
598	 * get spurious allocation failures if two CPUs race to instantiate
599	 * the same page in the page cache.
600	 */
601	mutex_lock(&hugetlb_instantiation_mutex);
602	entry = *ptep;
603	if (pte_none(entry)) {
604		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
605		mutex_unlock(&hugetlb_instantiation_mutex);
606		return ret;
607	}
608
609	ret = 0;
610
611	spin_lock(&mm->page_table_lock);
612	/* Check for a racing update before calling hugetlb_cow */
613	if (likely(pte_same(entry, *ptep)))
614		if (write_access && !pte_write(entry))
615			ret = hugetlb_cow(mm, vma, address, ptep, entry);
616	spin_unlock(&mm->page_table_lock);
617	mutex_unlock(&hugetlb_instantiation_mutex);
618
619	return ret;
620}
621
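/*
 * get_user_pages() worker for hugetlb VMAs: walk @vma from *position in
 * PAGE_SIZE steps, faulting huge pages in as needed and handing back the
 * individual base pages of each huge page via the pfn_offset indexing.
 */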
622int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
623			struct page **pages, struct vm_area_struct **vmas,
624			unsigned long *position, int *length, int i)
625{
626	unsigned long pfn_offset;
627	unsigned long vaddr = *position;
628	int remainder = *length;
629
630	spin_lock(&mm->page_table_lock);
631	while (vaddr < vma->vm_end && remainder) {
632		pte_t *pte;
633		struct page *page;
634
635		/*
636		 * Some archs (sparc64, sh*) have multiple pte_ts for
637		 * each hugepage.  We have to make sure we get the
638		 * first, for the page indexing below to work.
639		 */
640		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
641
642		if (!pte || pte_none(*pte)) {
643			int ret;
644
645			spin_unlock(&mm->page_table_lock);
646			ret = hugetlb_fault(mm, vma, vaddr, 0);
647			spin_lock(&mm->page_table_lock);
648			if (!(ret & VM_FAULT_ERROR))
649				continue;
650
651			remainder = 0;
652			if (!i)
653				i = -EFAULT;
654			break;
655		}
656
657		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
658		page = pte_page(*pte);
659same_page:
660		if (pages) {
661			get_page(page);
662			pages[i] = page + pfn_offset;
663		}
664
665		if (vmas)
666			vmas[i] = vma;
667
668		vaddr += PAGE_SIZE;
669		++pfn_offset;
670		--remainder;
671		++i;
672		if (vaddr < vma->vm_end && remainder &&
673				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
674			/*
675			 * We use pfn_offset to avoid touching the pageframes
676			 * of this compound page.
677			 */
678			goto same_page;
679		}
680	}
681	spin_unlock(&mm->page_table_lock);
682	*length = remainder;
683	*position = vaddr;
684
685	return i;
686}
687
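/*
 * mprotect() support for hugetlb VMAs: rewrite every present huge pte in
 * [address, end) with the new protection bits, under the mapping's
 * i_mmap_lock and mm->page_table_lock, then flush the TLB for the range.
 */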
688void hugetlb_change_protection(struct vm_area_struct *vma,
689		unsigned long address, unsigned long end, pgprot_t newprot)
690{
691	struct mm_struct *mm = vma->vm_mm;
692	unsigned long start = address;
693	pte_t *ptep;
694	pte_t pte;
695
696	BUG_ON(address >= end);
697	flush_cache_range(vma, address, end);
698
699	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
700	spin_lock(&mm->page_table_lock);
701	for (; address < end; address += HPAGE_SIZE) {
702		ptep = huge_pte_offset(mm, address);
703		if (!ptep)
704			continue;
705		if (huge_pmd_unshare(mm, &address, ptep))
706			continue;
707		if (!pte_none(*ptep)) {
708			pte = huge_ptep_get_and_clear(mm, address, ptep);
709			pte = pte_mkhuge(pte_modify(pte, newprot));
710			set_huge_pte_at(mm, address, ptep, pte);
711			lazy_mmu_prot_update(pte);
712		}
713	}
714	spin_unlock(&mm->page_table_lock);
715	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
716
717	flush_tlb_range(vma, start, end);
718}
719
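/*
 * Reservation map: a sorted list of [from, to) file_region ranges on the
 * inode's mapping->private_list, recording which regions of the file
 * already carry a huge page reservation.  region_chg() reports how many
 * new pages a proposed [f, t) range would need, allocating any required
 * record up front so that the later commit cannot fail; region_add()
 * commits the range; region_truncate() drops everything past a given
 * offset.
 *
 * For example, with [0, 4) already recorded, region_chg(head, 2, 6)
 * returns 2: offsets 2 and 3 are already reserved, only 4 and 5 are new.
 */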
720struct file_region {
721	struct list_head link;
722	long from;
723	long to;
724};
725
726static long region_add(struct list_head *head, long f, long t)
727{
728	struct file_region *rg, *nrg, *trg;
729
730	/* Locate the region we are either in or before. */
731	list_for_each_entry(rg, head, link)
732		if (f <= rg->to)
733			break;
734
735	/* Round our left edge to the current segment if it encloses us. */
736	if (f > rg->from)
737		f = rg->from;
738
739	/* Check for and consume any regions we now overlap with. */
740	nrg = rg;
741	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
742		if (&rg->link == head)
743			break;
744		if (rg->from > t)
745			break;
746
747		/* If this area reaches higher, then extend our area to
748		 * include it completely.  If this is not the first area
749		 * (which we intend to reuse), free it. */
750		if (rg->to > t)
751			t = rg->to;
752		if (rg != nrg) {
753			list_del(&rg->link);
754			kfree(rg);
755		}
756	}
757	nrg->from = f;
758	nrg->to = t;
759	return 0;
760}
761
762static long region_chg(struct list_head *head, long f, long t)
763{
764	struct file_region *rg, *nrg;
765	long chg = 0;
766
767	/* Locate the region we are before or in. */
768	list_for_each_entry(rg, head, link)
769		if (f <= rg->to)
770			break;
771
772	/* If we are below the current region then a new region is required.
773	 * Subtle: allocate a new region at the position, but make it zero
774	 * size such that we can guarantee to record the reservation. */
775	if (&rg->link == head || t < rg->from) {
776		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
777		if (nrg == 0)
778			return -ENOMEM;
779		nrg->from = f;
780		nrg->to   = f;
781		INIT_LIST_HEAD(&nrg->link);
782		list_add(&nrg->link, rg->link.prev);
783
784		return t - f;
785	}
786
787	/* Round our left edge to the current segment if it encloses us. */
788	if (f > rg->from)
789		f = rg->from;
790	chg = t - f;
791
792	/* Check for and consume any regions we now overlap with. */
793	list_for_each_entry(rg, rg->link.prev, link) {
794		if (&rg->link == head)
795			break;
796		if (rg->from > t)
797			return chg;
798
799		/* We overlap with this area; if it extends further than
800		 * us then we must extend ourselves.  Account for its
801		 * existing reservation. */
802		if (rg->to > t) {
803			chg += rg->to - t;
804			t = rg->to;
805		}
806		chg -= rg->to - rg->from;
807	}
808	return chg;
809}
810
811static long region_truncate(struct list_head *head, long end)
812{
813	struct file_region *rg, *trg;
814	long chg = 0;
815
816	/* Locate the region we are either in or before. */
817	list_for_each_entry(rg, head, link)
818		if (end <= rg->to)
819			break;
820	if (&rg->link == head)
821		return 0;
822
823	/* If we are in the middle of a region then adjust it. */
824	if (end > rg->from) {
825		chg = rg->to - end;
826		rg->to = end;
827		rg = list_entry(rg->link.next, typeof(*rg), link);
828	}
829
830	/* Drop any remaining regions. */
831	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
832		if (&rg->link == head)
833			break;
834		chg += rg->to - rg->from;
835		list_del(&rg->link);
836		kfree(rg);
837	}
838	return chg;
839}
840
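/*
 * Charge @delta huge pages against the reservation pool.  The request is
 * granted only if, after the increase, every outstanding reservation could
 * still be satisfied from the pages currently on the free lists.
 */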
841static int hugetlb_acct_memory(long delta)
842{
843	int ret = -ENOMEM;
844
845	spin_lock(&hugetlb_lock);
846	if ((delta + resv_huge_pages) <= free_huge_pages) {
847		resv_huge_pages += delta;
848		ret = 0;
849	}
850	spin_unlock(&hugetlb_lock);
851	return ret;
852}
853
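/*
 * Reserve huge pages for the file range [from, to): region_chg() works out
 * how many pages are newly needed, hugetlb_acct_memory() charges them
 * against the free pool, and region_add() records the range in the inode's
 * reservation map.
 */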
854int hugetlb_reserve_pages(struct inode *inode, long from, long to)
855{
856	long ret, chg;
857
858	chg = region_chg(&inode->i_mapping->private_list, from, to);
859	if (chg < 0)
860		return chg;
861	/*
862	 * When cpusets are configured, they break strict hugetlb page
863	 * reservation, as the accounting is done on a global variable. Such
864	 * a reservation is largely useless in the presence of cpusets because
865	 * it is not checked against page availability for the
866	 * current cpuset. An application can still be OOM-killed by the kernel
867	 * for lack of free hugetlb pages in the cpuset that the task is in.
868	 * Attempting to enforce strict accounting with cpusets is nearly
869	 * impossible (or too ugly) because cpusets are too fluid: tasks and
870	 * memory nodes can be dynamically moved between cpusets.
871	 *
872	 * Changing the semantics of shared hugetlb mappings with cpusets is
873	 * undesirable. However, in order to preserve some of the semantics,
874	 * we fall back to checking against the current free page availability,
875	 * as a best attempt to minimize the impact of the changed
876	 * semantics that cpusets introduce.
877	 */
878	if (chg > cpuset_mems_nr(free_huge_pages_node))
879		return -ENOMEM;
880
881	ret = hugetlb_acct_memory(chg);
882	if (ret < 0)
883		return ret;
884	region_add(&inode->i_mapping->private_list, from, to);
885	return 0;
886}
887
888void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
889{
890	long chg = region_truncate(&inode->i_mapping->private_list, offset);
891	hugetlb_acct_memory(freed - chg);
892}
893