hugetlb.c revision 396faf0303d273219db5d7eb4a2879ad977ed185
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

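/*
 * Take a free huge page off a node's free list, following the zonelist
 * (and thus the mempolicy and cpuset constraints) for the faulting vma
 * and address.  Returns NULL if no allowed node has a free huge page.
 * Must be called with hugetlb_lock held.
 */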
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						htlb_alloc_mask);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

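/*
 * Compound page destructor, installed by alloc_fresh_huge_page(): when the
 * last reference to a huge page is dropped, return it to the free pool
 * rather than to the buddy allocator.
 */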
static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

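/*
 * Allocate a brand new huge page from the buddy allocator, spreading
 * allocations across the online nodes in round-robin fashion, and release
 * it into the huge page pool via its compound destructor.  Returns 1 on
 * success, 0 if the allocation failed.
 */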
static int alloc_fresh_huge_page(void)
{
	static int prev_nid;
	struct page *page;
	static DEFINE_SPINLOCK(nid_lock);
	int nid;

	spin_lock(&nid_lock);
	nid = next_node(prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	prev_nid = nid;
	spin_unlock(&nid_lock);

	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

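/*
 * Take a huge page from the free pool for a fault on @vma at @addr.
 * Shared (VM_MAYSHARE) mappings consume one of the pages reserved for
 * them; private mappings may only dip into pages beyond the reserved
 * count.  Returns NULL if the request cannot be satisfied.
 */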
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
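/*
 * Hand a huge page back to the buddy allocator: clear the page flags and
 * the compound destructor (which this kernel keeps in page[1].lru.next),
 * reset the refcount and free the whole HUGETLB_PAGE_ORDER block.  The
 * caller must already have removed the page from the free lists and must
 * hold hugetlb_lock.
 */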
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
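/*
 * On HIGHMEM configurations, prefer to shrink the pool by freeing huge
 * pages that live in lowmem, stopping once the pool is down to @count.
 * Called with hugetlb_lock held.
 */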
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

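/*
 * Grow or shrink the huge page pool to @count pages.  Growing allocates
 * fresh huge pages one at a time; shrinking never goes below the number
 * of reserved pages and frees surplus pages back to the buddy allocator.
 * Returns the resulting pool size.
 */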
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

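/*
 * Build a huge pte for @page with the vma's protection bits, marked
 * young and, when @writable, dirty and writable.
 */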
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	}
}

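/*
 * Copy the hugetlb pte entries of @vma from the parent's page tables to
 * the child's at fork time.  For private mappings that may be written,
 * the source ptes are write-protected as well, so that a later write in
 * either mm triggers copy-on-write.
 */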
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

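/*
 * Tear down the huge ptes in [start, end), gathering the pages on a local
 * list so their references are dropped only after the ptes are cleared and
 * the TLB has been flushed.  The caller must hold the mapping's
 * i_mmap_lock (see unmap_hugepage_range() below).
 */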
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file, as it should be non-null
	 * for a valid hugetlb area. However, vm_file will be NULL in the
	 * error cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap
	 * method fails, do_mmap_pgoff() nullifies vma->vm_file before
	 * calling this function to clean up. Since no pte has actually
	 * been set up, it is safe to do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

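/*
 * Handle a write fault on a huge pte that is not writable.  If we hold
 * the only reference to the page, simply make the pte writable; otherwise
 * allocate a new huge page, copy the contents into it and install it in
 * place of the old one.  Called and returns with page_table_lock held,
 * but drops it around the copy.
 */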
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

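/*
 * Fault in a huge page for an address that has no pte yet: look the page
 * up in the file's page cache, or allocate and zero a new one, re-checking
 * i_size against a racing truncate.  Shared mappings insert the new page
 * into the page cache; private write faults hand off to hugetlb_cow() so
 * a second fault is not needed.
 */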
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

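/*
 * Main fault handler for hugetlb vmas.  Missing ptes are filled in by
 * hugetlb_no_page(); write faults on read-only ptes are resolved by
 * hugetlb_cow().  The instantiation mutex serializes faults so that two
 * CPUs cannot race to allocate the same page.
 */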
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

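/*
 * get_user_pages() worker for hugetlb vmas: walk the range starting at
 * *position, faulting pages in as needed, and fill in the pages[] and
 * vmas[] arrays one PAGE_SIZE entry at a time.  Returns the number of
 * entries filled (continuing from @i), or -EFAULT if the first page could
 * not be faulted in, and updates *position and *length with how far it got.
 */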
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts for
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

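/*
 * Huge page reservations for a hugetlbfs file are tracked as a list of
 * file_region structs on the mapping's private_list, each covering a
 * half-open range [from, to) of the file.  region_chg() reports how many
 * extra pages a proposed reservation would need (pre-allocating a record
 * so the later commit cannot fail), region_add() commits the reservation,
 * and region_truncate() drops all reservations from a given offset
 * onwards, returning how many pages were released.
 */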
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher, then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

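/*
 * Reserve huge pages for the range [from, to) of a hugetlbfs file,
 * typically at mmap() time, so that later faults on a shared mapping do
 * not fail for lack of huge pages.  hugetlb_unreserve_pages() hands back
 * whatever part of a reservation was not consumed when a range of the
 * file is truncated away.
 */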
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpusets are configured, they break the strict hugetlb page
	 * reservation because the accounting is done on a global variable.
	 * Such a reservation is essentially rubbish in the presence of
	 * cpusets, because it is not checked against page availability for
	 * the task's current cpuset.  An application can still be OOM-killed
	 * by the kernel if there are not enough free hugetlb pages in the
	 * cpuset that the task is in.  Attempting to enforce strict
	 * accounting with cpusets is almost impossible (or too ugly),
	 * because cpusets are too fluid: tasks and memory nodes can be
	 * moved between cpusets dynamically.
	 *
	 * Changing the semantics of shared hugetlb mappings under cpusets
	 * is undesirable.  However, in order to preserve some of the
	 * semantics, we fall back to checking the current free page
	 * availability as a best attempt, and hopefully minimize the
	 * impact of the semantic changes that cpusets introduce.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}
