hugetlb.c revision 04f2cbe35699d22dbf428373682ead85ca1240f5
1/*
2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
4 */
5#include <linux/gfp.h>
6#include <linux/list.h>
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
12#include <linux/nodemask.h>
13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
16#include <linux/mutex.h>
17
18#include <asm/page.h>
19#include <asm/pgtable.h>
20
21#include <linux/hugetlb.h>
22#include "internal.h"
23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid;
37
38/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */
41static DEFINE_SPINLOCK(hugetlb_lock);
42
43#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
44#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
45#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
46/*
47 * These helpers are used to track how many pages are reserved for
48 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
49 * is guaranteed to have its future faults succeed.
50 *
51 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
52 * the reserve counters are updated with the hugetlb_lock held. It is safe
53 * to reset the VMA at fork() time as it is not in use yet and there is no
54 * chance of the global counters getting corrupted as a result.
55 */
56static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
57{
58	VM_BUG_ON(!is_vm_hugetlb_page(vma));
59	if (!(vma->vm_flags & VM_SHARED))
60		return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
61	return 0;
62}
63
64static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
65							unsigned long reserve)
66{
67	unsigned long flags;
68	VM_BUG_ON(!is_vm_hugetlb_page(vma));
69	VM_BUG_ON(vma->vm_flags & VM_SHARED);
70
71	flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
72	vma->vm_private_data = (void *)(reserve | flags);
73}
74
75static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
76{
77	unsigned long reserveflags = (unsigned long)vma->vm_private_data;
78	VM_BUG_ON(!is_vm_hugetlb_page(vma));
79	vma->vm_private_data = (void *)(reserveflags | flags);
80}
81
82static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
83{
84	VM_BUG_ON(!is_vm_hugetlb_page(vma));
85	return ((unsigned long)vma->vm_private_data & flag) != 0;
86}
87
88/* Decrement the reserved pages in the hugepage pool by one */
89static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
90{
91	if (vma->vm_flags & VM_SHARED) {
92		/* Shared mappings always use reserves */
93		resv_huge_pages--;
94	} else {
95		/*
96		 * Only the process that called mmap() has reserves for
97		 * private mappings.
98		 */
99		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
100			unsigned long flags, reserve;
101			resv_huge_pages--;
102			flags = (unsigned long)vma->vm_private_data &
103							HPAGE_RESV_MASK;
104			reserve = (unsigned long)vma->vm_private_data - 1;
105			vma->vm_private_data = (void *)(reserve | flags);
106		}
107	}
108}
109
110/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
111void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
112{
113	VM_BUG_ON(!is_vm_hugetlb_page(vma));
114	if (!(vma->vm_flags & VM_SHARED))
115		vma->vm_private_data = (void *)0;
116}
117
118/* Returns true if the VMA has associated reserve pages */
119static int vma_has_private_reserves(struct vm_area_struct *vma)
120{
121	if (vma->vm_flags & VM_SHARED)
122		return 0;
123	if (!vma_resv_huge_pages(vma))
124		return 0;
125	return 1;
126}
127
128static void clear_huge_page(struct page *page, unsigned long addr)
129{
130	int i;
131
132	might_sleep();
133	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
134		cond_resched();
135		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
136	}
137}
138
139static void copy_huge_page(struct page *dst, struct page *src,
140			   unsigned long addr, struct vm_area_struct *vma)
141{
142	int i;
143
144	might_sleep();
145	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
146		cond_resched();
147		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
148	}
149}
150
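/*
 * Put a free huge page back on its node's free list and update the
 * free page counters.  Called with hugetlb_lock held.
 */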
151static void enqueue_huge_page(struct page *page)
152{
153	int nid = page_to_nid(page);
154	list_add(&page->lru, &hugepage_freelists[nid]);
155	free_huge_pages++;
156	free_huge_pages_node[nid]++;
157}
158
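/*
 * Take a free huge page off the first node that has one, updating the
 * free page counters.  Called with hugetlb_lock held.
 */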
159static struct page *dequeue_huge_page(void)
160{
161	int nid;
162	struct page *page = NULL;
163
164	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
165		if (!list_empty(&hugepage_freelists[nid])) {
166			page = list_entry(hugepage_freelists[nid].next,
167					  struct page, lru);
168			list_del(&page->lru);
169			free_huge_pages--;
170			free_huge_pages_node[nid]--;
171			break;
172		}
173	}
174	return page;
175}
176
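/*
 * Pick a free huge page for a fault in @vma at @address, walking the
 * zonelist given by the VMA's memory policy and respecting cpuset
 * constraints.  Returns NULL when only reserved pages remain and this
 * VMA has no claim on them, or when @avoid_reserve is set and no
 * unreserved pages are free.  Unless @avoid_reserve is set, a
 * reservation is consumed on the VMA's behalf where one exists.
 */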
177static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
178				unsigned long address, int avoid_reserve)
179{
180	int nid;
181	struct page *page = NULL;
182	struct mempolicy *mpol;
183	nodemask_t *nodemask;
184	struct zonelist *zonelist = huge_zonelist(vma, address,
185					htlb_alloc_mask, &mpol, &nodemask);
186	struct zone *zone;
187	struct zoneref *z;
188
189	/*
190	 * A child process with MAP_PRIVATE mappings created by its parent
191	 * has no page reserves. This check ensures that reservations are
192	 * not "stolen". The child may still get SIGKILLed.
193	 */
194	if (!vma_has_private_reserves(vma) &&
195			free_huge_pages - resv_huge_pages == 0)
196		return NULL;
197
198	/* If reserves cannot be used, ensure enough pages are in the pool */
199	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
200		return NULL;
201
202	for_each_zone_zonelist_nodemask(zone, z, zonelist,
203						MAX_NR_ZONES - 1, nodemask) {
204		nid = zone_to_nid(zone);
205		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
206		    !list_empty(&hugepage_freelists[nid])) {
207			page = list_entry(hugepage_freelists[nid].next,
208					  struct page, lru);
209			list_del(&page->lru);
210			free_huge_pages--;
211			free_huge_pages_node[nid]--;
212
213			if (!avoid_reserve)
214				decrement_hugepage_resv_vma(vma);
215
216			break;
217		}
218	}
219	mpol_cond_put(mpol);
220	return page;
221}
222
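/*
 * Give a huge page back to the buddy allocator: drop it from the pool
 * counters, clear any stale page flags on the sub-pages, remove the
 * compound destructor and free the HUGETLB_PAGE_ORDER block.
 * Called with hugetlb_lock held.
 */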
223static void update_and_free_page(struct page *page)
224{
225	int i;
226	nr_huge_pages--;
227	nr_huge_pages_node[page_to_nid(page)]--;
228	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
229		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
230				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
231				1 << PG_private | 1 << PG_writeback);
232	}
233	set_compound_page_dtor(page, NULL);
234	set_page_refcounted(page);
235	arch_release_hugepage(page);
236	__free_pages(page, HUGETLB_PAGE_ORDER);
237}
238
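/*
 * Compound page destructor for huge pages.  If the page's node has
 * surplus pages the page is returned to the buddy allocator, otherwise
 * it goes back on the free list.  If a mapping was stashed in
 * page_private, the quota charged against it is released.
 */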
239static void free_huge_page(struct page *page)
240{
241	int nid = page_to_nid(page);
242	struct address_space *mapping;
243
244	mapping = (struct address_space *) page_private(page);
245	set_page_private(page, 0);
246	BUG_ON(page_count(page));
247	INIT_LIST_HEAD(&page->lru);
248
249	spin_lock(&hugetlb_lock);
250	if (surplus_huge_pages_node[nid]) {
251		update_and_free_page(page);
252		surplus_huge_pages--;
253		surplus_huge_pages_node[nid]--;
254	} else {
255		enqueue_huge_page(page);
256	}
257	spin_unlock(&hugetlb_lock);
258	if (mapping)
259		hugetlb_put_quota(mapping, 1);
260}
261
262/*
263 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
264 * balanced by operating on them in a round-robin fashion.
265 * Returns 1 if an adjustment was made.
266 */
267static int adjust_pool_surplus(int delta)
268{
269	static int prev_nid;
270	int nid = prev_nid;
271	int ret = 0;
272
273	VM_BUG_ON(delta != -1 && delta != 1);
274	do {
275		nid = next_node(nid, node_online_map);
276		if (nid == MAX_NUMNODES)
277			nid = first_node(node_online_map);
278
279		/* To shrink on this node, there must be a surplus page */
280		if (delta < 0 && !surplus_huge_pages_node[nid])
281			continue;
282		/* Surplus cannot exceed the total number of pages */
283		if (delta > 0 && surplus_huge_pages_node[nid] >=
284						nr_huge_pages_node[nid])
285			continue;
286
287		surplus_huge_pages += delta;
288		surplus_huge_pages_node[nid] += delta;
289		ret = 1;
290		break;
291	} while (nid != prev_nid);
292
293	prev_nid = nid;
294	return ret;
295}
296
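/*
 * Allocate a brand new huge page from the buddy allocator on @nid,
 * give it the hugetlb destructor, account it in the pool counters and
 * release it into the free pool via put_page().
 */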
297static struct page *alloc_fresh_huge_page_node(int nid)
298{
299	struct page *page;
300
301	page = alloc_pages_node(nid,
302		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
303						__GFP_REPEAT|__GFP_NOWARN,
304		HUGETLB_PAGE_ORDER);
305	if (page) {
306		if (arch_prepare_hugepage(page)) {
307			__free_pages(page, HUGETLB_PAGE_ORDER);
308			return NULL;
309		}
310		set_compound_page_dtor(page, free_huge_page);
311		spin_lock(&hugetlb_lock);
312		nr_huge_pages++;
313		nr_huge_pages_node[nid]++;
314		spin_unlock(&hugetlb_lock);
315		put_page(page); /* free it into the hugepage allocator */
316	}
317
318	return page;
319}
320
321static int alloc_fresh_huge_page(void)
322{
323	struct page *page;
324	int start_nid;
325	int next_nid;
326	int ret = 0;
327
328	start_nid = hugetlb_next_nid;
329
330	do {
331		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
332		if (page)
333			ret = 1;
334		/*
335		 * Use a helper variable to find the next node and then
336		 * copy it back to hugetlb_next_nid afterwards:
337		 * otherwise there's a window in which a racer might
338		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
339		 * But we don't need to use a spin_lock here: it really
340		 * doesn't matter if occasionally a racer chooses the
341		 * same nid as we do.  Move nid forward in the mask even
342		 * if we just successfully allocated a hugepage so that
343		 * the next caller gets hugepages on the next node.
344		 */
345		next_nid = next_node(hugetlb_next_nid, node_online_map);
346		if (next_nid == MAX_NUMNODES)
347			next_nid = first_node(node_online_map);
348		hugetlb_next_nid = next_nid;
349	} while (!page && hugetlb_next_nid != start_nid);
350
351	if (ret)
352		count_vm_event(HTLB_BUDDY_PGALLOC);
353	else
354		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
355
356	return ret;
357}
358
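/*
 * Allocate a dynamic "surplus" huge page directly from the buddy
 * allocator.  The surplus count is bounded by nr_overcommit_huge_pages
 * and the counters are updated before the allocation so that racing
 * callers cannot push the pool past the overcommit limit.
 */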
359static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
360						unsigned long address)
361{
362	struct page *page;
363	unsigned int nid;
364
365	/*
366	 * Assume we will successfully allocate the surplus page to
367	 * prevent racing processes from causing the surplus to exceed
368	 * overcommit
369	 *
370	 * This however introduces a different race, where a process B
371	 * tries to grow the static hugepage pool while alloc_pages() is
372	 * called by process A. B will only examine the per-node
373	 * counters in determining if surplus huge pages can be
374	 * converted to normal huge pages in adjust_pool_surplus(). A
375	 * won't be able to increment the per-node counter, until the
376	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
377	 * no more huge pages can be converted from surplus to normal
378	 * state (and doesn't try to convert again). Thus, we have a
379	 * case where a surplus huge page exists, the pool is grown, and
380	 * the surplus huge page still exists after, even though it
381	 * should just have been converted to a normal huge page. This
382	 * does not leak memory, though, as the hugepage will be freed
383	 * once it is out of use. It also does not allow the counters to
384	 * go out of whack in adjust_pool_surplus() as we don't modify
385	 * the node values until we've gotten the hugepage and only the
386	 * per-node value is checked there.
387	 */
388	spin_lock(&hugetlb_lock);
389	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
390		spin_unlock(&hugetlb_lock);
391		return NULL;
392	} else {
393		nr_huge_pages++;
394		surplus_huge_pages++;
395	}
396	spin_unlock(&hugetlb_lock);
397
398	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
399					__GFP_REPEAT|__GFP_NOWARN,
400					HUGETLB_PAGE_ORDER);
401
402	spin_lock(&hugetlb_lock);
403	if (page) {
404		/*
405		 * This page is now managed by the hugetlb allocator and has
406		 * no users -- drop the buddy allocator's reference.
407		 */
408		put_page_testzero(page);
409		VM_BUG_ON(page_count(page));
410		nid = page_to_nid(page);
411		set_compound_page_dtor(page, free_huge_page);
412		/*
413		 * We incremented the global counters already
414		 */
415		nr_huge_pages_node[nid]++;
416		surplus_huge_pages_node[nid]++;
417		__count_vm_event(HTLB_BUDDY_PGALLOC);
418	} else {
419		nr_huge_pages--;
420		surplus_huge_pages--;
421		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
422	}
423	spin_unlock(&hugetlb_lock);
424
425	return page;
426}
427
428/*
429 * Increase the hugetlb pool such that it can accommodate a reservation
430 * of size 'delta'.
431 */
432static int gather_surplus_pages(int delta)
433{
434	struct list_head surplus_list;
435	struct page *page, *tmp;
436	int ret, i;
437	int needed, allocated;
438
439	needed = (resv_huge_pages + delta) - free_huge_pages;
440	if (needed <= 0) {
441		resv_huge_pages += delta;
442		return 0;
443	}
444
445	allocated = 0;
446	INIT_LIST_HEAD(&surplus_list);
447
448	ret = -ENOMEM;
449retry:
450	spin_unlock(&hugetlb_lock);
451	for (i = 0; i < needed; i++) {
452		page = alloc_buddy_huge_page(NULL, 0);
453		if (!page) {
454			/*
455			 * We were not able to allocate enough pages to
456			 * satisfy the entire reservation so we free what
457			 * we've allocated so far.
458			 */
459			spin_lock(&hugetlb_lock);
460			needed = 0;
461			goto free;
462		}
463
464		list_add(&page->lru, &surplus_list);
465	}
466	allocated += needed;
467
468	/*
469	 * After retaking hugetlb_lock, we need to recalculate 'needed'
470	 * because either resv_huge_pages or free_huge_pages may have changed.
471	 */
472	spin_lock(&hugetlb_lock);
473	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
474	if (needed > 0)
475		goto retry;
476
477	/*
478	 * The surplus_list now contains _at_least_ the number of extra pages
479	 * needed to accommodate the reservation.  Add the appropriate number
480	 * of pages to the hugetlb pool and free the extras back to the buddy
481	 * allocator.  Commit the entire reservation here to prevent another
482	 * process from stealing the pages as they are added to the pool but
483	 * before they are reserved.
484	 */
485	needed += allocated;
486	resv_huge_pages += delta;
487	ret = 0;
488free:
489	/* Free the needed pages to the hugetlb pool */
490	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
491		if ((--needed) < 0)
492			break;
493		list_del(&page->lru);
494		enqueue_huge_page(page);
495	}
496
497	/* Free unnecessary surplus pages to the buddy allocator */
498	if (!list_empty(&surplus_list)) {
499		spin_unlock(&hugetlb_lock);
500		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
501			list_del(&page->lru);
502			/*
503			 * The page has a reference count of zero already, so
504			 * call free_huge_page directly instead of using
505			 * put_page.  This must be done with hugetlb_lock
506			 * unlocked which is safe because free_huge_page takes
507			 * hugetlb_lock before deciding how to free the page.
508			 */
509			free_huge_page(page);
510		}
511		spin_lock(&hugetlb_lock);
512	}
513
514	return ret;
515}
516
517/*
518 * When releasing a hugetlb pool reservation, any surplus pages that were
519 * allocated to satisfy the reservation must be explicitly freed if they were
520 * never used.
521 */
522static void return_unused_surplus_pages(unsigned long unused_resv_pages)
523{
524	static int nid = -1;
525	struct page *page;
526	unsigned long nr_pages;
527
528	/*
529	 * We want to release as many surplus pages as possible, spread
530	 * evenly across all nodes. Iterate across all nodes until we
531	 * can no longer free unreserved surplus pages. This occurs when
532	 * the nodes with surplus pages have no free pages.
533	 */
534	unsigned long remaining_iterations = num_online_nodes();
535
536	/* Uncommit the reservation */
537	resv_huge_pages -= unused_resv_pages;
538
539	nr_pages = min(unused_resv_pages, surplus_huge_pages);
540
541	while (remaining_iterations-- && nr_pages) {
542		nid = next_node(nid, node_online_map);
543		if (nid == MAX_NUMNODES)
544			nid = first_node(node_online_map);
545
546		if (!surplus_huge_pages_node[nid])
547			continue;
548
549		if (!list_empty(&hugepage_freelists[nid])) {
550			page = list_entry(hugepage_freelists[nid].next,
551					  struct page, lru);
552			list_del(&page->lru);
553			update_and_free_page(page);
554			free_huge_pages--;
555			free_huge_pages_node[nid]--;
556			surplus_huge_pages--;
557			surplus_huge_pages_node[nid]--;
558			nr_pages--;
559			remaining_iterations = num_online_nodes();
560		}
561	}
562}
563
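/*
 * Allocate a huge page for a fault in @vma at @addr.  The
 * pre-allocated pool is tried first; if it is empty, a surplus page is
 * allocated from the buddy allocator.  Private mappings that do not
 * own a reserve are charged against the hugetlbfs quota here, and the
 * mapping is stashed in page_private so free_huge_page() can release
 * the charge later.
 */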
564static struct page *alloc_huge_page(struct vm_area_struct *vma,
565				    unsigned long addr, int avoid_reserve)
566{
567	struct page *page;
568	struct address_space *mapping = vma->vm_file->f_mapping;
569	struct inode *inode = mapping->host;
570	unsigned int chg = 0;
571
572	/*
573	 * Processes that did not create the mapping will have no reserves and
574	 * will not have accounted against the quota. Check that the quota can
575	 * be charged before satisfying the allocation.
576	 */
577	if (!(vma->vm_flags & VM_SHARED) &&
578			!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
579		chg = 1;
580		if (hugetlb_get_quota(inode->i_mapping, chg))
581			return ERR_PTR(-ENOSPC);
582	}
583
584	spin_lock(&hugetlb_lock);
585	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
586	spin_unlock(&hugetlb_lock);
587
588	if (!page) {
589		page = alloc_buddy_huge_page(vma, addr);
590		if (!page) {
591			hugetlb_put_quota(inode->i_mapping, chg);
592			return ERR_PTR(-VM_FAULT_OOM);
593		}
594	}
595
596	set_page_refcounted(page);
597	set_page_private(page, (unsigned long) mapping);
598
599	return page;
600}
601
602static int __init hugetlb_init(void)
603{
604	unsigned long i;
605
606	if (HPAGE_SHIFT == 0)
607		return 0;
608
609	for (i = 0; i < MAX_NUMNODES; ++i)
610		INIT_LIST_HEAD(&hugepage_freelists[i]);
611
612	hugetlb_next_nid = first_node(node_online_map);
613
614	for (i = 0; i < max_huge_pages; ++i) {
615		if (!alloc_fresh_huge_page())
616			break;
617	}
618	max_huge_pages = free_huge_pages = nr_huge_pages = i;
619	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
620	return 0;
621}
622module_init(hugetlb_init);
623
624static int __init hugetlb_setup(char *s)
625{
626	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
627		max_huge_pages = 0;
628	return 1;
629}
630__setup("hugepages=", hugetlb_setup);
631
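/* Sum a per-node counter array over the nodes in the current task's cpuset. */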
632static unsigned int cpuset_mems_nr(unsigned int *array)
633{
634	int node;
635	unsigned int nr = 0;
636
637	for_each_node_mask(node, cpuset_current_mems_allowed)
638		nr += array[node];
639
640	return nr;
641}
642
643#ifdef CONFIG_SYSCTL
644#ifdef CONFIG_HIGHMEM
645static void try_to_free_low(unsigned long count)
646{
647	int i;
648
649	for (i = 0; i < MAX_NUMNODES; ++i) {
650		struct page *page, *next;
651		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
652			if (count >= nr_huge_pages)
653				return;
654			if (PageHighMem(page))
655				continue;
656			list_del(&page->lru);
657			update_and_free_page(page);
658			free_huge_pages--;
659			free_huge_pages_node[page_to_nid(page)]--;
660		}
661	}
662}
663#else
664static inline void try_to_free_low(unsigned long count)
665{
666}
667#endif
668
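/* Huge pages in the pool excluding surplus pages. */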
669#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
670static unsigned long set_max_huge_pages(unsigned long count)
671{
672	unsigned long min_count, ret;
673
674	/*
675	 * Increase the pool size
676	 * First take pages out of surplus state.  Then make up the
677	 * remaining difference by allocating fresh huge pages.
678	 *
679	 * We might race with alloc_buddy_huge_page() here and be unable
680	 * to convert a surplus huge page to a normal huge page. That is
681	 * not critical, though, it just means the overall size of the
682	 * pool might be one hugepage larger than it needs to be, but
683	 * within all the constraints specified by the sysctls.
684	 */
685	spin_lock(&hugetlb_lock);
686	while (surplus_huge_pages && count > persistent_huge_pages) {
687		if (!adjust_pool_surplus(-1))
688			break;
689	}
690
691	while (count > persistent_huge_pages) {
692		/*
693		 * If this allocation races such that we no longer need the
694		 * page, free_huge_page will handle it by freeing the page
695		 * and reducing the surplus.
696		 */
697		spin_unlock(&hugetlb_lock);
698		ret = alloc_fresh_huge_page();
699		spin_lock(&hugetlb_lock);
700		if (!ret)
701			goto out;
702
703	}
704
705	/*
706	 * Decrease the pool size
707	 * First return free pages to the buddy allocator (being careful
708	 * to keep enough around to satisfy reservations).  Then place
709	 * pages into surplus state as needed so the pool will shrink
710	 * to the desired size as pages become free.
711	 *
712	 * By placing pages into the surplus state independent of the
713	 * overcommit value, we are allowing the surplus pool size to
714	 * exceed overcommit. There are few sane options here. Since
715	 * alloc_buddy_huge_page() is checking the global counter,
716	 * though, we'll note that we're not allowed to exceed surplus
717	 * and won't grow the pool anywhere else. Not until one of the
718	 * sysctls is changed, or the surplus pages go out of use.
719	 */
720	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
721	min_count = max(count, min_count);
722	try_to_free_low(min_count);
723	while (min_count < persistent_huge_pages) {
724		struct page *page = dequeue_huge_page();
725		if (!page)
726			break;
727		update_and_free_page(page);
728	}
729	while (count < persistent_huge_pages) {
730		if (!adjust_pool_surplus(1))
731			break;
732	}
733out:
734	ret = persistent_huge_pages;
735	spin_unlock(&hugetlb_lock);
736	return ret;
737}
738
739int hugetlb_sysctl_handler(struct ctl_table *table, int write,
740			   struct file *file, void __user *buffer,
741			   size_t *length, loff_t *ppos)
742{
743	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
744	max_huge_pages = set_max_huge_pages(max_huge_pages);
745	return 0;
746}
747
748int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
749			struct file *file, void __user *buffer,
750			size_t *length, loff_t *ppos)
751{
752	proc_dointvec(table, write, file, buffer, length, ppos);
753	if (hugepages_treat_as_movable)
754		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
755	else
756		htlb_alloc_mask = GFP_HIGHUSER;
757	return 0;
758}
759
760int hugetlb_overcommit_handler(struct ctl_table *table, int write,
761			struct file *file, void __user *buffer,
762			size_t *length, loff_t *ppos)
763{
764	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
765	spin_lock(&hugetlb_lock);
766	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
767	spin_unlock(&hugetlb_lock);
768	return 0;
769}
770
771#endif /* CONFIG_SYSCTL */
772
773int hugetlb_report_meminfo(char *buf)
774{
775	return sprintf(buf,
776			"HugePages_Total: %5lu\n"
777			"HugePages_Free:  %5lu\n"
778			"HugePages_Rsvd:  %5lu\n"
779			"HugePages_Surp:  %5lu\n"
780			"Hugepagesize:    %5lu kB\n",
781			nr_huge_pages,
782			free_huge_pages,
783			resv_huge_pages,
784			surplus_huge_pages,
785			HPAGE_SIZE/1024);
786}
787
788int hugetlb_report_node_meminfo(int nid, char *buf)
789{
790	return sprintf(buf,
791		"Node %d HugePages_Total: %5u\n"
792		"Node %d HugePages_Free:  %5u\n"
793		"Node %d HugePages_Surp:  %5u\n",
794		nid, nr_huge_pages_node[nid],
795		nid, free_huge_pages_node[nid],
796		nid, surplus_huge_pages_node[nid]);
797}
798
799/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
800unsigned long hugetlb_total_pages(void)
801{
802	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
803}
804
805static int hugetlb_acct_memory(long delta)
806{
807	int ret = -ENOMEM;
808
809	spin_lock(&hugetlb_lock);
810	/*
811	 * When cpusets are configured, they break strict hugetlb page
812	 * reservation because the accounting is done on a global variable.
813	 * Such a reservation is of little value in the presence of cpusets,
814	 * since it is never checked against page availability for the
815	 * task's current cpuset. An application can still be OOM-killed by
816	 * the kernel if there are not enough free hugetlb pages in its
817	 * cpuset. Enforcing strict accounting per cpuset is close to
818	 * impossible (or too ugly) because cpusets are too fluid: tasks
819	 * and memory nodes can be moved between cpusets at any time.
820	 *
821	 * The change of semantics for shared hugetlb mappings with cpusets
822	 * is undesirable. However, in order to preserve some of the
823	 * semantics, we fall back to checking current free page
824	 * availability as a best attempt, hopefully minimizing the impact
825	 * of the semantic change that cpusets introduce.
826	 */
827	if (delta > 0) {
828		if (gather_surplus_pages(delta) < 0)
829			goto out;
830
831		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
832			return_unused_surplus_pages(delta);
833			goto out;
834		}
835	}
836
837	ret = 0;
838	if (delta < 0)
839		return_unused_surplus_pages((unsigned long) -delta);
840
841out:
842	spin_unlock(&hugetlb_lock);
843	return ret;
844}
845
846static void hugetlb_vm_op_close(struct vm_area_struct *vma)
847{
848	unsigned long reserve = vma_resv_huge_pages(vma);
849	if (reserve)
850		hugetlb_acct_memory(-reserve);
851}
852
853/*
854 * We cannot handle pagefaults against hugetlb pages at all.  They cause
855 * handle_mm_fault() to try to instantiate regular-sized pages in the
856 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
857 * this far.
858 */
859static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
860{
861	BUG();
862	return 0;
863}
864
865struct vm_operations_struct hugetlb_vm_ops = {
866	.fault = hugetlb_vm_op_fault,
867	.close = hugetlb_vm_op_close,
868};
869
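/*
 * Build a huge page PTE for @page using the VMA's page protection.
 * The entry is made writable and dirty when @writable is set,
 * write-protected otherwise, and always young and huge.
 */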
870static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
871				int writable)
872{
873	pte_t entry;
874
875	if (writable) {
876		entry =
877		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
878	} else {
879		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
880	}
881	entry = pte_mkyoung(entry);
882	entry = pte_mkhuge(entry);
883
884	return entry;
885}
886
887static void set_huge_ptep_writable(struct vm_area_struct *vma,
888				   unsigned long address, pte_t *ptep)
889{
890	pte_t entry;
891
892	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
893	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
894		update_mmu_cache(vma, address, entry);
895	}
896}
897
898
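/*
 * Copy the huge page table entries of @vma from @src to @dst at fork().
 * For copy-on-write private mappings the source entries are
 * write-protected before they are copied, so both processes fault on
 * the next write.  Shared page tables are skipped, and a reference is
 * taken on every page that is copied.
 */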
899int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
900			    struct vm_area_struct *vma)
901{
902	pte_t *src_pte, *dst_pte, entry;
903	struct page *ptepage;
904	unsigned long addr;
905	int cow;
906
907	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
908
909	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
910		src_pte = huge_pte_offset(src, addr);
911		if (!src_pte)
912			continue;
913		dst_pte = huge_pte_alloc(dst, addr);
914		if (!dst_pte)
915			goto nomem;
916
917		/* If the pagetables are shared don't copy or take references */
918		if (dst_pte == src_pte)
919			continue;
920
921		spin_lock(&dst->page_table_lock);
922		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
923		if (!huge_pte_none(huge_ptep_get(src_pte))) {
924			if (cow)
925				huge_ptep_set_wrprotect(src, addr, src_pte);
926			entry = huge_ptep_get(src_pte);
927			ptepage = pte_page(entry);
928			get_page(ptepage);
929			set_huge_pte_at(dst, addr, dst_pte, entry);
930		}
931		spin_unlock(&src->page_table_lock);
932		spin_unlock(&dst->page_table_lock);
933	}
934	return 0;
935
936nomem:
937	return -ENOMEM;
938}
939
940void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
941			    unsigned long end, struct page *ref_page)
942{
943	struct mm_struct *mm = vma->vm_mm;
944	unsigned long address;
945	pte_t *ptep;
946	pte_t pte;
947	struct page *page;
948	struct page *tmp;
949	/*
950	 * A page gathering list, protected by the per-file i_mmap_lock. The
951	 * lock is used to avoid list corruption from multiple unmappings
952	 * of the same page, since we are using page->lru.
953	 */
954	LIST_HEAD(page_list);
955
956	WARN_ON(!is_vm_hugetlb_page(vma));
957	BUG_ON(start & ~HPAGE_MASK);
958	BUG_ON(end & ~HPAGE_MASK);
959
960	spin_lock(&mm->page_table_lock);
961	for (address = start; address < end; address += HPAGE_SIZE) {
962		ptep = huge_pte_offset(mm, address);
963		if (!ptep)
964			continue;
965
966		if (huge_pmd_unshare(mm, &address, ptep))
967			continue;
968
969		/*
970		 * If a reference page is supplied, it is because a specific
971		 * page is being unmapped, not a range. Ensure the page we
972		 * are about to unmap is the actual page of interest.
973		 */
974		if (ref_page) {
975			pte = huge_ptep_get(ptep);
976			if (huge_pte_none(pte))
977				continue;
978			page = pte_page(pte);
979			if (page != ref_page)
980				continue;
981
982			/*
983			 * Mark the VMA as having unmapped its page so that
984			 * future faults in this VMA will fail rather than
985			 * looking like data was lost
986			 */
987			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
988		}
989
990		pte = huge_ptep_get_and_clear(mm, address, ptep);
991		if (huge_pte_none(pte))
992			continue;
993
994		page = pte_page(pte);
995		if (pte_dirty(pte))
996			set_page_dirty(page);
997		list_add(&page->lru, &page_list);
998	}
999	spin_unlock(&mm->page_table_lock);
1000	flush_tlb_range(vma, start, end);
1001	list_for_each_entry_safe(page, tmp, &page_list, lru) {
1002		list_del(&page->lru);
1003		put_page(page);
1004	}
1005}
1006
1007void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1008			  unsigned long end, struct page *ref_page)
1009{
1010	/*
1011	 * It is undesirable to test vma->vm_file as it should be non-null
1012	 * for a valid hugetlb area. However, vm_file will be NULL in the error
1013	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
1014	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
1015	 * to clean up. Since no pte has actually been set up, it is safe to
1016	 * do nothing in this case.
1017	 */
1018	if (vma->vm_file) {
1019		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1020		__unmap_hugepage_range(vma, start, end, ref_page);
1021		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1022	}
1023}
1024
1025/*
1026 * This is called when the original mapper is failing to COW a MAP_PRIVATE
1027 * mapping it owns the reserve page for. The intention is to unmap the page
1028 * from other VMAs and let the children be SIGKILLed if they are faulting the
1029 * same region.
1030 */
1031int unmap_ref_private(struct mm_struct *mm,
1032					struct vm_area_struct *vma,
1033					struct page *page,
1034					unsigned long address)
1035{
1036	struct vm_area_struct *iter_vma;
1037	struct address_space *mapping;
1038	struct prio_tree_iter iter;
1039	pgoff_t pgoff;
1040
1041	/*
1042	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
1043	 * from page cache lookup which is in HPAGE_SIZE units.
1044	 */
1045	address = address & huge_page_mask(hstate_vma(vma));
1046	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
1047		+ (vma->vm_pgoff >> PAGE_SHIFT);
1048	mapping = (struct address_space *)page_private(page);
1049
1050	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1051		/* Do not unmap the current VMA */
1052		if (iter_vma == vma)
1053			continue;
1054
1055		/*
1056		 * Unmap the page from other VMAs without their own reserves.
1057		 * They get marked to be SIGKILLed if they fault in these
1058		 * areas. This is because a future no-page fault on this VMA
1059		 * could insert a zeroed page instead of the data existing
1060		 * from the time of fork. This would look like data corruption
1061		 */
1062		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1063			unmap_hugepage_range(iter_vma,
1064				address, address + HPAGE_SIZE,
1065				page);
1066	}
1067
1068	return 1;
1069}
1070
1071static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
1072			unsigned long address, pte_t *ptep, pte_t pte,
1073			struct page *pagecache_page)
1074{
1075	struct page *old_page, *new_page;
1076	int avoidcopy;
1077	int outside_reserve = 0;
1078
1079	old_page = pte_page(pte);
1080
1081retry_avoidcopy:
1082	/* If no-one else is actually using this page, avoid the copy
1083	 * and just make the page writable */
1084	avoidcopy = (page_count(old_page) == 1);
1085	if (avoidcopy) {
1086		set_huge_ptep_writable(vma, address, ptep);
1087		return 0;
1088	}
1089
1090	/*
1091	 * If the process that created a MAP_PRIVATE mapping is about to
1092	 * perform a COW due to a shared page count, attempt to satisfy
1093	 * the allocation without using the existing reserves. The pagecache
1094	 * page is used to determine if the reserve at this address was
1095	 * consumed or not. If reserves were used, a partial faulted mapping
1096	 * at the time of fork() could consume its reserves on COW instead
1097	 * of the full address range.
1098	 */
1099	if (!(vma->vm_flags & VM_SHARED) &&
1100			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1101			old_page != pagecache_page)
1102		outside_reserve = 1;
1103
1104	page_cache_get(old_page);
1105	new_page = alloc_huge_page(vma, address, outside_reserve);
1106
1107	if (IS_ERR(new_page)) {
1108		page_cache_release(old_page);
1109
1110		/*
1111		 * If a process owning a MAP_PRIVATE mapping fails to COW,
1112		 * it is due to references held by a child and an insufficient
1113		 * huge page pool. To guarantee the original mapper's
1114		 * reliability, unmap the page from child processes. The child
1115		 * may get SIGKILLed if it later faults.
1116		 */
1117		if (outside_reserve) {
1118			BUG_ON(huge_pte_none(pte));
1119			if (unmap_ref_private(mm, vma, old_page, address)) {
1120				BUG_ON(page_count(old_page) != 1);
1121				BUG_ON(huge_pte_none(pte));
1122				goto retry_avoidcopy;
1123			}
1124			WARN_ON_ONCE(1);
1125		}
1126
1127		return -PTR_ERR(new_page);
1128	}
1129
1130	spin_unlock(&mm->page_table_lock);
1131	copy_huge_page(new_page, old_page, address, vma);
1132	__SetPageUptodate(new_page);
1133	spin_lock(&mm->page_table_lock);
1134
1135	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
1136	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1137		/* Break COW */
1138		huge_ptep_clear_flush(vma, address, ptep);
1139		set_huge_pte_at(mm, address, ptep,
1140				make_huge_pte(vma, new_page, 1));
1141		/* Make the old page be freed below */
1142		new_page = old_page;
1143	}
1144	page_cache_release(new_page);
1145	page_cache_release(old_page);
1146	return 0;
1147}
1148
1149/* Return the pagecache page at a given address within a VMA */
1150static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
1151			unsigned long address)
1152{
1153	struct address_space *mapping;
1154	unsigned long idx;
1155
1156	mapping = vma->vm_file->f_mapping;
1157	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
1158		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
1159
1160	return find_lock_page(mapping, idx);
1161}
1162
1163static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1164			unsigned long address, pte_t *ptep, int write_access)
1165{
1166	int ret = VM_FAULT_SIGBUS;
1167	unsigned long idx;
1168	unsigned long size;
1169	struct page *page;
1170	struct address_space *mapping;
1171	pte_t new_pte;
1172
1173	/*
1174	 * Currently, we are forced to kill the process in the event the
1175	 * original mapper has unmapped pages from the child due to a failed
1176	 * COW. Warn that such a situation has occurred as it may not be obvious.
1177	 */
1178	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1179		printk(KERN_WARNING
1180			"PID %d killed due to inadequate hugepage pool\n",
1181			current->pid);
1182		return ret;
1183	}
1184
1185	mapping = vma->vm_file->f_mapping;
1186	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
1187		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
1188
1189	/*
1190	 * Use page lock to guard against racing truncation
1191	 * before we get page_table_lock.
1192	 */
1193retry:
1194	page = find_lock_page(mapping, idx);
1195	if (!page) {
1196		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
1197		if (idx >= size)
1198			goto out;
1199		page = alloc_huge_page(vma, address, 0);
1200		if (IS_ERR(page)) {
1201			ret = -PTR_ERR(page);
1202			goto out;
1203		}
1204		clear_huge_page(page, address);
1205		__SetPageUptodate(page);
1206
1207		if (vma->vm_flags & VM_SHARED) {
1208			int err;
1209			struct inode *inode = mapping->host;
1210
1211			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
1212			if (err) {
1213				put_page(page);
1214				if (err == -EEXIST)
1215					goto retry;
1216				goto out;
1217			}
1218
1219			spin_lock(&inode->i_lock);
1220			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
1221			spin_unlock(&inode->i_lock);
1222		} else
1223			lock_page(page);
1224	}
1225
1226	spin_lock(&mm->page_table_lock);
1227	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
1228	if (idx >= size)
1229		goto backout;
1230
1231	ret = 0;
1232	if (!huge_pte_none(huge_ptep_get(ptep)))
1233		goto backout;
1234
1235	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
1236				&& (vma->vm_flags & VM_SHARED)));
1237	set_huge_pte_at(mm, address, ptep, new_pte);
1238
1239	if (write_access && !(vma->vm_flags & VM_SHARED)) {
1240		/* Optimization, do the COW without a second fault */
1241		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
1242	}
1243
1244	spin_unlock(&mm->page_table_lock);
1245	unlock_page(page);
1246out:
1247	return ret;
1248
1249backout:
1250	spin_unlock(&mm->page_table_lock);
1251	unlock_page(page);
1252	put_page(page);
1253	goto out;
1254}
1255
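/*
 * Top-level hugetlb fault handler: allocate the huge PTE if necessary,
 * serialize against other instantiations with a mutex, then hand off
 * to hugetlb_no_page() for missing entries or hugetlb_cow() for write
 * faults on read-only entries.
 */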
1256int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1257			unsigned long address, int write_access)
1258{
1259	pte_t *ptep;
1260	pte_t entry;
1261	int ret;
1262	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1263
1264	ptep = huge_pte_alloc(mm, address);
1265	if (!ptep)
1266		return VM_FAULT_OOM;
1267
1268	/*
1269	 * Serialize hugepage allocation and instantiation, so that we don't
1270	 * get spurious allocation failures if two CPUs race to instantiate
1271	 * the same page in the page cache.
1272	 */
1273	mutex_lock(&hugetlb_instantiation_mutex);
1274	entry = huge_ptep_get(ptep);
1275	if (huge_pte_none(entry)) {
1276		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
1277		mutex_unlock(&hugetlb_instantiation_mutex);
1278		return ret;
1279	}
1280
1281	ret = 0;
1282
1283	spin_lock(&mm->page_table_lock);
1284	/* Check for a racing update before calling hugetlb_cow */
1285	if (likely(pte_same(entry, huge_ptep_get(ptep))))
1286		if (write_access && !pte_write(entry)) {
1287			struct page *page;
1288			page = hugetlbfs_pagecache_page(vma, address);
1289			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
1290			if (page) {
1291				unlock_page(page);
1292				put_page(page);
1293			}
1294		}
1295	spin_unlock(&mm->page_table_lock);
1296	mutex_unlock(&hugetlb_instantiation_mutex);
1297
1298	return ret;
1299}
1300
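/*
 * Back end of get_user_pages() for hugetlb VMAs: walk the requested
 * range, faulting pages in as needed, and fill the pages[]/vmas[]
 * arrays one PAGE_SIZE sub-page at a time.
 */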
1301int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1302			struct page **pages, struct vm_area_struct **vmas,
1303			unsigned long *position, int *length, int i,
1304			int write)
1305{
1306	unsigned long pfn_offset;
1307	unsigned long vaddr = *position;
1308	int remainder = *length;
1309
1310	spin_lock(&mm->page_table_lock);
1311	while (vaddr < vma->vm_end && remainder) {
1312		pte_t *pte;
1313		struct page *page;
1314
1315		/*
1316		 * Some archs (sparc64, sh*) have multiple pte_ts to
1317		 * each hugepage.  We have to make sure we get the
1318		 * first, for the page indexing below to work.
1319		 */
1320		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
1321
1322		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1323		    (write && !pte_write(huge_ptep_get(pte)))) {
1324			int ret;
1325
1326			spin_unlock(&mm->page_table_lock);
1327			ret = hugetlb_fault(mm, vma, vaddr, write);
1328			spin_lock(&mm->page_table_lock);
1329			if (!(ret & VM_FAULT_ERROR))
1330				continue;
1331
1332			remainder = 0;
1333			if (!i)
1334				i = -EFAULT;
1335			break;
1336		}
1337
1338		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
1339		page = pte_page(huge_ptep_get(pte));
1340same_page:
1341		if (pages) {
1342			get_page(page);
1343			pages[i] = page + pfn_offset;
1344		}
1345
1346		if (vmas)
1347			vmas[i] = vma;
1348
1349		vaddr += PAGE_SIZE;
1350		++pfn_offset;
1351		--remainder;
1352		++i;
1353		if (vaddr < vma->vm_end && remainder &&
1354				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
1355			/*
1356			 * We use pfn_offset to avoid touching the pageframes
1357			 * of this compound page.
1358			 */
1359			goto same_page;
1360		}
1361	}
1362	spin_unlock(&mm->page_table_lock);
1363	*length = remainder;
1364	*position = vaddr;
1365
1366	return i;
1367}
1368
1369void hugetlb_change_protection(struct vm_area_struct *vma,
1370		unsigned long address, unsigned long end, pgprot_t newprot)
1371{
1372	struct mm_struct *mm = vma->vm_mm;
1373	unsigned long start = address;
1374	pte_t *ptep;
1375	pte_t pte;
1376
1377	BUG_ON(address >= end);
1378	flush_cache_range(vma, address, end);
1379
1380	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1381	spin_lock(&mm->page_table_lock);
1382	for (; address < end; address += HPAGE_SIZE) {
1383		ptep = huge_pte_offset(mm, address);
1384		if (!ptep)
1385			continue;
1386		if (huge_pmd_unshare(mm, &address, ptep))
1387			continue;
1388		if (!huge_pte_none(huge_ptep_get(ptep))) {
1389			pte = huge_ptep_get_and_clear(mm, address, ptep);
1390			pte = pte_mkhuge(pte_modify(pte, newprot));
1391			set_huge_pte_at(mm, address, ptep, pte);
1392		}
1393	}
1394	spin_unlock(&mm->page_table_lock);
1395	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1396
1397	flush_tlb_range(vma, start, end);
1398}
1399
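/*
 * Reservations for a hugetlbfs file are tracked as a sorted list of
 * file_region structs hanging off inode->i_mapping->private_list; each
 * region records a [from, to) range of the file that is already
 * reserved.  region_chg() reports how much a new reservation would
 * add, region_add() commits it, and region_truncate() drops everything
 * past a given offset, returning the amount released.
 */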
1400struct file_region {
1401	struct list_head link;
1402	long from;
1403	long to;
1404};
1405
1406static long region_add(struct list_head *head, long f, long t)
1407{
1408	struct file_region *rg, *nrg, *trg;
1409
1410	/* Locate the region we are either in or before. */
1411	list_for_each_entry(rg, head, link)
1412		if (f <= rg->to)
1413			break;
1414
1415	/* Round our left edge to the current segment if it encloses us. */
1416	if (f > rg->from)
1417		f = rg->from;
1418
1419	/* Check for and consume any regions we now overlap with. */
1420	nrg = rg;
1421	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1422		if (&rg->link == head)
1423			break;
1424		if (rg->from > t)
1425			break;
1426
1427		/* If this area reaches higher, then extend our area to
1428		 * include it completely.  If this is not the first area
1429		 * which we intend to reuse, free it. */
1430		if (rg->to > t)
1431			t = rg->to;
1432		if (rg != nrg) {
1433			list_del(&rg->link);
1434			kfree(rg);
1435		}
1436	}
1437	nrg->from = f;
1438	nrg->to = t;
1439	return 0;
1440}
1441
1442static long region_chg(struct list_head *head, long f, long t)
1443{
1444	struct file_region *rg, *nrg;
1445	long chg = 0;
1446
1447	/* Locate the region we are before or in. */
1448	list_for_each_entry(rg, head, link)
1449		if (f <= rg->to)
1450			break;
1451
1452	/* If we are below the current region then a new region is required.
1453	 * Subtle: allocate a new region at the position but make it zero
1454	 * size such that we can guarantee to record the reservation. */
1455	if (&rg->link == head || t < rg->from) {
1456		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1457		if (!nrg)
1458			return -ENOMEM;
1459		nrg->from = f;
1460		nrg->to   = f;
1461		INIT_LIST_HEAD(&nrg->link);
1462		list_add(&nrg->link, rg->link.prev);
1463
1464		return t - f;
1465	}
1466
1467	/* Round our left edge to the current segment if it encloses us. */
1468	if (f > rg->from)
1469		f = rg->from;
1470	chg = t - f;
1471
1472	/* Check for and consume any regions we now overlap with. */
1473	list_for_each_entry(rg, rg->link.prev, link) {
1474		if (&rg->link == head)
1475			break;
1476		if (rg->from > t)
1477			return chg;
1478
1479		/* We overlap with this area; if it extends further than
1480		 * us then we must extend ourselves.  Account for its
1481		 * existing reservation. */
1482		if (rg->to > t) {
1483			chg += rg->to - t;
1484			t = rg->to;
1485		}
1486		chg -= rg->to - rg->from;
1487	}
1488	return chg;
1489}
1490
1491static long region_truncate(struct list_head *head, long end)
1492{
1493	struct file_region *rg, *trg;
1494	long chg = 0;
1495
1496	/* Locate the region we are either in or before. */
1497	list_for_each_entry(rg, head, link)
1498		if (end <= rg->to)
1499			break;
1500	if (&rg->link == head)
1501		return 0;
1502
1503	/* If we are in the middle of a region then adjust it. */
1504	if (end > rg->from) {
1505		chg = rg->to - end;
1506		rg->to = end;
1507		rg = list_entry(rg->link.next, typeof(*rg), link);
1508	}
1509
1510	/* Drop any remaining regions. */
1511	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1512		if (&rg->link == head)
1513			break;
1514		chg += rg->to - rg->from;
1515		list_del(&rg->link);
1516		kfree(rg);
1517	}
1518	return chg;
1519}
1520
1521int hugetlb_reserve_pages(struct inode *inode,
1522					long from, long to,
1523					struct vm_area_struct *vma)
1524{
1525	long ret, chg;
1526
1527	/*
1528	 * Shared mappings base their reservation on the number of pages that
1529	 * are already allocated on behalf of the file. Private mappings need
1530	 * to reserve the full area even if read-only as mprotect() may be
1531	 * called to make the mapping read-write. Assume !vma is a shm mapping
1532	 */
1533	if (!vma || vma->vm_flags & VM_SHARED)
1534		chg = region_chg(&inode->i_mapping->private_list, from, to);
1535	else {
1536		chg = to - from;
1537		set_vma_resv_huge_pages(vma, chg);
1538		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1539	}
1540
1541	if (chg < 0)
1542		return chg;
1543
1544	if (hugetlb_get_quota(inode->i_mapping, chg))
1545		return -ENOSPC;
1546	ret = hugetlb_acct_memory(chg);
1547	if (ret < 0) {
1548		hugetlb_put_quota(inode->i_mapping, chg);
1549		return ret;
1550	}
1551	if (!vma || vma->vm_flags & VM_SHARED)
1552		region_add(&inode->i_mapping->private_list, from, to);
1553	return 0;
1554}
1555
1556void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1557{
1558	long chg = region_truncate(&inode->i_mapping->private_list, offset);
1559
1560	spin_lock(&inode->i_lock);
1561	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
1562	spin_unlock(&inode->i_lock);
1563
1564	hugetlb_put_quota(inode->i_mapping, (chg - freed));
1565	hugetlb_acct_memory(-(chg - freed));
1566}
1567