/* hugetlb.c, revision c79fb75e5a514a5a35f22c229042aa29f4237e3a */
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
int hugetlb_dynamic_pool;
static int hugetlb_next_nid;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

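/*
 * Link a free huge page into the free list of its node.  Caller must hold
 * hugetlb_lock, which also protects the free_huge_pages counters.
 */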
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

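/*
 * Take a free huge page off the free lists, walking the zonelist derived
 * from @vma's memory policy and skipping nodes the current cpuset does not
 * allow.  Pages handed to shared (VM_MAYSHARE) mappings come out of the
 * reserve, so resv_huge_pages is decremented.  Caller must hold hugetlb_lock.
 */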
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (vma && vma->vm_flags & VM_MAYSHARE)
				resv_huge_pages--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

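/*
 * Give a huge page back to the buddy allocator: drop it from the hugetlb
 * counters, clear the flags of each constituent page, remove the compound
 * destructor and free the HUGETLB_PAGE_ORDER block.  Called with
 * hugetlb_lock held.
 */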
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

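/*
 * Compound page destructor, run when the last reference to a huge page is
 * dropped.  Surplus pages go straight back to the buddy allocator; ordinary
 * pool pages are returned to the free lists.  The address_space stashed in
 * page_private() by alloc_huge_page() is used to release the hugetlbfs quota.
 */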
static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	struct address_space *mapping;

	mapping = (struct address_space *) page_private(page);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping);
	set_page_private(page, 0);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

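/*
 * Allocate a brand new huge page for the pool on @nid.  The put_page()
 * drops the initial reference, so the page lands on the free lists via its
 * free_huge_page destructor.
 */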
static struct page *alloc_fresh_huge_page_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
		HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[nid]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
	}

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

	return ret;
}

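/*
 * Allocate a surplus huge page straight from the buddy allocator.  Only
 * used when the dynamic pool is enabled; the page is accounted in
 * surplus_huge_pages so free_huge_page() knows to return it to the buddy
 * allocator rather than keep it in the pool.
 */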
static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;

	/* Check if the dynamic pool is enabled */
	if (!hugetlb_dynamic_pool)
		return NULL;

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		surplus_huge_pages++;
		surplus_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
	}

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
	if (needed <= 0)
		return 0;

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.
	 */
	needed += allocated;
	ret = 0;
free:
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		list_del(&page->lru);
		if ((--needed) >= 0)
			enqueue_huge_page(page);
		else {
			/*
			 * Decrement the refcount and free the page using its
			 * destructor.  This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			spin_unlock(&hugetlb_lock);
			put_page(page);
			spin_lock(&hugetlb_lock);
		}
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
void return_unused_surplus_pages(unsigned long unused_resv_pages)
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

	nr_pages = min(unused_resv_pages, surplus_huge_pages);

	while (nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
		}
	}
}


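/*
 * Shared (VM_MAYSHARE) mappings allocate out of the reserved pool, while
 * private mappings may only dequeue pages beyond the reserve and fall back
 * to alloc_buddy_huge_page() when none are free.
 */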
static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	return page;
}

static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct page *page = NULL;

	spin_lock(&hugetlb_lock);
	if (free_huge_pages > resv_huge_pages)
		page = dequeue_huge_page(vma, addr);
	spin_unlock(&hugetlb_lock);
	if (!page)
		page = alloc_buddy_huge_page(vma, addr);
	return page;
}

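/*
 * Allocate a huge page for a fault in @vma and stash the backing
 * address_space in page_private() so free_huge_page() can return the quota.
 */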
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	if (vma->vm_flags & VM_MAYSHARE)
		page = alloc_huge_page_shared(vma, addr);
	else
		page = alloc_huge_page_private(vma, addr);
	if (page) {
		set_page_refcounted(page);
		set_page_private(page, (unsigned long) vma->vm_file->f_mapping);
	}
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

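/*
 * Pages in the pool that were not allocated as surplus: this is the count
 * the hugepages sysctl drives toward.
 */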
#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

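/*
 * Build the huge PTE for @page.  Writable mappings get a dirty, writable
 * entry; everything else is write-protected so that the first write faults
 * into hugetlb_cow().
 */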
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}


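/*
 * Copy the huge PTEs from @src to @dst at fork().  For private writable
 * mappings the source PTE is write-protected first, so the copied entry is
 * write-protected in both and the next write goes through hugetlb_cow().
 */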
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

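/*
 * Tear down the huge PTEs in [start, end), gathering the pages on a local
 * list and dropping their references only after the TLB flush.  Caller must
 * hold the file's i_mmap_lock; see unmap_hugepage_range().
 */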
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-NULL
	 * for a valid hugetlb area. However, vm_file will be NULL in the
	 * error cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap
	 * method fails, do_mmap_pgoff() nullifies vma->vm_file before
	 * calling this function to clean up. Since no pte has actually
	 * been set up, it is safe to do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

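/*
 * Break copy-on-write for a huge page.  If we hold the only reference the
 * PTE is simply made writable; otherwise a new huge page is allocated,
 * copied into and installed.  Called with mm->page_table_lock held; the
 * lock is dropped around the copy.
 */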
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}
	if (hugetlb_get_quota(vma->vm_file->f_mapping))
		return VM_FAULT_SIGBUS;

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

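/*
 * Fault in a huge page that has no PTE yet: look it up in (or add it to)
 * the page cache for shared mappings, or allocate a private copy, then
 * install the PTE.  The page lock and a recheck of i_size guard against a
 * racing truncation.
 */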
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
	put_page(page);
	goto out;
}

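/*
 * Top-level hugetlb fault handler.  Faults are serialized by
 * hugetlb_instantiation_mutex to avoid spurious allocation failures when
 * two CPUs race to instantiate the same page.
 */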
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

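/*
 * get_user_pages() helper for hugetlb VMAs: walk the range in PAGE_SIZE
 * steps, faulting huge pages in as needed and returning the individual
 * subpages of each huge page.
 */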
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i,
			int write)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts per
		 * hugepage.  We have to make sure we get the first,
		 * for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, write);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

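/*
 * mprotect() support: rewrite every present huge PTE in [address, end)
 * with the new protection and flush the TLB for the range.
 */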
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

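/*
 * The file_region list, hung off inode->i_mapping->private_list, tracks
 * which ranges of a hugetlbfs file are already reserved.  Offsets are in
 * huge pages.  region_chg() returns how many extra pages a [f, t)
 * reservation would need (pre-allocating a record so a later region_add()
 * cannot fail), region_add() commits the reservation, and region_truncate()
 * drops regions beyond a truncation point and returns the pages released.
 */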
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher than ours, extend our area to
		 * include it completely.  If this is not the first area
		 * we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position, but make it zero
	 * size so that we are guaranteed to be able to record the
	 * reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

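/*
 * Charge (delta > 0) or uncharge (delta < 0) huge pages against the global
 * reservation.  On a positive delta, surplus pages are allocated as needed
 * and the request is also checked against the free pages visible to the
 * current cpuset (see the comment below).
 */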
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	/*
	 * When cpusets are configured, they break strict hugetlb page
	 * reservation because the accounting is done on a global variable.
	 * Such a reservation is completely rubbish in the presence of
	 * cpusets because it is not checked against page availability for
	 * the current cpuset: the application can still be OOM'ed by the
	 * kernel if the cpuset the task runs in lacks free hugetlb pages.
	 * Enforcing strict accounting with cpusets is almost impossible
	 * (or too ugly) because cpusets are so fluid that tasks and memory
	 * nodes can be moved between them dynamically.
	 *
	 * Changing the semantics of shared hugetlb mappings under cpusets
	 * is undesirable, however.  To preserve some of the semantics, we
	 * fall back to checking against the current free page availability
	 * as a best attempt, hopefully minimizing the impact of the
	 * semantics change cpusets introduce.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(delta) < 0)
			goto out;

		if (delta > cpuset_mems_nr(free_huge_pages_node))
			goto out;
	}

	ret = 0;
	resv_huge_pages += delta;
	if (delta < 0)
		return_unused_surplus_pages((unsigned long) -delta);

out:
	spin_unlock(&hugetlb_lock);
	return ret;
}

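/*
 * Reserve enough huge pages to back the file range [from, to); offsets are
 * in huge pages.  Returns 0 on success or a negative errno.
 */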
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}