hugetlb.c revision 64b4a954b03a1153fb8ae38d6ffbd991e01a1e80
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, free_huge_pages
 * and the corresponding per-node counters.
 */
static DEFINE_SPINLOCK(hugetlb_lock);

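/*
 * Return a free huge page to its node's free list and bump the global
 * and per-node free counters.  Caller must hold hugetlb_lock.
 */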
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

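/*
 * Take a free huge page off the free list of the first node in the
 * mempolicy/cpuset-approved zonelist that has one, updating the free
 * counters.  Returns NULL if no suitable page is available.  Caller
 * must hold hugetlb_lock.
 */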
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

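/*
 * Allocate a brand-new huge page from the buddy allocator, cycling the
 * target node between calls to spread the pool across nodes, and account
 * for it in the global and per-node totals.
 */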
static struct page *alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
	}
	return page;
}

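/*
 * Installed as the compound-page destructor via page[1].mapping in
 * alloc_huge_page(): when the last reference to a huge page is dropped,
 * the page goes back onto the free list instead of being returned to
 * the buddy allocator.
 */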
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);
	page[1].mapping = NULL;

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

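/*
 * Hand out a huge page from the free lists for the given mapping address:
 * dequeue a page, take an initial reference, install free_huge_page() as
 * the compound-page destructor and zero the page.  Returns NULL if no
 * free huge page is available on any allowed node.
 */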
struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_count(page, 1);
	page[1].mapping = (void *)free_huge_page;
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_highpage(&page[i]);
	return page;
}

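/*
 * Boot-time initialization: set up the per-node free lists and try to
 * preallocate the number of huge pages requested with "hugepages=".
 * The totals are clamped to whatever could actually be allocated.
 */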
static int __init hugetlb_init(void)
{
	unsigned long i;
	struct page *page;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		page = alloc_fresh_huge_page();
		if (!page)
			break;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

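/* Parse the "hugepages=N" boot parameter into max_huge_pages. */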
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
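/*
 * Give a huge page back to the buddy allocator: drop it from the pool
 * accounting, clear the page flags the allocator would object to, reset
 * the reference counts and free the HUGETLB_PAGE_ORDER block.  Caller
 * must hold hugetlb_lock.
 */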
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
		set_page_count(&page[i], 0);
	}
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

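/*
 * When shrinking the pool, prefer to give back huge pages that do not
 * live in highmem, so that scarcer lowmem is released first.  No-op on
 * !CONFIG_HIGHMEM kernels.
 */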
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

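/*
 * Grow or shrink the persistent huge page pool to "count" pages.
 * Growing stops early if fresh huge pages cannot be allocated; shrinking
 * can only release pages that are currently free.  Returns the resulting
 * pool size, which becomes the new max_huge_pages.
 */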
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		struct page *page = alloc_fresh_huge_page();
		if (!page)
			return nr_huge_pages;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

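/*
 * Handler for the nr_hugepages sysctl: parse the new value with
 * proc_doulongvec_minmax() and resize the pool accordingly.
 */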
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

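/*
 * Build the huge PTE for a given page, marking it writable only when
 * requested, and setting the young and huge bits.
 */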
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

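/*
 * Upgrade an existing huge PTE to writable and dirty in place, then tell
 * the architecture about the change.  Used when a write fault does not
 * require breaking COW.
 */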
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}

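/*
 * Copy the huge page table entries from a parent mm to a child mm at
 * fork time.  Shared pages just gain a reference; for private writable
 * mappings the source PTE is also write-protected so that later writes
 * trigger COW.
 */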
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

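/*
 * Tear down the huge PTEs in [start, end), dropping the page references
 * and RSS accounting they held.  start and end must be HPAGE_SIZE
 * aligned.  The mm's page_table_lock is taken here and the TLB is
 * flushed at the end.
 */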
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

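/*
 * Handle a write fault on a huge page mapped read-only.  If we hold the
 * only reference, simply make the PTE writable; otherwise copy the data
 * into a freshly allocated huge page and switch the PTE over, dropping
 * the old reference.  Called with mm->page_table_lock held; the lock is
 * dropped around the data copy.
 */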
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int i, avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);

		/* Logically this is OOM, not a SIGBUS, but an OOM
		 * could cause the kernel to go killing other
		 * processes which won't help the hugepage situation
		 * at all (?) */
		return VM_FAULT_SIGBUS;
	}

	spin_unlock(&mm->page_table_lock);
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
		copy_user_highpage(new_page + i, old_page + i,
				   address + i*PAGE_SIZE);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

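/*
 * Fault in a huge page that has no PTE yet: look it up in (or add it to)
 * the backing file's page cache, allocating and zeroing a fresh huge page
 * when needed, then install the PTE.  Returns VM_FAULT_SIGBUS when no
 * huge page can be provided, rather than invoking the OOM killer.
 */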
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			/*
			 * No huge pages available.  So this is an OOM
			 * condition, but we do not want to trigger the OOM
			 * killer, so we return VM_FAULT_SIGBUS.
			 *
			 * A program using hugepages may fault with Bus Error
			 * because no huge pages are available in the cpuset,
			 * because of the memory policy, or because all of
			 * them are in use!
			 */
			goto out;
		}

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

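/*
 * Top-level huge page fault handler: allocate the huge PTE slot, hand
 * missing entries to hugetlb_no_page() and write faults on read-only
 * entries to hugetlb_cow().
 */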
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	entry = *ptep;
	if (pte_none(entry))
		return hugetlb_no_page(mm, vma, address, ptep, write_access);

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);

	return ret;
}

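/*
 * Helper used by the get_user_pages() path for hugetlb VMAs: walk the
 * range in PAGE_SIZE steps, faulting huge pages in as needed, and fill
 * in the pages[] and vmas[] arrays with references to the individual
 * subpages.
 */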
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}