vmalloc.c revision 2b4ac44e7c7e16cf9411b81693ff3e604f332bf1
1/*
2 *  linux/mm/vmalloc.c
3 *
4 *  Copyright (C) 1993  Linus Torvalds
5 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8 *  Numa awareness, Christoph Lameter, SGI, June 2005
9 */
10
11#include <linux/mm.h>
12#include <linux/module.h>
13#include <linux/highmem.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/interrupt.h>
17
18#include <linux/vmalloc.h>
19
20#include <asm/uaccess.h>
21#include <asm/tlbflush.h>
22
23
24DEFINE_RWLOCK(vmlist_lock);	/* rwlock taken by the code in this file around walks and updates of vmlist */
25struct vm_struct *vmlist;	/* head of the list of vm areas, chained through ->next */
26
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28			    int node);	/* forward declaration: __vmalloc_area_node() calls it before its definition */
29
30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
31{
32	pte_t *pte;
33
34	pte = pte_offset_kernel(pmd, addr);
35	do {
36		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
37		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
38	} while (pte++, addr += PAGE_SIZE, addr != end);
39}
40
41static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
42						unsigned long end)
43{
44	pmd_t *pmd;
45	unsigned long next;
46
47	pmd = pmd_offset(pud, addr);
48	do {
49		next = pmd_addr_end(addr, end);
50		if (pmd_none_or_clear_bad(pmd))
51			continue;
52		vunmap_pte_range(pmd, addr, next);
53	} while (pmd++, addr = next, addr != end);
54}
55
56static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
57						unsigned long end)
58{
59	pud_t *pud;
60	unsigned long next;
61
62	pud = pud_offset(pgd, addr);
63	do {
64		next = pud_addr_end(addr, end);
65		if (pud_none_or_clear_bad(pud))
66			continue;
67		vunmap_pmd_range(pud, addr, next);
68	} while (pud++, addr = next, addr != end);
69}
70
71void unmap_vm_area(struct vm_struct *area)
72{
73	pgd_t *pgd;
74	unsigned long next;
75	unsigned long addr = (unsigned long) area->addr;
76	unsigned long end = addr + area->size;
77
78	BUG_ON(addr >= end);
79	pgd = pgd_offset_k(addr);
80	flush_cache_vunmap(addr, end);
81	do {
82		next = pgd_addr_end(addr, end);
83		if (pgd_none_or_clear_bad(pgd))
84			continue;
85		vunmap_pud_range(pgd, addr, next);
86	} while (pgd++, addr = next, addr != end);
87	flush_tlb_kernel_range((unsigned long) area->addr, end);
88}
89
90static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
91			unsigned long end, pgprot_t prot, struct page ***pages)
92{
93	pte_t *pte;
94
95	pte = pte_alloc_kernel(pmd, addr);
96	if (!pte)
97		return -ENOMEM;
98	do {
99		struct page *page = **pages;
100		WARN_ON(!pte_none(*pte));
101		if (!page)
102			return -ENOMEM;
103		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
104		(*pages)++;
105	} while (pte++, addr += PAGE_SIZE, addr != end);
106	return 0;
107}
108
109static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
110			unsigned long end, pgprot_t prot, struct page ***pages)
111{
112	pmd_t *pmd;
113	unsigned long next;
114
115	pmd = pmd_alloc(&init_mm, pud, addr);
116	if (!pmd)
117		return -ENOMEM;
118	do {
119		next = pmd_addr_end(addr, end);
120		if (vmap_pte_range(pmd, addr, next, prot, pages))
121			return -ENOMEM;
122	} while (pmd++, addr = next, addr != end);
123	return 0;
124}
125
126static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
127			unsigned long end, pgprot_t prot, struct page ***pages)
128{
129	pud_t *pud;
130	unsigned long next;
131
132	pud = pud_alloc(&init_mm, pgd, addr);
133	if (!pud)
134		return -ENOMEM;
135	do {
136		next = pud_addr_end(addr, end);
137		if (vmap_pmd_range(pud, addr, next, prot, pages))
138			return -ENOMEM;
139	} while (pud++, addr = next, addr != end);
140	return 0;
141}
142
143int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
144{
145	pgd_t *pgd;
146	unsigned long next;
147	unsigned long addr = (unsigned long) area->addr;
148	unsigned long end = addr + area->size - PAGE_SIZE;
149	int err;
150
151	BUG_ON(addr >= end);
152	pgd = pgd_offset_k(addr);
153	do {
154		next = pgd_addr_end(addr, end);
155		err = vmap_pud_range(pgd, addr, next, prot, pages);
156		if (err)
157			break;
158	} while (pgd++, addr = next, addr != end);
159	flush_cache_vmap((unsigned long) area->addr, end);
160	return err;
161}
162
163static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
164					    unsigned long start, unsigned long end,
165					    int node, gfp_t gfp_mask)
166{
167	struct vm_struct **p, *tmp, *area;
168	unsigned long align = 1;
169	unsigned long addr;
170
171	BUG_ON(in_interrupt());
172	if (flags & VM_IOREMAP) {
173		int bit = fls(size);
174
175		if (bit > IOREMAP_MAX_ORDER)
176			bit = IOREMAP_MAX_ORDER;
177		else if (bit < PAGE_SHIFT)
178			bit = PAGE_SHIFT;
179
180		align = 1ul << bit;
181	}
182	addr = ALIGN(start, align);
183	size = PAGE_ALIGN(size);
184
185	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node);
186	if (unlikely(!area))
187		return NULL;
188
189	if (unlikely(!size))
190		return NULL;
191
192	/*
193	 * We always allocate a guard page.
194	 */
195	size += PAGE_SIZE;
196
197	write_lock(&vmlist_lock);
198	for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
199		if ((unsigned long)tmp->addr < addr) {
200			if((unsigned long)tmp->addr + tmp->size >= addr)
201				addr = ALIGN(tmp->size +
202					     (unsigned long)tmp->addr, align);
203			continue;
204		}
205		if ((size + addr) < addr)
206			goto out;
207		if (size + addr <= (unsigned long)tmp->addr)
208			goto found;
209		addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
210		if (addr > end - size)
211			goto out;
212	}
213
214found:
215	area->next = *p;
216	*p = area;
217
218	area->flags = flags;
219	area->addr = (void *)addr;
220	area->size = size;
221	area->pages = NULL;
222	area->nr_pages = 0;
223	area->phys_addr = 0;
224	write_unlock(&vmlist_lock);
225
226	return area;
227
228out:
229	write_unlock(&vmlist_lock);
230	kfree(area);
231	if (printk_ratelimit())
232		printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
233	return NULL;
234}
235
236struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
237				unsigned long start, unsigned long end)
238{
239	return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
240}
241
242/**
243 *	get_vm_area  -  reserve a contingous kernel virtual area
244 *	@size:		size of the area
245 *	@flags:		%VM_IOREMAP for I/O mappings or VM_ALLOC
246 *
247 *	Search an area of @size in the kernel virtual mapping area,
248 *	and reserved it for out purposes.  Returns the area descriptor
249 *	on success or %NULL on failure.
250 */
251struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
252{
253	return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
254}
255
256struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
257				   int node, gfp_t gfp_mask)
258{
259	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
260				  gfp_mask);
261}
262
263/* Caller must hold vmlist_lock */
264static struct vm_struct *__find_vm_area(void *addr)
265{
266	struct vm_struct *tmp;
267
268	for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
269		 if (tmp->addr == addr)
270			break;
271	}
272
273	return tmp;
274}
275
276/* Caller must hold vmlist_lock */
277static struct vm_struct *__remove_vm_area(void *addr)
278{
279	struct vm_struct **p, *tmp;
280
281	for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
282		 if (tmp->addr == addr)
283			 goto found;
284	}
285	return NULL;
286
287found:
288	unmap_vm_area(tmp);
289	*p = tmp->next;
290
291	/*
292	 * Remove the guard page.
293	 */
294	tmp->size -= PAGE_SIZE;
295	return tmp;
296}
297
298/**
299 *	remove_vm_area  -  find and remove a contingous kernel virtual area
300 *	@addr:		base address
301 *
302 *	Search for the kernel VM area starting at @addr, and remove it.
303 *	This function returns the found VM area, but using it is NOT safe
304 *	on SMP machines, except for its size or flags.
305 */
306struct vm_struct *remove_vm_area(void *addr)
307{
308	struct vm_struct *v;
309	write_lock(&vmlist_lock);
310	v = __remove_vm_area(addr);
311	write_unlock(&vmlist_lock);
312	return v;
313}
314
315void __vunmap(void *addr, int deallocate_pages)
316{
317	struct vm_struct *area;
318
319	if (!addr)
320		return;
321
322	if ((PAGE_SIZE-1) & (unsigned long)addr) {
323		printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
324		WARN_ON(1);
325		return;
326	}
327
328	area = remove_vm_area(addr);
329	if (unlikely(!area)) {
330		printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
331				addr);
332		WARN_ON(1);
333		return;
334	}
335
336	debug_check_no_locks_freed(addr, area->size);
337
338	if (deallocate_pages) {
339		int i;
340
341		for (i = 0; i < area->nr_pages; i++) {
342			BUG_ON(!area->pages[i]);
343			__free_page(area->pages[i]);
344		}
345
346		if (area->flags & VM_VPAGES)
347			vfree(area->pages);
348		else
349			kfree(area->pages);
350	}
351
352	kfree(area);
353	return;
354}
355
/**
 *	vfree  -  release memory allocated by vmalloc()
 *	@addr:		memory base address
 *
 *	Release the virtually contiguous memory area that starts at @addr
 *	and was obtained from vmalloc(), vmalloc_32() or __vmalloc(),
 *	including its backing pages.  Passing a NULL @addr does nothing.
 *
 *	Must not be called in interrupt context.
 */
void vfree(void *addr)
{
	const int dealloc = 1;	/* free the backing pages as well */

	BUG_ON(in_interrupt());
	__vunmap(addr, dealloc);
}
EXPORT_SYMBOL(vfree);
372
/**
 *	vunmap  -  release virtual mapping obtained by vmap()
 *	@addr:		memory base address
 *
 *	Tear down the virtually contiguous area starting at @addr that
 *	was created from the page array passed to vmap().  The pages
 *	themselves are not freed.
 *
 *	Must not be called in interrupt context.
 */
void vunmap(void *addr)
{
	const int dealloc = 0;	/* leave the backing pages alone */

	BUG_ON(in_interrupt());
	__vunmap(addr, dealloc);
}
EXPORT_SYMBOL(vunmap);
388
389/**
390 *	vmap  -  map an array of pages into virtually contiguous space
391 *	@pages:		array of page pointers
392 *	@count:		number of pages to map
393 *	@flags:		vm_area->flags
394 *	@prot:		page protection for the mapping
395 *
396 *	Maps @count pages from @pages into contiguous kernel virtual
397 *	space.
398 */
399void *vmap(struct page **pages, unsigned int count,
400		unsigned long flags, pgprot_t prot)
401{
402	struct vm_struct *area;
403
404	if (count > num_physpages)
405		return NULL;
406
407	area = get_vm_area((count << PAGE_SHIFT), flags);
408	if (!area)
409		return NULL;
410	if (map_vm_area(area, prot, &pages)) {
411		vunmap(area->addr);
412		return NULL;
413	}
414
415	return area->addr;
416}
417EXPORT_SYMBOL(vmap);
418
419void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
420				pgprot_t prot, int node)
421{
422	struct page **pages;
423	unsigned int nr_pages, array_size, i;
424
425	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
426	array_size = (nr_pages * sizeof(struct page *));
427
428	area->nr_pages = nr_pages;
429	/* Please note that the recursion is strictly bounded. */
430	if (array_size > PAGE_SIZE) {
431		pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
432		area->flags |= VM_VPAGES;
433	} else {
434		pages = kmalloc_node(array_size,
435				(gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
436				node);
437	}
438	area->pages = pages;
439	if (!area->pages) {
440		remove_vm_area(area->addr);
441		kfree(area);
442		return NULL;
443	}
444	memset(area->pages, 0, array_size);
445
446	for (i = 0; i < area->nr_pages; i++) {
447		if (node < 0)
448			area->pages[i] = alloc_page(gfp_mask);
449		else
450			area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
451		if (unlikely(!area->pages[i])) {
452			/* Successfully allocated i pages, free them in __vunmap() */
453			area->nr_pages = i;
454			goto fail;
455		}
456	}
457
458	if (map_vm_area(area, prot, &pages))
459		goto fail;
460	return area->addr;
461
462fail:
463	vfree(area->addr);
464	return NULL;
465}
466
467void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
468{
469	return __vmalloc_area_node(area, gfp_mask, prot, -1);
470}
471
472/**
473 *	__vmalloc_node  -  allocate virtually contiguous memory
474 *	@size:		allocation size
475 *	@gfp_mask:	flags for the page level allocator
476 *	@prot:		protection mask for the allocated pages
477 *	@node:		node to use for allocation or -1
478 *
479 *	Allocate enough pages to cover @size from the page level
480 *	allocator with @gfp_mask flags.  Map them into contiguous
481 *	kernel virtual space, using a pagetable protection of @prot.
482 */
483static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
484			    int node)
485{
486	struct vm_struct *area;
487
488	size = PAGE_ALIGN(size);
489	if (!size || (size >> PAGE_SHIFT) > num_physpages)
490		return NULL;
491
492	area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
493	if (!area)
494		return NULL;
495
496	return __vmalloc_area_node(area, gfp_mask, prot, node);
497}
498
499void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
500{
501	return __vmalloc_node(size, gfp_mask, prot, -1);
502}
503EXPORT_SYMBOL(__vmalloc);
504
505/**
506 *	vmalloc  -  allocate virtually contiguous memory
507 *	@size:		allocation size
508 *	Allocate enough pages to cover @size from the page level
509 *	allocator and map them into contiguous kernel virtual space.
510 *
511 *	For tight control over page level allocator and protection flags
512 *	use __vmalloc() instead.
513 */
514void *vmalloc(unsigned long size)
515{
516	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
517}
518EXPORT_SYMBOL(vmalloc);
519
520/**
521 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
522 * @size: allocation size
523 *
524 * The resulting memory area is zeroed so it can be mapped to userspace
525 * without leaking data.
526 */
527void *vmalloc_user(unsigned long size)
528{
529	struct vm_struct *area;
530	void *ret;
531
532	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
533	if (ret) {
534		write_lock(&vmlist_lock);
535		area = __find_vm_area(ret);
536		area->flags |= VM_USERMAP;
537		write_unlock(&vmlist_lock);
538	}
539	return ret;
540}
541EXPORT_SYMBOL(vmalloc_user);
542
543/**
544 *	vmalloc_node  -  allocate memory on a specific node
545 *	@size:		allocation size
546 *	@node:		numa node
547 *
548 *	Allocate enough pages to cover @size from the page level
549 *	allocator and map them into contiguous kernel virtual space.
550 *
551 *	For tight control over page level allocator and protection flags
552 *	use __vmalloc() instead.
553 */
554void *vmalloc_node(unsigned long size, int node)
555{
556	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
557}
558EXPORT_SYMBOL(vmalloc_node);
559
560#ifndef PAGE_KERNEL_EXEC
561# define PAGE_KERNEL_EXEC PAGE_KERNEL
562#endif
563
564/**
565 *	vmalloc_exec  -  allocate virtually contiguous, executable memory
566 *	@size:		allocation size
567 *
568 *	Kernel-internal function to allocate enough pages to cover @size
569 *	the page level allocator and map them into contiguous and
570 *	executable kernel virtual space.
571 *
572 *	For tight control over page level allocator and protection flags
573 *	use __vmalloc() instead.
574 */
575
576void *vmalloc_exec(unsigned long size)
577{
578	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
579}
580
581/**
582 *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
583 *	@size:		allocation size
584 *
585 *	Allocate enough 32bit PA addressable pages to cover @size from the
586 *	page level allocator and map them into contiguous kernel virtual space.
587 */
588void *vmalloc_32(unsigned long size)
589{
590	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
591}
592EXPORT_SYMBOL(vmalloc_32);
593
594/**
595 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
596 *	@size:		allocation size
597 *
598 * The resulting memory area is 32bit addressable and zeroed so it can be
599 * mapped to userspace without leaking data.
600 */
601void *vmalloc_32_user(unsigned long size)
602{
603	struct vm_struct *area;
604	void *ret;
605
606	ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
607	if (ret) {
608		write_lock(&vmlist_lock);
609		area = __find_vm_area(ret);
610		area->flags |= VM_USERMAP;
611		write_unlock(&vmlist_lock);
612	}
613	return ret;
614}
615EXPORT_SYMBOL(vmalloc_32_user);
616
617long vread(char *buf, char *addr, unsigned long count)
618{
619	struct vm_struct *tmp;
620	char *vaddr, *buf_start = buf;
621	unsigned long n;
622
623	/* Don't allow overflow */
624	if ((unsigned long) addr + count < count)
625		count = -(unsigned long) addr;
626
627	read_lock(&vmlist_lock);
628	for (tmp = vmlist; tmp; tmp = tmp->next) {
629		vaddr = (char *) tmp->addr;
630		if (addr >= vaddr + tmp->size - PAGE_SIZE)
631			continue;
632		while (addr < vaddr) {
633			if (count == 0)
634				goto finished;
635			*buf = '\0';
636			buf++;
637			addr++;
638			count--;
639		}
640		n = vaddr + tmp->size - PAGE_SIZE - addr;
641		do {
642			if (count == 0)
643				goto finished;
644			*buf = *addr;
645			buf++;
646			addr++;
647			count--;
648		} while (--n > 0);
649	}
650finished:
651	read_unlock(&vmlist_lock);
652	return buf - buf_start;
653}
654
655long vwrite(char *buf, char *addr, unsigned long count)
656{
657	struct vm_struct *tmp;
658	char *vaddr, *buf_start = buf;
659	unsigned long n;
660
661	/* Don't allow overflow */
662	if ((unsigned long) addr + count < count)
663		count = -(unsigned long) addr;
664
665	read_lock(&vmlist_lock);
666	for (tmp = vmlist; tmp; tmp = tmp->next) {
667		vaddr = (char *) tmp->addr;
668		if (addr >= vaddr + tmp->size - PAGE_SIZE)
669			continue;
670		while (addr < vaddr) {
671			if (count == 0)
672				goto finished;
673			buf++;
674			addr++;
675			count--;
676		}
677		n = vaddr + tmp->size - PAGE_SIZE - addr;
678		do {
679			if (count == 0)
680				goto finished;
681			*addr = *buf;
682			buf++;
683			addr++;
684			count--;
685		} while (--n > 0);
686	}
687finished:
688	read_unlock(&vmlist_lock);
689	return buf - buf_start;
690}
691
692/**
693 *	remap_vmalloc_range  -  map vmalloc pages to userspace
694 *	@vma:		vma to cover (map full range of vma)
695 *	@addr:		vmalloc memory
696 *	@pgoff:		number of pages into addr before first page to map
697 *	@returns:	0 for success, -Exxx on failure
698 *
699 *	This function checks that addr is a valid vmalloc'ed area, and
700 *	that it is big enough to cover the vma. Will return failure if
701 *	that criteria isn't met.
702 *
703 *	Similar to remap_pfn_range (see mm/memory.c)
704 */
705int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
706						unsigned long pgoff)
707{
708	struct vm_struct *area;
709	unsigned long uaddr = vma->vm_start;
710	unsigned long usize = vma->vm_end - vma->vm_start;
711	int ret;
712
713	if ((PAGE_SIZE-1) & (unsigned long)addr)
714		return -EINVAL;
715
716	read_lock(&vmlist_lock);
717	area = __find_vm_area(addr);
718	if (!area)
719		goto out_einval_locked;
720
721	if (!(area->flags & VM_USERMAP))
722		goto out_einval_locked;
723
724	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
725		goto out_einval_locked;
726	read_unlock(&vmlist_lock);
727
728	addr += pgoff << PAGE_SHIFT;
729	do {
730		struct page *page = vmalloc_to_page(addr);
731		ret = vm_insert_page(vma, uaddr, page);
732		if (ret)
733			return ret;
734
735		uaddr += PAGE_SIZE;
736		addr += PAGE_SIZE;
737		usize -= PAGE_SIZE;
738	} while (usize > 0);
739
740	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
741	vma->vm_flags |= VM_RESERVED;
742
743	return ret;
744
745out_einval_locked:
746	read_unlock(&vmlist_lock);
747	return -EINVAL;
748}
749EXPORT_SYMBOL(remap_vmalloc_range);
750
751