vmalloc.c revision c1279c4ef37a06ba708e6b1f6fd98b45c52770f6
1/*
2 *  linux/mm/vmalloc.c
3 *
4 *  Copyright (C) 1993  Linus Torvalds
5 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8 *  Numa awareness, Christoph Lameter, SGI, June 2005
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/mm.h>
13#include <linux/module.h>
14#include <linux/highmem.h>
15#include <linux/slab.h>
16#include <linux/spinlock.h>
17#include <linux/interrupt.h>
18#include <linux/proc_fs.h>
19#include <linux/seq_file.h>
20#include <linux/debugobjects.h>
21#include <linux/kallsyms.h>
22#include <linux/list.h>
23#include <linux/rbtree.h>
24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h>
26
27#include <asm/atomic.h>
28#include <asm/uaccess.h>
29#include <asm/tlbflush.h>
30
31
32/*** Page table manipulation functions ***/
33
34static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
35{
36	pte_t *pte;
37
38	pte = pte_offset_kernel(pmd, addr);
39	do {
40		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
41		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
42	} while (pte++, addr += PAGE_SIZE, addr != end);
43}
44
45static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
46{
47	pmd_t *pmd;
48	unsigned long next;
49
50	pmd = pmd_offset(pud, addr);
51	do {
52		next = pmd_addr_end(addr, end);
53		if (pmd_none_or_clear_bad(pmd))
54			continue;
55		vunmap_pte_range(pmd, addr, next);
56	} while (pmd++, addr = next, addr != end);
57}
58
59static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
60{
61	pud_t *pud;
62	unsigned long next;
63
64	pud = pud_offset(pgd, addr);
65	do {
66		next = pud_addr_end(addr, end);
67		if (pud_none_or_clear_bad(pud))
68			continue;
69		vunmap_pmd_range(pud, addr, next);
70	} while (pud++, addr = next, addr != end);
71}
72
73static void vunmap_page_range(unsigned long addr, unsigned long end)
74{
75	pgd_t *pgd;
76	unsigned long next;
77
78	BUG_ON(addr >= end);
79	pgd = pgd_offset_k(addr);
80	do {
81		next = pgd_addr_end(addr, end);
82		if (pgd_none_or_clear_bad(pgd))
83			continue;
84		vunmap_pud_range(pgd, addr, next);
85	} while (pgd++, addr = next, addr != end);
86}
87
88static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
89		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
90{
91	pte_t *pte;
92
93	/*
94	 * nr is a running index into the array which helps higher level
95	 * callers keep track of where we're up to.
96	 */
97
98	pte = pte_alloc_kernel(pmd, addr);
99	if (!pte)
100		return -ENOMEM;
101	do {
102		struct page *page = pages[*nr];
103
104		if (WARN_ON(!pte_none(*pte)))
105			return -EBUSY;
106		if (WARN_ON(!page))
107			return -ENOMEM;
108		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
109		(*nr)++;
110	} while (pte++, addr += PAGE_SIZE, addr != end);
111	return 0;
112}
113
114static int vmap_pmd_range(pud_t *pud, unsigned long addr,
115		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
116{
117	pmd_t *pmd;
118	unsigned long next;
119
120	pmd = pmd_alloc(&init_mm, pud, addr);
121	if (!pmd)
122		return -ENOMEM;
123	do {
124		next = pmd_addr_end(addr, end);
125		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
126			return -ENOMEM;
127	} while (pmd++, addr = next, addr != end);
128	return 0;
129}
130
131static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
132		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
133{
134	pud_t *pud;
135	unsigned long next;
136
137	pud = pud_alloc(&init_mm, pgd, addr);
138	if (!pud)
139		return -ENOMEM;
140	do {
141		next = pud_addr_end(addr, end);
142		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
143			return -ENOMEM;
144	} while (pud++, addr = next, addr != end);
145	return 0;
146}
147
148/*
149 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
150 * will have pfns corresponding to the "pages" array.
151 *
152 * I.e. the pte at addr+N*PAGE_SIZE shall point to the pfn corresponding to pages[N]
153 */
154static int vmap_page_range(unsigned long start, unsigned long end,
155				pgprot_t prot, struct page **pages)
156{
157	pgd_t *pgd;
158	unsigned long next;
159	unsigned long addr = start;
160	int err = 0;
161	int nr = 0;
162
163	BUG_ON(addr >= end);
164	pgd = pgd_offset_k(addr);
165	do {
166		next = pgd_addr_end(addr, end);
167		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
168		if (err)
169			break;
170	} while (pgd++, addr = next, addr != end);
171	flush_cache_vmap(start, end);
172
173	if (unlikely(err))
174		return err;
175	return nr;
176}
177
178static inline int is_vmalloc_or_module_addr(const void *x)
179{
180	/*
181	 * ARM, x86-64 and sparc64 put modules in a special place,
182	 * and fall back on vmalloc() if that fails. Others
183	 * just put them in the vmalloc space.
184	 */
185#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
186	unsigned long addr = (unsigned long)x;
187	if (addr >= MODULES_VADDR && addr < MODULES_END)
188		return 1;
189#endif
190	return is_vmalloc_addr(x);
191}
192
193/*
194 * Walk a vmap address to the struct page it maps.
195 */
196struct page *vmalloc_to_page(const void *vmalloc_addr)
197{
198	unsigned long addr = (unsigned long) vmalloc_addr;
199	struct page *page = NULL;
200	pgd_t *pgd = pgd_offset_k(addr);
201
202	/*
203	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
204	 * architectures that do not vmalloc module space
205	 */
206	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
207
208	if (!pgd_none(*pgd)) {
209		pud_t *pud = pud_offset(pgd, addr);
210		if (!pud_none(*pud)) {
211			pmd_t *pmd = pmd_offset(pud, addr);
212			if (!pmd_none(*pmd)) {
213				pte_t *ptep, pte;
214
215				ptep = pte_offset_map(pmd, addr);
216				pte = *ptep;
217				if (pte_present(pte))
218					page = pte_page(pte);
219				pte_unmap(ptep);
220			}
221		}
222	}
223	return page;
224}
225EXPORT_SYMBOL(vmalloc_to_page);
226
227/*
228 * Map a vmalloc()-space virtual address to the physical page frame number.
229 */
230unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
231{
232	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
233}
234EXPORT_SYMBOL(vmalloc_to_pfn);
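/*
 * Example (editorial sketch, not part of this revision): resolving the
 * backing page of a vmalloc'ed buffer.  "buf" and its size are assumptions
 * made purely for illustration.
 *
 *	void *buf = vmalloc(4 * PAGE_SIZE);
 *
 *	if (buf) {
 *		struct page *page = vmalloc_to_page(buf);
 *		unsigned long pfn = vmalloc_to_pfn(buf + PAGE_SIZE);
 *
 *		pr_debug("first page %p, second pfn %lu\n", page, pfn);
 *		vfree(buf);
 *	}
 */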
235
236
237/*** Global kva allocator ***/
238
239#define VM_LAZY_FREE	0x01
240#define VM_LAZY_FREEING	0x02
241#define VM_VM_AREA	0x04
242
243struct vmap_area {
244	unsigned long va_start;
245	unsigned long va_end;
246	unsigned long flags;
247	struct rb_node rb_node;		/* address sorted rbtree */
248	struct list_head list;		/* address sorted list */
249	struct list_head purge_list;	/* "lazy purge" list */
250	void *private;
251	struct rcu_head rcu_head;
252};
253
254static DEFINE_SPINLOCK(vmap_area_lock);
255static struct rb_root vmap_area_root = RB_ROOT;
256static LIST_HEAD(vmap_area_list);
257
258static struct vmap_area *__find_vmap_area(unsigned long addr)
259{
260	struct rb_node *n = vmap_area_root.rb_node;
261
262	while (n) {
263		struct vmap_area *va;
264
265		va = rb_entry(n, struct vmap_area, rb_node);
266		if (addr < va->va_start)
267			n = n->rb_left;
268		else if (addr > va->va_start)
269			n = n->rb_right;
270		else
271			return va;
272	}
273
274	return NULL;
275}
276
277static void __insert_vmap_area(struct vmap_area *va)
278{
279	struct rb_node **p = &vmap_area_root.rb_node;
280	struct rb_node *parent = NULL;
281	struct rb_node *tmp;
282
283	while (*p) {
284		struct vmap_area *tmp;
285
286		parent = *p;
287		tmp = rb_entry(parent, struct vmap_area, rb_node);
288		if (va->va_start < tmp->va_end)
289			p = &(*p)->rb_left;
290		else if (va->va_end > tmp->va_start)
291			p = &(*p)->rb_right;
292		else
293			BUG();
294	}
295
296	rb_link_node(&va->rb_node, parent, p);
297	rb_insert_color(&va->rb_node, &vmap_area_root);
298
299	/* address-sort this list so it is usable like the vmlist */
300	tmp = rb_prev(&va->rb_node);
301	if (tmp) {
302		struct vmap_area *prev;
303		prev = rb_entry(tmp, struct vmap_area, rb_node);
304		list_add_rcu(&va->list, &prev->list);
305	} else
306		list_add_rcu(&va->list, &vmap_area_list);
307}
308
309static void purge_vmap_area_lazy(void);
310
311/*
312 * Allocate a region of KVA of the specified size and alignment, within the
313 * vstart and vend.
314 */
315static struct vmap_area *alloc_vmap_area(unsigned long size,
316				unsigned long align,
317				unsigned long vstart, unsigned long vend,
318				int node, gfp_t gfp_mask)
319{
320	struct vmap_area *va;
321	struct rb_node *n;
322	unsigned long addr;
323	int purged = 0;
324
325	BUG_ON(size & ~PAGE_MASK);
326
327	va = kmalloc_node(sizeof(struct vmap_area),
328			gfp_mask & GFP_RECLAIM_MASK, node);
329	if (unlikely(!va))
330		return ERR_PTR(-ENOMEM);
331
332retry:
333	addr = ALIGN(vstart, align);
334
335	spin_lock(&vmap_area_lock);
336	/* XXX: could have a last_hole cache */
337	n = vmap_area_root.rb_node;
338	if (n) {
339		struct vmap_area *first = NULL;
340
341		do {
342			struct vmap_area *tmp;
343			tmp = rb_entry(n, struct vmap_area, rb_node);
344			if (tmp->va_end >= addr) {
345				if (!first && tmp->va_start < addr + size)
346					first = tmp;
347				n = n->rb_left;
348			} else {
349				first = tmp;
350				n = n->rb_right;
351			}
352		} while (n);
353
354		if (!first)
355			goto found;
356
357		if (first->va_end < addr) {
358			n = rb_next(&first->rb_node);
359			if (n)
360				first = rb_entry(n, struct vmap_area, rb_node);
361			else
362				goto found;
363		}
364
365		while (addr + size > first->va_start && addr + size <= vend) {
366			addr = ALIGN(first->va_end + PAGE_SIZE, align);
367
368			n = rb_next(&first->rb_node);
369			if (n)
370				first = rb_entry(n, struct vmap_area, rb_node);
371			else
372				goto found;
373		}
374	}
375found:
376	if (addr + size > vend) {
377		spin_unlock(&vmap_area_lock);
378		if (!purged) {
379			purge_vmap_area_lazy();
380			purged = 1;
381			goto retry;
382		}
383		if (printk_ratelimit())
384			printk(KERN_WARNING
385				"vmap allocation for size %lu failed: "
386				"use vmalloc=<size> to increase size.\n", size);
387		return ERR_PTR(-EBUSY);
388	}
389
390	BUG_ON(addr & (align-1));
391
392	va->va_start = addr;
393	va->va_end = addr + size;
394	va->flags = 0;
395	__insert_vmap_area(va);
396	spin_unlock(&vmap_area_lock);
397
398	return va;
399}
400
401static void rcu_free_va(struct rcu_head *head)
402{
403	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
404
405	kfree(va);
406}
407
408static void __free_vmap_area(struct vmap_area *va)
409{
410	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
411	rb_erase(&va->rb_node, &vmap_area_root);
412	RB_CLEAR_NODE(&va->rb_node);
413	list_del_rcu(&va->list);
414
415	call_rcu(&va->rcu_head, rcu_free_va);
416}
417
418/*
419 * Free a region of KVA allocated by alloc_vmap_area
420 */
421static void free_vmap_area(struct vmap_area *va)
422{
423	spin_lock(&vmap_area_lock);
424	__free_vmap_area(va);
425	spin_unlock(&vmap_area_lock);
426}
427
428/*
429 * Clear the pagetable entries of a given vmap_area
430 */
431static void unmap_vmap_area(struct vmap_area *va)
432{
433	vunmap_page_range(va->va_start, va->va_end);
434}
435
436/*
437 * lazy_max_pages is the maximum amount of virtual address space we gather up
438 * before attempting to purge with a TLB flush.
439 *
440 * There is a tradeoff here: a larger number will cover more kernel page tables
441 * and take slightly longer to purge, but it will linearly reduce the number of
442 * global TLB flushes that must be performed. It would seem natural to scale
443 * this number up linearly with the number of CPUs (because vmapping activity
444 * could also scale linearly with the number of CPUs), however it is likely
445 * that in practice, workloads might be constrained in other ways that mean
446 * vmap activity will not scale linearly with CPUs. Also, I want to be
447 * conservative and not introduce a big latency on huge systems, so go with
448 * a less aggressive log scale. It will still be an improvement over the old
449 * code, and it will be simple to change the scale factor if we find that it
450 * becomes a problem on bigger systems.
451 */
452static unsigned long lazy_max_pages(void)
453{
454	unsigned int log;
455
456	log = fls(num_online_cpus());
457
458	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
459}
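/*
 * Editorial worked example of the log scale above: with 4 online CPUs,
 * fls(4) == 3, so up to 3 * 32MB (3 * 8192 pages with 4K pages, i.e. 96MB)
 * of lazily freed virtual address space may accumulate before a purge and
 * its global TLB flush are triggered.
 */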
460
461static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
462
463/*
464 * Purges all lazily-freed vmap areas.
465 *
466 * If sync is 0 then don't purge if there is already a purge in progress.
467 * If force_flush is 1, then flush kernel TLBs between *start and *end even
468 * if we found no lazy vmap areas to unmap (callers can use this to optimise
469 * their own TLB flushing).
470 * Returns with *start = min(*start, lowest purged address)
471 *              *end = max(*end, highest purged address)
472 */
473static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
474					int sync, int force_flush)
475{
476	static DEFINE_SPINLOCK(purge_lock);
477	LIST_HEAD(valist);
478	struct vmap_area *va;
479	int nr = 0;
480
481	/*
482	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
483	 * should not expect such behaviour. This just simplifies locking for
484	 * the case that isn't actually used at the moment anyway.
485	 */
486	if (!sync && !force_flush) {
487		if (!spin_trylock(&purge_lock))
488			return;
489	} else
490		spin_lock(&purge_lock);
491
492	rcu_read_lock();
493	list_for_each_entry_rcu(va, &vmap_area_list, list) {
494		if (va->flags & VM_LAZY_FREE) {
495			if (va->va_start < *start)
496				*start = va->va_start;
497			if (va->va_end > *end)
498				*end = va->va_end;
499			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
500			unmap_vmap_area(va);
501			list_add_tail(&va->purge_list, &valist);
502			va->flags |= VM_LAZY_FREEING;
503			va->flags &= ~VM_LAZY_FREE;
504		}
505	}
506	rcu_read_unlock();
507
508	if (nr) {
509		BUG_ON(nr > atomic_read(&vmap_lazy_nr));
510		atomic_sub(nr, &vmap_lazy_nr);
511	}
512
513	if (nr || force_flush)
514		flush_tlb_kernel_range(*start, *end);
515
516	if (nr) {
517		spin_lock(&vmap_area_lock);
518		list_for_each_entry(va, &valist, purge_list)
519			__free_vmap_area(va);
520		spin_unlock(&vmap_area_lock);
521	}
522	spin_unlock(&purge_lock);
523}
524
525/*
526 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
527 * is already purging.
528 */
529static void try_purge_vmap_area_lazy(void)
530{
531	unsigned long start = ULONG_MAX, end = 0;
532
533	__purge_vmap_area_lazy(&start, &end, 0, 0);
534}
535
536/*
537 * Kick off a purge of the outstanding lazy areas.
538 */
539static void purge_vmap_area_lazy(void)
540{
541	unsigned long start = ULONG_MAX, end = 0;
542
543	__purge_vmap_area_lazy(&start, &end, 1, 0);
544}
545
546/*
547 * Free and unmap a vmap area; the caller must ensure flush_cache_vunmap()
548 * has already been called for the correct range.
549 */
550static void free_unmap_vmap_area_noflush(struct vmap_area *va)
551{
552	va->flags |= VM_LAZY_FREE;
553	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
554	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
555		try_purge_vmap_area_lazy();
556}
557
558/*
559 * Free and unmap a vmap area
560 */
561static void free_unmap_vmap_area(struct vmap_area *va)
562{
563	flush_cache_vunmap(va->va_start, va->va_end);
564	free_unmap_vmap_area_noflush(va);
565}
566
567static struct vmap_area *find_vmap_area(unsigned long addr)
568{
569	struct vmap_area *va;
570
571	spin_lock(&vmap_area_lock);
572	va = __find_vmap_area(addr);
573	spin_unlock(&vmap_area_lock);
574
575	return va;
576}
577
578static void free_unmap_vmap_area_addr(unsigned long addr)
579{
580	struct vmap_area *va;
581
582	va = find_vmap_area(addr);
583	BUG_ON(!va);
584	free_unmap_vmap_area(va);
585}
586
587
588/*** Per cpu kva allocator ***/
589
590/*
591 * vmap space is limited especially on 32 bit architectures. Ensure there is
592 * room for at least 16 percpu vmap blocks per CPU.
593 */
594/*
595 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
596 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
597 * instead (we just need a rough idea)
598 */
599#if BITS_PER_LONG == 32
600#define VMALLOC_SPACE		(128UL*1024*1024)
601#else
602#define VMALLOC_SPACE		(128UL*1024*1024*1024)
603#endif
604
605#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
606#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 128K/256K with 4K pages (32/64-bit) */
607#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
608#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
609#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
610#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
611#define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,		\
612					VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
613						VMALLOC_PAGES / NR_CPUS / 16))
614
615#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
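/*
 * Editorial sizing example: on a 64-bit kernel with 4K pages and
 * NR_CPUS == 64, VMALLOC_PAGES / NR_CPUS / 16 evaluates to 32768, which is
 * clamped to VMAP_BBMAP_BITS_MAX, so VMAP_BBMAP_BITS == 1024 and each vmap
 * block covers VMAP_BLOCK_SIZE == 4MB of address space.
 */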
616
617static bool vmap_initialized __read_mostly = false;
618
619struct vmap_block_queue {
620	spinlock_t lock;
621	struct list_head free;
622	struct list_head dirty;
623	unsigned int nr_dirty;
624};
625
626struct vmap_block {
627	spinlock_t lock;
628	struct vmap_area *va;
629	struct vmap_block_queue *vbq;
630	unsigned long free, dirty;
631	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
632	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
633	union {
634		struct {
635			struct list_head free_list;
636			struct list_head dirty_list;
637		};
638		struct rcu_head rcu_head;
639	};
640};
641
642/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
643static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
644
645/*
646 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
647 * in the free path. Could get rid of this if we change the API to return a
648 * "cookie" from alloc, to be passed to free. But no big deal yet.
649 */
650static DEFINE_SPINLOCK(vmap_block_tree_lock);
651static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
652
653/*
654 * We should probably have a fallback mechanism to allocate virtual memory
655 * out of partially filled vmap blocks. However vmap block sizing should be
656 * fairly reasonable according to the vmalloc size, so it shouldn't be a
657 * big problem.
658 */
659
660static unsigned long addr_to_vb_idx(unsigned long addr)
661{
662	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
663	addr /= VMAP_BLOCK_SIZE;
664	return addr;
665}
666
667static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
668{
669	struct vmap_block_queue *vbq;
670	struct vmap_block *vb;
671	struct vmap_area *va;
672	unsigned long vb_idx;
673	int node, err;
674
675	node = numa_node_id();
676
677	vb = kmalloc_node(sizeof(struct vmap_block),
678			gfp_mask & GFP_RECLAIM_MASK, node);
679	if (unlikely(!vb))
680		return ERR_PTR(-ENOMEM);
681
682	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
683					VMALLOC_START, VMALLOC_END,
684					node, gfp_mask);
685	if (unlikely(IS_ERR(va))) {
686		kfree(vb);
687		return ERR_PTR(PTR_ERR(va));
688	}
689
690	err = radix_tree_preload(gfp_mask);
691	if (unlikely(err)) {
692		kfree(vb);
693		free_vmap_area(va);
694		return ERR_PTR(err);
695	}
696
697	spin_lock_init(&vb->lock);
698	vb->va = va;
699	vb->free = VMAP_BBMAP_BITS;
700	vb->dirty = 0;
701	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
702	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
703	INIT_LIST_HEAD(&vb->free_list);
704	INIT_LIST_HEAD(&vb->dirty_list);
705
706	vb_idx = addr_to_vb_idx(va->va_start);
707	spin_lock(&vmap_block_tree_lock);
708	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
709	spin_unlock(&vmap_block_tree_lock);
710	BUG_ON(err);
711	radix_tree_preload_end();
712
713	vbq = &get_cpu_var(vmap_block_queue);
714	vb->vbq = vbq;
715	spin_lock(&vbq->lock);
716	list_add(&vb->free_list, &vbq->free);
717	spin_unlock(&vbq->lock);
718	put_cpu_var(vmap_block_queue);
719
720	return vb;
721}
722
723static void rcu_free_vb(struct rcu_head *head)
724{
725	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
726
727	kfree(vb);
728}
729
730static void free_vmap_block(struct vmap_block *vb)
731{
732	struct vmap_block *tmp;
733	unsigned long vb_idx;
734
735	spin_lock(&vb->vbq->lock);
736	if (!list_empty(&vb->free_list))
737		list_del(&vb->free_list);
738	if (!list_empty(&vb->dirty_list))
739		list_del(&vb->dirty_list);
740	spin_unlock(&vb->vbq->lock);
741
742	vb_idx = addr_to_vb_idx(vb->va->va_start);
743	spin_lock(&vmap_block_tree_lock);
744	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
745	spin_unlock(&vmap_block_tree_lock);
746	BUG_ON(tmp != vb);
747
748	free_unmap_vmap_area_noflush(vb->va);
749	call_rcu(&vb->rcu_head, rcu_free_vb);
750}
751
752static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
753{
754	struct vmap_block_queue *vbq;
755	struct vmap_block *vb;
756	unsigned long addr = 0;
757	unsigned int order;
758
759	BUG_ON(size & ~PAGE_MASK);
760	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
761	order = get_order(size);
762
763again:
764	rcu_read_lock();
765	vbq = &get_cpu_var(vmap_block_queue);
766	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
767		int i;
768
769		spin_lock(&vb->lock);
770		i = bitmap_find_free_region(vb->alloc_map,
771						VMAP_BBMAP_BITS, order);
772
773		if (i >= 0) {
774			addr = vb->va->va_start + (i << PAGE_SHIFT);
775			BUG_ON(addr_to_vb_idx(addr) !=
776					addr_to_vb_idx(vb->va->va_start));
777			vb->free -= 1UL << order;
778			if (vb->free == 0) {
779				spin_lock(&vbq->lock);
780				list_del_init(&vb->free_list);
781				spin_unlock(&vbq->lock);
782			}
783			spin_unlock(&vb->lock);
784			break;
785		}
786		spin_unlock(&vb->lock);
787	}
788	put_cpu_var(vmap_block_queue);
789	rcu_read_unlock();
790
791	if (!addr) {
792		vb = new_vmap_block(gfp_mask);
793		if (IS_ERR(vb))
794			return vb;
795		goto again;
796	}
797
798	return (void *)addr;
799}
800
801static void vb_free(const void *addr, unsigned long size)
802{
803	unsigned long offset;
804	unsigned long vb_idx;
805	unsigned int order;
806	struct vmap_block *vb;
807
808	BUG_ON(size & ~PAGE_MASK);
809	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
810
811	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
812
813	order = get_order(size);
814
815	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
816
817	vb_idx = addr_to_vb_idx((unsigned long)addr);
818	rcu_read_lock();
819	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
820	rcu_read_unlock();
821	BUG_ON(!vb);
822
823	spin_lock(&vb->lock);
824	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
825	if (!vb->dirty) {
826		spin_lock(&vb->vbq->lock);
827		list_add(&vb->dirty_list, &vb->vbq->dirty);
828		spin_unlock(&vb->vbq->lock);
829	}
830	vb->dirty += 1UL << order;
831	if (vb->dirty == VMAP_BBMAP_BITS) {
832		BUG_ON(vb->free || !list_empty(&vb->free_list));
833		spin_unlock(&vb->lock);
834		free_vmap_block(vb);
835	} else
836		spin_unlock(&vb->lock);
837}
838
839/**
840 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
841 *
842 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
843 * to amortize TLB flushing overheads. What this means is that any page you
844 * have now may, in a former life, have been mapped into a kernel virtual
845 * address by the vmap layer, so there might be some CPUs with TLB entries
846 * still referencing that page (in addition to the regular 1:1 kernel mapping).
847 *
848 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
849 * be sure that none of the pages we have control over will have any aliases
850 * from the vmap layer.
851 */
852void vm_unmap_aliases(void)
853{
854	unsigned long start = ULONG_MAX, end = 0;
855	int cpu;
856	int flush = 0;
857
858	if (unlikely(!vmap_initialized))
859		return;
860
861	for_each_possible_cpu(cpu) {
862		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
863		struct vmap_block *vb;
864
865		rcu_read_lock();
866		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
867			int i;
868
869			spin_lock(&vb->lock);
870			i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
871			while (i < VMAP_BBMAP_BITS) {
872				unsigned long s, e;
873				int j;
874				j = find_next_zero_bit(vb->dirty_map,
875					VMAP_BBMAP_BITS, i);
876
877				s = vb->va->va_start + (i << PAGE_SHIFT);
878				e = vb->va->va_start + (j << PAGE_SHIFT);
879				vunmap_page_range(s, e);
880				flush = 1;
881
882				if (s < start)
883					start = s;
884				if (e > end)
885					end = e;
886
887				i = j;
888				i = find_next_bit(vb->dirty_map,
889							VMAP_BBMAP_BITS, i);
890			}
891			spin_unlock(&vb->lock);
892		}
893		rcu_read_unlock();
894	}
895
896	__purge_vmap_area_lazy(&start, &end, 1, flush);
897}
898EXPORT_SYMBOL_GPL(vm_unmap_aliases);
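/*
 * Editorial usage note (not part of this revision): a typical caller is
 * about to change the attributes or ownership of pages that may once have
 * been vmapped, e.g. (set_memory_uc() is an assumed, x86-specific helper):
 *
 *	vm_unmap_aliases();
 *	set_memory_uc((unsigned long)page_address(page), 1);
 *
 * so that no stale lazily-unmapped kernel aliases of the page remain in
 * any TLB.
 */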
899
900/**
901 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
902 * @mem: the pointer returned by vm_map_ram
903 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
904 */
905void vm_unmap_ram(const void *mem, unsigned int count)
906{
907	unsigned long size = count << PAGE_SHIFT;
908	unsigned long addr = (unsigned long)mem;
909
910	BUG_ON(!addr);
911	BUG_ON(addr < VMALLOC_START);
912	BUG_ON(addr > VMALLOC_END);
913	BUG_ON(addr & (PAGE_SIZE-1));
914
915	debug_check_no_locks_freed(mem, size);
916
917	if (likely(count <= VMAP_MAX_ALLOC))
918		vb_free(mem, size);
919	else
920		free_unmap_vmap_area_addr(addr);
921}
922EXPORT_SYMBOL(vm_unmap_ram);
923
924/**
925 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
926 * @pages: an array of pointers to the pages to be mapped
927 * @count: number of pages
928 * @node: prefer to allocate data structures on this node
929 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
930 *
931 * Returns: a pointer to the address that has been mapped, or %NULL on failure
932 */
933void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
934{
935	unsigned long size = count << PAGE_SHIFT;
936	unsigned long addr;
937	void *mem;
938
939	if (likely(count <= VMAP_MAX_ALLOC)) {
940		mem = vb_alloc(size, GFP_KERNEL);
941		if (IS_ERR(mem))
942			return NULL;
943		addr = (unsigned long)mem;
944	} else {
945		struct vmap_area *va;
946		va = alloc_vmap_area(size, PAGE_SIZE,
947				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
948		if (IS_ERR(va))
949			return NULL;
950
951		addr = va->va_start;
952		mem = (void *)addr;
953	}
954	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
955		vm_unmap_ram(mem, count);
956		return NULL;
957	}
958	return mem;
959}
960EXPORT_SYMBOL(vm_map_ram);
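/*
 * Example (editorial sketch, not part of this revision): a transient
 * linear mapping of a small page array via the per-cpu block allocator.
 * The "pages" array is an assumption for illustration; the count passed to
 * vm_unmap_ram() must match the one passed to vm_map_ram().
 *
 *	void *addr = vm_map_ram(pages, 8, -1, PAGE_KERNEL);
 *
 *	if (addr) {
 *		memset(addr, 0, 8 * PAGE_SIZE);
 *		vm_unmap_ram(addr, 8);
 *	}
 */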
961
962void __init vmalloc_init(void)
963{
964	int i;
965
966	for_each_possible_cpu(i) {
967		struct vmap_block_queue *vbq;
968
969		vbq = &per_cpu(vmap_block_queue, i);
970		spin_lock_init(&vbq->lock);
971		INIT_LIST_HEAD(&vbq->free);
972		INIT_LIST_HEAD(&vbq->dirty);
973		vbq->nr_dirty = 0;
974	}
975
976	vmap_initialized = true;
977}
978
979void unmap_kernel_range(unsigned long addr, unsigned long size)
980{
981	unsigned long end = addr + size;
982	vunmap_page_range(addr, end);
983	flush_tlb_kernel_range(addr, end);
984}
985
986int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
987{
988	unsigned long addr = (unsigned long)area->addr;
989	unsigned long end = addr + area->size - PAGE_SIZE;
990	int err;
991
992	err = vmap_page_range(addr, end, prot, *pages);
993	if (err > 0) {
994		*pages += err;
995		err = 0;
996	}
997
998	return err;
999}
1000EXPORT_SYMBOL_GPL(map_vm_area);
1001
1002/*** Old vmalloc interfaces ***/
1003DEFINE_RWLOCK(vmlist_lock);
1004struct vm_struct *vmlist;
1005
1006static struct vm_struct *__get_vm_area_node(unsigned long size,
1007		unsigned long flags, unsigned long start, unsigned long end,
1008		int node, gfp_t gfp_mask, void *caller)
1009{
1010	struct vmap_area *va;
1011	struct vm_struct *area;
1012	struct vm_struct *tmp, **p;
1013	unsigned long align = 1;
1014
1015	BUG_ON(in_interrupt());
1016	if (flags & VM_IOREMAP) {
1017		int bit = fls(size);
1018
1019		if (bit > IOREMAP_MAX_ORDER)
1020			bit = IOREMAP_MAX_ORDER;
1021		else if (bit < PAGE_SHIFT)
1022			bit = PAGE_SHIFT;
1023
1024		align = 1ul << bit;
1025	}
1026
1027	size = PAGE_ALIGN(size);
1028	if (unlikely(!size))
1029		return NULL;
1030
1031	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1032	if (unlikely(!area))
1033		return NULL;
1034
1035	/*
1036	 * We always allocate a guard page.
1037	 */
1038	size += PAGE_SIZE;
1039
1040	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1041	if (IS_ERR(va)) {
1042		kfree(area);
1043		return NULL;
1044	}
1045
1046	area->flags = flags;
1047	area->addr = (void *)va->va_start;
1048	area->size = size;
1049	area->pages = NULL;
1050	area->nr_pages = 0;
1051	area->phys_addr = 0;
1052	area->caller = caller;
1053	va->private = area;
1054	va->flags |= VM_VM_AREA;
1055
1056	write_lock(&vmlist_lock);
1057	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1058		if (tmp->addr >= area->addr)
1059			break;
1060	}
1061	area->next = *p;
1062	*p = area;
1063	write_unlock(&vmlist_lock);
1064
1065	return area;
1066}
1067
1068struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1069				unsigned long start, unsigned long end)
1070{
1071	return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
1072						__builtin_return_address(0));
1073}
1074EXPORT_SYMBOL_GPL(__get_vm_area);
1075
1076/**
1077 *	get_vm_area  -  reserve a contiguous kernel virtual area
1078 *	@size:		size of the area
1079 *	@flags:		%VM_IOREMAP for I/O mappings or VM_ALLOC
1080 *
1081 *	Search an area of @size in the kernel virtual mapping area,
1082 *	and reserve it for our purposes.  Returns the area descriptor
1083 *	on success or %NULL on failure.
1084 */
1085struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1086{
1087	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
1088				-1, GFP_KERNEL, __builtin_return_address(0));
1089}
1090
1091struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1092				void *caller)
1093{
1094	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
1095						-1, GFP_KERNEL, caller);
1096}
1097
1098struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1099				   int node, gfp_t gfp_mask)
1100{
1101	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
1102				  gfp_mask, __builtin_return_address(0));
1103}
1104
1105static struct vm_struct *find_vm_area(const void *addr)
1106{
1107	struct vmap_area *va;
1108
1109	va = find_vmap_area((unsigned long)addr);
1110	if (va && va->flags & VM_VM_AREA)
1111		return va->private;
1112
1113	return NULL;
1114}
1115
1116/**
1117 *	remove_vm_area  -  find and remove a contiguous kernel virtual area
1118 *	@addr:		base address
1119 *
1120 *	Search for the kernel VM area starting at @addr, and remove it.
1121 *	This function returns the found VM area, but using it is NOT safe
1122 *	on SMP machines, except for reading its size or flags.
1123 */
1124struct vm_struct *remove_vm_area(const void *addr)
1125{
1126	struct vmap_area *va;
1127
1128	va = find_vmap_area((unsigned long)addr);
1129	if (va && va->flags & VM_VM_AREA) {
1130		struct vm_struct *vm = va->private;
1131		struct vm_struct *tmp, **p;
1132		free_unmap_vmap_area(va);
1133		vm->size -= PAGE_SIZE;
1134
1135		write_lock(&vmlist_lock);
1136		for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1137			;
1138		*p = tmp->next;
1139		write_unlock(&vmlist_lock);
1140
1141		return vm;
1142	}
1143	return NULL;
1144}
1145
1146static void __vunmap(const void *addr, int deallocate_pages)
1147{
1148	struct vm_struct *area;
1149
1150	if (!addr)
1151		return;
1152
1153	if ((PAGE_SIZE-1) & (unsigned long)addr) {
1154		WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
1155		return;
1156	}
1157
1158	area = remove_vm_area(addr);
1159	if (unlikely(!area)) {
1160		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1161				addr);
1162		return;
1163	}
1164
1165	debug_check_no_locks_freed(addr, area->size);
1166	debug_check_no_obj_freed(addr, area->size);
1167
1168	if (deallocate_pages) {
1169		int i;
1170
1171		for (i = 0; i < area->nr_pages; i++) {
1172			struct page *page = area->pages[i];
1173
1174			BUG_ON(!page);
1175			__free_page(page);
1176		}
1177
1178		if (area->flags & VM_VPAGES)
1179			vfree(area->pages);
1180		else
1181			kfree(area->pages);
1182	}
1183
1184	kfree(area);
1185	return;
1186}
1187
1188/**
1189 *	vfree  -  release memory allocated by vmalloc()
1190 *	@addr:		memory base address
1191 *
1192 *	Free the virtually contiguous memory area starting at @addr, as
1193 *	obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
1194 *	NULL, no operation is performed.
1195 *
1196 *	Must not be called in interrupt context.
1197 */
1198void vfree(const void *addr)
1199{
1200	BUG_ON(in_interrupt());
1201	__vunmap(addr, 1);
1202}
1203EXPORT_SYMBOL(vfree);
1204
1205/**
1206 *	vunmap  -  release virtual mapping obtained by vmap()
1207 *	@addr:		memory base address
1208 *
1209 *	Free the virtually contiguous memory area starting at @addr,
1210 *	which was created from the page array passed to vmap().
1211 *
1212 *	Must not be called in interrupt context.
1213 */
1214void vunmap(const void *addr)
1215{
1216	BUG_ON(in_interrupt());
1217	__vunmap(addr, 0);
1218}
1219EXPORT_SYMBOL(vunmap);
1220
1221/**
1222 *	vmap  -  map an array of pages into virtually contiguous space
1223 *	@pages:		array of page pointers
1224 *	@count:		number of pages to map
1225 *	@flags:		vm_area->flags
1226 *	@prot:		page protection for the mapping
1227 *
1228 *	Maps @count pages from @pages into contiguous kernel virtual
1229 *	space.
1230 */
1231void *vmap(struct page **pages, unsigned int count,
1232		unsigned long flags, pgprot_t prot)
1233{
1234	struct vm_struct *area;
1235
1236	if (count > num_physpages)
1237		return NULL;
1238
1239	area = get_vm_area_caller((count << PAGE_SHIFT), flags,
1240					__builtin_return_address(0));
1241	if (!area)
1242		return NULL;
1243
1244	if (map_vm_area(area, prot, &pages)) {
1245		vunmap(area->addr);
1246		return NULL;
1247	}
1248
1249	return area->addr;
1250}
1251EXPORT_SYMBOL(vmap);
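/*
 * Example (editorial sketch, not part of this revision): mapping two
 * separately allocated pages into one contiguous kernel virtual range with
 * vmap() and tearing the mapping down with vunmap().  Error handling is
 * trimmed for brevity.
 *
 *	struct page *pages[2];
 *	void *addr;
 *
 *	pages[0] = alloc_page(GFP_KERNEL);
 *	pages[1] = alloc_page(GFP_KERNEL);
 *	addr = vmap(pages, 2, VM_MAP, PAGE_KERNEL);
 *	if (addr) {
 *		... use the contiguous mapping ...
 *		vunmap(addr);
 *	}
 *	__free_page(pages[0]);
 *	__free_page(pages[1]);
 */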
1252
1253static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1254			    int node, void *caller);
1255static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1256				 pgprot_t prot, int node, void *caller)
1257{
1258	struct page **pages;
1259	unsigned int nr_pages, array_size, i;
1260
1261	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1262	array_size = (nr_pages * sizeof(struct page *));
1263
1264	area->nr_pages = nr_pages;
1265	/* Please note that the recursion is strictly bounded. */
1266	if (array_size > PAGE_SIZE) {
1267		pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
1268				PAGE_KERNEL, node, caller);
1269		area->flags |= VM_VPAGES;
1270	} else {
1271		pages = kmalloc_node(array_size,
1272				(gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
1273				node);
1274	}
1275	area->pages = pages;
1276	area->caller = caller;
1277	if (!area->pages) {
1278		remove_vm_area(area->addr);
1279		kfree(area);
1280		return NULL;
1281	}
1282
1283	for (i = 0; i < area->nr_pages; i++) {
1284		struct page *page;
1285
1286		if (node < 0)
1287			page = alloc_page(gfp_mask);
1288		else
1289			page = alloc_pages_node(node, gfp_mask, 0);
1290
1291		if (unlikely(!page)) {
1292			/* Successfully allocated i pages, free them in __vunmap() */
1293			area->nr_pages = i;
1294			goto fail;
1295		}
1296		area->pages[i] = page;
1297	}
1298
1299	if (map_vm_area(area, prot, &pages))
1300		goto fail;
1301	return area->addr;
1302
1303fail:
1304	vfree(area->addr);
1305	return NULL;
1306}
1307
1308void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1309{
1310	return __vmalloc_area_node(area, gfp_mask, prot, -1,
1311					__builtin_return_address(0));
1312}
1313
1314/**
1315 *	__vmalloc_node  -  allocate virtually contiguous memory
1316 *	@size:		allocation size
1317 *	@gfp_mask:	flags for the page level allocator
1318 *	@prot:		protection mask for the allocated pages
1319 *	@node:		node to use for allocation or -1
1320 *	@caller:	caller's return address
1321 *
1322 *	Allocate enough pages to cover @size from the page level
1323 *	allocator with @gfp_mask flags.  Map them into contiguous
1324 *	kernel virtual space, using a pagetable protection of @prot.
1325 */
1326static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1327						int node, void *caller)
1328{
1329	struct vm_struct *area;
1330
1331	size = PAGE_ALIGN(size);
1332	if (!size || (size >> PAGE_SHIFT) > num_physpages)
1333		return NULL;
1334
1335	area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
1336						node, gfp_mask, caller);
1337
1338	if (!area)
1339		return NULL;
1340
1341	return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1342}
1343
1344void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1345{
1346	return __vmalloc_node(size, gfp_mask, prot, -1,
1347				__builtin_return_address(0));
1348}
1349EXPORT_SYMBOL(__vmalloc);
1350
1351/**
1352 *	vmalloc  -  allocate virtually contiguous memory
1353 *	@size:		allocation size
1354 *	Allocate enough pages to cover @size from the page level
1355 *	allocator and map them into contiguous kernel virtual space.
1356 *
1357 *	For tight control over page level allocator and protection flags
1358 *	use __vmalloc() instead.
1359 */
1360void *vmalloc(unsigned long size)
1361{
1362	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1363					-1, __builtin_return_address(0));
1364}
1365EXPORT_SYMBOL(vmalloc);
1366
1367/**
1368 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1369 * @size: allocation size
1370 *
1371 * The resulting memory area is zeroed so it can be mapped to userspace
1372 * without leaking data.
1373 */
1374void *vmalloc_user(unsigned long size)
1375{
1376	struct vm_struct *area;
1377	void *ret;
1378
1379	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
1380	if (ret) {
1381		area = find_vm_area(ret);
1382		area->flags |= VM_USERMAP;
1383	}
1384	return ret;
1385}
1386EXPORT_SYMBOL(vmalloc_user);
1387
1388/**
1389 *	vmalloc_node  -  allocate memory on a specific node
1390 *	@size:		allocation size
1391 *	@node:		numa node
1392 *
1393 *	Allocate enough pages to cover @size from the page level
1394 *	allocator and map them into contiguous kernel virtual space.
1395 *
1396 *	For tight control over page level allocator and protection flags
1397 *	use __vmalloc() instead.
1398 */
1399void *vmalloc_node(unsigned long size, int node)
1400{
1401	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1402					node, __builtin_return_address(0));
1403}
1404EXPORT_SYMBOL(vmalloc_node);
1405
1406#ifndef PAGE_KERNEL_EXEC
1407# define PAGE_KERNEL_EXEC PAGE_KERNEL
1408#endif
1409
1410/**
1411 *	vmalloc_exec  -  allocate virtually contiguous, executable memory
1412 *	@size:		allocation size
1413 *
1414 *	Kernel-internal function to allocate enough pages to cover @size
1415 *	from the page level allocator and map them into contiguous and
1416 *	executable kernel virtual space.
1417 *
1418 *	For tight control over page level allocator and protection flags
1419 *	use __vmalloc() instead.
1420 */
1421
1422void *vmalloc_exec(unsigned long size)
1423{
1424	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
1425}
1426
1427#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
1428#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
1429#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
1430#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
1431#else
1432#define GFP_VMALLOC32 GFP_KERNEL
1433#endif
1434
1435/**
1436 *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
1437 *	@size:		allocation size
1438 *
1439 *	Allocate enough 32bit PA addressable pages to cover @size from the
1440 *	page level allocator and map them into contiguous kernel virtual space.
1441 */
1442void *vmalloc_32(unsigned long size)
1443{
1444	return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL);
1445}
1446EXPORT_SYMBOL(vmalloc_32);
1447
1448/**
1449 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1450 *	@size:		allocation size
1451 *
1452 * The resulting memory area is 32bit addressable and zeroed so it can be
1453 * mapped to userspace without leaking data.
1454 */
1455void *vmalloc_32_user(unsigned long size)
1456{
1457	struct vm_struct *area;
1458	void *ret;
1459
1460	ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
1461	if (ret) {
1462		area = find_vm_area(ret);
1463		area->flags |= VM_USERMAP;
1464	}
1465	return ret;
1466}
1467EXPORT_SYMBOL(vmalloc_32_user);
1468
1469long vread(char *buf, char *addr, unsigned long count)
1470{
1471	struct vm_struct *tmp;
1472	char *vaddr, *buf_start = buf;
1473	unsigned long n;
1474
1475	/* Don't allow overflow */
1476	if ((unsigned long) addr + count < count)
1477		count = -(unsigned long) addr;
1478
1479	read_lock(&vmlist_lock);
1480	for (tmp = vmlist; tmp; tmp = tmp->next) {
1481		vaddr = (char *) tmp->addr;
1482		if (addr >= vaddr + tmp->size - PAGE_SIZE)
1483			continue;
1484		while (addr < vaddr) {
1485			if (count == 0)
1486				goto finished;
1487			*buf = '\0';
1488			buf++;
1489			addr++;
1490			count--;
1491		}
1492		n = vaddr + tmp->size - PAGE_SIZE - addr;
1493		do {
1494			if (count == 0)
1495				goto finished;
1496			*buf = *addr;
1497			buf++;
1498			addr++;
1499			count--;
1500		} while (--n > 0);
1501	}
1502finished:
1503	read_unlock(&vmlist_lock);
1504	return buf - buf_start;
1505}
1506
1507long vwrite(char *buf, char *addr, unsigned long count)
1508{
1509	struct vm_struct *tmp;
1510	char *vaddr, *buf_start = buf;
1511	unsigned long n;
1512
1513	/* Don't allow overflow */
1514	if ((unsigned long) addr + count < count)
1515		count = -(unsigned long) addr;
1516
1517	read_lock(&vmlist_lock);
1518	for (tmp = vmlist; tmp; tmp = tmp->next) {
1519		vaddr = (char *) tmp->addr;
1520		if (addr >= vaddr + tmp->size - PAGE_SIZE)
1521			continue;
1522		while (addr < vaddr) {
1523			if (count == 0)
1524				goto finished;
1525			buf++;
1526			addr++;
1527			count--;
1528		}
1529		n = vaddr + tmp->size - PAGE_SIZE - addr;
1530		do {
1531			if (count == 0)
1532				goto finished;
1533			*addr = *buf;
1534			buf++;
1535			addr++;
1536			count--;
1537		} while (--n > 0);
1538	}
1539finished:
1540	read_unlock(&vmlist_lock);
1541	return buf - buf_start;
1542}
1543
1544/**
1545 *	remap_vmalloc_range  -  map vmalloc pages to userspace
1546 *	@vma:		vma to cover (map full range of vma)
1547 *	@addr:		vmalloc memory
1548 *	@pgoff:		number of pages into addr before first page to map
1549 *
1550 *	Returns:	0 for success, -Exxx on failure
1551 *
1552 *	This function checks that addr is a valid vmalloc'ed area, and
1553 *	that it is big enough to cover the vma. It will return failure if
1554 *	those criteria aren't met.
1555 *
1556 *	Similar to remap_pfn_range() (see mm/memory.c)
1557 */
1558int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1559						unsigned long pgoff)
1560{
1561	struct vm_struct *area;
1562	unsigned long uaddr = vma->vm_start;
1563	unsigned long usize = vma->vm_end - vma->vm_start;
1564
1565	if ((PAGE_SIZE-1) & (unsigned long)addr)
1566		return -EINVAL;
1567
1568	area = find_vm_area(addr);
1569	if (!area)
1570		return -EINVAL;
1571
1572	if (!(area->flags & VM_USERMAP))
1573		return -EINVAL;
1574
1575	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
1576		return -EINVAL;
1577
1578	addr += pgoff << PAGE_SHIFT;
1579	do {
1580		struct page *page = vmalloc_to_page(addr);
1581		int ret;
1582
1583		ret = vm_insert_page(vma, uaddr, page);
1584		if (ret)
1585			return ret;
1586
1587		uaddr += PAGE_SIZE;
1588		addr += PAGE_SIZE;
1589		usize -= PAGE_SIZE;
1590	} while (usize > 0);
1591
1592	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
1593	vma->vm_flags |= VM_RESERVED;
1594
1595	return 0;
1596}
1597EXPORT_SYMBOL(remap_vmalloc_range);
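/*
 * Example (editorial sketch, not part of this revision): how a driver's
 * mmap handler might expose a buffer to userspace.  The buffer must have
 * VM_USERMAP set, i.e. come from vmalloc_user() or vmalloc_32_user();
 * "my_buf" is an assumed driver-private pointer allocated that way.
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range(vma, my_buf, vma->vm_pgoff);
 *	}
 */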
1598
1599/*
1600 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
1601 * have one.
1602 */
1603void __attribute__((weak)) vmalloc_sync_all(void)
1604{
1605}
1606
1607
1608static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
1609{
1610	/* apply_to_page_range() does all the hard work. */
1611	return 0;
1612}
1613
1614/**
1615 *	alloc_vm_area - allocate a range of kernel address space
1616 *	@size:		size of the area
1617 *
1618 *	Returns:	NULL on failure, vm_struct on success
1619 *
1620 *	This function reserves a range of kernel address space, and
1621 *	allocates pagetables to map that range.  No actual mappings
1622 *	are created.  If the kernel address space is not shared
1623 *	between processes, it syncs the pagetable across all
1624 *	processes.
1625 */
1626struct vm_struct *alloc_vm_area(size_t size)
1627{
1628	struct vm_struct *area;
1629
1630	area = get_vm_area_caller(size, VM_IOREMAP,
1631				__builtin_return_address(0));
1632	if (area == NULL)
1633		return NULL;
1634
1635	/*
1636	 * This ensures that page tables are constructed for this region
1637	 * of kernel virtual address space and mapped into init_mm.
1638	 */
1639	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
1640				area->size, f, NULL)) {
1641		free_vm_area(area);
1642		return NULL;
1643	}
1644
1645	/* Make sure the pagetables are constructed in process kernel
1646	   mappings */
1647	vmalloc_sync_all();
1648
1649	return area;
1650}
1651EXPORT_SYMBOL_GPL(alloc_vm_area);
1652
1653void free_vm_area(struct vm_struct *area)
1654{
1655	struct vm_struct *ret;
1656	ret = remove_vm_area(area->addr);
1657	BUG_ON(ret != area);
1658	kfree(area);
1659}
1660EXPORT_SYMBOL_GPL(free_vm_area);
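/*
 * Example (editorial sketch, not part of this revision): reserving a
 * page-sized, pagetable-backed hole in kernel address space, e.g. so that
 * a hypervisor can install the actual mapping later.
 *
 *	struct vm_struct *vm = alloc_vm_area(PAGE_SIZE);
 *
 *	if (vm) {
 *		... hand vm->addr to whoever provides the backing mapping ...
 *		free_vm_area(vm);
 *	}
 */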
1661
1662
1663#ifdef CONFIG_PROC_FS
1664static void *s_start(struct seq_file *m, loff_t *pos)
1665{
1666	loff_t n = *pos;
1667	struct vm_struct *v;
1668
1669	read_lock(&vmlist_lock);
1670	v = vmlist;
1671	while (n > 0 && v) {
1672		n--;
1673		v = v->next;
1674	}
1675	if (!n)
1676		return v;
1677
1678	return NULL;
1679
1680}
1681
1682static void *s_next(struct seq_file *m, void *p, loff_t *pos)
1683{
1684	struct vm_struct *v = p;
1685
1686	++*pos;
1687	return v->next;
1688}
1689
1690static void s_stop(struct seq_file *m, void *p)
1691{
1692	read_unlock(&vmlist_lock);
1693}
1694
1695static void show_numa_info(struct seq_file *m, struct vm_struct *v)
1696{
1697	if (NUMA_BUILD) {
1698		unsigned int nr, *counters = m->private;
1699
1700		if (!counters)
1701			return;
1702
1703		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
1704
1705		for (nr = 0; nr < v->nr_pages; nr++)
1706			counters[page_to_nid(v->pages[nr])]++;
1707
1708		for_each_node_state(nr, N_HIGH_MEMORY)
1709			if (counters[nr])
1710				seq_printf(m, " N%u=%u", nr, counters[nr]);
1711	}
1712}
1713
1714static int s_show(struct seq_file *m, void *p)
1715{
1716	struct vm_struct *v = p;
1717
1718	seq_printf(m, "0x%p-0x%p %7ld",
1719		v->addr, v->addr + v->size, v->size);
1720
1721	if (v->caller) {
1722		char buff[KSYM_SYMBOL_LEN];
1723
1724		seq_putc(m, ' ');
1725		sprint_symbol(buff, (unsigned long)v->caller);
1726		seq_puts(m, buff);
1727	}
1728
1729	if (v->nr_pages)
1730		seq_printf(m, " pages=%d", v->nr_pages);
1731
1732	if (v->phys_addr)
1733		seq_printf(m, " phys=%lx", v->phys_addr);
1734
1735	if (v->flags & VM_IOREMAP)
1736		seq_printf(m, " ioremap");
1737
1738	if (v->flags & VM_ALLOC)
1739		seq_printf(m, " vmalloc");
1740
1741	if (v->flags & VM_MAP)
1742		seq_printf(m, " vmap");
1743
1744	if (v->flags & VM_USERMAP)
1745		seq_printf(m, " user");
1746
1747	if (v->flags & VM_VPAGES)
1748		seq_printf(m, " vpages");
1749
1750	show_numa_info(m, v);
1751	seq_putc(m, '\n');
1752	return 0;
1753}
1754
1755static const struct seq_operations vmalloc_op = {
1756	.start = s_start,
1757	.next = s_next,
1758	.stop = s_stop,
1759	.show = s_show,
1760};
1761
1762static int vmalloc_open(struct inode *inode, struct file *file)
1763{
1764	unsigned int *ptr = NULL;
1765	int ret;
1766
1767	if (NUMA_BUILD)
1768		ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
1769	ret = seq_open(file, &vmalloc_op);
1770	if (!ret) {
1771		struct seq_file *m = file->private_data;
1772		m->private = ptr;
1773	} else
1774		kfree(ptr);
1775	return ret;
1776}
1777
1778static const struct file_operations proc_vmalloc_operations = {
1779	.open		= vmalloc_open,
1780	.read		= seq_read,
1781	.llseek		= seq_lseek,
1782	.release	= seq_release_private,
1783};
1784
1785static int __init proc_vmalloc_init(void)
1786{
1787	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
1788	return 0;
1789}
1790module_init(proc_vmalloc_init);
1791#endif
1792
1793