init_64.c revision f62d0f008e889915c93631c04d4c7d871f05bea7
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops* dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

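/*
 * Dump a summary of memory usage: walk every online node and count
 * total, reserved, shared and swap-cached pages.
 */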
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/* this loop can take a while with 256 GB and 4k pages
			   so update the NMI watchdog */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
				touch_nmi_watchdog();
			}
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%ld pages of RAM\n", total);
	printk(KERN_INFO "%ld reserved pages\n", reserved);
	printk(KERN_INFO "%ld pages shared\n", shared);
	printk(KERN_INFO "%ld pages swap cached\n", cached);
}

int after_bootmem;

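/*
 * Allocate a zeroed, page-aligned page for intermediate page tables.
 * Pages come from the bootmem allocator until mem_init() has run and
 * from the normal page allocator afterwards.
 */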
static __init void *spp_getpage(void)
{
	void *ptr;
	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}

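/*
 * Install a single kernel pte mapping vaddr to phys, allocating any
 * missing pud/pmd/pte levels on the way down via spp_getpage().
 */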
static __init void set_pte_phys(unsigned long vaddr,
			 unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

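/*
 * Hand out the next page reserved for the early page tables.  Before
 * bootmem is up the page comes from the area found by
 * find_early_table_space() and is temporarily mapped with
 * early_ioremap(); later a normal zeroed page is returned.
 */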
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{

	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb_all();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}

/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb_all();
}

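/*
 * Fill one pmd page with 2MB kernel mappings for the physical range
 * [address, end).  On the initial boot-time pass, entries past 'end'
 * are cleared.
 */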
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}

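/*
 * Populate one pud page for the physical range [addr, end), allocating
 * pmd pages as needed and skipping ranges the e820 map does not cover.
 */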
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);


	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb_all();
}

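/*
 * Estimate the worst-case size of the direct-mapping page tables and
 * reserve a contiguous physical area for them from the e820 map.
 */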
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

/* Set up the direct mapping of physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and takes pages directly from
   physical memory; to access them they are temporarily mapped. */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the memory
	 * mapped.  Unfortunately this is done currently before the nodes are
	 * discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
	       "clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never
 * get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

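/*
 * Late memory setup: release bootmem to the page allocator, register
 * the /proc/kcore regions and report the final memory layout.
 */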
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	/* temporary debugging - double check it's true: */
	{
		int i;

		for (i = 0; i < 1024; i++)
			WARN_ON_ONCE(empty_zero_page[i]);
	}

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

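/*
 * Return the pages in [begin, end) to the page allocator, poisoning
 * them first so stale references to freed init memory are easier to
 * spot.
 */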
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#ifdef CONFIG_DEBUG_RODATA
	/*
	 * This will make the __init pages not present and
	 * not executable, so that any attempt to use a
	 * __init function from now on will fault immediately
	 * rather than spuriously later when memory gets reused.
	 *
	 * We only do this for DEBUG_RODATA to not break up the
	 * 2MB kernel mapping just for this debug feature.
	 */
	if (begin >= __START_KERNEL_map) {
		set_memory_np(begin, (end - begin)/PAGE_SIZE);
		set_memory_nx(begin, (end - begin)/PAGE_SIZE);
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
	start = (unsigned long)__start_rodata;
#endif

	end = (unsigned long)__end_rodata;
	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
	end &= PAGE_MASK;
	if (end <= start)
		return;

	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);

	/*
	 * set_memory_*() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();

#ifdef CONFIG_CPA_DEBUG
	printk("Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
	global_flush_tlb();

	printk("Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
	global_flush_tlb();
#endif
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

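/*
 * Reserve a physical range with the bootmem allocator; ranges that fall
 * entirely below the DMA limit are also accounted via set_dma_reserve().
 */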
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

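/*
 * Return non-zero if 'addr' is a canonical address backed by a valid
 * kernel mapping (either a 2MB or a 4kB page).
 */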
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot = PAGE_READONLY_EXEC,
	.vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context.  It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
						unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
				addr, addr + PMD_SIZE - 1, p, node);
		} else
			vmemmap_verify((pte_t *)pmd, node, addr, next);
	}

	return 0;
}
#endif