init_64.c revision 28dd033f43ca957cd751e02652b36c6fa364ca18
1/*
2 *  linux/arch/x86_64/mm/init.c
3 *
4 *  Copyright (C) 1995  Linus Torvalds
5 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
6 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/smp.h>
20#include <linux/init.h>
21#include <linux/initrd.h>
22#include <linux/pagemap.h>
23#include <linux/bootmem.h>
24#include <linux/proc_fs.h>
25#include <linux/pci.h>
26#include <linux/pfn.h>
27#include <linux/poison.h>
28#include <linux/dma-mapping.h>
29#include <linux/module.h>
30#include <linux/memory_hotplug.h>
31#include <linux/nmi.h>
32
33#include <asm/processor.h>
34#include <asm/system.h>
35#include <asm/uaccess.h>
36#include <asm/pgtable.h>
37#include <asm/pgalloc.h>
38#include <asm/dma.h>
39#include <asm/fixmap.h>
40#include <asm/e820.h>
41#include <asm/apic.h>
42#include <asm/tlb.h>
43#include <asm/mmu_context.h>
44#include <asm/proto.h>
45#include <asm/smp.h>
46#include <asm/sections.h>
47#include <asm/kdebug.h>
48#include <asm/numa.h>
49#include <asm/cacheflush.h>
50
51/*
52 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */
/* NOTE(review): presumably the low-memory counterpart of max_pfn_mapped; never written in this file. */
unsigned long max_low_pfn_mapped;
/* End PFN of the direct mapping; arch_add_memory() raises it after a hot-add. */
unsigned long max_pfn_mapped;

/* Running page total that reserve_bootmem_generic() hands to set_dma_reserve(). */
static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * Non-zero asks for 1GB pages in the direct mapping. It defaults to 1 only
 * with CONFIG_DIRECT_GBPAGES, can be overridden by the "gbpages" and
 * "nogbpages" early parameters, and is cleared by init_gbpages() when the
 * CPU does not report gbpages support.
 */
int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;
68
69static int __init parse_direct_gbpages_off(char *arg)
70{
71	direct_gbpages = 0;
72	return 0;
73}
74early_param("nogbpages", parse_direct_gbpages_off);
75
76static int __init parse_direct_gbpages_on(char *arg)
77{
78	direct_gbpages = 1;
79	return 0;
80}
81early_param("gbpages", parse_direct_gbpages_on);
82
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical memory, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */
88
/*
 * Set to 1 by mem_init() once the bootmem pages have been released to the
 * page allocator; the allocation helpers in this file switch from the
 * early/bootmem paths to get_zeroed_page() when it is non-zero.
 */
int after_bootmem;
90
91/*
92 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
94 */
95static __ref void *spp_getpage(void)
96{
97	void *ptr;
98
99	if (after_bootmem)
100		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
101	else
102		ptr = alloc_bootmem_pages(PAGE_SIZE);
103
104	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
105		panic("set_pte_phys: cannot allocate page data %s\n",
106			after_bootmem ? "after bootmem" : "");
107	}
108
109	pr_debug("spp_getpage %p\n", ptr);
110
111	return ptr;
112}
113
/*
 * Install new_pte for vaddr in the page tables reachable from pud_page.
 *
 * Missing pmd and pte pages are allocated with spp_getpage() and hooked
 * into init_mm. After the pte is written, the single TLB entry for vaddr
 * is flushed. The function returns early, without writing the pte, if a
 * freshly populated level does not read back as the page just installed.
 */
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		/* Sanity check: the pud must now resolve to the new pmd page. */
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		/* Sanity check: the pmd must now resolve to the new pte page. */
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	/*
	 * Report the case where a live pte is about to be replaced by a
	 * different non-zero value; the new value is still written below.
	 */
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
153
154void
155set_pte_vaddr(unsigned long vaddr, pte_t pteval)
156{
157	pgd_t *pgd;
158	pud_t *pud_page;
159
160	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
161
162	pgd = pgd_offset_k(vaddr);
163	if (pgd_none(*pgd)) {
164		printk(KERN_ERR
165			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
166		return;
167	}
168	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
169	set_pte_vaddr_pud(pud_page, vaddr, pteval);
170}
171
/*
 * Create large page table mappings for a range of physical addresses.
 *
 * @phys: physical start address; must be a multiple of PMD_SIZE.
 * @size: length in bytes; must be a multiple of PMD_SIZE.
 * @prot: protection bits OR-ed into every pmd entry written.
 *
 * The range is mapped at __va(phys), one PMD_SIZE entry per iteration.
 * Missing pud/pmd tables come from spp_getpage(). Hitting a pmd slot
 * that is already populated is treated as a bug.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	/* Both the start and the length must be large-page aligned. */
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		/*
		 * NOTE(review): indexed with phys, not __va(phys); this assumes
		 * both yield the same pmd index - confirm.
		 */
		pmd = pmd_offset(pud, phys);
		/* Never overwrite an existing mapping. */
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}
201
202void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
203{
204	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
205}
206
207void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
208{
209	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
210}
211
212/*
213 * The head.S code sets up the kernel high mapping:
214 *
215 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
216 *
217 * phys_addr holds the negative offset to the kernel, which is added
218 * to the compile time generated pmds. This results in invalid pmds up
219 * to the point where we hit the physaddr 0 mapping.
220 *
221 * We limit the mappings to the region from _text to _end.  _end is
222 * rounded up to the 2MB boundary. This catches the invalid pmds as
223 * well, as they are located before _text:
224 */
225void __init cleanup_highmap(void)
226{
227	unsigned long vaddr = __START_KERNEL_map;
228	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
229	pmd_t *pmd = level2_kernel_pgt;
230	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
231
232	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
233		if (pmd_none(*pmd))
234			continue;
235		if (vaddr < (unsigned long) _text || vaddr > end)
236			set_pmd(pmd, __pmd(0));
237	}
238}
239
/*
 * Page-frame window reserved by find_early_table_space() for the early
 * direct-mapping page tables: [table_start, table_top) is the whole
 * window, table_end is the next frame alloc_low_page() will hand out.
 */
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;
243
244static __ref void *alloc_low_page(unsigned long *phys)
245{
246	unsigned long pfn = table_end++;
247	void *adr;
248
249	if (after_bootmem) {
250		adr = (void *)get_zeroed_page(GFP_ATOMIC);
251		*phys = __pa(adr);
252
253		return adr;
254	}
255
256	if (pfn >= table_top)
257		panic("alloc_low_page: ran out of memory");
258
259	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
260	memset(adr, 0, PAGE_SIZE);
261	*phys  = pfn * PAGE_SIZE;
262	return adr;
263}
264
265static __ref void unmap_low_page(void *adr)
266{
267	if (after_bootmem)
268		return;
269
270	early_iounmap(adr, PAGE_SIZE);
271}
272
/*
 * Pass number (1 or 2) of kernel_physical_mapping_init(); the phys_*_init()
 * helpers only report page counts while it is 1.
 */
static int physical_mapping_iter;
274
/*
 * Fill the 4k entries of one pte page for the physical range [addr, end).
 *
 * Starts at the slot for addr and stops at the end of the table or at the
 * first address >= end. Before bootmem is torn down the remaining slots
 * are cleared in that case. Slots that are already populated are
 * rewritten with PAGE_KERNEL but not counted as new pages.
 *
 * Returns the address just past the last page written, or end if no slot
 * was written.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/* Existing entry: rewrite it below without counting it again. */
		if (pte_val(*pte))
			goto repeat_set_pte;

		/* Compiled-out debug trace. */
		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
repeat_set_pte:
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	/* Account new 4k pages on the first mapping pass only. */
	if (physical_mapping_iter == 1)
		update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}
311
312static unsigned long __meminit
313phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
314{
315	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
316
317	return phys_pte_init(pte, address, end);
318}
319
/*
 * Fill the entries of one pmd page for the physical range [address, end).
 *
 * @page_size_mask: if bit PG_LEVEL_2M is set, empty slots are mapped with
 *                  2MB pages; otherwise a pte page is allocated and filled
 *                  with 4k entries.
 *
 * Existing entries are handled in place: a pmd that points to a pte page
 * is descended into, an existing large pmd is rewritten. Once address
 * reaches end the walk stops; before bootmem is torn down the remaining
 * slots are cleared first.
 *
 * Returns the end address reported by the last slot processed, or end if
 * no slot was processed.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			/* Already points to a pte page: update the 4k entries. */
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/* Existing large page: rewrite it without counting it. */
			goto repeat_set_pte;
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
repeat_set_pte:
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		/* Fill the new pte page first, then hook it into the pmd. */
		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	/* Account new 2MB pages on the first mapping pass only. */
	if (physical_mapping_iter == 1)
		update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}
376
377static unsigned long __meminit
378phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
379			 unsigned long page_size_mask)
380{
381	pmd_t *pmd = pmd_offset(pud, 0);
382	unsigned long last_map_addr;
383
384	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
385	__flush_tlb_all();
386	return last_map_addr;
387}
388
/*
 * Fill the entries of one pud page for the physical range [addr, end).
 *
 * @page_size_mask: if bit PG_LEVEL_1G is set, empty slots are mapped with
 *                  1GB pages; otherwise a pmd page is allocated and filled
 *                  by phys_pmd_init().
 *
 * Before bootmem is torn down, a slot whose 1GB span has no e820 entry at
 * all is cleared and skipped. Existing entries are handled in place: a
 * pud that points to a pmd page is descended into, an existing large pud
 * is rewritten.
 *
 * Returns the end address reported by the last slot processed, or end if
 * no slot was processed.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	/* After the first slot, addr advances to the next PUD_SIZE boundary. */
	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			/* Already points to a pmd page: update it in place. */
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask);
				continue;
			}

			/* Existing 1GB page: rewrite it without counting it. */
			goto repeat_set_pte;
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
repeat_set_pte:
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		/* Fill the new pmd page first, then hook it into the pud. */
		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	/* Account new 1GB pages on the first mapping pass only. */
	if (physical_mapping_iter == 1)
		update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}
447
448static unsigned long __meminit
449phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
450		 unsigned long page_size_mask)
451{
452	pud_t *pud;
453
454	pud = (pud_t *)pgd_page_vaddr(*pgd);
455
456	return phys_pud_init(pud, addr, end, page_size_mask);
457}
458
459static void __init find_early_table_space(unsigned long end, int use_pse,
460					  int use_gbpages)
461{
462	unsigned long puds, pmds, ptes, tables, start;
463
464	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
465	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
466	if (use_gbpages) {
467		unsigned long extra;
468		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
469		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
470	} else
471		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
472	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
473
474	if (use_pse) {
475		unsigned long extra;
476		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
477		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
478	} else
479		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
480	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
481
482	/*
483	 * RED-PEN putting page tables only on node 0 could
484	 * cause a hotspot and fill up ZONE_DMA. The page tables
485	 * need roughly 0.5KB per GB.
486	 */
487	start = 0x8000;
488	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
489	if (table_start == -1UL)
490		panic("Cannot find space for the kernel page tables");
491
492	table_start >>= PAGE_SHIFT;
493	table_end = table_start;
494	table_top = table_start + (tables >> PAGE_SHIFT);
495
496	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
497		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
498}
499
500static void __init init_gbpages(void)
501{
502	if (direct_gbpages && cpu_has_gbpages)
503		printk(KERN_INFO "Using GB pages for direct mapping\n");
504	else
505		direct_gbpages = 0;
506}
507
508static int is_kernel(unsigned long pfn)
509{
510	unsigned long pg_addresss = pfn << PAGE_SHIFT;
511
512	if (pg_addresss >= (unsigned long) __pa(_text) &&
513	    pg_addresss < (unsigned long) __pa(_end))
514		return 1;
515
516	return 0;
517}
518
/*
 * Build the direct mapping for the physical range [start, end).
 *
 * @page_size_mask: PG_LEVEL_2M / PG_LEVEL_1G bits selecting which large
 *                  page sizes the lower-level helpers may use.
 *
 * The range is walked twice (see the comment below); the return value is
 * the end address reported by the last pgd slot processed in the second
 * pass, or the original end if no slot was processed.
 */
static unsigned long __init kernel_physical_mapping_init(unsigned long start,
						unsigned long end,
						unsigned long page_size_mask)
{

	unsigned long next, last_map_addr;
	u64 cached_supported_pte_mask = __supported_pte_mask;
	/* start/end are overwritten with virtual addresses below; keep the originals. */
	unsigned long cache_start = start;
	unsigned long cache_end = end;

	/*
	 * First iteration will setup identity mapping using large/small pages
	 * based on page_size_mask, with other attributes same as set by
	 * the early code in head_64.S
	 *
	 * Second iteration will setup the appropriate attributes
	 * as desired for the kernel identity mapping.
	 *
	 * This two pass mechanism conforms to the TLB app note which says:
	 *
	 *     "Software should not write to a paging-structure entry in a way
	 *      that would change, for any linear address, both the page size
	 *      and either the page frame or attributes."
	 *
	 * For now, only difference between very early PTE attributes used in
	 * head_64.S and here is _PAGE_NX.
	 */
	BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC)
		     != _PAGE_NX);
	/* Pass 1 runs with NX masked out of the supported pte bits. */
	__supported_pte_mask &= ~(_PAGE_NX);
	physical_mapping_iter = 1;

repeat:
	last_map_addr = cache_end;

	start = (unsigned long)__va(cache_start);
	end = (unsigned long)__va(cache_end);

	/* One pgd slot per iteration; next is clamped to end. */
	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			/*
			 * Static identity mappings will be overwritten
			 * with run-time mappings. For example, this allows
			 * the static 0-1GB identity mapping to be mapped
			 * non-executable with this.
			 */
			if (is_kernel(pte_pfn(*((pte_t *) pgd))))
				goto realloc;

			/* NOTE(review): this passes end, the realloc path below passes next. */
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

realloc:
		/* Fill a fresh pud page first, then hook it into the pgd. */
		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	if (physical_mapping_iter == 1) {
		physical_mapping_iter = 2;
		/*
		 * Second iteration will set the actual desired PTE attributes.
		 */
		__supported_pte_mask = cached_supported_pte_mask;
		goto repeat;
	}

	return last_map_addr;
}
604
/*
 * One piece of the range handed to init_memory_mapping(): physical byte
 * addresses [start, end) plus the PG_LEVEL_* bits allowed for that piece.
 */
struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

/* Capacity of the map_range array; save_mr() panics when it would overflow. */
#define NR_RANGE_MR 5
612
613static int save_mr(struct map_range *mr, int nr_range,
614		   unsigned long start_pfn, unsigned long end_pfn,
615		   unsigned long page_size_mask)
616{
617
618	if (start_pfn < end_pfn) {
619		if (nr_range >= NR_RANGE_MR)
620			panic("run out of range for init_memory_mapping\n");
621		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
622		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
623		mr[nr_range].page_size_mask = page_size_mask;
624		nr_range++;
625	}
626
627	return nr_range;
628}
629
630/*
631 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
632 * This runs before bootmem is initialized and gets pages directly from
633 * the physical memory. To access them they are temporarily mapped.
634 */
635unsigned long __init_refok init_memory_mapping(unsigned long start,
636					       unsigned long end)
637{
638	unsigned long last_map_addr = 0;
639	unsigned long page_size_mask = 0;
640	unsigned long start_pfn, end_pfn;
641
642	struct map_range mr[NR_RANGE_MR];
643	int nr_range, i;
644	int use_pse, use_gbpages;
645
646	printk(KERN_INFO "init_memory_mapping\n");
647
648	/*
649	 * Find space for the kernel direct mapping tables.
650	 *
651	 * Later we should allocate these tables in the local node of the
652	 * memory mapped. Unfortunately this is done currently before the
653	 * nodes are discovered.
654	 */
655	if (!after_bootmem)
656		init_gbpages();
657
658#ifdef CONFIG_DEBUG_PAGEALLOC
659	/*
660	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
661	 * This will simplify cpa(), which otherwise needs to support splitting
662	 * large pages into small in interrupt context, etc.
663	 */
664	use_pse = use_gbpages = 0;
665#else
666	use_pse = cpu_has_pse;
667	use_gbpages = direct_gbpages;
668#endif
669
670	if (use_gbpages)
671		page_size_mask |= 1 << PG_LEVEL_1G;
672	if (use_pse)
673		page_size_mask |= 1 << PG_LEVEL_2M;
674
675	memset(mr, 0, sizeof(mr));
676	nr_range = 0;
677
678	/* head if not big page alignment ?*/
679	start_pfn = start >> PAGE_SHIFT;
680	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
681			<< (PMD_SHIFT - PAGE_SHIFT);
682	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
683
684	/* big page (2M) range*/
685	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
686			 << (PMD_SHIFT - PAGE_SHIFT);
687	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
688			 << (PUD_SHIFT - PAGE_SHIFT);
689	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
690		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
691	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
692			page_size_mask & (1<<PG_LEVEL_2M));
693
694	/* big page (1G) range */
695	start_pfn = end_pfn;
696	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
697	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
698				page_size_mask &
699				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
700
701	/* tail is not big page (1G) alignment */
702	start_pfn = end_pfn;
703	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
704	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
705			page_size_mask & (1<<PG_LEVEL_2M));
706
707	/* tail is not big page (2M) alignment */
708	start_pfn = end_pfn;
709	end_pfn = end>>PAGE_SHIFT;
710	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
711
712	/* try to merge same page size and continuous */
713	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
714		unsigned long old_start;
715		if (mr[i].end != mr[i+1].start ||
716		    mr[i].page_size_mask != mr[i+1].page_size_mask)
717			continue;
718		/* move it */
719		old_start = mr[i].start;
720		memmove(&mr[i], &mr[i+1],
721			 (nr_range - 1 - i) * sizeof (struct map_range));
722		mr[i].start = old_start;
723		nr_range--;
724	}
725
726	for (i = 0; i < nr_range; i++)
727		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
728				mr[i].start, mr[i].end,
729			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
730			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
731
732	if (!after_bootmem)
733		find_early_table_space(end, use_pse, use_gbpages);
734
735	for (i = 0; i < nr_range; i++)
736		last_map_addr = kernel_physical_mapping_init(
737					mr[i].start, mr[i].end,
738					mr[i].page_size_mask);
739
740	if (!after_bootmem)
741		mmu_cr4_features = read_cr4();
742	__flush_tlb_all();
743
744	if (!after_bootmem && table_end > table_start)
745		reserve_early(table_start << PAGE_SHIFT,
746				 table_end << PAGE_SHIFT, "PGTABLE");
747
748	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
749			 last_map_addr, end);
750
751	if (!after_bootmem)
752		early_memtest(start, end);
753
754	return last_map_addr >> PAGE_SHIFT;
755}
756
757#ifndef CONFIG_NUMA
758void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
759{
760	unsigned long bootmap_size, bootmap;
761
762	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
763	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
764				 PAGE_SIZE);
765	if (bootmap == -1L)
766		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
767	/* don't touch min_low_pfn */
768	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
769					 0, end_pfn);
770	e820_register_active_regions(0, start_pfn, end_pfn);
771	free_bootmem_with_active_regions(0, end_pfn);
772	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
773	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
774}
775
776void __init paging_init(void)
777{
778	unsigned long max_zone_pfns[MAX_NR_ZONES];
779
780	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
781	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
782	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
783	max_zone_pfns[ZONE_NORMAL] = max_pfn;
784
785	memory_present(0, 0, max_pfn);
786	sparse_init();
787	free_area_init_nodes(max_zone_pfns);
788}
789#endif
790
791/*
792 * Memory hotplug specific functions
793 */
794#ifdef CONFIG_MEMORY_HOTPLUG
795/*
796 * Memory is added always to NORMAL zone. This means you will never get
797 * additional DMA/DMA32 memory.
798 */
799int arch_add_memory(int nid, u64 start, u64 size)
800{
801	struct pglist_data *pgdat = NODE_DATA(nid);
802	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
803	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
804	unsigned long nr_pages = size >> PAGE_SHIFT;
805	int ret;
806
807	last_mapped_pfn = init_memory_mapping(start, start + size-1);
808	if (last_mapped_pfn > max_pfn_mapped)
809		max_pfn_mapped = last_mapped_pfn;
810
811	ret = __add_pages(zone, start_pfn, nr_pages);
812	WARN_ON(1);
813
814	return ret;
815}
816EXPORT_SYMBOL_GPL(arch_add_memory);
817
818#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
819int memory_add_physaddr_to_nid(u64 start)
820{
821	return 0;
822}
823EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
824#endif
825
826#endif /* CONFIG_MEMORY_HOTPLUG */
827
/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 *
 * Returns 1 if access is allowed, 0 otherwise.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	/*
	 * The first megabyte is pages 0..255. Page 256 starts at 1MB and
	 * gets no free pass: it is subject to the RAM check below.
	 */
	if (pagenr < 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}
846
847
/* /proc/kcore region descriptors; mem_init() registers each one with kclist_add(). */
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;
850
/*
 * Release all bootmem pages, mark the end of the bootmem phase by setting
 * after_bootmem, register the /proc/kcore regions and print the boot-time
 * memory summary.
 */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	/* Reserved = all pfns below max_pfn that are neither freed nor absent. */
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	/* Section sizes in bytes, taken from the linker-provided symbols. */
	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	/* Page counts are shifted by (PAGE_SHIFT-10), byte counts by 10, to get kB. */
	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}
893
/*
 * Give the pages of the virtual range [begin, end) back to the page
 * allocator (or unmap them when CONFIG_DEBUG_PAGEALLOC is set).
 *
 * @what:  label used in the log message of the non-debug build.
 * @begin: first virtual address of the range.
 * @end:   virtual address just past the range; nothing happens if
 *         begin >= end.
 *
 * NOTE(review): the loop steps from begin in PAGE_SIZE increments without
 * aligning it first; callers presumably pass page-aligned addresses -
 * confirm.
 */
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		/* Make the page freeable: drop the reserved flag and reset its count. */
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		/* Poison the page before freeing it. */
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}
923
924void free_initmem(void)
925{
926	free_init_pages("unused kernel memory",
927			(unsigned long)(&__init_begin),
928			(unsigned long)(&__init_end));
929}
930
931#ifdef CONFIG_DEBUG_RODATA
/*
 * Exported constant object.
 * NOTE(review): presumably the write target used by rodata_test(), which
 * mark_rodata_ro() calls - confirm against its definition.
 */
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
934
935void mark_rodata_ro(void)
936{
937	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
938	unsigned long rodata_start =
939		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
940
941#ifdef CONFIG_DYNAMIC_FTRACE
942	/* Dynamic tracing modifies the kernel text section */
943	start = rodata_start;
944#endif
945
946	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
947	       (end - start) >> 10);
948	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
949
950	/*
951	 * The rodata section (but not the kernel text!) should also be
952	 * not-executable.
953	 */
954	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
955
956	rodata_test();
957
958#ifdef CONFIG_CPA_DEBUG
959	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
960	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
961
962	printk(KERN_INFO "Testing CPA: again\n");
963	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
964#endif
965}
966
967#endif
968
969#ifdef CONFIG_BLK_DEV_INITRD
/* Release the initrd image occupying the virtual range [start, end). */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	char *label = "initrd memory";

	free_init_pages(label, start, end);
}
974#endif
975
976int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
977				   int flags)
978{
979#ifdef CONFIG_NUMA
980	int nid, next_nid;
981	int ret;
982#endif
983	unsigned long pfn = phys >> PAGE_SHIFT;
984
985	if (pfn >= max_pfn) {
986		/*
987		 * This can happen with kdump kernels when accessing
988		 * firmware tables:
989		 */
990		if (pfn < max_pfn_mapped)
991			return -EFAULT;
992
993		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
994				phys, len);
995		return -EFAULT;
996	}
997
998	/* Should check here against the e820 map to avoid double free */
999#ifdef CONFIG_NUMA
1000	nid = phys_to_nid(phys);
1001	next_nid = phys_to_nid(phys + len - 1);
1002	if (nid == next_nid)
1003		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
1004	else
1005		ret = reserve_bootmem(phys, len, flags);
1006
1007	if (ret != 0)
1008		return ret;
1009
1010#else
1011	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
1012#endif
1013
1014	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
1015		dma_reserve += len / PAGE_SIZE;
1016		set_dma_reserve(dma_reserve);
1017	}
1018
1019	return 0;
1020}
1021
1022int kern_addr_valid(unsigned long addr)
1023{
1024	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1025	pgd_t *pgd;
1026	pud_t *pud;
1027	pmd_t *pmd;
1028	pte_t *pte;
1029
1030	if (above != 0 && above != -1UL)
1031		return 0;
1032
1033	pgd = pgd_offset_k(addr);
1034	if (pgd_none(*pgd))
1035		return 0;
1036
1037	pud = pud_offset(pgd, addr);
1038	if (pud_none(*pud))
1039		return 0;
1040
1041	pmd = pmd_offset(pud, addr);
1042	if (pmd_none(*pmd))
1043		return 0;
1044
1045	if (pmd_large(*pmd))
1046		return pfn_valid(pmd_pfn(*pmd));
1047
1048	pte = pte_offset_kernel(pmd, addr);
1049	if (pte_none(*pte))
1050		return 0;
1051
1052	return pfn_valid(pte_pfn(*pte));
1053}
1054
1055/*
1056 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
1057 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1058 * not need special handling anymore:
1059 */
/* Covers VSYSCALL_MAPPED_PAGES pages starting at VSYSCALL_START, readable and executable. */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};
1066
1067struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1068{
1069#ifdef CONFIG_IA32_EMULATION
1070	if (test_tsk_thread_flag(tsk, TIF_IA32))
1071		return NULL;
1072#endif
1073	return &gate_vma;
1074}
1075
1076int in_gate_area(struct task_struct *task, unsigned long addr)
1077{
1078	struct vm_area_struct *vma = get_gate_vma(task);
1079
1080	if (!vma)
1081		return 0;
1082
1083	return (addr >= vma->vm_start) && (addr < vma->vm_end);
1084}
1085
1086/*
1087 * Use this when you have no reliable task/vma, typically from interrupt
1088 * context. It is less reliable than using the task's vma and may give
1089 * false positives:
1090 */
1091int in_gate_area_no_task(unsigned long addr)
1092{
1093	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1094}
1095
1096const char *arch_vma_name(struct vm_area_struct *vma)
1097{
1098	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
1099		return "[vdso]";
1100	if (vma == &gate_vma)
1101		return "[vsyscall]";
1102	return NULL;
1103}
1104
1105#ifdef CONFIG_SPARSEMEM_VMEMMAP
1106/*
1107 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
1108 */
/*
 * State of the vmemmap run currently being accumulated for the debug
 * printout: virtual range [addr_start, addr_end), backing memory
 * [p_start, p_end) and the node it was allocated for.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
1112
/*
 * Back the struct page array [start_page, start_page + size) with memory.
 *
 * @size: number of struct page entries, not bytes.
 * @node: node passed to the vmemmap allocation helpers.
 *
 * Without PSE the range is populated one 4k page at a time; with PSE each
 * empty pmd gets a freshly allocated PMD_SIZE block mapped as one large
 * page, and pmds that are already set are only verified.
 *
 * Returns 0 on success or -ENOMEM if any allocation fails.
 */
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			/* 4k path: advance to the next page boundary. */
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			/* Large-page path: advance to the next pmd boundary or end. */
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					/* Run broken: print the finished one, start a new one. */
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}
1182
1183void __meminit vmemmap_populate_print_last(void)
1184{
1185	if (p_start) {
1186		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1187			addr_start, addr_end-1, p_start, p_end-1, node_start);
1188		p_start = NULL;
1189		p_end = NULL;
1190		node_start = 0;
1191	}
1192}
1193#endif
1194