#ifndef _I386_PGTABLE_H
#define _I386_PGTABLE_H

/*
 * The Linux memory management assumes a three-level page table setup. On
 * the i386, we use that, but "fold" the mid level into the top-level page
 * table, so that we physically have the same two-level page table as the
 * i386 mmu expects.
 *
 * This file contains the functions and defines necessary to modify and use
 * the i386 page table tree.
 */
#ifndef __ASSEMBLY__
#include <asm/processor.h>
#include <asm/fixmap.h>
#include <linux/threads.h>
#include <asm/paravirt.h>

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct mm_struct;
struct vm_area_struct;

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..
 */
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
extern unsigned long empty_zero_page[1024];
extern pgd_t swapper_pg_dir[1024];
extern struct kmem_cache *pmd_cache;
extern spinlock_t pgd_lock;
extern struct page *pgd_list;
void check_pgt_cache(void);

void pmd_ctor(struct kmem_cache *, void *);
void pgtable_cache_init(void);
void paging_init(void);


/*
 * The Linux x86 paging architecture is 'compile-time dual-mode', it
 * implements both the traditional 2-level x86 page tables and the
 * newer 3-level PAE-mode page tables.
 */
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level-defs.h>
# define PMD_SIZE	(1UL << PMD_SHIFT)
# define PMD_MASK	(~(PMD_SIZE-1))
#else
# include <asm/pgtable-2level-defs.h>
#endif

#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
#define PGDIR_MASK	(~(PGDIR_SIZE-1))

#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
#define FIRST_USER_ADDRESS	0

#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)

#define TWOLEVEL_PGDIR_SHIFT	22
#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)

/* Just any arbitrary offset to the start of the vmalloc VM area: the
 * current 8MB value just means that there will be an 8MB "hole" after the
 * physical memory until the kernel virtual memory starts. That means that
 * any out-of-bounds memory accesses will hopefully be caught.
 * The vmalloc() routines leave a hole of 4kB between each vmalloced
 * area for the same reason. ;)
 */
#define VMALLOC_OFFSET	(8*1024*1024)
#define VMALLOC_START	(((unsigned long) high_memory + \
			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
#else
# define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
#endif
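
/*
 * For illustration (not part of the original header): a worked example of
 * the VMALLOC_START alignment arithmetic above, assuming a hypothetical
 * high_memory of 0xf8000000 (896MB of lowmem above PAGE_OFFSET 0xc0000000):
 *
 *	0xf8000000 + 2*0x00800000 - 1 = 0xf8ffffff
 *	0xf8ffffff & ~(0x00800000 - 1) = 0xf8800000
 *
 * so vmalloc space would start at 0xf8800000, leaving the intended 8MB
 * hole between the end of physical memory and VMALLOC_START.
 */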
/*
 * _PAGE_PSE set in the page directory entry just means that
 * the page directory entry points directly to a 4MB-aligned block of
 * memory.
 */
#define _PAGE_BIT_PRESENT	0
#define _PAGE_BIT_RW		1
#define _PAGE_BIT_USER		2
#define _PAGE_BIT_PWT		3
#define _PAGE_BIT_PCD		4
#define _PAGE_BIT_ACCESSED	5
#define _PAGE_BIT_DIRTY		6
#define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page, Pentium+, if present.. */
#define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
#define _PAGE_BIT_UNUSED1	9	/* available for programmer */
#define _PAGE_BIT_UNUSED2	10
#define _PAGE_BIT_UNUSED3	11
#define _PAGE_BIT_NX		63

#define _PAGE_PRESENT	0x001
#define _PAGE_RW	0x002
#define _PAGE_USER	0x004
#define _PAGE_PWT	0x008
#define _PAGE_PCD	0x010
#define _PAGE_ACCESSED	0x020
#define _PAGE_DIRTY	0x040
#define _PAGE_PSE	0x080	/* 4 MB (or 2MB) page, Pentium+, if present.. */
#define _PAGE_GLOBAL	0x100	/* Global TLB entry PPro+ */
#define _PAGE_UNUSED1	0x200	/* available for programmer */
#define _PAGE_UNUSED2	0x400
#define _PAGE_UNUSED3	0x800

/* If _PAGE_PRESENT is clear, we use these: */
#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
#define _PAGE_PROTNONE	0x080	/* if the user mapped it with PROT_NONE;
				   pte_present gives true */
#ifdef CONFIG_X86_PAE
#define _PAGE_NX	(1ULL<<_PAGE_BIT_NX)
#else
#define _PAGE_NX	0
#endif

#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

#define PAGE_NONE \
	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
#define PAGE_SHARED \
	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)

#define PAGE_SHARED_EXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_COPY_NOEXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
#define PAGE_COPY_EXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
#define PAGE_COPY \
	PAGE_COPY_NOEXEC
#define PAGE_READONLY \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
#define PAGE_READONLY_EXEC \
	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)

#define _PAGE_KERNEL \
	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
#define _PAGE_KERNEL_EXEC \
	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)

extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
#define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
#define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)

#define PAGE_KERNEL		__pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO)
#define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC)
#define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX)
#define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE)
#define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE)
#define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC)

/*
 * The i386 can't do page protection for execute, and treats it
 * the same as read. Also, write permissions imply read permissions.
 * This is the closest we can get..
 */
#define __P000	PAGE_NONE
#define __P001	PAGE_READONLY
#define __P010	PAGE_COPY
#define __P011	PAGE_COPY
#define __P100	PAGE_READONLY_EXEC
#define __P101	PAGE_READONLY_EXEC
#define __P110	PAGE_COPY_EXEC
#define __P111	PAGE_COPY_EXEC

#define __S000	PAGE_NONE
#define __S001	PAGE_READONLY
#define __S010	PAGE_SHARED
#define __S011	PAGE_SHARED
#define __S100	PAGE_READONLY_EXEC
#define __S101	PAGE_READONLY_EXEC
#define __S110	PAGE_SHARED_EXEC
#define __S111	PAGE_SHARED_EXEC
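
/*
 * For illustration (not in the original header): the __P/__S tables above
 * are indexed by the mmap() protection bits in "xwr" order, __Pxwr for
 * MAP_PRIVATE and __Sxwr for MAP_SHARED. For example, a private
 * PROT_READ|PROT_WRITE mapping selects __P011 == PAGE_COPY, which has
 * _PAGE_RW clear so that the first write faults and can be handled as
 * copy-on-write, while the shared equivalent selects __S011 ==
 * PAGE_SHARED, which really is writable (_PAGE_RW set).
 */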
/*
 * Define this if things work differently on an i386 and an i486:
 * it will (on an i486) warn about kernel memory accesses that are
 * done without an 'access_ok(VERIFY_WRITE,..)'
 */
#undef TEST_ACCESS_OK

/* The boot page tables (all created as a single array) */
extern unsigned long pg0[];

#define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))

/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
#define pmd_none(x)	(!(unsigned long)pmd_val(x))
#define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
#define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)


#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not..
 */
static inline int pte_dirty(pte_t pte)		{ return (pte).pte_low & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte)		{ return (pte).pte_low & _PAGE_ACCESSED; }
static inline int pte_write(pte_t pte)		{ return (pte).pte_low & _PAGE_RW; }
static inline int pte_huge(pte_t pte)		{ return (pte).pte_low & _PAGE_PSE; }

/*
 * The following only works if pte_present() is not true.
 */
static inline int pte_file(pte_t pte)		{ return (pte).pte_low & _PAGE_FILE; }

static inline pte_t pte_mkclean(pte_t pte)	{ (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
static inline pte_t pte_mkold(pte_t pte)	{ (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
static inline pte_t pte_wrprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_RW; return pte; }
static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }

#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
# include <asm/pgtable-2level.h>
#endif
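
/*
 * For illustration (not in the original header): the pte_mk* helpers
 * above operate on a copy of the PTE and return the modified value, so
 * a typical caller reads the entry, transforms it, and writes it back,
 * e.g. to make an entry writable and dirty in one shot:
 *
 *	pte_t pte = *ptep;
 *	pte = pte_mkdirty(pte_mkwrite(pte));
 *	set_pte_at(mm, addr, ptep, pte);
 *
 * Nothing touches the page table until the final set_pte_at().
 */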
#ifndef CONFIG_PARAVIRT
/*
 * Rules for using pte_update - it must be called after any PTE update which
 * has not been done using the set_pte / clear_pte interfaces. It is used by
 * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
 * updates should either be sets, clears, or set_pte_atomic for P->P
 * transitions, which means this hook should only be called for user PTEs.
 * This hook implies a P->P protection or access change has taken place, which
 * requires a subsequent TLB flush. The notification can optionally be delayed
 * until the TLB flush event by using the pte_update_defer form of the
 * interface, but care must be taken to ensure that the flush happens while
 * still holding the same page table lock so that the shadow and primary pages
 * do not become out of sync on SMP.
 */
#define pte_update(mm, addr, ptep)		do { } while (0)
#define pte_update_defer(mm, addr, ptep)	do { } while (0)
#endif

/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
	pte_t res = *ptep;

	/* Pure native function needs no input for mm, addr */
	native_pte_clear(NULL, 0, ptep);
	return res;
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPU's that might be updating the dirty
 * bit at the same time.
 */
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
({									\
	int __changed = !pte_same(*(ptep), entry);			\
	if (__changed && dirty) {					\
		(ptep)->pte_low = (entry).pte_low;			\
		pte_update_defer((vma)->vm_mm, (address), (ptep));	\
		flush_tlb_page(vma, address);				\
	}								\
	__changed;							\
})

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young(vma, addr, ptep) ({			\
	int __ret = 0;							\
	if (pte_young(*(ptep)))						\
		__ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,		\
						&(ptep)->pte_low);	\
	if (__ret)							\
		pte_update((vma)->vm_mm, addr, ptep);			\
	__ret;								\
})

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young(vma, address, ptep)			\
({									\
	int __young;							\
	__young = ptep_test_and_clear_young((vma), (address), (ptep));	\
	if (__young)							\
		flush_tlb_page(vma, address);				\
	__young;							\
})

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pte_t pte = native_ptep_get_and_clear(ptep);
	pte_update(mm, addr, ptep);
	return pte;
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
{
	pte_t pte;
	if (full) {
		/*
		 * Full address destruction in progress; paravirt does not
		 * care about updates and native needs no locking
		 */
		pte = native_local_ptep_get_and_clear(ptep);
	} else {
		pte = ptep_get_and_clear(mm, addr, ptep);
	}
	return pte;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
	pte_update(mm, addr, ptep);
}

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
}

/*
 * Macro to mark a page protection value as "uncacheable".
 * On processors which do not support it, this is a no-op.
 */
#define pgprot_noncached(prot)					\
	((boot_cpu_data.x86 > 3)				\
	 ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))	\
	 : (prot))
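
/*
 * For illustration (not in the original header): a typical user of
 * clone_pgd_range() is pgd constructor code that seeds a freshly
 * allocated pgd with the shared kernel mappings from swapper_pg_dir,
 * copying only the kernel half of the page directory:
 *
 *	clone_pgd_range(pgd + USER_PTRS_PER_PGD,
 *			swapper_pg_dir + USER_PTRS_PER_PGD,
 *			KERNEL_PGD_PTRS);
 */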
/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

#define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))

static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
	pte.pte_low &= _PAGE_CHG_MASK;
	pte.pte_low |= pgprot_val(newprot);
#ifdef CONFIG_X86_PAE
	/*
	 * Chop off the NX bit (if present), and add the NX portion of
	 * the newprot (if present):
	 */
	pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
	pte.pte_high |= (pgprot_val(newprot) >> 32) &
					(__supported_pte_mask >> 32);
#endif
	return pte;
}

#define pmd_large(pmd) \
((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))

/*
 * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
 *
 * this macro returns the index of the entry in the pgd page which would
 * control the given virtual address
 */
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
#define pgd_index_k(addr) pgd_index(addr)

/*
 * pgd_offset() returns a (pgd_t *)
 * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
 */
#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))

/*
 * a shortcut which implies the use of the kernel's pgd, instead
 * of a process's
 */
#define pgd_offset_k(address) pgd_offset(&init_mm, address)

/*
 * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
 *
 * this macro returns the index of the entry in the pmd page which would
 * control the given virtual address
 */
#define pmd_index(address) \
		(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))

/*
 * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
 *
 * this macro returns the index of the entry in the pte page which would
 * control the given virtual address
 */
#define pte_index(address) \
		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
#define pte_offset_kernel(dir, address) \
	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))

#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))

#define pmd_page_vaddr(pmd) \
		((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))

/*
 * Helper function that returns the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry present.
 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
 * as a pte too.
 */
extern pte_t *lookup_address(unsigned long address);
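
/*
 * For illustration (not in the original header): a worked example of the
 * index macros above on a non-PAE (2-level) kernel, where PGDIR_SHIFT is
 * 22 and PTRS_PER_PGD == PTRS_PER_PTE == 1024. For the hypothetical
 * kernel address 0xc0123456:
 *
 *	pgd_index(0xc0123456) = 0xc0123456 >> 22           = 0x300
 *	pte_index(0xc0123456) = (0xc0123456 >> 12) & 0x3ff = 0x123
 *
 * and a minimal kernel-address walk with these helpers might look like:
 *
 *	pgd_t *pgd = pgd_offset_k(addr);
 *	pud_t *pud = pud_offset(pgd, addr);	// folded level on i386
 *	pmd_t *pmd = pmd_offset(pud, addr);	// folded unless PAE
 *	pte_t *pte = pte_offset_kernel(pmd, addr);
 */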
/*
 * Make a given kernel text page executable/non-executable.
 * Returns the previous executability setting of that page (which
 * is used to restore the previous state). Used by the SMP bootup code.
 * NOTE: this is an __init function for security reasons.
 */
#ifdef CONFIG_X86_PAE
 extern int set_kernel_exec(unsigned long vaddr, int enable);
#else
 static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0; }
#endif

#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
#define pte_offset_map_nested(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
#else
#define pte_offset_map(dir, address) \
	((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
#define pte_unmap(pte) do { } while (0)
#define pte_unmap_nested(pte) do { } while (0)
#endif

/* Clear a kernel PTE and flush it from the TLB */
#define kpte_clear_flush(ptep, vaddr)					\
do {									\
	pte_clear(&init_mm, vaddr, ptep);				\
	__flush_tlb_one(vaddr);						\
} while (0)

/*
 * The i386 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
#define update_mmu_cache(vma,address,pte) do { } while (0)

void native_pagetable_setup_start(pgd_t *base);
void native_pagetable_setup_done(pgd_t *base);

#ifndef CONFIG_PARAVIRT
static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	native_pagetable_setup_start(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	native_pagetable_setup_done(base);
}
#endif	/* !CONFIG_PARAVIRT */

#endif /* !__ASSEMBLY__ */

#ifdef CONFIG_FLATMEM
#define kern_addr_valid(addr)	(1)
#endif /* CONFIG_FLATMEM */

#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
		remap_pfn_range(vma, vaddr, pfn, size, prot)

#include <asm-generic/pgtable.h>

#endif /* _I386_PGTABLE_H */