process_64.c revision b3b0870ef3ffed72b92415423da864f440f57ad6
/*
 * Copyright (C) 1995  Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
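/*
 * Example (not part of this file): code elsewhere in the kernel can
 * hook CPU idle transitions through the chain above.  A minimal
 * sketch, using a hypothetical my_idle_event() handler that receives
 * IDLE_START or IDLE_END as the action argument:
 *
 *      static int my_idle_event(struct notifier_block *nb,
 *                               unsigned long action, void *unused)
 *      {
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_event,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */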
void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread.  There's no useful work to be done, so just try to
 * conserve power and have a low exit latency (i.e. sit in a loop
 * waiting for somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_idle_enter();
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_touch_nmi();
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();

                        /* enter_idle() needs rcu for notifiers */
                        rcu_idle_enter();

                        if (cpuidle_idle_call())
                                pm_idle();

                        rcu_idle_exit();
                        start_critical_timings();

                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle.  But some idle
                         * loops can be woken up without an interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
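/*
 * Note on the contract above: a pm_idle implementation is entered with
 * interrupts disabled and must re-enable them in the same instruction
 * that starts idling, or a wakeup can be lost.  A minimal sketch of a
 * conforming routine (default_idle does essentially this):
 *
 *      static void example_idle(void)
 *      {
 *              if (!need_resched())
 *                      safe_halt();
 *              else
 *                      local_irq_enable();
 *      }
 *
 * safe_halt() issues "sti; hlt", enabling interrupts and halting
 * atomically so nothing can slip in between the two steps.
 */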
/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
               regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
               es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
               cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                               dead_task->comm,
                               dead_task->mm->context.ldt,
                               dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}
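/*
 * The 32-bit TLS helpers above operate on the per-thread GDT slots in
 * tls_array.  fill_ldt() scatters the 32-bit base across the
 * descriptor's base0/base1/base2 fields, and get_desc_base()
 * reassembles it, conceptually:
 *
 *      base = desc->base0 | (desc->base1 << 16) | (desc->base2 << 24);
 */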
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        if (user_mode(regs))
                childregs->sp = sp;
        else
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        regs->ip = new_ip;
        regs->sp = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs = _cs;
        regs->ss = _ss;
        regs->flags = X86_EFLAGS_IF;
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif
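/*
 * start_thread() is called by the binary format loaders once a new
 * program image has been set up; load_elf_binary(), for example, does
 * roughly:
 *
 *      start_thread(regs, elf_entry, bprm->p);
 *
 * so the first return to user space lands on the ELF entry point with
 * the freshly built argument stack.
 */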
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        __unlazy_fpu(prev_p);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When the previous process used
         * a 64-bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 * clear the 64-bit base, since an overloaded base is
                 * always mapped to the NULL selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when the next process has a 64-bit base, use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                     (unsigned long)task_stack_page(next_p) +
                     THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: overwrites user setup.  Should have two bits.
         * But 64-bit processes have always behaved this way,
         * so it's not too bad.  The main problem is just that
         * 32-bit children are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_IA32);
        current->personality |= force_personality32;

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        /* Prepare the first "return" to user space */
        current_thread_info()->status |= TS_COMPAT;
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
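/*
 * get_wchan() above relies on frame-pointer call frames, i.e. each
 * frame laid out as:
 *
 *      [fp + 8]   return address
 *      [fp + 0]   caller's saved frame pointer
 *
 * which is only guaranteed with CONFIG_FRAME_POINTER-style frames; the
 * walk is bounded at 16 frames so a corrupt chain cannot loop forever.
 */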
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
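/*
 * User space reaches do_arch_prctl() through the arch_prctl(2) system
 * call, typically to point %fs at a thread's TLS block the way glibc's
 * thread setup does.  A minimal sketch:
 *
 *      #include <asm/prctl.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 */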