process_64.c revision 2311f0de21c17b2a8b960677a9cccfbfa52beb35
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
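
/*
 * Usage sketch for the idle notifier chain above (hypothetical driver
 * code, not part of this file):
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		return NOTIFY_OK;	(act on IDLE_START / IDLE_END)
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *	idle_notifier_register(&my_idle_nb);
 *
 * Callbacks run via atomic_notifier_call_chain(), i.e. in atomic
 * context, so they must not sleep.
 */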
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
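
/*
 * Note: pm_idle above is a function pointer selected at boot
 * (default_idle(), mwait_idle(), poll_idle(), ... in
 * arch/x86/kernel/process.c). The contract assumed by the loop is
 * that the routine is entered with interrupts disabled and re-enables
 * them itself; default_idle(), for instance, executes "sti; hlt", so
 * the wakeup interrupt cannot slip in between the need_resched()
 * check and the halt.
 */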
/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
		regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
		es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
		cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;	/* X86_EFLAGS_IF: start with interrupts enabled */
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
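
/*
 * Caller's side of start_thread(), as a sketch: it is invoked by the
 * binfmt loaders once the new image is mapped; e.g. the ELF loader in
 * fs/binfmt_elf.c hands control to userspace with roughly
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * where elf_entry is the program (or interpreter) entry point and
 * bprm->p is the freshly set up user stack pointer.
 */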
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload. Also
	 * reload when it has changed. When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64-bit base, since an overloaded base is
		 * always mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next process has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		(unsigned long)task_stack_page(next_p) +
		THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the FPU during the last 5 timeslices,
	 * just do a full restore of the math state immediately to
	 * avoid the trap; the chances of needing the FPU soon are
	 * obviously high now.
	 *
	 * The tsk_used_math() check prevents us from calling
	 * math_state_restore(), which can sleep when !tsk_used_math().
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
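
/*
 * For reference, a sketch of the call path (not code in this file):
 * __switch_to() is not called directly; context_switch() in
 * kernel/sched.c uses the switch_to() macro from <asm/system.h>,
 * which swaps the kernel stacks in inline assembly and then calls in
 * here, roughly:
 *
 *	switch_to(prev, next, prev);
 */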
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64-bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user's setup; we should really have
	 * two bits. But 64-bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster
		 * to switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs.
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster
		 * to switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * Set the selector to 0 to not confuse
				 * __switch_to.
				 */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
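
/*
 * Userspace view of do_arch_prctl(), as a sketch (error handling
 * omitted): glibc exposes no wrapper for this call, so it is
 * typically made via syscall(2):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)some_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 *
 * Note that for ARCH_GET_FS/ARCH_GET_GS the "addr" argument is the
 * user pointer that the base is written to via put_user() above;
 * "some_block" is a hypothetical user buffer.
 */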
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
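
/*
 * Illustrative note on the two randomization helpers above:
 * arch_align_stack() lowers the starting user stack pointer by up to
 * 8191 bytes and then rounds down to a 16-byte boundary, i.e. the
 * result lies in [sp - 8191, sp] at 16-byte granularity.
 * arch_randomize_brk() picks a page-aligned break within 32MB
 * (0x02000000 bytes) above mm->brk; randomize_range() returns 0 on
 * failure, so the "?:" falls back to the unrandomized mm->brk.
 */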