process_64.c revision dca2d6ac09d9ef59ff46820d4f0c94b08a671202
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>
#include <asm/hw_breakpoint.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
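/*
 * Illustrative sketch, not part of this file: a subsystem that wants to
 * know when a CPU enters or leaves the idle loop can hook the notifier
 * chain above. my_idle_notify/my_idle_nb are hypothetical names; only
 * idle_notifier_register()/idle_notifier_unregister() and the
 * IDLE_START/IDLE_END events come from this file.
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *data)
 *      {
 *              if (action == IDLE_START)
 *                      ;       // this CPU is about to idle
 *              else if (action == IDLE_END)
 *                      ;       // this CPU resumed useful work
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      // init:      idle_notifier_register(&my_idle_nb);
 *      // teardown:  idle_notifier_unregister(&my_idle_nb);
 */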
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us. CPU0 already has it initialized but no harm in
         * doing it again. This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();
                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle. But some idle
                         * loops can be woken up without interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
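/*
 * Illustrative sketch, not part of this file: an idle routine installed
 * via the pm_idle hook must honor the contract documented in cpu_idle()
 * above -- it is entered with interrupts disabled and must not return
 * until it has re-enabled them. This is roughly the shape of the generic
 * default_idle(); my_idle is a hypothetical, simplified stand-in.
 *
 *      static void my_idle(void)
 *      {
 *              if (!need_resched())
 *                      safe_halt();    // sti; hlt -- wakes on the next interrupt
 *              else
 *                      local_irq_enable();
 *      }
 */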
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;
        const char *board;

        printk("\n");
        print_modules();
        board = dmi_get_system_info(DMI_PRODUCT_NAME);
        if (!board)
                board = "";
        printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version, board);
        printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
                regs->sp, regs->flags);
        printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
                regs->ax, regs->bx, regs->cx);
        printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
                regs->dx, regs->si, regs->di);
        printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
                regs->bp, regs->r8, regs->r9);
        printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
                regs->r10, regs->r11, regs->r12);
        printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
                regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
                fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                es, cr0);
        printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
        printk(KERN_INFO "CPU %d:", smp_processor_id());
        __show_regs(regs, 1);
        show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
        if (unlikely(dead_task->thread.debugreg7))
                flush_thread_hw_breakpoint(dead_task);
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
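/*
 * Worked note on set_32bit_tls() above, purely descriptive: with
 * .limit = 0xfffff and .limit_in_pages = 1 the descriptor's limit is
 * (0xfffff + 1) * 4 KiB = 4 GiB, i.e. the whole 32-bit address space.
 * Only the base address is really per-thread; the limit just makes the
 * segment a flat 4 GiB window starting at .base_addr.
 */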
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        childregs->sp = sp;
        if (sp == ~0UL)
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        savesegment(fs, p->thread.fsindex);
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
                if (copy_thread_hw_breakpoint(me, p, clone_flags))
                        goto out;

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                        IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }

        clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
        p->thread.ds_ctx = NULL;

        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
        p->thread.debugctlmsr = 0;

        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        if (err)
                flush_thread_hw_breakpoint(p);

        return err;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        loadsegment(fs, 0);
        loadsegment(es, 0);
        loadsegment(ds, 0);
        load_gs_index(0);
        regs->ip = new_ip;
        regs->sp = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs = __USER_CS;
        regs->ss = __USER_DS;
        regs->flags = 0x200;    /* X86_EFLAGS_IF: start with interrupts enabled */
        set_fs(USER_DS);
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
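/*
 * Illustrative note on the CLONE_SETTLS path in copy_thread() above, not
 * part of this file: the TLS argument is picked up from the child's
 * register frame. For 32-bit (IA32) tasks childregs->si holds a
 * struct user_desc pointer; for 64-bit tasks childregs->r8 holds the new
 * FS base -- %r8 being the fifth syscall argument register in the x86-64
 * ABI (%rdi, %rsi, %rdx, %r10, %r8, %r9). Schematically, a 64-bit raw
 * caller passes the base as the fifth clone() argument:
 *
 *      // fifth raw-syscall argument lands in %r8 and becomes the FS base
 *      syscall(SYS_clone, CLONE_VM | CLONE_SETTLS | ...,
 *              child_stack, &parent_tid, &child_tid, tls_base);
 */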
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used the FPU in the last 5 timeslices, just do a
         * full restore of the math state immediately to avoid the trap; the
         * chances of needing the FPU soon are obviously high now.
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->xstate);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /* Make sure cpu is ready for new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload. Also
         * reload when it has changed. When the previous process used a
         * 64bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 * clear 64bit base, since overloaded base is always
                 * mapped to the Null selector
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when next process has a 64bit base use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                (unsigned long)task_stack_page(next_p) +
                THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();

        /*
         * There's a problem with moving the arch_install_thread_hw_breakpoint()
         * call before current is updated. Suppose a kernel breakpoint is
         * triggered in between the two, the hw-breakpoint handler will see that
         * the 'current' task does not have TIF_DEBUG flag set and will think it
         * is leftover from an old task (lazy switching) and will erase it. Then
         * until the next context switch, no user-breakpoints will be installed.
         *
         * The real problem is that it's impossible to update both current and
         * physical debug registers at the same instant, so there will always be
         * a window in which they disagree and a breakpoint might get triggered.
         * Since we use lazy switching, we are forced to assume that a
         * disagreement means that current is correct and the exception is due
         * to lazy debug register switching.
         */
        if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
                arch_install_thread_hw_breakpoint(next_p);

        return prev_p;
}
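/*
 * Worked example of the FS switch test in __switch_to() above, purely
 * descriptive: (fsindex | next->fsindex | prev->fs) is nonzero, and a
 * reload therefore happens, when any of the following holds:
 *
 *      fsindex != 0        the outgoing task had a real selector loaded,
 *                          so %fs must be rewritten for the incoming one;
 *      next->fsindex != 0  the incoming task needs its own selector
 *                          (e.g. FS_TLS_SEL);
 *      prev->fs != 0       the outgoing task had a 64bit base in
 *                          MSR_FS_BASE, which must not leak into next.
 *
 * Only when all three are zero -- both tasks run with selector 0 and a
 * zero base -- can the reload be skipped. The GS logic is symmetric,
 * using load_gs_index() and MSR_KERNEL_GS_BASE.
 */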
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs *regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, regs);
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->sp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
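/*
 * Illustrative user-space view of the arch_prctl interface above, not
 * part of this file: bases at or below 0xffffffff take the fast GDT path
 * (FS_TLS/GS_TLS slots), larger ones are written to the base MSR. GS is
 * used in the example because 64-bit userspace normally leaves it free
 * (glibc's TLS lives in FS, so a real program must not clobber the FS
 * base).
 *
 *      #include <asm/prctl.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_SET_GS, 0x10000UL); // small: GDT slot
 *      syscall(SYS_arch_prctl, ARCH_GET_GS, &base);     // base == 0x10000
 */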