process_64.c revision 66cb5917295958652ff6ba36d83f98f2379c46b4
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>
#include <asm/hw_breakpoint.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

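/*
 * Mark this CPU as idle and tell the idle notifier chain that an idle
 * period is starting.
 */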
void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

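/*
 * Clear the per-cpu idle flag and, only if it was set, notify the idle
 * notifier chain that the idle period has ended.
 */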
static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();
                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle.  But some idle
                         * loops can be woken up without interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;
        const char *board;

        printk("\n");
        print_modules();
        board = dmi_get_system_info(DMI_PRODUCT_NAME);
        if (!board)
                board = "";
        printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version, board);
        printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
        printk(KERN_INFO "CPU %d:", smp_processor_id());
        __show_regs(regs, 1);
        show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

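/*
 * Release architecture-specific state of an exited task: complain (and
 * BUG) if it still owns an LDT, and flush any remaining hardware
 * breakpoint state.
 */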
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
        if (unlikely(dead_task->thread.debugreg7))
                flush_thread_hw_breakpoint(dead_task);
}

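/*
 * Install a 32-bit TLS descriptor with the given base address into the
 * task's TLS array at index 'tls'.
 */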
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

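/*
 * Set up the thread state of a newly forked task: build its initial
 * pt_regs on the child's kernel stack, inherit the parent's segment
 * state, optionally install a new TLS (CLONE_SETTLS), and duplicate
 * the I/O bitmap and hardware breakpoints where present.
 */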
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
        struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        childregs->sp = sp;
        if (sp == ~0UL)
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        savesegment(fs, p->thread.fsindex);
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
                if (copy_thread_hw_breakpoint(me, p, clone_flags))
                        goto out;

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }

        clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
        p->thread.ds_ctx = NULL;

        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
        p->thread.debugctlmsr = 0;

        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        if (err)
                flush_thread_hw_breakpoint(p);

        return err;
}

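/*
 * Set up the register state for a freshly exec'ed 64-bit task: clear
 * the segment registers and point the saved user registers at the new
 * entry point and stack.
 */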
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        loadsegment(fs, 0);
        loadsegment(es, 0);
        loadsegment(ds, 0);
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = __USER_CS;
        regs->ss                = __USER_DS;
        regs->flags             = 0x200;
        set_fs(USER_DS);
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(next->xstate);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_leave_lazy_cpu_mode();

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When the prev process used a
         * 64bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 * clear the 64bit base, since an overloaded base is
                 * always mapped to the null selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when next process has a 64bit base use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * If the task has used the FPU during the last 5 timeslices, just do
         * a full restore of the math state immediately to avoid the trap;
         * the chances of needing the FPU soon are obviously high now.
         *
         * tsk_used_math() checks prevent calling math_state_restore(),
         * which can sleep in the case of !tsk_used_math().
         */
        if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
                math_state_restore();
        /*
         * There's a problem with moving the arch_install_thread_hw_breakpoint()
         * call before current is updated.  Suppose a kernel breakpoint is
         * triggered in between the two: the hw-breakpoint handler will see that
         * the 'current' task does not have the TIF_DEBUG flag set and will think
         * it is left over from an old task (lazy switching) and will erase it.
         * Then, until the next context switch, no user breakpoints will be
         * installed.
         *
         * The real problem is that it's impossible to update both current and
         * the physical debug registers at the same instant, so there will
         * always be a window in which they disagree and a breakpoint might get
         * triggered.  Since we use lazy switching, we are forced to assume that
         * a disagreement means that current is correct and the exception is due
         * to lazy debug register switching.
         */
        if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
                arch_install_thread_hw_breakpoint(next_p);

        return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs *regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, regs);
        putname(filename);
        return error;
}

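/*
 * Set up the personality of a native 64-bit process after exec.
 */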
void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /*
         * TBD: this overwrites the user's setup.  We should have two bits,
         * but 64bit processes have always behaved this way, so it's not
         * too bad.  The main problem is just that 32bit children are
         * affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

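/*
 * x86-64 clone entry point: if no new stack pointer was supplied, the
 * child keeps using the parent's current one.
 */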
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->sp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

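/*
 * Walk the saved frame pointers of a sleeping task (at most 16 frames,
 * staying within its stack page) and return the first return address
 * outside the scheduler, i.e. where the task blocked.
 */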
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

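/*
 * Set or read the FS/GS base of a task.  Small bases are handled via a
 * GDT (TLS) entry, larger ones via the FS/GS base MSRs.
 */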
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

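/*
 * arch_prctl() syscall entry point: operate on the current task.
 */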
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

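/*
 * Subtract a small random amount (up to 8k) from the stack pointer,
 * unless randomization is disabled, and align it to 16 bytes.
 */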
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}

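/*
 * Randomize the heap break within a 32MB window above the current brk,
 * falling back to the unrandomized value.
 */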
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}