process_64.c revision dca2d6ac09d9ef59ff46820d4f0c94b08a671202
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>
#include <asm/hw_breakpoint.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
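/*
 * Usage sketch for the idle notifier API above (hypothetical consumer, not
 * part of this file): a driver that wants to react when this CPU enters or
 * leaves the idle loop could do something like
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("cpu %d entering idle\n", smp_processor_id());
 *		else if (action == IDLE_END)
 *			pr_debug("cpu %d leaving idle\n", smp_processor_id());
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *
 * The chain is fired from enter_idle()/__exit_idle() below, on the CPU that
 * is idling.
 */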
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif
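/*
 * Note on the loop below: TS_POLLING tells the scheduler that the idle task
 * polls need_resched(), so a remote CPU that wants to wake us can just set
 * TIF_NEED_RESCHED and skip the reschedule IPI.  Idle routines that really
 * sleep (e.g. default_idle()'s "hlt") clear the flag around the sleep
 * themselves.
 */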
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
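/*
 * When 'all' is zero only the instruction pointer, stack pointer and general
 * purpose registers are printed; the segment bases, control registers and
 * debug registers are skipped.
 */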
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
	if (unlikely(dead_task->thread.debugreg7))
		flush_thread_hw_breakpoint(dead_task);
}
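/*
 * TLS helpers used by do_arch_prctl() further down: a base that fits in
 * 32 bits (<= 0xffffffff) is installed as an ordinary GDT TLS descriptor so
 * that switching to it is a cheap segment reload, while larger bases have to
 * go through the FS/GS base MSRs.  set_32bit_tls() fills the descriptor for
 * the given TLS slot (FS_TLS/GS_TLS) and read_32bit_tls() reads the base
 * back out of it.
 */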
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
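/*
 * copy_thread() sets up the kernel-side state of a new task: a pt_regs frame
 * at the top of the child's kernel stack (with ax forced to 0 so the child
 * sees a zero return from fork/clone), a snapshot of the parent's segment
 * selectors and FS/GS bases, copies of the hardware-breakpoint state and I/O
 * bitmap if the parent uses them, and, for CLONE_SETTLS, the child's
 * thread-local storage base.
 */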
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
		if (copy_thread_hw_breakpoint(me, p, clone_flags))
			goto out;

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	if (err)
		flush_thread_hw_breakpoint(p);

	return err;
}
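/*
 * start_thread() resets the user-visible CPU state for a freshly exec'ed
 * 64-bit task: zero data/FS/GS selectors, the new instruction and stack
 * pointers, __USER_CS/__USER_DS, and flags set to 0x200, which is just
 * X86_EFLAGS_IF (interrupts enabled, everything else cleared).
 */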
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
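/*
 * Convention used by __switch_to() for the FS/GS state in thread_struct:
 * a non-zero fsindex/gsindex means the base comes from a descriptor (for
 * example the 32-bit TLS slots), while a non-zero thread.fs/thread.gs means
 * a full 64-bit base that has to be written via MSR_FS_BASE /
 * MSR_KERNEL_GS_BASE.  The two are kept mutually exclusive by
 * do_arch_prctl() further down.
 */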
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();
	/*
	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
	 * call before current is updated.  Suppose a kernel breakpoint is
	 * triggered in between the two, the hw-breakpoint handler will see that
	 * the 'current' task does not have TIF_DEBUG flag set and will think it
	 * is leftover from an old task (lazy switching) and will erase it. Then
	 * until the next context switch, no user-breakpoints will be installed.
	 *
	 * The real problem is that it's impossible to update both current and
	 * physical debug registers at the same instant, so there will always be
	 * a window in which they disagree and a breakpoint might get triggered.
	 * Since we use lazy switching, we are forced to assume that a
	 * disagreement means that current is correct and the exception is due
	 * to lazy debug register switching.
	 */
	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
		arch_install_thread_hw_breakpoint(next_p);

	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
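/*
 * If the caller passes a NULL stack pointer, the child simply reuses the
 * parent's current user stack pointer, which is what plain fork()/vfork()
 * semantics expect.
 */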
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
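/*
 * get_wchan() walks the sleeping task's kernel stack by following saved
 * frame pointers: thread.sp points at the switched-out frame, *(u64 *)fp is
 * the previous frame and *(u64 *)(fp + 8) the return address.  The walk
 * returns the first return address outside the scheduler (that is what the
 * task is waiting in), and gives up after 16 frames or when a frame pointer
 * leaves the task's stack.  This obviously relies on frame pointers being
 * maintained in the kernel build.
 */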
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
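/*
 * do_arch_prctl() implements the arch_prctl(2) codes ARCH_SET_FS/GS and
 * ARCH_GET_FS/GS.  Small bases (<= 4GB) are installed via a GDT TLS slot,
 * large ones via the FS/GS base MSRs; see set_32bit_tls() above.
 *
 * Rough user-space sketch (hypothetical example, not from this file):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)my_tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * For the ARCH_GET_* codes, 'addr' is a pointer to where the current base
 * gets stored.
 */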
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}