process_64.c revision 2311f0de21c17b2a8b960677a9cccfbfa52beb35
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

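/*
 * Per-CPU slot holding the current task's user-space stack pointer while
 * it runs in the kernel (see start_thread() and __switch_to() below).
 */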
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

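/* Default clone flags used when creating kernel threads. */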
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

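/*
 * Notifier chain called with IDLE_START/IDLE_END events as this CPU
 * enters and leaves the idle loop.
 */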
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

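/* Clear the per-CPU idle flag and fire IDLE_END at most once per idle period. */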
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

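/*
 * Fill one of the task's TLS GDT slots with a 32-bit segment based at
 * the given address (used by do_arch_prctl() for bases that fit in 32 bits).
 */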
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

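/*
 * Set up the kernel stack and thread state of a newly forked task: build
 * its pt_regs frame, copy the parent's segment state and I/O bitmap, and
 * install a new TLS entry if CLONE_SETTLS was requested.
 */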
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

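/*
 * Reset the register and segment state so the current task starts
 * executing at new_ip on new_sp as a fresh 64-bit user process.
 */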
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

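/*
 * Walk the saved frame-pointer chain on a sleeping task's kernel stack and
 * return the first return address outside the scheduler, i.e. where the
 * task is blocked (this is what gets reported as the task's wchan).
 */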
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

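/*
 * arch_prctl() backend: ARCH_SET_FS/ARCH_SET_GS change the 64-bit FS/GS
 * segment bases (via a GDT TLS slot for bases that fit in 32 bits, via
 * the MSRs otherwise); ARCH_GET_FS/ARCH_GET_GS read them back.
 */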
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

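/*
 * Randomize the initial user stack pointer by up to 8KB (unless address
 * space randomization is disabled) and keep it 16-byte aligned.
 */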
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

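/* Pick a randomized heap start within 32MB above the ELF brk. */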
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
