process_64.c revision 3ca50496c2677a2b3fdd3ede86660fd1433beac6
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

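/*
 * Per-CPU bookkeeping: old_rsp tracks the current task's user-space stack
 * pointer (thread.usersp) and is updated in start_thread_common() and on
 * every context switch in __switch_to(); is_idle marks whether this CPU is
 * currently inside the idle loop (see enter_idle()/__exit_idle()).
 */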
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

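/* Mark this CPU as idle and run the IDLE_START notifier chain. */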
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

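/*
 * Called when the task is being released; by this point the dead process
 * must no longer own an LDT.
 */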
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

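/*
 * Install a 32-bit, flat 4GB segment with the given base into one of the
 * task's TLS slots; read_32bit_tls() below reads the base back out.
 */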
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

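/*
 * Set up the child's thread state for fork/clone: build its pt_regs at the
 * top of the child kernel stack, copy the segment registers, duplicate the
 * parent's I/O permission bitmap if it has one, and honour CLONE_SETTLS.
 */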
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

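/*
 * Reset register and segment state for a freshly exec'd image: user code,
 * stack and data selectors, null FS/GS selectors, interrupts enabled, and
 * the old image's extended FP state freed.
 */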
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

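/*
 * Best-effort "waiting channel": walk the sleeping task's saved frame
 * pointers and return the first return address that is not a scheduler
 * function, or 0 if nothing usable is found.
 */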
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

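/*
 * Set or query the FS/GS base of @task.  Bases that fit in 32 bits are
 * handled through a GDT TLS slot because that is cheaper to switch;
 * larger bases go through the MSRs.  Also used by copy_thread() above
 * for CLONE_SETTLS on 64-bit children.
 */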
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

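/* The arch_prctl(2) entry point; operates on the calling task. */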
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

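/*
 * User stack pointer of a task: compat tasks report pt_regs->sp, 64-bit
 * tasks report thread.usersp (kept in sync with old_rsp above).
 */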
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}