process_64.c revision c5c7fb8fbd7cd228132b6e2a17a10f246ffc06ee
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

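/*
 * Idle notifiers: other subsystems (e.g. drivers that care whether a CPU
 * is idle) can register here to be called when a CPU enters or leaves
 * the idle loop.
 */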
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

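/* Mark this CPU as idle and run the IDLE_START notifier chain. */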
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

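/*
 * Clear the per-cpu idle flag and run the IDLE_END notifiers, but only
 * if this CPU was actually marked idle.
 */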
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without an interrupt. */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		schedule_preempt_disabled();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

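/*
 * Nothing is freed here on x86-64; we only sanity-check that the dead
 * task's mm no longer owns an LDT.  Finding one would indicate a leak
 * somewhere in the exit path.
 */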
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

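/*
 * Install a 32-bit flat data segment with the given base in one of the
 * task's GDT TLS slots, so a small (<4GB) fs/gs base can be switched by
 * loading a selector instead of writing an MSR.
 */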
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

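/*
 * Set up the thread state of a newly forked child: copy the parent's
 * register frame onto the child's kernel stack, zero the child's return
 * value, duplicate the I/O permission bitmap if one exists and handle
 * CLONE_SETTLS.  Returns -ENOMEM if the bitmap copy fails.
 */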
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

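/*
 * Common helper for start_thread()/start_thread_ia32(): reset the
 * segment registers, point the register frame at the new image's entry
 * point and stack, and free the FPU/extended state left over from
 * before the exec.
 */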
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

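/*
 * Called at exec time for a native 64-bit binary: clear the IA-32
 * thread flag, unmark the mm and drop the personality bits that only
 * make sense for compat tasks.
 */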
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

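/*
 * Walk the sleeping task's frame-pointer chain and return the first
 * return address outside the scheduler, i.e. where the task went to
 * sleep.  Returns 0 for running tasks or if the stack looks bogus; the
 * walk is bounded to 16 frames.
 */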
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

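/*
 * Back end for arch_prctl(): get or set a task's FS/GS base.  Bases
 * below 4GB are installed through a GDT TLS slot (cheaper to switch);
 * larger bases are written directly to MSR_FS_BASE or
 * MSR_KERNEL_GS_BASE.  For the GET operations, addr is a user pointer
 * that receives the current base.
 */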
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

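/*
 * Report a task's user stack pointer.  For 64-bit tasks the value lives
 * in thread.usersp (saved there on syscall entry) rather than in the
 * pt_regs frame.
 */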
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}