process_64.c revision 7e16838d94b566a17b65231073d179bc04d590c8
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
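
/*
 * Illustrative sketch only (not part of this file): a client that wants to
 * be told when this CPU enters or leaves idle hooks the chain above with an
 * atomic notifier, e.g.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("cpu %d entering idle\n", smp_processor_id());
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *
 * The chain is atomic and its callbacks run in the idle loop or in
 * interrupt context, so they must not sleep.
 */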

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

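/*
 * set_32bit_tls() installs a 32-bit TLS descriptor into one of the
 * thread's GDT TLS slots: base = addr, page-granular limit of 0xfffff
 * (i.e. covering the whole 4GB 32-bit address space), usable from user
 * space.  read_32bit_tls() only recovers the base address from such a
 * slot.
 */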
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

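/*
 * copy_thread() builds the child's initial state for fork/clone: a copy
 * of the parent's pt_regs at the top of the child's kernel stack with ax
 * cleared (so the child sees a return value of 0), a private copy of the
 * parent's I/O permission bitmap if it has one, and, for CLONE_SETTLS,
 * the new TLS taken from the clone arguments (a user_desc pointer in si
 * for 32-bit callers, an FS base in r8 for 64-bit callers).
 */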
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

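/*
 * start_thread_common() sets up the register state for the first return
 * to user space after exec: flat data segments, the requested CS/SS, a
 * cleared GS, interrupts enabled in EFLAGS, and ip/sp taken from the new
 * binary.  Any FPU/extended state left over from the old program is
 * freed.
 */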
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear the 64-bit base, since an overloaded base is
		 *  always mapped to the NULL selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next process has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64-bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: this overwrites the user's setup.  We should really have
	   two bits for it, but 64-bit processes have always behaved this
	   way, so it's not too bad.  The main problem is just that 32-bit
	   children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32-bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

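/*
 * get_wchan(): report the "waiting channel" of a sleeping task by walking
 * its saved frame pointers and returning the first return address that is
 * not inside the scheduler.  The walk is bounded to 16 frames and bails
 * out if a frame pointer leaves the task's stack.
 */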
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

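/*
 * do_arch_prctl() backs the arch_prctl(2) system call (see sys_arch_prctl()
 * below).  Illustrative user-space usage only, assuming <asm/prctl.h> and
 * <sys/syscall.h>:
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * (tls_block is a hypothetical pointer to the new thread-local storage
 * area.)  Note the asymmetry of "addr": for ARCH_SET_* it is the new base
 * value itself, for ARCH_GET_* it is a user pointer the current base is
 * written to.
 */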
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}