process_64.c revision b3b0870ef3ffed72b92415423da864f440f57ad6
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

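/* First return path of a newly forked task; defined in entry_64.S. */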
asmlinkage extern void ret_from_fork(void);

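/*
 * old_rsp holds the current task's user stack pointer (see
 * start_thread_common() and the usersp hand-off in __switch_to());
 * is_idle records whether this CPU is inside the idle loop, so
 * exit_idle() knows whether the IDLE_END notifier still has to fire.
 */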
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
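
/*
 * Illustrative sketch (the my_* names are made up, not part of this
 * file): a subsystem that wants to hear about idle transitions could
 * register a notifier along these lines:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		if (action == IDLE_START)
 *			my_enter_low_power();	(IRQs are off here)
 *		else
 *			my_leave_low_power();	(action == IDLE_END)
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */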

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

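/*
 * Without CPU hotplug (!SMP) a CPU can never be marked offline, so
 * the idle loop should never reach play_dead() here.
 */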
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}


/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk(KERN_WARNING "WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

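/*
 * Helpers for keeping a small (32-bit) FS/GS base in a GDT TLS slot,
 * where it is cheaper to context-switch than an MSR write; used by
 * do_arch_prctl() below.
 */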
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

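/*
 * Set up the state of a freshly forked child: its pt_regs copy sits
 * at the top of the new kernel stack, %rax is zeroed so the child
 * sees a 0 return value from fork(), and the parent's segment, TLS
 * and I/O-bitmap state is duplicated.
 */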
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

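/*
 * Common execve() tail: load flat data selectors, point the registers
 * at the new program's entry and stack, enable interrupts, and drop
 * any FPU/extended state left over from the old image.
 */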
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	__unlazy_fpu(prev_p);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the previous process used
	 * a 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64-bit base, since an overloaded base is
		 * always mapped to the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/*
	 * TBD: this overwrites the user's setup. We should have two
	 * bits.  But 64-bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

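/*
 * Walk the frame-pointer chain of a sleeping task to find the first
 * return address outside the scheduler, i.e. the place the task
 * blocked.  The walk is bounded to 16 frames and every frame pointer
 * is range-checked against the task's stack, since a stale or
 * frame-pointer-less stack would otherwise send us chasing garbage.
 */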
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

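/*
 * Get or set the FS/GS base of an arbitrary task.  Bases that fit in
 * 32 bits are kept in a GDT TLS slot because that is cheaper to
 * context-switch; larger bases go through the FS/GS base MSRs.
 * "doit" means the task is current, so changes must also be applied
 * to the live registers.
 *
 * Userspace reaches this through the arch_prctl(2) syscall (see
 * sys_arch_prctl() below).  A rough sketch, illustrative only:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <asm/prctl.h>
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 * where tls_block is some user-allocated, suitably aligned memory.
 */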
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

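/*
 * User stack pointer for e.g. /proc/<pid>/stat.  Compat tasks have it
 * in pt_regs; for 64-bit tasks the syscall entry path keeps it in the
 * per-CPU old_rsp / thread.usersp instead, so read it from there.
 */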
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}