process_64.c revision b10db7f0d2b589a7f88dc3026e150756cb437a28
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

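/*
 * Default clone flags for kernel threads (descriptive note, inferred
 * from the flag names): such threads share the kernel's VM and are
 * never implicitly attached to by a ptracer (CLONE_UNTRACED).
 */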
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
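
/*
 * Illustrative sketch, not part of this file: a driver that needs to
 * know when a CPU enters or leaves idle could hook the chain roughly
 * like this (my_idle_cb and my_idle_nb are made-up names):
 *
 *	static int my_idle_cb(struct notifier_block *nb,
 *			      unsigned long action, void *data)
 *	{
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_cb,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *
 * The callback's "action" is IDLE_START or IDLE_END, as posted by
 * enter_idle()/__exit_idle() below.
 */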

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		/* Enables interrupts one instruction before HLT.
		   x86 special cases this so there is no race. */
		safe_halt();
	} else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

static void do_nothing(void *unused)
{
}

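/*
 * cpu_idle_wait - wait until no CPU is still running a stale idle handler.
 * The caller changes pm_idle first; this then marks every online CPU's
 * cpu_idle_state and sleeps until each CPU has passed through the top of
 * cpu_idle() (which clears the flag) at least once.
 */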
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
		/*
		 * We waited 1 sec, if a CPU still did not call idle
		 * it may be because it is in idle and not waking up
		 * because it has nothing to do.
		 * Give all the remaining CPUs a kick.
		 */
		smp_call_function_mask(map, do_nothing, 0, 0);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			tick_nohz_stop_sched_tick();

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}
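
/*
 * Background note on the hint encoding (from the Intel SDM, not defined
 * by this file): in the MWAIT hint register eax, bits 7:4 select the
 * target C-state, and ecx bit 0 requests that masked interrupts still
 * act as break events. Callers such as the ACPI C-state code pass these
 * through mwait_idle_with_hints() above.
 */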

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait
		 */
		if (!pm_idle) {
			if (!printed) {
				printk(KERN_INFO "using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);
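
/*
 * So booting with "idle=poll" on the kernel command line selects
 * poll_idle above, while "idle=mwait" forces MWAIT-based idle even
 * where it would not be picked by default.
 */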

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

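/*
 * Helpers for bases that fit in 32 bits: such bases are kept in a GDT
 * TLS slot rather than the FS/GS base MSRs, because reloading a segment
 * selector is cheaper than an MSR write (see do_arch_prctl below).
 */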
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
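/* e.g. loaddebug(next, 7) token-pastes to set_debugreg(next->debugreg7, 7) */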

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack, (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}

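/*
 * Find where a blocked task is sleeping: follow the saved frame-pointer
 * chain starting at p->thread.rsp, reading the return RIP stored at
 * fp+8 in each frame, and report the first address outside the
 * scheduler. Gives up after 16 frames or if a frame leaves the task's
 * stack.
 */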
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
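
/*
 * Userspace reaches this through the arch_prctl(2) syscall. A threading
 * library would set and read back its TLS base roughly like this
 * (illustrative only; tls_area is a made-up variable):
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls_area);
 *
 *	unsigned long base;
 *	arch_prctl(ARCH_GET_FS, (unsigned long)&base);
 *
 * Note that for ARCH_GET_FS/ARCH_GET_GS, "addr" is a pointer the base
 * is stored through, not a base value.
 */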

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

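/*
 * Used when setting up a new process image: lower the initial user
 * stack pointer by a random amount below 8k (unless randomization is
 * disabled for this task) and keep it 16-byte aligned as the ABI
 * expects.
 */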
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}