1/*
2 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 *  Copyright 2003 Andi Kleen, SuSE Labs.
4 *
5 *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
6 *
 *  Thanks to hpa@transmeta.com for some useful hints.
8 *  Special thanks to Ingo Molnar for his early experience with
9 *  a different vsyscall implementation for Linux/IA32 and for the name.
10 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at most 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
16 *
17 *  Note: the concept clashes with user mode linux.  UML users should
18 *  use the vDSO.
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/time.h>
24#include <linux/init.h>
25#include <linux/kernel.h>
26#include <linux/timer.h>
27#include <linux/seqlock.h>
28#include <linux/jiffies.h>
29#include <linux/sysctl.h>
30#include <linux/topology.h>
31#include <linux/timekeeper_internal.h>
32#include <linux/getcpu.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/notifier.h>
36#include <linux/syscalls.h>
37#include <linux/ratelimit.h>
38
39#include <asm/vsyscall.h>
40#include <asm/pgtable.h>
41#include <asm/compat.h>
42#include <asm/page.h>
43#include <asm/unistd.h>
44#include <asm/fixmap.h>
45#include <asm/errno.h>
46#include <asm/io.h>
47#include <asm/segment.h>
48#include <asm/desc.h>
49#include <asm/topology.h>
50#include <asm/traps.h>
51
52#define CREATE_TRACE_POINTS
53#include "vsyscall_trace.h"
54
55DEFINE_VVAR(int, vgetcpu_mode);
56
57static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
58
59static int __init vsyscall_setup(char *str)
60{
61	if (str) {
62		if (!strcmp("emulate", str))
63			vsyscall_mode = EMULATE;
64		else if (!strcmp("native", str))
65			vsyscall_mode = NATIVE;
66		else if (!strcmp("none", str))
67			vsyscall_mode = NONE;
68		else
69			return -EINVAL;
70
71		return 0;
72	}
73
74	return -EINVAL;
75}
76early_param("vsyscall", vsyscall_setup);
77
78static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
79			      const char *message)
80{
81	if (!show_unhandled_signals)
82		return;
83
84	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
85			   level, current->comm, task_pid_nr(current),
86			   message, regs->ip, regs->cs,
87			   regs->sp, regs->ax, regs->si, regs->di);
88}
89
90static int addr_to_vsyscall_nr(unsigned long addr)
91{
92	int nr;
93
94	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
95		return -EINVAL;
96
97	nr = (addr & 0xC00UL) >> 10;
98	if (nr >= 3)
99		return -EINVAL;
100
101	return nr;
102}
103
104static bool write_ok_or_segv(unsigned long ptr, size_t size)
105{
106	/*
107	 * XXX: if access_ok, get_user, and put_user handled
108	 * sig_on_uaccess_error, this could go away.
109	 */
110
111	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
112		siginfo_t info;
113		struct thread_struct *thread = &current->thread;
114
115		thread->error_code	= 6;  /* user fault, no page, write */
116		thread->cr2		= ptr;
117		thread->trap_nr		= X86_TRAP_PF;
118
119		memset(&info, 0, sizeof(info));
120		info.si_signo		= SIGSEGV;
121		info.si_errno		= 0;
122		info.si_code		= SEGV_MAPERR;
123		info.si_addr		= (void __user *)ptr;
124
125		force_sig_info(SIGSEGV, &info, current);
126		return false;
127	} else {
128		return true;
129	}
130}
131
132bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
133{
134	struct task_struct *tsk;
135	unsigned long caller;
136	int vsyscall_nr, syscall_nr, tmp;
137	int prev_sig_on_uaccess_error;
138	long ret;
139
140	/*
141	 * No point in checking CS -- the only way to get here is a user mode
142	 * trap to a high address, which means that we're in 64-bit user code.
143	 */
144
145	WARN_ON_ONCE(address != regs->ip);
146
147	if (vsyscall_mode == NONE) {
148		warn_bad_vsyscall(KERN_INFO, regs,
149				  "vsyscall attempted with vsyscall=none");
150		return false;
151	}
152
153	vsyscall_nr = addr_to_vsyscall_nr(address);
154
155	trace_emulate_vsyscall(vsyscall_nr);
156
157	if (vsyscall_nr < 0) {
158		warn_bad_vsyscall(KERN_WARNING, regs,
159				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
160		goto sigsegv;
161	}
162
163	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
164		warn_bad_vsyscall(KERN_WARNING, regs,
165				  "vsyscall with bad stack (exploit attempt?)");
166		goto sigsegv;
167	}
168
169	tsk = current;
170
171	/*
172	 * Check for access_ok violations and find the syscall nr.
173	 *
174	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
175	 * 64-bit, so we don't need to special-case it here.  For all the
176	 * vsyscalls, NULL means "don't write anything" not "write it at
177	 * address 0".
178	 */
179	switch (vsyscall_nr) {
180	case 0:
181		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
182		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
183			ret = -EFAULT;
184			goto check_fault;
185		}
186
187		syscall_nr = __NR_gettimeofday;
188		break;
189
190	case 1:
191		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
192			ret = -EFAULT;
193			goto check_fault;
194		}
195
196		syscall_nr = __NR_time;
197		break;
198
199	case 2:
200		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
201		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
202			ret = -EFAULT;
203			goto check_fault;
204		}
205
206		syscall_nr = __NR_getcpu;
207		break;
208	}
209
210	/*
211	 * Handle seccomp.  regs->ip must be the original value.
212	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
213	 *
214	 * We could optimize the seccomp disabled case, but performance
215	 * here doesn't matter.
216	 */
217	regs->orig_ax = syscall_nr;
218	regs->ax = -ENOSYS;
219	tmp = secure_computing();
220	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
221		warn_bad_vsyscall(KERN_DEBUG, regs,
222				  "seccomp tried to change syscall nr or ip");
223		do_exit(SIGSYS);
224	}
225	if (tmp)
226		goto do_ret;  /* skip requested */
227
228	/*
229	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
230	 * preserve that behavior to make writing exploits harder.
231	 */
232	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
233	current_thread_info()->sig_on_uaccess_error = 1;
234
235	ret = -EFAULT;
236	switch (vsyscall_nr) {
237	case 0:
238		ret = sys_gettimeofday(
239			(struct timeval __user *)regs->di,
240			(struct timezone __user *)regs->si);
241		break;
242
243	case 1:
244		ret = sys_time((time_t __user *)regs->di);
245		break;
246
247	case 2:
248		ret = sys_getcpu((unsigned __user *)regs->di,
249				 (unsigned __user *)regs->si,
250				 NULL);
251		break;
252	}
253
254	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
255
256check_fault:
257	if (ret == -EFAULT) {
258		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
259		warn_bad_vsyscall(KERN_INFO, regs,
260				  "vsyscall fault (exploit attempt?)");
261
262		/*
263		 * If we failed to generate a signal for any reason,
264		 * generate one here.  (This should be impossible.)
265		 */
266		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
267				 !sigismember(&tsk->pending.signal, SIGSEGV)))
268			goto sigsegv;
269
270		return true;  /* Don't emulate the ret. */
271	}
272
273	regs->ax = ret;
274
275do_ret:
276	/* Emulate a ret instruction. */
277	regs->ip = caller;
278	regs->sp += 8;
279	return true;
280
281sigsegv:
282	force_sig(SIGSEGV, current);
283	return true;
284}
285
286/*
287 * Assume __initcall executes before all user space. Hopefully kmod
288 * doesn't violate that. We'll find out if it does.
289 */
290static void vsyscall_set_cpu(int cpu)
291{
292	unsigned long d;
293	unsigned long node = 0;
294#ifdef CONFIG_NUMA
295	node = cpu_to_node(cpu);
296#endif
297	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
298		write_rdtscp_aux((node << 12) | cpu);
299
300	/*
301	 * Store cpu number in limit so that it can be loaded quickly
302	 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
303	 */
304	d = 0x0f40000000000ULL;
305	d |= cpu;
306	d |= (node & 0xf) << 12;
307	d |= (node >> 4) << 48;
308
309	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
310}
311
/*
 * Cross-call callback: set up the vsyscall CPU/node data for the CPU
 * this function is running on.  @arg is unused.
 */
static void cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	int this_cpu = raw_smp_processor_id();

	vsyscall_set_cpu(this_cpu);
}
317
318static int
319cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
320{
321	long cpu = (long)arg;
322
323	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
324		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
325
326	return NOTIFY_DONE;
327}
328
329void __init map_vsyscall(void)
330{
331	extern char __vsyscall_page;
332	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
333
334	__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
335		     vsyscall_mode == NATIVE
336		     ? PAGE_KERNEL_VSYSCALL
337		     : PAGE_KERNEL_VVAR);
338	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
339		     (unsigned long)VSYSCALL_ADDR);
340}
341
342static int __init vsyscall_init(void)
343{
344	cpu_notifier_register_begin();
345
346	on_each_cpu(cpu_vsyscall_init, NULL, 1);
347	/* notifier priority > KVM */
348	__hotcpu_notifier(cpu_vsyscall_notifier, 30);
349
350	cpu_notifier_register_done();
351
352	return 0;
353}
354__initcall(vsyscall_init);
355