1/* -----------------------------------------------------------------------
2   unix64.S - Copyright (c) 2002  Bo Thorsen <bo@suse.de>
3	      Copyright (c) 2008  Red Hat, Inc
4
5   x86-64 Foreign Function Interface
6
7   Permission is hereby granted, free of charge, to any person obtaining
8   a copy of this software and associated documentation files (the
9   ``Software''), to deal in the Software without restriction, including
10   without limitation the rights to use, copy, modify, merge, publish,
11   distribute, sublicense, and/or sell copies of the Software, and to
12   permit persons to whom the Software is furnished to do so, subject to
13   the following conditions:
14
15   The above copyright notice and this permission notice shall be included
16   in all copies or substantial portions of the Software.
17
18   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
19   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25   DEALINGS IN THE SOFTWARE.
26   ----------------------------------------------------------------------- */
27
28#ifdef __x86_64__
29#define LIBFFI_ASM
30#include <fficonfig.h>
31#include <ffi.h>
32
33.text
34
35/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
36	            void *raddr, void (*fnaddr)(void), unsigned ssecount);
37
38   A bit of trickiness here -- ARGS+BYTES is the base of the stack frame
39   for this function.  This has been allocated by ffi_call.  We also
40   deallocate some of the stack that has been alloca'd.  The sixth
   argument (in %r9d) is the number of SSE registers; it is copied to
   %eax before the target function is called.  */
41
42	.align	2
43	.globl	ffi_call_unix64
44	.type	ffi_call_unix64,@function
45
/* Entry state, as consumed by the code below:
     %rdi  args   -- base of the register-argument area
     %rsi  bytes  -- offset from args to the local frame base
     %rdx  flags  -- low byte selects the return-value handler
     %rcx  raddr  -- address that receives the return value
     %r8   fnaddr -- function to call
     %r9d  number of SSE registers, handed to the callee in %eax

   Local frame built at args+bytes and addressed through %rbp:
      0(%rbp) flags      8(%rbp) raddr
     16(%rbp) old %rbp  24(%rbp) relocated return address  */
46ffi_call_unix64:
47.LUW0:
48	movq	(%rsp), %r10		/* Load return address.  */
49	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
50	movq	%rdx, (%rax)		/* Save flags.  */
51	movq	%rcx, 8(%rax)		/* Save raddr.  */
52	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
53	movq	%r10, 24(%rax)		/* Relocate return address.  */
54	movq	%rax, %rbp		/* Finalize local stack frame.  */
55.LUW1:
	/* From .LUW1 on, the unwind info describes the frame via %rbp.
	   %rdi and %r8 are about to be overwritten with real arguments,
	   so their incoming values move to scratch registers first.  */
56	movq	%rdi, %r10		/* Save a copy of the register area. */
57	movq	%r8, %r11		/* Save a copy of the target fn.  */
58	movl	%r9d, %eax		/* Set number of SSE registers.  */
59
60	/* Load up all argument registers.  */
61	movq	(%r10), %rdi
62	movq	8(%r10), %rsi
63	movq	16(%r10), %rdx
64	movq	24(%r10), %rcx
65	movq	32(%r10), %r8
66	movq	40(%r10), %r9
	/* Only take the detour through .Lload_sse when the SSE register
	   count is non-zero.  */
67	testl	%eax, %eax
68	jnz	.Lload_sse
69.Lret_from_load_sse:
70
71	/* Deallocate the reg arg area.  */
	/* 176 = 6*8 bytes of integer registers + 8*16 bytes of SSE
	   registers, so %rsp ends up just past the register area.  */
72	leaq	176(%r10), %rsp
73
74	/* Call the user function.  */
75	call	*%r11
76
77	/* Deallocate stack arg area; local stack frame in redzone.  */
	/* 24(%rbp) holds the relocated return address, so the `ret' in
	   each store handler returns straight to our caller.  */
78	leaq	24(%rbp), %rsp
79
80	movq	0(%rbp), %rcx		/* Reload flags.  */
81	movq	8(%rbp), %rdi		/* Reload raddr.  */
82	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
83.LUW2:
84
85	/* The first byte of the flags contains the FFI_TYPE.  */
	/* .Lstore_table holds 32-bit offsets relative to its own address;
	   the table base is added back to form the handler address.
	   %rax, %rdx and %xmm0/%xmm1 are left untouched here because they
	   still carry the callee's return value.  */
86	movzbl	%cl, %r10d
87	leaq	.Lstore_table(%rip), %r11
88	movslq	(%r11, %r10, 4), %r10
89	addq	%r11, %r10
90	jmp	*%r10
91
92	.section .rodata
/* Dispatch table for storing the return value of the called function.
   It is indexed by the type code in the low byte of FLAGS, and each
   entry is the 32-bit offset of a handler relative to .Lstore_table
   itself, so the table needs no absolute addresses.  The entry order
   must match the numeric FFI_TYPE_* values named in the comments.
   INT shares the SINT32 handler; UINT64, SINT64 and POINTER share the
   plain 64-bit store.  */
93.Lstore_table:
94	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
95	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
96	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
97	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
98	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
99	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
100	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
101	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
102	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
103	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
104	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
105	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
106	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
107	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
108	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
109
110	.text
111	.align 2
/* Scalar return-value store handlers, reached through .Lstore_table.
   On entry %rdi = raddr and the callee's result is still live in
   %rax, %xmm0 or %st(0).  Integer results narrower than 64 bits are
   widened to a full 64-bit word (zero-extended for unsigned types,
   sign-extended for signed types) before being stored.

   Every handler must end in its own `ret', which pops the relocated
   return address and returns to the caller of ffi_call_unix64.  A
   handler that falls through into its neighbour stores the value a
   second time with the neighbour's extension rule.  */
112.Lst_void:
113	ret
114	.align 2
115
116.Lst_uint8:
117	movzbq	%al, %rax		/* Zero-extend 8 -> 64 bits.  */
118	movq	%rax, (%rdi)
119	ret
120	.align 2
121.Lst_sint8:
122	movsbq	%al, %rax		/* Sign-extend 8 -> 64 bits.  */
123	movq	%rax, (%rdi)
124	ret
125	.align 2
126.Lst_uint16:
127	movzwq	%ax, %rax		/* Zero-extend 16 -> 64 bits.  */
128	movq	%rax, (%rdi)
	/* Must not fall into .Lst_sint16: that would overwrite the
	   result with the sign-extended value (0xffff -> all ones).  */
	ret
129	.align 2
130.Lst_sint16:
131	movswq	%ax, %rax		/* Sign-extend 16 -> 64 bits.  */
132	movq	%rax, (%rdi)
133	ret
134	.align 2
135.Lst_uint32:
136	movl	%eax, %eax		/* 32-bit move zero-extends into %rax.  */
137	movq	%rax, (%rdi)
	/* Must not fall into .Lst_sint32: cltq would sign-extend and the
	   second store would corrupt values with bit 31 set.  */
	ret
138	.align 2
139.Lst_sint32:
140	cltq				/* Sign-extend %eax into %rax.  */
141	movq	%rax, (%rdi)
142	ret
143	.align 2
144.Lst_int64:
145	movq	%rax, (%rdi)
146	ret
147
148	.align 2
149.Lst_float:
150	movss	%xmm0, (%rdi)		/* Store exactly 4 bytes.  */
151	ret
152	.align 2
153.Lst_double:
154	movsd	%xmm0, (%rdi)		/* Store exactly 8 bytes.  */
155	ret
156.Lst_ldouble:
157	fstpt	(%rdi)			/* Pop %st(0) as 80-bit extended.  */
158	ret
159
160	.align 2
/* Store handler for small structures returned in registers.
   On entry %rcx = flags and %rdi = raddr; the two 8-byte halves of
   the value sit in some pair of %rax, %rdx, %xmm0, %xmm1.  */
161.Lst_struct:
162	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
163
164	/* We have to locate the values now, and since we don't want to
165	   write too much data into the user's return value, we spill the
166	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
167	   control where the values are located.  Only one of the three
168	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
	/* Resulting (first word, second word) written to the scratch area:
	     no bit set: (%rax,  %rdx)
	     0x100:      (%xmm0, %rax)
	     0x200:      (%rax,  %xmm0)
	     0x400:      (%xmm0, %xmm1)
	   The cmov sequence is order-dependent: for 0x100 the old %rax
	   has to reach %rdx before %rax is replaced.  */
169	movd	%xmm0, %r10
170	movd	%xmm1, %r11
171	testl	$0x100, %ecx
172	cmovnz	%rax, %rdx
173	cmovnz	%r10, %rax
174	testl	$0x200, %ecx
175	cmovnz	%r10, %rdx
176	testl	$0x400, %ecx
177	cmovnz	%r10, %rax
178	cmovnz	%r11, %rdx
179	movq	%rax, (%rsi)
180	movq	%rdx, 8(%rsi)
181
182	/* Bits 12-31 contain the true size of the structure.  Copy from
183	   the scratch area to the true destination.  */
	/* The 32-bit shift zero-extends into %rcx, which rep movsb uses
	   as the byte count; %rsi = scratch area, %rdi = raddr.  */
184	shrl	$12, %ecx
185	rep movsb
186	ret
187
188	/* Many times we can avoid loading any SSE registers at all.
189	   It's not worth an indirect jump to load the exact set of
190	   SSE registers needed; zero or all is a good compromise.  */
	/* Out-of-line tail of ffi_call_unix64: fills %xmm0-%xmm7 from the
	   eight 16-byte slots at offsets 48..160 of the register area
	   (%r10), then rejoins the main path.  movdqa faults on addresses
	   that are not 16-byte aligned, so the register area must be
	   16-byte aligned.  .LUW3 marks the point where the unwind info
	   switches back to the %rbp-based frame description.  */
191	.align 2
192.LUW3:
193.Lload_sse:
194	movdqa	48(%r10), %xmm0
195	movdqa	64(%r10), %xmm1
196	movdqa	80(%r10), %xmm2
197	movdqa	96(%r10), %xmm3
198	movdqa	112(%r10), %xmm4
199	movdqa	128(%r10), %xmm5
200	movdqa	144(%r10), %xmm6
201	movdqa	160(%r10), %xmm7
202	jmp	.Lret_from_load_sse
203
204.LUW4:
205	.size    ffi_call_unix64,.-ffi_call_unix64
206
207	.align	2
208	.globl ffi_closure_unix64
209	.type	ffi_closure_unix64,@function
210
/* Closure entry point.  Saves the incoming argument registers into a
   200-byte stack frame, calls ffi_closure_unix64_inner, and then loads
   the return value into the proper registers based on the type code
   that function returns in %al.

   Frame layout relative to %rsp after the allocation:
       0 ..  47  %rdi, %rsi, %rdx, %rcx, %r8, %r9
      48 .. 175  %xmm0-%xmm7 (written only when carry is set on entry)
     176 .. 199  return-value buffer handed to the inner function
   With the 8-byte return address the frame totals 208 bytes, which
   keeps %rsp 16-byte aligned for the call and for movdqa.  */
211ffi_closure_unix64:
212.LUW5:
213	/* The carry flag is set by the trampoline iff SSE registers
214	   are used.  Don't clobber it before the branch instruction.  */
	/* leaq and movq leave the flags alone, so carry survives up to
	   the jc below; a subq here would destroy it.  */
215	leaq    -200(%rsp), %rsp
216.LUW6:
217	movq	%rdi, (%rsp)
218	movq    %rsi, 8(%rsp)
219	movq    %rdx, 16(%rsp)
220	movq    %rcx, 24(%rsp)
221	movq    %r8, 32(%rsp)
222	movq    %r9, 40(%rsp)
223	jc      .Lsave_sse
224.Lret_from_save_sse:
225
	/* Arguments for ffi_closure_unix64_inner:
	     %rdi = incoming %r10 (presumably the closure pointer set up
	            by the trampoline -- not visible in this file)
	     %rsi = return-value buffer at 176(%rsp)
	     %rdx = base of the saved register area
	     %rcx = address just above our return address (presumably the
	            caller's stack-passed arguments -- TODO confirm)  */
226	movq	%r10, %rdi
227	leaq	176(%rsp), %rsi
228	movq	%rsp, %rdx
229	leaq	208(%rsp), %rcx
230	call	ffi_closure_unix64_inner@PLT
231
232	/* Deallocate stack frame early; return value is now in redzone.  */
	/* The buffer that was at 176(%rsp) is now at -24(%rsp), which is
	   where every load handler reads from.  */
233	addq	$200, %rsp
234.LUW7:
235
236	/* The first byte of the return value contains the FFI_TYPE.  */
	/* .Lload_table holds 32-bit offsets relative to its own address.
	   %eax is kept intact because .Lld_struct tests its upper bits.  */
237	movzbl	%al, %r10d
238	leaq	.Lload_table(%rip), %r11
239	movslq	(%r11, %r10, 4), %r10
240	addq	%r11, %r10
241	jmp	*%r10
242
243	.section .rodata
/* Dispatch table for loading a closure's return value into registers.
   It is indexed by the type code returned in %al by
   ffi_closure_unix64_inner, and each entry is the 32-bit offset of a
   handler relative to .Lload_table itself.  The entry order must match
   the numeric FFI_TYPE_* values named in the comments.  Signed and
   unsigned integers of the same width share one handler.  */
244.Lload_table:
245	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
246	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
247	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
248	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
249	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
250	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
251	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
252	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
253	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
254	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
255	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
256	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
257	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
258	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
259	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
260
261	.text
262	.align 2
/* Scalar return-value load handlers, reached through .Lload_table.
   The closure frame has already been released, so the return-value
   buffer sits in the red zone at -24(%rsp).  Each handler moves the
   value into the return register for its type and returns to the
   closure's caller.  The 8- and 16-bit loads zero-extend into %eax
   for both the signed and the unsigned table entries.  */
263.Lld_void:
264	ret
265
266	.align 2
267.Lld_int8:
268	movzbl	-24(%rsp), %eax
269	ret
270	.align 2
271.Lld_int16:
272	movzwl	-24(%rsp), %eax
273	ret
274	.align 2
275.Lld_int32:
276	movl	-24(%rsp), %eax
277	ret
278	.align 2
279.Lld_int64:
280	movq	-24(%rsp), %rax
281	ret
282
283	.align 2
284.Lld_float:
285	movss	-24(%rsp), %xmm0
286	ret
287	.align 2
288.Lld_double:
289	movsd	-24(%rsp), %xmm0
290	ret
291	.align 2
292.Lld_ldouble:
293	fldt	-24(%rsp)		/* Push the 80-bit value onto the x87 stack.  */
294	ret
295
296	.align 2
/* Load handler for small structures returned in registers.  The first
   8-byte word of the value is at -24(%rsp), the second at -16(%rsp);
   %eax still holds the flags returned by ffi_closure_unix64_inner.  */
297.Lld_struct:
298	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
299	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
300	   both rdx and xmm1 with the second word.  For the remaining,
301	   bit 8 set means xmm0 gets the second word, and bit 9 means
302	   that rax gets the second word.  */
303	movq	-24(%rsp), %rcx		/* %rcx = first word (candidate for %xmm0).  */
304	movq	-16(%rsp), %rdx		/* %rdx = second word.  */
305	movq	-16(%rsp), %xmm1	/* %xmm1 = second word.  */
306	testl	$0x100, %eax
307	cmovnz	%rdx, %rcx		/* Bit 8: %xmm0 takes the second word.  */
308	movd	%rcx, %xmm0
	/* The flags are tested before %rax is overwritten; the movq in
	   between does not modify the flags, so cmovnz still sees the
	   result of the testl.  */
309	testl	$0x200, %eax
310	movq	-24(%rsp), %rax		/* Default: %rax = first word.  */
311	cmovnz	%rdx, %rax		/* Bit 9: %rax takes the second word.  */
312	ret
313
314	/* See the comment above .Lload_sse; the same logic applies here.  */
	/* Out-of-line tail of ffi_closure_unix64, taken when carry is set
	   on entry: spills %xmm0-%xmm7 into the eight 16-byte slots at
	   offsets 48..160 of the closure frame, then rejoins the main
	   path.  movdqa needs 16-byte aligned addresses, so %rsp must be
	   16-byte aligned here.  .LUW8 marks the point where the unwind
	   info switches back to the 208-byte frame description.  */
315	.align 2
316.LUW8:
317.Lsave_sse:
318	movdqa	%xmm0, 48(%rsp)
319	movdqa	%xmm1, 64(%rsp)
320	movdqa	%xmm2, 80(%rsp)
321	movdqa	%xmm3, 96(%rsp)
322	movdqa	%xmm4, 112(%rsp)
323	movdqa	%xmm5, 128(%rsp)
324	movdqa	%xmm6, 144(%rsp)
325	movdqa	%xmm7, 160(%rsp)
326	jmp	.Lret_from_save_sse
327
328.LUW9:
329	.size	ffi_closure_unix64,.-ffi_closure_unix64
330
331	.section	.eh_frame,"a",@progbits
/* Hand-encoded unwind information.  This is the Common Information
   Entry (CIE) shared by the two FDEs that follow.  It states the
   situation at function entry: the CFA is %rsp+8 and the return
   address is stored at CFA-8.  The numeric operands are DWARF register
   numbers: 7 = %rsp, 16 = the return-address column.  */
332.Lframe1:
333	.long	.LECIE1-.LSCIE1		/* CIE Length */
334.LSCIE1:
335	.long	0			/* CIE Identifier Tag */
336	.byte	1			/* CIE Version */
337	.ascii "zR\0"			/* CIE Augmentation */
338	.uleb128 1			/* CIE Code Alignment Factor */
339	.sleb128 -8			/* CIE Data Alignment Factor */
340	.byte	0x10			/* CIE RA Column */
341	.uleb128 1			/* Augmentation size */
342	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
343	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
344	.uleb128 7
345	.uleb128 8
346	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
347	.uleb128 1
348	.align 8
349.LECIE1:
/* Frame Description Entry for ffi_call_unix64, covering .LUW0-.LUW4.
   State changes, keyed to the .LUW labels in the code:
     .LUW0-.LUW1  CIE defaults (CFA = %rsp+8)
     .LUW1-.LUW2  CFA = %rbp+32, caller's %rbp saved at CFA-16
     .LUW2-.LUW3  CFA = %rsp+8 again, %rbp restored (store handlers)
     .LUW3-.LUW4  back to the %rbp-based state (.Lload_sse)
   Moving any .LUW label in the code changes what these rows cover.  */
350.LSFDE1:
351	.long	.LEFDE1-.LASFDE1	/* FDE Length */
352.LASFDE1:
353	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
354	.long	.LUW0-.			/* FDE initial location */
355	.long	.LUW4-.LUW0		/* FDE address range */
356	.uleb128 0x0			/* Augmentation size */
357
358	.byte	0x4			/* DW_CFA_advance_loc4 */
359	.long	.LUW1-.LUW0
360
361	/* New stack frame based off rbp.  This is an itty bit of unwind
362	   trickery in that the CFA *has* changed.  There is no easy way
363	   to describe it correctly on entry to the function.  Fortunately,
364	   it doesn't matter too much since at all points we can correctly
365	   unwind back to ffi_call.  Note that the location to which we
366	   moved the return address is (the new) CFA-8, so from the
367	   perspective of the unwind info, it hasn't moved.  */
368	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
369	.uleb128 6
370	.uleb128 32
371	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
372	.uleb128 2
373	.byte	0xa			/* DW_CFA_remember_state */
374
375	.byte	0x4			/* DW_CFA_advance_loc4 */
376	.long	.LUW2-.LUW1
377	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
378	.uleb128 7
379	.uleb128 8
380	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
381
382	.byte	0x4			/* DW_CFA_advance_loc4 */
383	.long	.LUW3-.LUW2
384	.byte	0xb			/* DW_CFA_restore_state */
385
386	.align 8
387.LEFDE1:
/* Frame Description Entry for ffi_closure_unix64, covering
   .LUW5-.LUW9.  State changes, keyed to the .LUW labels in the code:
     .LUW5-.LUW6  CIE defaults (CFA = %rsp+8)
     .LUW6-.LUW7  CFA = %rsp+208 (200-byte frame + return address)
     .LUW7-.LUW8  CFA = %rsp+8 again (frame released, load handlers)
     .LUW8-.LUW9  back to CFA = %rsp+208 (.Lsave_sse)  */
388.LSFDE3:
389	.long	.LEFDE3-.LASFDE3	/* FDE Length */
390.LASFDE3:
391	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
392	.long	.LUW5-.			/* FDE initial location */
393	.long	.LUW9-.LUW5		/* FDE address range */
394	.uleb128 0x0			/* Augmentation size */
395
396	.byte	0x4			/* DW_CFA_advance_loc4 */
397	.long	.LUW6-.LUW5
398	.byte	0xe			/* DW_CFA_def_cfa_offset */
399	.uleb128 208
400	.byte	0xa			/* DW_CFA_remember_state */
401
402	.byte	0x4			/* DW_CFA_advance_loc4 */
403	.long	.LUW7-.LUW6
404	.byte	0xe			/* DW_CFA_def_cfa_offset */
405	.uleb128 8
406
407	.byte	0x4			/* DW_CFA_advance_loc4 */
408	.long	.LUW8-.LUW7
409	.byte	0xb			/* DW_CFA_restore_state */
410
411	.align 8
412.LEFDE3:
413
414#endif /* __x86_64__ */
415
416#if defined __ELF__ && defined __linux__
	/* Emit an empty .note.GNU-stack section with no flags (in
	   particular no "x").  GNU toolchains take this as a statement
	   that the object does not need an executable stack.  It sits
	   outside the __x86_64__ guard, so it is emitted even when the
	   rest of the file is compiled out.  */
417	.section	.note.GNU-stack,"",@progbits
418#endif
419