/* -----------------------------------------------------------------------
   unix64.S - Copyright (c) 2013  The Written Word, Inc.
	    - Copyright (c) 2008  Red Hat, Inc
	    - Copyright (c) 2002  Bo Thorsen <bo@suse.de>

   x86-64 Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */

#ifdef __x86_64__
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>

.text
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
	            void *raddr, void (*fnaddr)(void), unsigned ssecount);

   A bit of trickiness here -- ARGS+BYTES is the base of the stack frame
   for this function, which has been allocated by ffi_call.  We also
   deallocate some of the stack that has been alloca'd.  */
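
/* A rough C sketch of the layout the code below assumes at ARGS (%rdi).
   The names are illustrative only; the authoritative layout is the one
   built by ffi_call in ffi64.c:

	struct register_args {
	    uint64_t gpr[6];	// 0..47:   %rdi, %rsi, %rdx, %rcx, %r8, %r9
	    __int128 sse[8];	// 48..175: %xmm0..%xmm7, loaded with movdqa
	};
	// ARGS+176 .. ARGS+BYTES-1: arguments passed on the stack
	// ARGS+BYTES+0:  flags   (stored below, reloaded after the call)
	// ARGS+BYTES+8:  raddr
	// ARGS+BYTES+16: caller's %rbp
	// ARGS+BYTES+24: relocated return address
*/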

	.align	2
	.globl	ffi_call_unix64
	.type	ffi_call_unix64,@function

ffi_call_unix64:
.LUW0:
	movq	(%rsp), %r10		/* Load return address.  */
	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
	movq	%rdx, (%rax)		/* Save flags.  */
	movq	%rcx, 8(%rax)		/* Save raddr.  */
	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
	movq	%r10, 24(%rax)		/* Relocate return address.  */
	movq	%rax, %rbp		/* Finalize local stack frame.  */
.LUW1:
	movq	%rdi, %r10		/* Save a copy of the register area. */
	movq	%r8, %r11		/* Save a copy of the target fn.  */
	movl	%r9d, %eax		/* Set number of SSE registers.  */

	/* Load up all argument registers.  */
	movq	(%r10), %rdi
	movq	8(%r10), %rsi
	movq	16(%r10), %rdx
	movq	24(%r10), %rcx
	movq	32(%r10), %r8
	movq	40(%r10), %r9
	testl	%eax, %eax
	jnz	.Lload_sse
.Lret_from_load_sse:

	/* Deallocate the reg arg area.  */
	leaq	176(%r10), %rsp

	/* Call the user function.  */
	call	*%r11

	/* Deallocate stack arg area; local stack frame in redzone.  */
	leaq	24(%rbp), %rsp

	movq	0(%rbp), %rcx		/* Reload flags.  */
	movq	8(%rbp), %rdi		/* Reload raddr.  */
	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
.LUW2:

	/* The first byte of the flags contains the FFI_TYPE.  */
	movzbl	%cl, %r10d
	leaq	.Lstore_table(%rip), %r11
	movslq	(%r11, %r10, 4), %r10
	addq	%r11, %r10
	jmp	*%r10
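
/* The dispatch above is a position-independent jump table: each entry in
   .Lstore_table is the 32-bit offset of its handler relative to the table
   itself.  In C terms (a sketch, not part of libffi's sources):

	// handler = table address + sign-extended 32-bit table entry
	void *handler = (char *)store_table + (int32_t)store_table[type];
	goto *handler;			// GNU C computed goto
*/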

.Lstore_table:
	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */

	.align 2
.Lst_void:
	ret
	.align 2

.Lst_uint8:
	movzbq	%al, %rax
	movq	%rax, (%rdi)
	ret
	.align 2
.Lst_sint8:
	movsbq	%al, %rax
	movq	%rax, (%rdi)
	ret
	.align 2
.Lst_uint16:
	movzwq	%ax, %rax
	movq	%rax, (%rdi)
	ret
	.align 2
.Lst_sint16:
	movswq	%ax, %rax
	movq	%rax, (%rdi)
	ret
	.align 2
.Lst_uint32:
	movl	%eax, %eax
	movq	%rax, (%rdi)
	ret
	.align 2
.Lst_sint32:
	cltq
	movq	%rax, (%rdi)
	ret
	.align 2
.Lst_int64:
	movq	%rax, (%rdi)
	ret

	.align 2
.Lst_float:
	movss	%xmm0, (%rdi)
	ret
	.align 2
.Lst_double:
	movsd	%xmm0, (%rdi)
	ret
.Lst_ldouble:
	fstpt	(%rdi)
	ret

	.align 2
.Lst_struct:
	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */

	/* We have to locate the values now, and since we don't want to
	   write too much data into the user's return value, we spill the
	   value to a 16-byte scratch area first.  Bits 8, 9, and 10
	   control where the values are located.  Only one of the three
	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
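	/* A C sketch of the selection below (illustrative only; w0/w1 are
	   the two eightbytes spilled to the scratch area):

		w0 = rax;  w1 = rdx;				// default: INT, INT
		if (flags & 0x100) { w0 = xmm0; w1 = rax;  }	// SSE, INT
		if (flags & 0x200) {            w1 = xmm0; }	// INT, SSE
		if (flags & 0x400) { w0 = xmm0; w1 = xmm1; }	// SSE, SSE
	   */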
	movd	%xmm0, %r10
	movd	%xmm1, %r11
	testl	$0x100, %ecx
	cmovnz	%rax, %rdx
	cmovnz	%r10, %rax
	testl	$0x200, %ecx
	cmovnz	%r10, %rdx
	testl	$0x400, %ecx
	cmovnz	%r10, %rax
	cmovnz	%r11, %rdx
	movq	%rax, (%rsi)
	movq	%rdx, 8(%rsi)

	/* Bits 12-31 contain the true size of the structure.  Copy from
	   the scratch area to the true destination.  */
	shrl	$12, %ecx
	rep movsb
	ret

	/* Many times we can avoid loading any SSE registers at all.
	   It's not worth an indirect jump to load the exact set of
	   SSE registers needed; zero or all is a good compromise.  */
	.align 2
.LUW3:
.Lload_sse:
	movdqa	48(%r10), %xmm0
	movdqa	64(%r10), %xmm1
	movdqa	80(%r10), %xmm2
	movdqa	96(%r10), %xmm3
	movdqa	112(%r10), %xmm4
	movdqa	128(%r10), %xmm5
	movdqa	144(%r10), %xmm6
	movdqa	160(%r10), %xmm7
	jmp	.Lret_from_load_sse

.LUW4:
	.size    ffi_call_unix64,.-ffi_call_unix64

	.align	2
	.globl ffi_closure_unix64
	.type	ffi_closure_unix64,@function

ffi_closure_unix64:
.LUW5:
	/* The carry flag is set by the trampoline iff SSE registers
	   are used.  Don't clobber it before the branch instruction.  */
	leaq    -200(%rsp), %rsp
.LUW6:
	movq	%rdi, (%rsp)
	movq    %rsi, 8(%rsp)
	movq    %rdx, 16(%rsp)
	movq    %rcx, 24(%rsp)
	movq    %r8, 32(%rsp)
	movq    %r9, 40(%rsp)
	jc      .Lsave_sse
.Lret_from_save_sse:

	movq	%r10, %rdi
	leaq	176(%rsp), %rsi
	movq	%rsp, %rdx
	leaq	208(%rsp), %rcx
	call	ffi_closure_unix64_inner@PLT
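
	/* Roughly what was just called (a paraphrase of the declaration in
	   ffi64.c; the parameter names here are illustrative):

		int ffi_closure_unix64_inner(
		    void *closure,	// %rdi <- %r10, set up by the trampoline
		    void *rvalue,	// %rsi: 24-byte scratch at 176(%rsp)
		    void *reg_args,	// %rdx: GPR/SSE save area at 0(%rsp)
		    char *argp);	// %rcx: stack arguments at 208(%rsp)

	   It writes the return value to *rvalue and returns the flags in
	   %eax; the low byte is the FFI_TYPE, used for the dispatch below.  */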

	/* Deallocate stack frame early; return value is now in redzone.  */
	addq	$200, %rsp
.LUW7:

	/* The first byte of the return value contains the FFI_TYPE.  */
	movzbl	%al, %r10d
	leaq	.Lload_table(%rip), %r11
	movslq	(%r11, %r10, 4), %r10
	addq	%r11, %r10
	jmp	*%r10

.Lload_table:
	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */

	.align 2
.Lld_void:
	ret

	.align 2
.Lld_int8:
	movzbl	-24(%rsp), %eax
	ret
	.align 2
.Lld_int16:
	movzwl	-24(%rsp), %eax
	ret
	.align 2
.Lld_int32:
	movl	-24(%rsp), %eax
	ret
	.align 2
.Lld_int64:
	movq	-24(%rsp), %rax
	ret

	.align 2
.Lld_float:
	movss	-24(%rsp), %xmm0
	ret
	.align 2
.Lld_double:
	movsd	-24(%rsp), %xmm0
	ret
	.align 2
.Lld_ldouble:
	fldt	-24(%rsp)
	ret

	.align 2
.Lld_struct:
	/* There are four possibilities here: %rax/%rdx, %xmm0/%rax,
	   %rax/%xmm0, and %xmm0/%xmm1.  We collapse two of them by always
	   loading both %rdx and %xmm1 with the second word.  For the
	   remaining two, bit 8 set means %xmm0 gets the second word, and
	   bit 9 set means %rax gets the second word.  */
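	/* A C sketch of that selection (illustrative only; w0/w1 are the
	   two eightbytes of the return value in the redzone scratch):

		rdx  = w1;  xmm1 = w1;		// second word, both flavours
		xmm0 = (flags & 0x100) ? w1 : w0;
		rax  = (flags & 0x200) ? w1 : w0;
	   */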
	movq	-24(%rsp), %rcx
	movq	-16(%rsp), %rdx
	movq	-16(%rsp), %xmm1
	testl	$0x100, %eax
	cmovnz	%rdx, %rcx
	movd	%rcx, %xmm0
	testl	$0x200, %eax
	movq	-24(%rsp), %rax
	cmovnz	%rdx, %rax
	ret

	/* See the comment above .Lload_sse; the same logic applies here.  */
	.align 2
.LUW8:
.Lsave_sse:
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm1, 64(%rsp)
	movdqa	%xmm2, 80(%rsp)
	movdqa	%xmm3, 96(%rsp)
	movdqa	%xmm4, 112(%rsp)
	movdqa	%xmm5, 128(%rsp)
	movdqa	%xmm6, 144(%rsp)
	movdqa	%xmm7, 160(%rsp)
	jmp	.Lret_from_save_sse

.LUW9:
	.size	ffi_closure_unix64,.-ffi_closure_unix64

#ifdef __GNUC__
/* Only emit DWARF unwind info when building with the GNU toolchain.  */

#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
	.section	.eh_frame,"a",@unwind
#else
	.section	.eh_frame,"a",@progbits
#endif
.Lframe1:
	.long	.LECIE1-.LSCIE1		/* CIE Length */
.LSCIE1:
	.long	0			/* CIE Identifier Tag */
	.byte	1			/* CIE Version */
	.ascii "zR\0"			/* CIE Augmentation */
	.uleb128 1			/* CIE Code Alignment Factor */
	.sleb128 -8			/* CIE Data Alignment Factor */
	.byte	0x10			/* CIE RA Column */
	.uleb128 1			/* Augmentation size */
	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
	.uleb128 7
	.uleb128 8
	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
	.uleb128 1
	.align 8
.LECIE1:
.LSFDE1:
	.long	.LEFDE1-.LASFDE1	/* FDE Length */
.LASFDE1:
	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
#if HAVE_AS_X86_PCREL
	.long	.LUW0-.			/* FDE initial location */
#else
	.long	.LUW0@rel
#endif
	.long	.LUW4-.LUW0		/* FDE address range */
	.uleb128 0x0			/* Augmentation size */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW1-.LUW0

	/* New stack frame based off rbp.  This is an itty bit of unwind
	   trickery in that the CFA *has* changed.  There is no easy way
	   to describe it correctly on entry to the function.  Fortunately,
	   it doesn't matter too much since at all points we can correctly
	   unwind back to ffi_call.  Note that the location to which we
	   moved the return address is (the new) CFA-8, so from the
	   perspective of the unwind info, it hasn't moved.  */
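	/* Concretely: the local frame at %rbp holds flags (+0), raddr (+8),
	   the caller's %rbp (+16) and the relocated return address (+24),
	   so a CFA of %rbp+32 keeps the return address at CFA-8 (as the
	   CIE states) and the saved %rbp at CFA-16, i.e. DW_CFA_offset
	   with a factored offset of 2.  */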
	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
	.uleb128 6
	.uleb128 32
	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
	.uleb128 2
	.byte	0xa			/* DW_CFA_remember_state */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW2-.LUW1
	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
	.uleb128 7
	.uleb128 8
	.byte	0xc0+6			/* DW_CFA_restore, %rbp */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW3-.LUW2
	.byte	0xb			/* DW_CFA_restore_state */

	.align 8
.LEFDE1:
.LSFDE3:
	.long	.LEFDE3-.LASFDE3	/* FDE Length */
.LASFDE3:
	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
#if HAVE_AS_X86_PCREL
	.long	.LUW5-.			/* FDE initial location */
#else
	.long	.LUW5@rel
#endif
	.long	.LUW9-.LUW5		/* FDE address range */
	.uleb128 0x0			/* Augmentation size */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW6-.LUW5
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.uleb128 208
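	/* 208 = the 200 bytes allocated by the closure prologue plus the
	   8-byte return address already on the stack at entry.  */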
	.byte	0xa			/* DW_CFA_remember_state */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW7-.LUW6
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.uleb128 8

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW8-.LUW7
	.byte	0xb			/* DW_CFA_restore_state */

	.align 8
.LEFDE3:

#endif /* __GNUC__ */

#endif /* __x86_64__ */

#if defined __ELF__ && defined __linux__
	.section	.note.GNU-stack,"",@progbits
#endif