1;
2; jsimdext.inc - common declarations
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2010 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library - version 1.02
9;
10; Copyright (C) 1999-2006, MIYASAKA Masaru.
11;
12; This software is provided 'as-is', without any express or implied
13; warranty.  In no event will the authors be held liable for any damages
14; arising from the use of this software.
15;
16; Permission is granted to anyone to use this software for any purpose,
17; including commercial applications, and to alter it and redistribute it
18; freely, subject to the following restrictions:
19;
20; 1. The origin of this software must not be misrepresented; you must not
21;    claim that you wrote the original software. If you use this software
22;    in a product, an acknowledgment in the product documentation would be
23;    appreciated but is not required.
24; 2. Altered source versions must be plainly marked as such, and must not be
25;    misrepresented as being the original software.
26; 3. This notice may not be removed or altered from any source distribution.
27;
28; [TAB8]
29
30; ==========================================================================
31;  System-dependent configurations
32
; Exactly one of WIN32 / WIN64 / OBJ32 / ELF / AOUT / MACHO is expected on the
; assembler command line (-D<name>).  Each branch selects:
;   SEG_TEXT   - section name + attributes used for code
;   SEG_CONST  - section name + attributes used for read-only constants
;   GOT_SYMBOL - (optional) symbol that enables the PIC helper macros
;   EXTN(name) - (optional) decoration applied to external symbol names
;   PIC        - (Mach-O only) forced on

%ifdef WIN32    ; ---- nasm -fwin32 -DWIN32 ... ------------------------------
; Toolchains: Microsoft Visual C++, MinGW (Minimalist GNU for Windows),
;             CygWin, LCC-Win32

; Segments.  When __YASM_VER__ is defined only the alignment is given;
; otherwise the "public use32 class=..." qualifiers are added as well.
%ifndef __YASM_VER__
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%else
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%endif

%elifdef WIN64  ; ---- nasm -fwin64 -DWIN64 ... ------------------------------
; Toolchains: Microsoft Visual C++

; Segments.  Same split as WIN32, with use64 instead of use32.
%ifndef __YASM_VER__
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%else
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%endif

; External names are used undecorated: foo() -> foo
%define EXTN(name)  name

%elifdef OBJ32  ; ---- nasm -fobj -DOBJ32 ... --------------------------------
; Toolchains: Borland C++ (Win32)

; Segments.
%define SEG_TEXT    _text  align=16 public use32 class=CODE
%define SEG_CONST   _data  align=16 public use32 class=DATA

%elifdef ELF    ; ---- nasm -felf[64] -DELF ... ------------------------------
; Targets: Linux, *BSD family Unix using ELF,
;          Unix System V (including Solaris x86, UnixWare and SCO Unix)

; Emit the note section that marks the stack as non-executable.
section .note.GNU-stack noalloc noexec nowrite progbits

; Segments.  The 32-bit variant spells out alloc/exec/nowrite explicitly;
; the x86_64 variant gives only progbits and the alignment.
%ifndef __x86_64__
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%else
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%endif

; ELF supports PIC; pass -DPIC on the command line to enable it.
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_

; External names are used undecorated: foo() -> foo
%define EXTN(name)  name

%elifdef AOUT   ; ---- nasm -faoutb/aout -DAOUT ... --------------------------
; Targets: older Linux using a.out          (nasm -f aout  -DAOUT ...)
;          *BSD family Unix using a.out     (nasm -f aoutb -DAOUT ...)

; Segments.
%define SEG_TEXT    .text
%define SEG_CONST   .data

; BSD-style a.out supports PIC; pass -DPIC on the command line to enable it.
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_

%elifdef MACHO  ; ---- nasm -fmacho -DMACHO ... ------------------------------
; Targets: NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; Segments.  The text section carries no "align=16" because nasm does not
; accept it there (reason unknown).
%define SEG_TEXT    .text
%define SEG_CONST   .rodata align=16

; Position-independent code is the default on Darwin, so PIC is forced on
; and a marker symbol selects the Mach-O style code-relative addressing.
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_

%else           ; ---- any other object format -------------------------------

; Segments.
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif          ; ------------------------------------------------------------
128
129; ==========================================================================
130
131; --------------------------------------------------------------------------
132;  Common types
133;
; Sizes of the primitive operand types, in bytes.
%define SIZEOF_BYTE             1               ; sizeof(BYTE)
%define SIZEOF_WORD             2               ; sizeof(WORD)
%define SIZEOF_DWORD            4               ; sizeof(DWORD)
%define SIZEOF_QWORD            8               ; sizeof(QWORD)
%define SIZEOF_OWORD            16              ; sizeof(OWORD)

; Widths of the primitive operand types, in bits.
%define BYTE_BIT                8               ; CHAR_BIT in C
%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT

; General pointer type: dword unless __x86_64__ is defined, qword otherwise.
%ifndef __x86_64__
%define POINTER                 dword
%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER                 qword
%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

; Signed integer type.
%define INT                     dword
%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT

; IEEE754 single-precision float.
%define FP32                    dword
%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

; 64-bit integer held in an MMX register.
%define MMWORD                  qword
%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; 128-bit integer held in an SSE register.
; XMMWORD deliberately expands to nothing: NASM is buggy and doesn't properly
; handle operand-size keywords on SSE instructions, so no size override is
; emitted for now.
%define XMMWORD
%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Same workaround for loading a dword or an MMWORD into an xmm# register:
; both size keywords expand to nothing.
%define XMM_DWORD
%define XMM_MMWORD
177
178; --------------------------------------------------------------------------
179;  External Symbol Name
180;
; Fallback used when the object-format section of this file did not define
; EXTN itself: external names are passed through unchanged (foo() -> foo).
%ifndef EXTN
; Android modification:
; The unmodified upstream code prepends an underscore to "name" here.
; With that underscore in place the code failed to link, because the
; function names in the SIMD code did not match the names used by the
; callers.  This fix only applies to the x86 SIMD code; x86_64 is handled
; by the EXTN definitions made earlier in this file.
; NOTE(review): presumably upstream adds the underscore for toolchains that
; decorate C symbols with a leading '_' -- confirm before building for such
; a target with this fallback.
%define EXTN(name)  name
%endif
190
191; --------------------------------------------------------------------------
192;  Macros for position-independent code (PIC) support
193;
; PIC can only be honoured when the selected object format provided a
; GOT_SYMBOL; otherwise a -DPIC request is silently dropped.
%ifndef GOT_SYMBOL
%undef PIC
%endif

; Interface provided in every configuration:
;   GOTOFF(got,sym)  - address expression for constant "sym", given the
;                      register "got" that get_GOT loaded
;   get_GOT reg      - load the PIC base into "reg"
;   pushpic reg      - push reg         (expands to nothing without PIC)
;   poppic  reg      - pop  reg         (expands to nothing without PIC)
;   movpic  dst,src  - mov  dst,src     (expands to nothing without PIC)

%ifdef PIC ; ==================================================================

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; ---- Mach-O flavour ---------------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O, so
; the base address is computed by hand relative to a label placed at the
; current position in the constant section.

        SECTION SEG_CONST
const_base:

; Constants are addressed as an offset from const_base.
%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg
;   Leaves the run-time address of const_base in "reg".
;   NOTE: this macro destroys the ecx register (and the flags); ebp is
;   used as scratch but is saved and restored.
%imacro get_GOT 1
        ; ecx = return address of the call, i.e. the address of the "add"
        ; that follows it; the add then advances ecx to %%anchor.
        call    %%fetch_eip
        add     ecx, byte (%%anchor - $)
        jmp     short %%fixup
%%fetch_eip:
        mov     ecx, POINTER [esp]
        ret
%%fixup:
        push    ebp
        xor     ebp, ebp                ; ebp = 0
%ifidni %1,ebx  ; ---- destination is ebx
        ; The two data bytes followed by the near jmp encoding form
        ;   lea ebx, [ecx+ebp*8+(const_base-%%anchor)] ; 8D,9C,E9,(offset32)
        ; so the assembler computes the 32-bit displacement for us.
        db      0x8D, 0x9C              ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%anchor)
%%anchor:
%else           ; ---- any other destination: go through ecx
        ; The two data bytes followed by the near jmp encoding form
        ;   lea ecx, [ecx+ebp*8+(const_base-%%anchor)] ; 8D,8C,E9,(offset32)
        db      0x8D, 0x8C              ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%anchor)
%%anchor:
        mov     %1, ecx
%endif
        pop     ebp
%endmacro

%else   ; ---- GOT-based flavour (GOT_SYMBOL != _MACHO_PIC_) ------------------

; Constants are addressed GOT-relative.
%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg
;   Loads "reg" with the return address of an internal call and adds the
;   ..gotpc-relative displacement of GOT_SYMBOL to it.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%fetch_eip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%loaded
%%fetch_eip:
        mov     %1, POINTER [esp]
        ret
%%loaded:
%endmacro

%endif  ; ---- end of flavour selection ---------------------------------------

; Register save/restore/copy helpers that only emit code in PIC builds.
%imacro pushpic 1.nolist
        push    %1
%endmacro

%imacro poppic  1.nolist
        pop     %1
%endmacro

%imacro movpic  2.nolist
        mov     %1, %2
%endmacro

%else   ; !PIC ================================================================

; Without PIC a constant is addressed directly by its symbol and every
; helper macro expands to nothing.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro

%imacro pushpic 1.nolist
%endmacro

%imacro poppic  1.nolist
%endmacro

%imacro movpic  2.nolist
%endmacro

%endif  ; PIC =================================================================
278
279; --------------------------------------------------------------------------
280;  Align the next instruction on {2,4,8,16,..}-byte boundary.
281;  ".balign n,,m" in GNU as
282;
; MSKLE(x,y)
;   Comparison mask on the low 16 bits of x and y: non-zero (low bits all
;   set) when x <= y, zero when x > y.  ANDing a small count with it
;   therefore either keeps the count or forces it to 0.
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)

; FILLB(b,n)
;   Number of bytes needed to advance position b to the next multiple of n,
;   measured from the start of the current section ($$).  n must be a power
;   of two.
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

; alignx boundary [, maxfill]
;   %1 = alignment boundary in bytes
;   %2 = largest gap that may be filled (default 0xFFFF); when the gap at
;        the macro's start is bigger than %2 nothing is emitted at all.
;   Every "times" count below is gated by that same test.  A gap of 16 bytes
;   or more is filled entirely with one-byte nops; a shorter gap is filled
;   with progressively shorter filler instructions, each step re-measuring
;   the remaining gap from the current position ($).
%imacro alignx 1-2.nolist 0xFFFF
%%origin:
        times MSKLE(FILLB(%%origin,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                                     ; nop
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/9 \
               db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/7 \
               db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/6 \
               db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00       ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/4 \
               db 0x8D, 0x6C, 0x25, 0x00                   ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/3 \
               db 0x8D, 0x6D, 0x00                         ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/2 \
               db 0x8B, 0xED                               ; mov ebp,ebp
        times MSKLE(FILLB(%%origin,%1),%2) & FILLB($,%1)/1 \
               db 0x90                                     ; nop
%endmacro
304
305; Align the next data on {2,4,8,16,..}-byte boundary.
306;
%imacro alignz 1.nolist
        ; %1 = alignment boundary in bytes; the gap is padded with zero bytes.
        align   %1, db 0x00
%endmacro
310
; --------------------------------------------------------------------------
;  x86_64 argument marshalling
;
;  collect_args    copies the first six integer arguments into r10..r15 so
;                  that function bodies can use the same registers under
;                  either calling convention.
;  uncollect_args  undoes the register saves made by collect_args, in the
;                  reverse order.
;  The two macros must be used as a matched pair with a balanced stack in
;  between.
;
%ifdef __x86_64__

%ifndef WIN64   ; ---- arguments arrive in rdi, rsi, rdx, rcx, r8, r9 --------

%imacro collect_args 0
        ; Save the previous contents of the six destination registers.
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        ; arg1..arg6 -> r10..r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%else           ; ---- WIN64: arguments arrive in rcx, rdx, r8, r9 + stack ---

%imacro collect_args 0
        ; r12-r15 are callee-saved in the Microsoft x64 convention.
        push    r12
        push    r13
        push    r14
        push    r15
        ; arg1..arg4 -> r10..r13
        mov     r10, rcx
        mov     r11, rdx
        mov     r12, r8
        mov     r13, r9
        ; arg5, arg6 are read from memory at rax+48 and rax+56.
        ; NOTE(review): this relies on the caller having set rax before
        ; invoking the macro -- presumably rax = rsp right after
        ; "push rbp" at function entry, so that 48 skips the saved rbp,
        ; the return address and the 32-byte shadow space.  Confirm
        ; against the call sites.
        mov     r14, [rax + 6*SIZEOF_QWORD]
        mov     r15, [rax + 7*SIZEOF_QWORD]
        ; rsi, rdi, xmm6 and xmm7 are callee-saved in this convention too.
        push    rsi
        push    rdi
        ; movaps needs a 16-byte aligned address, so rsp must be 16-byte
        ; aligned at both stores below.
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro

%endif          ; WIN64

%endif          ; __x86_64__
376
377; --------------------------------------------------------------------------
378;  Defines picked up from the C headers
379;
380%include "jsimdcfg.inc"
381
382; --------------------------------------------------------------------------
383