;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
23ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .
36ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Local changes for libyuv:
; remove %define program_name and references in labels
; rename cpus to uppercase
40ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Calling-convention selectors: both stay 0 unless ARCH_X86_64 is non-zero, in
; which case exactly one of WIN64 / UNIX64 becomes 1 depending on the output
; object format. ARCH_X86_64 is not defined in this part of the file and must
; already be set when this point is reached.
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        ; "x64" is yasm's alternate name for the win64 object format
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif
52ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; mangle(x): symbol-name decoration. With PREFIX defined, an underscore is
; pasted in front of x; without it, x is passed through unchanged.
%ifndef PREFIX
    %define mangle(x) x
%else
    %define mangle(x) _ %+ x
%endif
58ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; SECTION_RODATA [alignment]
; Switches to the section used for read-only data; %1 is the requested
; alignment (default 16).
; Kludge: Something on OS X fails to align .rodata even given an align
; attribute, so the macho formats place the data in .text instead.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        ; aout: .text, without an align= attribute
        SECTION .text
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
74ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; SECTION_TEXT [alignment]
; Switches to .text with align=%1 (default 16). The attribute is omitted for
; aout, which does not support align=.
%macro SECTION_TEXT 0-1 16
    %ifnidn __OUTPUT_FORMAT__,aout
        SECTION .text align=%1
    %else
        SECTION .text
    %endif
%endmacro
83ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; PIC handling:
;   WIN64   - PIC is always defined.
;   x86_32  - PIC is always undefined (see below).
;   other   - whatever the build passed in is left untouched.
; Whenever PIC ends up defined, memory operands default to RIP-relative.
%if WIN64
    %define PIC
    default rel
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%else
    %ifdef PIC
        default rel
    %endif
%endif
95ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Always use long nops (reduces 0x90 spam in disassembly on x86_32).
; "CPU amdnop" is a yasm directive that NASM does not accept, so under NASM
; (detected via __NASM_VER__) the smartalign package is used to request
; multi-byte NOP padding instead. The yasm path is unchanged.
%ifdef __NASM_VER__
    %use smartalign
    ALIGNMODE k7
%else
    CPU amdnop
%endif
98ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.
103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.
118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.
125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; DECLARE_REG n, reg [, offset]
;   %1 = index N of the abstract register rN
;   %2 = full-width hardware register backing rN
;   %3 = optional byte offset; when given, argument N is read from memory at
;        [rsp/esp + stack_offset + %3], otherwise from the register itself.
; Defines rN, rNq, rNd, rNw, rNb, rNh, rNm and rNmp. All of them are lazy
; %defines, so stack_offset is looked up at the point of use.
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %if %0 == 3 ; argument N lives in memory
        %if ARCH_X86_64
            %define r%1m [rsp + stack_offset + %3]
            %define r%1mp qword r %+ %1m
        %else
            %define r%1m [esp + stack_offset + %3]
            %define r%1mp dword r %+ %1m
        %endif
    %else ; argument N lives in the register
        %define r%1m  %2d
        %define r%1mp %2
    %endif
    %define r%1  %2
%endmacro
151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; DECLARE_REG_SIZE name16, low8, high8
;   %1 = 16-bit register name (ax, bx, ...)
;   %2 = low-byte register name
;   %3 = high-byte register name ("null" is passed where there is none)
; For both the r- and the e-prefixed spelling of the register, defines the
; size-suffixed aliases q/d/w/b/h (e.g. raxd and eaxd -> eax, raxb -> al).
; On x86_32 the bare r-prefixed name is additionally aliased to the e-prefixed
; register.
%macro DECLARE_REG_SIZE 3
    ; aliases built on the r-prefixed name
    %define r%1q r%1
    %define r%1d e%1
    %define r%1w %1
    %define r%1b %2
    %define r%1h %3
    ; aliases built on the e-prefixed name
    %define e%1q r%1
    %define e%1d e%1
    %define e%1w %1
    %define e%1b %2
    %define e%1h %3
%if ARCH_X86_64 == 0
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al,  ah
DECLARE_REG_SIZE bx, bl,  bh
DECLARE_REG_SIZE cx, cl,  ch
DECLARE_REG_SIZE dx, dl,  dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null
175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP idx0, idx1, ...
; Walks the argument list and, for the i-th argument, invokes
; "CAT_XDEFINE t, i, r<idx>". CAT_XDEFINE is defined outside this part of the
; file; presumably this makes t<i> an alias of r<idx>.
%macro DECLARE_REG_TMP 1-*
    %assign %%slot 0
    %rep %0
        CAT_XDEFINE t, %%slot, r%1
        %rotate 1
        %assign %%slot %%slot+1
    %endrep
%endmacro
186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; DECLARE_REG_TMP_SIZE n0, n1, ...
; For every listed index N, defines the size-suffixed names tNq/tNd/tNw/tNh/tNb
; as "tN %+ <suffix>". The definitions are lazy, so tN is resolved when one of
; these names is used, not here.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1b t%1 %+ b
        %define t%1h t%1 %+ h
        %define t%1w t%1 %+ w
        %define t%1d t%1 %+ d
        %define t%1q t%1 %+ q
        %rotate 1
    %endrep
%endmacro

; Size aliases for t0 .. t14.
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; gprsize: byte count used by PUSH/POP when adjusting stack_offset
; (8 when ARCH_X86_64 is set, 4 otherwise).
%if ARCH_X86_64 == 0
    %define gprsize 4
%else
    %define gprsize 8
%endif
205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; PUSH x: emits "push x", then grows stack_offset by gprsize.
; The instruction comes first, so an operand that itself refers to
; stack_offset is still expanded with the old value.
%macro PUSH 1
    push %1
    %assign stack_offset gprsize+stack_offset
%endmacro
210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; POP x: emits "pop x", then shrinks stack_offset by gprsize
; (the counterpart of PUSH).
%macro POP 1
    pop %1
    %assign stack_offset -gprsize+stack_offset
%endmacro
215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; PUSH_IF_USED n0, n1, ...
; For each listed register index N, in order, issues "PUSH rN" when N is below
; regs_used. Uses the PUSH macro, so stack_offset is updated.
%macro PUSH_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro
224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; POP_IF_USED n0, n1, ...
; For each listed register index N, in order, emits "pop rN" when N is below
; regs_used. This is the plain instruction, not the POP macro, so stack_offset
; is left unchanged.
%macro POP_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro
233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; LOAD_IF_USED n0, n1, ...
; For each listed argument index N below num_args, emits "mov rN, rNmp",
; i.e. copies the argument from its original location into its register.
%macro LOAD_IF_USED 1-*
    %rep %0
        %if num_args > %1
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro
242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; SUB dst, amount: emits "sub dst, amount"; when dst is rsp, stack_offset is
; increased by the same amount afterwards so the tracked offset follows the
; stack pointer.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset (%2)+stack_offset
    %endif
%endmacro
249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; ADD dst, amount: emits "add dst, amount"; when dst is rsp, stack_offset is
; decreased by the same amount afterwards (the counterpart of SUB).
%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset -(%2)+stack_offset
    %endif
%endmacro
256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; movifnidn dst, src: emits "mov dst, src" unless both operands expand to the
; same text, in which case nothing is emitted.
%macro movifnidn 2
    %ifidn %1, %2
        ; identical operands: nothing to emit
    %else
        mov %1, %2
    %endif
%endmacro
262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; movsxdifnidn dst, src: emits "movsxd dst, src" unless both operands expand
; to the same text, in which case nothing is emitted.
%macro movsxdifnidn 2
    %ifidn %1, %2
        ; identical operands: nothing to emit
    %else
        movsxd %1, %2
    %endif
%endmacro
268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; ASSERT expr: assemble-time check. Raises a preprocessor %error when expr
; evaluates to 0; the message quotes the failing expression so the offending
; assertion can be identified from the assembler output.
%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro
274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; DEFINE_ARGS name0, name1, ...
; Gives the positional registers symbolic names: for the name X at position i
; it %xdefines Xq, Xd, Xw, Xh, Xb, Xm and Xmp from r<i>q ... r<i>mp, and records
; X through "CAT_XDEFINE arg_name, i, X". n_arg_names remembers how many names
; were set up; on the next invocation that many recorded names are passed to
; CAT_UNDEF (defined outside this part of the file) before new ones are made.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%n 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%n, q
            CAT_UNDEF arg_name %+ %%n, d
            CAT_UNDEF arg_name %+ %%n, w
            CAT_UNDEF arg_name %+ %%n, h
            CAT_UNDEF arg_name %+ %%n, b
            CAT_UNDEF arg_name %+ %%n, m
            CAT_UNDEF arg_name %+ %%n, mp
            CAT_UNDEF arg_name, %%n
            %assign %%n %%n+1
        %endrep
    %endif

    ; stack_offset is hidden while the names are created, so that %xdefine
    ; does not bake its current value into the Xm / Xmp expansions; it is
    ; restored right after the loop.
    %xdefine %%saved_offset stack_offset
    %undef stack_offset
    %assign %%n 0
    %rep %0
        %xdefine %1q r %+ %%n %+ q
        %xdefine %1d r %+ %%n %+ d
        %xdefine %1w r %+ %%n %+ w
        %xdefine %1h r %+ %%n %+ h
        %xdefine %1b r %+ %%n %+ b
        %xdefine %1m r %+ %%n %+ m
        %xdefine %1mp r %+ %%n %+ mp
        CAT_XDEFINE arg_name, %%n, %1
        %assign %%n %%n+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%saved_offset
    %assign n_arg_names %0
%endmacro
309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian%if WIN64 ; Windows x64 ;=================================================
311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Win64 register table: DECLARE_REG N, hardware register [, argument offset].
; r0-r3 have no offset: the first four arguments arrive in rcx, rdx, r8, r9.
; From r4 on, the third field is the argument's offset from rsp + stack_offset;
; it starts at 40 (8-byte return address + 32 bytes of shadow space) and grows
; by 8 per argument.
; R8-R15 are written in upper case, presumably to keep them distinct from the
; case-sensitive rN macro names this table creates (e.g. r8 becomes rsi).
DECLARE_REG  0, rcx
DECLARE_REG  1, rdx
DECLARE_REG  2, R8
DECLARE_REG  3, R9
DECLARE_REG  4, R10, 40
DECLARE_REG  5, R11, 48
DECLARE_REG  6, rax, 56
DECLARE_REG  7, rdi, 64
DECLARE_REG  8, rsi, 72
DECLARE_REG  9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (Win64 variant)
;   %1 = number of arguments          -> num_args
;   %2 = number of registers used     -> regs_used (must be >= %1 and <= 15)
;   %3 = number of xmm registers used (default 0)
;   %4 = argument names, handed to DEFINE_ARGS
; Steps, in order:
;   1. push r7..r14 (rdi, rsi, rbx, rbp, r12-r15 in the Win64 table) for every
;      index below regs_used;
;   2. when mmsize is 8, record xmm_regs_used = 0; otherwise let
;      WIN64_SPILL_XMM save the xmm registers it needs to;
;   3. load arguments 4 and up from their stack slots into their registers;
;   4. define the symbolic argument names.
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign regs_used %2
    %assign num_args %1
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize != 8
        WIN64_SPILL_XMM %3
    %else
        %assign xmm_regs_used 0
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Save the xmm registers from xmm6 upward that the function uses (xmm6-xmm15
; are callee-saved in the Microsoft x64 ABI).
;   %1 = number of xmm registers used (at most 16); stored in xmm_regs_used.
; Nothing is emitted when six or fewer are used. Otherwise the reserved area is
; 16 bytes larger than the registers need, leaving room for the
; (~stack_offset&8) term -- presumably what keeps each movdqa slot 16-byte
; aligned.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        ; store from the highest used register down to xmm6
        %assign %%reg xmm_regs_used-1
        %rep (xmm_regs_used-6)
            movdqa [rsp + (%%reg-6)*16+(~stack_offset&8)], xmm %+ %%reg
            %assign %%reg %%reg-1
        %endrep
    %endif
%endmacro
355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Reload the xmm registers saved by WIN64_SPILL_XMM and release their area.
;   %1 = register pointing at the base of the spill area
; Uses the same slot layout as the spill. The trailing add is the plain
; instruction, so stack_offset is not touched here.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        ; reload from the highest used register down to xmm6
        %assign %%reg xmm_regs_used-1
        %rep (xmm_regs_used-6)
            movdqa xmm %+ %%reg, [%1 + (%%reg-6)*16+(~stack_offset&8)]
            %assign %%reg %%reg-1
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Restore the spilled xmm registers in the middle of a function (RET does this
; on its own through WIN64_RESTORE_XMM_INTERNAL).
;   %1 = register pointing at the base of the spill area (normally rsp)
; Afterwards xmm_regs_used is reset to 0 so a later RET does not restore again,
; and stack_offset is wound back by the size of the released area.
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; Only undo the reservation if WIN64_SPILL_XMM actually made one, and
    ; subtract the whole (xmm_regs_used-6)*16+16 it passed to SUB. Without the
    ; parentheses the trailing 16 was added instead of subtracted.
    ; NOTE(review): assumes SUB (defined elsewhere in this file) is the wrapper
    ; that adds its operand to stack_offset -- confirm.
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; True when RET emits more than a bare ret: regs_used exceeds 7 (the first
; index PROLOGUE offers to PUSH_IF_USED), xmm registers were spilled, or the
; code is 32-byte (ymm) and needs vzeroupper. Plain %define, so the expression
; is evaluated wherever has_epilogue is used.
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

; Win64 epilogue: reload spilled xmm registers, offer registers 14..7 to
; POP_IF_USED (reverse of the PROLOGUE order), issue vzeroupper for 32-byte
; code, then return.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro
383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian%elif ARCH_X86_64 ; *nix x64 ;=============================================
385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; *nix x64 register table: abstract register index -> physical register.
; Indices 0-5 are rdi, rsi, rdx, rcx, r8, r9, the System V AMD64 integer
; argument registers. From index 6 on, a third value is supplied that rises in
; steps of 8 starting at 8 -- presumably the entry-time [rsp+N] offset of the
; matching stack argument (just above the return address). DECLARE_REG is
; defined outside this section; confirm the meaning there.
DECLARE_REG  0, rdi
DECLARE_REG  1, rsi
DECLARE_REG  2, rdx
DECLARE_REG  3, rcx
DECLARE_REG  4, R8
DECLARE_REG  5, R9
DECLARE_REG  6, rax, 8
DECLARE_REG  7, R10, 16
DECLARE_REG  8, R11, 24
DECLARE_REG  9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; *nix x64 function prologue.
;   %1  = number of arguments
;   %2  = number of general-purpose registers used (at most 15)
;   %3  = number of xmm registers used (accepted but not read here)
;   %4+ = argument names, handed to DEFINE_ARGS
; Registers 9-14 are offered to PUSH_IF_USED and 6-14 to LOAD_IF_USED (both
; defined elsewhere; by name they act only on registers actually in use).
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args  %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; True when RET emits more than a bare ret: regs_used exceeds 9 (the first
; index PROLOGUE offers to PUSH_IF_USED) or the code is 32-byte (ymm) and
; needs vzeroupper. Plain %define, so it is evaluated at the point of use.
%define has_epilogue regs_used > 9 || mmsize == 32

; *nix x64 epilogue: offer registers 14..9 to POP_IF_USED (reverse of the
; PROLOGUE order), issue vzeroupper for 32-byte code, then return.
%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro
421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian%else ; X86_32 ;==============================================================
423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; x86_32 register table: abstract register index -> physical register.
; Every entry carries a third value rising in steps of 4 from 4 -- presumably
; the entry-time [esp+N] offset of the matching stack argument (all arguments
; are on the stack, just above the return address). DECLARE_REG is defined
; outside this section; confirm the meaning there.
DECLARE_REG 0, eax,  4
DECLARE_REG 1, ecx,  8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
; lets shared code spell the stack pointer as rsp on 32-bit builds too
%define rsp esp
432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; For every index N passed in, define
;   r<N>m  = [esp + stack_offset + 4*N + 4]  (memory slot of stack argument N)
;   r<N>mp = the same slot with an explicit dword size
; Both are plain %defines, so stack_offset is read when the name is used, not
; when it is declared.
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

; arguments 7-14 have no DECLARE_REG entry on x86_32, so they get memory names only
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; x86_32 function prologue.
;   %1  = number of arguments
;   %2  = number of general-purpose registers requested; clamped to 7, the
;         number of registers declared for this architecture
;   %3  = number of xmm registers used (accepted but not read here)
;   %4+ = argument names, handed to DEFINE_ARGS
; Registers 3-6 are offered to PUSH_IF_USED and 0-6 to LOAD_IF_USED (both
; defined elsewhere; by name they act only on registers actually in use).
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %if (%2) > 7
        %assign regs_used 7
    %else
        %assign regs_used %2
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro
454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; True when RET emits more than a bare ret: regs_used exceeds 3 (the first
; index PROLOGUE offers to PUSH_IF_USED) or the code is 32-byte (ymm) and
; needs vzeroupper. Plain %define, so it is evaluated at the point of use.
%define has_epilogue regs_used > 3 || mmsize == 32

; x86_32 epilogue: offer registers 6..3 to POP_IF_USED (reverse of the
; PROLOGUE order), issue vzeroupper for 32-byte code, then return.
%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro
464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian%endif ;======================================================================
466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; On every target other than Win64 the xmm spill/restore macros take their one
; argument and expand to nothing, so callers can invoke them unconditionally.
%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro

    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif
473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Return from the current function: the full RET when has_epilogue is true,
; otherwise the two-byte "rep ret" form.
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro
481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Finish the current function by transferring to another one.
;   %1 = callee
;   %2 = is_nonadjacent: non-zero when a jump is required to reach the callee
; With an epilogue pending, the callee is called normally and RET follows.
; Without one, a jmp is emitted when %2 is non-zero and nothing at all when it
; is zero (presumably the callee is placed directly after the caller).
%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro
490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian;=============================================================================
492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian; arch-independent part
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian;=============================================================================
494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; alignment, in bytes, applied to every function label emitted by cglobal
%assign function_align 16

; Open a function definition.
; Appends SUFFIX (the cpu suffix set up by INIT_CPUFLAGS, possibly empty) to
; the name and passes the result to cglobal_internal, which applies the symbol
; mangling needed for C linkage and redefines the name so that later uses refer
; to the mangled symbol.
;   %1  = function name
;   %2+ = optional PROLOGUE arguments, forwarded unchanged
%macro cglobal 1-2+ ; name, [PROLOGUE args]
    %if %0 > 1
        cglobal_internal %1 %+ SUFFIX, %2
    %else
        cglobal_internal %1 %+ SUFFIX
    %endif
%endmacro
; Worker behind cglobal.
;   %1  = function name with the cpu suffix already appended
;   %2+ = optional PROLOGUE arguments
; The first time a name is seen it is redefined to its mangled form (as is
; name.skip_prologue) and remembered through cglobaled_<name>. The symbol is
; then exported, aligned to function_align and its label emitted; stack_offset
; is reset to 0 before the optional PROLOGUE runs.
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(%1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    ; Export with hidden visibility on every ELF spelling of __OUTPUT_FORMAT__.
    ; This file's .note.GNU-stack handling already checks elf, elf32 and elf64;
    ; checking only "elf" here left elf32/elf64 builds with default visibility.
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf32
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf64
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Import a symbol defined in another object file.
;   %1 = symbol name
; The name is redefined to its mangled form, remembered through
; cglobaled_<name> and declared extern.
%macro cextern 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro
534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; like cextern, but without the prefix
; NOTE(review): in this version of the file the body is identical to cextern
; (neither adds a prefix before calling mangle), so the two are interchangeable.
;   %1 = symbol name
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro
541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Define an exported data symbol.
;   %1  = symbol name; redefined to its mangled form and declared global
;   %2+ = the data definition placed after the label (everything after the
;         first comma is taken verbatim)
%macro const 2+
    %xdefine %1 mangle(%1)
    global %1
    %1: %2
%endmacro
547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
; __OUTPUT_FORMAT__ holds a single value, so at most one branch is taken.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
    section .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
    section .note.GNU-stack noalloc noexec nowrite progbits
%endif
559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; cpuflags
;
; Instruction-set flags: each entry owns one bit and ORs in the flags of the
; set it builds on, so testing for a set also tests for everything beneath it.
%assign cpuflags_MMX      (1<<0)
%assign cpuflags_MMX2     (1<<1)  | cpuflags_MMX
%assign cpuflags_3dnow    (1<<2)  | cpuflags_MMX
%assign cpuflags_3dnow2   (1<<3)  | cpuflags_3dnow
%assign cpuflags_SSE      (1<<4)  | cpuflags_MMX2
%assign cpuflags_SSE2     (1<<5)  | cpuflags_SSE
%assign cpuflags_SSE2slow (1<<6)  | cpuflags_SSE2
%assign cpuflags_SSE3     (1<<7)  | cpuflags_SSE2
%assign cpuflags_SSSE3    (1<<8)  | cpuflags_SSE3
%assign cpuflags_SSE4     (1<<9)  | cpuflags_SSSE3
%assign cpuflags_SSE42    (1<<10) | cpuflags_SSE4
%assign cpuflags_AVX      (1<<11) | cpuflags_SSE42
%assign cpuflags_xop      (1<<12) | cpuflags_AVX
%assign cpuflags_fma4     (1<<13) | cpuflags_AVX
%assign cpuflags_AVX2     (1<<14) | cpuflags_AVX
%assign cpuflags_fma3     (1<<15) | cpuflags_AVX

; Flags that stand alone (only bmi2 and tbm build on another flag, bmi1).
%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24) | cpuflags_bmi1
%assign cpuflags_tbm      (1<<25) | cpuflags_bmi1

; cpuflag(x) is true when every bit of cpuflags_x is set in the current
; cpuflags value; notcpuflag(x) is its exact negation.
%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
;
; With no argument: SUFFIX becomes empty and cpuname/cpuflags are undefined.
; With arguments: cpuname is %1 (or %1_%2), cpuflags is the OR of the named
; flag sets, SUFFIX is "_" followed by cpuname, and the mov* aliases are
; adjusted for the selected cpu.
%macro INIT_CPUFLAGS 0-2
    %if %0 == 0
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %else
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags_%1 | cpuflags_%2
        %else
            %xdefine cpuname %1
            %assign cpuflags cpuflags_%1
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(AVX)
            %assign AVX_enabled 1
        %endif
        ; 16-byte code on a cpu without SSE2: use the single-precision moves
        %if mmsize == 16 && notcpuflag(SSE2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        ; "aligned" variant: unaligned moves become aligned ones;
        ; otherwise, when the first flag is exactly SSE3, movu becomes lddqu
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, SSE3
            %define movu lddqu
        %endif
    %endif
%endmacro
624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; merge MMX and SSE*

; Define the macro whose name is %1 and %2 joined together, with value %3
; (expanded at definition time, as with %xdefine).
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; Undefine the macro whose name is %1 and %2 joined together.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Switch the abstraction layer to 8-byte MMX registers.
;   %1+ = optional cpu flags, forwarded to INIT_CPUFLAGS
; Sets mmsize to 8 and num_mmregs to 8, points the mov* aliases at their MMX
; forms, maps m0-m7 to mm0-mm7 (nmm<N> = N) and removes any m8-m15 / nmm8-nmm15
; left over from a wider mode. RESET_MM_PERMUTATION is set to re-run this macro
; with the same arguments.
%macro INIT_MMX 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%n 0
    %rep 8
        CAT_XDEFINE m, %%n, mm %+ %%n
        CAT_XDEFINE nmm, %%n, %%n
        %assign %%n %%n+1
    %endrep
    ; the counter carries on from 8, so this pass covers indices 8-15
    %rep 8
        CAT_UNDEF m, %%n
        CAT_UNDEF nmm, %%n
        %assign %%n %%n+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro
657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Switch the abstraction layer to 16-byte XMM registers.
;   %1+ = optional cpu flags, forwarded to INIT_CPUFLAGS
; Sets mmsize to 16 and num_mmregs to 16 on x86-64 or 8 otherwise, points the
; mov* aliases at their SSE2 integer forms and maps m<N> to xmm<N>
; (nxmm<N> = N) for every available register. RESET_MM_PERMUTATION is set to
; re-run this macro with the same arguments.
%macro INIT_XMM 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%n 0
    %rep num_mmregs
        CAT_XDEFINE m, %%n, xmm %+ %%n
        CAT_XDEFINE nxmm, %%n, %%n
        %assign %%n %%n+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro
678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; INIT_YMM [cpuflags]
; Select the 256-bit YMM register file for the code that follows:
;   * sets AVX_enabled,
;   * sets mmsize to 32 and num_mmregs to 8 (16 when ARCH_X86_64 is nonzero),
;   * maps mova/movu/movnta onto vmovaps/vmovups/vmovntps and removes movh,
;   * defines m<N> as ymm<N> and the reverse lookup nymm<N> as N,
;   * records this invocation in RESET_MM_PERMUTATION so it can be replayed,
;   * hands the optional argument on to INIT_CPUFLAGS.
%macro INIT_YMM 0-1+
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %assign AVX_enabled 1
    %define mmsize 32
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define movnta vmovntps
    %undef movh
    %define movu vmovups
    %define mova vmovaps
    ; Identity permutation: m<N> names ymm<N>, nymm<N> maps back to N.
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, ymm %+ %%reg
        CAT_XDEFINE nymm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro
699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianINIT_XMM
701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; I often want to use macros that permute their arguments, e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; PERMUTE dst0, src0 [, dst1, src1 ...]
; Rename registers according to a list of (dst, src) index pairs: after the
; macro, m<dst> names the register that m<src> named before it. All sources are
; captured before any destination is rewritten, so the pairs behave as one
; simultaneous permutation rather than a sequence of assignments.
;
; The reverse lookup n<register> -> index (the table set up by INIT_XMM /
; INIT_YMM and maintained by SWAP and LOAD_MM_PERMUTATION) is rebuilt for every
; destination. The previous code wrote it under nm<index> names instead, which
; nothing reads, leaving n<register> stale for a later name-form "SWAP m0, m1".
;
; Uses the global single-line macros tmp<src> as scratch and undefines them
; again before returning.
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    ; Pass 1: snapshot the register currently behind each source index.
    %xdefine tmp%2 m%2
    %rotate 2
%endrep
%rep %0/2
    ; Pass 2: install the snapshot under the destination index and point the
    ; register's reverse-lookup entry at that index.
    %xdefine m%1 tmp%2
    CAT_XDEFINE n, m%1, %1
    %undef tmp%2
    %rotate 2
%endrep
%endmacro
730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; SWAP a, b [, c ...]
; Exchange register names along a chain: each iteration swaps the mappings of
; the first two arguments and then rotates the argument list by one, so
; "SWAP a, b, c" performs swap(a, b) followed by swap(b, c).
; Arguments may be indices ("SWAP 0, 1") or register aliases ("SWAP m0, m1").
; Both the forward map m<index> and the reverse map n<register> are updated.
; The global single-line macro tmp is used as scratch and undefined afterwards.
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
    %ifdef m%1
        ; Index form: the arguments paste directly onto the m prefix.
        %xdefine tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 tmp
        CAT_XDEFINE n, m%2, %2
        CAT_XDEFINE n, m%1, %1
    %else
        ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
        ; Be careful using this mode in nested macros though, as in some cases there may be
        ; other copies of m# that have already been dereferenced and don't get updated correctly.
        %xdefine %%idx_a n %+ %1
        %xdefine %%idx_b n %+ %2
        %xdefine tmp m %+ %%idx_a
        CAT_XDEFINE m, %%idx_a, m %+ %%idx_b
        CAT_XDEFINE m, %%idx_b, tmp
        CAT_XDEFINE n, m %+ %%idx_b, %%idx_b
        CAT_XDEFINE n, m %+ %%idx_a, %%idx_a
    %endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; SAVE_MM_PERMUTATION [name]
; Record the current m<N> mapping as <name>_m<N> for every N below num_mmregs.
; Without an argument the value of current_function is used as the name.
; Placed at the end of a function, this lets later calls to that function
; reload the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0 == 0
        %xdefine %%prefix current_function %+ _m
    %else
        %xdefine %%prefix %1_m
    %endif
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE %%prefix, %%reg, m %+ %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; LOAD_MM_PERMUTATION name
; Restore the register mapping stored under <name>_m<N>. Nothing happens
; unless <name>_m0 is defined. For each N below num_mmregs, m<N> is set from
; <name>_m<N> and the reverse lookup n<register> is pointed back at N.
%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%reg 0
        %rep num_mmregs
            ; Forward entry first: the reverse entry is keyed on its new value.
            CAT_XDEFINE m, %%reg, %1_m %+ %%reg
            CAT_XDEFINE n, m %+ %%reg, %%reg
            %assign %%reg %%reg+1
        %endrep
    %endif
%endmacro
782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Override of the call mnemonic.
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't.
; "Known" means a cglobaled_<name> symbol is defined for it.
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro

; call_internal plain_name, suffixed_name
; Picks the call target (the plain name unless only the suffixed one is known),
; emits the call, then loads any register permutation saved under that target
; via LOAD_MM_PERMUTATION.
%macro call_internal 2
    %ifdef cglobaled_%1
        %xdefine %%target %1
    %elifdef cglobaled_%2
        %xdefine %%target %2
    %else
        %xdefine %%target %1
    %endif
    call %%target
    LOAD_MM_PERMUTATION %%target
%endmacro
797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Substitutions that reduce instruction size but are functionally equivalent
;
; add dst, src
; Override of the add mnemonic: when src is the numeric literal 128 the
; instruction is emitted as "sub dst, -128"; every other operand combination
; is passed through unchanged.
; NOTE(review): the carry flag produced by the substituted form may not match
; a plain "add dst, 128" - confirm no caller depends on CF here.
%macro add 2
    %assign %%flip 0
    %ifnum %2
        %if %2==128
            %assign %%flip 1
        %endif
    %endif
    %if %%flip
        sub %1, -128
    %else
        add %1, %2
    %endif
%endmacro
810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; sub dst, src
; Override of the sub mnemonic: when src is the numeric literal 128 the
; instruction is emitted as "add dst, -128"; every other operand combination
; is passed through unchanged.
; NOTE(review): the carry flag produced by the substituted form may not match
; a plain "sub dst, 128" - confirm no caller depends on CF here.
%macro sub 2
    %assign %%flip 0
    %ifnum %2
        %if %2==128
            %assign %%flip 1
        %endif
    %endif
    %if %%flip
        add %1, -128
    %else
        sub %1, %2
    %endif
%endmacro
822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian;=============================================================================
824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian; AVX abstraction layer
825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian;=============================================================================
826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Register-size lookup tables, in bytes, keyed by register name:
;   sizeofmm0..7 = 8, sizeofxmm0..15 = 16, sizeofymm0..15 = 32.
; RUN_AVX_INSTR pastes "sizeof" onto an operand to find its width.
; The counter i is a global single-line macro and is undefined afterwards.
%assign i 0
%rep 8
    CAT_XDEFINE sizeofmm, i, 8
    %assign i i+1
%endrep
%assign i 0
%rep 16
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i
837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; CHECK_AVX_INSTR_EMU {instruction text}, dst, src [, src ...]
; Guard for the non-AVX emulation path used by RUN_AVX_INSTR: raises an
; assembly-time error if the destination is textually identical to any of the
; listed source operands. The first argument is only used in the error message.
%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dest %2
    ; One pass per source operand; the rotate brings the next one into slot 3.
    %rep %0-2
        %ifidn %3, %%dest
            %error non-AVX emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro
848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; RUN_AVX_INSTR instruction, is_float, is_4op, operand_count, operands...
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
;
; Emits either the v-prefixed form of the instruction or an equivalent legacy
; sequence, depending on the operand width and on AVX_enabled.
; The width is taken from the sizeof<reg> table for the second operand if it is
; an identifier, otherwise for the first operand, otherwise from mmsize.
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%regsize sizeof%6
    %elifid %5
        %define %%regsize sizeof%5
    %else
        %define %%regsize mmsize
    %endif
    %if %%regsize==32
        ; 32-byte operands: always emit the v-prefixed instruction.
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        ; Copy instruction used when a separate destination has to be emulated.
        %if %%regsize==8
            %define %%copy movq
        %elif %2
            %define %%copy movaps
        %else
            %define %%copy movdqa
        %endif

        %if %4>=3+%3
            ; A destination distinct from the sources was spelled out.
            %ifidn %5, %6
                ; dst already equals src1: the destructive form is enough.
                %1 %5, %7
            %elif AVX_enabled && %%regsize==16
                v%1 %5, %6, %7
            %else
                ; Emulate: copy src1 into dst, then operate in place. The guard
                ; rejects a dst that is also one of the remaining operands.
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                %%copy %5, %6
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro
896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; RUN_AVX_INSTR1 instruction, is_float, is_4op, operand_count, dst, src1, src2, is_symmetric
; Three-operand front end for RUN_AVX_INSTR.
; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
; An operand is treated as "memory" here when it is not an identifier.
%macro RUN_AVX_INSTR1 8
    %assign %%exchange 0
    %if %3 == 0 && %8 == 1
        ; Only symmetric, non-4-operand instructions may have sources exchanged.
        %if AVX_enabled
            %ifnid %6
                %assign %%exchange 1
            %endif
        %elifnidn %5, %6
            %ifnid %7
                %assign %%exchange 1
            %endif
        %endif
    %endif
    %if %%exchange
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro
917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; AVX_INSTR instruction, is_float, is_4op, is_symmetric
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
;
; Defines a macro with the same name as the instruction, so every later use of
; the mnemonic is routed through RUN_AVX_INSTR / RUN_AVX_INSTR1.
; The generated macro takes 2 to 5 operands. Its parameters 3-5 default to the
; placeholder fnord, which is how the number of operands actually written is
; detected; parameters 6-9 default to the four arguments given here.
; The placeholder tests must run from the third parameter upwards, because an
; omitted operand leaves every later operand slot at the placeholder as well.
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR addpd, 1, 0, 1
937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR addps, 1, 0, 1
938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR addsd, 1, 0, 1
939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR addss, 1, 0, 1
940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR addsubpd, 1, 0, 0
941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR addsubps, 1, 0, 0
942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR andpd, 1, 0, 1
943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR andps, 1, 0, 1
944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR andnpd, 1, 0, 0
945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR andnps, 1, 0, 0
946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR blendpd, 1, 0, 0
947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR blendps, 1, 0, 0
948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR blendvpd, 1, 0, 0
949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR blendvps, 1, 0, 0
950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR cmppd, 1, 0, 0
951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR cmpps, 1, 0, 0
952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR cmpsd, 1, 0, 0
953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR cmpss, 1, 0, 0
954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR cvtdq2ps, 1, 0, 0
955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR cvtps2dq, 1, 0, 0
956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR divpd, 1, 0, 0
957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR divps, 1, 0, 0
958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR divsd, 1, 0, 0
959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR divss, 1, 0, 0
960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR dppd, 1, 1, 0
961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR dpps, 1, 1, 0
962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR haddpd, 1, 0, 0
963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR haddps, 1, 0, 0
964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR hsubpd, 1, 0, 0
965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR hsubps, 1, 0, 0
966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR maxpd, 1, 0, 1
967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR maxps, 1, 0, 1
968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR maxsd, 1, 0, 1
969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR maxss, 1, 0, 1
970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR minpd, 1, 0, 1
971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR minps, 1, 0, 1
972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR minsd, 1, 0, 1
973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR minss, 1, 0, 1
974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR movhlps, 1, 0, 0
975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR movlhps, 1, 0, 0
976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR movsd, 1, 0, 0
977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR movss, 1, 0, 0
978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR mpsadbw, 0, 1, 0
979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR mulpd, 1, 0, 1
980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR mulps, 1, 0, 1
981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR mulsd, 1, 0, 1
982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR mulss, 1, 0, 1
983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR orpd, 1, 0, 1
984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR orps, 1, 0, 1
985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pabsb, 0, 0, 0
986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pabsw, 0, 0, 0
987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pabsd, 0, 0, 0
988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR packsswb, 0, 0, 0
989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR packssdw, 0, 0, 0
990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR packuswb, 0, 0, 0
; AVX_INSTR rows: one row per instruction, in the form
;     AVX_INSTR mnemonic, flag_a, flag_b, flag_c
; The AVX_INSTR macro itself is defined earlier in this file, so the notes
; below are inferred from the rows only -- TODO confirm against its definition:
;   flag_a  is 1 only on the floating-point rows (shufps, sub*, unpck*, xorp*),
;           0 on every integer (p*) row; presumably a float/int selector.
;   flag_b  is 1 only on palignr, pblendw, pshufd/pshufhw/pshuflw and shufps,
;           all of which take a trailing immediate; presumably marks forms
;           with one extra operand.
;   flag_c  is 1 only on operations whose two sources can be swapped without
;           changing the result (add, and, or, xor, avg, cmpeq, min/max,
;           multiplies, sad); presumably a "commutative" marker.
991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR packusdw, 0, 0, 0
992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddb, 0, 0, 1
993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddw, 0, 0, 1
994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddd, 0, 0, 1
995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddq, 0, 0, 1
996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddsb, 0, 0, 1
997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddsw, 0, 0, 1
998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddusb, 0, 0, 1
999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR paddusw, 0, 0, 1
1000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR palignr, 0, 1, 0
1001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pand, 0, 0, 1
1002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pandn, 0, 0, 0
1003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pavgb, 0, 0, 1
1004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pavgw, 0, 0, 1
1005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pblendvb, 0, 0, 0
1006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pblendw, 0, 1, 0
1007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpestri, 0, 0, 0
1008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpestrm, 0, 0, 0
1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpistri, 0, 0, 0
1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpistrm, 0, 0, 0
1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpeqb, 0, 0, 1
1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpeqw, 0, 0, 1
1013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpeqd, 0, 0, 1
1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpeqq, 0, 0, 1
1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpgtb, 0, 0, 0
1016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpgtw, 0, 0, 0
1017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpgtd, 0, 0, 0
1018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pcmpgtq, 0, 0, 0
1019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR phaddw, 0, 0, 0
1020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR phaddd, 0, 0, 0
1021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR phaddsw, 0, 0, 0
1022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR phsubw, 0, 0, 0
1023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR phsubd, 0, 0, 0
1024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR phsubsw, 0, 0, 0
1025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaddwd, 0, 0, 1
1026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaddubsw, 0, 0, 0
1027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaxsb, 0, 0, 1
1028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaxsw, 0, 0, 1
1029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaxsd, 0, 0, 1
1030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaxub, 0, 0, 1
1031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaxuw, 0, 0, 1
1032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmaxud, 0, 0, 1
1033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pminsb, 0, 0, 1
1034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pminsw, 0, 0, 1
1035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pminsd, 0, 0, 1
1036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pminub, 0, 0, 1
1037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pminuw, 0, 0, 1
1038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pminud, 0, 0, 1
1039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmovmskb, 0, 0, 0
1040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmulhuw, 0, 0, 1
1041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmulhrsw, 0, 0, 1
1042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmulhw, 0, 0, 1
1043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmullw, 0, 0, 1
1044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmulld, 0, 0, 1
1045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmuludq, 0, 0, 1
1046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pmuldq, 0, 0, 1
1047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR por, 0, 0, 1
1048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psadbw, 0, 0, 1
1049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pshufb, 0, 0, 0
1050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pshufd, 0, 1, 0
1051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pshufhw, 0, 1, 0
1052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pshuflw, 0, 1, 0
1053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psignb, 0, 0, 0
1054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psignw, 0, 0, 0
1055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psignd, 0, 0, 0
1056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psllw, 0, 0, 0
1057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pslld, 0, 0, 0
1058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psllq, 0, 0, 0
1059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pslldq, 0, 0, 0
1060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psraw, 0, 0, 0
1061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psrad, 0, 0, 0
1062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psrlw, 0, 0, 0
1063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psrld, 0, 0, 0
1064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psrlq, 0, 0, 0
1065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psrldq, 0, 0, 0
1066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubb, 0, 0, 0
1067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubw, 0, 0, 0
1068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubd, 0, 0, 0
1069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubq, 0, 0, 0
1070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubsb, 0, 0, 0
1071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubsw, 0, 0, 0
1072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubusb, 0, 0, 0
1073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR psubusw, 0, 0, 0
1074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR ptest, 0, 0, 0
1075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpckhbw, 0, 0, 0
1076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpckhwd, 0, 0, 0
1077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpckhdq, 0, 0, 0
1078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpckhqdq, 0, 0, 0
1079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpcklbw, 0, 0, 0
1080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpcklwd, 0, 0, 0
1081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpckldq, 0, 0, 0
1082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR punpcklqdq, 0, 0, 0
1083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pxor, 0, 0, 1
1084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR shufps, 1, 1, 0
1085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR subpd, 1, 0, 0
1086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR subps, 1, 0, 0
1087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR subsd, 1, 0, 0
1088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR subss, 1, 0, 0
1089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR unpckhpd, 1, 0, 0
1090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR unpckhps, 1, 0, 0
1091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR unpcklpd, 1, 0, 0
1092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR unpcklps, 1, 0, 0
1093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR xorpd, 1, 0, 1
1094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR xorps, 1, 0, 1
1095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian; 3DNow! instructions, for sharing code between AVX, SSE and 3DNow!
; These rows pass 1 as the first flag, like the floating-point SSE rows
; (subps, xorps, ...). pfsub is the only one whose last flag is 0, which is
; consistent with that flag marking commutative operations.
; NOTE(review): flag meanings are inferred from the rows; confirm against the
; AVX_INSTR definition earlier in this file.
1097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pfadd, 1, 0, 1
1098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pfsub, 1, 0, 0
1099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianAVX_INSTR pfmul, 1, 0, 1
1100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
; Shuffle immediates written as four base-4 digits.
; For every 8-bit value i (0..255) a symbol q<d3><d2><d1><d0> is defined as i,
; where d3..d0 are the four 2-bit fields of i, most significant first.
; Examples: q0123 = 0x1B, q3210 = 0xE4.
; (Assumes CAT_XDEFINE, defined earlier in this file, pastes its first two
; arguments into one symbol name and defines that symbol as the third.)
%assign i 0
%rep 256
    ; j holds the 2-bit fields of i laid out as decimal digits (0 .. 3333)
    %assign j (i&3) + ((i>>2)&3)*10 + ((i>>4)&3)*100 + ((i>>6)&3)*1000
    ; j is a plain number and carries no leading zeros, so the missing
    ; zeros go into the name prefix to keep every symbol four digits long
    %if j >= 1000
        CAT_XDEFINE q, j, i
    %elif j >= 100
        CAT_XDEFINE q0, j, i
    %elif j >= 10
        CAT_XDEFINE q00, j, i
    %else
        CAT_XDEFINE q000, j, i
    %endif
    %assign i i+1
%endrep
; do not leak the loop temporaries into the files that include this one
%undef i
%undef j
1118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
;-----------------------------------------------------------------------------
; FMA_INSTR name, mul_op, add_op
;
; Defines a macro called <name> that takes four operands
;     <name> dst, src1, src2, acc
; and computes dst = mul_op(src1, src2) + acc.
;
;   * When cpuflag(xop) is set, the single XOP instruction v<name> is emitted.
;   * Otherwise the operation is emulated with two instructions:
;         mul_op dst, src1, src2
;         add_op dst, acc
;     The emulation writes dst before it reads acc, so it is only correct when
;     dst and acc are different registers. Passing the same register for both
;     would silently produce 2*product instead of product+acc, so that case is
;     rejected at assembly time instead of being miscompiled.
;
; The generated macro has three trailing default arguments that carry name,
; mul_op and add_op into its body; callers only ever pass the four operands.
;-----------------------------------------------------------------------------
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            ; dst differs from acc: multiply into dst, then accumulate
            %6 %1, %2, %3
            %7 %1, %4
        %else
            ; dst aliases acc: the multiply would destroy the accumulator
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
1133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
1135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian; This lets us use tzcnt without bumping the yasm version requirement yet.
; NOTE(review): on CPUs that do not implement tzcnt the prefix is ignored and
; this runs as plain bsf, which leaves the destination undefined for a zero
; source (tzcnt would return the operand width). Callers should not rely on
; the zero-input result -- confirm against the Intel/AMD instruction reference.
1136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian%define tzcnt rep bsf
1137