1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*****************************************************************************
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* x86inc.asm: x264asm abstraction layer
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*****************************************************************************
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* Copyright (C) 2005-2012 x264 project
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* Authors: Loren Merritt <lorenm@u.washington.edu>
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*          Anton Mitrofanov <BugMaster@narod.ru>
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*          Jason Garrett-Glaser <darkshikari@gmail.com>
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*          Henrik Gramner <hengar-6@student.ltu.se>
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* Permission to use, copy, modify, and/or distribute this software for any
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* purpose with or without fee is hereby granted, provided that the above
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* copyright notice and this permission notice appear in all copies.
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan;*****************************************************************************
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan; This is a header file for the x264ASM assembly language, which uses
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan; NASM/YASM syntax combined with a large number of macros to provide easy
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan; abstraction between different calling conventions (x86_32, win64, linux64).
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan; It also has various other useful features to simplify writing the kind of
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan; DSP functions that are most often used in x264.
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Unlike the rest of x264, this file is available under an ISC license, as it
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan; has significant usefulness outside of x264 and we want it to be available
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan; to the largest audience possible.  Of course, if you modify it for your own
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan; purposes to add a new feature, we strongly encourage contributing a patch
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan; as this feature might be useful for others as well.  Send patches or ideas
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan; to x264-devel@videolan.org .
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan%include "vpx_config.asm"
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan%define program_name vp9
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Calling-convention selection for 64-bit builds.
; Both flags stay 0 unless ARCH_X86_64 is set; on x86_64 exactly one of them
; becomes 1. The win32, win64 and x64 output formats select WIN64, every other
; output format selects UNIX64.
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64 1
    %endif
    %if WIN64 == 0
        %define UNIX64 1
    %endif
%endif
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; mangle(x): symbol name as it must appear in the object file.
; The elf, elf32, elf64, win64 and x64 output formats use the name unchanged;
; every other output format gets a leading underscore.
%ifidn   __OUTPUT_FORMAT__,elf
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf32
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,win64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,x64
    %define mangle(x) x
%else
    %define mangle(x) _ %+ x
%endif
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan; FIXME: All of the 64bit asm functions that take a stride as an argument
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan; via register, assume that the high dword of that register is filled with 0.
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan; This is true in practice (since we never do any 64bit arithmetic on strides,
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan; and x264's strides are all positive), but is not guaranteed by the ABI.
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; SECTION_RODATA [alignment]
; Switches to the section that holds read-only constants. The optional
; argument is the requested alignment and defaults to 16.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so Mach-O targets use a different read-only section (.text).
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho32
        ; 32-bit Mach-O under its explicit name; the PIC macros in this file
        ; identify the target as macho32, so it must take the same path as
        ; the plain "macho" spelling below.
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        ; aout does not support align=
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; SECTION_TEXT [alignment]
; Switches to the code section. The optional argument is the requested
; alignment (default 16); it is dropped for aout, which does not support
; align=.
%macro SECTION_TEXT 0-1 16
    %ifnidn __OUTPUT_FORMAT__,aout
        SECTION .text align=%1
    %else
        SECTION .text
    %endif
%endmacro
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan; from original code is added in for 64bit.
; ABI_IS_32BIT is 1 for the elf32, macho32, win32 and aout output formats and
; 0 for every other output format.
%define ABI_IS_32BIT 0
%ifidn __OUTPUT_FORMAT__,elf32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
    %define ABI_IS_32BIT 1
%endif
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan
%if ABI_IS_32BIT
    ; 32-bit targets: a real GET_GOT is provided only for elf32 and macho32,
    ; and only when CONFIG_PIC is 1. All other combinations leave GET_GOT,
    ; GLOBAL, RESTORE_GOT and WRT_PLT to the fallback definitions that follow
    ; this conditional.
    %if CONFIG_PIC == 1
        %ifidn __OUTPUT_FORMAT__,elf32
            %define WRT_PLT wrt ..plt
            %define GET_GOT_SAVE_ARG 1
            ; GET_GOT reg
            ; Pushes reg, then loads it with the base address used by GLOBAL().
            ; Afterwards GLOBAL(x) addresses x relative to reg (wrt ..gotoff)
            ; and RESTORE_GOT pops the saved value of reg.
            %macro GET_GOT 1
                extern _GLOBAL_OFFSET_TABLE_
                push %1
                call %%get_got
                %%sub_offset:
                jmp %%exitGG
                %%get_got:
                ; [esp] is the return address of the call, i.e. sub_offset
                mov %1, [esp]
                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
                ret
                %%exitGG:
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
                %undef GLOBAL
                %define GLOBAL(x) x + %1 wrt ..gotoff
            %endmacro
        %elifidn __OUTPUT_FORMAT__,macho32
            %define GET_GOT_SAVE_ARG 1
            ; GET_GOT reg
            ; Pushes reg, then loads it with the address of the local label
            ; that follows the call. GLOBAL(x) addresses x relative to that
            ; label and RESTORE_GOT pops the saved value of reg.
            %macro GET_GOT 1
                push %1
                call %%get_got
                %%get_got:
                pop  %1
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
                %undef GLOBAL
                %define GLOBAL(x) x + %1 - %%get_got
            %endmacro
        %endif
    %endif

    ; PIC is never left defined for a 32-bit architecture.
    %if ARCH_X86_64 == 0
        %undef PIC
    %endif

%else
    ; Non-32-bit ABIs: GET_GOT expands to nothing and GLOBAL() uses
    ; rel addressing.
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) rel x
    %define WRT_PLT wrt ..plt

    ; PIC is defined for WIN64, for macho64, and otherwise when CONFIG_PIC
    ; is nonzero.
    %if WIN64
        %define PIC
    %elifidn __OUTPUT_FORMAT__,macho64
        %define PIC
    %elif CONFIG_PIC
        %define PIC
    %endif
%endif
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Fallbacks for whatever the target-specific PIC setup did not define:
; an empty GET_GOT with a pass-through GLOBAL(), and empty WRT_PLT and
; RESTORE_GOT.
%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef WRT_PLT
    %define WRT_PLT
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif

; With PIC defined, memory references default to rel addressing.
%ifdef PIC
    default rel
%endif
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Done with PIC macros
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Always use long nops (reduces 0x90 spam in disassembly on x86_32).
; When __NASM_VER__ is defined the smartalign package with ALIGNMODE k7 is
; used; otherwise the CPU amdnop directive is used.
%ifdef __NASM_VER__
    %use smartalign
    ALIGNMODE k7
%else
    CPU amdnop
%endif
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Macros to eliminate most code duplication between x86_32 and x86_64:
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Currently this works only for leaf functions which load all their arguments
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan; into registers at the start, and make no other use of the stack. Luckily that
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan; covers most of x264's asm.
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan; PROLOGUE:
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan; %1 = number of arguments. loads them from stack if needed.
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan; %2 = number of registers used. pushes callee-saved regs if needed.
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan; %4 = list of names to define to registers
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan; PROLOGUE can also be invoked by adding the same options to cglobal
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan; e.g.
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan; cglobal foo, 2,3,0, dst, src, tmp
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan; RET:
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Pops anything that was pushed by PROLOGUE, and returns.
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan; REP_RET:
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan; which are slow when a normal ret follows a branch.
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan; registers:
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan; rN and rNq are the native-size register holding function argument N
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan; rNd, rNw, rNb are dword, word, and byte size
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan; rNm is the original location of arg N (a register or on the stack), dword
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan; rNmp is native size
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; DECLARE_REG n, reg64, reg32, reg16, reg8 [, stack_disp]
; Defines the rN family for argument/register number n:
;   rN, rNq = reg64    rNd = reg32    rNw = reg16    rNb = reg8
; Without stack_disp the argument arrives in the register itself, so
; rNm = reg32 and rNmp = reg64. With stack_disp the argument's home is the
; stack slot at [rsp/esp + stack_offset + stack_disp]; rNm is that memory
; operand and rNmp is the same operand with a native-size qualifier.
%macro DECLARE_REG 5-6
    %define r%1  %2
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %if %0 == 6 ; memory
        %if ARCH_X86_64
            %define r%1m [rsp + stack_offset + %6]
            %define r%1mp qword r %+ %1m
        %else
            %define r%1m [esp + stack_offset + %6]
            %define r%1mp dword r %+ %1m
        %endif
    %else
        %define r%1m  %3
        %define r%1mp %2
    %endif
%endmacro
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; DECLARE_REG_SIZE name16, name8
; For one legacy register, given its 16-bit name (e.g. ax) and its low-byte
; name (e.g. al), defines size-suffixed aliases on both the r-prefixed and
; the e-prefixed name:  q -> r<name16>, d -> e<name16>, w -> name16,
; b -> name8. When ARCH_X86_64 is 0 the r-prefixed name itself is
; additionally mapped to the e-prefixed register.
%macro DECLARE_REG_SIZE 2
    %define e%1b %2
    %define r%1b %2
    %define e%1w %1
    %define r%1w %1
    %define e%1d e%1
    %define r%1d e%1
    %define e%1q r%1
    %define r%1q r%1
    %if ARCH_X86_64 == 0
        %define r%1  e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP n0, n1, ...
; Binds t0, t1, ... in order to r<n0>, r<n1>, ... (one tN per argument).
%macro DECLARE_REG_TMP 1-*
    %assign %%slot 0
    %rep %0
        CAT_XDEFINE t, %%slot, r%1
        %rotate 1
        %assign %%slot %%slot+1
    %endrep
%endmacro
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; DECLARE_REG_TMP_SIZE n0, n1, ...
; For every listed n, defines tNq/tNd/tNw/tNb as "tN" with the size suffix
; pasted on. The definitions are lazy (%define), so they follow whatever tN
; is bound to at the point of use.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1b t%1 %+ b
        %define t%1w t%1 %+ w
        %define t%1d t%1 %+ d
        %define t%1q t%1 %+ q
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; gprsize: number of bytes PUSH/POP account for per register
; (8 when ARCH_X86_64 is set, 4 otherwise).
%if ARCH_X86_64 == 0
    %define gprsize 4
%else
    %define gprsize 8
%endif
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; PUSH operand
; Emits a push and records it by growing stack_offset by gprsize.
%macro PUSH 1
    %assign stack_offset stack_offset+gprsize
    push %1
%endmacro
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; POP operand
; Emits a pop and records it by shrinking stack_offset by gprsize.
%macro POP 1
    %assign stack_offset stack_offset-gprsize
    pop %1
%endmacro
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; PUSH_IF_USED n0, n1, ...
; For each listed register number below regs_used, saves rN with PUSH
; (so stack_offset is updated). Numbers at or above regs_used are skipped.
%macro PUSH_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; POP_IF_USED n0, n1, ...
; For each listed register number below regs_used, emits a plain pop of rN.
; Unlike POP, this leaves stack_offset untouched.
%macro POP_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; LOAD_IF_USED n0, n1, ...
; For each listed argument number below num_args, loads rN from rNmp
; (the native-size original location of argument N).
%macro LOAD_IF_USED 1-*
    %rep %0
        %if num_args > %1
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; SUB dst, amount
; Emits sub dst, amount. When dst is rsp, stack_offset grows by amount so
; that stack-relative argument references stay correct.
%macro SUB 2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
    sub %1, %2
%endmacro
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; ADD dst, amount
; Emits add dst, amount. When dst is rsp, stack_offset shrinks by amount so
; that stack-relative argument references stay correct.
%macro ADD 2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
    add %1, %2
%endmacro
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; movifnidn dst, src
; Emits mov dst, src unless the two operands are textually identical
; (after macro expansion), in which case nothing is emitted.
%macro movifnidn 2
    %ifidn %1, %2
    %else
        mov %1, %2
    %endif
%endmacro
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; movsxdifnidn dst, src
; Emits movsxd dst, src unless the two operands are textually identical
; (after macro expansion), in which case nothing is emitted.
%macro movsxdifnidn 2
    %ifidn %1, %2
    %else
        movsxd %1, %2
    %endif
%endmacro
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; ASSERT expr
; Assembly-time check: raises "assert failed" through %error when the
; expression evaluates to zero, and expands to nothing otherwise.
%macro ASSERT 1
    %if (%1) != 0
    %else
        %error assert failed
    %endif
%endmacro
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; DEFINE_ARGS name0, name1, ...
; Gives symbolic names to r0, r1, ...: for the i-th name it defines
; <name>q/d/w/b/m/mp as the current expansion of r<i>q/d/w/b/m/mp, and
; records the name itself in arg_name<i>. Names recorded by a previous
; invocation (n_arg_names of them) are undefined first.
%macro DEFINE_ARGS 0-*
    ; Drop the aliases created by the previous invocation, if any.
    %ifdef n_arg_names
        %assign %%n 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%n, q
            CAT_UNDEF arg_name %+ %%n, d
            CAT_UNDEF arg_name %+ %%n, w
            CAT_UNDEF arg_name %+ %%n, b
            CAT_UNDEF arg_name %+ %%n, m
            CAT_UNDEF arg_name %+ %%n, mp
            CAT_UNDEF arg_name, %%n
            %assign %%n %%n+1
        %endrep
    %endif

    ; stack_offset is stashed and undefined while the aliases are created so
    ; that its current value doesn't get baked in by xdefine; it is restored
    ; once the loop is done.
    %xdefine %%saved_stack_offset stack_offset
    %undef stack_offset
    %assign %%n 0
    %rep %0
        %xdefine %1q r %+ %%n %+ q
        %xdefine %1d r %+ %%n %+ d
        %xdefine %1w r %+ %%n %+ w
        %xdefine %1b r %+ %%n %+ b
        %xdefine %1m r %+ %%n %+ m
        %xdefine %1mp r %+ %%n %+ mp
        CAT_XDEFINE arg_name, %%n, %1
        %assign %%n %%n+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%saved_stack_offset
    %assign n_arg_names %0
%endmacro
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if WIN64 ; Windows x64 ;=================================================
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; WIN64 register map. r0-r3 are declared without a stack displacement, so
; those arguments live in the register itself; r4 and up also name the stack
; slot (displacement from rsp + stack_offset) the value is loaded from.
; R8-R15 are spelled in upper case on purpose: the lower-case r8..r14 names
; are macros defined by this very table.
;           N   64-bit 32-bit 16-bit 8-bit  stack disp
DECLARE_REG 0,  rcx,   ecx,   cx,    cl
DECLARE_REG 1,  rdx,   edx,   dx,    dl
DECLARE_REG 2,  R8,    R8D,   R8W,   R8B
DECLARE_REG 3,  R9,    R9D,   R9W,   R9B
DECLARE_REG 4,  R10,   R10D,  R10W,  R10B,  40
DECLARE_REG 5,  R11,   R11D,  R11W,  R11B,  48
DECLARE_REG 6,  rax,   eax,   ax,    al,    56
DECLARE_REG 7,  rdi,   edi,   di,    dil,   64
DECLARE_REG 8,  rsi,   esi,   si,    sil,   72
DECLARE_REG 9,  rbx,   ebx,   bx,    bl,    80
DECLARE_REG 10, rbp,   ebp,   bp,    bpl,   88
DECLARE_REG 11, R12,   R12D,  R12W,  R12B,  96
DECLARE_REG 12, R13,   R13D,  R13W,  R13B,  104
DECLARE_REG 13, R14,   R14D,  R14W,  R14B,  112
DECLARE_REG 14, R15,   R15D,  R15W,  R15B,  120
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; PROLOGUE (WIN64): #args, #regs, #xmm_regs (default 0), arg_names...
; Records num_args and regs_used, saves r7-r14 when they fall below
; regs_used, spills xmm registers through WIN64_SPILL_XMM (skipped, with
; xmm_regs_used forced to 0, when mmsize is 8), loads arguments 4 and up
; from their stack slots, and finally names the registers via DEFINE_ARGS.
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT num_args <= regs_used
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize != 8
        WIN64_SPILL_XMM %3
    %else
        %assign xmm_regs_used 0
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; WIN64_SPILL_XMM count
; Records count in xmm_regs_used (at most 16). When more than 6 xmm
; registers are used, reserves (count-6)*16+16 bytes through SUB (which also
; updates stack_offset) and stores xmm6 .. xmm<count-1> there with movdqa,
; highest register first. The (~stack_offset&8) term picks the displacement
; inside the reserved area; the extra 16 bytes leave room for it.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%reg xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%reg %%reg-1
            movdqa [rsp + (%%reg-6)*16+(~stack_offset&8)], xmm %+ %%reg
        %endrep
    %endif
%endmacro
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Reload the xmm registers stored by WIN64_SPILL_XMM and release their stack
; space.  Leaves stack_offset and xmm_regs_used untouched, so it can be
; expanded at every return site of a function.
;   %1 = register holding the address of the spill area (rsp in RET)
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%reg xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%reg %%reg-1
            ; Same slot addressing as the store in WIN64_SPILL_XMM.
            movdqa xmm %+ %%reg, [%1 + (%%reg-6)*16+(~stack_offset&8)]
        %endrep
        ; Give back exactly what WIN64_SPILL_XMM subtracted.
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Restore the spilled xmm registers in the middle of a function and update the
; assembler-side bookkeeping to match, so that later RET/REP_RET expansions do
; not restore them a second time.
;   %1 = register holding the address of the spill area
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; WIN64_RESTORE_XMM_INTERNAL only releases stack space when more than six
    ; xmm registers were in use, and then it releases (xmm_regs_used-6)*16+16
    ; bytes.  Mirror exactly that amount in stack_offset; the parentheses are
    ; required so that the trailing +16 is subtracted rather than added.
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Win64 function epilogue: reload spilled xmm registers, pop the
; general-purpose registers pushed by PROLOGUE (in reverse order), return.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    ret
%endmacro
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Return from a Win64 function.  When the epilogue has nothing to restore
; (at most 7 GPRs and at most 6 xmm registers in use) emit a bare `rep ret`;
; otherwise fall back to the full RET epilogue.
%macro REP_RET 0
    %if regs_used <= 7 && xmm_regs_used <= 6
        rep ret
    %else
        RET
    %endif
%endmacro
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif ARCH_X86_64 ; *nix x64 ;=============================================
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Register table for the System V AMD64 target.
; Columns: index, 64-bit, 32-bit, 16-bit and 8-bit register names, followed
; (from index 6 on) by a byte offset -- presumably the stack location of the
; corresponding argument; confirm against DECLARE_REG.
; Indices 0-5 are the six integer argument registers of the ABI.
DECLARE_REG 0,  rdi, edi,  di,   dil
DECLARE_REG 1,  rsi, esi,  si,   sil
DECLARE_REG 2,  rdx, edx,  dx,   dl
DECLARE_REG 3,  rcx, ecx,  cx,   cl
DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
DECLARE_REG 6,  rax, eax,  ax,   al,   8
DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
; Indices 9-14 are the registers PROLOGUE pushes and RET pops.
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
DECLARE_REG 11, R12, R12D, R12W, R12B, 48
DECLARE_REG 12, R13, R13D, R13W, R13B, 56
DECLARE_REG 13, R14, R14D, R14W, R14B, 64
DECLARE_REG 14, R15, R15D, R15W, R15B, 72
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Function prologue for the *nix x86-64 target.
;   %1  = number of function arguments
;   %2  = number of general-purpose registers used (must be >= %1 and <= 15)
;   %3  = number of xmm registers used (accepted for interface parity with
;         the Win64 version; not referenced here)
;   %4+ = optional argument names, handed to DEFINE_ARGS
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign regs_used %2
    %assign num_args %1
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; *nix x86-64 epilogue: pop the registers pushed by PROLOGUE (reverse order)
; and return.
%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    ret
%endmacro
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Return from a *nix x86-64 function.  With at most 9 registers in use the
; prologue pushed nothing, so a bare `rep ret` suffices; otherwise use RET.
%macro REP_RET 0
    %if regs_used <= 9
        rep ret
    %else
        RET
    %endif
%endmacro
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else ; X86_32 ;==============================================================
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Register table for 32-bit x86.
; Columns: index, native register (given twice, as there is no 64-bit form),
; 16-bit name, 8-bit name (`null` where the table provides none), and a byte
; offset that grows by 4 per index -- presumably the stack location of the
; corresponding argument; confirm against DECLARE_REG.
DECLARE_REG 0, eax, eax, ax, al,   4
DECLARE_REG 1, ecx, ecx, cx, cl,   8
DECLARE_REG 2, edx, edx, dx, dl,   12
DECLARE_REG 3, ebx, ebx, bx, bl,   16
DECLARE_REG 4, esi, esi, si, null, 20
DECLARE_REG 5, edi, edi, di, null, 24
DECLARE_REG 6, ebp, ebp, bp, null, 28
; Let code written against rsp assemble unchanged in 32-bit mode.
%define rsp esp
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; For every argument index N in the list, define
;   rNm  = memory operand [esp + stack_offset + 4*N + 4]
;   rNmp = the same operand with an explicit dword size
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

; Indices 0-6 are declared through DECLARE_REG; the rest exist only as
; memory operands.
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Function prologue for 32-bit x86.
;   %1  = number of function arguments
;   %2  = number of general-purpose registers requested; silently capped at
;         7, the number of registers in the x86_32 table
;   %3  = number of xmm registers used (not referenced here)
;   %4+ = optional argument names, handed to DEFINE_ARGS
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %if (%2) > 7
        %assign regs_used 7
    %else
        %assign regs_used %2
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; x86_32 epilogue: pop the registers pushed by PROLOGUE (reverse order) and
; return.
%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    ret
%endmacro
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Return from an x86_32 function.  With at most 3 registers in use the
; prologue pushed nothing, so a bare `rep ret` suffices; otherwise use RET.
%macro REP_RET 0
    %if regs_used <= 3
        rep ret
    %else
        RET
    %endif
%endmacro
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif ;======================================================================
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; On every target other than Win64 the xmm spill/restore macros expand to
; nothing, so shared code can invoke them unconditionally.
%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro

    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan;=============================================================================
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan; arch-independent part
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan;=============================================================================
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Alignment (in bytes) applied to every function entry point by cglobal.
%assign function_align 16

; Begin a function.
; Mangles the symbol as required for C linkage and installs a define so that
; later uses of the plain function name resolve to the mangled symbol.
; When cpuflags are active, their suffix is appended to the function name.
;   %1  = function name
;   %2+ = optional PROLOGUE arguments
%macro cglobal 1-2+ ; name, [PROLOGUE args]
    %if %0 > 1
        cglobal_internal %1 %+ SUFFIX, %2
    %else
        cglobal_internal %1 %+ SUFFIX
    %endif
%endmacro
; Worker behind cglobal.
;   %1  = function name with SUFFIX already appended
;   %2+ = optional PROLOGUE arguments
%macro cglobal_internal 1-2+
    ; First time this name is seen: redirect it (and <name>.skip_prologue)
    ; to the mangled symbol, and flag it as known via cglobaled_<name>.
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1

    ; Export the symbol; ELF and Mach-O outputs get restricted visibility.
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf32
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf64
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,macho32
        global %1:private_extern
    %elifidn __OUTPUT_FORMAT__,macho64
        global %1:private_extern
    %else
        global %1
    %endif

    align function_align
    %1:
    RESET_MM_PERMUTATION ; not strictly required, but gives tidier disassembly
    ; Every function starts with a fresh stack-offset count.
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Declare an external symbol that carries the program_name prefix.
; The plain name is redefined to the mangled symbol and flagged through
; cglobaled_<name>, the marker the `call` wrapper checks.
;   %1 = symbol name without the prefix
%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Same as cextern, except that the program_name prefix is not added:
; only mangle() is applied to the name.
;   %1 = symbol name
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Define and export a named constant under the prefixed, mangled symbol.
;   %1  = constant name without the prefix
;   %2+ = data definition placed at the label (everything after the name)
%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; ELF outputs only: emit an empty .note.GNU-stack section.  Without it the
; GNU linker assumes by default that the object needs an executable stack.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; cpuflags
;
; Instruction-set flags.  Each value is its own bit OR-ed with the value of
; the set it builds on, so testing for a flag also tests for its ancestors.

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1)  | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2)  | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3)  | cpuflags_3dnow
%assign cpuflags_sse      (1<<4)  | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5)  | cpuflags_sse
%assign cpuflags_sse2slow (1<<6)  | cpuflags_sse2
%assign cpuflags_sse3     (1<<7)  | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8)  | cpuflags_sse3
%assign cpuflags_sse4     (1<<9)  | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10) | cpuflags_sse4
%assign cpuflags_avx      (1<<11) | cpuflags_sse42
%assign cpuflags_xop      (1<<12) | cpuflags_avx
%assign cpuflags_fma4     (1<<13) | cpuflags_avx

; Independent single-bit flags.
%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

; cpuflag(x) is true when every bit of cpuflags_x is set in the current
; cpuflags value; notcpuflag(x) is its negation.
%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the
; specified cpu.
; You shouldn't need to invoke this macro directly; it is a subroutine of
; INIT_MMX & co.
;   no arguments -> empty SUFFIX, cpuname and cpuflags undefined
;   %1 [, %2]    -> cpuname = %1 or %1_%2, cpuflags = union of the flag
;                   values, SUFFIX = _<cpuname>
%macro INIT_CPUFLAGS 0-2
    %if %0 == 0
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %else
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif

        ; 16-byte registers without SSE2: fall back to the SSE1
        ; single-precision move instructions.
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif

        ; The "aligned" variant turns unaligned moves into aligned ones;
        ; otherwise a first flag of exactly sse3 selects lddqu.
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %endif
%endmacro
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; merge mmx and sse*

; %xdefine the macro whose name is the concatenation %1%2 to the value %3.
; Routing this through a multi-line macro lets the name pieces be macro
; expansions themselves.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; %undef the macro whose name is the concatenation %1%2.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
715233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Select 8-byte MMX registers for the code that follows.
;   %1+ = optional cpuflags, forwarded to INIT_CPUFLAGS
; Maps m0-m7 onto mm0-mm7, removes m8-m15 and points the generic move
; mnemonics (mova/movu/movh/movnta) at their MMX forms.
%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8

    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq

    ; m0..m7 -> mm0..mm7, with the reverse lookup nmmN -> N.
    %assign %%reg 0
    %rep 8
        CAT_XDEFINE m, %%reg, mm %+ %%reg
        CAT_XDEFINE nmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
    ; The counter continues from 8: drop m8..m15 and nmm8..nmm15.
    %rep 8
        CAT_UNDEF m, %%reg
        CAT_UNDEF nmm, %%reg
        %assign %%reg %%reg+1
    %endrep

    INIT_CPUFLAGS %1
%endmacro
738233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Select 16-byte XMM registers for the code that follows.
;   %1+ = optional cpuflags, forwarded to INIT_CPUFLAGS
; Maps m0..m(num_mmregs-1) onto the xmm registers and points the generic move
; mnemonics at their SSE2 integer forms.
%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    ; 16 vector registers in 64-bit mode, 8 otherwise.
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif

    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq

    ; mN -> xmmN, with the reverse lookup nxmmN -> N.
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, xmm %+ %%reg
        CAT_XDEFINE nxmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep

    INIT_CPUFLAGS %1
%endmacro
759233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; FIXME: INIT_AVX can be replaced by INIT_XMM avx
; XMM register setup with avx_enabled forced on and PALIGNR routed to
; PALIGNR_SSSE3.  The overrides must follow INIT_XMM, which resets both
; avx_enabled and RESET_MM_PERMUTATION.
%macro INIT_AVX 0
    INIT_XMM
    %define RESET_MM_PERMUTATION INIT_AVX
    %define PALIGNR PALIGNR_SSSE3
    %assign avx_enabled 1
%endmacro
767233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Select 32-byte YMM registers for the code that follows.
;   %1+ = optional cpuflags, forwarded to INIT_CPUFLAGS
; Maps m0..m(num_mmregs-1) onto the ymm registers, points the generic move
; mnemonics at their AVX forms and removes movh, which gets no definition in
; this mode.
%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    ; 16 vector registers in 64-bit mode, 8 otherwise.
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif

    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps

    ; mN -> ymmN, with the reverse lookup nymmN -> N.
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, ymm %+ %%reg
        CAT_XDEFINE nymm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep

    INIT_CPUFLAGS %1
%endmacro
788233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Default register mode for files that never pick one: XMM, no cpuflags.
INIT_XMM
790233d2500723e5594f3e7c70896ffeeef32b9c950ywan
791233d2500723e5594f3e7c70896ffeeef32b9c950ywan; I often want to use macros that permute their arguments. e.g. there's no
792233d2500723e5594f3e7c70896ffeeef32b9c950ywan; efficient way to implement butterfly or transpose or dct without swapping some
793233d2500723e5594f3e7c70896ffeeef32b9c950ywan; arguments.
794233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
795233d2500723e5594f3e7c70896ffeeef32b9c950ywan; I would like to not have to manually keep track of the permutations:
796233d2500723e5594f3e7c70896ffeeef32b9c950ywan; If I insert a permutation in the middle of a function, it should automatically
797233d2500723e5594f3e7c70896ffeeef32b9c950ywan; change everything that follows. For more complex macros I may also have multiple
798233d2500723e5594f3e7c70896ffeeef32b9c950ywan; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
799233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
800233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
801233d2500723e5594f3e7c70896ffeeef32b9c950ywan; permutes its arguments. It's equivalent to exchanging the contents of the
802233d2500723e5594f3e7c70896ffeeef32b9c950ywan; registers, except that this way you exchange the register names instead, so it
803233d2500723e5594f3e7c70896ffeeef32b9c950ywan; doesn't cost any cycles.
804233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Rename registers according to a list of (destination, source) index pairs:
; for each pair "a, b", m<a> afterwards names the register that m<b> named
; before the call.  All sources are captured first, so pairs may overlap.
;   %1, %2, ... = flat list of index pairs
; The reverse lookup n<register> -> index is refreshed for every destination,
; using the same scheme as SWAP, so SWAP's by-name mode keeps working after a
; PERMUTE.
%macro PERMUTE 2-* ; takes a list of pairs to swap
    ; Pass 1: snapshot the current name of every source register.
    %rep %0/2
        %xdefine tmp%2 m%2
        %rotate 2
    %endrep
    ; Pass 2: assign the snapshots to the destinations and update the
    ; register-name -> index map for the new assignment.
    %rep %0/2
        %xdefine m%1 tmp%2
        CAT_XDEFINE n, m%1, %1
        %undef tmp%2
        %rotate 2
    %endrep
%endmacro
819233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Exchange register names along a chain: SWAP a, b, c swaps a<->b and then
; b<->c.  Often shorter to write than the equivalent PERMUTE pairs.
; Arguments are either register indices ("SWAP 0, 1") or register macros
; ("SWAP m0, m1").
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            ; Index form: exchange m%1 and m%2, then refresh the reverse
            ; lookup (register name -> index) for both.
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; Called as "SWAP m0,m1" rather than "SWAP 0,1": recover the
            ; indices from the reverse lookup first.
            ; Take care with this form inside nested macros: copies of m#
            ; that were already expanded elsewhere are not updated.
            %xdefine %%idx_a n %+ %1
            %xdefine %%idx_b n %+ %2
            %xdefine tmp m %+ %%idx_a
            CAT_XDEFINE m, %%idx_a, m %+ %%idx_b
            CAT_XDEFINE m, %%idx_b, tmp
            CAT_XDEFINE n, m %+ %%idx_a, %%idx_a
            CAT_XDEFINE n, m %+ %%idx_b, %%idx_b
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro
844233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Record the current register permutation under a function's name.
; Placed at the end of a function, it lets later calls to that function load
; the permutation automatically, so values can be returned in mmregs.
;   %1 = optional name to save under; defaults to current_function
; The mapping is stored as <name>_m0 .. <name>_m(num_mmregs-1).
%macro SAVE_MM_PERMUTATION 0-1
    %if %0 == 0
        %xdefine %%prefix current_function %+ _m
    %else
        %xdefine %%prefix %1_m
    %endif
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE %%prefix, %%reg, m %+ %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
860233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Re-install a permutation stored by SAVE_MM_PERMUTATION.
;   %1 = name the permutation was saved under
; Does nothing when no permutation exists for that name (<name>_m0 is not
; defined).
%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%reg 0
        %rep num_mmregs
            ; Restore mN, then the reverse lookup for the register it names.
            CAT_XDEFINE m, %%reg, %1_m %+ %%reg
            CAT_XDEFINE n, m %+ %%reg, %%reg
            %assign %%reg %%reg+1
        %endrep
    %endif
%endmacro
871233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro

; %1 == callee name as written, %2 == callee name with SUFFIX appended.
; Kept as a separate macro: the call emitted below is the real instruction
; because the call macro is still being expanded at that point.
%macro call_internal 2
    %ifdef cglobaled_%1
        %xdefine %%target %1
    %elifdef cglobaled_%2
        %xdefine %%target %2
    %else
        %xdefine %%target %1
    %endif
    call %%target
    ; pick up any mmreg permutation the callee saved for its return values
    LOAD_MM_PERMUTATION %%target
%endmacro
886233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Substitutions that reduce instruction size but are functionally equivalent
; add x, 128 is emitted as sub x, -128: -128 fits in a sign-extended 8-bit
; immediate while +128 does not.
; NOTE(review): the stored result is identical, but the carry flag comes out
; inverted compared to a real add - assumes callers do not consume CF.
%macro add 2
    %assign %%is_128 0
    %ifnum %2
        %if %2==128
            %assign %%is_128 1
        %endif
    %endif
    %if %%is_128
        sub %1, -128
    %else
        add %1, %2
    %endif
%endmacro
899233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; sub x, 128 is emitted as add x, -128: -128 fits in a sign-extended 8-bit
; immediate while +128 does not. Any other operand is passed through as is.
; NOTE(review): the stored result is identical, but the carry flag comes out
; inverted compared to a real sub - assumes callers do not consume CF.
%macro sub 2
    %assign %%is_128 0
    %ifnum %2
        %if %2==128
            %assign %%is_128 1
        %endif
    %endif
    %if %%is_128
        add %1, -128
    %else
        sub %1, %2
    %endif
%endmacro
911233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;=============================================================================
; AVX abstraction layer
;=============================================================================

; Register sizes in bytes, keyed by register name:
; sizeofmm0-7 = 8, sizeofxmm0-15 = 16, sizeofymm0-15 = 32
%assign i 0
%rep 8
    CAT_XDEFINE sizeofmm, i, 8
    %assign i i+1
%endrep
%assign i 0
%rep 16
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i
926233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
; For 32-byte destinations the v-prefixed form is always emitted. For smaller
; ones, a call with a separate src1 (dst != src1) becomes the v-prefixed form
; when AVX is enabled on 16-byte registers, and a register move followed by the
; destructive instruction otherwise. Every other call emits the plain
; instruction.
%macro RUN_AVX_INSTR 6-7+
    ; operand size comes from the destination when it is a plain identifier
    ; (a register name), else from the current mmsize
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        %if %0 >= 7
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        ; move used to copy src1 into dst when emulating the 3-operand form
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            ; a separate src1 was given
            %ifnidn %5, %6
                ; use the size computed above rather than re-deriving it from
                ; the destination, which only works for identifiers
                %if avx_enabled && %%size==16
                    v%1 %5, %6, %7
                %else
                    %if %4 == 3
                        ; the mov below would overwrite src2 before it is read
                        %ifidn %5, %7
                            %error non-avx emulation of %1 %5, %6, %7 is not supported
                        %endif
                    %endif
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                ; dst == src1: the destructive form already does the job
                %1 %5, %7
            %endif
        %elif %3
            ; native form with an immediate: dst, src, imm
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro
971233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
; Likewise, when emulating with dst == src2 the mov into dst would overwrite
; src2 before it is read, so symmetric ops get their sources swapped there too.
;%1-%4 == as for RUN_AVX_INSTR
;%5 == dst, %6 == src1, %7 == src2
;%8 == 1 if symmetric, 0 if not
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        ; src1 is not a register name
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        ; emulation path (mov dst, src1 / op dst, src2)
        %ifnid %7
            %assign %%swap 1
        %endif
        %ifidn %5, %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro
992233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Define a macro named after an instruction that accepts 2 to 5 operands and
; forwards them, together with the properties below, to RUN_AVX_INSTR(1).
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    ; operand slots 3-5 default to fnord when not supplied; slots 6-9 carry
    ; the instruction name and the three property flags given above
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        ; dispatch on how many operands were actually supplied
        %ifnidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %elifnidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %elifnidn %3, fnord
            ; only the 3-operand form is a candidate for source swapping
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %else
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %endif
    %endmacro
%endmacro
1010233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Instruction table. Columns after the name: float, 4-operand (takes an imm8
; after the sources), symmetric.
; Scalar float ops (addsd/addss, maxsd/maxss, minsd/minss, mulsd/mulss) are
; not symmetric: only the lowest element is computed and the remaining
; elements are taken from src1, so swapping the sources changes the result.
; blendpd/blendps and cmppd/cmpps/cmpsd/cmpss take an immediate and are
; therefore 4-operand, which keeps their native (dst, src, imm) form from
; being mistaken for (dst, src1, src2).
; NOTE(review): pcmpestri/pcmpestrm/pcmpistri/pcmpistrm also take an immediate
; but are left as 0, 0, 0; their (dst, src, imm) form is treated as
; (dst, src1, src2), which only produces a valid instruction on the AVX path.
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 0
AVX_INSTR addss, 1, 0, 0
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 1, 0
AVX_INSTR blendps, 1, 1, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 0
AVX_INSTR maxss, 1, 0, 0
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 0
AVX_INSTR minss, 1, 0, 0
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 0
AVX_INSTR mulss, 1, 0, 0
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1
1167233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; base-4 constants for shuffles
; For every 8-bit value i, j is i written as four base-4 digits (one per 2-bit
; field, most significant field first); the name is q followed by j
; zero-padded to four digits, e.g. q3210 is 0xE4.
%assign i 0
%rep 256
    %assign j (i&3) + ((i>>2)&3)*10 + ((i>>4)&3)*100 + ((i>>6)&3)*1000
    %if j >= 1000
        CAT_XDEFINE q, j, i
    %elif j >= 100
        CAT_XDEFINE q0, j, i
    %elif j >= 10
        CAT_XDEFINE q00, j, i
    %else
        CAT_XDEFINE q000, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
1185233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; Define a 4-operand multiply-accumulate macro (dst, src1, src2, src3) that
; emits the v-prefixed instruction when cpuflag(xop) is set and otherwise
; emulates it with a multiply into dst followed by an accumulate of src3.
;%1 == instruction
;%2 == instruction used for the multiply step of the emulation
;%3 == instruction used for the accumulate step of the emulation
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            ; dst == src3: the multiply step would overwrite the addend
            ; before it is accumulated, so refuse instead of emitting wrong code
            %error non-xop emulation of %5 %1, %2, %3, %4 is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
1200