1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*****************************************************************************
2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* x86inc.asm: x264asm abstraction layer
3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*****************************************************************************
4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* Copyright (C) 2005-2012 x264 project
5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*
6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* Authors: Loren Merritt <lorenm@u.washington.edu>
7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*          Anton Mitrofanov <BugMaster@narod.ru>
8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*          Jason Garrett-Glaser <darkshikari@gmail.com>
9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*          Henrik Gramner <hengar-6@student.ltu.se>
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* Permission to use, copy, modify, and/or distribute this software for any
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* purpose with or without fee is hereby granted, provided that the above
13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* copyright notice and this permission notice appear in all copies.
14ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*
15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;*****************************************************************************
23ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
24ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; This is a header file for the x264ASM assembly language, which uses
25ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; NASM/YASM syntax combined with a large number of macros to provide easy
26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; abstraction between different calling conventions (x86_32, win64, linux64).
27ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; It also has various other useful features to simplify writing the kind of
28ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; DSP functions that are most often used in x264.
29ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
30ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Unlike the rest of x264, this file is available under an ISC license, as it
31ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; has significant usefulness outside of x264 and we want it to be available
32ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; to the largest audience possible.  Of course, if you modify it for your own
33ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; purposes to add a new feature, we strongly encourage contributing a patch
34ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; as this feature might be useful for others as well.  Send patches or ideas
35ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; to x264-devel@videolan.org .
36ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
37ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%include "vpx_config.asm"
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
39ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%define program_name vp9
40ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
41ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Calling-convention selection. Both flags stay 0 unless ARCH_X86_64 is set;
; on x86_64 exactly one of them becomes 1.
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    ; The win64, x64 and win32 output formats all select WIN64; every other
    ; output format selects UNIX64.
    %ifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif
55ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; mangle(x): turn a symbol name into the name used in the object file.
; The ELF formats and the 64-bit Windows formats listed here leave the name
; untouched; every other output format gets a leading underscore.
%ifidn   __OUTPUT_FORMAT__,elf
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf32
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,win64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,x64
    %define mangle(x) x
%else
    %define mangle(x) _ %+ x
%endif
69ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; FIXME: All of the 64bit asm functions that take a stride as an argument
71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; via register, assume that the high dword of that register is filled with 0.
72ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; This is true in practice (since we never do any 64bit arithmetic on strides,
73ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; and x264's strides are all positive), but is not guaranteed by the ABI.
74ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; SECTION_RODATA [alignment]
; Switches to the section used for read-only data; alignment defaults to 16.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so every Mach-O output format (macho, macho32, macho64) uses a different
; read-only section: constants are placed in .text.
; aout gets a plain .text section without an align= attribute.
; For the 32-bit Mach-O formats a `fakegot` label is emitted at the switch.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho32
        ; macho32 is a distinct format name (see the ABI_IS_32BIT and GET_GOT
        ; checks); without this branch it would fall through to .rodata.
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
90ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; SECTION_TEXT [alignment]
; Switches to the code section; alignment defaults to 16.
; aout does not support align=, so the attribute is omitted for that format.
%macro SECTION_TEXT 0-1 16
    %ifnidn __OUTPUT_FORMAT__,aout
        SECTION .text align=%1
    %else
        SECTION .text
    %endif
%endmacro
99ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from original code is added in for 64bit.
;
; ABI_IS_32BIT is 1 for the output formats listed below and 0 for all others.
%ifidn __OUTPUT_FORMAT__,elf32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
    %define ABI_IS_32BIT 1
%else
    %define ABI_IS_32BIT 0
%endif
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
; GET_GOT reg    : load a base address into reg (saving its old value first)
;                  and define GLOBAL()/RESTORE_GOT to match.
; GLOBAL(x)      : operand expression for addressing the symbol x.
; RESTORE_GOT    : undo the push performed by GET_GOT.
; WRT_PLT        : suffix for calls that must go through the PLT.
%if ABI_IS_32BIT
  ; 32-bit formats: only elf32 and macho32 get a real GET_GOT, and only when
  ; CONFIG_PIC is 1. Everything else is left undefined here.
  %if CONFIG_PIC == 1
    %ifidn __OUTPUT_FORMAT__,elf32
      %define GET_GOT_SAVE_ARG 1
      %define WRT_PLT wrt ..plt
      %macro GET_GOT 1
        extern _GLOBAL_OFFSET_TABLE_
        push %1                     ; popped again by RESTORE_GOT
        call %%fetch_pc
        %%ret_addr:
        jmp %%done
        %%fetch_pc:
        mov %1, [esp]               ; return address pushed by the call, i.e. %%ret_addr
        add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%ret_addr wrt ..gotpc
        ret
        %%done:
        %undef GLOBAL
        %define GLOBAL(x) x + %1 wrt ..gotoff
        %undef RESTORE_GOT
        %define RESTORE_GOT pop %1
      %endmacro
    %elifidn __OUTPUT_FORMAT__,macho32
      %define GET_GOT_SAVE_ARG 1
      %macro GET_GOT 1
        push %1                     ; popped again by RESTORE_GOT
        call %%pc_anchor
        %%pc_anchor:
        pop  %1                     ; %1 = address of %%pc_anchor
        %undef GLOBAL
        %define GLOBAL(x) x + %1 - %%pc_anchor
        %undef RESTORE_GOT
        %define RESTORE_GOT pop %1
      %endmacro
    %endif
  %endif

  ; PIC is never left defined when ARCH_X86_64 is 0.
  %if ARCH_X86_64 == 0
    %undef PIC
  %endif

%else
  ; Remaining formats: GET_GOT expands to nothing and GLOBAL() uses
  ; `rel` addressing.
  %macro GET_GOT 1
  %endmacro
  %define GLOBAL(x) rel x
  %define WRT_PLT wrt ..plt

  ; PIC is defined for WIN64, for macho64, and whenever CONFIG_PIC is set.
  %if WIN64
    %define PIC
  %elifidn __OUTPUT_FORMAT__,macho64
    %define PIC
  %elif CONFIG_PIC
    %define PIC
  %endif
%endif
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
; Fallbacks for anything the ABI-specific section above left undefined:
; WRT_PLT and RESTORE_GOT expand to nothing, GET_GOT is an empty macro and
; GLOBAL(x) is just x.
%ifndef WRT_PLT
    %define WRT_PLT
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif
%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif

; With PIC defined, make relative addressing the default.
%ifdef PIC
    default rel
%endif
; Done with PIC macros
185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
; __NASM_VER__ being defined selects the smartalign package; otherwise the
; `CPU amdnop` directive is used.
%ifdef __NASM_VER__
    %use smartalign
    ALIGNMODE k7
%else
    CPU amdnop
%endif
193ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
194ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Macros to eliminate most code duplication between x86_32 and x86_64:
195ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Currently this works only for leaf functions which load all their arguments
196ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; into registers at the start, and make no other use of the stack. Luckily that
197ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; covers most of x264's asm.
198ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
199ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; PROLOGUE:
200ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; %1 = number of arguments. loads them from stack if needed.
201ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; %2 = number of registers used. pushes callee-saved regs if needed.
202ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
203ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; %4 = list of names to define to registers
204ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; PROLOGUE can also be invoked by adding the same options to cglobal
205ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
206ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; e.g.
207ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; cglobal foo, 2,3,0, dst, src, tmp
208ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
209ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.
213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; RET:
215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Pops anything that was pushed by PROLOGUE, and returns.
216ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
217ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; REP_RET:
218ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
219ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; which are slow when a normal ret follows a branch.
220ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
221ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; registers:
222ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; rN and rNq are the native-size register holding function argument N
223ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; rNd, rNw, rNb are dword, word, and byte size
224ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; rNm is the original location of arg N (a register or on the stack), dword
225ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; rNmp is native size
226ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; DECLARE_REG n, reg64, reg32, reg16, reg8 [, arg_stack_offset]
; Defines r<n> and r<n>q/d/w/b as the given registers, plus r<n>m / r<n>mp
; describing where argument n originally lives:
;   5 parameters - the argument is in the register itself;
;   6 parameters - the argument is in memory at
;                  [stack pointer + stack_offset + arg_stack_offset].
; All definitions are lazy (%define), so stack_offset is resolved at the
; point of use, not at declaration.
%macro DECLARE_REG 5-6
    %define r%1  %2
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %if %0 == 6 ; memory
        %if ARCH_X86_64
            %define r%1m [rsp + stack_offset + %6]
            %define r%1mp qword r %+ %1m
        %else
            %define r%1m [esp + stack_offset + %6]
            %define r%1mp dword r %+ %1m
        %endif
    %else ; register
        %define r%1m  %3
        %define r%1mp %2
    %endif
%endmacro
244ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; DECLARE_REG_SIZE reg16, reg8
; For a 16-bit register name such as `ax`, defines size-suffixed aliases on
; both the r- and e-prefixed names (raxq/eaxq, raxd/eaxd, raxw/eaxw,
; raxb/eaxb). When ARCH_X86_64 is 0 the r-prefixed name itself is mapped to
; the e-prefixed register.
%macro DECLARE_REG_SIZE 2
    ; byte
    %define r%1b %2
    %define e%1b %2
    ; word
    %define r%1w %1
    %define e%1w %1
    ; dword
    %define r%1d e%1
    %define e%1d e%1
    ; native / qword
    %define r%1q r%1
    %define e%1q r%1
%if ARCH_X86_64 == 0
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP a, b, c, ...
; Maps t0 to r<a>, t1 to r<b>, t2 to r<c>, and so on, via CAT_XDEFINE.
%macro DECLARE_REG_TMP 1-*
    %assign %%slot 0
    %rep %0
        CAT_XDEFINE t, %%slot, r%1
        %assign %%slot %%slot+1
        %rotate 1
    %endrep
%endmacro
277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; DECLARE_REG_TMP_SIZE n, ...
; For each index n, defines t<n>b/w/d/q as "t<n>" with the size suffix pasted
; on at the point of use, so they follow whatever t<n> currently maps to.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1b t%1 %+ b
        %define t%1w t%1 %+ w
        %define t%1d t%1 %+ d
        %define t%1q t%1 %+ q
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; gprsize: size in bytes of a general-purpose register, used by PUSH/POP to
; track stack_offset.
%if ARCH_X86_64 == 0
    %define gprsize 4
%else
    %define gprsize 8
%endif
295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; PUSH x: push x and record the extra gprsize bytes in stack_offset.
; The push is emitted before stack_offset changes, so an operand that
; refers to stack_offset still sees the pre-push value.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset + gprsize
%endmacro
300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; POP x: pop into x and remove gprsize bytes from stack_offset.
%macro POP 1
    pop %1
    %assign stack_offset stack_offset - gprsize
%endmacro
305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; PUSH_IF_USED n, ...
; For each listed register index below regs_used, PUSH r<n> (which also
; updates stack_offset).
%macro PUSH_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro
314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; POP_IF_USED n, ...
; For each listed register index below regs_used, pop r<n>. This uses the
; plain `pop` instruction, not the POP macro, so stack_offset is left
; unchanged.
%macro POP_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro
323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; LOAD_IF_USED n, ...
; For each listed argument index below num_args, load the argument from its
; original location (r<n>mp) into its register r<n>.
%macro LOAD_IF_USED 1-*
    %rep %0
        %if num_args > %1
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro
332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; SUB dst, src: emit `sub dst, src`; when dst is rsp, also grow stack_offset
; by src so that stack_offset-based operands (r<n>m) keep pointing at the
; same memory.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset + (%2)
    %endif
%endmacro
339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; ADD dst, src: emit `add dst, src`; when dst is rsp, also shrink
; stack_offset by src (the counterpart of SUB).
%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset - (%2)
    %endif
%endmacro
346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; movifnidn dst, src: `mov dst, src`, omitted when both operands expand to
; identical text.
%macro movifnidn 2
    %ifidn %1, %2
        ; same operand: nothing to emit
    %else
        mov %1, %2
    %endif
%endmacro
352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; movsxdifnidn dst, src: `movsxd dst, src`, omitted when both operands expand
; to identical text.
%macro movsxdifnidn 2
    %ifidn %1, %2
        ; same operand: nothing to emit
    %else
        movsxd %1, %2
    %endif
%endmacro
358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; ASSERT expr
; Assemble-time assertion: raises a preprocessor %error when expr evaluates
; to 0. The failing expression is included in the message so the diagnostic
; identifies which assertion fired.
%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro
364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; DEFINE_ARGS name0, name1, ...
; Gives register r<i> the symbolic name name<i>: defines <name>q/d/w/b/m/mp as
; the current expansions of r<i>q/d/w/b/m/mp. Names installed by a previous
; DEFINE_ARGS are undefined first. The list is remembered in arg_name<i> and
; n_arg_names for that purpose.
%macro DEFINE_ARGS 0-*
    ; Remove the aliases from the previous invocation, if there was one.
    ; The size-suffixed aliases must go before arg_name<i> itself, because
    ; they are located through it.
    %ifdef n_arg_names
        %assign %%idx 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%idx, q
            CAT_UNDEF arg_name %+ %%idx, d
            CAT_UNDEF arg_name %+ %%idx, w
            CAT_UNDEF arg_name %+ %%idx, b
            CAT_UNDEF arg_name %+ %%idx, m
            CAT_UNDEF arg_name %+ %%idx, mp
            CAT_UNDEF arg_name, %%idx
            %assign %%idx %%idx+1
        %endrep
    %endif

    %xdefine %%saved_stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%idx 0
    %rep %0
        %xdefine %1q r %+ %%idx %+ q
        %xdefine %1d r %+ %%idx %+ d
        %xdefine %1w r %+ %%idx %+ w
        %xdefine %1b r %+ %%idx %+ b
        %xdefine %1m r %+ %%idx %+ m
        %xdefine %1mp r %+ %%idx %+ mp
        CAT_XDEFINE arg_name, %%idx, %1
        %assign %%idx %%idx+1
        %rotate 1
    %endrep
    ; Put stack_offset back and record how many names are now installed.
    %xdefine stack_offset %%saved_stack_offset
    %assign n_arg_names %0
%endmacro
397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if WIN64 ; Windows x64 ;=================================================
399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Register assignment for WIN64.
; Columns: index, 64-bit, 32-bit, 16-bit, 8-bit name, and (for arguments that
; arrive in memory) the argument's offset from the stack pointer at entry.
; r0-r3 have no offset: those arguments arrive in the registers themselves.
; Offsets start at 40 and advance by 8 per argument.
DECLARE_REG 0,  rcx, ecx,  cx,   cl
DECLARE_REG 1,  rdx, edx,  dx,   dl
DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
DECLARE_REG 6,  rax, eax,  ax,   al,   56
DECLARE_REG 7,  rdi, edi,  di,   dil,  64
DECLARE_REG 8,  rsi, esi,  si,   sil,  72
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
DECLARE_REG 11, R12, R12D, R12W, R12B, 96
DECLARE_REG 12, R13, R13D, R13W, R13B, 104
DECLARE_REG 13, R14, R14D, R14W, R14B, 112
DECLARE_REG 14, R15, R15D, R15W, R15B, 120
415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; PROLOGUE (WIN64 variant)
;   %1 = number of arguments
;   %2 = number of general-purpose registers used (>= %1, at most 15)
;   %3 = number of xmm registers used (defaults to 0)
;   %4 = optional list of names to attach to the registers
; Steps, in order: push r7-r14 where used, spill xmm registers (skipped when
; mmsize is 8), load arguments 4 and up from memory into their registers,
; then install the argument names.
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args  %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize != 8
        WIN64_SPILL_XMM %3
    %else
        %assign xmm_regs_used 0
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Record %1 as the number of xmm registers in use and, when it exceeds 6,
; reserve stack space and store xmm6..xmm(%1-1) there (highest register first).
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        ; One 16-byte slot per saved register plus 16 spare bytes; the
        ; (~stack_offset&8) term below shifts the slots by 0 or 8 bytes
        ; (presumably to keep the movdqa accesses 16-byte aligned).
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%reg xmm_regs_used-1
        %rep (xmm_regs_used-6)
            movdqa [rsp + (%%reg-6)*16+(~stack_offset&8)], xmm %+ %%reg
            %assign %%reg %%reg-1
        %endrep
    %endif
%endmacro
443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Reload xmm6..xmm(xmm_regs_used-1) from the save area addressed through %1,
; then release that area with a plain `add`. stack_offset is not modified here.
; Does nothing when xmm_regs_used <= 6.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%reg xmm_regs_used-1
        %rep (xmm_regs_used-6)
            movdqa xmm %+ %%reg, [%1 + (%%reg-6)*16+(~stack_offset&8)]
            %assign %%reg %%reg-1
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Restore the saved xmm registers ahead of the function's end and bring the
; stack_offset bookkeeping back in line with the released save area.
;   %1 = register addressing the save area (see WIN64_RESTORE_XMM_INTERNAL)
; Afterwards xmm_regs_used is 0, so a later RET restores nothing further.
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        ; Subtract the same size WIN64_RESTORE_XMM_INTERNAL just added back to
        ; %1. The size must be parenthesised as a whole, otherwise its trailing
        ; +16 is added to stack_offset instead of subtracted. With 6 or fewer
        ; registers nothing was reserved, so stack_offset must stay as it is.
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Win64 epilogue: reload spilled xmm registers from the stack, hand r14..r7 to
; POP_IF_USED (reverse of the PROLOGUE push order), then return.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    ret
%endmacro
466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Return with a bare `rep ret` when the epilogue has nothing to restore
; (at most 7 GPRs and at most 6 xmm registers in use); otherwise use RET.
%macro REP_RET 0
    %if regs_used <= 7 && xmm_regs_used <= 6
        rep ret
    %else
        RET
    %endif
%endmacro
474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%elif ARCH_X86_64 ; *nix x64 ;=============================================
476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Register numbering for *nix x86-64: index, 64/32/16/8-bit register names and,
; from r6 on, a trailing byte offset (presumably the stack slot the value is
; loaded from -- DECLARE_REG is defined outside this section, confirm there).
DECLARE_REG 0,  rdi, edi,  di,   dil
DECLARE_REG 1,  rsi, esi,  si,   sil
DECLARE_REG 2,  rdx, edx,  dx,   dl
DECLARE_REG 3,  rcx, ecx,  cx,   cl
DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
DECLARE_REG 6,  rax, eax,  ax,   al,   8
DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
DECLARE_REG 11, R12, R12D, R12W, R12B, 48
DECLARE_REG 12, R13, R13D, R13W, R13B, 56
DECLARE_REG 13, R14, R14D, R14W, R14B, 64
DECLARE_REG 14, R15, R15D, R15W, R15B, 72
492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; *nix x86-64 function prologue.
;   %1  = number of arguments
;   %2  = number of general-purpose registers used (must be >= %1 and <= 15)
;   %3  = number of xmm registers used (accepted but not used on this target)
;   %4+ = argument names, forwarded to DEFINE_ARGS
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    ; r9..r14 are the registers that RET hands to POP_IF_USED again
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ; r6 and up are the registers declared with a stack offset in this section
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; *nix x86-64 epilogue: hand r14..r9 to POP_IF_USED (reverse of the PROLOGUE
; push order), then return.
%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    ret
%endmacro
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Return with a bare `rep ret` when at most 9 GPRs are in use (nothing for RET
; to pop); otherwise go through RET.
%macro REP_RET 0
    %if regs_used <= 9
        rep ret
    %else
        RET
    %endif
%endmacro
515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%else ; X86_32 ;==============================================================
517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Register numbering for x86-32: index, full/32/16/8-bit register names and a
; trailing byte offset (4, 8, ... 28). `null` marks registers declared without
; an 8-bit name here (esi, edi, ebp).
DECLARE_REG 0, eax, eax, ax, al,   4
DECLARE_REG 1, ecx, ecx, cx, cl,   8
DECLARE_REG 2, edx, edx, dx, dl,   12
DECLARE_REG 3, ebx, ebx, bx, bl,   16
DECLARE_REG 4, esi, esi, si, null, 20
DECLARE_REG 5, edi, edi, di, null, 24
DECLARE_REG 6, ebp, ebp, bp, null, 28
; let arch-independent code spell the stack pointer as rsp
%define rsp esp
526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; For every index N given, define
;   rNm  = memory operand [esp + stack_offset + 4*N + 4]
;   rNmp = the same operand with an explicit dword size
; stack_offset is resolved when rNm is used, not when it is defined.
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

; indices 7..14 have no DECLARE_REG entry on x86-32, so declare them here
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; x86-32 function prologue.
;   %1  = number of arguments
;   %2  = number of general-purpose registers requested; values above 7 are
;         clamped to 7 (only r0..r6 are declared for this target)
;   %3  = number of xmm registers used (accepted but not used on this target)
;   %4+ = argument names, forwarded to DEFINE_ARGS
; The clamp happens before the check, so %1 itself must not exceed 7.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    ; r3..r6 are the registers that RET hands to POP_IF_USED again
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; x86-32 epilogue: hand r6..r3 to POP_IF_USED (reverse of the PROLOGUE push
; order), then return.
%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    ret
%endmacro
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Return with a bare `rep ret` when at most 3 GPRs are in use (nothing for RET
; to pop); otherwise go through RET.
%macro REP_RET 0
    %if regs_used <= 3
        rep ret
    %else
        RET
    %endif
%endmacro
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif ;======================================================================
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Outside Win64 the xmm spill/restore helpers expand to nothing, so callers can
; invoke them unconditionally.
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;=============================================================================
572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; arch-independent part
573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;=============================================================================
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; alignment applied to every function entry emitted by cglobal_internal
%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage and sets up a define so that
; later uses of the function name automatically refer to the mangled version.
; SUFFIX (the cpuflags suffix, empty when none is selected) is appended to the
; name before it is handed to cglobal_internal.
;   %1  = function name
;   %2+ = optional PROLOGUE arguments
%macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 > 1
    cglobal_internal %1 %+ SUFFIX, %2
%else
    cglobal_internal %1 %+ SUFFIX
%endif
%endmacro
; Worker behind cglobal.
;   %1  = function name, already carrying the cpuflags suffix
;   %2+ = optional PROLOGUE arguments
%macro cglobal_internal 1-2+
    ; First sighting of this name: redirect it (and name.skip_prologue) to the
    ; mangled symbol and remember that this has been done.
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    ; Export the symbol with hidden/private visibility where the output format
    ; has a syntax for it, as a plain global otherwise.
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf32
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf64
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,macho32
        global %1:private_extern
    %elifidn __OUTPUT_FORMAT__,macho64
        global %1:private_extern
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    ; nothing has been pushed yet at the function's entry point
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Declare an external symbol: %1 is redirected to mangle(program_name_%1),
; marked as already handled (cglobaled_<name>), and declared extern.
%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro
622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Same as cextern, except that program_name is not prepended to the symbol.
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro
629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Define an exported data label.
;   %1  = name; redirected to mangle(program_name_%1) and declared global
;   %2+ = data definition placed right after the label
%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro
635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Needed for every ELF flavour: without this section the GNU linker assumes
; that the stack must be executable.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; cpuflags
;
; Bits 0-13: instruction-set levels. Each level ORs in the level it builds on,
; so testing for a level also tests for everything beneath it.

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx

; Bits 16-22: standalone flags that imply nothing else.
%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

; cpuflag(x):    true when every bit of cpuflags_x is set in cpuflags
; notcpuflag(x): the exact negation of cpuflag(x)
%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the
; specified cpu. With no arguments, SUFFIX becomes empty and cpuname/cpuflags
; are undefined again.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    %if %0 == 0
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %else
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            ; second flag: joined into the name, ORed into the flag set
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        ; 16-byte vectors on a target without sse2: use the sse1 (ps) moves
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %endif
%endmacro
705ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
706ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; merge mmx and sse*
707ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; %xdefine the macro whose name is %1 and %2 pasted together, with value %3.
; All three arguments are macro-expanded before the paste takes place.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro
711ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; %undef the macro whose name is %1 and %2 pasted together.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
715ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Select 8-byte mmx registers for the code that follows.
;   %1+ = optional cpuflags, forwarded to INIT_CPUFLAGS
%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    ; generic move mnemonics for this register size
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    ; m0..m7 -> mm0..mm7, plus the reverse map nmm<i> -> i
    %assign %%idx 0
    %rep 8
    CAT_XDEFINE m, %%idx, mm %+ %%idx
    CAT_XDEFINE nmm, %%idx, %%idx
    %assign %%idx %%idx+1
    %endrep
    ; the index keeps counting: drop any m8..m15 / nmm8..nmm15 definitions
    %rep 8
    CAT_UNDEF m, %%idx
    CAT_UNDEF nmm, %%idx
    %assign %%idx %%idx+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro
738ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Select 16-byte xmm registers for the code that follows.
;   %1+ = optional cpuflags, forwarded to INIT_CPUFLAGS
%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    ; 16 vector registers on x86-64, 8 otherwise
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    ; generic move mnemonics for this register size
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    ; m<i> -> xmm<i>, plus the reverse map nxmm<i> -> i
    %assign %%idx 0
    %rep num_mmregs
    CAT_XDEFINE m, %%idx, xmm %+ %%idx
    CAT_XDEFINE nxmm, %%idx, %%idx
    %assign %%idx %%idx+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro
759ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; xmm register setup with avx_enabled forced on and PALIGNR mapped to its
; SSSE3 implementation.
; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    ; everything below overrides what INIT_XMM has just set
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro
767ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Select 32-byte ymm registers for the code that follows.
;   %1+ = optional cpuflags, forwarded to INIT_CPUFLAGS
%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    ; 16 vector registers on x86-64, 8 otherwise
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    ; generic move mnemonics for this register size; movh has no ymm mapping
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    ; m<i> -> ymm<i>, plus the reverse map nymm<i> -> i
    %assign %%idx 0
    %rep num_mmregs
    CAT_XDEFINE m, %%idx, ymm %+ %%idx
    CAT_XDEFINE nymm, %%idx, %%idx
    %assign %%idx %%idx+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; the file-wide default is plain xmm with no cpuflags selected
INIT_XMM
790ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
791ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; I often want to use macros that permute their arguments. e.g. there's no
792ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; efficient way to implement butterfly or transpose or dct without swapping some
793ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; arguments.
794ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;
795ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; I would like to not have to manually keep track of the permutations:
796ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; If I insert a permutation in the middle of a function, it should automatically
797ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; change everything that follows. For more complex macros I may also have multiple
798ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
799ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;
800ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
801ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; permutes its arguments. It's equivalent to exchanging the contents of the
802ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; registers, except that this way you exchange the register names instead, so it
803ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; doesn't cost any cycles.
804ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; PERMUTE dst0, src0 [, dst1, src1 ...]
; For every pair, m<dst> takes over the register that m<src> named before the
; call. All sources are captured before any destination is rewritten, so the
; pairs may form cycles (PERMUTE 0,1, 1,0 swaps m0 and m1).
; The reverse map n<register> -> index is refreshed the same way SWAP does it,
; so a later name-form "SWAP m0, m1" resolves to the right indices.
; Arguments must be plain register indices.
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    ; snapshot the current register behind each source index
    %xdefine tmp%2 m%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    ; m%1 is expanded to its new register before the paste: n<reg> = %1
    CAT_XDEFINE n, m%1, %1
    %undef tmp%2
    %rotate 2
%endrep
%endmacro
819ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; SWAP a, b [, c ...]
; Exchanges register names along a chain: a<->b, then b<->c, and so on
; (sometimes more concise than PERMUTE's pairs). The reverse map
; n<register> -> index is kept in step with every exchange.
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    ; index form ("SWAP 0,1"): exchange m%1 and m%2 directly
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; Name form ("SWAP m0,m1"): the arguments have already expanded to register
    ; names, so recover the indices through the reverse map first.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%idx_a n %+ %1
    %xdefine %%idx_b n %+ %2
    %xdefine tmp m %+ %%idx_a
    CAT_XDEFINE m, %%idx_a, m %+ %%idx_b
    CAT_XDEFINE m, %%idx_b, tmp
    CAT_XDEFINE n, m %+ %%idx_a, %%idx_a
    CAT_XDEFINE n, m %+ %%idx_b, %%idx_b
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
844ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Snapshot the current m0..m(num_mmregs-1) mapping as <name>_m0, <name>_m1, ...
; where <name> is %1 if given and current_function otherwise.
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0 == 0
        %xdefine %%prefix current_function %+ _m
    %else
        %xdefine %%prefix %1_m
    %endif
    %assign %%idx 0
    %rep num_mmregs
        CAT_XDEFINE %%prefix, %%idx, m %+ %%idx
    %assign %%idx %%idx+1
    %endrep
%endmacro
860ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; LOAD_MM_PERMUTATION name
; Reinstate a register permutation that was stored under "<name>_m#".
; For every register index both the forward mapping (m#) and the reverse
; lookup (n<register>) are redefined. When <name>_m0 is not defined nothing
; was saved under that name and the current permutation is left untouched.
%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%reg 0
        %rep num_mmregs
            CAT_XDEFINE m, %%reg, %1_m %+ %%reg
            CAT_XDEFINE n, m %+ %%reg, %%reg
            %assign %%reg %%reg+1
        %endrep
    %endif
%endmacro
871ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro

; call_internal plain_name, suffixed_name
; Picks the call target, emits the call and then loads whatever mmreg
; permutation was saved for that target.
; The plain name is used unless only the suffixed name has been cglobal'ed.
%macro call_internal 2
    %ifdef cglobaled_%1
        %xdefine %%target %1
    %elifdef cglobaled_%2
        %xdefine %%target %2
    %else
        %xdefine %%target %1
    %endif
    call %%target
    LOAD_MM_PERMUTATION %%target
%endmacro
886ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Substitutions that reduce instruction size but are functionally equivalent

; add dst, 128 is emitted as sub dst, -128: -128 fits in a sign-extended
; 8-bit immediate whereas +128 does not. Every other operand combination is
; passed through to the plain add instruction.
%macro add 2
    %assign %%is_128 0
    %ifnum %2
        %if %2==128
            %assign %%is_128 1
        %endif
    %endif
    %if %%is_128
        sub %1, -128
    %else
        add %1, %2
    %endif
%endmacro
899ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; sub dst, 128 is emitted as add dst, -128: -128 fits in a sign-extended
; 8-bit immediate whereas +128 does not. Every other operand combination is
; passed through to the plain sub instruction.
%macro sub 2
    %assign %%is_128 0
    %ifnum %2
        %if %2==128
            %assign %%is_128 1
        %endif
    %endif
    %if %%is_128
        add %1, -128
    %else
        sub %1, %2
    %endif
%endmacro
911ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;=============================================================================
; AVX abstraction layer
;=============================================================================

; Register widths in bytes, looked up by RUN_AVX_INSTR through sizeof<reg>:
; sizeofmm0..7 = 8, sizeofxmm0..15 = 16, sizeofymm0..15 = 32.
%assign i 0
%rep 8
    CAT_XDEFINE sizeofmm, i, 8
    %assign i i+1
%endrep

%assign i 0
%rep 16
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i
926ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Guard for the SSE/MMX emulation of non-destructive (AVX-style) instructions.
; The emulation is "mov dst, src1" followed by "op dst, src2[, ...]", so if dst
; is also one of the operands read by the second instruction its value has
; already been overwritten by the mov and the result would be silently wrong.
; Such requests are rejected at assembly time instead.
;%1 == text of the offending instruction (only used in the error message)
;%2 == destination operand
;%3+: operands that are read after dst has been overwritten
%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

; Emit one instruction written in AVX operand order, either as the VEX-encoded
; v<instruction> or as a legacy instruction preceded by a register move.
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    ; Operand width comes from the destination register when it is an
    ; identifier, otherwise from the current mmsize.
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        ; ymm operands only exist in VEX encoding: never emulate.
        %if %0 >= 7
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        ; Move used to copy src1 into dst when emulating a 3-operand form.
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            ; Non-destructive form: a separate destination was given.
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    ; dst must not alias any operand read after the move.
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                ; dst == src1: the legacy destructive form already matches.
                %1 %5, %7
            %endif
        %elif %3
            ; Native destructive form of an instruction with an immediate.
            %1 %5, %6, %7
        %else
            ; Native 2-operand form.
            %1 %5, %6
        %endif
    %endif
%endmacro
971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
; The SSE emulation additionally swaps when dst is the same register as src2:
; "mov dst, src1 / op dst, dst" would have lost the original dst value, while
; the swapped form needs no mov at all and collapses to "op dst, src1".
;%1-%4 == forwarded unchanged to RUN_AVX_INSTR (instruction, float, 4-operand, operand count)
;%5 == dst, %6 == src1, %7 == src2
;%8 == 1 if the instruction is symmetric, 0 if not
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        ; VEX encoding: a non-register (memory) operand has to be src2.
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        ; Emulation with a mov: prefer the memory operand as the mov source.
        %ifnid %7
            %assign %%swap 1
        %endif
        ; Emulation with dst == src2: swapping avoids clobbering src2.
        %ifidn %5, %7
            %assign %%swap 1
        %endif
    %endif
    ; Only 3-operand symmetric instructions may have their sources exchanged.
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro
992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Define a macro named after an instruction that accepts 2 to 5 operands in
; AVX order and forwards them, together with the instruction's properties,
; to RUN_AVX_INSTR / RUN_AVX_INSTR1.
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
;
; In the generated macro, operands 3-5 default to the placeholder "fnord" so
; that the number of operands actually supplied can be detected; parameters
; 6-9 carry the four properties listed above.
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifnidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %elifnidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %elifnidn %3, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %else
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %endif
    %endmacro
%endmacro
1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; Instruction table: AVX_INSTR name, float, 4-operand, symmetric
;
; Scalar ops (*sd / *ss) are NOT symmetric: the untouched upper elements of
; the result are copied from src1, so exchanging the sources changes them.
; Instructions that carry an imm8 (blend*, cmp*, dp*, shufps, palignr,
; pblendw, mpsadbw) are flagged 4-operand so that their native
; "op dst, src, imm" form is passed through instead of being mistaken for a
; 3-operand "op dst, src1, src2".
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 0
AVX_INSTR addss, 1, 0, 0
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 1, 0
AVX_INSTR blendps, 1, 1, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 0
AVX_INSTR maxss, 1, 0, 0
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 0
AVX_INSTR minss, 1, 0, 0
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 0
AVX_INSTR mulss, 1, 0, 0
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1
1167ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; base-4 constants for shuffles
; For every 8-bit value i a symbol "q" followed by the four base-4 digits of i
; (most significant first, zero-padded to four digits) is defined as i,
; e.g. q3120 = (3<<6) | (1<<4) | (2<<2) | 0.
%assign i 0
%rep 256
    ; j holds the base-4 digits of i written out as a decimal number.
    %assign j (i&3) + 10*((i>>2)&3) + 100*((i>>4)&3) + 1000*((i>>6)&3)
    ; Supply the leading zeros that the numeric value of j has dropped.
    %if j >= 1000
        CAT_XDEFINE q, j, i
    %elif j >= 100
        CAT_XDEFINE q0, j, i
    %elif j >= 10
        CAT_XDEFINE q00, j, i
    %else
        CAT_XDEFINE q000, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
1185ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
; FMA_INSTR name, mul_instruction, add_instruction
; Defines a macro "name dst, src1, src2, acc".
; With XOP available it emits the single instruction v<name>; otherwise the
; operation is emulated as "mul dst, src1, src2" followed by "add dst, acc".
; The emulation writes dst before acc is read, so it cannot be used when dst
; and acc are the same register; that combination is rejected with an
; assembly-time error instead of silently producing a wrong sum.
;%1 == name of the fused instruction
;%2 == instruction used for the multiply step of the emulation
;%3 == instruction used for the accumulate step of the emulation
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
1200