;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.
; Send patches or ideas to x264-devel@videolan.org .

%include "vpx_config.asm"

%define program_name vp9

; Calling-convention selectors. Both default to 0; when ARCH_X86_64 is set,
; exactly one of them is switched to 1 according to the output format.
; Note that win32 is also treated as WIN64 whenever ARCH_X86_64 is set.
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win64
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,win32
        %define WIN64 1
    %else
        %define UNIX64 1
    %endif
%endif

; mangle(x): the symbol name as it must be spelled for the current object
; format. The ELF flavours and the 64-bit Windows formats use the name as-is;
; every other format gets a leading underscore.
%ifidn __OUTPUT_FORMAT__,win64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,x64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf32
    %define mangle(x) x
%else
    %define mangle(x) _ %+ x
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
; SECTION_RODATA [align]
;   Switches to the section used for read-only data.
;   %1 = requested alignment (defaults to 16).
;   aout and both macho formats place the data in .text; the aout branch
;   passes no align attribute, and the 32-bit macho branch also emits a
;   `fakegot` label at the switch point. Every other format uses .rodata.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; SECTION_TEXT [align]
;   Switches to the code section.
;   %1 = requested alignment (defaults to 16); dropped for aout, which does
;   not support align=.
%macro SECTION_TEXT 0-1 16
    %ifnidn __OUTPUT_FORMAT__,aout
        SECTION .text align=%1
    %else
        SECTION .text
    %endif
%endmacro

; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from original code is added in for 64bit.
; ABI_IS_32BIT: 1 for the 32-bit object formats listed below, 0 otherwise.
%define ABI_IS_32BIT 0
%ifidn __OUTPUT_FORMAT__,elf32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
    %define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
    %define ABI_IS_32BIT 1
%endif

%if ABI_IS_32BIT
    %if CONFIG_PIC=1
        %ifidn __OUTPUT_FORMAT__,elf32
            %define GET_GOT_SAVE_ARG 1
            %define WRT_PLT wrt ..plt
            ; GET_GOT reg
            ;   Saves reg on the stack, then loads it with the GOT address:
            ;   the call pushes the address of %%ret_addr, the helper reads
            ;   it back from [esp], adds the ..gotpc displacement and returns.
            ;   Afterwards GLOBAL(x) addresses x relative to reg and
            ;   RESTORE_GOT pops the saved value back into reg.
            %macro GET_GOT 1
                extern _GLOBAL_OFFSET_TABLE_
                push %1
                call %%get_got
                %%ret_addr:
                jmp %%done
                %%get_got:
                mov %1, [esp]
                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%ret_addr wrt ..gotpc
                ret
                %%done:
                %undef GLOBAL
                %define GLOBAL(x) x + %1 wrt ..gotoff
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
            %endmacro
        %elifidn __OUTPUT_FORMAT__,macho32
            %define GET_GOT_SAVE_ARG 1
            ; GET_GOT reg
            ;   Saves reg on the stack, then loads it with the address of the
            ;   local %%get_got label (call to the next instruction + pop).
            ;   GLOBAL(x) then addresses x relative to that label and
            ;   RESTORE_GOT pops the saved value back into reg.
            %macro GET_GOT 1
                push %1
                call %%get_got
                %%get_got:
                pop %1
                %undef GLOBAL
                %define GLOBAL(x) x + %1 - %%get_got
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
            %endmacro
        %endif
    %endif

    ; PIC is never left defined here unless ARCH_X86_64 is set.
    %if ARCH_X86_64 == 0
        %undef PIC
    %endif

%else
    ; Non-32-bit ABI: GET_GOT expands to nothing and GLOBAL(x) uses
    ; `rel` addressing.
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) rel x
    %define WRT_PLT wrt ..plt

    %if WIN64
        %define PIC
    %elifidn __OUTPUT_FORMAT__,macho64
        %define PIC
    %elif CONFIG_PIC
        %define PIC
    %endif
%endif

; Fallbacks for whatever the branches above left undefined.
%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif
%ifndef WRT_PLT
    %define WRT_PLT
%endif

%ifdef PIC
    default rel
%endif
; Done with PIC macros

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
%ifdef __NASM_VER__
    %use smartalign
    ALIGNMODE k7
%else
    CPU amdnop
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack.
; If they're the last args then you can just not declare them, but if they're
; in the middle we need more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

; DECLARE_REG n, native, dword, word, byte [, stack_off]
;   Defines the rN* aliases for argument slot n.
;   %1 = slot number, %2..%5 = register names at each width,
;   %6 = optional byte offset of the argument above the stack pointer
;        (on top of stack_offset). Without %6, rNm and rNmp alias the
;        register itself.
%macro DECLARE_REG 5-6
    %define r%1  %2
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %if %0 == 6
        ; argument lives in memory
        %if ARCH_X86_64
            %define r%1m [rsp + stack_offset + %6]
            %define r%1mp qword r %+ %1m
        %else
            %define r%1m [esp + stack_offset + %6]
            %define r%1mp dword r %+ %1m
        %endif
    %else
        ; argument was passed in the register
        %define r%1m %3
        %define r%1mp %2
    %endif
%endmacro

; DECLARE_REG_SIZE word_name, byte_name
;   Gives a classic register the same q/d/w/b suffix scheme as rN, under
;   both the r- and e- prefixed spellings. When ARCH_X86_64 is 0 the
;   r-prefixed name itself is mapped onto the e-prefixed register.
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define r%1d e%1
    %define r%1w %1
    %define r%1b %2
    %define e%1q r%1
    %define e%1d e%1
    %define e%1w %1
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP a, b, c, ...
;   Maps t0, t1, t2, ... onto r<a>, r<b>, r<c>, ... in argument order.
%macro DECLARE_REG_TMP 1-*
    %assign %%n 0
    %rep %0
        CAT_XDEFINE t, %%n, r%1
        %assign %%n %%n+1
        %rotate 1
    %endrep
%endmacro

; DECLARE_REG_TMP_SIZE n, ...
;   For every listed n, defines tNq/tNd/tNw/tNb as tN with the matching
;   size suffix pasted on.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1b t%1 %+ b
        %define t%1w t%1 %+ w
        %define t%1d t%1 %+ d
        %define t%1q t%1 %+ q
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

; Size in bytes of a general-purpose register / stack slot.
%define gprsize 4
%if ARCH_X86_64
    %define gprsize 8
%endif

; PUSH / POP: push/pop that also keep stack_offset in step.
%macro PUSH 1
    push %1
    %assign stack_offset gprsize+stack_offset
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

; PUSH_IF_USED n, ...
;   Pushes r<n> (tracked via PUSH) for every listed n below regs_used.
%macro PUSH_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

; POP_IF_USED n, ...
;   Pops r<n> for every listed n below regs_used. Uses the plain `pop`
;   instruction, so stack_offset is left untouched.
%macro POP_IF_USED 1-*
    %rep %0
        %if regs_used > %1
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

; LOAD_IF_USED n, ...
;   Loads r<n> from its original location r<n>mp for every listed n
;   below num_args.
%macro LOAD_IF_USED 1-*
    %rep %0
        %if num_args > %1
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

; SUB / ADD: sub/add that also update stack_offset when the destination
; operand is spelled exactly `rsp`.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; movifnidn dst, src: mov that is omitted when both operands are
; textually identical.
%macro movifnidn 2
    %ifidn %1, %2
    %else
        mov %1, %2
    %endif
%endmacro

; movsxdifnidn dst, src: same as movifnidn, but emits movsxd.
%macro movsxdifnidn 2
    %ifidn %1, %2
    %else
        movsxd %1, %2
    %endif
%endmacro

; ASSERT expr: raises an assembly-time %error when expr evaluates to 0.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

; DEFINE_ARGS name0, name1, ...
;   Binds name<i>{q,d,w,b,m,mp} to r<i>{q,d,w,b,m,mp}. The names bound by the
;   previous invocation (recorded in arg_name<i> / n_arg_names) are
;   undefined first.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        ; drop the aliases created by the previous DEFINE_ARGS
        %assign %%n 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%n, q
            CAT_UNDEF arg_name %+ %%n, d
            CAT_UNDEF arg_name %+ %%n, w
            CAT_UNDEF arg_name %+ %%n, b
            CAT_UNDEF arg_name %+ %%n, m
            CAT_UNDEF arg_name %+ %%n, mp
            CAT_UNDEF arg_name, %%n
            %assign %%n %%n+1
        %endrep
    %endif

    ; stack_offset is hidden while the aliases are created so that its
    ; current value doesn't get baked in by xdefine; it is put back afterwards.
    %xdefine %%saved_offset stack_offset
    %undef stack_offset
    %assign %%n 0
    %rep %0
        %xdefine %1q r %+ %%n %+ q
        %xdefine %1d r %+ %%n %+ d
        %xdefine %1w r %+ %%n %+ w
        %xdefine %1b r %+ %%n %+ b
        %xdefine %1m r %+ %%n %+ m
        %xdefine %1mp r %+ %%n %+ mp
        CAT_XDEFINE arg_name, %%n, %1
        %assign %%n %%n+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%saved_offset
    %assign n_arg_names %0
%endmacro

%if WIN64 ; Windows x64 ;=================================================

; Slots 0-3 are register-only; slots declared with a sixth operand also
; have a stack home at that offset.
DECLARE_REG 0,  rcx, ecx,  cx,   cl
DECLARE_REG 1,  rdx, edx,  dx,   dl
DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
DECLARE_REG 6,  rax, eax,  ax,   al,   56
DECLARE_REG 7,  rdi, edi,  di,   dil,  64
DECLARE_REG 8,  rsi, esi,  si,   sil,  72
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
DECLARE_REG 11, R12, R12D, R12W, R12B, 96
DECLARE_REG 12, R13, R13D, R13W, R13B, 104
DECLARE_REG 13, R14, R14D, R14W, R14B, 112
DECLARE_REG 14, R15, R15D, R15W, R15B, 120

; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (WIN64 variant)
;   %1 = number of arguments, %2 = number of GPRs used (at most 15, and at
;   least %1), %3 = number of xmm registers used (defaults to 0),
;   %4 = names handed to DEFINE_ARGS.
;   Pushes r7..r14 as far as they fall below regs_used, spills xmm registers
;   via WIN64_SPILL_XMM (skipped when mmsize == 8), then loads arguments
;   4 and up from their stack homes.
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

; WIN64_SPILL_XMM #xmm_regs
;   Records %1 in xmm_regs_used (at most 16). When more than 6 are used,
;   reserves (n-6)*16+16 bytes below rsp (tracked in stack_offset by SUB)
;   and stores xmm6..xmm(n-1) there with movdqa.
;   The (~stack_offset&8) term shifts the slots by 8 when bit 3 of
;   stack_offset is clear; the extra 16 bytes leave room for that shift.
;   NOTE(review): presumably this keeps the movdqa slots 16-byte aligned
;   given an ABI-aligned stack at entry -- confirm.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

; WIN64_RESTORE_XMM_INTERNAL base
;   Reloads the xmm registers stored by WIN64_SPILL_XMM, addressing them
;   through %1, then releases the spill area with a plain `add` on %1.
;   stack_offset and xmm_regs_used are deliberately left unchanged, so the
;   macro can be expanded at several return sites of one function.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

; WIN64_RESTORE_XMM base
;   Same reload as WIN64_RESTORE_XMM_INTERNAL, but additionally undoes the
;   bookkeeping: stack_offset drops by exactly the amount WIN64_SPILL_XMM
;   added, and xmm_regs_used is reset to 0.
;   The adjustment is parenthesised and guarded by the same
;   `xmm_regs_used > 6` test as the spill: the previous expression
;   `stack_offset-(xmm_regs_used-6)*16+16` added the 16-byte pad back instead
;   of removing it, and also altered stack_offset when nothing had been
;   reserved at all.
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

; RET: reloads spilled xmm registers, pops the GPRs pushed by PROLOGUE
; (in reverse order) and returns.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    ret
%endmacro

; REP_RET: RET when the epilogue has something to restore, otherwise a
; bare `rep ret`.
%macro REP_RET 0
    %if regs_used > 7 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

; Slots 0-5 are register-only; slots declared with a sixth operand also
; have a stack home at that offset.
DECLARE_REG 0,  rdi, edi,  di,   dil
DECLARE_REG 1,  rsi, esi,  si,   sil
DECLARE_REG 2,  rdx, edx,  dx,   dl
DECLARE_REG 3,  rcx, ecx,  cx,   cl
DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
DECLARE_REG 6,  rax, eax,  ax,   al,   8
DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
DECLARE_REG 11, R12, R12D, R12W, R12B, 48
DECLARE_REG 12, R13, R13D, R13W, R13B, 56
DECLARE_REG 13, R14, R14D, R14W, R14B, 64
491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 14, R15, R15D, R15W, R15B, 72 492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign num_args %1 495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign regs_used %2 496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ASSERT regs_used >= num_args 497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ASSERT regs_used <= 15 498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang PUSH_IF_USED 9, 10, 11, 12, 13, 14 499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang DEFINE_ARGS %4 501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro RET 0 504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang POP_IF_USED 14, 13, 12, 11, 10, 9 505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ret 506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro REP_RET 0 509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %if regs_used > 9 510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RET 511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %else 512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang rep ret 513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%else ; X86_32 ;============================================================== 517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 0, eax, eax, ax, al, 4 519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 1, 
ecx, ecx, cx, cl, 8 520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 2, edx, edx, dx, dl, 12 521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 3, ebx, ebx, bx, bl, 16 522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 4, esi, esi, si, null, 20 523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 5, edi, edi, di, null, 24 524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_REG 6, ebp, ebp, bp, null, 28 525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%define rsp esp 526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro DECLARE_ARG 1-* 528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %rep %0 529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %define r%1m [esp + stack_offset + 4*%1 + 4] 530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %define r%1mp dword r%1m 531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %rotate 1 532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endrep 533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangDECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign num_args %1 539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign regs_used %2 540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %if regs_used > 7 541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign regs_used 7 542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ASSERT regs_used >= num_args 544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang PUSH_IF_USED 3, 4, 5, 6 545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang DEFINE_ARGS %4 547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro RET 0 550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang POP_IF_USED 6, 5, 4, 3 551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ret 552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro REP_RET 0 555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %if regs_used > 3 556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RET 557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %else 558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang rep ret 559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif ;====================================================================== 563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if WIN64 == 0 565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro WIN64_SPILL_XMM 1 566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro WIN64_RESTORE_XMM 1 
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif 570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;============================================================================= 572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; arch-independent part 573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;============================================================================= 574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign function_align 16 576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Begin a function. 578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Applies any symbol mangling needed for C linkage, and sets up a define such that 579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; subsequent uses of the function name automatically refer to the mangled version. 580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Appends cpuflags to the function name if cpuflags has been specified. 
581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro cglobal 1-2+ ; name, [PROLOGUE args] 582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if %0 == 1 583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang cglobal_internal %1 %+ SUFFIX 584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%else 585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang cglobal_internal %1 %+ SUFFIX, %2 586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif 587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro cglobal_internal 1-2+ 589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %ifndef cglobaled_%1 590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %xdefine %1 mangle(program_name %+ _ %+ %1) 591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %xdefine %1.skip_prologue %1 %+ .skip_prologue 592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang CAT_XDEFINE cglobaled_, %1, 1 593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %xdefine current_function %1 595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %ifidn __OUTPUT_FORMAT__,elf 596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang global %1:function hidden 597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %elifidn __OUTPUT_FORMAT__,elf32 598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang global %1:function hidden 599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %elifidn __OUTPUT_FORMAT__,elf64 600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang global %1:function hidden 6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang %elifidn __OUTPUT_FORMAT__,macho32 6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang global %1:private_extern 6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang %elifidn __OUTPUT_FORMAT__,macho64 6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang global %1:private_extern 605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %else 606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang global %1 
607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang align function_align 609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %1: 610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer 611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign stack_offset 0 612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %if %0 > 1 613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang PROLOGUE %2 614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro cextern 1 618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %xdefine %1 mangle(program_name %+ _ %+ %1) 619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang CAT_XDEFINE cglobaled_, %1, 1 620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang extern %1 621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; like cextern, but without the prefix 624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro cextern_naked 1 625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %xdefine %1 mangle(%1) 626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang CAT_XDEFINE cglobaled_, %1, 1 627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang extern %1 628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro const 2+ 631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %xdefine %1 mangle(program_name %+ _ %+ %1) 632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang global %1 633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %1: %2 634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; 
This is needed for ELF, otherwise the GNU linker assumes the stack is 637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; executable by default. 638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%ifidn __OUTPUT_FORMAT__,elf 639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangSECTION .note.GNU-stack noalloc noexec nowrite progbits 640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%elifidn __OUTPUT_FORMAT__,elf32 641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangSECTION .note.GNU-stack noalloc noexec nowrite progbits 642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%elifidn __OUTPUT_FORMAT__,elf64 643ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangSECTION .note.GNU-stack noalloc noexec nowrite progbits 644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif 645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; cpuflags 647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_mmx (1<<0) 649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx 650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_3dnow (1<<2) | cpuflags_mmx 651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow 652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_sse (1<<4) | cpuflags_mmx2 653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_sse2 (1<<5) | cpuflags_sse 654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign 
cpuflags_avx (1<<11)| cpuflags_sse42 660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_xop (1<<12)| cpuflags_avx 661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_fma4 (1<<13)| cpuflags_avx 662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_cache32 (1<<16) 664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_cache64 (1<<17) 665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_slowctz (1<<18) 666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_lzcnt (1<<19) 667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_misalign (1<<20) 668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant 669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%assign cpuflags_atom (1<<22) 670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) 672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) 673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; Takes up to 2 cpuflags from the above list. 675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. 676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
; INIT_CPUFLAGS [flag1 [, flag2]]
;   no arguments : SUFFIX becomes empty; cpuname and cpuflags are undefined.
;   flag1, flag2 : names with a matching cpuflags_<name> mask. cpuflags becomes
;                  the OR of the masks, cpuname becomes flag1 or flag1_flag2,
;                  and SUFFIX becomes _<cpuname>.
; With flags given, it additionally:
;   - sets avx_enabled when the selection includes avx,
;   - maps mova/movu/movnta to the *ps forms for 16-byte registers without sse2,
;   - maps movu to mova for the "aligned" variant, or to lddqu when flag1 is sse3.
%macro INIT_CPUFLAGS 0-2
    %if %0 == 0
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %else
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %endif
%endmacro

; merge mmx and sse*

; Define the single-line macro whose name is %1 pasted onto %2, with value %3.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; Undefine the single-line macro whose name is %1 pasted onto %2.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; Select 8-byte MMX registers: m0-m7 name mm0-mm7, and mova/movu/movh/movnta
; map to the MMX moves. Optional arguments are passed on to INIT_CPUFLAGS.
%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%reg 0
    %rep 8
        CAT_XDEFINE m, %%reg, mm %+ %%reg
        CAT_XDEFINE nmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
    ; the counter keeps running: this pass removes m8-m15 and nmm8-nmm15
    %rep 8
        CAT_UNDEF m, %%reg
        CAT_UNDEF nmm, %%reg
        %assign %%reg %%reg+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; Select 16-byte XMM registers: m0..m(num_mmregs-1) name the xmm registers
; (16 of them when ARCH_X86_64, otherwise 8), and mova/movu/movh/movnta map to
; the integer SSE2 moves. Optional arguments are passed on to INIT_CPUFLAGS.
%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, xmm %+ %%reg
        CAT_XDEFINE nxmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
; XMM setup with avx_enabled forced on and PALIGNR bound to PALIGNR_SSSE3.
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

; Select 32-byte YMM registers: m0..m(num_mmregs-1) name the ymm registers
; (16 of them when ARCH_X86_64, otherwise 8), mova/movu/movnta map to the
; VEX float moves and movh is left undefined.
; Optional arguments are passed on to INIT_CPUFLAGS.
%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %if ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, ymm %+ %%reg
        CAT_XDEFINE nymm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; Default register set for code that does not select one itself.
INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

; PERMUTE dst0, src0 [, dst1, src1 ...]
; For each (dst, src) pair, m<dst> (and nm<dst>) take the mapping that m<src>
; (and nm<src>) had when the macro was invoked.
%macro PERMUTE 2-* ; takes a list of pairs to swap
; pass 1: snapshot every source mapping so later pairs still see the old values
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
; pass 2: install the snapshots under the destination names and drop the temporaries
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

; SWAP a, b [, c ...]
; Exchanges the mappings of a and b, then of b and c, and so on along the list
; (%0-1 exchanges in total). Arguments may be register numbers ("SWAP 0,1") or
; register names ("SWAP m0,m1").
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    ; numeric form: m%1 / m%2 exist, exchange them and refresh the reverse lookups
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%idx1 n %+ %1
    %xdefine %%idx2 n %+ %2
    %xdefine tmp m %+ %%idx1
    CAT_XDEFINE m, %%idx1, m %+ %%idx2
    CAT_XDEFINE m, %%idx2, tmp
    CAT_XDEFINE n, m %+ %%idx1, %%idx1
    CAT_XDEFINE n, m %+ %%idx2, %%idx2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
; Store the current m0..m(num_mmregs-1) mapping as <name>_m0, <name>_m1, ...
;   %1 = name to save under (optional; defaults to current_function)
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%prefix %1_m
    %else
        %xdefine %%prefix current_function %+ _m
    %endif
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE %%prefix, %%reg, m %+ %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro

; Re-install a mapping stored by SAVE_MM_PERMUTATION, including the reverse
; (n<reg>) lookups. Does nothing when <name>_m0 is not defined.
%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%reg 0
        %rep num_mmregs
            CAT_XDEFINE m, %%reg, %1_m %+ %%reg
            CAT_XDEFINE n, m %+ %%reg, %%reg
            %assign %%reg %%reg+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro

; Worker for the call wrapper.
;   %1 = callee name as written, %2 = the same name with SUFFIX appended
; Picks %2 only when cglobaled_%1 is undefined and cglobaled_%2 is defined,
; emits the call, then loads any register permutation saved for the callee.
%macro call_internal 2
    %xdefine %%target %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%target %2
        %endif
    %endif
    call %%target
    LOAD_MM_PERMUTATION %%target
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent

; add x, 128 is emitted as sub x, -128; every other operand is passed through.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

; sub x, 128 is emitted as add x, -128; every other operand is passed through.
%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

; Register-size lookup: sizeofmm0-7 = 8, sizeofxmm0-15 = 16, sizeofymm0-15 = 32.
%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        %if %0 >= 7
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
968ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 969ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 970ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 971ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 972ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; 3arg AVX ops with a memory arg can only have it in src2, 973ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). 974ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; So, if the op is symmetric and the wrong one is memory, swap them. 975ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro RUN_AVX_INSTR1 8 976ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign %%swap 0 977ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %if avx_enabled 978ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %ifnid %6 979ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign %%swap 1 980ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 981ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %elifnidn %5, %6 982ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %ifnid %7 983ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %assign %%swap 1 984ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 985ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 986ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %if %%swap && %3 == 0 && %8 == 1 987ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 988ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %else 989ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 990ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 991ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 992ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 993ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;%1 == instruction 994ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;%2 == 1 if float, 0 if int 995ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 
if 3-operand (xmm, xmm, xmm) 996ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not 997ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%macro AVX_INSTR 4 998ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 999ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %ifidn %3, fnord 1000ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 1001ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %elifidn %4, fnord 1002ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 1003ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %elifidn %5, fnord 1004ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 1005ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %else 1006ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 1007ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endif 1008ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang %endmacro 1009ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endmacro 1010ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 1011ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR addpd, 1, 0, 1 1012ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR addps, 1, 0, 1 1013ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR addsd, 1, 0, 1 1014ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR addss, 1, 0, 1 1015ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR addsubpd, 1, 0, 0 1016ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR addsubps, 1, 0, 0 1017ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR andpd, 1, 0, 1 1018ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR andps, 1, 0, 1 1019ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR andnpd, 1, 0, 0 1020ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangAVX_INSTR andnps, 1, 0, 0 
; AVX_INSTR wrapper table (continued).
; Columns: instruction, float(1)/int(0), 4-operand form with imm (1/0),
;          symmetric sources (1/0).
;
; NOTE(review): blendpd/blendps and cmppd/cmpps/cmpsd/cmpss take an imm8 in
; their real encodings but are registered here with the 4-operand flag clear.
; That looks fine only as long as callers always spell out all four operands;
; confirm before relying on the three-operand form of these wrappers.

; --- float: blend / compare / convert ---------------------------------------
AVX_INSTR blendpd,    1, 0, 0
AVX_INSTR blendps,    1, 0, 0
AVX_INSTR blendvpd,   1, 0, 0
AVX_INSTR blendvps,   1, 0, 0
AVX_INSTR cmppd,      1, 0, 0
AVX_INSTR cmpps,      1, 0, 0
AVX_INSTR cmpsd,      1, 0, 0
AVX_INSTR cmpss,      1, 0, 0
AVX_INSTR cvtdq2ps,   1, 0, 0
AVX_INSTR cvtps2dq,   1, 0, 0

; --- float: divide / dot product / horizontal add-sub -----------------------
AVX_INSTR divpd,      1, 0, 0
AVX_INSTR divps,      1, 0, 0
AVX_INSTR divsd,      1, 0, 0
AVX_INSTR divss,      1, 0, 0
AVX_INSTR dppd,       1, 1, 0
AVX_INSTR dpps,       1, 1, 0
AVX_INSTR haddpd,     1, 0, 0
AVX_INSTR haddps,     1, 0, 0
AVX_INSTR hsubpd,     1, 0, 0
AVX_INSTR hsubps,     1, 0, 0

; --- float: max / min -------------------------------------------------------
AVX_INSTR maxpd,      1, 0, 1
AVX_INSTR maxps,      1, 0, 1
AVX_INSTR maxsd,      1, 0, 1
AVX_INSTR maxss,      1, 0, 1
AVX_INSTR minpd,      1, 0, 1
AVX_INSTR minps,      1, 0, 1
AVX_INSTR minsd,      1, 0, 1
AVX_INSTR minss,      1, 0, 1

; --- float moves, mpsadbw, float multiply / or ------------------------------
AVX_INSTR movhlps,    1, 0, 0
AVX_INSTR movlhps,    1, 0, 0
AVX_INSTR movsd,      1, 0, 0
AVX_INSTR movss,      1, 0, 0
AVX_INSTR mpsadbw,    0, 1, 0
AVX_INSTR mulpd,      1, 0, 1
AVX_INSTR mulps,      1, 0, 1
AVX_INSTR mulsd,      1, 0, 1
AVX_INSTR mulss,      1, 0, 1
AVX_INSTR orpd,       1, 0, 1
AVX_INSTR orps,       1, 0, 1

; --- int: pack / add / align / logic / average / blend ----------------------
AVX_INSTR packsswb,   0, 0, 0
AVX_INSTR packssdw,   0, 0, 0
AVX_INSTR packuswb,   0, 0, 0
AVX_INSTR packusdw,   0, 0, 0
AVX_INSTR paddb,      0, 0, 1
AVX_INSTR paddw,      0, 0, 1
AVX_INSTR paddd,      0, 0, 1
AVX_INSTR paddq,      0, 0, 1
AVX_INSTR paddsb,     0, 0, 1
AVX_INSTR paddsw,     0, 0, 1
AVX_INSTR paddusb,    0, 0, 1
AVX_INSTR paddusw,    0, 0, 1
AVX_INSTR palignr,    0, 1, 0
AVX_INSTR pand,       0, 0, 1
AVX_INSTR pandn,      0, 0, 0
AVX_INSTR pavgb,      0, 0, 1
AVX_INSTR pavgw,      0, 0, 1
AVX_INSTR pblendvb,   0, 0, 0
AVX_INSTR pblendw,    0, 1, 0

; --- int: compares ----------------------------------------------------------
AVX_INSTR pcmpestri,  0, 0, 0
AVX_INSTR pcmpestrm,  0, 0, 0
AVX_INSTR pcmpistri,  0, 0, 0
AVX_INSTR pcmpistrm,  0, 0, 0
AVX_INSTR pcmpeqb,    0, 0, 1
AVX_INSTR pcmpeqw,    0, 0, 1
AVX_INSTR pcmpeqd,    0, 0, 1
AVX_INSTR pcmpeqq,    0, 0, 1
AVX_INSTR pcmpgtb,    0, 0, 0
AVX_INSTR pcmpgtw,    0, 0, 0
AVX_INSTR pcmpgtd,    0, 0, 0
AVX_INSTR pcmpgtq,    0, 0, 0

; --- int: horizontal add-sub / multiply-add ---------------------------------
AVX_INSTR phaddw,     0, 0, 0
AVX_INSTR phaddd,     0, 0, 0
AVX_INSTR phaddsw,    0, 0, 0
AVX_INSTR phsubw,     0, 0, 0
AVX_INSTR phsubd,     0, 0, 0
AVX_INSTR phsubsw,    0, 0, 0
AVX_INSTR pmaddwd,    0, 0, 1
AVX_INSTR pmaddubsw,  0, 0, 0

; --- int: max / min ---------------------------------------------------------
AVX_INSTR pmaxsb,     0, 0, 1
AVX_INSTR pmaxsw,     0, 0, 1
AVX_INSTR pmaxsd,     0, 0, 1
AVX_INSTR pmaxub,     0, 0, 1
AVX_INSTR pmaxuw,     0, 0, 1
AVX_INSTR pmaxud,     0, 0, 1
AVX_INSTR pminsb,     0, 0, 1
AVX_INSTR pminsw,     0, 0, 1
AVX_INSTR pminsd,     0, 0, 1
AVX_INSTR pminub,     0, 0, 1
AVX_INSTR pminuw,     0, 0, 1
AVX_INSTR pminud,     0, 0, 1

; --- int: multiply / or / sad / shuffle / sign ------------------------------
AVX_INSTR pmulhuw,    0, 0, 1
AVX_INSTR pmulhrsw,   0, 0, 1
AVX_INSTR pmulhw,     0, 0, 1
AVX_INSTR pmullw,     0, 0, 1
AVX_INSTR pmulld,     0, 0, 1
AVX_INSTR pmuludq,    0, 0, 1
AVX_INSTR pmuldq,     0, 0, 1
AVX_INSTR por,        0, 0, 1
AVX_INSTR psadbw,     0, 0, 1
AVX_INSTR pshufb,     0, 0, 0
AVX_INSTR psignb,     0, 0, 0
AVX_INSTR psignw,     0, 0, 0
AVX_INSTR psignd,     0, 0, 0

; --- int: shifts ------------------------------------------------------------
AVX_INSTR psllw,      0, 0, 0
AVX_INSTR pslld,      0, 0, 0
AVX_INSTR psllq,      0, 0, 0
AVX_INSTR pslldq,     0, 0, 0
AVX_INSTR psraw,      0, 0, 0
AVX_INSTR psrad,      0, 0, 0
AVX_INSTR psrlw,      0, 0, 0
AVX_INSTR psrld,      0, 0, 0
AVX_INSTR psrlq,      0, 0, 0
AVX_INSTR psrldq,     0, 0, 0

; --- int: subtract / unpack / xor -------------------------------------------
AVX_INSTR psubb,      0, 0, 0
AVX_INSTR psubw,      0, 0, 0
AVX_INSTR psubd,      0, 0, 0
AVX_INSTR psubq,      0, 0, 0
AVX_INSTR psubsb,     0, 0, 0
AVX_INSTR psubsw,     0, 0, 0
AVX_INSTR psubusb,    0, 0, 0
AVX_INSTR psubusw,    0, 0, 0
AVX_INSTR punpckhbw,  0, 0, 0
AVX_INSTR punpckhwd,  0, 0, 0
AVX_INSTR punpckhdq,  0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw,  0, 0, 0
AVX_INSTR punpcklwd,  0, 0, 0
AVX_INSTR punpckldq,  0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor,       0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
; Defines qABCD for every combination of base-4 digits A, B, C, D as the
; byte (A<<6)|(B<<4)|(C<<2)|D, e.g. q3210 = 228. The q000/q00/q0 prefixes
; restore the leading zeros that the decimal value of j drops, so every name
; has exactly four digits.
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

; Fused multiply-accumulate wrappers (dst, src1, src2, accumulator).
;%1 == fused instruction, emitted with a v prefix when cpuflag(xop) is set
;%2 == multiply instruction used for emulation
;%3 == add instruction used for emulation
; Without XOP the operation is emulated as "dst = src1 * src2" followed by
; "dst += accumulator". That sequence overwrites dst before the accumulator
; is read, so it cannot be used when dst and the accumulator are the same
; operand; that case is rejected at assembly time instead of silently
; producing a wrong result.
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd