;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%include "vpx_config.asm"

%define program_name vp9


%define UNIX64 0
%define WIN64  0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifidn   __OUTPUT_FORMAT__,elf32
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,elf
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,x64
    %define mangle(x) x
%elifidn __OUTPUT_FORMAT__,win64
    %define mangle(x) x
%else
    %define mangle(x) _ %+ x
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align
; attribute, so use a different read-only section.
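; As a usage sketch (the label pw_16 below is hypothetical, not defined in
; this file), a caller opens the section and emits constants into it; the
; optional argument overrides the default 16-byte alignment:
;     SECTION_RODATA           ; or e.g. SECTION_RODATA 32 for ymm-wide data
;     pw_16: times 8 dw 16     ; a typical packed-word rounding constant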
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from original code is added in for 64bit.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif

%if ABI_IS_32BIT
  %if CONFIG_PIC=1
  %ifidn __OUTPUT_FORMAT__,elf32
    %define GET_GOT_SAVE_ARG 1
    %define WRT_PLT wrt ..plt
    %macro GET_GOT 1
      extern _GLOBAL_OFFSET_TABLE_
      push %1
      call %%get_got
      %%sub_offset:
      jmp %%exitGG
      %%get_got:
      mov %1, [esp]
      add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
      ret
      %%exitGG:
      %undef GLOBAL
      %define GLOBAL(x) x + %1 wrt ..gotoff
      %undef RESTORE_GOT
      %define RESTORE_GOT pop %1
    %endmacro
  %elifidn __OUTPUT_FORMAT__,macho32
    %define GET_GOT_SAVE_ARG 1
    %macro GET_GOT 1
      push %1
      call %%get_got
      %%get_got:
      pop %1
      %undef GLOBAL
      %define GLOBAL(x) x + %1 - %%get_got
      %undef RESTORE_GOT
      %define RESTORE_GOT pop %1
    %endmacro
  %endif
  %endif

  %if ARCH_X86_64 == 0
    %undef PIC
  %endif

%else
  %macro GET_GOT 1
  %endmacro
  %define GLOBAL(x) rel x
  %define WRT_PLT wrt ..plt

  %if WIN64
    %define PIC
  %elifidn __OUTPUT_FORMAT__,macho64
    %define PIC
  %elif CONFIG_PIC
    %define PIC
  %endif
%endif

%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
%define RESTORE_GOT
%endif
%ifndef WRT_PLT
%define WRT_PLT
%endif

%ifdef PIC
    default rel
%endif
; Done with PIC macros

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
%ifndef __NASM_VER__
CPU amdnop
%else
%use smartalign
ALIGNMODE k7
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 5-6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %if %0 == 5
        %define r%1m  %3
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %6]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [esp + stack_offset + %6]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%if ARCH_X86_64
%macro ALLOC_STACK 2 ; stack_size, num_regs
    %assign %%stack_alignment ((mmsize + 15) & ~15)
    %assign stack_size_padded %1

    %assign %%reg_num (%2 - 1)
    %xdefine rsp_tmp r %+ %%reg_num
    mov rsp_tmp, rsp
    sub rsp, stack_size_padded
    and rsp, ~(%%stack_alignment - 1)
%endmacro

%macro RESTORE_STACK 0 ; reset rsp register
    mov rsp, rsp_tmp
%endmacro
%endif

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx, ecx,  cx,   cl
DECLARE_REG 1,  rdx, edx,  dx,   dl
DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
DECLARE_REG 6,  rax, eax,  ax,   al,   56
DECLARE_REG 7,  rdi, edi,  di,   dil,  64
DECLARE_REG 8,  rsi, esi,  si,   sil,  72
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
DECLARE_REG 11, R12, R12D, R12W, R12B, 96
DECLARE_REG 12, R13, R13D, R13W, R13B, 104
DECLARE_REG 13, R14, R14D, R14W, R14B, 112
DECLARE_REG 14, R15, R15D, R15W, R15B, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            ; (~stack_offset&8) adds 8 when needed so each spill slot stays 16-byte aligned
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        ; undo exactly what WIN64_SPILL_XMM's SUB added to stack_offset
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 7 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi, edi,  di,   dil
DECLARE_REG 1,  rsi, esi,  si,   sil
DECLARE_REG 2,  rdx, edx,  dx,   dl
DECLARE_REG 3,  rcx, ecx,  cx,   cl
DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
DECLARE_REG 6,  rax, eax,  ax,   al,   8
DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
DECLARE_REG 11, R12, R12D, R12W, R12B, 48
DECLARE_REG 12, R13, R13D, R13W, R13B, 56
DECLARE_REG 13, R14, R14D, R14W, R14B, 64
DECLARE_REG 14, R15, R15D, R15W, R15B, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 9
        RET
    %else
        rep ret
    %endif
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   4
DECLARE_REG 1, ecx, ecx, cx, cl,   8
DECLARE_REG 2, edx, edx, dx, dl,   12
DECLARE_REG 3, ebx, ebx, bx, bl,   16
DECLARE_REG 4, esi, esi, si, null, 20
DECLARE_REG 5, edi, edi, di, null, 24
DECLARE_REG 6, ebp, ebp, bp, null, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
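; As an illustrative sketch (the function and its body are hypothetical), a
; complete leaf function built on these macros looks like:
;     INIT_XMM sse2
;     cglobal copy16, 2, 2, 1, dst, src   ; 2 args, 2 gprs, 1 xmm reg
;         mova  m0, [srcq]
;         mova  [dstq], m0
;         RET
; This assembles as vp9_copy16_sse2 (plus a leading underscore where mangle()
; adds one), with argument loading and callee-saved register handling emitted
; per-ABI by PROLOGUE.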
%macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 == 1
    cglobal_internal %1 %+ SUFFIX
%else
    cglobal_internal %1 %+ SUFFIX, %2
%endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifdef CHROMIUM
        %ifidn __OUTPUT_FORMAT__,elf
            global %1:function hidden
        %elifidn __OUTPUT_FORMAT__,elf32
            global %1:function hidden
        %elifidn __OUTPUT_FORMAT__,elf64
            global %1:function hidden
        %elifidn __OUTPUT_FORMAT__,macho32
            global %1:private_extern
        %elifidn __OUTPUT_FORMAT__,macho64
            global %1:private_extern
        %else
            global %1
        %endif
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
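; As a sketch of typical use (the function name foo is hypothetical):
;     INIT_XMM ssse3       ; SUFFIX becomes _ssse3; cpuflag(ssse3) and every
;                          ; flag it implies (sse3..mmx) now evaluate to 1
;     cglobal foo, 1, 1    ; assembles as vp9_foo_ssse3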
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    ; clear any m8..m15 left over from a previous INIT_XMM/INIT_YMM
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
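; For instance, a hypothetical butterfly (register numbers are arbitrary):
;     mova   m2, m0
;     paddw  m0, m1        ; m0 = a+b
;     psubw  m2, m1        ; m2 = a-b
;     SWAP   1, 2          ; callers now see the difference as m1, at no cost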
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1", infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        %if %0 >= 7
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow! instructions, for sharing code between AVX, SSE and 3DNow!
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd,  pmulld,  paddd
FMA_INSTR pmacsww,  pmullw,  paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
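; e.g. with the above, "pmacsdd m0, m1, m2, m3" (registers hypothetical)
; computes m0 = m1*m2 + m3: a single vpmacsdd on XOP-capable cpus, otherwise a
; pmulld/paddd pair (in which case the accumulator must not alias m0, since
; the product is written to m0 before the add).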