;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

; Local changes for libyuv:
; remove %define program_name and references in labels
; rename cpus to uppercase
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPU amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
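;
; For example (an illustrative sketch; the function and argument names below
; are hypothetical, not part of this file):
;     cglobal copy16, 2,2,1, dst, src   ; 2 args, 2 gprs used, 1 xmm reg used
;         movu  m0, [srcq]              ; srcq = native-size reg holding arg 1
;         movu  [dstq], m0              ; dstq = native-size reg holding arg 0
;         RET                           ; pops anything PROLOGUE pushed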

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %3]
        %define r%1mp qword r %+ %1m
    %else
        %define r%1m [esp + stack_offset + %3]
        %define r%1mp dword r %+ %1m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        ; undo the stack_offset increment made by WIN64_SPILL_XMM's SUB
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
    vzeroupper
%endif
    ret
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
    vzeroupper
%endif
    ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
    vzeroupper
%endif
    ret
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif

%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 == 1
    cglobal_internal %1 %+ SUFFIX
%else
    cglobal_internal %1 %+ SUFFIX, %2
%endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(%1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(%1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_MMX      (1<<0)
%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
%assign cpuflags_xop      (1<<12)| cpuflags_AVX
%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
%assign cpuflags_fma3     (1<<15)| cpuflags_AVX

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
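
; A usage sketch (not from this file): inside code built by e.g. INIT_XMM SSE2,
; cpuflag(SSE2) evaluates true and cpuflag(SSSE3) false, so per-cpu paths can
; be selected at assembly time:
;     %if cpuflag(SSSE3)
;         pshufb m0, m1
;     %else
;         ; SSE2 fallback would go here
;     %endif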

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(AVX)
            %assign AVX_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(SSE2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, SSE3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge MMX and SSE*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign AVX_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM
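
; For example (illustrative; "foo" is a hypothetical function name):
;     INIT_XMM SSE2
;     cglobal foo, 1,1
;         mova  m0, [r0]   ; assembles as movdqa xmm0, [r0]
;         RET
; emits the symbol foo_SSE2 (the cpu name is appended via SUFFIX), with
; mova/movu mapping to movdqa/movdqu and m0..m7 (m0..m15 on x86_64) mapping
; to xmm registers.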

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
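;
; For instance (illustrative):
;     mova  m0, [r0]
;     mova  m1, [r1]
;     SWAP 0, 1          ; emits no instruction; m0 now names the old m1 and vice versa
;     mova  [r0], m0     ; stores the value that was loaded from [r1]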

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
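; e.g. (sketch; "helper" is a hypothetical function):
;     cglobal helper
;         ...                    ; computes a result into m0
;         SAVE_MM_PERMUTATION    ; records the current m# -> register mapping
;         ret
; A later "call helper" (using the call macro below) reloads that mapping, so
; the caller's m0 names the register where helper left its result.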
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
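
; e.g. "add r0, 128" is emitted as "sub r0, -128": -128 fits in a sign-extended
; 8-bit immediate while +128 does not, saving three bytes of code (illustrative).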

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-AVX emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if AVX_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if AVX_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
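
; With the instruction wrappers defined below, 3-operand syntax can be used
; throughout, e.g. (illustrative, default register mapping):
;     paddw m0, m1, m2
; assembles as "vpaddw xmm0, xmm1, xmm2" when AVX is enabled, and is emulated
; as "movdqa xmm0, xmm1" + "paddw xmm0, xmm2" otherwise.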
935
936AVX_INSTR addpd, 1, 0, 1
937AVX_INSTR addps, 1, 0, 1
938AVX_INSTR addsd, 1, 0, 1
939AVX_INSTR addss, 1, 0, 1
940AVX_INSTR addsubpd, 1, 0, 0
941AVX_INSTR addsubps, 1, 0, 0
942AVX_INSTR andpd, 1, 0, 1
943AVX_INSTR andps, 1, 0, 1
944AVX_INSTR andnpd, 1, 0, 0
945AVX_INSTR andnps, 1, 0, 0
946AVX_INSTR blendpd, 1, 0, 0
947AVX_INSTR blendps, 1, 0, 0
948AVX_INSTR blendvpd, 1, 0, 0
949AVX_INSTR blendvps, 1, 0, 0
950AVX_INSTR cmppd, 1, 0, 0
951AVX_INSTR cmpps, 1, 0, 0
952AVX_INSTR cmpsd, 1, 0, 0
953AVX_INSTR cmpss, 1, 0, 0
954AVX_INSTR cvtdq2ps, 1, 0, 0
955AVX_INSTR cvtps2dq, 1, 0, 0
956AVX_INSTR divpd, 1, 0, 0
957AVX_INSTR divps, 1, 0, 0
958AVX_INSTR divsd, 1, 0, 0
959AVX_INSTR divss, 1, 0, 0
960AVX_INSTR dppd, 1, 1, 0
961AVX_INSTR dpps, 1, 1, 0
962AVX_INSTR haddpd, 1, 0, 0
963AVX_INSTR haddps, 1, 0, 0
964AVX_INSTR hsubpd, 1, 0, 0
965AVX_INSTR hsubps, 1, 0, 0
966AVX_INSTR maxpd, 1, 0, 1
967AVX_INSTR maxps, 1, 0, 1
968AVX_INSTR maxsd, 1, 0, 1
969AVX_INSTR maxss, 1, 0, 1
970AVX_INSTR minpd, 1, 0, 1
971AVX_INSTR minps, 1, 0, 1
972AVX_INSTR minsd, 1, 0, 1
973AVX_INSTR minss, 1, 0, 1
974AVX_INSTR movhlps, 1, 0, 0
975AVX_INSTR movlhps, 1, 0, 0
976AVX_INSTR movsd, 1, 0, 0
977AVX_INSTR movss, 1, 0, 0
978AVX_INSTR mpsadbw, 0, 1, 0
979AVX_INSTR mulpd, 1, 0, 1
980AVX_INSTR mulps, 1, 0, 1
981AVX_INSTR mulsd, 1, 0, 1
982AVX_INSTR mulss, 1, 0, 1
983AVX_INSTR orpd, 1, 0, 1
984AVX_INSTR orps, 1, 0, 1
985AVX_INSTR pabsb, 0, 0, 0
986AVX_INSTR pabsw, 0, 0, 0
987AVX_INSTR pabsd, 0, 0, 0
988AVX_INSTR packsswb, 0, 0, 0
989AVX_INSTR packssdw, 0, 0, 0
990AVX_INSTR packuswb, 0, 0, 0
991AVX_INSTR packusdw, 0, 0, 0
992AVX_INSTR paddb, 0, 0, 1
993AVX_INSTR paddw, 0, 0, 1
994AVX_INSTR paddd, 0, 0, 1
995AVX_INSTR paddq, 0, 0, 1
996AVX_INSTR paddsb, 0, 0, 1
997AVX_INSTR paddsw, 0, 0, 1
998AVX_INSTR paddusb, 0, 0, 1
999AVX_INSTR paddusw, 0, 0, 1
1000AVX_INSTR palignr, 0, 1, 0
1001AVX_INSTR pand, 0, 0, 1
1002AVX_INSTR pandn, 0, 0, 0
1003AVX_INSTR pavgb, 0, 0, 1
1004AVX_INSTR pavgw, 0, 0, 1
1005AVX_INSTR pblendvb, 0, 0, 0
1006AVX_INSTR pblendw, 0, 1, 0
1007AVX_INSTR pcmpestri, 0, 0, 0
1008AVX_INSTR pcmpestrm, 0, 0, 0
1009AVX_INSTR pcmpistri, 0, 0, 0
1010AVX_INSTR pcmpistrm, 0, 0, 0
1011AVX_INSTR pcmpeqb, 0, 0, 1
1012AVX_INSTR pcmpeqw, 0, 0, 1
1013AVX_INSTR pcmpeqd, 0, 0, 1
1014AVX_INSTR pcmpeqq, 0, 0, 1
1015AVX_INSTR pcmpgtb, 0, 0, 0
1016AVX_INSTR pcmpgtw, 0, 0, 0
1017AVX_INSTR pcmpgtd, 0, 0, 0
1018AVX_INSTR pcmpgtq, 0, 0, 0
1019AVX_INSTR phaddw, 0, 0, 0
1020AVX_INSTR phaddd, 0, 0, 0
1021AVX_INSTR phaddsw, 0, 0, 0
1022AVX_INSTR phsubw, 0, 0, 0
1023AVX_INSTR phsubd, 0, 0, 0
1024AVX_INSTR phsubsw, 0, 0, 0
1025AVX_INSTR pmaddwd, 0, 0, 1
1026AVX_INSTR pmaddubsw, 0, 0, 0
1027AVX_INSTR pmaxsb, 0, 0, 1
1028AVX_INSTR pmaxsw, 0, 0, 1
1029AVX_INSTR pmaxsd, 0, 0, 1
1030AVX_INSTR pmaxub, 0, 0, 1
1031AVX_INSTR pmaxuw, 0, 0, 1
1032AVX_INSTR pmaxud, 0, 0, 1
1033AVX_INSTR pminsb, 0, 0, 1
1034AVX_INSTR pminsw, 0, 0, 1
1035AVX_INSTR pminsd, 0, 0, 1
1036AVX_INSTR pminub, 0, 0, 1
1037AVX_INSTR pminuw, 0, 0, 1
1038AVX_INSTR pminud, 0, 0, 1
1039AVX_INSTR pmovmskb, 0, 0, 0
1040AVX_INSTR pmulhuw, 0, 0, 1
1041AVX_INSTR pmulhrsw, 0, 0, 1
1042AVX_INSTR pmulhw, 0, 0, 1
1043AVX_INSTR pmullw, 0, 0, 1
1044AVX_INSTR pmulld, 0, 0, 1
1045AVX_INSTR pmuludq, 0, 0, 1
1046AVX_INSTR pmuldq, 0, 0, 1
1047AVX_INSTR por, 0, 0, 1
1048AVX_INSTR psadbw, 0, 0, 1
1049AVX_INSTR pshufb, 0, 0, 0
1050AVX_INSTR pshufd, 0, 1, 0
1051AVX_INSTR pshufhw, 0, 1, 0
1052AVX_INSTR pshuflw, 0, 1, 0
1053AVX_INSTR psignb, 0, 0, 0
1054AVX_INSTR psignw, 0, 0, 0
1055AVX_INSTR psignd, 0, 0, 0
1056AVX_INSTR psllw, 0, 0, 0
1057AVX_INSTR pslld, 0, 0, 0
1058AVX_INSTR psllq, 0, 0, 0
1059AVX_INSTR pslldq, 0, 0, 0
1060AVX_INSTR psraw, 0, 0, 0
1061AVX_INSTR psrad, 0, 0, 0
1062AVX_INSTR psrlw, 0, 0, 0
1063AVX_INSTR psrld, 0, 0, 0
1064AVX_INSTR psrlq, 0, 0, 0
1065AVX_INSTR psrldq, 0, 0, 0
1066AVX_INSTR psubb, 0, 0, 0
1067AVX_INSTR psubw, 0, 0, 0
1068AVX_INSTR psubd, 0, 0, 0
1069AVX_INSTR psubq, 0, 0, 0
1070AVX_INSTR psubsb, 0, 0, 0
1071AVX_INSTR psubsw, 0, 0, 0
1072AVX_INSTR psubusb, 0, 0, 0
1073AVX_INSTR psubusw, 0, 0, 0
1074AVX_INSTR ptest, 0, 0, 0
1075AVX_INSTR punpckhbw, 0, 0, 0
1076AVX_INSTR punpckhwd, 0, 0, 0
1077AVX_INSTR punpckhdq, 0, 0, 0
1078AVX_INSTR punpckhqdq, 0, 0, 0
1079AVX_INSTR punpcklbw, 0, 0, 0
1080AVX_INSTR punpcklwd, 0, 0, 0
1081AVX_INSTR punpckldq, 0, 0, 0
1082AVX_INSTR punpcklqdq, 0, 0, 0
1083AVX_INSTR pxor, 0, 0, 1
1084AVX_INSTR shufps, 1, 1, 0
1085AVX_INSTR subpd, 1, 0, 0
1086AVX_INSTR subps, 1, 0, 0
1087AVX_INSTR subsd, 1, 0, 0
1088AVX_INSTR subss, 1, 0, 0
1089AVX_INSTR unpckhpd, 1, 0, 0
1090AVX_INSTR unpckhps, 1, 0, 0
1091AVX_INSTR unpcklpd, 1, 0, 0
1092AVX_INSTR unpcklps, 1, 0, 0
1093AVX_INSTR xorpd, 1, 0, 1
1094AVX_INSTR xorps, 1, 0, 1
1095
1096; 3DNow instructions, for sharing code between AVX, SSE and 3DN
1097AVX_INSTR pfadd, 1, 0, 1
1098AVX_INSTR pfsub, 1, 0, 0
1099AVX_INSTR pfmul, 1, 0, 1
1100
1101; base-4 constants for shuffles
1102%assign i 0
1103%rep 256
1104    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
1105    %if j < 10
1106        CAT_XDEFINE q000, j, i
1107    %elif j < 100
1108        CAT_XDEFINE q00, j, i
1109    %elif j < 1000
1110        CAT_XDEFINE q0, j, i
1111    %else
1112        CAT_XDEFINE q, j, i
1113    %endif
1114%assign i i+1
1115%endrep
1116%undef i
1117%undef j
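
; The digits are read MSB-first, matching how shuffle immediates are usually
; written: e.g. q3120 expands to 0xD8, so "pshufd m0, m0, q3120" (illustrative)
; places source dwords 0,2,1,3 into destination elements 0..3.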

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
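
; e.g. (illustrative) "pmacsdd m0, m1, m2, m3" computes m0 = m1*m2 + m3: it
; assembles to vpmacsdd on XOP-capable cpus and falls back to pmulld+paddd
; elsewhere (in the fallback, m0 must not alias m2 or m3).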

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf