;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

%ifndef THIRD_PARTY_X86INC_X86INC_ASM_
%define THIRD_PARTY_X86INC_X86INC_ASM_

; TODO(wolenetz): Consider either updating this customized version to base from
; a more recent original, or switching to using third_party/ffmpeg's version of
; this abstraction layer.  See http://crbug.com/175029

%define program_name ff

%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif
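; e.g. with PREFIX defined, mangle(foo) expands to _foo; otherwise the symbol
; is left as foo.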

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%ifdef CHROMIUM
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
%else
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
%endif

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
; For Chromium we may build PIC code even on 32-bit systems.
%ifndef CHROMIUM
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
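;
; A hypothetical body for such a function might then use the declared names
; (illustrative only, not part of this header):
;     cglobal foo, 2,3,0, dst, src, tmp
;         mov    tmpd, [srcq]
;         mov    [dstq], tmpd
;         RET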

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we'd need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons,
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
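;
; e.g. on 64-bit Unix, argument 0 arrives in rdi: r0/r0q is rdi, r0d is edi,
; r0w is di, r0b is dil, and r0m/r0mp alias the register itself; on x86_32
; the same names resolve to a 32-bit register or a stack slot.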

%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro
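
; e.g. DECLARE_REG_TMP 2,0,1 assigns t0=r2, t1=r0 and t2=r1, so per-arch
; scratch registers can be renumbered without touching the code that uses them.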

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; PUSH/POP and the rsp-modifying SUB/ADD wrappers below keep stack_offset
; (the current displacement from the stack pointer at function entry) up to
; date, so that rNm stack references stay valid.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro
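
; e.g. DEFINE_ARGS dst, src defines dstq/dstd/dstw/dstb/dstm as aliases for
; r0q/r0d/r0w/r0b/r0m and the src* names for r1*, first undefining any names
; left over from a previous invocation.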

%ifdef WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; undo exactly the stack_offset bookkeeping done by WIN64_SPILL_XMM
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

; Begin chromium edits
%ifdef CHROMIUM
; Change the order of registers so we can get the lower 8 bits of the 5th and
; 6th arguments.
DECLARE_REG 0, esi, esi, si, null, [esp + stack_offset + 4]
DECLARE_REG 1, edi, edi, di, null, [esp + stack_offset + 8]
DECLARE_REG 2, ecx, ecx, cx, cl,   [esp + stack_offset + 12]
DECLARE_REG 3, edx, edx, dx, dl,   [esp + stack_offset + 16]
DECLARE_REG 4, eax, eax, ax, al,   [esp + stack_offset + 20]
DECLARE_REG 5, ebx, ebx, bx, bl,   [esp + stack_offset + 24]
%else
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
%endif
; End chromium edits
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
%ifdef CHROMIUM
    PUSH_IF_USED 0
    PUSH_IF_USED 1
    PUSH_IF_USED 5
%else
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
%endif
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
%ifdef CHROMIUM
    POP_IF_USED 5
    POP_IF_USED 1
    POP_IF_USED 0
%else
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
%endif
    ret
%endmacro

%macro REP_RET 0
%ifdef CHROMIUM
    ; With the Chromium register order, PROLOGUE pushes callee-saved regs
    ; starting at r0, so a bare ret is only safe when nothing was pushed.
    %if regs_used > 0
        RET
    %else
        rep ret
    %endif
%else
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endif
%endmacro

%endif ;======================================================================

%ifndef WIN64
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro
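
; e.g. (hypothetical):
;     const pw_64, times 8 dw 64
; emits a global, mangled data symbol (ff_pw_64 here) initialized in place.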

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
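
; e.g. CAT_XDEFINE m, 3, mm3 is equivalent to %xdefine m3 mm3; these helpers
; exist because the two halves of the name usually come from macro parameters.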

%macro INIT_MMX 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    ; %%i continues from 8: undefine m8..m15, which a previous INIT_XMM or
    ; INIT_YMM may have defined on x86_64
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_XMM 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro
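
; e.g. after INIT_XMM, "mova m0, [r0]" assembles as "movdqa xmm0, [r0]";
; after INIT_MMX the same line becomes "movq mm0, [r0]", so one function body
; can be instantiated for several register files (illustrative).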

INIT_MMX

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
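
; e.g. "SWAP 0, 1" renames the registers: code below the SWAP that says m0
; now assembles to the physical register previously named m1, and vice versa,
; at zero runtime cost (illustrative).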

; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE %1_m, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, %1_m %+ %%i
    CAT_XDEFINE n, m %+ %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
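
; e.g. (hypothetical):
;     SAVE_MM_PERMUTATION foo_internal   ; at the end of foo_internal's body
;     ...
;     call foo_internal                  ; the call macro reloads it here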

; Substitutions that reduce instruction size but are functionally equivalent
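; (x86 sign-extends 8-bit immediates, so +128 needs a full imm32 encoding
; while -128 fits in an imm8; substituting the opposite operation saves
; 3 bytes per instruction)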
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %if sizeof%5==32
        v%1 %5, %6, %7
    %else
        %if sizeof%5==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
%macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
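
; e.g. after "AVX_INSTR addps, 1, 0", a 3-operand "addps m0, m1, m2" emits
; "vaddps xmm0, xmm1, xmm2" when avx_enabled, and otherwise falls back to
; "movaps xmm0, xmm1" followed by "addps xmm0, xmm2" (illustrative).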

AVX_INSTR addpd, 1, 0
AVX_INSTR addps, 1, 0
AVX_INSTR addsd, 1, 0
AVX_INSTR addss, 1, 0
AVX_INSTR addsubpd, 1, 0
AVX_INSTR addsubps, 1, 0
AVX_INSTR andpd, 1, 0
AVX_INSTR andps, 1, 0
AVX_INSTR andnpd, 1, 0
AVX_INSTR andnps, 1, 0
AVX_INSTR blendpd, 1, 0
AVX_INSTR blendps, 1, 0
AVX_INSTR blendvpd, 1, 0
AVX_INSTR blendvps, 1, 0
AVX_INSTR cmppd, 1, 0
AVX_INSTR cmpps, 1, 0
AVX_INSTR cmpsd, 1, 0
AVX_INSTR cmpss, 1, 0
AVX_INSTR divpd, 1, 0
AVX_INSTR divps, 1, 0
AVX_INSTR divsd, 1, 0
AVX_INSTR divss, 1, 0
AVX_INSTR dppd, 1, 0
AVX_INSTR dpps, 1, 0
AVX_INSTR haddpd, 1, 0
AVX_INSTR haddps, 1, 0
AVX_INSTR hsubpd, 1, 0
AVX_INSTR hsubps, 1, 0
AVX_INSTR maxpd, 1, 0
AVX_INSTR maxps, 1, 0
AVX_INSTR maxsd, 1, 0
AVX_INSTR maxss, 1, 0
AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
AVX_INSTR mulsd, 1, 0
AVX_INSTR mulss, 1, 0
AVX_INSTR orpd, 1, 0
AVX_INSTR orps, 1, 0
AVX_INSTR packsswb, 0, 0
AVX_INSTR packssdw, 0, 0
AVX_INSTR packuswb, 0, 0
AVX_INSTR packusdw, 0, 0
AVX_INSTR paddb, 0, 0
AVX_INSTR paddw, 0, 0
AVX_INSTR paddd, 0, 0
AVX_INSTR paddq, 0, 0
AVX_INSTR paddsb, 0, 0
AVX_INSTR paddsw, 0, 0
AVX_INSTR paddusb, 0, 0
AVX_INSTR paddusw, 0, 0
AVX_INSTR palignr, 0, 1
AVX_INSTR pand, 0, 0
AVX_INSTR pandn, 0, 0
AVX_INSTR pavgb, 0, 0
AVX_INSTR pavgw, 0, 0
AVX_INSTR pblendvb, 0, 0
AVX_INSTR pblendw, 0, 1
AVX_INSTR pcmpestri, 0, 0
AVX_INSTR pcmpestrm, 0, 0
AVX_INSTR pcmpistri, 0, 0
AVX_INSTR pcmpistrm, 0, 0
AVX_INSTR pcmpeqb, 0, 0
AVX_INSTR pcmpeqw, 0, 0
AVX_INSTR pcmpeqd, 0, 0
AVX_INSTR pcmpeqq, 0, 0
AVX_INSTR pcmpgtb, 0, 0
AVX_INSTR pcmpgtw, 0, 0
AVX_INSTR pcmpgtd, 0, 0
AVX_INSTR pcmpgtq, 0, 0
AVX_INSTR phaddw, 0, 0
AVX_INSTR phaddd, 0, 0
AVX_INSTR phaddsw, 0, 0
AVX_INSTR phsubw, 0, 0
AVX_INSTR phsubd, 0, 0
AVX_INSTR phsubsw, 0, 0
AVX_INSTR pmaddwd, 0, 0
AVX_INSTR pmaddubsw, 0, 0
AVX_INSTR pmaxsb, 0, 0
AVX_INSTR pmaxsw, 0, 0
AVX_INSTR pmaxsd, 0, 0
AVX_INSTR pmaxub, 0, 0
AVX_INSTR pmaxuw, 0, 0
AVX_INSTR pmaxud, 0, 0
AVX_INSTR pminsb, 0, 0
AVX_INSTR pminsw, 0, 0
AVX_INSTR pminsd, 0, 0
AVX_INSTR pminub, 0, 0
AVX_INSTR pminuw, 0, 0
AVX_INSTR pminud, 0, 0
AVX_INSTR pmulhuw, 0, 0
AVX_INSTR pmulhrsw, 0, 0
AVX_INSTR pmulhw, 0, 0
AVX_INSTR pmullw, 0, 0
AVX_INSTR pmulld, 0, 0
AVX_INSTR pmuludq, 0, 0
AVX_INSTR pmuldq, 0, 0
AVX_INSTR por, 0, 0
AVX_INSTR psadbw, 0, 0
AVX_INSTR pshufb, 0, 0
AVX_INSTR psignb, 0, 0
AVX_INSTR psignw, 0, 0
AVX_INSTR psignd, 0, 0
AVX_INSTR psllw, 0, 0
AVX_INSTR pslld, 0, 0
AVX_INSTR psllq, 0, 0
AVX_INSTR pslldq, 0, 0
AVX_INSTR psraw, 0, 0
AVX_INSTR psrad, 0, 0
AVX_INSTR psrlw, 0, 0
AVX_INSTR psrld, 0, 0
AVX_INSTR psrlq, 0, 0
AVX_INSTR psrldq, 0, 0
AVX_INSTR psubb, 0, 0
AVX_INSTR psubw, 0, 0
AVX_INSTR psubd, 0, 0
AVX_INSTR psubq, 0, 0
AVX_INSTR psubsb, 0, 0
AVX_INSTR psubsw, 0, 0
AVX_INSTR psubusb, 0, 0
AVX_INSTR psubusw, 0, 0
AVX_INSTR punpckhbw, 0, 0
AVX_INSTR punpckhwd, 0, 0
AVX_INSTR punpckhdq, 0, 0
AVX_INSTR punpckhqdq, 0, 0
AVX_INSTR punpcklbw, 0, 0
AVX_INSTR punpcklwd, 0, 0
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR pxor, 0, 0
AVX_INSTR shufps, 0, 1
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
AVX_INSTR subss, 1, 0
AVX_INSTR unpckhpd, 1, 0
AVX_INSTR unpckhps, 1, 0
AVX_INSTR unpcklpd, 1, 0
AVX_INSTR unpcklps, 1, 0
AVX_INSTR xorpd, 1, 0
AVX_INSTR xorps, 1, 0

; 3DNow! instructions, for sharing code between the AVX, SSE and 3DNow! versions
AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0

;=============================================================================
; Chromium extensions
;=============================================================================

%ifdef CHROMIUM
; Always build PIC code on Mac or Linux for Chromium.
%ifdef MACHO
%define PIC
%endif
%ifdef ELF
%define PIC
%endif

;
; LOAD_SYM %1 (reg), %2 (sym)
; Copies the address of a local symbol to the specified register.
;

%macro LOAD_SYM 2

%ifdef PIC
  ; PIC: the call pushes the address of the following add instruction;
  ; %%geteip fetches it from the stack, then the assemble-time displacement
  ; %2 - $ converts it into the address of %2.
  call      %%geteip
  add       %1, %2 - $
  jmp       %%end
%%geteip:
  mov       %1, [rsp]
  ret
%%end:

%else
  lea       %1, [%2]
%endif

%endmacro
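
; e.g. (hypothetical): "LOAD_SYM r0, my_table" leaves the absolute address of
; my_table in r0 in both PIC and non-PIC builds.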

;
; MOVq %1 (xmm), %2 (reg)
; MOVq %1 (reg), %2 (xmm)
; Copies a general-purpose register to an XMM register, and vice versa.
;
%macro MOVq 2
%if gprsize == 8
  movq      %1, %2
%else
  movd      %1, %2
%endif
%endmacro
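
; e.g. "MOVq xmm0, r0" picks movq on x86_64 (8-byte GPRs) and movd on x86_32
; (4-byte GPRs), so callers don't need their own %if.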

%endif  ; CHROMIUM

%endif  ; THIRD_PARTY_X86INC_X86INC_ASM_
