;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; Supplies the ABI helper macros used throughout this file (sym, arg, GLOBAL,
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS, GET_GOT / RESTORE_GOT, SECTION_RODATA).
%include "vpx_ports/x86_abi_support.asm"

; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16 bit fixed point versions of two
; * multiply constants:
; *         1.   sqrt(2) * cos (pi/8)
; *         2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use a trick of
; *         x * a = x + x*(a-1)
; * so
; *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
; *
; * For the second constant, because its 16 bit version is 35468, which
; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative
; * number. The product is therefore corrected by adding x back:
; *         (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
; *
; **************************************************************************/


;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; 4x4 inverse DCT on 16-bit coefficients, implemented with MMX.
;
;   arg(0) input  : 16 contiguous shorts, read as four 8-byte rows
;   arg(1) output : four rows of 4 shorts; row n is written to
;                   output + n * pitch
;   arg(2) pitch  : distance between output rows in BYTES (the sign-extended
;                   value is added to the output pointer unscaled)
;
; Structure: the same butterfly is run twice, each time followed by a 4x4
; word transpose. Each MMX register holds one row (four 16-bit lanes), so a
; butterfly combines rows lane by lane; the transpose in between makes the
; second pass operate along the other dimension. The second pass adds 4 to
; a1/b1 and arithmetic-shifts every result right by 3 (round, then divide
; by 8).
;
; Multiplies use pmulhw (signed high half of a 16x16 product) followed by an
; add of the multiplicand:
;   x + pmulhw(x, x_c1sqr2less1)  ->  x * sqrt(2) * cos(pi/8)
;   x + pmulhw(x, x_s1sqr2)       ->  x * sqrt(2) * sin(pi/8)
; (x_s1sqr2 has its top bit set, so pmulhw sees it as negative; adding x
; back compensates.)
;
; Lane comments such as "03 02 01 00" list the four words of a register from
; high to low as <row><column>.
;
; Scratch: rax, rdx, mm0-mm7.
; NOTE(review): no emms is executed here; presumably the caller clears the
; MMX state before using x87 code -- confirm against callers.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx)
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

        mov         rax,            arg(0)          ; rax = input
        mov         rdx,            arg(1)          ; rdx = output

        movq        mm0,            [rax   ]        ; row 0
        movq        mm1,            [rax+ 8]        ; row 1

        movq        mm2,            [rax+16]        ; row 2
        movq        mm3,            [rax+24]        ; row 3

        movsxd      rax,            dword ptr arg(2) ; rax = pitch (input pointer no longer needed)

        ; ---- pass 1 -------------------------------------------------------
        psubw       mm0,            mm2             ; b1 = row0 - row2
        paddw       mm2,            mm2             ; 2 * row2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = row0 + row2  (2*row2 + row0 - row2)

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; row1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; row3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 = row3*cos term - row1*sin term

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep row3 for the add-back below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; row1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; row3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1 = row1*cos term + row3*sin term
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; mm2 = a1 + d1 -> row 0

        paddw       mm4,            mm7             ; mm4 = b1 + c1 -> used as row 2 by the transpose
        psubw       mm0,            mm7             ; mm0 = b1 - c1 -> used as row 1 by the transpose

        psubw       mm6,            mm3             ; mm6 = a1 - d1 -> row 3

        ; ---- transpose rows (mm2, mm0, mm4, mm6) -> (mm0, mm1, mm2, mm5) ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        movq        mm3,            mm5             ; 33 23 13 03 (pass 2 expects its 4th input in mm3)

        ; ---- pass 2: same butterfly on mm0..mm3, plus rounding -------------
        psubw       mm0,            mm2             ; b1 = in0 - in2
        paddw       mm2,            mm2             ; 2 * in2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = in0 + in2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; in1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; in3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 = in3*cos term - in1*sin term

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep in3 for the add-back below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; in1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; in3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1 = in1*cos term + in3*sin term
        paddw       mm0,            [GLOBAL(fours)] ; b1 += 4 (rounding for the >> 3 below)

        paddw       mm2,            [GLOBAL(fours)] ; a1 += 4 (rounding for the >> 3 below)
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; mm2 = a1 + d1

        paddw       mm4,            mm7             ; mm4 = b1 + c1
        psubw       mm0,            mm7             ; mm0 = b1 - c1

        psubw       mm6,            mm3             ; mm6 = a1 - d1
        psraw       mm2,            3               ; every result: (value + 4) >> 3

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose back (mm2, mm0, mm4, mm6) -> (mm0, mm1, mm2, mm5) ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        ; ---- store four 8-byte rows, pitch bytes apart ----------------------
        movq        [rdx],          mm0             ; output + 0 * pitch

        movq        [rdx+rax],      mm1             ; output + 1 * pitch
        movq        [rdx+rax*2],    mm2             ; output + 2 * pitch

        add         rdx,            rax             ; advance one row so base + 2*pitch reaches row 3
        movq        [rdx+rax*2],    mm5             ; output + 3 * pitch

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; DC-only variant of the 4x4 inverse DCT: takes the first coefficient,
; computes (input[0] + 4) >> 3 (arithmetic shift) and writes that value to
; all 16 output positions.
;
;   arg(0) input  : pointer to the coefficients; only input[0] affects the
;                   result (a 32-bit load is done, the upper word is dropped
;                   by the broadcast)
;   arg(1) output : four rows of 4 shorts; row n is written to
;                   output + n * pitch
;   arg(2) pitch  : distance between output rows in BYTES
;
; Scratch: rax, rdx, mm0.
; NOTE(review): no emms is executed here; presumably the caller clears the
; MMX state -- confirm against callers.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_1_mmx)
sym(vp8_short_idct4x4llm_1_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

        mov         rax,            arg(0)          ; rax = input
        movd        mm0,            [rax]           ; low word = input[0]

        paddw       mm0,            [GLOBAL(fours)] ; + 4: rounding for the >> 3 below
        mov         rdx,            arg(1)          ; rdx = output

        psraw       mm0,            3               ; (input[0] + 4) >> 3
        movsxd      rax,            dword ptr arg(2) ; rax = pitch (input pointer no longer needed)

        punpcklwd   mm0,            mm0             ; duplicate word 0 into words 0 and 1
        punpckldq   mm0,            mm0             ; duplicate low dword -> all four words equal

        movq        [rdx],          mm0             ; output + 0 * pitch
        movq        [rdx+rax],      mm0             ; output + 1 * pitch

        movq        [rdx+rax*2],    mm0             ; output + 2 * pitch
        add         rdx,            rax             ; advance one row so base + 2*pitch reaches row 3

        movq        [rdx+rax*2],    mm0             ; output + 3 * pitch


    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
;                               unsigned char *dst_ptr, int pitch, int stride)
;
; Adds the DC-only inverse-transform value (input_dc + 4) >> 3 to a 4x4 block
; of prediction bytes and stores the result, clamped to 0..255, to the
; destination.
;
;   arg(0) input_dc : DC coefficient (only the low 16 bits are used)
;   arg(1) pred_ptr : 4x4 prediction block, rows `pitch` bytes apart
;   arg(2) dst_ptr  : 4x4 destination block, rows `stride` bytes apart
;   arg(3) pitch    : prediction row distance in bytes
;   arg(4) stride   : destination row distance in bytes
;
; rsi and rdi are saved and restored around the body.
; Scratch: rax, rdx, mm0-mm5.
; NOTE(review): no emms is executed here; presumably the caller clears the
; MMX state -- confirm against callers.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,            arg(1) ;s -- prediction
        mov         rdi,            arg(2) ;d -- destination
        movsxd      rax,            dword ptr arg(4) ;stride
        movsxd      rdx,            dword ptr arg(3) ;pitch
        pxor        mm0,            mm0              ; mm0 = 0, used for byte<->word (un)packing

        movd        mm5,            arg(0) ;input_dc

        paddw       mm5,            [GLOBAL(fours)]  ; + 4: rounding for the >> 3 below

        psraw       mm5,            3                ; (input_dc + 4) >> 3

        punpcklwd   mm5,            mm5              ; duplicate word 0 into words 0 and 1
        punpckldq   mm5,            mm5              ; duplicate low dword -> dc in all four words

        ; row 0
        movd        mm1,            [rsi]            ; 4 prediction bytes
        punpcklbw   mm1,            mm0              ; zero-extend bytes to words
        paddsw      mm1,            mm5              ; add dc (signed saturating)
        packuswb    mm1,            mm0              ; pack to bytes, saturating to 0..255
        movd        [rdi],          mm1

        ; row 1
        movd        mm2,            [rsi+rdx]
        punpcklbw   mm2,            mm0
        paddsw      mm2,            mm5
        packuswb    mm2,            mm0              ; pack to bytes, saturating to 0..255
        movd        [rdi+rax],      mm2

        ; row 2
        movd        mm3,            [rsi+2*rdx]
        punpcklbw   mm3,            mm0
        paddsw      mm3,            mm5
        packuswb    mm3,            mm0              ; pack to bytes, saturating to 0..255
        movd        [rdi+2*rax],    mm3

        ; row 3: advance both pointers one row so base + 2*step reaches row 3
        add         rdi,            rax
        add         rsi,            rdx
        movd        mm4,            [rsi+2*rdx]
        punpcklbw   mm4,            mm0
        paddsw      mm4,            mm5
        packuswb    mm4,            mm0              ; pack to bytes, saturating to 0..255
        movd        [rdi+2*rax],    mm4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constants. Each is one 16-bit value repeated across the four
; word lanes of an MMX register.
SECTION_RODATA
align 16
x_s1sqr2:
    ; 0x8A8C = 35468 ~= sqrt(2) * sin(pi/8) * 65536. The top bit is set, so
    ; pmulhw treats it as negative; users add the multiplicand back.
    times 4 dw 0x8A8C
align 16
x_c1sqr2less1:
    ; 0x4E7B = 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536. Users add the
    ; multiplicand back to restore the "+ 1".
    times 4 dw 0x4E7B
align 16
fours:
    ; Rounding bias added before the arithmetic shift right by 3.
    times 4 dw 0x0004
293