1f71323e297a928af368937089d3ed71239786f86Andreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3f71323e297a928af368937089d3ed71239786f86Andreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
9f71323e297a928af368937089d3ed71239786f86Andreas Huber;
10f71323e297a928af368937089d3ed71239786f86Andreas Huber
11f71323e297a928af368937089d3ed71239786f86Andreas Huber
12f71323e297a928af368937089d3ed71239786f86Andreas Huber%include "vpx_ports/x86_abi_support.asm"
13f71323e297a928af368937089d3ed71239786f86Andreas Huber
141b362b15af34006e6a11974088a46d42b903418eJohann;void vp8_idct_dequant_0_2x_sse2
15f71323e297a928af368937089d3ed71239786f86Andreas Huber; (
16f71323e297a928af368937089d3ed71239786f86Andreas Huber;   short *qcoeff       - 0
17f71323e297a928af368937089d3ed71239786f86Andreas Huber;   short *dequant      - 1
181b362b15af34006e6a11974088a46d42b903418eJohann;   unsigned char *dst  - 2
191b362b15af34006e6a11974088a46d42b903418eJohann;   int dst_stride      - 3
20f71323e297a928af368937089d3ed71239786f86Andreas Huber; )
;
; Purpose (as read from the instructions below): single-coefficient
; reconstruction for two blocks at once. The word at qcoeff+0 and the word at
; qcoeff+32 (presumably the first coefficient of the second block, 16 shorts
; further on - confirm against the caller) are each multiplied by dequant[0],
; biased with the `fours` constant and shifted right arithmetically by 3.
; The value derived from qcoeff+0 is added to pixels 0..3 and the value
; derived from qcoeff+32 to pixels 4..7 of four consecutive dst rows; the
; sums are clamped to 0..255 by packuswb and stored back in place.
; The 4 bytes at qcoeff+0 and the 4 bytes at qcoeff+32 are zeroed.
;
; Register roles: rax = qcoeff, later dst; rdx = dequant, later dst_stride;
;                 rcx = 3 * dst_stride; xmm4 = per-block residual;
;                 xmm5 = dequant factors, later all-zero for unpack/pack.
; Clobbers:       rax, rcx, rdx, xmm0-xmm5 (plus whatever the prolog/epilog
;                 macros touch).
; NOTE(review): `fours`, sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK,
; GET_GOT, RESTORE_GOT and UNSHADOW_ARGS are defined outside this listing
; (presumably in vpx_ports/x86_abi_support.asm and a constants section) -
; confirm their exact behaviour there.
; NOTE(review): every line of this listing begins with a
; "<line number><commit hash><author>" annotation fused onto the code; it
; looks like blame-view residue and would have to be stripped before the
; file can be assembled.
21f71323e297a928af368937089d3ed71239786f86Andreas Huber
221b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
231b362b15af34006e6a11974088a46d42b903418eJohannsym(vp8_idct_dequant_0_2x_sse2):
24f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
25f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
261b362b15af34006e6a11974088a46d42b903418eJohann    SHADOW_ARGS_TO_STACK 4
27f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
28f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
29f71323e297a928af368937089d3ed71239786f86Andreas Huber
30f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,            arg(1) ; dequant
31f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rax,            arg(0) ; qcoeff
32f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; movd fetches 32 bits, i.e. words 0 and 1 of each array; only word 0
    ; survives the broadcast further down.
33f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        xmm4,           [rax]
34f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        xmm5,           [rdx]
35f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Word lane 4 (start of the high quadword) receives the coefficient at
    ; byte offset 32, paired with the same dequant[0] factor.
36f71323e297a928af368937089d3ed71239786f86Andreas Huber        pinsrw      xmm4,           [rax+32],   4
37f71323e297a928af368937089d3ed71239786f86Andreas Huber        pinsrw      xmm5,           [rdx],      4
38f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Dequantize: low 16 bits of each lane-wise product.
39f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm4,           xmm5
40f71323e297a928af368937089d3ed71239786f86Andreas Huber
411b362b15af34006e6a11974088a46d42b903418eJohann    ; Zero out xmm5, for use unpacking
421b362b15af34006e6a11974088a46d42b903418eJohann        pxor        xmm5,           xmm5
431b362b15af34006e6a11974088a46d42b903418eJohann
44f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; clear coeffs
    ; (each movd writes 4 zero bytes, i.e. two shorts, at the given address)
451b362b15af34006e6a11974088a46d42b903418eJohann        movd        [rax],          xmm5
461b362b15af34006e6a11974088a46d42b903418eJohann        movd        [rax+32],       xmm5
47f71323e297a928af368937089d3ed71239786f86Andreas Huber;pshufb
    ; rax and rdx are re-purposed from here on: rax = dst, rdx = stride
    ; (sign-extended from the 32-bit int argument).
481b362b15af34006e6a11974088a46d42b903418eJohann        mov         rax,            arg(2) ; dst
491b362b15af34006e6a11974088a46d42b903418eJohann        movsxd      rdx,            dword ptr arg(3) ; dst_stride
501b362b15af34006e6a11974088a46d42b903418eJohann
    ; Broadcast word 0 across the low four lanes and word 4 across the high
    ; four lanes: one residual value per block, replicated over 4 pixels.
51f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshuflw     xmm4,           xmm4,       00000000b
52f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufhw     xmm4,           xmm4,       00000000b
53f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; rcx = 3 * dst_stride, used to address the fourth row when loading.
541b362b15af34006e6a11974088a46d42b903418eJohann        lea         rcx,            [rdx + rdx*2]
    ; Bias before the shift; presumably `fours` holds eight words of 4, which
    ; would make this (x + 4) >> 3 rounding - confirm at its definition.
55538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm4,           [GLOBAL(fours)]
56f71323e297a928af368937089d3ed71239786f86Andreas Huber
57f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm4,           3
58f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Load 8 bytes from each of the four destination rows.
59f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm0,           [rax]
601b362b15af34006e6a11974088a46d42b903418eJohann        movq        xmm1,           [rax+rdx]
611b362b15af34006e6a11974088a46d42b903418eJohann        movq        xmm2,           [rax+2*rdx]
62f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm3,           [rax+rcx]
63f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Widen the 8 bytes of each row to 8 words by interleaving with zero.
641b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   xmm0,           xmm5
651b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   xmm1,           xmm5
661b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   xmm2,           xmm5
671b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   xmm3,           xmm5
68f71323e297a928af368937089d3ed71239786f86Andreas Huber
69f71323e297a928af368937089d3ed71239786f86Andreas Huber
70f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Add to predict buffer
71f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm0,           xmm4
72f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm1,           xmm4
73f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm4
74f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm4
75f71323e297a928af368937089d3ed71239786f86Andreas Huber
76f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; pack up before storing
    ; (packuswb saturates each signed word to 0..255; the low 8 bytes hold
    ;  the row, the high 8 bytes come from the zero register and are unused)
771b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    xmm0,           xmm5
781b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    xmm1,           xmm5
791b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    xmm2,           xmm5
801b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    xmm3,           xmm5
81f71323e297a928af368937089d3ed71239786f86Andreas Huber
82f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; store blocks back out
83f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rax],          xmm0
84f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rax + rdx],    xmm1
85f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Advance dst by two rows so rows 2 and 3 are reachable as [rax] and
    ; [rax + rdx] (rcx is not used for the stores).
86f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rax,            [rax + 2*rdx]
87f71323e297a928af368937089d3ed71239786f86Andreas Huber
88f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rax],          xmm2
89f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rax + rdx],    xmm3
90f71323e297a928af368937089d3ed71239786f86Andreas Huber
91f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
    ; (mirror image of the prolog: GOT, shadowed args, then frame pointer)
92f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
93f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
94f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
95f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
96f71323e297a928af368937089d3ed71239786f86Andreas Huber
971b362b15af34006e6a11974088a46d42b903418eJohann;void vp8_idct_dequant_full_2x_sse2
981b362b15af34006e6a11974088a46d42b903418eJohann; (
991b362b15af34006e6a11974088a46d42b903418eJohann;   short *qcoeff       - 0
1001b362b15af34006e6a11974088a46d42b903418eJohann;   short *dequant      - 1
1011b362b15af34006e6a11974088a46d42b903418eJohann;   unsigned char *dst  - 2
1021b362b15af34006e6a11974088a46d42b903418eJohann;   int dst_stride      - 3
1031b362b15af34006e6a11974088a46d42b903418eJohann; )
;
; Purpose (as read from the instructions below): dequantize two blocks of 16
; coefficients each (64 bytes at qcoeff, multiplied lane-wise by the 16 words
; at dequant, which are shared by both blocks), run a two-pass 4x4 inverse
; transform on both blocks in parallel, add the result to four 8-byte rows of
; dst and store it back clamped to 0..255. The 64 coefficient bytes are
; zeroed right after they have been loaded.
;
; Register roles: rax = qcoeff; rdx = dequant, later dst_stride;
;                 rcx = 3 * dst_stride; rdi = dst;
;                 xmm0-xmm3 = the four rows (or columns) of both blocks,
;                 block 0 in the low 64 bits and block 1 in the high 64 bits;
;                 xmm4-xmm7 = temporaries (xmm7 doubles as the zero register).
; Alignment:      qcoeff and dequant are accessed through movdqa and through
;                 SSE2 memory operands, so both must be 16-byte aligned.
; Clobbers:       rax, rcx, rdx, xmm0-xmm5; rsi and rdi are pushed and popped
;                 here, xmm6/xmm7 are handed to SAVE_XMM 7 / RESTORE_XMM.
; NOTE(review): `fours`, `x_s1sqr2`, `x_c1sqr2less1` and the prolog/epilog
; macros are defined outside this listing - confirm their values and
; behaviour at their definitions.
; NOTE(review): every line of this listing begins with a
; "<line number><commit hash><author>" annotation fused onto the code; it
; looks like blame-view residue and would have to be stripped before the
; file can be assembled.
1041b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
1051b362b15af34006e6a11974088a46d42b903418eJohannsym(vp8_idct_dequant_full_2x_sse2):
106f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
107f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
1081b362b15af34006e6a11974088a46d42b903418eJohann    SHADOW_ARGS_TO_STACK 4
1091b362b15af34006e6a11974088a46d42b903418eJohann    SAVE_XMM 7
110f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
    ; NOTE(review): rsi is saved and restored but not referenced by any
    ; instruction visible in this routine.
111f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
112f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
113f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
114f71323e297a928af368937089d3ed71239786f86Andreas Huber
115f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; full path: all 16 coefficients of both blocks are loaded from
116f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; qcoeff and dequantized below
117f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rax,            arg(0) ; qcoeff
1181b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdx,            arg(1)  ; dequant
1191b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdi,            arg(2) ; dst
1201b362b15af34006e6a11974088a46d42b903418eJohann
121f71323e297a928af368937089d3ed71239786f86Andreas Huber
122f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Zero out xmm7, for use unpacking
    ; (used right below to clear the coefficients; xmm7 is overwritten during
    ;  the transform and zeroed again before the destination rows are loaded)
123f71323e297a928af368937089d3ed71239786f86Andreas Huber        pxor        xmm7,           xmm7
124f71323e297a928af368937089d3ed71239786f86Andreas Huber
125f71323e297a928af368937089d3ed71239786f86Andreas Huber
126f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; note the transpose of xmm1 and xmm2, necessary for shuffle
127f71323e297a928af368937089d3ed71239786f86Andreas Huber    ;   to spit out sensible data
    ; xmm0 = block 0 words 0..7,  xmm2 = block 0 words 8..15,
    ; xmm1 = block 1 words 0..7,  xmm3 = block 1 words 8..15
128f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,           [rax]
129f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,           [rax+16]
130f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,           [rax+32]
131f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm3,           [rax+48]
132f71323e297a928af368937089d3ed71239786f86Andreas Huber
133f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Clear out coeffs
134f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax],          xmm7
135f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax+16],       xmm7
136f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax+32],       xmm7
137f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax+48],       xmm7
138f71323e297a928af368937089d3ed71239786f86Andreas Huber
139f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; dequantize qcoeff buffer
    ; (the same 16 dequant words are applied to block 0 and to block 1)
140f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm0,           [rdx]
141f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm2,           [rdx+16]
142f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm1,           [rdx]
143f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm3,           [rdx+16]
    ; dequant is no longer needed: rdx becomes the sign-extended stride.
1441b362b15af34006e6a11974088a46d42b903418eJohann        movsxd      rdx,            dword ptr arg(3) ; dst_stride
145f71323e297a928af368937089d3ed71239786f86Andreas Huber
146f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; repack so block 0 row x and block 1 row x are together
    ; punpck{l,h}dq interleaves dwords of the two blocks; pshufd 11011000b
    ; then swaps the two middle dwords so each register ends up as
    ; [block 0 row x | block 1 row x]: xmm0 = row 0, xmm1 = row 1.
147f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm0
148f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm0,           xmm1
149f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm4,           xmm1
150f71323e297a928af368937089d3ed71239786f86Andreas Huber
151f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm0,           xmm0,       11011000b
152f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm1,           xmm4,       11011000b
153f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; same repack for the upper halves: xmm2 = row 2, xmm3 = row 3
154f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm2
155f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm3
156f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm4,           xmm3
157f71323e297a928af368937089d3ed71239786f86Andreas Huber
158f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm2,           xmm2,       11011000b
159f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm3,           xmm4,       11011000b
160f71323e297a928af368937089d3ed71239786f86Andreas Huber
161f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; first pass
    ; a1 is rebuilt from b1 without a spare register:
    ; 2*row2 + (row0 - row2) = row0 + row2.
162f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm2        ; b1 = 0-2
163f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm2        ;
164f71323e297a928af368937089d3ed71239786f86Andreas Huber
165f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
166f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm0        ; a1 = 0+2
167f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; pmulhw keeps the high 16 bits of the signed 16x16 product; adding the
    ; source back afterwards yields x + ((x * k) >> 16) for constant k.
168538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    ; (independent of the transform; rcx is only needed when dst is accessed)
1691b362b15af34006e6a11974088a46d42b903418eJohann        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
170f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
171f71323e297a928af368937089d3ed71239786f86Andreas Huber
172f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm3
173538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
174f71323e297a928af368937089d3ed71239786f86Andreas Huber
175f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    ; NOTE: as computed here, xmm7 = (ip3 term) - (ip1 term).
176f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm7,           xmm5        ; c1
177f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; d1 = (ip1 scaled by x_c1sqr2less1) + (ip3 scaled by x_s1sqr2);
    ; xmm4 keeps an unmodified copy of ip3 for the add-back.
178f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
179f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm3
180f71323e297a928af368937089d3ed71239786f86Andreas Huber
181538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
182f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1
183f71323e297a928af368937089d3ed71239786f86Andreas Huber
184538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
185f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm4
186f71323e297a928af368937089d3ed71239786f86Andreas Huber
187f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm5        ; d1
188f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm2        ; a1
189f71323e297a928af368937089d3ed71239786f86Andreas Huber
190f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm0        ; b1
191f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm3        ;0
192f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; NOTE: the transpose that follows labels xmm0's lanes 004..007 and
    ; xmm4's lanes 008..011, i.e. xmm0 (b1 - xmm7) is consumed as output
    ; row 1 and xmm4 (b1 + xmm7) as output row 2; read the ;1 / ;2 tags on
    ; the next two instructions with that in mind.
193f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm4,           xmm7        ;1
194f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm7        ;2
195f71323e297a928af368937089d3ed71239786f86Andreas Huber
196f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm6,           xmm3        ;3
197f71323e297a928af368937089d3ed71239786f86Andreas Huber
198f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; transpose for the second pass
    ; Lane maps list word lanes from most to least significant; the leading
    ; digit is the block (0 or 1), the remaining two digits the element index.
199f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
200f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
201f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
202f71323e297a928af368937089d3ed71239786f86Andreas Huber
203f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
204f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
205f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
206f71323e297a928af368937089d3ed71239786f86Andreas Huber
207f71323e297a928af368937089d3ed71239786f86Andreas Huber
208f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
209f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
210f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
211f71323e297a928af368937089d3ed71239786f86Andreas Huber
212f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
213f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
214f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
215f71323e297a928af368937089d3ed71239786f86Andreas Huber
216f71323e297a928af368937089d3ed71239786f86Andreas Huber
217f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
218f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
219f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
220f71323e297a928af368937089d3ed71239786f86Andreas Huber
221f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
222f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
223f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
224f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Swap the middle dwords so each register is again
    ; [block 0 | block 1]: xmm0 = column 0, xmm2 = column 2,
    ; xmm1 = column 1, xmm3 = column 3.
225f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm0,           xmm2,       11011000b
226f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm2,           xmm1,       11011000b
227f71323e297a928af368937089d3ed71239786f86Andreas Huber
228f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm1,           xmm5,       11011000b
229f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm3,           xmm7,       11011000b
230f71323e297a928af368937089d3ed71239786f86Andreas Huber
231f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; second pass
    ; Same butterfly as the first pass, applied to the transposed data, with
    ; the `fours` bias folded into a1 and b1 ahead of the final >> 3.
232f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm2            ; b1 = 0-2
233f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm2
234f71323e297a928af368937089d3ed71239786f86Andreas Huber
235f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
236f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm0            ; a1 = 0+2
237f71323e297a928af368937089d3ed71239786f86Andreas Huber
238538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
239f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
240f71323e297a928af368937089d3ed71239786f86Andreas Huber
241f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm3
242538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
243f71323e297a928af368937089d3ed71239786f86Andreas Huber
244f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    ; NOTE: as in the first pass, xmm7 = (ip3 term) - (ip1 term).
245f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm7,           xmm5            ; c1
246f71323e297a928af368937089d3ed71239786f86Andreas Huber
247f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
248f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm3
249f71323e297a928af368937089d3ed71239786f86Andreas Huber
250538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
251f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1
252f71323e297a928af368937089d3ed71239786f86Andreas Huber
253538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
254f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm4
255f71323e297a928af368937089d3ed71239786f86Andreas Huber
256f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm5            ; d1
    ; Bias b1 (xmm0) and a1 (xmm2) once each, so every one of the four
    ; outputs formed from them below carries the bias exactly once.
257538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm0,           [GLOBAL(fours)]
258f71323e297a928af368937089d3ed71239786f86Andreas Huber
259538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm2,           [GLOBAL(fours)]
260f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm2            ; a1
261f71323e297a928af368937089d3ed71239786f86Andreas Huber
262f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm0            ; b1
263f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm3            ;0
264f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; NOTE: as in the first pass, the transpose below consumes xmm0 as
    ; lanes 004..007 and xmm4 as lanes 008..011, whatever the ;1 / ;2 tags
    ; on the next two instructions suggest.
265f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm4,           xmm7            ;1
266f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm7            ;2
267f71323e297a928af368937089d3ed71239786f86Andreas Huber
268f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm6,           xmm3            ;3
    ; Final arithmetic shift right by 3 on all four outputs.
269f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm2,           3
270f71323e297a928af368937089d3ed71239786f86Andreas Huber
271f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm0,           3
272f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm4,           3
273f71323e297a928af368937089d3ed71239786f86Andreas Huber
274f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm6,           3
275f71323e297a928af368937089d3ed71239786f86Andreas Huber
276f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; transpose to save
    ; (identical shuffle sequence to the transpose between the two passes)
277f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
278f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
279f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
280f71323e297a928af368937089d3ed71239786f86Andreas Huber
281f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
282f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
283f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
284f71323e297a928af368937089d3ed71239786f86Andreas Huber
285f71323e297a928af368937089d3ed71239786f86Andreas Huber
286f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
287f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
288f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
289f71323e297a928af368937089d3ed71239786f86Andreas Huber
290f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
291f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
292f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
293f71323e297a928af368937089d3ed71239786f86Andreas Huber
294f71323e297a928af368937089d3ed71239786f86Andreas Huber
295f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
296f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
297f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
298f71323e297a928af368937089d3ed71239786f86Andreas Huber
299f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
300f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
301f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
302f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; After these shuffles xmm0..xmm3 line up with the four destination
    ; rows they are added to below ([rdi], [rdi+rdx], [rdi+2*rdx], [rdi+rcx]).
303f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm0,           xmm2,       11011000b
304f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm2,           xmm1,       11011000b
305f71323e297a928af368937089d3ed71239786f86Andreas Huber
306f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm1,           xmm5,       11011000b
307f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm3,           xmm7,       11011000b
308f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; xmm7 was used as scratch above; re-zero it for the unpack/pack steps.
309f71323e297a928af368937089d3ed71239786f86Andreas Huber        pxor        xmm7,           xmm7
310f71323e297a928af368937089d3ed71239786f86Andreas Huber
311f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Load up predict blocks
    ; (8 bytes per row, widened to words by interleaving with zero, then
    ;  added to the matching residual row; rows 0/1 first, then rows 2/3)
3121b362b15af34006e6a11974088a46d42b903418eJohann        movq        xmm4,           [rdi]
3131b362b15af34006e6a11974088a46d42b903418eJohann        movq        xmm5,           [rdi+rdx]
314f71323e297a928af368937089d3ed71239786f86Andreas Huber
315f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm4,           xmm7
316f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm5,           xmm7
317f71323e297a928af368937089d3ed71239786f86Andreas Huber
318f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm0,           xmm4
319f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm1,           xmm5
320f71323e297a928af368937089d3ed71239786f86Andreas Huber
3211b362b15af34006e6a11974088a46d42b903418eJohann        movq        xmm4,           [rdi+2*rdx]
3221b362b15af34006e6a11974088a46d42b903418eJohann        movq        xmm5,           [rdi+rcx]
323f71323e297a928af368937089d3ed71239786f86Andreas Huber
324f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm4,           xmm7
325f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm5,           xmm7
326f71323e297a928af368937089d3ed71239786f86Andreas Huber
327f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm4
328f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm5
329f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; NOTE(review): no jump to .finish is visible in this routine; the label
    ; is only reached by fall-through here.
330f71323e297a928af368937089d3ed71239786f86Andreas Huber.finish:
331f71323e297a928af368937089d3ed71239786f86Andreas Huber
332f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; pack up before storing
    ; (packuswb saturates each signed word to 0..255; only the low 8 bytes
    ;  of each register are stored)
333f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm0,           xmm7
334f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm1,           xmm7
335f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm2,           xmm7
336f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm3,           xmm7
337f71323e297a928af368937089d3ed71239786f86Andreas Huber
338f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; store blocks back out
339f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi],          xmm0
340f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi + rdx],    xmm1
3411b362b15af34006e6a11974088a46d42b903418eJohann        movq        [rdi + rdx*2],  xmm2
3421b362b15af34006e6a11974088a46d42b903418eJohann        movq        [rdi + rcx],    xmm3
343f71323e297a928af368937089d3ed71239786f86Andreas Huber
344f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
    ; (exact reverse of the prolog: rdi, rsi, GOT, XMM save area, shadowed
    ;  args, frame pointer)
345f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rdi
346f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rsi
347f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
3481b362b15af34006e6a11974088a46d42b903418eJohann    RESTORE_XMM
349f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
350f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
351f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
352f71323e297a928af368937089d3ed71239786f86Andreas Huber
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only reconstruction for two horizontally adjacent 4-pixel-wide,
; 4-row regions of dst (8 bytes are processed per row).
;
; For each of the four rows at dst + n*dst_stride (n = 0..3):
;   bytes 0..3 += (dc[0] + 4) >> 3
;   bytes 4..7 += (dc[1] + 4) >> 3
; The sums are computed in 16-bit lanes and written back with unsigned
; saturation (packuswb).  qcoeff and dequant are accepted for signature
; compatibility but are not read by this routine.
;
; Register roles after setup:
;   rdi  = dst
;   rdx  = dc pointer, then dst_stride (sign-extended)
;   rcx  = 3 * dst_stride
;   xmm4 = rounded/shifted dc, dc[0] in words 0..3 and dc[1] in words 4..7
;   xmm5 = zero (byte->word unpack and pack filler)
; Only xmm0-xmm5 are used, so no SAVE_XMM/RESTORE_XMM pair is needed here.
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is supplied separately, so qcoeff is never loaded
        mov         rdi,            arg(2) ; dst
        mov         rdx,            arg(4) ; dc

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]

    ; rdx is free again: reuse it for the stride, rcx = 3 * stride
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx, [rdx + rdx*2]

    ; Load up predict blocks (8 bytes from each of the 4 rows)
        movq        xmm0,           [rdi]
        movq        xmm1,           [rdi+rdx*1]
        movq        xmm2,           [rdi+rdx*2]
        movq        xmm3,           [rdi+rcx]

    ; Duplicate and expand dc across:
    ;   after punpcklwd: dc0 dc0 dc1 dc1 (low four words)
    ;   after punpckldq: dc0 x4 in words 0..3, dc1 x4 in words 4..7
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4

    ; Rounding to dequant and downshift: (dc + 4) >> 3, arithmetic shift
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (words -> bytes with unsigned saturation)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Full dequantize + inverse transform + add-to-prediction for two blocks
; processed side by side (8 output bytes per row, 4 rows).
;
; Steps performed:
;   1. Load 64 bytes (2 x 16 words) from qcoeff and overwrite them with zero.
;   2. Multiply each block's 16 words by the 16 words at dequant (the same
;      dequant table is applied to both blocks).
;   3. Interleave so each register holds one row of block 0 in words 0..3
;      and the same row of block 1 in words 4..7.
;   4. Overwrite coefficient 0 of block 0 with dc[0] and of block 1 with
;      dc[1].
;   5. Run the two butterfly passes with a transpose between and after them;
;      the second pass adds 4 and shifts right by 3 (arithmetic).
;   6. Add the result to the 4 rows of dst and store back with unsigned
;      saturation.
;
; qcoeff is accessed with movdqa and dequant as a pmullw memory operand, so
; both must be 16-byte aligned.
;
; xmm6/xmm7 are used, hence the SAVE_XMM 7 / RESTORE_XMM pair.
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; general case: both blocks get the full inverse transform
    ; dc is supplied separately and replaces coefficient 0 of each block
        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant

        mov         rdi,            arg(2) ; dst

    ; Zero out xmm7, used below to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    ;   xmm0 = block 0 words 0..7,  xmm2 = block 0 words 8..15
    ;   xmm1 = block 1 words 0..7,  xmm3 = block 1 words 8..15
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same 16 dequant words for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; DC component (rdx now points at dc; dequant is no longer needed)
        mov         rdx,            arg(4)

    ; repack so block 0 row x and block 1 row x are together
    ;   pshufd 11011000b swaps the two middle dwords, turning the
    ;   dword-interleaved halves into "row of block 0, row of block 1"
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; insert DC component: word 0 = block 0 coeff 0, word 4 = block 1 coeff 0
        pinsrw      xmm0,           [rdx],      0
        pinsrw      xmm0,           [rdx+2],    4

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ;0

        paddw       xmm4,           xmm7        ;1
        psubw       xmm0,           xmm7        ;2

        psubw       xmm6,           xmm3        ;3

    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4

        paddw       xmm3,           xmm5            ; d1

    ; fold the rounding bias into a1/b1 so all four outputs carry +4
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ;0

        paddw       xmm4,           xmm7            ;1
        psubw       xmm0,           xmm7            ;2

        psubw       xmm6,           xmm3            ;3
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; re-zero it for unpacking/packing
        pxor        xmm7,           xmm7

    ; Load up predict blocks
    ;   rdx = dst_stride, rcx = 3 * dst_stride; both stay live until the stores
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]
        lea         rcx,            [rdx + rdx*2]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+rdx*2]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (words -> bytes with unsigned saturation)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out, reusing rdx / rcx from the predict load
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3


    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
698f71323e297a928af368937089d3ed71239786f86Andreas Huber
SECTION_RODATA
; Constant tables used as 128-bit SSE2 memory operands; each is 8 identical
; 16-bit words and is kept 16-byte aligned.

; Rounding bias: 4 per word, added before the arithmetic shift right by 3.
align 16
fours:
    times 8 dw 0x0004

; 0x8A8C = 35468, and 35468 / 65536 ~= 0.5412 ~= sqrt(2) * sin(pi/8).
; pmulhw is a signed multiply, so the word is read as 35468 - 65536; the
; code adds the input back after pmulhw to obtain x * 35468 / 65536.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C

; 0x4E7B = 20091, and 20091 / 65536 ~= 0.3066 ~= sqrt(2) * cos(pi/8) - 1.
; The "less1" is restored by adding the input back after pmulhw.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B
709