1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan%include "vpx_ports/x86_abi_support.asm"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_idct_dequant_0_2x_sse2
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan; (
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *qcoeff       - 0
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *dequant      - 1
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   unsigned char *dst  - 2
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   int dst_stride      - 3
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan; )
;
; DC-only reconstruction for two blocks in one call.
;
; What the code below does:
;   * reads the word at qcoeff[0] and the word 32 bytes further on
;     (qcoeff[16]), multiplies each by dequant[0], adds the `fours`
;     rounding constant and arithmetic-shifts right by 3;
;   * broadcasts the first result over word lanes 0-3 and the second over
;     word lanes 4-7, so each 8-pixel row gets the first value on bytes 0-3
;     and the second value on bytes 4-7;
;   * adds that to 4 rows of 8 bytes read from dst, saturates to unsigned
;     bytes (packuswb) and writes the rows back to dst;
;   * zeroes 4 bytes at qcoeff and 4 bytes at qcoeff + 32.
;
; The numbers after each parameter above are the arg() indices.
; dst_stride is sign-extended from 32 bits and is the byte distance between
; consecutive dst rows.
;
; Registers written: rax, rcx, rdx, xmm0-xmm5.
; sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS and
; GET_GOT / RESTORE_GOT come from the included vpx_ports/x86_abi_support.asm
; and are not visible in this file; `fours` is defined outside this view
; (presumably a vector of 16-bit 4s -- confirm at its definition).
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan
22233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
23233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_idct_dequant_0_2x_sse2):
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 4
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,            arg(1) ; dequant
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rax,            arg(0) ; qcoeff
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm4,           [rax]       ; lanes 0-1 = qcoeff words 0 and 1
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm5,           [rdx]       ; lanes 0-1 = dequant words 0 and 1
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pinsrw      xmm4,           [rax+32],   4   ; lane 4 = word at qcoeff + 32 bytes
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pinsrw      xmm5,           [rdx],      4   ; lane 4 = dequant word 0 again
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm4,           xmm5        ; lane 0 / lane 4 = dequantized values (low 16 bits)
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Zero out xmm5, for use unpacking
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm5,           xmm5
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; clear coeffs (each movd stores 4 zero bytes, i.e. two words)
45233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rax],          xmm5
46233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rax+32],       xmm5
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan;pshufb  -- NOTE(review): leftover note; no pshufb is used here, the broadcast below uses pshuflw/pshufhw
48233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rax,            arg(2) ; dst
49233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rdx,            dword ptr arg(3) ; dst_stride
50233d2500723e5594f3e7c70896ffeeef32b9c950ywan
51233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshuflw     xmm4,           xmm4,       00000000b   ; copy lane 0 into lanes 0-3
52233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufhw     xmm4,           xmm4,       00000000b   ; copy lane 4 into lanes 4-7
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan
54233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,            [rdx + rdx*2]   ; rcx = 3 * dst_stride (offset of row 3)
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,           [GLOBAL(fours)] ; rounding term ahead of the >> 3
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan
57233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm4,           3           ; arithmetic shift keeps the sign
58233d2500723e5594f3e7c70896ffeeef32b9c950ywan
59233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm0,           [rax]       ; 8 bytes of dst row 0
60233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm1,           [rax+rdx]   ; row 1
61233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm2,           [rax+2*rdx] ; row 2
62233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm3,           [rax+rcx]   ; row 3
63233d2500723e5594f3e7c70896ffeeef32b9c950ywan
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,           xmm5        ; widen bytes to words (xmm5 is zero)
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm1,           xmm5
66233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm2,           xmm5
67233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm3,           xmm5
68233d2500723e5594f3e7c70896ffeeef32b9c950ywan
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Add to predict buffer
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,           xmm4
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm1,           xmm4
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm4
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm4
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; pack up before storing (packuswb saturates each word to 0..255)
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm0,           xmm5
78233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm1,           xmm5
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm2,           xmm5
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm3,           xmm5
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; store blocks back out
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rax],          xmm0
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rax + rdx],    xmm1
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rax,            [rax + 2*rdx]   ; advance dst pointer to row 2
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rax],          xmm2
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rax + rdx],    xmm3
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
93233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_idct_dequant_full_2x_sse2
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan; (
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *qcoeff       - 0
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *dequant      - 1
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   unsigned char *dst  - 2
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   int dst_stride      - 3
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan; )
;
; Full dequantize + two-pass inverse transform for two 4x4 blocks at once,
; added onto dst.
;
; What the code below does:
;   * loads 64 bytes from qcoeff (16 words for the first block followed by
;     16 words for the second) and then zeroes those 64 bytes;
;   * multiplies both blocks by the same 16 words at dequant;
;   * interleaves the two blocks so each xmm register holds one row of the
;     first block in word lanes 0-3 and the same row of the second block in
;     lanes 4-7;
;   * runs the butterfly pass, transposes, runs it again with the `fours`
;     rounding term and an arithmetic >> 3, then transposes back;
;   * adds the result to 4 rows of 8 bytes read from dst, saturates to
;     unsigned bytes (packuswb) and stores the rows back.
;
; The numbers after each parameter above are the arg() indices.
; qcoeff is accessed with movdqa and dequant is a pmullw memory operand, so
; both addresses must be 16-byte aligned.
; dst_stride is sign-extended from 32 bits and is the byte distance between
; consecutive dst rows.
;
; Registers written: rax, rcx, rdx, rdi, xmm0-xmm7 (rdi and rsi are pushed and
; popped here; SAVE_XMM 7 / RESTORE_XMM presumably preserve xmm6-xmm7 where
; the target ABI requires it -- those macros live in the included
; vpx_ports/x86_abi_support.asm and are not visible in this file).
; x_s1sqr2, x_c1sqr2less1 and fours are constants defined outside this view.
104233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
105233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_idct_dequant_full_2x_sse2):
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 4
109233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
110233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
111233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi                         ; saved/restored, but not used in the body
112233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
114233d2500723e5594f3e7c70896ffeeef32b9c950ywan
115233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; full path: every coefficient of both blocks is loaded from qcoeff,
116233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; dequantized and run through both transform passes below
117233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rax,            arg(0) ; qcoeff
118233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,            arg(1)  ; dequant
119233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdi,            arg(2) ; dst
120233d2500723e5594f3e7c70896ffeeef32b9c950ywan
121233d2500723e5594f3e7c70896ffeeef32b9c950ywan
122233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Zero out xmm7, for use unpacking
123233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7,           xmm7
124233d2500723e5594f3e7c70896ffeeef32b9c950ywan
125233d2500723e5594f3e7c70896ffeeef32b9c950ywan
126233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; note the transpose of xmm1 and xmm2, necessary for shuffle
127233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;   to spit out sensible data
128233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,           [rax]       ; first block,  words 0-7
129233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,           [rax+16]    ; first block,  words 8-15
130233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,           [rax+32]    ; second block, words 0-7
131233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,           [rax+48]    ; second block, words 8-15
132233d2500723e5594f3e7c70896ffeeef32b9c950ywan
133233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Clear out coeffs (xmm7 is zero; all 64 bytes just loaded are zeroed)
134233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax],          xmm7
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax+16],       xmm7
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax+32],       xmm7
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax+48],       xmm7
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; dequantize qcoeff buffer (the same 16 dequant words are used for both blocks)
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm0,           [rdx]
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm2,           [rdx+16]
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm1,           [rdx]
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm3,           [rdx+16]
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rdx,            dword ptr arg(3) ; dst_stride
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; repack so block 0 row x and block 1 row x are together
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm0
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm0,           xmm1
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm4,           xmm1
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm0,           xmm0,       11011000b   ; dword order 0,2,1,3 -> row 0 of both blocks
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm1,           xmm4,       11011000b   ; row 1 of both blocks
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm2
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm3
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm4,           xmm3
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm2,           xmm2,       11011000b   ; row 2 of both blocks
159233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm3,           xmm4,       11011000b   ; row 3 of both blocks
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; first pass
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm2        ; b1 = 0-2
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm2        ; 2 * row 2, so that adding b1 yields 0+2
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm0        ; a1 = 0+2
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm3
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm7,           xmm5        ; c1
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm3        ; keep ip3, xmm3 is overwritten by pmulhw below
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm4
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm5        ; d1
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm2        ; a1
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm0        ; b1
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm3        ;0   xmm2 = a1 + d1
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,           xmm7        ;1   xmm4 = b1 + c1
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm7        ;2   xmm0 = b1 - c1
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm6,           xmm3        ;3   xmm6 = a1 - d1
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose for the second pass
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
211233d2500723e5594f3e7c70896ffeeef32b9c950ywan
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm0,           xmm2,       11011000b   ; dword order 0,2,1,3: block 0 in lanes 0-3, block 1 in lanes 4-7
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm2,           xmm1,       11011000b
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm1,           xmm5,       11011000b
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm3,           xmm7,       11011000b
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; second pass (same butterfly as the first pass, plus rounding and >> 3)
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm2            ; b1 = 0-2
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm2
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm0            ; a1 = 0+2
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm3
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm7,           xmm5            ; c1
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm3
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm4
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm5            ; d1
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,           [GLOBAL(fours)] ; rounding added to b1 (feeds outputs 1 and 2)
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           [GLOBAL(fours)] ; rounding added to a1 (feeds outputs 0 and 3)
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm2            ; a1
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm0            ; b1
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm3            ;0
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,           xmm7            ;1
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm7            ;2
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm6,           xmm3            ;3
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm2,           3               ; arithmetic >> 3 on all four outputs
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm0,           3
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm4,           3
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm6,           3
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose to save
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm0,           xmm2,       11011000b   ; xmm0..xmm3 end up in dst row order 0..3
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm2,           xmm1,       11011000b
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm1,           xmm5,       11011000b
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm3,           xmm7,       11011000b
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7,           xmm7        ; xmm7 was used as scratch above; re-zero it for unpack/pack
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Load up predict blocks (8 bytes per dst row, widened to words, then added)
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm4,           [rdi]
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,           [rdi+rdx]
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,           xmm7
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,           xmm7
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,           xmm4
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm1,           xmm5
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm4,           [rdi+2*rdx]
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,           [rdi+rcx]   ; rcx = 3 * dst_stride, set during the first pass
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,           xmm7
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,           xmm7
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm4
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm5
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; NOTE(review): nothing in this function branches to .finish; it is only
; reached by fall-through.
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan.finish:
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; pack up before storing (packuswb saturates each word to 0..255)
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm0,           xmm7
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm1,           xmm7
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm2,           xmm7
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm3,           xmm7
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; store blocks back out
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi],          xmm0
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rdx],    xmm1
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rdx*2],  xmm2
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rcx],    xmm3
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog (pops mirror the prolog pushes in reverse order)
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rdi
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rsi
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_idct_dequant_dc_0_2x_sse2
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan; (
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *qcoeff       - 0
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *dequant      - 1
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   unsigned char *dst  - 2
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   int dst_stride      - 3
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *dc           - 4
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan; )
361233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
362233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_idct_dequant_dc_0_2x_sse2):
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 5
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; special case when 2 blocks have 0 or 1 coeffs
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; dc is set as first coeff, so no need to load qcoeff
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rax,            arg(0) ; qcoeff
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdi,            arg(2) ; dst
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,            arg(4) ; dc
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Zero out xmm5, for use unpacking
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm5,           xmm5
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; load up 2 dc words here == 2*16 = doubleword
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm4,           [rdx]
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rdx,            dword ptr arg(3) ; dst_stride
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx, [rdx + rdx*2]
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Load up predict blocks
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm0,           [rdi]
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm1,           [rdi+rdx*1]
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm2,           [rdi+rdx*2]
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm3,           [rdi+rcx]
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Duplicate and expand dc across
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm4,           xmm4
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm4,           xmm4
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Rounding to dequant and downshift
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,           [GLOBAL(fours)]
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm4,           3
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Predict buffer needs to be expanded from bytes to words
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,           xmm5
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm1,           xmm5
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm2,           xmm5
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm3,           xmm5
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Add to predict buffer
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,           xmm4
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm1,           xmm4
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm4
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm4
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; pack up before storing
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm0,           xmm5
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm1,           xmm5
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm2,           xmm5
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm3,           xmm5
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; store blocks back out
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi],          xmm0
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rdx],    xmm1
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rdx*2],  xmm2
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rcx],    xmm3
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rdi
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_idct_dequant_dc_full_2x_sse2
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan; (
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *qcoeff       - 0   32 coefficients (two 4x4 blocks); zeroed by this routine
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *dequant      - 1   16 dequant factors, applied to both blocks
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   unsigned char *dst  - 2   8x4 pixels, read as the prediction and overwritten
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   int dst_stride      - 3   byte distance between consecutive dst rows
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan;   short *dc           - 4   dc[0] / dc[1] = first coefficient of block 0 / block 1
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan; )
;
; Dequantizes two adjacent 4x4 coefficient blocks, replaces each block's
; first coefficient with the matching dc[] entry, runs both passes of the
; 4x4 inverse transform on the two blocks side by side (block 0 in the low
; 64 bits of each xmm register, block 1 in the high 64 bits), adds the
; result to the 8x4 pixel area at dst and stores it back with unsigned
; saturation.
; qcoeff and dequant are accessed with aligned 128-bit operations
; (movdqa / pmullw with a memory operand), so both must be 16-byte aligned.
437233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
438233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_idct_dequant_dc_full_2x_sse2):
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 5
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; full path: every coefficient of both blocks is loaded and dequantized;
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only the two DC terms are taken from arg(4) instead of qcoeff
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rax,            arg(0) ; qcoeff
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,            arg(1)  ; dequant
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdi,            arg(2) ; dst
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Zero out xmm7, used below to clear the coeff buffer
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7,           xmm7
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; note the swap of xmm1 and xmm2 ([rax+16] -> xmm2, [rax+32] -> xmm1),
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;   necessary for the repack below to spit out sensible data
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,           [rax]           ; block 0, rows 0-1
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,           [rax+16]        ; block 0, rows 2-3
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,           [rax+32]        ; block 1, rows 0-1
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,           [rax+48]        ; block 1, rows 2-3
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Clear out coeffs (all 64 bytes of qcoeff become zero)
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax],          xmm7
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax+16],       xmm7
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax+32],       xmm7
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rax+48],       xmm7
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; dequantize qcoeff buffer (the same 16 factors serve both blocks)
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm0,           [rdx]
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm2,           [rdx+16]
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm1,           [rdx]
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw      xmm3,           [rdx+16]
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; DC component
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,            arg(4)          ; dc (dequant pointer is no longer needed)
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; repack so block 0 row x and block 1 row x are together
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm0
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm0,           xmm1
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm4,           xmm1
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm0,           xmm0,       11011000b   ; dword order 0,2,1,3 -> row 0 of block 0 | block 1
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm1,           xmm4,       11011000b   ; row 1 of block 0 | block 1
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm2
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm3
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm4,           xmm3
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm2,           xmm2,       11011000b   ; row 2 of block 0 | block 1
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm3,           xmm4,       11011000b   ; row 3 of block 0 | block 1
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; insert DC component (overwrites the dequantized first coefficient)
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pinsrw      xmm0,           [rdx],      0           ; word 0 = block 0 DC
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pinsrw      xmm0,           [rdx+2],    4           ; word 4 = block 1 DC
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; first pass
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm2        ; b1 = 0-2
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm2        ; xmm2 = 2 * ip2
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm0        ; a1 = 0+2
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; x_s1sqr2 (0x8A8C) is negative when read as a signed word, so pmulhw
    ; yields (x*k >> 16) - x; the paddw of x that follows restores x*k >> 16.
    ; x_c1sqr2less1 is positive; there the paddw supplies the "+1" part.
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm3
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm7,           xmm5        ; c1 (as computed: ip3 term - ip1 term)
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm3
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm5        ; d1
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm2        ; a1
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm0        ; b1
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm3        ;0  (a1 + d1)
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; NOTE(review): the transpose comments below number xmm0 as 004..007 and
    ; xmm4 as 008..011, so the ;1 / ;2 tags here look swapped relative to
    ; that numbering - confirm which labelling is intended.
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,           xmm7        ;1  (b1 + xmm7)
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm7        ;2  (b1 - xmm7)
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm6,           xmm3        ;3  (a1 - d1)
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose for the second pass
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm0,           xmm2,       11011000b   ; 112 108 104 100 012 008 004 000
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm2,           xmm1,       11011000b   ; 114 110 106 102 014 010 006 002
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm1,           xmm5,       11011000b   ; 113 109 105 101 013 009 005 001
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm3,           xmm7,       11011000b   ; 115 111 107 103 015 011 007 003
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; second pass (same butterfly as the first pass, on the transposed data)
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm2            ; b1 = 0-2
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm2
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm0            ; a1 = 0+2
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm3
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm7,           xmm5            ; c1
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm1
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm3
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,           xmm1
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm4
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm5            ; d1
    ; The +4 rounding bias is added to a1 and b1 only: each of the four
    ; outputs below is a1 or b1 plus/minus one other term, so every output
    ; receives the bias exactly once before the >> 3.
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,           [GLOBAL(fours)]
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           [GLOBAL(fours)]
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm2            ; a1
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,           xmm0            ; b1
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm3            ;0
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,           xmm7            ;1
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm0,           xmm7            ;2
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw       xmm6,           xmm3            ;3
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm2,           3
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm0,           3
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm4,           3
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm6,           3
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose to save
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
626233d2500723e5594f3e7c70896ffeeef32b9c950ywan
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm0,           xmm2,       11011000b   ; residual for dst row 0 (8 words)
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm2,           xmm1,       11011000b   ; residual for dst row 2
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm1,           xmm5,       11011000b   ; residual for dst row 1
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pshufd      xmm3,           xmm7,       11011000b   ; residual for dst row 3
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7,           xmm7                    ; xmm7 was scratch above; re-zero it for unpack/pack
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Load up predict blocks (8 bytes per row, read from dst itself)
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rdx,            dword ptr arg(3) ; dst_stride
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm4,           [rdi]
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,           [rdi+rdx]
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,            [rdx + rdx*2]           ; rcx = 3 * dst_stride
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,           xmm7                    ; widen bytes to words
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,           xmm7
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,           xmm4                    ; row 0: prediction + residual
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm1,           xmm5                    ; row 1
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm4,           [rdi+rdx*2]
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,           [rdi+rcx]
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,           xmm7
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,           xmm7
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,           xmm4                    ; row 2
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm3,           xmm5                    ; row 3
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan.finish:                                                    ; no jump inside this function targets this label
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; pack up before storing (words -> bytes with unsigned saturation)
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm0,           xmm7
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm1,           xmm7
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm2,           xmm7
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb    xmm3,           xmm7
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Load destination stride before writing out,
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;   doesn't need to persist (rdx still holds it from the load above,
    ;   so this reload is redundant but harmless)
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rdx,            dword ptr arg(3) ; dst_stride
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; store blocks back out
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi],          xmm0
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rdx],    xmm1
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,            [rdi + 2*rdx]           ; advance dst by two rows
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi],          xmm2
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rdi + rdx],    xmm3
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog (undoes the prolog in reverse order)
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rdi
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan
699233d2500723e5594f3e7c70896ffeeef32b9c950ywanSECTION_RODATA
; Read-only word vectors (8 x 16-bit lanes each) used as 128-bit memory
; operands by the code above; each one is kept on a 16-byte boundary.
700233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
; Rounding bias: 4 in every lane, added before the arithmetic shift right by 3.
701233d2500723e5594f3e7c70896ffeeef32b9c950ywanfours:
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 8 dw 0x0004
703233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
; 0x8A8C = 35468, i.e. sin(pi/8)*sqrt(2) scaled by 65536. The value is
; negative as a signed word, so users follow pmulhw with a paddw of the
; multiplicand to obtain the intended product.
704233d2500723e5594f3e7c70896ffeeef32b9c950ywanx_s1sqr2:
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 8 dw 0x8A8C
706233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
; 0x4E7B = 20091, i.e. (cos(pi/8)*sqrt(2) - 1) scaled by 65536; the "+1"
; is supplied by the paddw of the multiplicand that follows each pmulhw.
707233d2500723e5594f3e7c70896ffeeef32b9c950ywanx_c1sqr2less1:
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 8 dw 0x4E7B
709