;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise 16-bit multiply of two 16-entry arrays:
;     dq[i] = (short)(sq[i] * q[i])   for i = 0..15
; pmullw keeps only the low 16 bits of each product.
;
; In:    arg(0) = sq  source coefficients        (32 bytes read)
;        arg(1) = dq  destination                (32 bytes written)
;        arg(2) = q   per-coefficient multiplier (32 bytes read)
; Uses:  rax, mm1 (rsi and rdi are saved and restored here).
; Note:  no emms is issued in this routine, so MMX state is left active
;        on return.
; The sym/arg/SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS macros are not defined in
; this file; presumably they come from vpx_ports/x86_abi_support.asm.
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

        mov       rsi, arg(0) ;sq
        mov       rdi, arg(1) ;dq
        mov       rax, arg(2) ;q

        ; Four words per step: load from sq, multiply by q, store to dq.
        movq      mm1, [rsi]
        pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]
        movq      [rdi], mm1

        movq      mm1, [rsi+8]
        pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]
        movq      [rdi+8], mm1

        movq      mm1, [rsi+16]
        pmullw    mm1, [rax+16]           ; mm1 = sq[8..11] * q[8..11]
        movq      [rdi+16], mm1

        movq      mm1, [rsi+24]
        pmullw    mm1, [rax+24]           ; mm1 = sq[12..15] * q[12..15]
        movq      [rdi+24], mm1

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_dequant_idct_add_mmx(
;short *input,            0
;short *dq,               1
;unsigned char *dest,     2
;int stride)              3
;
; Dequantizes a 4x4 block of 16-bit coefficients (input[i] * dq[i]), zeroes
; the 32-byte input buffer, runs a two-pass 4x4 inverse transform on the
; products, and adds the rounded result ((x + 4) >> 3) to the 4x4 block of
; bytes at dest (row pitch = stride), clamping each byte to 0..255.
;
; Fixed-point idiom used for both multipliers (pmulhw keeps the high 16 bits
; of a signed 16x16 product, i.e. (x * k) >> 16 with k taken as signed):
;   x_s1sqr2      = 0x8A8C is negative as a signed word (35468 - 65536), so
;                   pmulhw followed by "paddw x" yields x * 35468 / 65536
;                   (~ x * sqrt(2) * sin(pi/8)).
;   x_c1sqr2less1 = 0x4E7B = 20091, so pmulhw followed by "paddw x" yields
;                   x * (65536 + 20091) / 65536 (~ x * sqrt(2) * cos(pi/8)).
;
; Uses:  rax, rdx, mm0-mm7 (rdi is saved and restored here; rbx is handed to
;        GET_GOT/RESTORE_GOT).
; Note:  no emms is issued in this routine, so MMX state is left active
;        on return.
; The sym/arg/GET_GOT/GLOBAL/SHADOW_ARGS_TO_STACK macros are not defined in
; this file; presumably they come from vpx_ports/x86_abi_support.asm.
global sym(vp8_dequant_idct_add_mmx) PRIVATE
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq


        ; Dequantize: mmN = input row N * dq row N (four words per row,
        ; low 16 bits of each product).
        movq        mm0,    [rax   ]
        pmullw      mm0,    [rdx]

        movq        mm1,    [rax +8]
        pmullw      mm1,    [rdx +8]

        movq        mm2,    [rax+16]
        pmullw      mm2,    [rdx+16]

        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]

        mov         rdx,    arg(2) ;dest (rdx is reused; dq is no longer needed)

        pxor        mm7,    mm7


        ; Clear all 16 input coefficients now that they are in registers.
        movq        [rax],   mm7
        movq        [rax+8], mm7

        movq        [rax+16],mm7
        movq        [rax+24],mm7


        movsxd      rdi,            dword ptr arg(3) ;stride, sign-extended to pointer width

        ; ---- first pass: butterfly across rows ip0..ip3 = mm0..mm3,
        ;      four columns processed in parallel (one per word lane) ----
        psubw       mm0,            mm2             ; b1 = ip0 - ip2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = ip0 + ip2  (b1 + 2*ip2)

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 = ip3*cos.. - ip1*sin..

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep ip3 (mm3 is overwritten below)

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1 = ip1*cos.. + ip3*sin..
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; row 0 = a1 + d1

        ; With c1 as computed above, the transpose below consumes mm0 as
        ; row 1 and mm4 as row 2.
        paddw       mm4,            mm7             ; row 2 = b1 + c1
        psubw       mm0,            mm7             ; row 1 = b1 - c1

        psubw       mm6,            mm3             ; row 3 = a1 - d1

        ; ---- 4x4 word transpose of rows (mm2, mm0, mm4, mm6).
        ;      Labels are <row><column>, listed from high word to low word ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        movq        mm3,            mm5             ; 33 23 13 03

        ; ---- second pass: same butterfly on the transposed data
        ;      (ip0..ip3 = mm0..mm3), plus rounding and >> 3 ----
        psubw       mm0,            mm2             ; b1 = ip0 - ip2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = ip0 + ip2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 = ip3*cos.. - ip1*sin..

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep ip3

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        ; Fold the +4 rounding term into a1 and b1 so that all four outputs
        ; derived from them carry it before the arithmetic shift.
        paddw       mm0,            [GLOBAL(fours)] ; b1 + 4

        paddw       mm2,            [GLOBAL(fours)] ; a1 + 4
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; out 0 = a1 + d1

        paddw       mm4,            mm7             ; out 2 = b1 + c1
        psubw       mm0,            mm7             ; out 1 = b1 - c1

        psubw       mm6,            mm3             ; out 3 = a1 - d1
        psraw       mm2,            3

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose back (mm2, mm0, mm4, mm6); same label convention,
        ;      first digit = register position in that list, second = lane ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        ; ---- add to dest: mm0, mm1, mm2, mm5 go to dest rows 0..3 ----
        pxor        mm7,            mm7             ; zero, used to widen and to pack

        movd        mm4,            [rdx]           ; 4 dest bytes, row 0
        punpcklbw   mm4,            mm7             ; bytes -> words (zero-extended)
        paddsw      mm0,            mm4             ; signed saturating add
        packuswb    mm0,            mm7             ; words -> bytes, clamped to 0..255
        movd        [rdx],          mm0

        movd        mm4,            [rdx+rdi]       ; row 1
        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1

        movd        mm4,            [rdx+2*rdi]     ; row 2
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2

        add         rdx,            rdi             ; rdx = dest + stride

        movd        mm4,            [rdx+2*rdi]     ; row 3 = dest + 3*stride
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5

    ; begin epilog
    pop rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Constants for vp8_dequant_idct_add_mmx. Each is one 16-bit value replicated
; into all four word lanes of an MMX register.

; 0x8A8C = 35468 ~ sqrt(2) * sin(pi/8) * 65536. As a signed word this is
; 35468 - 65536, so users follow pmulhw with "paddw x" to get x * 35468 / 65536.
align 16
x_s1sqr2:
    times 4 dw 0x8A8C
; 0x4E7B = 20091 ~ (sqrt(2) * cos(pi/8) - 1) * 65536. Users follow pmulhw with
; "paddw x" to get x * (1 + 20091 / 65536).
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B
; Rounding term added before the final arithmetic shift right by 3.
align 16
fours:
    times 4 dw 0x0004
