;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;-----------------------------------------------------------------------
; void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Dequantizes 16 coefficients, four 16-bit words at a time:
;     dq[i] = low 16 bits of (sq[i] * q[i])      for i = 0..15
;
;   arg(0) sq : quantized coefficients      (32 bytes read)
;   arg(1) dq : dequantized coefficients    (32 bytes written)
;   arg(2) q  : dequantization factors      (32 bytes read)
;
; Registers: rax, mm1 are overwritten; rsi and rdi are used but saved
; and restored. No emms is issued here, so MMX state is live on return.
; sym(), arg() and SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS are macros from
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_dequantize_b_impl_mmx)
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

        mov       rsi, arg(0) ;sq
        mov       rdi, arg(1) ;dq
        mov       rax, arg(2) ;q

        movq      mm1, [rsi]
        pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]
        movq      [rdi], mm1              ; dq[0..3]

        movq      mm1, [rsi+8]
        pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]
        movq      [rdi+8], mm1            ; dq[4..7]

        movq      mm1, [rsi+16]
        pmullw    mm1, [rax+16]           ; mm1 = sq[8..11] * q[8..11]
        movq      [rdi+16], mm1           ; dq[8..11]

        movq      mm1, [rsi+24]
        pmullw    mm1, [rax+24]           ; mm1 = sq[12..15] * q[12..15]
        movq      [rdi+24], mm1           ; dq[12..15]

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; void vp8_dequant_idct_add_mmx(short *input, short *dq,
;                               unsigned char *pred, unsigned char *dest,
;                               int pitch, int stride)
;
; For one 4x4 block:
;   1. dequantize: coeff[i] = low 16 bits of (input[i] * dq[i]), i = 0..15
;   2. clear the 16 words at input
;   3. run a two-pass 4x4 inverse transform on the coefficients
;      (second pass adds 4 and arithmetic-shifts right by 3)
;   4. add the result to the 4x4 predictor, saturate to 0..255 and store
;      4 bytes per row to dest
;
;   arg(0) input  : 16 quantized coefficients, read and then zeroed
;   arg(1) dq     : 16 dequantization factors
;   arg(2) pred   : predictor pixels, rows are `pitch` bytes apart
;   arg(3) dest   : output pixels, rows are `stride` bytes apart
;   arg(4) pitch  : sign-extended from 32 bits
;   arg(5) stride : sign-extended from 32 bits
;
; Registers: rax, rdx, mm0-mm7 are overwritten; rsi and rdi are used but
; saved and restored. No emms is issued here.
; GET_GOT / RESTORE_GOT / GLOBAL() come from vpx_ports/x86_abi_support.asm
; and are used for addressing the constants x_s1sqr2, x_c1sqr2less1, fours.
;
; Butterfly (applied lane-wise to four input vectors ip0..ip3):
;     a1 = ip0 + ip2                  b1 = ip0 - ip2
;     t  = ip3*K_cos - ip1*K_sin      d1 = ip1*K_cos + ip3*K_sin
;     out0 = a1 + d1   out1 = b1 - t   out2 = b1 + t   out3 = a1 - d1
; where x*K_sin = pmulhw(x, x_s1sqr2) + x      (K_sin = 0x8A8C / 65536)
;       x*K_cos = pmulhw(x, x_c1sqr2less1) + x (K_cos = 1 + 0x4E7B / 65536)
;
; Transpose comments list the four words of a register from high to low
; as "vl": lane l of butterfly output vector v.
;-----------------------------------------------------------------------
global sym(vp8_dequant_idct_add_mmx)
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq

        ; dequantize: mmN = words 4N..4N+3 of input, times matching dq words
        movq        mm0,    [rax   ]
        pmullw      mm0,    [rdx]

        movq        mm1,    [rax +8]
        pmullw      mm1,    [rdx +8]

        movq        mm2,    [rax+16]
        pmullw      mm2,    [rdx+16]

        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]

        mov         rdx,    arg(3) ;dest
        mov         rsi,    arg(2) ;pred
        pxor        mm7,    mm7

        ; zero the 16 input coefficients now that they are in registers
        movq        [rax],   mm7
        movq        [rax+8], mm7

        movq        [rax+16],mm7
        movq        [rax+24],mm7


        movsxd      rax,            dword ptr arg(4) ;pitch
        movsxd      rdi,            dword ptr arg(5) ;stride

        ; ---- first pass: ip0..ip3 = mm0..mm3 ----
        psubw       mm0,            mm2             ; b1 = ip0 - ip2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = ip0 + ip2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; t = ip3*K_cos - ip1*K_sin

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep ip3 for the add below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; out0 = a1 + d1

        paddw       mm4,            mm7             ; out2 = b1 + t
        psubw       mm0,            mm7             ; out1 = b1 - t

        psubw       mm6,            mm3             ; out3 = a1 - d1

        ; ---- transpose: out0..out3 = mm2, mm0, mm4, mm6 ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        movq        mm3,            mm5             ; 33 23 13 03

        ; ---- second pass: ip0..ip3 = mm0..mm3 (transposed data) ----
        psubw       mm0,            mm2             ; b1 = ip0 - ip2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = ip0 + ip2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; t = ip3*K_cos - ip1*K_sin

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep ip3 for the add below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        paddw       mm0,            [GLOBAL(fours)] ; b1 + 4 (rounding for >>3)

        paddw       mm2,            [GLOBAL(fours)] ; a1 + 4 (rounding for >>3)
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; out0 = a1 + d1

        paddw       mm4,            mm7             ; out2 = b1 + t
        psubw       mm0,            mm7             ; out1 = b1 - t

        psubw       mm6,            mm3             ; out3 = a1 - d1
        psraw       mm2,            3

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose back: out0..out3 = mm2, mm0, mm4, mm6 ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        ; ---- add predictor and store: rows are in mm0, mm1, mm2, mm5 ----
        pxor        mm7,            mm7             ; zero, for byte<->word conversion

        movd        mm4,            [rsi]           ; 4 predictor bytes, row 0
        punpcklbw   mm4,            mm7             ; widen to words
        paddsw      mm0,            mm4             ; residual + predictor (signed saturate)
        packuswb    mm0,            mm7             ; clamp to 0..255 bytes
        movd        [rdx],          mm0             ; dest row 0

        movd        mm4,            [rsi+rax]       ; predictor row 1
        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1             ; dest row 1

        movd        mm4,            [rsi+2*rax]     ; predictor row 2
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2             ; dest row 2

        ; advance both pointers one row so [base + 2*step] addresses row 3
        add         rdx,            rdi
        add         rsi,            rax

        movd        mm4,            [rsi+2*rax]     ; predictor row 3
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5             ; dest row 3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; void vp8_dequant_dc_idct_add_mmx(short *input, short *dq,
;                                  unsigned char *pred, unsigned char *dest,
;                                  int pitch, int stride, int Dc)
;
; For one 4x4 block:
;   1. dequantize: coeff[i] = low 16 bits of (input[i] * dq[i]), i = 0..15
;   2. clear the 16 words at input
;   3. replace coeff[0] with the low 16 bits of Dc
;   4. run a two-pass 4x4 inverse transform on the coefficients
;      (second pass adds 4 and arithmetic-shifts right by 3)
;   5. add the result to the 4x4 predictor, saturate to 0..255 and store
;      4 bytes per row to dest
;
;   arg(0) input  : 16 quantized coefficients, read and then zeroed
;   arg(1) dq     : 16 dequantization factors
;   arg(2) pred   : predictor pixels, rows are `pitch` bytes apart
;   arg(3) dest   : output pixels, rows are `stride` bytes apart
;   arg(4) pitch  : sign-extended from 32 bits
;   arg(5) stride : sign-extended from 32 bits
;   arg(6) Dc     : only the low 16 bits are used
;
; Registers: rax, rcx, rdx, mm0-mm7 are overwritten; rsi and rdi are used
; but saved and restored. No emms is issued here.
; GET_GOT / RESTORE_GOT / GLOBAL() come from vpx_ports/x86_abi_support.asm
; and are used for addressing the constants x_s1sqr2, x_c1sqr2less1, fours.
;
; Butterfly (applied lane-wise to four input vectors ip0..ip3):
;     a1 = ip0 + ip2                  b1 = ip0 - ip2
;     t  = ip3*K_cos - ip1*K_sin      d1 = ip1*K_cos + ip3*K_sin
;     out0 = a1 + d1   out1 = b1 - t   out2 = b1 + t   out3 = a1 - d1
; where x*K_sin = pmulhw(x, x_s1sqr2) + x      (K_sin = 0x8A8C / 65536)
;       x*K_cos = pmulhw(x, x_c1sqr2less1) + x (K_cos = 1 + 0x4E7B / 65536)
;
; Transpose comments list the four words of a register from high to low
; as "vl": lane l of butterfly output vector v.
;-----------------------------------------------------------------------
global sym(vp8_dequant_dc_idct_add_mmx)
sym(vp8_dequant_dc_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq

        ; dequantize: mmN = words 4N..4N+3 of input, times matching dq words
        movq        mm0,    [rax   ]
        pmullw      mm0,    [rdx]

        movq        mm1,    [rax +8]
        pmullw      mm1,    [rdx +8]

        movq        mm2,    [rax+16]
        pmullw      mm2,    [rdx+16]

        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]

        mov         rdx,    arg(3) ;dest
        mov         rsi,    arg(2) ;pred
        pxor        mm7,    mm7

        ; zero the 16 input coefficients now that they are in registers
        movq        [rax],   mm7
        movq        [rax+8], mm7

        movq        [rax+16],mm7
        movq        [rax+24],mm7

        ; replace the low word of mm0 (coefficient 0) with the low word of Dc:
        ; shift right then left by 16 to clear that word, then OR in Dc
        psrlq       mm0,    16
        movzx       rcx,    word ptr arg(6) ;Dc
        psllq       mm0,    16
        movq        mm7,    rcx
        por         mm0,    mm7

        movsxd      rax,            dword ptr arg(4) ;pitch
        movsxd      rdi,            dword ptr arg(5) ;stride

        ; ---- first pass: ip0..ip3 = mm0..mm3 ----
        psubw       mm0,            mm2             ; b1 = ip0 - ip2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = ip0 + ip2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; t = ip3*K_cos - ip1*K_sin

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep ip3 for the add below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; out0 = a1 + d1

        paddw       mm4,            mm7             ; out2 = b1 + t
        psubw       mm0,            mm7             ; out1 = b1 - t

        psubw       mm6,            mm3             ; out3 = a1 - d1

        ; ---- transpose: out0..out3 = mm2, mm0, mm4, mm6 ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        movq        mm3,            mm5             ; 33 23 13 03

        ; ---- second pass: ip0..ip3 = mm0..mm3 (transposed data) ----
        psubw       mm0,            mm2             ; b1 = ip0 - ip2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 = ip0 + ip2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; t = ip3*K_cos - ip1*K_sin

        movq        mm5,            mm1
        movq        mm4,            mm3             ; keep ip3 for the add below

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        paddw       mm0,            [GLOBAL(fours)] ; b1 + 4 (rounding for >>3)

        paddw       mm2,            [GLOBAL(fours)] ; a1 + 4 (rounding for >>3)
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; out0 = a1 + d1

        paddw       mm4,            mm7             ; out2 = b1 + t
        psubw       mm0,            mm7             ; out1 = b1 - t

        psubw       mm6,            mm3             ; out3 = a1 - d1
        psraw       mm2,            3

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose back: out0..out3 = mm2, mm0, mm4, mm6 ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        ; ---- add predictor and store: rows are in mm0, mm1, mm2, mm5 ----
        pxor        mm7,            mm7             ; zero, for byte<->word conversion

        movd        mm4,            [rsi]           ; 4 predictor bytes, row 0
        punpcklbw   mm4,            mm7             ; widen to words
        paddsw      mm0,            mm4             ; residual + predictor (signed saturate)
        packuswb    mm0,            mm7             ; clamp to 0..255 bytes
        movd        [rdx],          mm0             ; dest row 0

        movd        mm4,            [rsi+rax]       ; predictor row 1
        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1             ; dest row 1

        movd        mm4,            [rsi+2*rax]     ; predictor row 2
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2             ; dest row 2

        ; advance both pointers one row so [base + 2*step] addresses row 3
        add         rdx,            rdi
        add         rsi,            rax

        movd        mm4,            [rsi+2*rax]     ; predictor row 3
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5             ; dest row 3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


; Constants for the inverse transform. Each is replicated across the four
; word lanes of an MMX register.
SECTION_RODATA
align 16
; sin(pi/8) * sqrt(2) in 0.16 fixed point (35468 / 65536 ~= 0.5412).
; As a signed word this is 35468 - 65536, so the code computes
; pmulhw(x, x_s1sqr2) + x to obtain x * 0.5412.
x_s1sqr2:
    times 4 dw 0x8A8C
align 16
; cos(pi/8) * sqrt(2) - 1 in 0.16 fixed point (20091 / 65536 ~= 0.3066).
; The code computes pmulhw(x, x_c1sqr2less1) + x to obtain x * 1.3066.
x_c1sqr2less1:
    times 4 dw 0x4E7B
align 16
; Rounding term added before the arithmetic shift right by 3.
fours:
    times 4 dw 0x0004
