;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Shared prologue for the entry points in this file, which all have the
; C signature
;     void fn(short *input, short *output, int pitch)
;
; Binds the assembler-level names `input`, `output` and `pitch` to the
; registers that carry the three arguments:
;
;     build                          input   output   pitch
;     ABI_IS_32BIT                   rsi     rdi      rax
;     64-bit, output format x64      rcx     rdx      r8
;     64-bit, any other format       rdi     rsi      rdx
;
; 32-bit builds: sets up an rbp frame, invokes GET_GOT rbx, saves
; rsi/rdi and loads the arguments through arg(n).
; 64-bit builds: the arguments are already in registers; only `pitch`
; is widened (see below). Nothing is pushed.
;
; Must be paired with STACK_FRAME_DESTROY, which undoes the pushes.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output

    movsxd      rax, dword ptr arg(2)       ; pitch
%else
  ; `pitch` is a C int, so only the low 32 bits of its register are
  ; defined by the calling convention. It is used as a full-width index
  ; in address calculations, so widen it first, as the 32-bit path does.
  %ifidn __OUTPUT_FORMAT__,x64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    movsxd      r8, r8d                     ; pitch = (int64)(int32)pitch
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      rdx, edx                    ; pitch = (int64)(int32)pitch
  %endif
%endif
%endmacro
43
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Epilogue that pairs with STACK_FRAME_CREATE. Restores whatever the
; prologue pushed, blanks the `input` / `output` / `pitch` aliases so
; they cannot leak into later code, and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
%if ABI_IS_32BIT
    ; pop in the reverse order of the prologue's pushes
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%endif
    ; 64-bit builds pushed nothing in the prologue: nothing to undo

  ; the argument aliases are only meaningful inside a function body
  %define     input
  %define     output
  %define     pitch

    ret
%endmacro
60
;-----------------------------------------------------------------------
; void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass forward transform of one 4x4 block of 16-bit samples.
;   input  - four rows of four words, read with 8-byte loads; `pitch`
;            is added directly to the row address, i.e. it is a byte
;            stride
;   output - 16 words, written with two movdqa stores, so the buffer
;            has to be 16-byte aligned
; Registers: xmm0-xmm5; `input` is advanced by two rows.
; Frame handling and argument registers come from STACK_FRAME_CREATE /
; STACK_FRAME_DESTROY.
;
; Lane diagrams list the eight words from lane 7 down to lane 0.
; "rc" is the sample in row r, column c.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    ; ---- fetch the four rows -----------------------------------------
    movq        xmm0, MMWORD PTR[input        ] ; row 0:  03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ; row 1:  13 12 11 10
    lea         input,          [input+2*pitch] ; step down two rows
    movq        xmm1, MMWORD PTR[input        ] ; row 2:  23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ; row 3:  33 32 31 30

    punpcklqdq  xmm0, xmm2                      ; 13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ; 33 32 31 30 23 22 21 20

    ; ---- pair column 0/1 of every row against column 3/2 --------------
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ; 23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ; 33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    pshufhw     xmm1, xmm1, 0xb1                ; 22 23 02 03 xx xx xx xx
    punpckldq   xmm0, xmm2                      ; 31 30 21 20 11 10 01 00
    pshufhw     xmm2, xmm2, 0xb1                ; 32 33 12 13 xx xx xx xx
    punpckhdq   xmm1, xmm2                      ; 32 33 22 23 12 13 02 03

    ; ---- pass 1 butterflies, one (a1,b1) / (d1,c1) pair per row -------
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ; b1 a1 b1 a1 b1 a1 b1 a1
    psllw       xmm0, 3                         ; a1, b1 scaled by 8
    psubw       xmm3, xmm1                      ; c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm3, 3                         ; c1, d1 scaled by 8

    movdqa      xmm1, xmm0                      ; second copy of (a1,b1)
    movdqa      xmm4, xmm3                      ; second copy of (d1,c1)

    ; op[0] / op[2]: pmaddwd against +1/+1 and +1/-1 forms the sum and
    ; the difference of each word pair as a dword
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ; a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ; a1 - b1
    packssdw    xmm0, xmm1                      ; lanes 7-4: op[2], 3-0: op[0]

    ; op[1] = (c1*2217 + d1*5352 + 14500) >> 12
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]
    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    psrad       xmm3, 12

    ; op[3] = (d1*2217 - c1*5352 + 7500) >> 12
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm4, 12

    packssdw    xmm3, xmm4                      ; lanes 7-4: op[3], 3-0: op[1]

    ; ---- transpose the intermediate back into row order ---------------
    ; Here each quadword holds one output column for all four rows.
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ; column 1 | column 0
    punpckhqdq  xmm2, xmm3                      ; column 3 | column 2

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ; 32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ; 33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ; 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ; 33 32 31 30 23 22 21 20

    ; ---- pass 2 butterflies, one per column ---------------------------
    pshufd      xmm2, xmm2, 0x4e                ; swap halves: row 2 | row 3
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ; b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ; c1 c1 c1 c1 d1 d1 d1 d1

    ; (d1 != 0) as 0/1 words in lanes 3-0, zero in lanes 7-4
    movdqa      xmm2, xmm3                      ; keep d1 before it is shuffled
    pxor        xmm4, xmm4
    pcmpeqw     xmm2, xmm4                      ; all-ones where the word is 0
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ; invert, keep bit 0 of
                                                     ; the low four lanes only

    ; interleave a1 with b1 per column
    pshufd      xmm0, xmm0, 0xd8                ; b1 b1 a1 a1 b1 b1 a1 a1
    pshuflw     xmm0, xmm0, 0xd8                ; b1 b1 a1 a1 b1 a1 b1 a1
    pshufhw     xmm0, xmm0, 0xd8                ; b1 a1 b1 a1 b1 a1 b1 a1

    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ; a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ; a1 - b1
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ; rounding term
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    psrad       xmm0, 4                         ; (a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ; (a1 - b1 + 7)>>4
    packssdw    xmm0, xmm1                      ; op[8] op[0]

    ; interleave d1 with c1 per column
    pshufd      xmm3, xmm3, 0xd8                ; c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm3, xmm3, 0xd8                ; c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm3, xmm3, 0xd8                ; c1 d1 c1 d1 c1 d1 c1 d1

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ; c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
    packssdw    xmm3, xmm4                      ; op[12] op[4]
    paddw       xmm3, xmm2                      ; op[4] += (d1!=0)

    ; ---- assemble the output rows and store ---------------------------
    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm3                      ; op[4]  op[0]
    punpckhqdq  xmm1, xmm3                      ; op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY
165
;-----------------------------------------------------------------------
; void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Two-pass forward transform of two horizontally adjacent 4x4 blocks
; (an 8-wide, 4-high strip of 16-bit samples) processed side by side.
;   input  - four rows of eight words, read with movdqa, so every row
;            address must be 16-byte aligned; `pitch` is added directly
;            to the row address, i.e. it is a byte stride
;   output - 32 words written with four movdqa stores (16-byte aligned):
;            bytes 0-31 hold the left block, bytes 32-63 the right block
; Registers: xmm0-xmm6; `input` is advanced by two rows.
;
; xmm6 is callee-saved under the Microsoft x64 calling convention, so
; x64-format builds save it on entry and restore it before returning.
; Frame handling and argument registers otherwise come from
; STACK_FRAME_CREATE / STACK_FRAME_DESTROY.
;
; Lane diagrams list the eight words from lane 0 up to lane 7.
; "rc" is the sample in row r, column c (columns 4-7 = right block).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2)
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

%ifidn __OUTPUT_FORMAT__,x64
    ; Preserve the caller's xmm6. movdqu is used because rsp is not
    ; assumed to be 16-byte aligned at this point.
    sub         rsp,        16
    movdqu      XMMWORD PTR[rsp], xmm6
%endif

        ; read the input data
        movdqa      xmm0,       [input        ]     ; row 0
        movdqa      xmm2,       [input+  pitch]     ; row 1
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]     ; row 2
        movdqa      xmm3,       [input+  pitch]     ; row 3

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; Each register now holds one column of both blocks:
        ; xmm0 column 0 (left | right)
        ; xmm1 column 1
        ; xmm2 column 2
        ; xmm3 column 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,        3          ; all four terms are scaled by 8
        psllw       xmm4,        3

        psllw       xmm0,        3
        psllw       xmm1,        3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1 so pmaddwd can form both products per lane pair
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; d1 c1 pairs, left block
        punpckhwd   xmm5,       xmm4        ; d1 c1 pairs, right block

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; first pass done
        ; transpose for the second stage
        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17

        ; Each register now holds one row of both blocks:
        ; xmm0 row 0
        ; xmm4 row 1
        ; xmm1 row 2
        ; xmm3 row 3

        ; second stage
        movdqa      xmm5,       xmm0        ; row 0
        movdqa      xmm2,       xmm1        ; row 2

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; all-ones where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and keep bit 0:
                                                                    ; 1 where d1 != 0, else 0

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; d1 c1 pairs, left block
        punpckhwd   xmm5,       xmm4        ; d1 c1 pairs, right block

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; split the left/right halves and pair up the output rows
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1        ; left  block: op[0..3]  | op[4..7]
        punpckhqdq  xmm4,       xmm1        ; right block: op[0..3]  | op[4..7]

        punpcklqdq  xmm2,       xmm3        ; left  block: op[8..11] | op[12..15]
        punpckhqdq  xmm5,       xmm3        ; right block: op[8..11] | op[12..15]

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

%ifidn __OUTPUT_FORMAT__,x64
    ; hand xmm6 back to the caller and release the save slot
    movdqu      xmm6,       XMMWORD PTR[rsp]
    add         rsp,        16
%endif

    STACK_FRAME_DESTROY
371
SECTION_RODATA

; Constant vectors for the transforms in this file. Each one is a full
; 16-byte vector on a 16-byte boundary and is used directly as an SSE2
; memory operand.

; Word pairs (5352, 2217): pmaddwd against interleaved (d1, c1) words
; gives c1*2217 + d1*5352 per dword.
align 16
_5352_2217:
    times 4 dw 5352, 2217

; Word pairs (2217, -5352): pmaddwd against interleaved (d1, c1) words
; gives d1*2217 - c1*5352 per dword.
align 16
_2217_neg5352:
    times 4 dw 2217, -5352

; Word pairs (1, 1): pmaddwd against (a1, b1) gives a1 + b1.
align 16
_mult_add:
    dw 1, 1, 1, 1, 1, 1, 1, 1

; 4x4 "d1 != 0" mask: bit 0 set in the four low words, upper four clear.
align 16
_cmp_mask:
    dw 1, 1, 1, 1
    dw 0, 0, 0, 0

; 8x4 "d1 != 0" mask: bit 0 set in all eight words.
align 16
_cmp_mask8x4:
    dw 1, 1, 1, 1, 1, 1, 1, 1

; Word pairs (1, -1): pmaddwd against (a1, b1) gives a1 - b1.
align 16
_mult_sub:
    times 4 dw 1, -1

; Rounding term 7 as dwords (added before the >>4 in the 4x4 routine).
align 16
_7:
    dd 7, 7, 7, 7

; Rounding term 7 as words (added before the >>4 in the 8x4 routine).
align 16
_7w:
    dw 7, 7, 7, 7, 7, 7, 7, 7

; Dword rounding terms for the first pass (>>12) ...
align 16
_14500:
    dd 14500, 14500, 14500, 14500
align 16
_7500:
    dd 7500, 7500, 7500, 7500

; ... and for the second pass (>>16).
align 16
_12000:
    dd 12000, 12000, 12000, 12000
align 16
_51000:
    dd 51000, 51000, 51000, 51000
431