;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI glue. The macros and switches used below but not defined in this file
; (sym, arg, GLOBAL, GET_GOT / RESTORE_GOT, SAVE_XMM / RESTORE_XMM,
; SECTION_RODATA, ABI_IS_32BIT, LIBVPX_YASM_WIN64, ...) are expected to be
; supplied by this include.
%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Common prologue for the transforms in this file. It binds the names
; input / output / pitch to the registers that hold the three C arguments
; (short *input, short *output, int pitch):
;
;   build                  input   output  pitch
;   ABI_IS_32BIT           rsi     rdi     rax     (loaded from arg(0..2))
;   LIBVPX_YASM_WIN64      rcx     rdx     r8
;   other 64-bit           rdi     rsi     rdx
;
; 32-bit builds additionally set up rbp, invoke GET_GOT rbx and save rsi/rdi;
; Win64 builds invoke SAVE_XMM 7, u. STACK_FRAME_DESTROY is the matching
; epilogue.
;
; pitch is declared 'int' but is used as a full-width index in address
; arithmetic, so every path sign-extends it first: in the 64-bit calling
; conventions the upper half of the argument register is not guaranteed to
; hold anything meaningful.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output

    movsxd      rax, dword ptr arg(2)       ; pitch
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u
    movsxd      r8, r8d                     ; widen 'int pitch' before indexing
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      rdx, edx                    ; widen 'int pitch' before indexing
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Common epilogue; counterpart of STACK_FRAME_CREATE. It rebinds
; input / output / pitch to empty text, undoes whatever the prologue set up
; for the active build, and returns to the caller.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  ; the operand names are only meaningful between CREATE and DESTROY
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    ; pops mirror the pushes of the 32-bit prologue, in reverse order
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM                     ; pairs with SAVE_XMM in the prologue
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------------
; void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Forward 4x4 transform of one block of 16-bit samples.
;
;   input  - four rows of four words; consecutive rows are 'pitch' bytes
;            apart. Rows are fetched with movq.
;   output - 16 words (32 bytes) stored with movdqa, so the destination
;            must be 16-byte aligned.
;
; Pass 1, for every row (i0..i3 = the four samples of the row):
;     a1 = (i0 + i3) << 3          b1 = (i1 + i2) << 3
;     c1 = (i1 - i2) << 3          d1 = (i0 - i3) << 3
;     t0 = a1 + b1                 t2 = a1 - b1
;     t1 = (c1*2217 + d1*5352 + 14500) >> 12
;     t3 = (d1*2217 - c1*5352 +  7500) >> 12
; Pass 2, for every column of t (row-major, 4 words per row):
;     a1 = t[0] + t[12]            b1 = t[4] + t[8]
;     c1 = t[4] - t[8]             d1 = t[0] - t[12]
;     op[0]  = (a1 + b1 + 7) >> 4
;     op[8]  = (a1 - b1 + 7) >> 4
;     op[4]  = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
;     op[12] = (d1*2217 - c1*5352 + 51000) >> 16
;
; 32-bit intermediates are narrowed with packssdw (signed saturation).
; XMM registers written: xmm0-xmm5.
; Register-layout comments list words from most to least significant.
;-----------------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    ; fetch the four rows (digits are row,column)
    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    ; regroup so that one paddw/psubw yields (a1,b1) and (d1,c1) pairs:
    ; xmm0 collects columns 0/1 of every row, xmm1 columns 3/2
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0xb1                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0xb1                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    ; pmaddwd folds each word pair into one dword result per row
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; From here until the transpose completes the digits are column,row:
    ; xmm0 = 23 22 21 20 03 02 01 00
    ; xmm3 = 33 32 31 30 13 12 11 10
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; transpose back to whole rows (digits are row,column again)
    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; second pass: swap the halves of xmm2 so row 0 meets row 3 and
    ; row 1 meets row 2
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 0x4e
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0xd8                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0xd8                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0xd8                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0xd8                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0xd8                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0xd8                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;all-ones where the word is 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;low 4 words = (d1 != 0),
                                                     ;high 4 words = 0

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------------
; void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Forward transform of two horizontally adjacent 4x4 blocks in one go.
;
;   input  - four rows of eight words; consecutive rows are 'pitch' bytes
;            apart. Rows are fetched with movdqa, so every row must be
;            16-byte aligned.
;   output - 32 words (64 bytes) stored with movdqa (16-byte aligned):
;            words  0..15 hold the left block (columns 0-3), row-major,
;            words 16..31 hold the right block (columns 4-7), row-major.
;
; Each block goes through the same two passes:
;   pass 1 (rows):    a1,b1,c1,d1 formed from the row samples and << 3,
;                     op0 = a1 + b1, op2 = a1 - b1,
;                     op1 = (c1*2217 + d1*5352 + 14500) >> 12,
;                     op3 = (d1*2217 - c1*5352 +  7500) >> 12
;   pass 2 (columns): op[0]  = (a1 + b1 + 7) >> 4
;                     op[8]  = (a1 - b1 + 7) >> 4
;                     op[4]  = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
;                     op[12] = (d1*2217 - c1*5352 + 51000) >> 16
; The a1 +/- b1 terms are computed with 16-bit word arithmetic here
; (paddw / psubw / psraw); the multiply terms go through 32-bit pmaddwd
; and are narrowed with packssdw.
;
; XMM registers written: xmm0-xmm6.
; Register-layout comments list words from least to most significant;
; digits are row,column.
;-----------------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; per-block column n (left block in the low half, right block in
        ; the high half):
        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,        3
        psllw       xmm4,        3

        psllw       xmm0,        3
        psllw       xmm1,        3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1 (d1 in the low word of each pair)
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with the first pass
        ; transpose for the second stage
        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17

        ; rows of the first-pass result:
        ; xmm0 row 0
        ; xmm4 row 1
        ; xmm1 row 2
        ; xmm3 row 3

        movdqa      xmm5,       xmm0        ; keep row 0 for d1
        movdqa      xmm2,       xmm1        ; keep row 2 for c1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; all-ones where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and keep bit 0:
                                                                    ; each word = (d1 != 0)

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1 (d1 in the low word of each pair)
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; split each result row into its left-block (low) and right-block
        ; (high) half and pair the halves up block by block
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1        ; left  block: op[0..3]  op[4..7]
        punpckhqdq  xmm4,       xmm1        ; right block: op[0..3]  op[4..7]

        punpcklqdq  xmm2,       xmm3        ; left  block: op[8..11] op[12..15]
        punpckhqdq  xmm5,       xmm3        ; right block: op[8..11] op[12..15]

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------------
; Constant tables for the two transforms above. Every table is one 16-byte
; vector and is 16-byte aligned because it is used as an XMMWORD memory
; operand.
;-----------------------------------------------------------------------------
SECTION_RODATA

; pmaddwd weights for interleaved (d1, c1) word pairs: d1*5352 + c1*2217
align 16
_5352_2217:
    times 4 dw 5352, 2217

; pmaddwd weights for interleaved (d1, c1) word pairs: d1*2217 - c1*5352
align 16
_2217_neg5352:
    times 4 dw 2217, -5352

; pmaddwd weights that add each (a1, b1) word pair: a1 + b1
align 16
_mult_add:
    times 8 dw 1

; 4x4 "(d1 != 0)" mask: bit 0 of the four low words only
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0

; 8x4 "(d1 != 0)" mask: bit 0 of all eight words
align 16
_cmp_mask8x4:
    times 8 dw 1

; pmaddwd weights that subtract each (a1, b1) word pair: a1 - b1
align 16
_mult_sub:
    times 4 dw 1, -1

; second-pass rounding term for the ">> 4" outputs, dword lanes (4x4)
align 16
_7:
    times 4 dd 7

; second-pass rounding term for the ">> 4" outputs, word lanes (8x4)
align 16
_7w:
    times 8 dw 7

; first-pass rounding terms for the ">> 12" outputs
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500

; second-pass rounding terms for the ">> 16" outputs
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000
