1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14; /****************************************************************************
15; * Notes:
16; *
; * This implementation uses 16-bit fixed-point versions of two multiplication
; * constants:
19; *        1.   sqrt(2) * cos (pi/8)
20; *        2.   sqrt(2) * sin (pi/8)
21; * Because the first constant is bigger than 1, to maintain the same 16 bit
22; * fixed point precision as the second one, we use a trick of
23; *        x * a = x + x*(a-1)
24; * so
25; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
26; *
; * The 16-bit fixed-point value of the second constant is 35468, which is
; * larger than 32767, so a signed 16-bit multiply treats it as a negative
; * number (35468 - 65536). Adding x back after the multiply compensates:
; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
31; *
32; **************************************************************************/
33
34
;-----------------------------------------------------------------------------
; Assemble-time helpers used only by vp8_short_idct4x4llm_mmx.
; They emit no code until invoked; GLOBAL() is expanded at the point of use.
;-----------------------------------------------------------------------------

; IDCT4X4_MMX_TERMS
;   In : mm0, mm1, mm2, mm3 = four vectors of 4 signed words (v0..v3)
;   Out: mm2 = a1 = v0 + v2
;        mm0 = b1 = v0 - v2
;        mm7 = v3*C - v1*S
;        mm3 = d1 = v1*C + v3*S
;   S and C stand for the two fixed-point constants described in the notes at
;   the top of this file. pmulhw keeps the high 16 bits of the signed product,
;   so each product is finished by adding the multiplicand back in.
;   Clobbers mm4 and mm5; mm1 is left untouched.
%macro IDCT4X4_MMX_TERMS 0
    ; even half: difference first, then rebuild the sum from it
    psubw       mm0, mm2                        ; b1 = v0 - v2
    paddw       mm2, mm2                        ; 2 * v2
    paddw       mm2, mm0                        ; a1 = v0 + v2

    ; v1 * S
    movq        mm5, mm1
    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1

    ; v3 * C, then subtract v1 * S
    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]
    paddw       mm7, mm3
    psubw       mm7, mm5                        ; mm7 = v3*C - v1*S

    ; v1 * C
    movq        mm5, mm1
    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1

    ; v3 * S, then add v1 * C
    movq        mm4, mm3
    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4
    paddw       mm3, mm5                        ; d1 = v1*C + v3*S
%endmacro

; IDCT4X4_MMX_MIX
;   In : mm2 = a1, mm0 = b1, mm7 and mm3 = d1 as left by IDCT4X4_MMX_TERMS
;   Out: mm2 = a1 + d1, mm0 = b1 - mm7, mm4 = b1 + mm7, mm6 = a1 - d1
;   The transpose below consumes them in the order mm2, mm0, mm4, mm6.
%macro IDCT4X4_MMX_MIX 0
    movq        mm6, mm2                        ; keep a1
    movq        mm4, mm0                        ; keep b1
    paddw       mm2, mm3                        ; a1 + d1
    psubw       mm6, mm3                        ; a1 - d1
    paddw       mm4, mm7                        ; b1 + mm7
    psubw       mm0, mm7                        ; b1 - mm7
%endmacro

; IDCT4X4_MMX_TRANSPOSE
;   In : mm2, mm0, mm4, mm6 = vectors A, B, C, D (4 words each)
;   Out: mm0, mm1, mm2, mm5 = word 0, 1, 2, 3 of each input gathered as
;        (A, B, C, D) from the low word upwards, i.e. the 4x4 word transpose.
;   Clobbers mm3 and mm4.
%macro IDCT4X4_MMX_TRANSPOSE 0
    movq        mm1, mm2
    punpcklwd   mm1, mm0                        ; B1 A1 B0 A0
    punpckhwd   mm2, mm0                        ; B3 A3 B2 A2

    movq        mm3, mm4
    punpcklwd   mm3, mm6                        ; D1 C1 D0 C0
    punpckhwd   mm4, mm6                        ; D3 C3 D2 C2

    movq        mm0, mm1
    punpckldq   mm0, mm3                        ; D0 C0 B0 A0
    punpckhdq   mm1, mm3                        ; D1 C1 B1 A1

    movq        mm5, mm2
    punpckldq   mm2, mm4                        ; D2 C2 B2 A2
    punpckhdq   mm5, mm4                        ; D3 C3 B3 A3
%endmacro

; IDCT4X4_MMX_RECON_ROW residual_reg, pred_address, dest_address
;   Loads 4 prediction bytes, widens them to words against the zero in mm7,
;   adds the residual words with signed saturation, packs back to bytes with
;   unsigned saturation and stores 4 bytes. Clobbers mm4 and the residual.
%macro IDCT4X4_MMX_RECON_ROW 3
    movd        mm4, [%2]
    punpcklbw   mm4, mm7
    paddsw      %1, mm4
    packuswb    %1, mm7
    movd        [%3], %1
%endmacro

;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;                               int pitch, unsigned char *dest, int stride)
;
; Reads 16 words from input, runs the 4-point butterfly twice with a 4x4
; transpose after each run, rounds the second run with (x + 4) >> 3, adds the
; result to a 4x4 block of prediction bytes and stores 4 rows of 4 bytes.
;
;   arg(0) input  : 32 bytes of coefficients, read as four quadwords
;   arg(1) pred   : prediction bytes; rows are `pitch` bytes apart
;   arg(2) pitch  : sign-extended from 32 bits
;   arg(3) dest   : output bytes; rows are `stride` bytes apart
;   arg(4) stride : sign-extended from 32 bits
;
; Uses rax, rdx, rsi, rdi and mm0-mm7. rsi and rdi are pushed and popped
; here. The prologue/epilogue macros come from x86_abi_support.asm.
; NOTE(review): no emms is executed in this routine; presumably the caller
; clears MMX state before any x87 use - confirm.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the 16 coefficients, four words per register.
    mov         rax, arg(0)                     ; input
    movq        mm0, [rax]
    movq        mm1, [rax + 8]
    movq        mm2, [rax + 16]
    movq        mm3, [rax + 24]

%if 0
    ; Disabled: would overwrite the coefficient block with zeros once it has
    ; been loaded.
    pxor        mm7, mm7
    movq        [rax], mm7
    movq        [rax + 8], mm7
    movq        [rax + 16], mm7
    movq        [rax + 24], mm7
%endif

    ; ---- first butterfly run, no rounding ----
    IDCT4X4_MMX_TERMS
    IDCT4X4_MMX_MIX
    IDCT4X4_MMX_TRANSPOSE
    movq        mm3, mm5                        ; inputs back in mm0..mm3

    ; ---- second butterfly run, rounded and scaled down by 8 ----
    IDCT4X4_MMX_TERMS
    paddw       mm0, [GLOBAL(fours)]            ; bias b1: reaches both b1 +/- mm7
    paddw       mm2, [GLOBAL(fours)]            ; bias a1: reaches both a1 +/- d1
    IDCT4X4_MMX_MIX
    psraw       mm2, 3
    psraw       mm0, 3
    psraw       mm4, 3
    psraw       mm6, 3
    IDCT4X4_MMX_TRANSPOSE                       ; rows now in mm0, mm1, mm2, mm5

    ; ---- add prediction and store, one row at a time ----
    mov         rsi, arg(1)                     ; pred
    movsxd      rax, dword ptr arg(2)           ; pitch
    mov         rdx, arg(3)                     ; dest
    movsxd      rdi, dword ptr arg(4)           ; stride
    pxor        mm7, mm7                        ; zero for byte<->word conversion

    IDCT4X4_MMX_RECON_ROW mm0, rsi, rdx
    IDCT4X4_MMX_RECON_ROW mm1, rsi + rax, rdx + rdi
    IDCT4X4_MMX_RECON_ROW mm2, rsi + 2*rax, rdx + rdi*2

    ; Step both pointers down one row so that base + 2*step is the fourth row.
    add         rdx, rdi
    add         rsi, rax
    IDCT4X4_MMX_RECON_ROW mm5, rsi + 2*rax, rdx + rdi*2

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
220
; DC_ONLY_MMX_ADD_CLAMP row_reg
;   Helper for vp8_dc_only_idct_add_mmx (assemble-time only).
;   Widens the 4 bytes in the low half of row_reg to words against the zero
;   in mm0, adds the 4 words of mm5 with signed saturation, and packs the
;   result back to bytes with unsigned saturation.
%macro DC_ONLY_MMX_ADD_CLAMP 1
    punpcklbw   %1, mm0
    paddsw      %1, mm5
    packuswb    %1, mm0
%endmacro

;-----------------------------------------------------------------------------
; void vp8_dc_only_idct_add_mmx(short input_dc,
;                               unsigned char *pred_ptr, int pred_stride,
;                               unsigned char *dst_ptr, int stride)
;
; Computes (input_dc + 4) >> 3 once, adds that value to each byte of a 4x4
; prediction block and writes the clamped bytes to the destination block.
;
;   arg(0) input_dc    : only the low word of the loaded dword is used
;   arg(1) pred_ptr    : prediction bytes; rows are pred_stride bytes apart
;   arg(2) pred_stride : sign-extended from 32 bits
;   arg(3) dst_ptr     : output bytes; rows are stride bytes apart
;   arg(4) stride      : sign-extended from 32 bits
;
; Uses rax, rcx, rdx and mm0-mm5. All four prediction rows are loaded before
; any destination row is written.
; NOTE(review): no emms is executed in this routine; presumably the caller
; clears MMX state before any x87 use - confirm.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    ; Round the DC term and replicate it into all four word lanes of mm5.
    movd        mm5, arg(0)                     ; input_dc
    paddw       mm5, [GLOBAL(fours)]
    psraw       mm5, 3
    punpcklwd   mm5, mm5                        ; low word duplicated into the low dword
    punpckldq   mm5, mm5                        ; low dword duplicated into both halves

    ; Fetch four prediction rows of 4 bytes each.
    mov         rax, arg(1)                     ; pred_ptr
    movsxd      rdx, dword ptr arg(2)           ; pred_stride
    lea         rcx, [rdx + rdx*2]              ; 3 * pred_stride
    movd        mm1, [rax]
    movd        mm2, [rax + rdx]
    movd        mm3, [rax + 2*rdx]
    movd        mm4, [rax + rcx]

    ; Add the DC value to every row with clamping to the byte range.
    pxor        mm0, mm0
    DC_ONLY_MMX_ADD_CLAMP mm1
    DC_ONLY_MMX_ADD_CLAMP mm2
    DC_ONLY_MMX_ADD_CLAMP mm3
    DC_ONLY_MMX_ADD_CLAMP mm4

    ; Write the four rows out.
    mov         rax, arg(3)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(4)           ; stride
    lea         rcx, [rdx + rdx*2]              ; 3 * stride
    movd        [rax], mm1
    movd        [rax + rdx], mm2
    movd        [rax + 2*rdx], mm3
    movd        [rax + rcx], mm4

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
285
SECTION_RODATA

; Each table below holds one 16-bit value repeated in four word lanes so it
; can be used directly as a 64-bit MMX memory operand.

; sqrt(2) * sin(pi/8) scaled by 65536: 0x8A8C = 35468. Read as a signed word
; this is negative, which is why the code adds the multiplicand back after
; pmulhw.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; (sqrt(2) * cos(pi/8) - 1) scaled by 65536: 0x4E7B = 20091. The "- 1" is
; restored in the code by adding the multiplicand back after pmulhw.
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

; Rounding bias added before the arithmetic shift right by 3.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004
296