1;
2;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION_RODATA
14
; Q14 fixed-point helper constants used by the transform below.
;   pw_11585x2 - 2 * 11585 in each of the 8 word lanes. It is the multiplier
;                operand of pmulhrsw, which yields (x * 23170 + 2^14) >> 15,
;                i.e. (x * 11585 + 2^13) >> 14.
;   pd_8192    - 2^13 in each of the 4 dword lanes: the rounding bias that is
;                added to every pmaddwd result before the arithmetic shift
;                right by 14.
pw_11585x2: times 8 dw 2 * 11585
pd_8192:    times 4 dd 1 << 13
17
; TRANSFORM_COEFFS a, b
;   Emits two 8-word (16-byte) pmaddwd operand tables:
;     pw_<a>_<b>  = { a,  b } repeated 4 times
;     pw_<b>_m<a> = { b, -a } repeated 4 times
;   Applied to words interleaved as (x, y), pmaddwd with the first table
;   produces x*a + y*b per dword lane and with the second x*b - y*a.
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   times 4 dw  %1,  %2
pw_%2_m%1:  times 4 dw  %2, -%1
%endmacro

; Coefficient pairs, scaled by 2^14 and rounded to integers.
TRANSFORM_COEFFS 11585,  11585    ; cos(pi/4),     cos(pi/4)
TRANSFORM_COEFFS 15137,   6270    ; cos(pi/8),     sin(pi/8)
TRANSFORM_COEFFS 16069,   3196    ; cos(pi/16),    sin(pi/16)
TRANSFORM_COEFFS  9102,  13623    ; sin(3*pi/16),  cos(3*pi/16)
27
28SECTION .text
29
%if ARCH_X86_64
;------------------------------------------------------------------------------
; fdct8x8(input, output, stride)                       SSSE3, x86-64 only
;
; Forward 8x8 DCT on 16-bit samples, done entirely in XMM registers:
;   1. load 8 rows of 8 words and scale them up by 4 (<< 2),
;   2. 1-D transform across the rows (down each column), then transpose,
;   3. the same 1-D transform again, then transpose back,
;   4. halve every coefficient, rounding toward zero, and store.
;
; In:   inputq  - first row of the source block; consecutive rows are
;                 2 * stride bytes apart (stride counts 16-bit elements).
;                 Rows are read with mova, so each must be 16-byte aligned.
;       outputq - destination for 8 rows of 8 words, written contiguously
;                 (128 bytes) with mova, so it must be 16-byte aligned.
;       stride  - row pitch of the input, in 16-bit elements.
; Out:  nothing in registers; results are stored at outputq.
; Uses: r3, r4 as scratch GPRs; m0-m12 (13 XMM registers requested from
;       cglobal, which takes care of any ABI-mandated save/restore).
;
; Constant registers for the first part of the function:
;   m8  = pd_8192    rounding bias for the ">> 14" after pmaddwd
;                    (m8 is reused as scratch in the final halving step)
;   m12 = pw_11585x2 pmulhrsw multiplier, i.e. * cos(pi/4) in Q14
;
; NOTE(review): the C prototype is not visible in this file. The sign
; extension below assumes stride is passed as a 32-bit int, whose upper
; register half is undefined under the 64-bit calling conventions - confirm
; against the declaration.
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride

  ; Make the full 64-bit stride register well defined before it is used in
  ; address arithmetic.
  movsxdifnidn  strideq, strided

  mova               m8, [GLOBAL(pd_8192)]
  mova              m12, [GLOBAL(pw_11585x2)]

  ; r3 = byte offset of one row, r4 = byte offset of two rows
  lea                r3, [2 * strideq]
  lea                r4, [4 * strideq]
  mova               m0, [inputq]                 ; row 0
  mova               m1, [inputq + r3]            ; row 1
  lea                inputq, [inputq + r4]
  mova               m2, [inputq]                 ; row 2
  mova               m3, [inputq + r3]            ; row 3
  lea                inputq, [inputq + r4]
  mova               m4, [inputq]                 ; row 4
  mova               m5, [inputq + r3]            ; row 5
  lea                inputq, [inputq + r4]
  mova               m6, [inputq]                 ; row 6
  mova               m7, [inputq + r3]            ; row 7

  ; left shift by 2 to increase forward transformation precision
  psllw              m0, 2
  psllw              m1, 2
  psllw              m2, 2
  psllw              m3, 2
  psllw              m4, 2
  psllw              m5, 2
  psllw              m6, 2
  psllw              m7, 2

  ; column transform
  ; stage 1: butterflies between row i and row 7 - i
  paddw m10, m0, m7
  psubw m0, m7

  paddw m9, m1, m6
  psubw m1, m6

  paddw m7, m2, m5
  psubw m2, m5

  paddw m6, m3, m4
  psubw m3, m4

  ; stage 2
  paddw m5, m9, m7
  psubw m9, m7

  paddw m4, m10, m6
  psubw m10, m6

  paddw m7, m1, m2
  psubw m1, m2

  ; stage 3
  paddw m6, m4, m5
  psubw m4, m5

  ; scale by cos(pi / 4)
  pmulhrsw m1, m12
  pmulhrsw m7, m12

  ; sin(pi / 8), cos(pi / 8)
  ; interleave the two inputs, multiply-accumulate against the coefficient
  ; pairs, round (+8192, >> 14) and pack back to words
  punpcklwd m2, m10, m9
  punpckhwd m10, m9
  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
  paddd m5, m8
  paddd m2, m8
  paddd m9, m8
  paddd m10, m8
  psrad m5, 14
  psrad m2, 14
  psrad m9, 14
  psrad m10, 14
  packssdw m5, m9                                 ; output row 2
  packssdw m2, m10                                ; output row 6

  ; scale by cos(pi / 4)
  pmulhrsw m6, m12                                ; output row 0
  pmulhrsw m4, m12                                ; output row 4

  paddw m9, m3, m1
  psubw m3, m1

  paddw m10, m0, m7
  psubw m0, m7

  ; stage 4
  ; sin(pi / 16), cos(pi / 16)
  punpcklwd m1, m10, m9
  punpckhwd m10, m9
  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
  paddd m7, m8
  paddd m1, m8
  paddd m9, m8
  paddd m10, m8
  psrad m7, 14
  psrad m1, 14
  psrad m9, 14
  psrad m10, 14
  packssdw m7, m9                                 ; output row 1
  packssdw m1, m10                                ; output row 7

  ; sin(3 * pi / 16), cos(3 * pi / 16)
  punpcklwd m11, m0, m3
  punpckhwd m0, m3
  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
  paddd m9, m8
  paddd m11, m8
  paddd m3, m8
  paddd m0, m8
  psrad m9, 14
  psrad m11, 14
  psrad m3, 14
  psrad m0, 14
  packssdw m9, m3                                 ; output row 5
  packssdw m11, m0                                ; output row 3

  ; transpose
  ; rows 0..7 are held in m6, m7, m5, m11, m4, m9, m2, m1
  ; stage 1: interleave words of adjacent rows
  punpcklwd m0, m6, m7
  punpcklwd m3, m5, m11
  punpckhwd m6, m7
  punpckhwd m5, m11
  punpcklwd m7, m4, m9
  punpcklwd m10, m2, m1
  punpckhwd m4, m9
  punpckhwd m2, m1

  ; stage 2: interleave dwords
  punpckldq m9, m0, m3
  punpckldq m1, m6, m5
  punpckhdq m0, m3
  punpckhdq m6, m5
  punpckldq m3, m7, m10
  punpckldq m5, m4, m2
  punpckhdq m7, m10
  punpckhdq m4, m2

  ; stage 3: interleave qwords
  ; afterwards columns 0..7 are held in m10, m9, m2, m0, m3, m1, m7, m6
  punpcklqdq m10, m9, m3
  punpckhqdq m9, m3
  punpcklqdq m2, m0, m7
  punpckhqdq m0, m7
  punpcklqdq m3, m1, m5
  punpckhqdq m1, m5
  punpcklqdq m7, m6, m4
  punpckhqdq m6, m4

  ; row transform
  ; stage 1
  paddw m5, m10, m6
  psubw m10, m6

  paddw m4, m9, m7
  psubw m9, m7

  paddw m6, m2, m1
  psubw m2, m1

  paddw m7, m0, m3
  psubw m0, m3

  ; stage 2
  paddw m1, m5, m7
  psubw m5, m7

  paddw m3, m4, m6
  psubw m4, m6

  paddw m7, m9, m2
  psubw m9, m2

  ; stage 3
  ; Unlike the column pass, the sum/difference scaled by cos(pi / 4) is
  ; formed here with 32-bit pmaddwd products rather than a 16-bit add
  ; followed by pmulhrsw (presumably to avoid overflowing 16 bits in the
  ; second pass).
  punpcklwd m6, m1, m3
  punpckhwd m1, m3
  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
  paddd m2, m8
  paddd m6, m8
  paddd m3, m8
  paddd m1, m8
  psrad m2, 14
  psrad m6, 14
  psrad m3, 14
  psrad m1, 14
  packssdw m2, m3                                 ; output row 0
  packssdw m6, m1                                 ; output row 4

  ; scale by cos(pi / 4)
  pmulhrsw m7, m12
  pmulhrsw m9, m12

  ; sin(pi / 8), cos(pi / 8)
  punpcklwd m3, m5, m4
  punpckhwd m5, m4
  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
  paddd m1, m8
  paddd m3, m8
  paddd m4, m8
  paddd m5, m8
  psrad m1, 14
  psrad m3, 14
  psrad m4, 14
  psrad m5, 14
  packssdw m1, m4                                 ; output row 2
  packssdw m3, m5                                 ; output row 6

  paddw m4, m0, m9
  psubw m0, m9

  paddw m5, m10, m7
  psubw m10, m7

  ; stage 4
  ; sin(pi / 16), cos(pi / 16)
  punpcklwd m9, m5, m4
  punpckhwd m5, m4
  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
  paddd m7, m8
  paddd m9, m8
  paddd m4, m8
  paddd m5, m8
  psrad m7, 14
  psrad m9, 14
  psrad m4, 14
  psrad m5, 14
  packssdw m7, m4                                 ; output row 1
  packssdw m9, m5                                 ; output row 7

  ; sin(3 * pi / 16), cos(3 * pi / 16)
  punpcklwd m4, m10, m0
  punpckhwd m10, m0
  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
  paddd m5, m8
  paddd m4, m8
  paddd m0, m8
  paddd m10, m8
  psrad m5, 14
  psrad m4, 14
  psrad m0, 14
  psrad m10, 14
  packssdw m5, m0                                 ; output row 5
  packssdw m4, m10                                ; output row 3

  ; transpose
  ; rows 0..7 are held in m2, m7, m1, m4, m6, m5, m3, m9
  ; stage 1: interleave words of adjacent rows
  punpcklwd m0, m2, m7
  punpcklwd m10, m1, m4
  punpckhwd m2, m7
  punpckhwd m1, m4
  punpcklwd m7, m6, m5
  punpcklwd m4, m3, m9
  punpckhwd m6, m5
  punpckhwd m3, m9

  ; stage 2: interleave dwords
  punpckldq m5, m0, m10
  punpckldq m9, m2, m1
  punpckhdq m0, m10
  punpckhdq m2, m1
  punpckldq m10, m7, m4
  punpckldq m1, m6, m3
  punpckhdq m7, m4
  punpckhdq m6, m3

  ; stage 3: interleave qwords
  ; afterwards the final rows 0..7 are held in m4, m5, m3, m0, m10, m9, m7, m2
  punpcklqdq m4, m5, m10
  punpckhqdq m5, m10
  punpcklqdq m3, m0, m7
  punpckhqdq m0, m7
  punpcklqdq m10, m9, m1
  punpckhqdq m9, m1
  punpcklqdq m7, m2, m6
  punpckhqdq m2, m6

  ; Halve every coefficient, rounding toward zero:
  ;   sign = x >> 15 (0 or -1);  x = (x - sign) >> 1
  ; m8 is no longer needed as the rounding constant and is reused as scratch.
  psraw m1, m4, 15
  psraw m6, m5, 15
  psraw m8, m3, 15
  psraw m11, m0, 15

  psubw m4, m1
  psubw m5, m6
  psubw m3, m8
  psubw m0, m11

  psraw m4, 1
  psraw m5, 1
  psraw m3, 1
  psraw m0, 1

  psraw m1, m10, 15
  psraw m6, m9, 15
  psraw m8, m7, 15
  psraw m11, m2, 15

  psubw m10, m1
  psubw m9, m6
  psubw m7, m8
  psubw m2, m11

  psraw m10, 1
  psraw m9, 1
  psraw m7, 1
  psraw m2, 1

  ; store the 8 result rows contiguously, 16 bytes each
  mova              [outputq +   0], m4
  mova              [outputq +  16], m5
  mova              [outputq +  32], m3
  mova              [outputq +  48], m0
  mova              [outputq +  64], m10
  mova              [outputq +  80], m9
  mova              [outputq +  96], m7
  mova              [outputq + 112], m2

  RET
%endif
362