1/*
2 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vp8_rtcd.h"
12#include "vpx_ports/mem.h"
13#include "vpx_ports/asmdefs_mmi.h"
14
15/* clang-format off */
/* TRANSPOSE_4H: transpose a 4x4 matrix of 16-bit halfwords.
   Input:    ftmp1, ftmp2, ftmp3, ftmp4 (one row of four halfwords each)
   Output:   ftmp1, ftmp2, ftmp3, ftmp4 (the transposed rows)
   Requires: ftmp0 must be zero on entry; it is only read, never written.
   Clobbers: ftmp5~ftmp9 (temporaries), ftmp10 (loaded with the pshufh
             control value 0x93) and the integer operand tmp0 (used to
             carry 0x93 into ftmp10).
   The expansion is a run of asm string literals, so it can only be used
   inside an asm template whose operand list names all of the operands above.
 */
#define TRANSPOSE_4H                                         \
  MMI_LI(%[tmp0], 0x93)                                      \
  "mtc1       %[tmp0],    %[ftmp10]                    \n\t" \
  "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp0]         \n\t" \
  "punpcklhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
  "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp0]         \n\t" \
  "punpckhhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
  "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp0]         \n\t" \
  "punpcklhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
  "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp0]         \n\t" \
  "punpckhhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "or         %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
  "punpcklwd  %[ftmp1],   %[ftmp5],   %[ftmp7]         \n\t" \
  "punpckhwd  %[ftmp2],   %[ftmp5],   %[ftmp7]         \n\t" \
  "punpcklwd  %[ftmp3],   %[ftmp6],   %[ftmp8]         \n\t" \
  "punpckhwd  %[ftmp4],   %[ftmp6],   %[ftmp8]         \n\t"
44/* clang-format on */
45
46void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
47  uint64_t tmp[1];
48  int16_t *ip = input;
49
50#if _MIPS_SIM == _ABIO32
51  register double ftmp0 asm("$f0");
52  register double ftmp1 asm("$f2");
53  register double ftmp2 asm("$f4");
54  register double ftmp3 asm("$f6");
55  register double ftmp4 asm("$f8");
56  register double ftmp5 asm("$f10");
57  register double ftmp6 asm("$f12");
58  register double ftmp7 asm("$f14");
59  register double ftmp8 asm("$f16");
60  register double ftmp9 asm("$f18");
61  register double ftmp10 asm("$f20");
62  register double ftmp11 asm("$f22");
63  register double ftmp12 asm("$f24");
64#else
65  register double ftmp0 asm("$f0");
66  register double ftmp1 asm("$f1");
67  register double ftmp2 asm("$f2");
68  register double ftmp3 asm("$f3");
69  register double ftmp4 asm("$f4");
70  register double ftmp5 asm("$f5");
71  register double ftmp6 asm("$f6");
72  register double ftmp7 asm("$f7");
73  register double ftmp8 asm("$f8");
74  register double ftmp9 asm("$f9");
75  register double ftmp10 asm("$f10");
76  register double ftmp11 asm("$f11");
77  register double ftmp12 asm("$f12");
78#endif  // _MIPS_SIM == _ABIO32
79
80  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
81  DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
82  DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
83  DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
84  DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
85  DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
86  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
87  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
88  DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
89  DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
90  DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
91
92  __asm__ volatile (
93    "xor        %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
94    "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
95    "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
96    MMI_ADDU(%[ip], %[ip], %[pitch])
97    "gsldlc1    %[ftmp2],   0x07(%[ip])                     \n\t"
98    "gsldrc1    %[ftmp2],   0x00(%[ip])                     \n\t"
99    MMI_ADDU(%[ip], %[ip], %[pitch])
100    "gsldlc1    %[ftmp3],   0x07(%[ip])                     \n\t"
101    "gsldrc1    %[ftmp3],   0x00(%[ip])                     \n\t"
102    MMI_ADDU(%[ip], %[ip], %[pitch])
103    "gsldlc1    %[ftmp4],   0x07(%[ip])                     \n\t"
104    "gsldrc1    %[ftmp4],   0x00(%[ip])                     \n\t"
105    MMI_ADDU(%[ip], %[ip], %[pitch])
106    TRANSPOSE_4H
107
108    "ldc1       %[ftmp11],  %[ff_ph_8]                      \n\t"
109    // f1 + f4
110    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
111    // a1
112    "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
113    // f2 + f3
114    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
115    // b1
116    "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
117    // f2 - f3
118    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
119    // c1
120    "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp11]       \n\t"
121    // f1 - f4
122    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
123    // d1
124    "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp11]       \n\t"
125    // op[0] = a1 + b1
126    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
127    // op[2] = a1 - b1
128    "psubh      %[ftmp3],   %[ftmp5],       %[ftmp6]        \n\t"
129
130    // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
131    MMI_LI(%[tmp0], 0x0c)
132    "mtc1       %[tmp0],    %[ftmp11]                       \n\t"
133    "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
134    "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
135    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
136    "punpckhhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
137    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op1]    \n\t"
138    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
139    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
140    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
141    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
142    "packsswh   %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
143
144    // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
145    "ldc1       %[ftmp12],  %[ff_pw_7500]                   \n\t"
146    "punpcklhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
147    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op3]    \n\t"
148    "punpckhhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
149    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op3]    \n\t"
150    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
151    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
152    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
153    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
154    "packsswh   %[ftmp4],   %[ftmp5],       %[ftmp6]        \n\t"
155    TRANSPOSE_4H
156
157    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
158    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
159    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
160    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
161
162    "pcmpeqh    %[ftmp0],   %[ftmp8],       %[ftmp0]        \n\t"
163    "ldc1       %[ftmp9],   %[ff_ph_01]                     \n\t"
164    "paddh      %[ftmp0],   %[ftmp0],       %[ftmp9]        \n\t"
165
166    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
167    "psubh      %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
168    "ldc1       %[ftmp9],   %[ff_ph_07]                     \n\t"
169    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
170    "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
171    MMI_LI(%[tmp0], 0x04)
172    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
173    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
174    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
175
176    MMI_LI(%[tmp0], 0x10)
177    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
178    "ldc1       %[ftmp12],  %[ff_pw_12000]                  \n\t"
179    "punpcklhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
180    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op1]    \n\t"
181    "punpckhhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
182    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op1]    \n\t"
183    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
184    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
185    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
186    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
187    "packsswh   %[ftmp3],   %[ftmp10],      %[ftmp11]       \n\t"
188    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]        \n\t"
189
190    "ldc1       %[ftmp12],  %[ff_pw_51000]                  \n\t"
191    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
192    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op3]    \n\t"
193    "punpckhhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
194    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op3]    \n\t"
195    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
196    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
197    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
198    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
199    "packsswh   %[ftmp4],   %[ftmp10],      %[ftmp11]       \n\t"
200
201    "gssdlc1    %[ftmp1],   0x07(%[output])                 \n\t"
202    "gssdrc1    %[ftmp1],   0x00(%[output])                 \n\t"
203    "gssdlc1    %[ftmp3],   0x0f(%[output])                 \n\t"
204    "gssdrc1    %[ftmp3],   0x08(%[output])                 \n\t"
205    "gssdlc1    %[ftmp2],   0x17(%[output])                 \n\t"
206    "gssdrc1    %[ftmp2],   0x10(%[output])                 \n\t"
207    "gssdlc1    %[ftmp4],   0x1f(%[output])                 \n\t"
208    "gssdrc1    %[ftmp4],   0x18(%[output])                 \n\t"
209
210    : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
211      [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
212      [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
213      [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
214      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
215    : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
216      [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
217      [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
218      [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
219      [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
220      [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
221    : "memory"
222  );
223}
224
/* Forward DCT of an 8x4 region, done as two independent 4x4 transforms:
 * first the left block, then the block starting four samples to its right.
 * The second block's 16 coefficients are written directly after the first
 * block's 16. `pitch` is passed through unchanged to the 4x4 routine. */
void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
  int blk;

  for (blk = 0; blk < 2; ++blk) {
    vp8_short_fdct4x4_mmi(input + 4 * blk, output + 16 * blk, pitch);
  }
}
229
/* Forward 4x4 Walsh-Hadamard transform, implemented with Loongson MMI
 * inline asm.
 *
 * input:  first of four rows of four int16_t samples. Rows are fetched with
 *         unaligned 8-byte loads, and `pitch` is added to the address register
 *         between rows, so `pitch` is a stride in bytes.
 * output: receives 16 int16_t coefficients as four consecutive 8-byte rows
 *         (byte offsets 0x00, 0x08, 0x10, 0x18), using unaligned stores.
 * pitch:  byte distance between successive input rows.
 *
 * Structure: load 4 rows -> transpose -> first pass on 16-bit lanes ->
 * transpose -> widen to 32-bit lanes -> second pass -> round, shift, pack
 * and store.
 */
void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
  double ftmp[13];
  uint32_t tmp[1];
  /* Packed constants: "ph" = four 16-bit lanes, "pw" = two 32-bit lanes. */
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
  /* As a pmaddhw multiplier this selects the odd halfword of each pair. */
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };

  __asm__ volatile (
    // ftmp0 = 0 (needed by TRANSPOSE_4H and the compares below);
    // ftmp11 = 2, the shift count for the first pass.
    MMI_LI(%[tmp0], 0x02)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

    // Load the four input rows into ftmp1..ftmp4.
    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp2],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[ip])                         \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp3],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp3],   0x00(%[ip])                         \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp4],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp4],   0x00(%[ip])                         \n\t"
    TRANSPOSE_4H

    // ---- first pass: every sample is scaled by 4 (shift left by 2) ----
    "psllh      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
    "psllh      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
    "psllh      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
    "psllh      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
    // a
    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
    // d
    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp4]            \n\t"
    // c
    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp4]            \n\t"
    // b
    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp3]            \n\t"

    // a + d
    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
    // b + c
    "paddh      %[ftmp2],   %[ftmp8],       %[ftmp7]            \n\t"
    // b - c
    "psubh      %[ftmp3],   %[ftmp8],       %[ftmp7]            \n\t"
    // a - d
    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp6]            \n\t"

    // (a + d) += (a != 0): the compare yields -1 where a == 0, then +1.
    // ftmp6 (d) is free to be overwritten here.
    "pcmpeqh    %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
    "paddh      %[ftmp6],   %[ftmp6],       %[ff_ph_01]         \n\t"
    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
    TRANSPOSE_4H

    // ---- widen to 32 bits: pmaddhw by (1,0) keeps the even halfword of
    // each pair, by (0,1) the odd one ----
    // op[2], op[0]
    "pmaddhw    %[ftmp5],   %[ftmp1],       %[ff_pw_01]         \n\t"
    // op[3], op[1]
    "pmaddhw    %[ftmp1],   %[ftmp1],       %[ff_pw_mask]       \n\t"

    // op[6], op[4]
    "pmaddhw    %[ftmp6],   %[ftmp2],       %[ff_pw_01]         \n\t"
    // op[7], op[5]
    "pmaddhw    %[ftmp2],   %[ftmp2],       %[ff_pw_mask]       \n\t"

    // op[10], op[8]
    "pmaddhw    %[ftmp7],   %[ftmp3],       %[ff_pw_01]         \n\t"
    // op[11], op[9]
    "pmaddhw    %[ftmp3],   %[ftmp3],       %[ff_pw_mask]       \n\t"

    // op[14], op[12]
    "pmaddhw    %[ftmp8],   %[ftmp4],       %[ff_pw_01]         \n\t"
    // op[15], op[13]
    "pmaddhw    %[ftmp4],   %[ftmp4],       %[ff_pw_mask]       \n\t"

    // ---- second pass, even columns (held in ftmp5..ftmp8) ----
    // a1, a3
    "paddw      %[ftmp9],   %[ftmp5],       %[ftmp7]            \n\t"
    // d1, d3
    "paddw      %[ftmp10],  %[ftmp6],       %[ftmp8]            \n\t"
    // c1, c3
    "psubw      %[ftmp11],  %[ftmp6],       %[ftmp8]            \n\t"
    // b1, b3
    "psubw      %[ftmp12],  %[ftmp5],       %[ftmp7]            \n\t"

    // a1 + d1, a3 + d3
    "paddw      %[ftmp5],   %[ftmp9],       %[ftmp10]           \n\t"
    // b1 + c1, b3 + c3
    "paddw      %[ftmp6],   %[ftmp12],      %[ftmp11]           \n\t"
    // b1 - c1, b3 - c3
    "psubw      %[ftmp7],   %[ftmp12],      %[ftmp11]           \n\t"
    // a1 - d1, a3 - d3
    "psubw      %[ftmp8],   %[ftmp9],       %[ftmp10]           \n\t"

    // ---- second pass, odd columns (held in ftmp1..ftmp4) ----
    // a2, a4
    "paddw      %[ftmp9],   %[ftmp1],       %[ftmp3]            \n\t"
    // d2, d4
    "paddw      %[ftmp10],  %[ftmp2],       %[ftmp4]            \n\t"
    // c2, c4
    "psubw      %[ftmp11],  %[ftmp2],       %[ftmp4]            \n\t"
    // b2, b4
    "psubw      %[ftmp12],  %[ftmp1],       %[ftmp3]            \n\t"

    // a2 + d2, a4 + d4
    "paddw      %[ftmp1],   %[ftmp9],       %[ftmp10]           \n\t"
    // b2 + c2, b4 + c4
    "paddw      %[ftmp2],   %[ftmp12],      %[ftmp11]           \n\t"
    // b2 - c2, b4 - c4
    "psubw      %[ftmp3],   %[ftmp12],      %[ftmp11]           \n\t"
    // a2 - d2, a4 - d4
    "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"

    // ---- rounding: x = (x + (x < 0) + 3) >> 3 for each of the eight
    // result registers; ftmp11 now holds the shift count 3 ----
    MMI_LI(%[tmp0], 0x03)
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
    "paddw      %[ftmp1],   %[ftmp1],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp2]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
    "paddw      %[ftmp2],   %[ftmp2],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp3]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp4]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
    "paddw      %[ftmp4],   %[ftmp4],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp5]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp9]            \n\t"
    "paddw      %[ftmp5],   %[ftmp5],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp6]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
    "paddw      %[ftmp6],   %[ftmp6],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp7]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp9]            \n\t"
    "paddw      %[ftmp7],   %[ftmp7],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp7],   %[ftmp7],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp8]            \n\t"
    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t"
    "paddw      %[ftmp8],   %[ftmp8],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp8],   %[ftmp8],       %[ftmp11]           \n\t"

    // Narrow back to 16 bits. Each packed register holds the odd-column
    // pair followed by the even-column pair.
    "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
    "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
    "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
    "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"

    // Shuffle with control 0x72 to put the halfwords back in column order.
    MMI_LI(%[tmp0], 0x72)
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
    "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
    "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
    "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"

    // Store the four output rows.
    "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
    "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
    "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
    "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
    "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
    "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
    "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
    "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]),
      [tmp0]"=&r"(tmp[0]),
      [ip]"+&r"(input)
    : [op]"r"(output),
      [ff_pw_01]"f"(ff_pw_01),          [pitch]"r"((mips_reg)pitch),
      [ff_pw_03]"f"(ff_pw_03),          [ff_pw_mask]"f"(ff_pw_mask),
      [ff_ph_01]"f"(ff_ph_01)
    : "memory"
  );
}
426