1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_config.h"
12#include "./vpx_dsp_rtcd.h"
13#include "vpx_dsp/mips/inv_txfm_dspr2.h"
14#include "vpx_dsp/txfm_common.h"
15
16#if HAVE_DSPR2
17void idct16_rows_dspr2(const int16_t *input, int16_t *output,
18                       uint32_t no_rows) {
19  int i;
20  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
21  int step1_10, step1_11, step1_12, step1_13;
22  int step2_0, step2_1, step2_2, step2_3;
23  int step2_8, step2_9, step2_10, step2_11;
24  int step2_12, step2_13, step2_14, step2_15;
25  int load1, load2, load3, load4, load5, load6, load7, load8;
26  int result1, result2, result3, result4;
27  const int const_2_power_13 = 8192;
28
29  for (i = no_rows; i--;) {
30    /* prefetch row */
31    prefetch_load((const uint8_t *)(input + 16));
32
33    __asm__ __volatile__(
34        "lh       %[load1],              0(%[input])                    \n\t"
35        "lh       %[load2],             16(%[input])                    \n\t"
36        "lh       %[load3],              8(%[input])                    \n\t"
37        "lh       %[load4],             24(%[input])                    \n\t"
38
39        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
40        "mthi     $zero,                $ac1                            \n\t"
41        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
42        "mthi     $zero,                $ac2                            \n\t"
43        "add      %[result1],           %[load1],       %[load2]        \n\t"
44        "sub      %[result2],           %[load1],       %[load2]        \n\t"
45        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
46        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
47        "extp     %[step2_0],           $ac1,           31              \n\t"
48        "extp     %[step2_1],           $ac2,           31              \n\t"
49
50        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
51        "mthi     $zero,                $ac3                            \n\t"
52        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
53        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
54        "extp     %[step2_2],           $ac3,           31              \n\t"
55
56        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
57        "mthi     $zero,                $ac1                            \n\t"
58        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
59        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
60        "extp     %[step2_3],           $ac1,           31              \n\t"
61
62        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
63        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
64        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
65        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
66
67        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
68          [load4] "=&r"(load4), [result1] "=&r"(result1),
69          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
70          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
71          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
72          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
73          [step1_3] "=r"(step1_3)
74        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
75          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
76          [cospi_16_64] "r"(cospi_16_64));
77
78    __asm__ __volatile__(
79        "lh       %[load5],             2(%[input])                     \n\t"
80        "lh       %[load6],             30(%[input])                    \n\t"
81        "lh       %[load7],             18(%[input])                    \n\t"
82        "lh       %[load8],             14(%[input])                    \n\t"
83
84        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
85        "mthi     $zero,                $ac1                            \n\t"
86        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
87        "mthi     $zero,                $ac3                            \n\t"
88
89        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
90        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
91        "extp     %[result1],           $ac1,           31              \n\t"
92
93        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
94        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
95        "extp     %[result2],           $ac3,           31              \n\t"
96
97        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
98        "mthi     $zero,                $ac1                            \n\t"
99        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
100        "mthi     $zero,                $ac2                            \n\t"
101
102        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
103        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
104        "extp     %[result3],           $ac1,           31              \n\t"
105
106        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
107        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
108        "extp     %[result4],           $ac2,           31              \n\t"
109
110        "sub      %[load5],             %[result1],     %[result2]      \n\t"
111        "sub      %[load6],             %[result4],     %[result3]      \n\t"
112
113        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
114        "mthi     $zero,                $ac1                            \n\t"
115        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
116        "mthi     $zero,                $ac3                            \n\t"
117
118        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
119        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
120        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
121        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
122
123        "extp     %[step2_9],           $ac1,           31              \n\t"
124        "extp     %[step2_14],          $ac3,           31              \n\t"
125        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
126        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
127
128        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
129          [load8] "=&r"(load8), [result1] "=&r"(result1),
130          [result2] "=&r"(result2), [result3] "=&r"(result3),
131          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
132          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
133          [step2_14] "=r"(step2_14)
134        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
135          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
136          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
137          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
138
139    __asm__ __volatile__(
140        "lh       %[load1],             10(%[input])                    \n\t"
141        "lh       %[load2],             22(%[input])                    \n\t"
142        "lh       %[load3],             26(%[input])                    \n\t"
143        "lh       %[load4],             6(%[input])                     \n\t"
144
145        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
146        "mthi     $zero,                $ac1                            \n\t"
147        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
148        "mthi     $zero,                $ac3                            \n\t"
149
150        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
151        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
152        "extp     %[result1],           $ac1,           31              \n\t"
153
154        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
155        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
156        "extp     %[result2],           $ac3,           31              \n\t"
157
158        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
159        "mthi     $zero,                $ac1                            \n\t"
160        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
161        "mthi     $zero,                $ac2                            \n\t"
162
163        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
164        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
165        "extp     %[result3],           $ac1,           31              \n\t"
166
167        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
168        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
169        "extp     %[result4],           $ac2,           31              \n\t"
170
171        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
172        "mthi     $zero,                $ac1                            \n\t"
173        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
174        "mthi     $zero,                $ac3                            \n\t"
175
176        "sub      %[load1],             %[result2],     %[result1]      \n\t"
177        "sub      %[load2],             %[result4],     %[result3]      \n\t"
178
179        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
180        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
181        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
182        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
183
184        "extp     %[step2_10],          $ac1,           31              \n\t"
185        "extp     %[step2_13],          $ac3,           31              \n\t"
186        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
187        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
188
189        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
190          [load4] "=&r"(load4), [result1] "=&r"(result1),
191          [result2] "=&r"(result2), [result3] "=&r"(result3),
192          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
193          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
194          [step2_13] "=r"(step2_13)
195        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
196          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
197          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
198          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
199
200    __asm__ __volatile__(
201        "lh       %[load5],             4(%[input])                     \n\t"
202        "lh       %[load6],             28(%[input])                    \n\t"
203        "lh       %[load7],             20(%[input])                    \n\t"
204        "lh       %[load8],             12(%[input])                    \n\t"
205
206        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
207        "mthi     $zero,                $ac1                            \n\t"
208        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
209        "mthi     $zero,                $ac3                            \n\t"
210
211        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
212        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
213        "extp     %[result1],           $ac1,           31              \n\t"
214
215        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
216        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
217        "extp     %[result2],           $ac3,           31              \n\t"
218
219        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
220        "mthi     $zero,                $ac1                            \n\t"
221        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
222        "mthi     $zero,                $ac2                            \n\t"
223
224        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
225        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
226        "extp     %[result3],           $ac1,           31              \n\t"
227
228        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
229        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
230        "extp     %[result4],           $ac2,           31              \n\t"
231
232        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
233        "mthi     $zero,                $ac1                            \n\t"
234        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
235        "mthi     $zero,                $ac3                            \n\t"
236
237        "sub      %[load5],             %[result4],     %[result3]      \n\t"
238        "sub      %[load5],             %[load5],       %[result1]      \n\t"
239        "add      %[load5],             %[load5],       %[result2]      \n\t"
240
241        "sub      %[load6],             %[result1],     %[result2]      \n\t"
242        "sub      %[load6],             %[load6],       %[result3]      \n\t"
243        "add      %[load6],             %[load6],       %[result4]      \n\t"
244
245        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
246        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
247
248        "extp     %[step1_5],           $ac1,           31              \n\t"
249        "extp     %[step1_6],           $ac3,           31              \n\t"
250        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
251        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
252
253        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
254          [load8] "=&r"(load8), [result1] "=&r"(result1),
255          [result2] "=&r"(result2), [result3] "=&r"(result3),
256          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
257          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
258          [step1_7] "=r"(step1_7)
259        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
260          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
261          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
262          [cospi_16_64] "r"(cospi_16_64));
263
264    __asm__ __volatile__(
265        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
266        "mthi     $zero,                $ac0                            \n\t"
267        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
268        "mthi     $zero,                $ac1                            \n\t"
269
270        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
271        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
272        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
273
274        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
275
276        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
277        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
278        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
279
280        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
281
282        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
283        "mthi     $zero,                $ac2                            \n\t"
284        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
285        "mthi     $zero,                $ac3                            \n\t"
286
287        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
288        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
289        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
290
291        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
292
293        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
294        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
295        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
296
297        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
298
299        "extp     %[step1_10],          $ac0,           31              \n\t"
300        "extp     %[step1_13],          $ac1,           31              \n\t"
301        "extp     %[step1_11],          $ac2,           31              \n\t"
302        "extp     %[step1_12],          $ac3,           31              \n\t"
303
304        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
305          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
306          [step1_13] "=r"(step1_13)
307        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
308          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
309          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
310          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
311          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
312
313    __asm__ __volatile__(
314        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
315        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
316        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
317        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
318        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
319        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
320        "sh       %[load5],             0(%[output])                    \n\t"
321        "sh       %[load6],             32(%[output])                   \n\t"
322        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
323        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
324        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
325        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
326        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
327        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
328        "sh       %[load5],             192(%[output])                  \n\t"
329        "sh       %[load6],             224(%[output])                  \n\t"
330        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
331        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
332        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
333        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
334        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
335        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
336        "sh       %[load5],             256(%[output])                  \n\t"
337        "sh       %[load6],             288(%[output])                  \n\t"
338        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
339        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
340        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
341        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
342        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
343        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
344        "sh       %[load5],             448(%[output])                  \n\t"
345        "sh       %[load6],             480(%[output])                  \n\t"
346
347        : [load5] "=&r"(load5), [load6] "=&r"(load6)
348        : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
349          [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
350          [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
351          [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
352          [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
353          [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
354
355    __asm__ __volatile__(
356        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
357        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
358        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
359        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
360        "sh       %[load5],             64(%[output])                   \n\t"
361        "sh       %[load6],             96(%[output])                   \n\t"
362        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
363        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
364        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
365        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
366        "sh       %[load5],             128(%[output])                  \n\t"
367        "sh       %[load6],             160(%[output])                  \n\t"
368        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
369        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
370        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
371        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
372        "sh       %[load5],             320(%[output])                  \n\t"
373        "sh       %[load6],             352(%[output])                  \n\t"
374        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
375        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
376        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
377        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
378        "sh       %[load5],             384(%[output])                  \n\t"
379        "sh       %[load6],             416(%[output])                  \n\t"
380
381        : [load5] "=&r"(load5), [load6] "=&r"(load6)
382        : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
383          [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
384          [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
385          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
386
387    input += 16;
388    output += 1;
389  }
390}
391
392void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
393  int i;
394  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
395  int step1_8, step1_9, step1_10, step1_11;
396  int step1_12, step1_13, step1_14, step1_15;
397  int step2_0, step2_1, step2_2, step2_3;
398  int step2_8, step2_9, step2_10, step2_11;
399  int step2_12, step2_13, step2_14, step2_15;
400  int load1, load2, load3, load4, load5, load6, load7, load8;
401  int result1, result2, result3, result4;
402  const int const_2_power_13 = 8192;
403  uint8_t *dest_pix;
404  uint8_t *cm = vpx_ff_cropTbl;
405
406  /* prefetch vpx_ff_cropTbl */
407  prefetch_load(vpx_ff_cropTbl);
408  prefetch_load(vpx_ff_cropTbl + 32);
409  prefetch_load(vpx_ff_cropTbl + 64);
410  prefetch_load(vpx_ff_cropTbl + 96);
411  prefetch_load(vpx_ff_cropTbl + 128);
412  prefetch_load(vpx_ff_cropTbl + 160);
413  prefetch_load(vpx_ff_cropTbl + 192);
414  prefetch_load(vpx_ff_cropTbl + 224);
415
416  for (i = 0; i < 16; ++i) {
417    dest_pix = (dest + i);
418    __asm__ __volatile__(
419        "lh       %[load1],              0(%[input])                    \n\t"
420        "lh       %[load2],             16(%[input])                    \n\t"
421        "lh       %[load3],              8(%[input])                    \n\t"
422        "lh       %[load4],             24(%[input])                    \n\t"
423
424        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
425        "mthi     $zero,                $ac1                            \n\t"
426        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
427        "mthi     $zero,                $ac2                            \n\t"
428        "add      %[result1],           %[load1],       %[load2]        \n\t"
429        "sub      %[result2],           %[load1],       %[load2]        \n\t"
430        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
431        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
432        "extp     %[step2_0],           $ac1,           31              \n\t"
433        "extp     %[step2_1],           $ac2,           31              \n\t"
434
435        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
436        "mthi     $zero,                $ac3                            \n\t"
437        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
438        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
439        "extp     %[step2_2],           $ac3,           31              \n\t"
440
441        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
442        "mthi     $zero,                $ac1                            \n\t"
443        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
444        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
445        "extp     %[step2_3],           $ac1,           31              \n\t"
446
447        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
448        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
449        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
450        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
451
452        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
453          [load4] "=&r"(load4), [result1] "=&r"(result1),
454          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
455          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
456          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
457          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
458          [step1_3] "=r"(step1_3)
459        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
460          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
461          [cospi_16_64] "r"(cospi_16_64));
462
463    __asm__ __volatile__(
464        "lh       %[load5],             2(%[input])                     \n\t"
465        "lh       %[load6],             30(%[input])                    \n\t"
466        "lh       %[load7],             18(%[input])                    \n\t"
467        "lh       %[load8],             14(%[input])                    \n\t"
468
469        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
470        "mthi     $zero,                $ac1                            \n\t"
471        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
472        "mthi     $zero,                $ac3                            \n\t"
473
474        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
475        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
476        "extp     %[result1],           $ac1,           31              \n\t"
477
478        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
479        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
480        "extp     %[result2],           $ac3,           31              \n\t"
481
482        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
483        "mthi     $zero,                $ac1                            \n\t"
484        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
485        "mthi     $zero,                $ac2                            \n\t"
486
487        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
488        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
489        "extp     %[result3],           $ac1,           31              \n\t"
490
491        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
492        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
493        "extp     %[result4],           $ac2,            31             \n\t"
494
495        "sub      %[load5],             %[result1],     %[result2]      \n\t"
496        "sub      %[load6],             %[result4],     %[result3]      \n\t"
497
498        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
499        "mthi     $zero,                $ac1                            \n\t"
500        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
501        "mthi     $zero,                $ac3                            \n\t"
502
503        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
504        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
505        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
506        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
507
508        "extp     %[step2_9],           $ac1,           31              \n\t"
509        "extp     %[step2_14],          $ac3,           31              \n\t"
510        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
511        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
512
513        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
514          [load8] "=&r"(load8), [result1] "=&r"(result1),
515          [result2] "=&r"(result2), [result3] "=&r"(result3),
516          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
517          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
518          [step2_14] "=r"(step2_14)
519        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
520          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
521          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
522          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
523
524    __asm__ __volatile__(
525        "lh       %[load1],             10(%[input])                    \n\t"
526        "lh       %[load2],             22(%[input])                    \n\t"
527        "lh       %[load3],             26(%[input])                    \n\t"
528        "lh       %[load4],             6(%[input])                     \n\t"
529
530        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
531        "mthi     $zero,                $ac1                            \n\t"
532        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
533        "mthi     $zero,                $ac3                            \n\t"
534
535        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
536        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
537        "extp     %[result1],           $ac1,        31                 \n\t"
538
539        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
540        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
541        "extp     %[result2],           $ac3,        31                 \n\t"
542
543        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
544        "mthi     $zero,                $ac1                            \n\t"
545        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
546        "mthi     $zero,                $ac2                            \n\t"
547
548        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
549        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
550        "extp     %[result3],           $ac1,        31                 \n\t"
551
552        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
553        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
554        "extp     %[result4],           $ac2,        31                 \n\t"
555
556        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
557        "mthi     $zero,                $ac1                            \n\t"
558        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
559        "mthi     $zero,                $ac3                            \n\t"
560
561        "sub      %[load1],             %[result2],     %[result1]      \n\t"
562        "sub      %[load2],             %[result4],     %[result3]      \n\t"
563
564        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
565        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
566        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
567        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
568
569        "extp     %[step2_10],          $ac1,           31              \n\t"
570        "extp     %[step2_13],          $ac3,           31              \n\t"
571        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
572        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
573
574        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
575          [load4] "=&r"(load4), [result1] "=&r"(result1),
576          [result2] "=&r"(result2), [result3] "=&r"(result3),
577          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
578          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
579          [step2_13] "=r"(step2_13)
580        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
581          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
582          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
583          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
584
585    __asm__ __volatile__(
586        "lh       %[load5],             4(%[input])                   \n\t"
587        "lh       %[load6],             28(%[input])                  \n\t"
588        "lh       %[load7],             20(%[input])                  \n\t"
589        "lh       %[load8],             12(%[input])                  \n\t"
590
591        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
592        "mthi     $zero,                $ac1                          \n\t"
593        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
594        "mthi     $zero,                $ac3                          \n\t"
595
596        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
597        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
598        "extp     %[result1],           $ac1,        31               \n\t"
599
600        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
601        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
602        "extp     %[result2],           $ac3,        31               \n\t"
603
604        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
605        "mthi     $zero,                $ac1                          \n\t"
606        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
607        "mthi     $zero,                $ac2                          \n\t"
608
609        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
610        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
611        "extp     %[result3],           $ac1,        31               \n\t"
612
613        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
614        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
615        "extp     %[result4],           $ac2,        31               \n\t"
616
617        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
618        "mthi     $zero,                $ac1                            \n\t"
619        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
620        "mthi     $zero,                $ac3                            \n\t"
621
622        "sub      %[load5],             %[result4],     %[result3]      \n\t"
623        "sub      %[load5],             %[load5],       %[result1]      \n\t"
624        "add      %[load5],             %[load5],       %[result2]      \n\t"
625
626        "sub      %[load6],             %[result1],     %[result2]      \n\t"
627        "sub      %[load6],             %[load6],       %[result3]      \n\t"
628        "add      %[load6],             %[load6],       %[result4]      \n\t"
629
630        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
631        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
632
633        "extp     %[step1_5],           $ac1,           31              \n\t"
634        "extp     %[step1_6],           $ac3,           31              \n\t"
635
636        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
637        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
638
639        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
640          [load8] "=&r"(load8), [result1] "=&r"(result1),
641          [result2] "=&r"(result2), [result3] "=&r"(result3),
642          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
643          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
644          [step1_7] "=r"(step1_7)
645        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
646          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
647          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
648          [cospi_16_64] "r"(cospi_16_64));
649
650    __asm__ __volatile__(
651        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
652        "mthi     $zero,                $ac0                            \n\t"
653        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
654        "mthi     $zero,                $ac1                            \n\t"
655
656        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
657        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
658        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
659
660        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
661
662        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
663        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
664        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
665
666        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
667
668        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
669        "mthi     $zero,                $ac2                            \n\t"
670        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
671        "mthi     $zero,                $ac3                            \n\t"
672
673        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
674        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
675        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
676
677        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
678
679        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
680        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
681        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
682
683        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
684
685        "extp     %[step1_10],          $ac0,           31              \n\t"
686        "extp     %[step1_13],          $ac1,           31              \n\t"
687        "extp     %[step1_11],          $ac2,           31              \n\t"
688        "extp     %[step1_12],          $ac3,           31              \n\t"
689
690        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
691          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
692          [step1_13] "=r"(step1_13)
693        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
694          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
695          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
696          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
697          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
698
699    step1_8 = step2_8 + step2_11;
700    step1_9 = step2_9 + step2_10;
701    step1_14 = step2_13 + step2_14;
702    step1_15 = step2_12 + step2_15;
703
704    __asm__ __volatile__(
705        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
706        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
707        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
708        "addi     %[load5],         %[load5],           32              \n\t"
709        "sra      %[load5],         %[load5],           6               \n\t"
710        "add      %[load7],         %[load7],           %[load5]        \n\t"
711        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
712        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
713        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
714        "sb       %[load5],         0(%[dest_pix])                      \n\t"
715        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
716        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
717        "addi     %[load6],         %[load6],           32              \n\t"
718        "sra      %[load6],         %[load6],           6               \n\t"
719        "add      %[load8],         %[load8],           %[load6]        \n\t"
720        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
721        "sb       %[load6],         0(%[dest_pix])                      \n\t"
722        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
723
724        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
725        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
726        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
727        "addi     %[load5],         %[load5],           32              \n\t"
728        "sra      %[load5],         %[load5],           6               \n\t"
729        "add      %[load7],         %[load7],           %[load5]        \n\t"
730        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
731        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
732        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
733        "sb       %[load5],         0(%[dest_pix])                      \n\t"
734        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
735        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
736        "addi     %[load6],         %[load6],           32              \n\t"
737        "sra      %[load6],         %[load6],           6               \n\t"
738        "add      %[load8],         %[load8],           %[load6]        \n\t"
739        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
740        "sb       %[load6],         0(%[dest_pix])                      \n\t"
741        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
742
743        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
744        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
745        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
746        "addi     %[load5],         %[load5],           32              \n\t"
747        "sra      %[load5],         %[load5],           6               \n\t"
748        "add      %[load7],         %[load7],           %[load5]        \n\t"
749        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
750        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
751        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
752        "sb       %[load5],         0(%[dest_pix])                      \n\t"
753        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
754        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
755        "addi     %[load6],         %[load6],           32              \n\t"
756        "sra      %[load6],         %[load6],           6               \n\t"
757        "add      %[load8],         %[load8],           %[load6]        \n\t"
758        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
759        "sb       %[load6],         0(%[dest_pix])                      \n\t"
760        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
761
762        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
763        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
764        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
765        "addi     %[load5],         %[load5],           32              \n\t"
766        "sra      %[load5],         %[load5],           6               \n\t"
767        "add      %[load7],         %[load7],           %[load5]        \n\t"
768        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
769        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
770        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
771        "sb       %[load5],         0(%[dest_pix])                      \n\t"
772        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
773        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
774        "addi     %[load6],         %[load6],           32              \n\t"
775        "sra      %[load6],         %[load6],           6               \n\t"
776        "add      %[load8],         %[load8],           %[load6]        \n\t"
777        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
778        "sb       %[load6],         0(%[dest_pix])                      \n\t"
779        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
780
781        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
782        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
783        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
784        "addi     %[load5],         %[load5],           32              \n\t"
785        "sra      %[load5],         %[load5],           6               \n\t"
786        "add      %[load7],         %[load7],           %[load5]        \n\t"
787        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
788        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
789        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
790        "sb       %[load5],         0(%[dest_pix])                      \n\t"
791        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
792        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
793        "addi     %[load6],         %[load6],           32              \n\t"
794        "sra      %[load6],         %[load6],           6               \n\t"
795        "add      %[load8],         %[load8],           %[load6]        \n\t"
796        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
797        "sb       %[load6],         0(%[dest_pix])                      \n\t"
798        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
799
800        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
801        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
802        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
803        "addi     %[load5],         %[load5],           32              \n\t"
804        "sra      %[load5],         %[load5],           6               \n\t"
805        "add      %[load7],         %[load7],           %[load5]        \n\t"
806        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
807        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
808        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
809        "sb       %[load5],         0(%[dest_pix])                      \n\t"
810        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
811        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
812        "addi     %[load6],         %[load6],           32              \n\t"
813        "sra      %[load6],         %[load6],           6               \n\t"
814        "add      %[load8],         %[load8],           %[load6]        \n\t"
815        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
816        "sb       %[load6],         0(%[dest_pix])                      \n\t"
817        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
818
819        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
820        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
821        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
822        "addi     %[load5],         %[load5],           32              \n\t"
823        "sra      %[load5],         %[load5],           6               \n\t"
824        "add      %[load7],         %[load7],           %[load5]        \n\t"
825        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
826        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
827        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
828        "sb       %[load5],         0(%[dest_pix])                      \n\t"
829        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
830        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
831        "addi     %[load6],         %[load6],           32              \n\t"
832        "sra      %[load6],         %[load6],           6               \n\t"
833        "add      %[load8],         %[load8],           %[load6]        \n\t"
834        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
835        "sb       %[load6],         0(%[dest_pix])                      \n\t"
836        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
837
838        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
839        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
840        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
841        "addi     %[load5],         %[load5],           32              \n\t"
842        "sra      %[load5],         %[load5],           6               \n\t"
843        "add      %[load7],         %[load7],           %[load5]        \n\t"
844        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
845        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
846        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
847        "sb       %[load5],         0(%[dest_pix])                      \n\t"
848        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
849        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
850        "addi     %[load6],         %[load6],           32              \n\t"
851        "sra      %[load6],         %[load6],           6               \n\t"
852        "add      %[load8],         %[load8],           %[load6]        \n\t"
853        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
854        "sb       %[load6],         0(%[dest_pix])                      \n\t"
855
856        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
857          [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
858        :
859        [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
860        [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
861        [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
862        [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
863        [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
864        [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
865        [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
866
867    input += 16;
868  }
869}
870
871void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
872                                 int stride) {
873  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
874  uint32_t pos = 45;
875
876  /* bit positon for extract from acc */
877  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
878
879  // First transform rows
880  idct16_rows_dspr2(input, out, 16);
881
882  // Then transform columns and add to dest
883  idct16_cols_add_blk_dspr2(out, dest, stride);
884}
885
886void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
887                                int stride) {
888  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
889  int16_t *outptr = out;
890  uint32_t i;
891  uint32_t pos = 45;
892
893  /* bit positon for extract from acc */
894  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
895
896  // First transform rows. Since all non-zero dct coefficients are in
897  // upper-left 4x4 area, we only need to calculate first 4 rows here.
898  idct16_rows_dspr2(input, outptr, 4);
899
900  outptr += 4;
901  for (i = 0; i < 6; ++i) {
902    __asm__ __volatile__(
903        "sw     $zero,    0(%[outptr])     \n\t"
904        "sw     $zero,   32(%[outptr])     \n\t"
905        "sw     $zero,   64(%[outptr])     \n\t"
906        "sw     $zero,   96(%[outptr])     \n\t"
907        "sw     $zero,  128(%[outptr])     \n\t"
908        "sw     $zero,  160(%[outptr])     \n\t"
909        "sw     $zero,  192(%[outptr])     \n\t"
910        "sw     $zero,  224(%[outptr])     \n\t"
911        "sw     $zero,  256(%[outptr])     \n\t"
912        "sw     $zero,  288(%[outptr])     \n\t"
913        "sw     $zero,  320(%[outptr])     \n\t"
914        "sw     $zero,  352(%[outptr])     \n\t"
915        "sw     $zero,  384(%[outptr])     \n\t"
916        "sw     $zero,  416(%[outptr])     \n\t"
917        "sw     $zero,  448(%[outptr])     \n\t"
918        "sw     $zero,  480(%[outptr])     \n\t"
919
920        :
921        : [outptr] "r"(outptr));
922
923    outptr += 2;
924  }
925
926  // Then transform columns
927  idct16_cols_add_blk_dspr2(out, dest, stride);
928}
929
/*
 * DC-only 16x16 inverse DCT with reconstruction.
 *
 * input  - coefficient block; only input[0] (the DC term) is read.
 * dest   - destination pixels, 16 rows of 16 bytes; each row is accessed with
 *          word loads/stores, so every row must be four-byte aligned.
 * stride - distance in bytes between consecutive rows of dest.
 *
 * The DC term is scaled with DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(),
 * rounded as (out + 32) >> 6, and the resulting offset a1 is added to every
 * pixel with unsigned byte saturation (results stay within 0..255).
 */
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = (out + 32) >> 6, using an arithmetic right shift */
  __asm__ __volatile__(
      "addi     %[out],     %[out],     32      \n\t"
      "sra      %[a1],      %[out],     6       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  /* replv.qb replicates only the least-significant byte of its source, so an
   * offset whose magnitude exceeds 255 would be silently truncated (e.g. -256
   * would become 0). Clamping to [-255, 255] is exact: a saturating add of 255
   * already drives every byte to 255 and a saturating subtract of 255 already
   * drives every byte to 0, which is what any larger magnitude would give. */
  if (a1 > 255) {
    a1 = 255;
  } else if (a1 < -255) {
    a1 = -255;
  }

  if (a1 < 0) {
    absa1 = -a1;

    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb   %[vector_a1],   %[absa1]    \n\t"

                         : [vector_a1] "=r"(vector_a1)
                         : [absa1] "r"(absa1));

    /* One iteration per row: 4 words = 16 pixels, saturating subtract. */
    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          /* addu: pointer arithmetic must not raise an overflow trap */
          "addu           %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* dest is read and written through a register operand */
          : "memory");
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"

                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    /* One iteration per row: 4 words = 16 pixels, saturating add. */
    for (r = 16; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          /* addu: pointer arithmetic must not raise an overflow trap */
          "addu           %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* dest is read and written through a register operand */
          : "memory");
    }
  }
}
1059
1060void iadst16_dspr2(const int16_t *input, int16_t *output) {
1061  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
1062
1063  int x0 = input[15];
1064  int x1 = input[0];
1065  int x2 = input[13];
1066  int x3 = input[2];
1067  int x4 = input[11];
1068  int x5 = input[4];
1069  int x6 = input[9];
1070  int x7 = input[6];
1071  int x8 = input[7];
1072  int x9 = input[8];
1073  int x10 = input[5];
1074  int x11 = input[10];
1075  int x12 = input[3];
1076  int x13 = input[12];
1077  int x14 = input[1];
1078  int x15 = input[14];
1079
1080  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1081        x13 | x14 | x15)) {
1082    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
1083        output[6] = output[7] = output[8] = output[9] = output[10] =
1084            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
1085    return;
1086  }
1087
1088  // stage 1
1089  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1090  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1091  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1092  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1093  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1094  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1095  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1096  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1097  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1098  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1099  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1100  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1101  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1102  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1103  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1104  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1105
1106  x0 = dct_const_round_shift(s0 + s8);
1107  x1 = dct_const_round_shift(s1 + s9);
1108  x2 = dct_const_round_shift(s2 + s10);
1109  x3 = dct_const_round_shift(s3 + s11);
1110  x4 = dct_const_round_shift(s4 + s12);
1111  x5 = dct_const_round_shift(s5 + s13);
1112  x6 = dct_const_round_shift(s6 + s14);
1113  x7 = dct_const_round_shift(s7 + s15);
1114  x8 = dct_const_round_shift(s0 - s8);
1115  x9 = dct_const_round_shift(s1 - s9);
1116  x10 = dct_const_round_shift(s2 - s10);
1117  x11 = dct_const_round_shift(s3 - s11);
1118  x12 = dct_const_round_shift(s4 - s12);
1119  x13 = dct_const_round_shift(s5 - s13);
1120  x14 = dct_const_round_shift(s6 - s14);
1121  x15 = dct_const_round_shift(s7 - s15);
1122
1123  // stage 2
1124  s0 = x0;
1125  s1 = x1;
1126  s2 = x2;
1127  s3 = x3;
1128  s4 = x4;
1129  s5 = x5;
1130  s6 = x6;
1131  s7 = x7;
1132  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1133  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1134  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1135  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1136  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1137  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1138  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1139  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1140
1141  x0 = s0 + s4;
1142  x1 = s1 + s5;
1143  x2 = s2 + s6;
1144  x3 = s3 + s7;
1145  x4 = s0 - s4;
1146  x5 = s1 - s5;
1147  x6 = s2 - s6;
1148  x7 = s3 - s7;
1149  x8 = dct_const_round_shift(s8 + s12);
1150  x9 = dct_const_round_shift(s9 + s13);
1151  x10 = dct_const_round_shift(s10 + s14);
1152  x11 = dct_const_round_shift(s11 + s15);
1153  x12 = dct_const_round_shift(s8 - s12);
1154  x13 = dct_const_round_shift(s9 - s13);
1155  x14 = dct_const_round_shift(s10 - s14);
1156  x15 = dct_const_round_shift(s11 - s15);
1157
1158  // stage 3
1159  s0 = x0;
1160  s1 = x1;
1161  s2 = x2;
1162  s3 = x3;
1163  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1164  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1165  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1166  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1167  s8 = x8;
1168  s9 = x9;
1169  s10 = x10;
1170  s11 = x11;
1171  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1172  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1173  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
1174  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1175
1176  x0 = s0 + s2;
1177  x1 = s1 + s3;
1178  x2 = s0 - s2;
1179  x3 = s1 - s3;
1180  x4 = dct_const_round_shift(s4 + s6);
1181  x5 = dct_const_round_shift(s5 + s7);
1182  x6 = dct_const_round_shift(s4 - s6);
1183  x7 = dct_const_round_shift(s5 - s7);
1184  x8 = s8 + s10;
1185  x9 = s9 + s11;
1186  x10 = s8 - s10;
1187  x11 = s9 - s11;
1188  x12 = dct_const_round_shift(s12 + s14);
1189  x13 = dct_const_round_shift(s13 + s15);
1190  x14 = dct_const_round_shift(s12 - s14);
1191  x15 = dct_const_round_shift(s13 - s15);
1192
1193  // stage 4
1194  s2 = (-cospi_16_64) * (x2 + x3);
1195  s3 = cospi_16_64 * (x2 - x3);
1196  s6 = cospi_16_64 * (x6 + x7);
1197  s7 = cospi_16_64 * (-x6 + x7);
1198  s10 = cospi_16_64 * (x10 + x11);
1199  s11 = cospi_16_64 * (-x10 + x11);
1200  s14 = (-cospi_16_64) * (x14 + x15);
1201  s15 = cospi_16_64 * (x14 - x15);
1202
1203  x2 = dct_const_round_shift(s2);
1204  x3 = dct_const_round_shift(s3);
1205  x6 = dct_const_round_shift(s6);
1206  x7 = dct_const_round_shift(s7);
1207  x10 = dct_const_round_shift(s10);
1208  x11 = dct_const_round_shift(s11);
1209  x14 = dct_const_round_shift(s14);
1210  x15 = dct_const_round_shift(s15);
1211
1212  output[0] = x0;
1213  output[1] = -x8;
1214  output[2] = x12;
1215  output[3] = -x4;
1216  output[4] = x6;
1217  output[5] = x14;
1218  output[6] = x10;
1219  output[7] = x2;
1220  output[8] = x3;
1221  output[9] = x11;
1222  output[10] = x15;
1223  output[11] = x7;
1224  output[12] = x5;
1225  output[13] = -x13;
1226  output[14] = x9;
1227  output[15] = -x1;
1228}
1229
1230#endif  // HAVE_DSPR2
1231