1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_config.h"
12#include "vpx_dsp/mips/inv_txfm_dspr2.h"
13#include "vpx_dsp/txfm_common.h"
14
15#if HAVE_DSPR2
16void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
17  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
18  int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
19  int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
20  int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
21  int step1_28, step1_29, step1_30, step1_31;
22  int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
23  int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
24  int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
25  int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
26  int step2_28, step2_29, step2_30, step2_31;
27  int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
28  int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
29  int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
30  int step3_29, step3_30, step3_31;
31  int temp0, temp1, temp2, temp3;
32  int load1, load2, load3, load4;
33  int result1, result2;
34  int i;
35  uint8_t *dest_pix, *dest_pix1;
36  const int const_2_power_13 = 8192;
37  uint8_t *cm = vpx_ff_cropTbl;
38
39  /* prefetch vpx_ff_cropTbl */
40  prefetch_load(vpx_ff_cropTbl);
41  prefetch_load(vpx_ff_cropTbl + 32);
42  prefetch_load(vpx_ff_cropTbl + 64);
43  prefetch_load(vpx_ff_cropTbl + 96);
44  prefetch_load(vpx_ff_cropTbl + 128);
45  prefetch_load(vpx_ff_cropTbl + 160);
46  prefetch_load(vpx_ff_cropTbl + 192);
47  prefetch_load(vpx_ff_cropTbl + 224);
48
49  for (i = 0; i < 32; ++i) {
50    dest_pix = dest + i;
51    dest_pix1 = dest + i + 31 * stride;
52
53    __asm__ __volatile__(
54        "lh       %[load1],             2(%[input])                     \n\t"
55        "lh       %[load2],             62(%[input])                    \n\t"
56        "lh       %[load3],             34(%[input])                    \n\t"
57        "lh       %[load4],             30(%[input])                    \n\t"
58
59        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
60        "mthi     $zero,                $ac1                            \n\t"
61        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
62        "mthi     $zero,                $ac3                            \n\t"
63
64        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
65        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
66        "extp     %[temp0],             $ac1,           31              \n\t"
67
68        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
69        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
70        "extp     %[temp3],             $ac3,           31              \n\t"
71
72        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
73        "mthi     $zero,                $ac1                            \n\t"
74        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
75        "mthi     $zero,                $ac2                            \n\t"
76
77        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
78        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
79        "extp     %[temp1],             $ac2,           31              \n\t"
80
81        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
82        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
83        "extp     %[temp2],             $ac1,           31              \n\t"
84
85        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
86        "mthi     $zero,                $ac1                            \n\t"
87        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
88        "mthi     $zero,                $ac3                            \n\t"
89
90        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
91        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
92
93        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
94        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
95        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
96        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
97
98        "extp     %[step1_17],          $ac1,           31              \n\t"
99        "extp     %[step1_30],          $ac3,           31              \n\t"
100        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
101        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
102
103        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
104          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
105          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
106          [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
107          [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
108        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
109          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
110          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
111          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
112
113    __asm__ __volatile__(
114        "lh       %[load1],             18(%[input])                    \n\t"
115        "lh       %[load2],             46(%[input])                    \n\t"
116        "lh       %[load3],             50(%[input])                    \n\t"
117        "lh       %[load4],             14(%[input])                    \n\t"
118
119        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
120        "mthi     $zero,                $ac1                            \n\t"
121        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
122        "mthi     $zero,                $ac3                            \n\t"
123
124        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
125        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
126        "extp     %[temp0],             $ac1,           31              \n\t"
127
128        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
129        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
130        "extp     %[temp3],             $ac3,           31              \n\t"
131
132        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
133        "mthi     $zero,                $ac1                            \n\t"
134        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
135        "mthi     $zero,                $ac2                            \n\t"
136
137        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
138        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
139        "extp     %[temp1],             $ac2,           31              \n\t"
140
141        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
142        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
143        "extp     %[temp2],             $ac1,           31              \n\t"
144
145        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
146        "mthi     $zero,                $ac1                            \n\t"
147        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
148        "mthi     $zero,                $ac3                            \n\t"
149
150        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
151        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
152
153        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
154        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
155        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
156        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
157
158        "extp     %[step1_18],          $ac1,           31              \n\t"
159        "extp     %[step1_29],          $ac3,           31              \n\t"
160        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
161        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
162
163        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
164          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
165          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
166          [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
167          [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
168        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
169          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
170          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
171          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
172
173    __asm__ __volatile__(
174        "lh       %[load1],             10(%[input])                    \n\t"
175        "lh       %[load2],             54(%[input])                    \n\t"
176        "lh       %[load3],             42(%[input])                    \n\t"
177        "lh       %[load4],             22(%[input])                    \n\t"
178
179        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
180        "mthi     $zero,                $ac1                            \n\t"
181        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
182        "mthi     $zero,                $ac3                            \n\t"
183
184        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
185        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
186        "extp     %[temp0],             $ac1,           31              \n\t"
187
188        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
189        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
190        "extp     %[temp3],             $ac3,           31              \n\t"
191
192        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
193        "mthi     $zero,                $ac1                            \n\t"
194        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
195        "mthi     $zero,                $ac2                            \n\t"
196
197        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
198        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
199        "extp     %[temp1],             $ac2,           31              \n\t"
200
201        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
202        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
203        "extp     %[temp2],             $ac1,           31              \n\t"
204
205        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
206        "mthi     $zero,                $ac1                            \n\t"
207        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
208        "mthi     $zero,                $ac3                            \n\t"
209
210        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
211        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
212
213        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
214        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
215        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
216        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
217
218        "extp     %[step1_21],          $ac1,           31              \n\t"
219        "extp     %[step1_26],          $ac3,           31              \n\t"
220        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
221        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
222
223        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
224          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
225          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
226          [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
227          [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
228        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
229          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
230          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
231          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
232
233    __asm__ __volatile__(
234        "lh       %[load1],             26(%[input])                    \n\t"
235        "lh       %[load2],             38(%[input])                    \n\t"
236        "lh       %[load3],             58(%[input])                    \n\t"
237        "lh       %[load4],              6(%[input])                    \n\t"
238
239        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
240        "mthi     $zero,                $ac1                            \n\t"
241        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
242        "mthi     $zero,                $ac3                            \n\t"
243
244        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
245        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
246        "extp     %[temp0],             $ac1,           31              \n\t"
247        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
248        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
249        "extp     %[temp3],             $ac3,           31              \n\t"
250
251        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
252        "mthi     $zero,                $ac1                            \n\t"
253        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
254        "mthi     $zero,                $ac2                            \n\t"
255
256        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
257        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
258        "extp     %[temp1],             $ac2,           31              \n\t"
259        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
260        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
261        "extp     %[temp2],             $ac1,           31              \n\t"
262
263        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
264        "mthi     $zero,                $ac1                            \n\t"
265        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
266        "mthi     $zero,                $ac3                            \n\t"
267
268        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
269        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
270        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
271        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
272        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
273        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
274        "extp     %[step1_22],          $ac1,           31              \n\t"
275        "extp     %[step1_25],          $ac3,           31              \n\t"
276        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
277        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
278
279        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
280          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
281          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
282          [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
283          [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
284        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
285          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
286          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
287          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
288
289    __asm__ __volatile__(
290        "lh       %[load1],              4(%[input])                    \n\t"
291        "lh       %[load2],             60(%[input])                    \n\t"
292        "lh       %[load3],             36(%[input])                    \n\t"
293        "lh       %[load4],             28(%[input])                    \n\t"
294
295        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
296        "mthi     $zero,                $ac1                            \n\t"
297        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
298        "mthi     $zero,                $ac3                            \n\t"
299
300        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
301        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
302        "extp     %[temp0],             $ac1,           31              \n\t"
303        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
304        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
305        "extp     %[temp3],             $ac3,           31              \n\t"
306
307        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
308        "mthi     $zero,                $ac1                            \n\t"
309        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
310        "mthi     $zero,                $ac2                            \n\t"
311
312        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
313        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
314        "extp     %[temp1],             $ac2,           31              \n\t"
315        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
316        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
317        "extp     %[temp2],             $ac1,           31              \n\t"
318
319        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
320        "mthi     $zero,                $ac1                            \n\t"
321        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
322        "mthi     $zero,                $ac3                            \n\t"
323
324        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
325        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
326        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
327        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
328        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
329        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
330        "extp     %[step2_9],           $ac1,           31              \n\t"
331        "extp     %[step2_14],          $ac3,           31              \n\t"
332        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
333        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
334
335        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
336          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
337          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
338          [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
339          [step2_15] "=&r"(step2_15)
340        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
341          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
342          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
343          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
344
345    __asm__ __volatile__(
346        "lh       %[load1],             20(%[input])                    \n\t"
347        "lh       %[load2],             44(%[input])                    \n\t"
348        "lh       %[load3],             52(%[input])                    \n\t"
349        "lh       %[load4],             12(%[input])                    \n\t"
350
351        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
352        "mthi     $zero,                $ac1                            \n\t"
353        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
354        "mthi     $zero,                $ac3                            \n\t"
355
356        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
357        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
358        "extp     %[temp0],             $ac1,           31              \n\t"
359        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
360        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
361        "extp     %[temp3],             $ac3,           31              \n\t"
362
363        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
364        "mthi     $zero,                $ac1                            \n\t"
365        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
366        "mthi     $zero,                $ac2                            \n\t"
367
368        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
369        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
370        "extp     %[temp1],             $ac2,           31              \n\t"
371        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
372        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
373        "extp     %[temp2],             $ac1,           31              \n\t"
374
375        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
376        "mthi     $zero,                $ac1                            \n\t"
377        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
378        "mthi     $zero,                $ac3                            \n\t"
379
380        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
381        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
382        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
383        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
384        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
385        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
386        "extp     %[step2_10],          $ac1,           31              \n\t"
387        "extp     %[step2_13],          $ac3,           31              \n\t"
388        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
389        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
390
391        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
392          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
393          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
394          [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
395          [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
396        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
397          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
398          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
399          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
400
401    __asm__ __volatile__(
402        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
403        "mthi     $zero,                $ac0                            \n\t"
404        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
405        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
406        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
407        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
408        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
409        "mthi     $zero,                $ac1                            \n\t"
410        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
411        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
412        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
413        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
414        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
415        "mthi     $zero,                $ac2                            \n\t"
416        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
417        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
418        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
419        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
420        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
421        "mthi     $zero,                $ac3                            \n\t"
422        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
423        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
424        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
425        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
426
427        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
428        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
429        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
430        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
431        "extp     %[step3_10],          $ac0,           31              \n\t"
432        "extp     %[step3_13],          $ac1,           31              \n\t"
433        "extp     %[step3_11],          $ac2,           31              \n\t"
434        "extp     %[step3_12],          $ac3,           31              \n\t"
435
436        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
437          [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
438          [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
439          [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
440          [step3_15] "=&r"(step3_15)
441        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
442          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
443          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
444          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
445          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
446
447    __asm__ __volatile__(
448        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
449        "mthi     $zero,                $ac0                            \n\t"
450        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
451        "mthi     $zero,                $ac1                            \n\t"
452        "sub      %[temp0],             %[step1_17],    %[step1_18]     \n\t"
453        "sub      %[temp1],             %[step1_30],    %[step1_29]     \n\t"
454        "add      %[step3_17],          %[step1_17],    %[step1_18]     \n\t"
455        "add      %[step3_30],          %[step1_30],    %[step1_29]     \n\t"
456
457        "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
458        "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
459        "extp     %[step3_18],          $ac0,           31              \n\t"
460        "madd     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
461        "madd     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
462        "extp     %[step3_29],          $ac1,           31              \n\t"
463
464        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
465          [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
466          [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
467        : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
468          [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
469          [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
470          [cospi_8_64] "r"(cospi_8_64));
471
472    __asm__ __volatile__(
473        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
474        "mthi     $zero,                $ac0                            \n\t"
475        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
476        "mthi     $zero,                $ac1                            \n\t"
477        "sub      %[temp0],             %[step1_16],    %[step1_19]     \n\t"
478        "sub      %[temp1],             %[step1_31],    %[step1_28]     \n\t"
479        "add      %[step3_16],          %[step1_16],    %[step1_19]     \n\t"
480        "add      %[step3_31],          %[step1_31],    %[step1_28]     \n\t"
481
482        "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
483        "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
484        "extp     %[step3_19],          $ac0,           31              \n\t"
485        "madd     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
486        "madd     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
487        "extp     %[step3_28],          $ac1,           31              \n\t"
488
489        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
490          [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
491          [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
492        : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
493          [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
494          [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
495          [cospi_8_64] "r"(cospi_8_64));
496
497    __asm__ __volatile__(
498        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
499        "mthi     $zero,                $ac0                            \n\t"
500        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
501        "mthi     $zero,                $ac1                            \n\t"
502        "sub      %[temp0],             %[step1_23],    %[step1_20]     \n\t"
503        "sub      %[temp1],             %[step1_24],    %[step1_27]     \n\t"
504        "add      %[step3_23],          %[step1_23],    %[step1_20]     \n\t"
505        "add      %[step3_24],          %[step1_24],    %[step1_27]     \n\t"
506
507        "msub     $ac0,                 %[temp0],       %[cospi_8_64]   \n\t"
508        "madd     $ac0,                 %[temp1],       %[cospi_24_64]  \n\t"
509        "extp     %[step3_27],          $ac0,           31              \n\t"
510        "msub     $ac1,                 %[temp0],       %[cospi_24_64]  \n\t"
511        "msub     $ac1,                 %[temp1],       %[cospi_8_64]   \n\t"
512        "extp     %[step3_20],          $ac1,           31              \n\t"
513
514        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
515          [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
516          [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
517        : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
518          [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
519          [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
520          [cospi_8_64] "r"(cospi_8_64));
521
522    __asm__ __volatile__(
523        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
524        "mthi     $zero,                $ac0                            \n\t"
525        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
526        "mthi     $zero,                $ac1                            \n\t"
527        "sub      %[temp0],             %[step1_22],    %[step1_21]     \n\t"
528        "sub      %[temp1],             %[step1_25],    %[step1_26]     \n\t"
529        "add      %[step3_22],          %[step1_22],    %[step1_21]     \n\t"
530        "add      %[step3_25],          %[step1_25],    %[step1_26]     \n\t"
531
532        "msub     $ac0,                 %[temp0],       %[cospi_24_64]  \n\t"
533        "msub     $ac0,                 %[temp1],       %[cospi_8_64]   \n\t"
534        "extp     %[step3_21],          $ac0,           31              \n\t"
535        "msub     $ac1,                 %[temp0],       %[cospi_8_64]   \n\t"
536        "madd     $ac1,                 %[temp1],       %[cospi_24_64]  \n\t"
537        "extp     %[step3_26],          $ac1,           31              \n\t"
538
539        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
540          [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
541          [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
542        : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
543          [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
544          [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
545          [cospi_8_64] "r"(cospi_8_64));
546
547    __asm__ __volatile__(
548        "add      %[step2_16],          %[step3_16],    %[step3_23]     \n\t"
549        "add      %[step2_17],          %[step3_17],    %[step3_22]     \n\t"
550        "add      %[step2_18],          %[step3_18],    %[step3_21]     \n\t"
551        "add      %[step2_19],          %[step3_19],    %[step3_20]     \n\t"
552        "sub      %[step2_20],          %[step3_19],    %[step3_20]     \n\t"
553        "sub      %[step2_21],          %[step3_18],    %[step3_21]     \n\t"
554        "sub      %[step2_22],          %[step3_17],    %[step3_22]     \n\t"
555        "sub      %[step2_23],          %[step3_16],    %[step3_23]     \n\t"
556
557        : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
558          [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
559          [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
560          [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
561        : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
562          [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
563          [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
564          [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
565
566    __asm__ __volatile__(
567        "sub      %[step2_24],          %[step3_31],    %[step3_24]     \n\t"
568        "sub      %[step2_25],          %[step3_30],    %[step3_25]     \n\t"
569        "sub      %[step2_26],          %[step3_29],    %[step3_26]     \n\t"
570        "sub      %[step2_27],          %[step3_28],    %[step3_27]     \n\t"
571        "add      %[step2_28],          %[step3_28],    %[step3_27]     \n\t"
572        "add      %[step2_29],          %[step3_29],    %[step3_26]     \n\t"
573        "add      %[step2_30],          %[step3_30],    %[step3_25]     \n\t"
574        "add      %[step2_31],          %[step3_31],    %[step3_24]     \n\t"
575
576        : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
577          [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
578          [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
579          [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
580        : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
581          [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
582          [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
583          [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
584
585    __asm__ __volatile__(
586        "lh       %[load1],             0(%[input])                     \n\t"
587        "lh       %[load2],             32(%[input])                    \n\t"
588        "lh       %[load3],             16(%[input])                    \n\t"
589        "lh       %[load4],             48(%[input])                    \n\t"
590
591        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
592        "mthi     $zero,                $ac1                            \n\t"
593        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
594        "mthi     $zero,                $ac2                            \n\t"
595        "add      %[result1],           %[load1],       %[load2]        \n\t"
596        "sub      %[result2],           %[load1],       %[load2]        \n\t"
597        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
598        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
599        "extp     %[temp0],             $ac1,           31              \n\t"
600        "extp     %[temp1],             $ac2,           31              \n\t"
601
602        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
603        "mthi     $zero,                $ac3                            \n\t"
604        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
605        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
606        "extp     %[temp2],             $ac3,           31              \n\t"
607        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
608        "mthi     $zero,                $ac1                            \n\t"
609        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
610        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
611        "extp     %[temp3],             $ac1,           31              \n\t"
612        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
613        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
614        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
615        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
616
617        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
618          [load4] "=&r"(load4), [result1] "=&r"(result1),
619          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
620          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
621          [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
622          [step1_3] "=&r"(step1_3)
623        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
624          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
625          [cospi_16_64] "r"(cospi_16_64));
626
627    __asm__ __volatile__(
628        "lh       %[load1],             8(%[input])                     \n\t"
629        "lh       %[load2],             56(%[input])                    \n\t"
630        "lh       %[load3],             40(%[input])                    \n\t"
631        "lh       %[load4],             24(%[input])                    \n\t"
632
633        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
634        "mthi     $zero,                $ac1                            \n\t"
635        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
636        "mthi     $zero,                $ac3                            \n\t"
637
638        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
639        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
640        "extp     %[temp0],             $ac1,           31              \n\t"
641        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
642        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
643        "extp     %[temp3],             $ac3,           31              \n\t"
644
645        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
646        "mthi     $zero,                $ac1                            \n\t"
647        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
648        "mthi     $zero,                $ac2                            \n\t"
649
650        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
651        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
652        "extp     %[temp1],             $ac2,           31              \n\t"
653        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
654        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
655        "extp     %[temp2],             $ac1,           31              \n\t"
656
657        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
658        "mthi     $zero,                $ac1                            \n\t"
659        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
660        "mthi     $zero,                $ac3                            \n\t"
661
662        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
663        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
664        "add      %[load1],             %[load1],       %[temp1]        \n\t"
665        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
666        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
667        "add      %[load2],             %[load2],       %[temp3]        \n\t"
668        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
669        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
670
671        "extp     %[step1_5],           $ac1,           31              \n\t"
672        "extp     %[step1_6],           $ac3,           31              \n\t"
673        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
674        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
675
676        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
677          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
678          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
679          [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
680          [step1_7] "=&r"(step1_7)
681        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
682          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
683          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
684          [cospi_16_64] "r"(cospi_16_64));
685
686    __asm__ __volatile__(
687        "add      %[step2_0],          %[step1_0],    %[step1_7]     \n\t"
688        "add      %[step2_1],          %[step1_1],    %[step1_6]     \n\t"
689        "add      %[step2_2],          %[step1_2],    %[step1_5]     \n\t"
690        "add      %[step2_3],          %[step1_3],    %[step1_4]     \n\t"
691        "sub      %[step2_4],          %[step1_3],    %[step1_4]     \n\t"
692        "sub      %[step2_5],          %[step1_2],    %[step1_5]     \n\t"
693        "sub      %[step2_6],          %[step1_1],    %[step1_6]     \n\t"
694        "sub      %[step2_7],          %[step1_0],    %[step1_7]     \n\t"
695
696        : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
697          [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
698          [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
699          [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
700        : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
701          [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
702          [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
703          [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
704
705    // stage 7
706    __asm__ __volatile__(
707        "add      %[step1_0],          %[step2_0],    %[step3_15]     \n\t"
708        "add      %[step1_1],          %[step2_1],    %[step3_14]     \n\t"
709        "add      %[step1_2],          %[step2_2],    %[step3_13]     \n\t"
710        "add      %[step1_3],          %[step2_3],    %[step3_12]     \n\t"
711        "sub      %[step1_12],         %[step2_3],    %[step3_12]     \n\t"
712        "sub      %[step1_13],         %[step2_2],    %[step3_13]     \n\t"
713        "sub      %[step1_14],         %[step2_1],    %[step3_14]     \n\t"
714        "sub      %[step1_15],         %[step2_0],    %[step3_15]     \n\t"
715
716        : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
717          [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
718          [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
719          [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
720        : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
721          [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
722          [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
723          [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
724
725    __asm__ __volatile__(
726        "add      %[step1_4],          %[step2_4],    %[step3_11]     \n\t"
727        "add      %[step1_5],          %[step2_5],    %[step3_10]     \n\t"
728        "add      %[step1_6],          %[step2_6],    %[step3_9]      \n\t"
729        "add      %[step1_7],          %[step2_7],    %[step3_8]      \n\t"
730        "sub      %[step1_8],          %[step2_7],    %[step3_8]      \n\t"
731        "sub      %[step1_9],          %[step2_6],    %[step3_9]      \n\t"
732        "sub      %[step1_10],         %[step2_5],    %[step3_10]     \n\t"
733        "sub      %[step1_11],         %[step2_4],    %[step3_11]     \n\t"
734
735        : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
736          [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
737          [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
738          [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
739        : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
740          [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
741          [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
742          [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
743
744    __asm__ __volatile__(
745        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
746        "add      %[temp1],             %[step2_27],    %[step2_20]     \n\t"
747        "sub      %[temp2],             %[step2_26],    %[step2_21]     \n\t"
748        "add      %[temp3],             %[step2_26],    %[step2_21]     \n\t"
749
750        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
751        "mthi     $zero,                $ac0                            \n\t"
752        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
753        "mthi     $zero,                $ac1                            \n\t"
754        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
755        "mthi     $zero,                $ac2                            \n\t"
756        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
757        "mthi     $zero,                $ac3                            \n\t"
758
759        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
760        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
761        "madd     $ac2,                 %[temp2],       %[cospi_16_64]  \n\t"
762        "madd     $ac3,                 %[temp3],       %[cospi_16_64]  \n\t"
763
764        "extp     %[step1_20],          $ac0,           31              \n\t"
765        "extp     %[step1_27],          $ac1,           31              \n\t"
766        "extp     %[step1_21],          $ac2,           31              \n\t"
767        "extp     %[step1_26],          $ac3,           31              \n\t"
768
769        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
770          [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
771          [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
772          [step1_26] "=&r"(step1_26)
773        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
774          [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
775          [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
776
777    __asm__ __volatile__(
778        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
779        "add      %[temp1],             %[step2_25],    %[step2_22]     \n\t"
780        "sub      %[temp2],             %[step2_24],    %[step2_23]     \n\t"
781        "add      %[temp3],             %[step2_24],    %[step2_23]     \n\t"
782
783        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
784        "mthi     $zero,                $ac0                            \n\t"
785        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
786        "mthi     $zero,                $ac1                            \n\t"
787        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
788        "mthi     $zero,                $ac2                            \n\t"
789        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
790        "mthi     $zero,                $ac3                            \n\t"
791
792        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
793        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
794        "madd     $ac2,                 %[temp2],       %[cospi_16_64]  \n\t"
795        "madd     $ac3,                 %[temp3],       %[cospi_16_64]  \n\t"
796
797        "extp     %[step1_22],          $ac0,           31              \n\t"
798        "extp     %[step1_25],          $ac1,           31              \n\t"
799        "extp     %[step1_23],          $ac2,           31              \n\t"
800        "extp     %[step1_24],          $ac3,           31              \n\t"
801
802        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
803          [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
804          [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
805          [step1_24] "=&r"(step1_24)
806        : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
807          [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
808          [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
809
810    __asm__ __volatile__(
811        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
812        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"
813        "addi     %[temp0],         %[temp0],           32              \n\t"
814        "sra      %[temp0],         %[temp0],           6               \n\t"
815        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
816        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
817        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"
818        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
819        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
820        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
821        "addi     %[temp1],         %[temp1],           32              \n\t"
822        "sra      %[temp1],         %[temp1],           6               \n\t"
823        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
824        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
825        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
826        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
827
828        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
829        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"
830        "addi     %[temp0],         %[temp0],           32              \n\t"
831        "sra      %[temp0],         %[temp0],           6               \n\t"
832        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
833        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
834        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"
835        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
836        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
837        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
838        "addi     %[temp1],         %[temp1],           32              \n\t"
839        "sra      %[temp1],         %[temp1],           6               \n\t"
840        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
841        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
842        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
843        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
844
845        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
846          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
847        : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
848          [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
849          [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
850          [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
851          [step2_31] "r"(step2_31));
852
853    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
854    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
855    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
856    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
857
858    __asm__ __volatile__(
859        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
860        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
861        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
862        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
863        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
864        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
865        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
866        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
867        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
868        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
869
870        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
871        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
872        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
873        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
874        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
875        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
876        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
877        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
878        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
879        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
880
881        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
882          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
883        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
884          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
885          [step3_15] "r"(step3_15));
886
887    __asm__ __volatile__(
888        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
889        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"
890        "addi     %[temp0],         %[temp0],           32              \n\t"
891        "sra      %[temp0],         %[temp0],           6               \n\t"
892        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
893        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
894        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"
895        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
896        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
897        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
898        "addi     %[temp1],         %[temp1],           32              \n\t"
899        "sra      %[temp1],         %[temp1],           6               \n\t"
900        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
901        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
902        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
903        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
904
905        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
906        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"
907        "addi     %[temp0],         %[temp0],           32              \n\t"
908        "sra      %[temp0],         %[temp0],           6               \n\t"
909        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
910        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
911        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"
912        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
913        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
914        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
915        "addi     %[temp1],         %[temp1],           32              \n\t"
916        "sra      %[temp1],         %[temp1],           6               \n\t"
917        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
918        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
919        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
920        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
921
922        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
923          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
924        : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4),
925          [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
926          [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
927          [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
928          [step1_27] "r"(step1_27));
929
930    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
931    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
932    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
933    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
934
935    __asm__ __volatile__(
936        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
937        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
938        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
939        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
940        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
941        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
942        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
943        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
944        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
945        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
946
947        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
948        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
949        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
950        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
951        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
952        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
953        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
954        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
955        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
956        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
957
958        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
959          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
960        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
961          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
962          [step3_15] "r"(step3_15));
963
964    __asm__ __volatile__(
965        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
966        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"
967        "addi     %[temp0],         %[temp0],           32              \n\t"
968        "sra      %[temp0],         %[temp0],           6               \n\t"
969        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
970        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
971        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"
972        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
973        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
974        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
975        "addi     %[temp1],         %[temp1],           32              \n\t"
976        "sra      %[temp1],         %[temp1],           6               \n\t"
977        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
978        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
979        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
980        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
981
982        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
983        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"
984        "addi     %[temp0],         %[temp0],           32              \n\t"
985        "sra      %[temp0],         %[temp0],           6               \n\t"
986        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
987        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
988        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"
989        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
990        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
991        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
992        "addi     %[temp1],         %[temp1],           32              \n\t"
993        "sra      %[temp1],         %[temp1],           6               \n\t"
994        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
995        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
996        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
997        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
998
999        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1000          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
1001        : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8),
1002          [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
1003          [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
1004          [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
1005          [step1_23] "r"(step1_23));
1006
1007    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
1008    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
1009    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
1010    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
1011
1012    __asm__ __volatile__(
1013        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
1014        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
1015        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
1016        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
1017        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1018        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
1019        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
1020        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
1021        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
1022        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1023
1024        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
1025        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
1026        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
1027        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
1028        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1029        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
1030        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
1031        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
1032        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
1033        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1034
1035        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1036          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
1037        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
1038          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
1039          [step3_15] "r"(step3_15));
1040
1041    __asm__ __volatile__(
1042        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
1043        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"
1044        "addi     %[temp0],         %[temp0],           32              \n\t"
1045        "sra      %[temp0],         %[temp0],           6               \n\t"
1046        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
1047        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
1048        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"
1049        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
1050        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
1051        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
1052        "addi     %[temp1],         %[temp1],           32              \n\t"
1053        "sra      %[temp1],         %[temp1],           6               \n\t"
1054        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
1055        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
1056        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
1057        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
1058
1059        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
1060        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"
1061        "addi     %[temp0],         %[temp0],           32              \n\t"
1062        "sra      %[temp0],         %[temp0],           6               \n\t"
1063        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
1064        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
1065        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"
1066        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
1067        "addu     %[dest_pix],      %[dest_pix],        %[stride]       \n\t"
1068        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
1069        "addi     %[temp1],         %[temp1],           32              \n\t"
1070        "sra      %[temp1],         %[temp1],           6               \n\t"
1071        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
1072        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
1073        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
1074
1075        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1076          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
1077        : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12),
1078          [step1_13] "r"(step1_13), [step1_14] "r"(step1_14),
1079          [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
1080          [step2_17] "r"(step2_17), [step2_18] "r"(step2_18),
1081          [step2_19] "r"(step2_19));
1082
1083    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
1084    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
1085    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
1086    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
1087
1088    __asm__ __volatile__(
1089        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
1090        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
1091        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
1092        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
1093        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1094        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
1095        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
1096        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
1097        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
1098        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1099
1100        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
1101        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
1102        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
1103        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
1104        "subu     %[dest_pix1],     %[dest_pix1],       %[stride]       \n\t"
1105        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
1106        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
1107        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
1108        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
1109
1110        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1111          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
1112        : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
1113          [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
1114          [step3_15] "r"(step3_15));
1115
1116    input += 32;
1117  }
1118}
1119#endif  // #if HAVE_DSPR2
1120