1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_common.h"
17#include "vp9/common/vp9_blockd.h"
18#include "vp9/common/vp9_idct.h"
19#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
20
21#if HAVE_DSPR2
22static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
23                             uint32_t no_rows) {
24  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
25  const int const_2_power_13 = 8192;
26  int Temp0, Temp1, Temp2, Temp3, Temp4;
27  int i;
28
29  for (i = no_rows; i--; ) {
30    __asm__ __volatile__ (
31        /*
32          temp_1 = (input[0] + input[4]) * cospi_16_64;
33          step2_0 = dct_const_round_shift(temp_1);
34
35          temp_2 = (input[0] - input[4]) * cospi_16_64;
36          step2_1 = dct_const_round_shift(temp_2);
37        */
38        "lh       %[Temp0],             0(%[input])                     \n\t"
39        "lh       %[Temp1],             8(%[input])                     \n\t"
40        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
41        "mthi     $zero,                $ac0                            \n\t"
42        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
43        "mthi     $zero,                $ac1                            \n\t"
44        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
45        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
46        "extp     %[Temp4],             $ac0,           31              \n\t"
47
48        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
49        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
50        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
51        "mthi     $zero,                $ac0                            \n\t"
52        "extp     %[Temp2],             $ac1,           31              \n\t"
53
54        /*
55          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
56          step2_2 = dct_const_round_shift(temp_1);
57        */
58        "lh       %[Temp0],             4(%[input])                     \n\t"
59        "lh       %[Temp1],             12(%[input])                    \n\t"
60        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
61        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
62        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
63        "mthi     $zero,                $ac1                            \n\t"
64        "extp     %[Temp3],             $ac0,           31              \n\t"
65
66        /*
67          step1_1 = step2_1 + step2_2;
68          step1_2 = step2_1 - step2_2;
69        */
70        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
71        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
72
73        /*
74          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
75          step2_3 = dct_const_round_shift(temp_2);
76        */
77        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
78        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
79        "extp     %[Temp1],             $ac1,           31              \n\t"
80
81        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
82        "mthi     $zero,                $ac0                            \n\t"
83
84        /*
85          step1_0 = step2_0 + step2_3;
86          step1_3 = step2_0 - step2_3;
87        */
88        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
89        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
90
91        /*
92          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
93          step1_4 = dct_const_round_shift(temp_1);
94        */
95        "lh       %[Temp0],             2(%[input])                     \n\t"
96        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
97        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
98        "mthi     $zero,                $ac1                            \n\t"
99        "lh       %[Temp1],             14(%[input])                    \n\t"
100        "lh       %[Temp0],             2(%[input])                     \n\t"
101        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
102        "extp     %[step1_4],           $ac0,           31              \n\t"
103
104        /*
105          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
106          step1_7 = dct_const_round_shift(temp_2);
107        */
108        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
109        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
110        "extp     %[step1_7],           $ac1,           31              \n\t"
111
112        /*
113          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
114          step1_5 = dct_const_round_shift(temp_1);
115        */
116        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
117        "mthi     $zero,                $ac0                            \n\t"
118        "lh       %[Temp0],             10(%[input])                    \n\t"
119        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
120        "lh       %[Temp1],             6(%[input])                     \n\t"
121        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
122        "extp     %[step1_5],           $ac0,           31              \n\t"
123
124        /*
125          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
126          step1_6 = dct_const_round_shift(temp_2);
127        */
128        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
129        "mthi     $zero,                $ac1                            \n\t"
130        "lh       %[Temp0],             10(%[input])                    \n\t"
131        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
132        "lh       %[Temp1],             6(%[input])                     \n\t"
133        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
134        "extp     %[step1_6],           $ac1,           31              \n\t"
135
136        /*
137          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
138          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
139        */
140        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
141        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
142        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
143        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
144        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
145        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
146
147        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
148        "mthi     $zero,                $ac0                            \n\t"
149        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
150        "mthi     $zero,                $ac1                            \n\t"
151
152        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
153        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
154
155        /*
156          step1_4 = step1_4 + step1_5;
157          step1_7 = step1_6 + step1_7;
158        */
159        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
160        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
161
162        "extp     %[step1_5],           $ac0,           31              \n\t"
163        "extp     %[step1_6],           $ac1,           31              \n\t"
164
165        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
166        "sh       %[Temp0],             0(%[output])                    \n\t"
167        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
168        "sh       %[Temp1],             16(%[output])                   \n\t"
169        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
170        "sh       %[Temp0],             32(%[output])                   \n\t"
171        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
172        "sh       %[Temp1],             48(%[output])                   \n\t"
173
174        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
175        "sh       %[Temp0],             64(%[output])                   \n\t"
176        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
177        "sh       %[Temp1],             80(%[output])                   \n\t"
178        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
179        "sh       %[Temp0],             96(%[output])                   \n\t"
180        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
181        "sh       %[Temp1],             112(%[output])                  \n\t"
182
183        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
184          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
185          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
186          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
187          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
188          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
189          [Temp4] "=&r" (Temp4)
190        : [const_2_power_13] "r" (const_2_power_13),
191          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
192          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
193          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
194          [cospi_24_64] "r" (cospi_24_64),
195          [output] "r" (output), [input] "r" (input)
196    );
197
198    input += 8;
199    output += 1;
200  }
201}
202
203static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
204                                        int dest_stride) {
205  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
206  int Temp0, Temp1, Temp2, Temp3;
207  int i;
208  const int const_2_power_13 = 8192;
209  uint8_t *dest_pix;
210  uint8_t *cm = vp9_ff_cropTbl;
211
212  /* prefetch vp9_ff_cropTbl */
213  vp9_prefetch_load(vp9_ff_cropTbl);
214  vp9_prefetch_load(vp9_ff_cropTbl +  32);
215  vp9_prefetch_load(vp9_ff_cropTbl +  64);
216  vp9_prefetch_load(vp9_ff_cropTbl +  96);
217  vp9_prefetch_load(vp9_ff_cropTbl + 128);
218  vp9_prefetch_load(vp9_ff_cropTbl + 160);
219  vp9_prefetch_load(vp9_ff_cropTbl + 192);
220  vp9_prefetch_load(vp9_ff_cropTbl + 224);
221
222  for (i = 0; i < 8; ++i) {
223      dest_pix = (dest + i);
224
225    __asm__ __volatile__ (
226        /*
227          temp_1 = (input[0] + input[4]) * cospi_16_64;
228          step2_0 = dct_const_round_shift(temp_1);
229
230          temp_2 = (input[0] - input[4]) * cospi_16_64;
231          step2_1 = dct_const_round_shift(temp_2);
232        */
233        "lh       %[Temp0],             0(%[input])                     \n\t"
234        "lh       %[Temp1],             8(%[input])                     \n\t"
235        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
236        "mthi     $zero,                $ac0                            \n\t"
237        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
238        "mthi     $zero,                $ac1                            \n\t"
239        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
240        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
241        "extp     %[step1_6],           $ac0,           31              \n\t"
242
243        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
244        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
245        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
246        "mthi     $zero,                $ac0                            \n\t"
247        "extp     %[Temp2],             $ac1,           31              \n\t"
248
249        /*
250          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
251          step2_2 = dct_const_round_shift(temp_1);
252        */
253        "lh       %[Temp0],             4(%[input])                     \n\t"
254        "lh       %[Temp1],             12(%[input])                    \n\t"
255        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
256        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
257        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
258        "mthi     $zero,                $ac1                            \n\t"
259        "extp     %[Temp3],             $ac0,           31              \n\t"
260
261        /*
262          step1_1 = step2_1 + step2_2;
263          step1_2 = step2_1 - step2_2;
264        */
265        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
266        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
267
268        /*
269          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
270          step2_3 = dct_const_round_shift(temp_2);
271        */
272        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
273        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
274        "extp     %[Temp1],             $ac1,           31              \n\t"
275
276        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
277        "mthi     $zero,                $ac0                            \n\t"
278
279        /*
280          step1_0 = step2_0 + step2_3;
281          step1_3 = step2_0 - step2_3;
282        */
283        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
284        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
285
286        /*
287          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
288          step1_4 = dct_const_round_shift(temp_1);
289        */
290        "lh       %[Temp0],             2(%[input])                     \n\t"
291        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
292        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
293        "mthi     $zero,                $ac1                            \n\t"
294        "lh       %[Temp1],             14(%[input])                    \n\t"
295        "lh       %[Temp0],             2(%[input])                     \n\t"
296        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
297        "extp     %[step1_4],           $ac0,           31              \n\t"
298
299        /*
300          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
301          step1_7 = dct_const_round_shift(temp_2);
302        */
303        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
304        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
305        "extp     %[step1_7],           $ac1,           31              \n\t"
306
307        /*
308          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
309          step1_5 = dct_const_round_shift(temp_1);
310        */
311        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
312        "mthi     $zero,                $ac0                            \n\t"
313        "lh       %[Temp0],             10(%[input])                    \n\t"
314        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
315        "lh       %[Temp1],             6(%[input])                     \n\t"
316        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
317        "extp     %[step1_5],           $ac0,           31              \n\t"
318
319        /*
320          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
321          step1_6 = dct_const_round_shift(temp_2);
322        */
323        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
324        "mthi     $zero,                $ac1                            \n\t"
325        "lh       %[Temp0],             10(%[input])                    \n\t"
326        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
327        "lh       %[Temp1],             6(%[input])                     \n\t"
328        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
329        "extp     %[step1_6],           $ac1,           31              \n\t"
330
331        /*
332          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
333          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
334        */
335        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
336        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
337        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
338        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
339        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
340        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
341
342        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
343        "mthi     $zero,                $ac0                            \n\t"
344        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
345        "mthi     $zero,                $ac1                            \n\t"
346
347        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
348        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
349
350        /*
351          step1_4 = step1_4 + step1_5;
352          step1_7 = step1_6 + step1_7;
353        */
354        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
355        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
356
357        "extp     %[step1_5],           $ac0,           31              \n\t"
358        "extp     %[step1_6],           $ac1,           31              \n\t"
359
360        /* add block */
361        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
362        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
363        "addi     %[Temp0],             %[Temp0],       16              \n\t"
364        "sra      %[Temp0],             %[Temp0],       5               \n\t"
365        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
366        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
367        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
368        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
369        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
370
371        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
372        "addi     %[Temp0],             %[Temp0],       16              \n\t"
373        "sra      %[Temp0],             %[Temp0],       5               \n\t"
374        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
375        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
376        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
377        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
378        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
379
380        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
381        "addi     %[Temp0],             %[Temp0],       16              \n\t"
382        "sra      %[Temp0],             %[Temp0],       5               \n\t"
383        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
384        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
385        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
386        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
387        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
388
389        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
390        "addi     %[Temp0],             %[Temp0],       16              \n\t"
391        "sra      %[Temp0],             %[Temp0],       5               \n\t"
392        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
393        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
394        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
395        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
396        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
397
398        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
399        "addi     %[Temp0],             %[Temp0],       16              \n\t"
400        "sra      %[Temp0],             %[Temp0],       5               \n\t"
401        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
402        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
403        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
404        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
405        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
406
407        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
408        "addi     %[Temp0],             %[Temp0],       16              \n\t"
409        "sra      %[Temp0],             %[Temp0],       5               \n\t"
410        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
411        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
412        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
413        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
414        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
415
416        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
417        "addi     %[Temp0],             %[Temp0],       16              \n\t"
418        "sra      %[Temp0],             %[Temp0],       5               \n\t"
419        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
420        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
421        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
422        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
423        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
424
425        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
426        "addi     %[Temp0],             %[Temp0],       16              \n\t"
427        "sra      %[Temp0],             %[Temp0],       5               \n\t"
428        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
429        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
430        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
431
432        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
433          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
434          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
435          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
436          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
437          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
438          [dest_pix] "+r" (dest_pix)
439        : [const_2_power_13] "r" (const_2_power_13),
440          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
441          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
442          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
443          [cospi_24_64] "r" (cospi_24_64),
444          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
445    );
446
447    input += 8;
448  }
449}
450
451void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
452                              int dest_stride) {
453  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
454  int16_t *outptr = out;
455  uint32_t pos = 45;
456
457  /* bit positon for extract from acc */
458  __asm__ __volatile__ (
459    "wrdsp    %[pos],    1    \n\t"
460    :
461    : [pos] "r" (pos)
462  );
463
464  // First transform rows
465  idct8_rows_dspr2(input, outptr, 8);
466
467  // Then transform columns and add to dest
468  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
469}
470
471static void iadst8_dspr2(const int16_t *input, int16_t *output) {
472  int s0, s1, s2, s3, s4, s5, s6, s7;
473  int x0, x1, x2, x3, x4, x5, x6, x7;
474
475  x0 = input[7];
476  x1 = input[0];
477  x2 = input[5];
478  x3 = input[2];
479  x4 = input[3];
480  x5 = input[4];
481  x6 = input[1];
482  x7 = input[6];
483
484  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
485    output[0] = output[1] = output[2] = output[3] = output[4]
486              = output[5] = output[6] = output[7] = 0;
487    return;
488  }
489
490  // stage 1
491  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
492  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
493  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
494  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
495  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
496  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
497  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
498  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
499
500  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
501  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
502  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
503  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
504  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
505  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
506  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
507  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
508
509  // stage 2
510  s0 = x0;
511  s1 = x1;
512  s2 = x2;
513  s3 = x3;
514  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
515  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
516  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
517  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
518
519  x0 = s0 + s2;
520  x1 = s1 + s3;
521  x2 = s0 - s2;
522  x3 = s1 - s3;
523  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
524  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
525  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
526  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
527
528  // stage 3
529  s2 = cospi_16_64 * (x2 + x3);
530  s3 = cospi_16_64 * (x2 - x3);
531  s6 = cospi_16_64 * (x6 + x7);
532  s7 = cospi_16_64 * (x6 - x7);
533
534  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
535  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
536  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
537  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
538
539  output[0] =  x0;
540  output[1] = -x4;
541  output[2] =  x6;
542  output[3] = -x2;
543  output[4] =  x3;
544  output[5] = -x7;
545  output[6] =  x5;
546  output[7] = -x1;
547}
548
549void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
550                             int dest_stride, int tx_type) {
551  int i, j;
552  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
553  int16_t *outptr = out;
554  int16_t temp_in[8 * 8], temp_out[8];
555  uint32_t pos = 45;
556
557  /* bit positon for extract from acc */
558  __asm__ __volatile__ (
559    "wrdsp    %[pos],    1    \n\t"
560    :
561    : [pos] "r" (pos)
562  );
563
564  switch (tx_type) {
565    case DCT_DCT:     // DCT in both horizontal and vertical
566      idct8_rows_dspr2(input, outptr, 8);
567      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
568      break;
569    case ADST_DCT:    // ADST in vertical, DCT in horizontal
570      idct8_rows_dspr2(input, outptr, 8);
571
572      for (i = 0; i < 8; ++i) {
573        iadst8_dspr2(&out[i * 8], temp_out);
574
575        for (j = 0; j < 8; ++j)
576          dest[j * dest_stride + i] =
577                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
578                                      + dest[j * dest_stride + i]);
579      }
580      break;
581    case DCT_ADST:    // DCT in vertical, ADST in horizontal
582      for (i = 0; i < 8; ++i) {
583        iadst8_dspr2(input, outptr);
584        input += 8;
585        outptr += 8;
586      }
587
588      for (i = 0; i < 8; ++i) {
589        for (j = 0; j < 8; ++j) {
590          temp_in[i * 8 + j] = out[j * 8 + i];
591        }
592      }
593      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
594      break;
595    case ADST_ADST:   // ADST in both directions
596      for (i = 0; i < 8; ++i) {
597        iadst8_dspr2(input, outptr);
598        input += 8;
599        outptr += 8;
600      }
601
602      for (i = 0; i < 8; ++i) {
603        for (j = 0; j < 8; ++j)
604          temp_in[j] = out[j * 8 + i];
605
606        iadst8_dspr2(temp_in, temp_out);
607
608        for (j = 0; j < 8; ++j)
609          dest[j * dest_stride + i] =
610                clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
611                                      + dest[j * dest_stride + i]);
612      }
613      break;
614    default:
615      printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
616      break;
617  }
618}
619
620void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
621                              int dest_stride) {
622  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
623  int16_t *outptr = out;
624  uint32_t pos = 45;
625
626  /* bit positon for extract from acc */
627  __asm__ __volatile__ (
628    "wrdsp    %[pos],    1    \n\t"
629    :
630    : [pos] "r" (pos)
631  );
632
633  // First transform rows
634  idct8_rows_dspr2(input, outptr, 4);
635
636  outptr += 4;
637
638  __asm__ __volatile__ (
639      "sw  $zero,   0(%[outptr])  \n\t"
640      "sw  $zero,   4(%[outptr])  \n\t"
641      "sw  $zero,  16(%[outptr])  \n\t"
642      "sw  $zero,  20(%[outptr])  \n\t"
643      "sw  $zero,  32(%[outptr])  \n\t"
644      "sw  $zero,  36(%[outptr])  \n\t"
645      "sw  $zero,  48(%[outptr])  \n\t"
646      "sw  $zero,  52(%[outptr])  \n\t"
647      "sw  $zero,  64(%[outptr])  \n\t"
648      "sw  $zero,  68(%[outptr])  \n\t"
649      "sw  $zero,  80(%[outptr])  \n\t"
650      "sw  $zero,  84(%[outptr])  \n\t"
651      "sw  $zero,  96(%[outptr])  \n\t"
652      "sw  $zero, 100(%[outptr])  \n\t"
653      "sw  $zero, 112(%[outptr])  \n\t"
654      "sw  $zero, 116(%[outptr])  \n\t"
655
656      :
657      : [outptr] "r" (outptr)
658  );
659
660
661  // Then transform columns and add to dest
662  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
663}
664
/*
 * DC-only 8x8 inverse DCT with add-to-destination.
 *
 * Only input[0] is used. It is scaled by
 * DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64, rounded as (x + 16) >> 5, and the
 * resulting constant is added to (or, when negative, subtracted from) every
 * pixel of the 8x8 block at `dest` using saturating quad-byte arithmetic,
 * four pixels per instruction.
 *
 * The rows are accessed with word loads/stores, so `dest` and `dest_stride`
 * must keep every row 4-byte aligned (the original comments state this as an
 * assumption of the routine).
 *
 * replv.qb replicates only the least-significant byte of its source, so the
 * magnitude of the DC term is clamped to 255 first. A saturating byte add or
 * subtract of 255 already forces every pixel to 255 or 0, so the clamp gives
 * the same result as clipping pixel + a1 to [0, 255], whereas an unclamped
 * magnitude above 255 would wrap and be applied as the wrong value.
 */
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = (out + 16) >> 5 */
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     16      \n\t"
      "sra      %[a1],      %[out],     5       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  if (a1 < 0) {
    /* Negative DC: subtract |a1| from every pixel, saturating at 0. */
    absa1 = (a1 < -255) ? 255 : -a1;

    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [absa1] "r" (absa1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu         %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          : "memory"
      );
    }
  } else {
    /* Non-negative DC: add a1 to every pixel, saturating at 255. */
    if (a1 > 255)
      a1 = 255;

    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu         %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          : "memory"
      );
    }
  }
}
745#endif  // #if HAVE_DSPR2
746